Merge remote-tracking branch 'origin/main' into problame/benchmarking/pr/python-perftest

This commit is contained in:
Christian Schwarz
2024-01-08 14:24:32 +00:00
106 changed files with 4535 additions and 1223 deletions

View File

@@ -105,11 +105,11 @@ jobs:
- name: Install Python deps
run: ./scripts/pysync
- name: Run ruff to ensure code format
run: poetry run ruff .
- name: Run `ruff check` to ensure code format
run: poetry run ruff check .
- name: Run black to ensure code format
run: poetry run black --diff --check .
- name: Run `ruff format` to ensure code format
run: poetry run ruff format --check .
- name: Run mypy to check types
run: poetry run mypy .

149
Cargo.lock generated
View File

@@ -30,6 +30,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
dependencies = [
"cfg-if",
"const-random",
"getrandom 0.2.11",
"once_cell",
"version_check",
"zerocopy",
@@ -50,6 +52,12 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
@@ -247,6 +255,12 @@ dependencies = [
"syn 2.0.32",
]
[[package]]
name = "atomic"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
[[package]]
name = "atomic-polyfill"
version = "1.0.2"
@@ -1011,17 +1025,17 @@ dependencies = [
[[package]]
name = "chrono"
version = "0.4.24"
version = "0.4.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
dependencies = [
"android-tzdata",
"iana-time-zone",
"js-sys",
"num-integer",
"num-traits",
"serde",
"wasm-bindgen",
"winapi",
"windows-targets 0.48.0",
]
[[package]]
@@ -2475,6 +2489,12 @@ dependencies = [
"web-sys",
]
[[package]]
name = "integer-encoding"
version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
[[package]]
name = "io-lifetimes"
version = "1.0.11"
@@ -2838,6 +2858,19 @@ dependencies = [
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
dependencies = [
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits",
]
[[package]]
name = "num-bigint"
version = "0.4.3"
@@ -2849,6 +2882,15 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-complex"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214"
dependencies = [
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.45"
@@ -2859,6 +2901,28 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-iter"
version = "0.1.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.15"
@@ -3081,6 +3145,15 @@ dependencies = [
"tokio-stream",
]
[[package]]
name = "ordered-float"
version = "2.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
dependencies = [
"num-traits",
]
[[package]]
name = "ordered-multimap"
version = "0.7.1"
@@ -3339,6 +3412,35 @@ dependencies = [
"windows-targets 0.48.0",
]
[[package]]
name = "parquet"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"ahash",
"bytes",
"chrono",
"hashbrown 0.14.0",
"num",
"num-bigint",
"paste",
"seq-macro",
"thrift",
"twox-hash",
"zstd",
]
[[package]]
name = "parquet_derive"
version = "49.0.0"
source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
dependencies = [
"parquet",
"proc-macro2",
"quote",
"syn 2.0.32",
]
[[package]]
name = "password-hash"
version = "0.5.0"
@@ -3762,6 +3864,8 @@ dependencies = [
"base64 0.13.1",
"bstr",
"bytes",
"camino",
"camino-tempfile",
"chrono",
"clap",
"consumption_metrics",
@@ -3784,6 +3888,8 @@ dependencies = [
"once_cell",
"opentelemetry",
"parking_lot 0.12.1",
"parquet",
"parquet_derive",
"pbkdf2",
"pin-project-lite",
"postgres-native-tls",
@@ -3794,6 +3900,7 @@ dependencies = [
"rand 0.8.5",
"rcgen",
"regex",
"remote_storage",
"reqwest",
"reqwest-middleware",
"reqwest-retry",
@@ -4475,6 +4582,7 @@ dependencies = [
"serde",
"serde_json",
"serde_with",
"sha2",
"signal-hook",
"storage_broker",
"thiserror",
@@ -4681,6 +4789,12 @@ dependencies = [
"uuid",
]
[[package]]
name = "seq-macro"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
[[package]]
name = "serde"
version = "1.0.183"
@@ -5201,6 +5315,17 @@ dependencies = [
"once_cell",
]
[[package]]
name = "thrift"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
dependencies = [
"byteorder",
"integer-encoding",
"ordered-float",
]
[[package]]
name = "time"
version = "0.3.21"
@@ -5745,6 +5870,16 @@ dependencies = [
"utf-8",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
dependencies = [
"cfg-if",
"static_assertions",
]
[[package]]
name = "typenum"
version = "1.16.0"
@@ -5922,10 +6057,11 @@ dependencies = [
[[package]]
name = "uuid"
version = "1.3.3"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
dependencies = [
"atomic",
"getrandom 0.2.11",
"serde",
]
@@ -6421,6 +6557,7 @@ dependencies = [
"num-integer",
"num-traits",
"once_cell",
"parquet",
"prost",
"rand 0.8.5",
"regex",

View File

@@ -107,6 +107,8 @@ opentelemetry = "0.19.0"
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.11.0"
parking_lot = "0.12"
parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
parquet_derive = "49.0.0"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
@@ -161,7 +163,7 @@ tracing-error = "0.2.0"
tracing-opentelemetry = "0.19.0"
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
url = "2.2"
uuid = { version = "1.2", features = ["v4", "serde"] }
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
webpki-roots = "0.25"
x509-parser = "0.15"
@@ -215,6 +217,10 @@ tonic-build = "0.9"
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
# bug fixes for UUID
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
################# Binary contents sections
[profile.release]

View File

@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.74.0
ENV RUSTC_VERSION=1.75.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \

View File

@@ -3,7 +3,7 @@ use std::{thread, time::Duration};
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
use tracing::{debug, info};
use tracing::{debug, info, warn};
use crate::compute::ComputeNode;
@@ -84,6 +84,29 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
}
// If there are existing (logical) walsenders, do not suspend.
//
// walproposer doesn't currently show up in pg_stat_replication,
// but protect if it will be
let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
match cli.query_one(ws_count_query, &[]) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_ws) => {
if num_ws > 0 {
last_active = Some(Utc::now());
}
}
Err(e) => {
warn!("failed to parse ws count: {:?}", e);
continue;
}
},
Err(e) => {
warn!("failed to get list of walsenders: {:?}", e);
continue;
}
}
// Update the last activity in the shared state if we got a more recent one.
let mut state = compute.state.lock().unwrap();
// NB: `Some(<DateTime>)` is always greater than `None`.

View File

@@ -485,6 +485,13 @@ impl PageServerNode {
Ok(self.http_client.list_timelines(*tenant_id).await?)
}
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
Ok(self
.http_client
.tenant_secondary_download(*tenant_id)
.await?)
}
pub async fn timeline_create(
&self,
tenant_id: TenantId,

View File

@@ -11,6 +11,7 @@ use crate::{
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use pageserver_api::shard::TenantShardId;
use std::collections::HashMap;
use std::time::Duration;
use utils::{
@@ -40,9 +41,9 @@ async fn await_lsn(
loop {
let latest = match get_lsns(tenant_id, pageserver).await {
Ok(l) => l,
Err(e) => {
Err(_e) => {
println!(
"🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
"🕑 Waiting for pageserver {} to activate...",
pageserver.conf.id
);
std::thread::sleep(Duration::from_millis(500));
@@ -89,7 +90,7 @@ pub async fn migrate_tenant(
tenant_id: TenantId,
dest_ps: PageServerNode,
) -> anyhow::Result<()> {
// Get a new generation
println!("🤔 Checking existing status...");
let attachment_service = AttachmentService::from_env(env);
fn build_location_config(
@@ -135,6 +136,20 @@ pub async fn migrate_tenant(
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
}
println!(
"🔁 Downloading latest layers to destination pageserver {}",
dest_ps.conf.id
);
match dest_ps
.tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
.await
{
Ok(()) => {}
Err(_) => {
println!(" (skipping, destination wasn't in secondary mode)")
}
}
let gen = attachment_service
.attach_hook(tenant_id, dest_ps.conf.id)
.await?;

View File

@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
### Obligatory checks
We force code formatting via `black`, `ruff`, and type hints via `mypy`.
We force code formatting via `ruff`, and type hints via `mypy`.
Run the following commands in the repository's root (next to `pyproject.toml`):
```bash
poetry run black . # All code is reformatted
poetry run ruff . # Python linter
poetry run mypy . # Ensure there are no typing errors
poetry run ruff format . # All code is reformatted
poetry run ruff check . # Python linter
poetry run mypy . # Ensure there are no typing errors
```
**WARNING**: do not run `mypy` from a directory other than the root of the repository.

View File

@@ -142,7 +142,7 @@ impl Key {
}
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
}
impl std::str::FromStr for Key {

View File

@@ -124,6 +124,9 @@ impl KeySpaceAccum {
if range.start == accum.end {
accum.end = range.end;
} else {
// TODO: to efficiently support small sharding stripe sizes, we should avoid starting
// a new range here if the skipped region was all keys that don't belong on this shard.
// (https://github.com/neondatabase/neon/issues/6247)
assert!(range.start > accum.end);
self.ranges.push(accum.clone());
*accum = range;

View File

@@ -422,6 +422,21 @@ impl ShardIdentity {
}
}
/// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split
pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0?
// A: because the WAL ingestion logic currently ingests some shard 0
// content on all shards, even though it's only read on shard 0. If we
// dropped it, then subsequent WAL ingest to these keys would encounter
// an error.
false
} else {
!self.is_key_local(key)
}
}
pub fn shard_slug(&self) -> String {
if self.count > ShardCount(0) {
format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -515,12 +530,7 @@ fn key_is_shard0(key: &Key) -> bool {
// relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations.
//
// In this condition:
// - is_rel_block_key includes only relations, i.e. excludes SLRU data and
// all metadata.
// - field6 is set to -1 for relation size pages.
!(is_rel_block_key(key) && key.field6 != 0xffffffff)
!is_rel_block_key(key)
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name

View File

@@ -35,6 +35,12 @@ pub enum QueryError {
/// We were instructed to shutdown while processing the query
#[error("Shutting down")]
Shutdown,
/// Query handler indicated that client should reconnect
#[error("Server requested reconnect")]
Reconnect,
/// Query named an entity that was not found
#[error("Not found: {0}")]
NotFound(std::borrow::Cow<'static, str>),
/// Authentication failure
#[error("Unauthorized: {0}")]
Unauthorized(std::borrow::Cow<'static, str>),
@@ -54,9 +60,9 @@ impl From<io::Error> for QueryError {
impl QueryError {
pub fn pg_error_code(&self) -> &'static [u8; 5] {
match self {
Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
}
}
@@ -425,6 +431,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
info!("Stopped due to shutdown");
Ok(())
}
Err(QueryError::Reconnect) => {
// Dropping out of this loop implicitly disconnects
info!("Stopped due to handler reconnect request");
Ok(())
}
Err(QueryError::Disconnected(e)) => {
info!("Disconnected ({e:#})");
// Disconnection is not an error: we just use it that way internally to drop
@@ -974,7 +985,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
pub fn short_error(e: &QueryError) -> String {
match e {
QueryError::Disconnected(connection_error) => connection_error.to_string(),
QueryError::Reconnect => "reconnect".to_string(),
QueryError::Shutdown => "shutdown".to_string(),
QueryError::NotFound(_) => "not found".to_string(),
QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
QueryError::Other(e) => format!("{e:#}"),
@@ -996,9 +1009,15 @@ fn log_query_error(query: &str, e: &QueryError) {
QueryError::SimulatedConnectionError => {
error!("query handler for query '{query}' failed due to a simulated connection error")
}
QueryError::Reconnect => {
info!("query handler for '{query}' requested client to reconnect")
}
QueryError::Shutdown => {
info!("query handler for '{query}' cancelled during tenant shutdown")
}
QueryError::NotFound(reason) => {
info!("query handler for '{query}' entity not found: {reason}")
}
QueryError::Unauthorized(e) => {
warn!("query handler for '{query}' failed with authentication error: {e}");
}

View File

@@ -322,6 +322,12 @@ impl RemoteStorage for AzureBlobStorage {
}
Ok(())
}
async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> {
Err(anyhow::anyhow!(
"copy for azure blob storage is not implemented"
))
}
}
pin_project_lite::pin_project! {

View File

@@ -207,6 +207,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
/// Copy a remote object inside a bucket from one path to another.
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
}
pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -374,6 +377,15 @@ impl GenericRemoteStorage {
Self::Unreliable(s) => s.delete_objects(paths).await,
}
}
pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
match self {
Self::LocalFs(s) => s.copy(from, to).await,
Self::AwsS3(s) => s.copy(from, to).await,
Self::AzureBlob(s) => s.copy(from, to).await,
Self::Unreliable(s) => s.copy(from, to).await,
}
}
}
impl GenericRemoteStorage {
@@ -660,6 +672,7 @@ impl ConcurrencyLimiter {
RequestKind::Put => &self.write,
RequestKind::List => &self.read,
RequestKind::Delete => &self.write,
RequestKind::Copy => &self.write,
}
}

View File

@@ -409,6 +409,20 @@ impl RemoteStorage for LocalFs {
}
Ok(())
}
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
let from_path = from.with_base(&self.storage_root);
let to_path = to.with_base(&self.storage_root);
create_target_directory(&to_path).await?;
fs::copy(&from_path, &to_path).await.with_context(|| {
format!(
"Failed to copy file from '{from_path}' to '{to_path}'",
from_path = from_path,
to_path = to_path
)
})?;
Ok(())
}
}
fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {

View File

@@ -493,6 +493,38 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
let kind = RequestKind::Copy;
let _guard = self.permit(kind).await;
let started_at = start_measuring_requests(kind);
// we need to specify bucket_name as a prefix
let copy_source = format!(
"{}/{}",
self.bucket_name,
self.relative_path_to_s3_object(from)
);
let res = self
.client
.copy_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.copy_source(copy_source)
.send()
.await;
let started_at = ScopeGuard::into_inner(started_at);
metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &res, started_at);
res?;
Ok(())
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
// if prefix is not none then download file `prefix/from`
// if prefix is none then download file `from`

View File

@@ -11,6 +11,7 @@ pub(crate) enum RequestKind {
Put = 1,
Delete = 2,
List = 3,
Copy = 4,
}
use RequestKind::*;
@@ -22,6 +23,7 @@ impl RequestKind {
Put => "put_object",
Delete => "delete_object",
List => "list_objects",
Copy => "copy_object",
}
}
const fn as_index(&self) -> usize {
@@ -29,7 +31,7 @@ impl RequestKind {
}
}
pub(super) struct RequestTyped<C>([C; 4]);
pub(super) struct RequestTyped<C>([C; 5]);
impl<C> RequestTyped<C> {
pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -38,8 +40,8 @@ impl<C> RequestTyped<C> {
fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
use RequestKind::*;
let mut it = [Get, Put, Delete, List].into_iter();
let arr = std::array::from_fn::<C, 4, _>(|index| {
let mut it = [Get, Put, Delete, List, Copy].into_iter();
let arr = std::array::from_fn::<C, 5, _>(|index| {
let next = it.next().unwrap();
assert_eq!(index, next.as_index());
f(next)

View File

@@ -162,4 +162,11 @@ impl RemoteStorage for UnreliableWrapper {
}
Ok(())
}
async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
// copy is equivalent to download + upload
self.attempt(RemoteOp::Download(from.clone()))?;
self.attempt(RemoteOp::Upload(to.clone()))?;
self.inner.copy_object(from, to).await
}
}

View File

@@ -51,3 +51,9 @@ pub struct SkTimelineInfo {
#[serde(default)]
pub http_connstr: Option<String>,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TimelineCopyRequest {
pub target_timeline_id: TimelineId,
pub until_lsn: Lsn,
}

View File

@@ -31,6 +31,9 @@ pub enum ApiError {
#[error("Shutting down")]
ShuttingDown,
#[error("Timeout")]
Timeout(Cow<'static, str>),
#[error(transparent)]
InternalServerError(anyhow::Error),
}
@@ -67,6 +70,10 @@ impl ApiError {
err.to_string(),
StatusCode::SERVICE_UNAVAILABLE,
),
ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::REQUEST_TIMEOUT,
),
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
err.to_string(),
StatusCode::INTERNAL_SERVER_ERROR,

View File

@@ -85,6 +85,8 @@ pub mod sync;
pub mod failpoint_support;
pub mod yielding_loop;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -15,6 +15,12 @@ pub struct Gate {
name: String,
}
impl std::fmt::Debug for Gate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Gate<{}>", self.name)
}
}
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
/// not complete.
#[derive(Debug)]

View File

@@ -0,0 +1,35 @@
use tokio_util::sync::CancellationToken;
#[derive(thiserror::Error, Debug)]
pub enum YieldingLoopError {
#[error("Cancelled")]
Cancelled,
}
/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically
/// yields to avoid blocking the executor, and after resuming checks the provided
/// cancellation token to drop out promptly on shutdown.
#[inline(always)]
pub async fn yielding_loop<I, T, F>(
interval: usize,
cancel: &CancellationToken,
iter: I,
mut visitor: F,
) -> Result<(), YieldingLoopError>
where
I: Iterator<Item = T>,
F: FnMut(T),
{
for (i, item) in iter.enumerate() {
visitor(item);
if i + 1 % interval == 0 {
tokio::task::yield_now().await;
if cancel.is_cancelled() {
return Err(YieldingLoopError::Cancelled);
}
}
}
Ok(())
}

View File

@@ -13,6 +13,7 @@ use bytes::{Buf, Bytes};
use pageserver::{
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
};
use pageserver_api::shard::TenantShardId;
use utils::{id::TenantId, lsn::Lsn};
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) {
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_id = TenantId::generate();
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
let manager = PostgresRedoManager::new(conf, tenant_id);
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager);

View File

@@ -1,4 +1,4 @@
use pageserver_api::models::*;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method};
use utils::{
http::error::HttpErrorBody,
@@ -164,6 +164,18 @@ impl Client {
Ok(())
}
pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/secondary/download",
self.mgmt_api_endpoint, tenant_id
);
self.request(Method::POST, &uri, ())
.await?
.error_for_status()
.map(|_| ())
.map_err(|e| Error::ApiError(format!("{}", e)))
}
pub async fn location_config(
&self,
tenant_id: TenantId,

View File

@@ -37,8 +37,8 @@ use crate::tenant::{
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
};
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -75,6 +75,7 @@ pub mod defaults {
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
@@ -130,6 +131,7 @@ pub mod defaults {
#gc_feedback = false
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}
[remote_storage]
@@ -239,6 +241,10 @@ pub struct PageServerConf {
/// heatmap uploads vs. other remote storage operations.
pub heatmap_upload_concurrency: usize,
/// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly
/// deprioritises secondary downloads vs. remote storage operations for attached tenants.
pub secondary_download_concurrency: usize,
/// Maximum number of WAL records to be ingested and committed at the same time
pub ingest_batch_size: u64,
}
@@ -322,6 +328,7 @@ struct PageServerConfigBuilder {
control_plane_emergency_mode: BuilderValue<bool>,
heatmap_upload_concurrency: BuilderValue<usize>,
secondary_download_concurrency: BuilderValue<usize>,
ingest_batch_size: BuilderValue<u64>,
}
@@ -396,6 +403,7 @@ impl Default for PageServerConfigBuilder {
control_plane_emergency_mode: Set(false),
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
}
@@ -546,6 +554,10 @@ impl PageServerConfigBuilder {
self.heatmap_upload_concurrency = BuilderValue::Set(value)
}
pub fn secondary_download_concurrency(&mut self, value: usize) {
self.secondary_download_concurrency = BuilderValue::Set(value)
}
pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
}
@@ -651,6 +663,9 @@ impl PageServerConfigBuilder {
heatmap_upload_concurrency: self
.heatmap_upload_concurrency
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
secondary_download_concurrency: self
.secondary_download_concurrency
.ok_or(anyhow!("missing secondary_download_concurrency"))?,
ingest_batch_size: self
.ingest_batch_size
.ok_or(anyhow!("missing ingest_batch_size"))?,
@@ -711,6 +726,11 @@ impl PageServerConf {
.join(TENANT_LOCATION_CONFIG_NAME)
}
pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(TENANT_HEATMAP_BASENAME)
}
pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(TIMELINES_SEGMENT_NAME)
@@ -896,6 +916,9 @@ impl PageServerConf {
"heatmap_upload_concurrency" => {
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
},
"secondary_download_concurrency" => {
builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
},
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
@@ -968,6 +991,7 @@ impl PageServerConf {
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
}
}
@@ -1198,6 +1222,7 @@ background_task_maximum_delay = '334 s'
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
},
"Correct defaults should be used when no config values are provided"
@@ -1260,6 +1285,7 @@ background_task_maximum_delay = '334 s'
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
ingest_batch_size: 100,
},
"Should be able to parse all basic config values correctly"

View File

@@ -152,6 +152,7 @@ impl From<PageReconstructError> for ApiError {
PageReconstructError::AncestorStopping(_) => {
ApiError::ResourceUnavailable(format!("{pre}").into())
}
PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
}
}
@@ -1275,6 +1276,23 @@ async fn put_tenant_location_config_handler(
// which is not a 400 but a 409.
.map_err(ApiError::BadRequest)?;
if let Some(_flush_ms) = flush {
match state
.secondary_controller
.upload_tenant(tenant_shard_id)
.await
{
Ok(()) => {
tracing::info!("Uploaded heatmap during flush");
}
Err(e) => {
tracing::warn!("Failed to flush heatmap: {e}");
}
}
} else {
tracing::info!("No flush requested when configuring");
}
json_response(StatusCode::OK, ())
}
@@ -1612,6 +1630,21 @@ async fn secondary_upload_handler(
json_response(StatusCode::OK, ())
}
async fn secondary_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
state
.secondary_controller
.download_tenant(tenant_shard_id)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(
StatusCode::NOT_FOUND,
@@ -1880,6 +1913,9 @@ pub fn make_router(
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
api_handler(r, secondary_download_handler)
})
.put("/v1/tenant/:tenant_shard_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})

View File

@@ -117,6 +117,10 @@ pub const TENANT_CONFIG_NAME: &str = "config";
/// Full path: `tenants/<tenant_id>/config`.
pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
/// Per-tenant copy of their remote heatmap, downloaded into the local
/// tenant path while in secondary mode.
pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
/// A suffix used for various temporary files. Any temporary files found in the
/// data directory at pageserver startup can be automatically removed.
pub const TEMP_FILE_SUFFIX: &str = "___temp";

View File

@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
// Metrics collected on operations on the storage repository.
#[derive(Debug, EnumVariantNames, IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub enum StorageTimeOperation {
pub(crate) enum StorageTimeOperation {
#[strum(serialize = "layer flush")]
LayerFlush,
@@ -55,7 +55,7 @@ pub enum StorageTimeOperation {
CreateTenant,
}
pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
register_counter_vec!(
"pageserver_storage_operations_seconds_sum",
"Total time spent on storage operations with operation, tenant and timeline dimensions",
@@ -64,7 +64,7 @@ pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"pageserver_storage_operations_seconds_count",
"Count of storage operations with operation, tenant and timeline dimensions",
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub struct PageCacheMetricsForTaskKind {
pub(crate) struct PageCacheMetricsForTaskKind {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_immutable: IntCounter,
@@ -159,7 +159,7 @@ pub struct PageCacheMetricsForTaskKind {
pub read_hits_materialized_page_older_lsn: IntCounter,
}
pub struct PageCacheMetrics {
pub(crate) struct PageCacheMetrics {
map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
}
@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
map: EnumMap::from_array(std::array::from_fn(|task_kind| {
let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
let task_kind: &'static str = task_kind.into();
@@ -243,10 +243,9 @@ impl PageCacheMetrics {
}
}
pub struct PageCacheSizeMetrics {
pub(crate) struct PageCacheSizeMetrics {
pub max_bytes: UIntGauge,
pub current_bytes_ephemeral: UIntGauge,
pub current_bytes_immutable: UIntGauge,
pub current_bytes_materialized_page: UIntGauge,
}
@@ -260,31 +259,26 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
max_bytes: {
register_uint_gauge!(
"pageserver_page_cache_size_max_bytes",
"Maximum size of the page cache in bytes"
)
.expect("failed to define a metric")
},
current_bytes_ephemeral: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["ephemeral"])
.unwrap()
},
current_bytes_immutable: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
current_bytes_materialized_page: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
});
pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
Lazy::new(|| PageCacheSizeMetrics {
max_bytes: {
register_uint_gauge!(
"pageserver_page_cache_size_max_bytes",
"Maximum size of the page cache in bytes"
)
.expect("failed to define a metric")
},
current_bytes_immutable: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
current_bytes_materialized_page: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
});
pub(crate) mod page_cache_eviction_metrics {
use std::num::NonZeroUsize;
@@ -740,13 +734,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {
/// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
#[derive(Debug)]
pub struct EvictionsWithLowResidenceDuration {
pub(crate) struct EvictionsWithLowResidenceDuration {
data_source: &'static str,
threshold: Duration,
counter: Option<IntCounter>,
}
pub struct EvictionsWithLowResidenceDurationBuilder {
pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
data_source: &'static str,
threshold: Duration,
}
@@ -1009,7 +1003,7 @@ pub enum SmgrQueryType {
}
#[derive(Debug)]
pub struct SmgrQueryTimePerTimeline {
pub(crate) struct SmgrQueryTimePerTimeline {
metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
}
@@ -1181,8 +1175,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
.map(|ms| (ms as f64) / 1000.0)
});
pub struct BasebackupQueryTime(HistogramVec);
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
pub(crate) struct BasebackupQueryTime(HistogramVec);
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
BasebackupQueryTime({
register_histogram_vec!(
"pageserver_basebackup_query_seconds",
@@ -1202,7 +1196,7 @@ impl DurationResultObserver for BasebackupQueryTime {
}
}
pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_live_connections",
"Number of live network connections",
@@ -1369,6 +1363,8 @@ pub(crate) struct SecondaryModeMetrics {
pub(crate) upload_heatmap: IntCounter,
pub(crate) upload_heatmap_errors: IntCounter,
pub(crate) upload_heatmap_duration: Histogram,
pub(crate) download_heatmap: IntCounter,
pub(crate) download_layer: IntCounter,
}
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
@@ -1386,6 +1382,16 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
"Time to build and upload a heatmap, including any waiting inside the S3 client"
)
.expect("failed to define a metric"),
download_heatmap: register_int_counter!(
"pageserver_secondary_download_heatmap",
"Number of downloads of heatmaps by secondary mode locations"
)
.expect("failed to define a metric"),
download_layer: register_int_counter!(
"pageserver_secondary_download_layer",
"Number of downloads of layers by secondary mode locations"
)
.expect("failed to define a metric"),
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1655,7 +1661,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
Lazy::new(WalRedoProcessCounters::default);
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
pub struct StorageTimeMetricsTimer {
pub(crate) struct StorageTimeMetricsTimer {
metrics: StorageTimeMetrics,
start: Instant,
}
@@ -1680,7 +1686,7 @@ impl StorageTimeMetricsTimer {
/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
/// timeline total sum and count.
#[derive(Clone, Debug)]
pub struct StorageTimeMetrics {
pub(crate) struct StorageTimeMetrics {
/// Sum of f64 seconds, per operation, tenant_id and timeline_id
timeline_sum: Counter,
/// Number of oeprations, per operation, tenant_id and timeline_id
@@ -1719,7 +1725,7 @@ impl StorageTimeMetrics {
}
#[derive(Debug)]
pub struct TimelineMetrics {
pub(crate) struct TimelineMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
@@ -1927,7 +1933,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
}
}
pub struct RemoteTimelineClientMetrics {
pub(crate) struct RemoteTimelineClientMetrics {
tenant_id: String,
timeline_id: String,
remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
@@ -2225,7 +2231,7 @@ impl Drop for RemoteTimelineClientMetrics {
/// Wrapper future that measures the time spent by a remote storage operation,
/// and records the time and success/failure as a prometheus metric.
pub trait MeasureRemoteOp: Sized {
pub(crate) trait MeasureRemoteOp: Sized {
fn measure_remote_op(
self,
tenant_id: TenantId,
@@ -2250,7 +2256,7 @@ pub trait MeasureRemoteOp: Sized {
impl<T: Sized> MeasureRemoteOp for T {}
pin_project! {
pub struct MeasuredRemoteOp<F>
pub(crate) struct MeasuredRemoteOp<F>
{
#[pin]
inner: F,

View File

@@ -25,6 +25,7 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
use std::borrow::Cow;
use std::io;
use std::net::TcpListener;
use std::pin::pin;
@@ -61,6 +62,9 @@ use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::timeline::WaitLsnError;
use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
use crate::trace::Tracer;
@@ -283,6 +287,64 @@ struct PageServerHandler {
connection_ctx: RequestContext,
}
#[derive(thiserror::Error, Debug)]
enum PageStreamError {
/// We encountered an error that should prompt the client to reconnect:
/// in practice this means we drop the connection without sending a response.
#[error("Reconnect required: {0}")]
Reconnect(Cow<'static, str>),
/// We were instructed to shutdown while processing the query
#[error("Shutting down")]
Shutdown,
/// Something went wrong reading a page: this likely indicates a pageserver bug
#[error("Read error: {0}")]
Read(PageReconstructError),
/// Ran out of time waiting for an LSN
#[error("LSN timeout: {0}")]
LsnTimeout(WaitLsnError),
/// The entity required to serve the request (tenant or timeline) is not found,
/// or is not found in a suitable state to serve a request.
#[error("Not found: {0}")]
NotFound(std::borrow::Cow<'static, str>),
/// Request asked for something that doesn't make sense, like an invalid LSN
#[error("Bad request: {0}")]
BadRequest(std::borrow::Cow<'static, str>),
}
impl From<PageReconstructError> for PageStreamError {
fn from(value: PageReconstructError) -> Self {
match value {
PageReconstructError::Cancelled => Self::Shutdown,
e => Self::Read(e),
}
}
}
impl From<GetActiveTimelineError> for PageStreamError {
fn from(value: GetActiveTimelineError) -> Self {
match value {
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
}
}
}
impl From<WaitLsnError> for PageStreamError {
fn from(value: WaitLsnError) -> Self {
match value {
e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
WaitLsnError::Shutdown => Self::Shutdown,
WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
}
}
}
impl PageServerHandler {
pub fn new(
conf: &'static PageServerConf,
@@ -428,7 +490,7 @@ impl PageServerHandler {
// Check that the timeline exists
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| anyhow::anyhow!(e))?;
.map_err(|e| QueryError::NotFound(format!("{e}").into()))?;
// Avoid starting new requests if the timeline has already started shutting down,
// and block timeline shutdown until this request is complete, or drops out due
@@ -520,32 +582,44 @@ impl PageServerHandler {
}
};
if let Err(e) = &response {
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
// because wait_lsn etc will drop out
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
// is_canceled(): [`Timeline::shutdown`]` has entered
if timeline.cancel.is_cancelled() || timeline.is_stopping() {
match response {
Err(PageStreamError::Shutdown) => {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
span.in_scope(|| info!("dropping connection due to shutdown"));
return Err(QueryError::Shutdown);
}
Err(PageStreamError::Reconnect(reason)) => {
span.in_scope(|| info!("handler requested reconnect: {reason}"));
return Err(QueryError::Reconnect);
}
Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => {
// This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean
// shutdown error, this may be buried inside a PageReconstructError::Other for example.
//
// Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
// because wait_lsn etc will drop out
// is_stopping(): [`Timeline::flush_and_shutdown`] has entered
// is_canceled(): [`Timeline::shutdown`]` has entered
span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
return Err(QueryError::Shutdown);
}
r => {
let response_msg = r.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});
pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
}
}
let response = response.unwrap_or_else(|e| {
// print the all details to the log with {:#}, but for the client the
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
})
});
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
}
Ok(())
}
@@ -692,7 +766,7 @@ impl PageServerHandler {
latest: bool,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
ctx: &RequestContext,
) -> anyhow::Result<Lsn> {
) -> Result<Lsn, PageStreamError> {
if latest {
// Latest page version was requested. If LSN is given, it is a hint
// to the page server that there have been no modifications to the
@@ -723,15 +797,19 @@ impl PageServerHandler {
}
} else {
if lsn == Lsn(0) {
anyhow::bail!("invalid LSN(0) in request");
return Err(PageStreamError::BadRequest(
"invalid LSN(0) in request".into(),
));
}
timeline.wait_lsn(lsn, ctx).await?;
}
anyhow::ensure!(
lsn >= **latest_gc_cutoff_lsn,
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
);
if lsn < **latest_gc_cutoff_lsn {
return Err(PageStreamError::BadRequest(format!(
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
lsn, **latest_gc_cutoff_lsn
).into()));
}
Ok(lsn)
}
@@ -740,7 +818,7 @@ impl PageServerHandler {
timeline: &Timeline,
req: &PagestreamExistsRequest,
ctx: &RequestContext,
) -> anyhow::Result<PagestreamBeMessage> {
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -760,7 +838,7 @@ impl PageServerHandler {
timeline: &Timeline,
req: &PagestreamNblocksRequest,
ctx: &RequestContext,
) -> anyhow::Result<PagestreamBeMessage> {
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -780,7 +858,7 @@ impl PageServerHandler {
timeline: &Timeline,
req: &PagestreamDbSizeRequest,
ctx: &RequestContext,
) -> anyhow::Result<PagestreamBeMessage> {
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
@@ -802,30 +880,35 @@ impl PageServerHandler {
}))
}
async fn do_handle_get_page_at_lsn_request(
&self,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let page = timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
}))
}
async fn handle_get_page_at_lsn_request(
&self,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
ctx: &RequestContext,
) -> anyhow::Result<PagestreamBeMessage> {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn =
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
/*
// Add a 1s delay to some requests. The delay helps the requests to
// hit the race condition from github issue #1047 more easily.
use rand::Rng;
if rand::thread_rng().gen::<u8>() < 5 {
std::thread::sleep(std::time::Duration::from_millis(1000));
}
*/
) -> Result<PagestreamBeMessage, PageStreamError> {
let key = rel_block_to_key(req.rel, req.blkno);
let page = if timeline.get_shard_identity().is_key_local(&key) {
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?
if timeline.get_shard_identity().is_key_local(&key) {
self.do_handle_get_page_at_lsn_request(timeline, req, ctx)
.await
} else {
// The Tenant shard we looked up at connection start does not hold this particular
// key: look for other shards in this tenant. This scenario occurs if a pageserver
@@ -844,30 +927,30 @@ impl PageServerHandler {
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node.
// TODO: this should be some kind of structured error that the client will understand,
// so that it can block until its config is updated: this error is expected in the case
// that the Tenant's shards' placements are being updated and the client hasn't been
// informed yet.
//
// https://github.com/neondatabase/neon/issues/6038
return Err(anyhow::anyhow!("Request routed to wrong shard"));
// the requested page is not present on this node: the client's knowledge of shard->pageserver
// mapping is out of date.
tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}",
timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key);
// Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
// client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
// and talk to a different pageserver.
return Err(PageStreamError::Reconnect(
"getpage@lsn request routed to wrong shard".into(),
));
}
Err(e) => return Err(e.into()),
};
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
// the GateGuard was already held over the whole connection.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.await?
};
let _timeline_guard = timeline
.gate
.enter()
.map_err(|_| PageStreamError::Shutdown)?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
}))
self.do_handle_get_page_at_lsn_request(&timeline, req, ctx)
.await
}
}
#[allow(clippy::too_many_arguments)]
@@ -1008,9 +1091,7 @@ impl PageServerHandler {
)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
let timeline = tenant.get_timeline(timeline_id, true)?;
Ok(timeline)
}
}
@@ -1432,14 +1513,15 @@ enum GetActiveTimelineError {
#[error(transparent)]
Tenant(GetActiveTenantError),
#[error(transparent)]
Timeline(anyhow::Error),
Timeline(#[from] GetTimelineError),
}
impl From<GetActiveTimelineError> for QueryError {
fn from(e: GetActiveTimelineError) -> Self {
match e {
GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
GetActiveTimelineError::Tenant(e) => e.into(),
GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
}
}
}

View File

@@ -160,7 +160,7 @@ impl Timeline {
//------------------------------------------------------------------------------
/// Look up given page version.
pub async fn get_rel_page_at_lsn(
pub(crate) async fn get_rel_page_at_lsn(
&self,
tag: RelTag,
blknum: BlockNumber,
@@ -191,7 +191,7 @@ impl Timeline {
}
// Get size of a database in blocks
pub async fn get_db_size(
pub(crate) async fn get_db_size(
&self,
spcnode: Oid,
dbnode: Oid,
@@ -211,7 +211,7 @@ impl Timeline {
}
/// Get size of a relation file
pub async fn get_rel_size(
pub(crate) async fn get_rel_size(
&self,
tag: RelTag,
version: Version<'_>,
@@ -256,7 +256,7 @@ impl Timeline {
}
/// Does relation exist?
pub async fn get_rel_exists(
pub(crate) async fn get_rel_exists(
&self,
tag: RelTag,
version: Version<'_>,
@@ -291,7 +291,7 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn list_rels(
pub(crate) async fn list_rels(
&self,
spcnode: Oid,
dbnode: Oid,
@@ -319,7 +319,7 @@ impl Timeline {
}
/// Look up given SLRU page version.
pub async fn get_slru_page_at_lsn(
pub(crate) async fn get_slru_page_at_lsn(
&self,
kind: SlruKind,
segno: u32,
@@ -332,7 +332,7 @@ impl Timeline {
}
/// Get size of an SLRU segment
pub async fn get_slru_segment_size(
pub(crate) async fn get_slru_segment_size(
&self,
kind: SlruKind,
segno: u32,
@@ -345,7 +345,7 @@ impl Timeline {
}
/// Get size of an SLRU segment
pub async fn get_slru_segment_exists(
pub(crate) async fn get_slru_segment_exists(
&self,
kind: SlruKind,
segno: u32,
@@ -372,7 +372,7 @@ impl Timeline {
/// so it's not well defined which LSN you get if there were multiple commits
/// "in flight" at that point in time.
///
pub async fn find_lsn_for_timestamp(
pub(crate) async fn find_lsn_for_timestamp(
&self,
search_timestamp: TimestampTz,
cancel: &CancellationToken,
@@ -452,7 +452,7 @@ impl Timeline {
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
/// with a smaller/larger timestamp.
///
pub async fn is_latest_commit_timestamp_ge_than(
pub(crate) async fn is_latest_commit_timestamp_ge_than(
&self,
search_timestamp: TimestampTz,
probe_lsn: Lsn,
@@ -475,7 +475,7 @@ impl Timeline {
/// Obtain the possible timestamp range for the given lsn.
///
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
pub async fn get_timestamp_for_lsn(
pub(crate) async fn get_timestamp_for_lsn(
&self,
probe_lsn: Lsn,
ctx: &RequestContext,
@@ -532,7 +532,7 @@ impl Timeline {
}
/// Get a list of SLRU segments
pub async fn list_slru_segments(
pub(crate) async fn list_slru_segments(
&self,
kind: SlruKind,
version: Version<'_>,
@@ -548,7 +548,7 @@ impl Timeline {
}
}
pub async fn get_relmap_file(
pub(crate) async fn get_relmap_file(
&self,
spcnode: Oid,
dbnode: Oid,
@@ -561,7 +561,7 @@ impl Timeline {
Ok(buf)
}
pub async fn list_dbdirs(
pub(crate) async fn list_dbdirs(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -575,7 +575,7 @@ impl Timeline {
}
}
pub async fn get_twophase_file(
pub(crate) async fn get_twophase_file(
&self,
xid: TransactionId,
lsn: Lsn,
@@ -586,7 +586,7 @@ impl Timeline {
Ok(buf)
}
pub async fn list_twophase_files(
pub(crate) async fn list_twophase_files(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -600,7 +600,7 @@ impl Timeline {
}
}
pub async fn get_control_file(
pub(crate) async fn get_control_file(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -608,7 +608,7 @@ impl Timeline {
self.get(CONTROLFILE_KEY, lsn, ctx).await
}
pub async fn get_checkpoint(
pub(crate) async fn get_checkpoint(
&self,
lsn: Lsn,
ctx: &RequestContext,
@@ -616,7 +616,7 @@ impl Timeline {
self.get(CHECKPOINT_KEY, lsn, ctx).await
}
pub async fn list_aux_files(
pub(crate) async fn list_aux_files(
&self,
lsn: Lsn,
ctx: &RequestContext,

View File

@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
// else, but that has not been needed in a long time.
std::env::var("TOKIO_WORKER_THREADS")
.map(|s| s.parse::<usize>().unwrap())
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
.unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
});
#[derive(Debug, Clone, Copy)]
@@ -258,6 +258,9 @@ pub enum TaskKind {
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
/// See [`crate::tenant::secondary`].
SecondaryDownloads,
/// See [`crate::tenant::secondary`].
SecondaryUploads,

View File

@@ -56,6 +56,7 @@ use self::timeline::uninit::TimelineUninitMark;
use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
use self::timeline::TimelineResources;
use self::timeline::WaitLsnError;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
@@ -595,10 +596,9 @@ impl Tenant {
mode: SpawnMode,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
// TODO(sharding): make WalRedoManager shard-aware
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf,
tenant_shard_id.tenant_id,
tenant_shard_id,
)));
let TenantSharedResources {
@@ -1148,10 +1148,9 @@ impl Tenant {
tenant_shard_id: TenantShardId,
reason: String,
) -> Arc<Tenant> {
// TODO(sharding): make WalRedoManager shard-aware
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf,
tenant_shard_id.tenant_id,
tenant_shard_id,
)));
Arc::new(Tenant::new(
TenantState::Broken {
@@ -1763,7 +1762,15 @@ impl Tenant {
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline.wait_lsn(*lsn, ctx).await?;
ancestor_timeline
.wait_lsn(*lsn, ctx)
.await
.map_err(|e| match e {
e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
}
WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
})?;
}
self.branch_timeline(

View File

@@ -588,7 +588,7 @@ impl DeleteTenantFlow {
}
break;
}
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
// This is unexpected: this secondary tenants should not have been created, and we
// are not in a position to shut it down from here.
tracing::warn!("Tenant transitioned to secondary mode while deleting!");

View File

@@ -44,6 +44,7 @@ use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::secondary::SecondaryTenant;
use super::TenantSharedResources;
/// For a tenant that appears in TenantsMap, it may either be
@@ -57,7 +58,7 @@ use super::TenantSharedResources;
/// having a properly acquired generation (Secondary doesn't need a generation)
pub(crate) enum TenantSlot {
Attached(Arc<Tenant>),
Secondary,
Secondary(Arc<SecondaryTenant>),
/// In this state, other administrative operations acting on the TenantId should
/// block, or return a retry indicator equivalent to HTTP 503.
InProgress(utils::completion::Barrier),
@@ -67,7 +68,7 @@ impl std::fmt::Debug for TenantSlot {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
Self::Secondary => write!(f, "Secondary"),
Self::Secondary(_) => write!(f, "Secondary"),
Self::InProgress(_) => write!(f, "InProgress"),
}
}
@@ -78,7 +79,7 @@ impl TenantSlot {
fn get_attached(&self) -> Option<&Arc<Tenant>> {
match self {
Self::Attached(t) => Some(t),
Self::Secondary => None,
Self::Secondary(_) => None,
Self::InProgress(_) => None,
}
}
@@ -130,7 +131,7 @@ impl TenantsMap {
/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
fn resolve_shard(
fn resolve_attached_shard(
&self,
tenant_id: &TenantId,
selector: ShardSelector,
@@ -140,25 +141,27 @@ impl TenantsMap {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
// Ignore all slots that don't contain an attached tenant
let tenant = match &slot.1 {
TenantSlot::Attached(t) => t,
_ => continue,
};
match selector {
ShardSelector::First => return Some(*slot.0),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return Some(*slot.0)
}
ShardSelector::Page(key) => {
if let Some(tenant) = slot.1.get_attached() {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
}
} else {
continue;
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
}
}
_ => continue,
@@ -464,12 +467,18 @@ pub async fn init_tenant_mgr(
*gen
} else {
match &location_conf.mode {
LocationMode::Secondary(_) => {
LocationMode::Secondary(secondary_config) => {
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
tenants.insert(tenant_shard_id, TenantSlot::Secondary);
tenants.insert(
tenant_shard_id,
TenantSlot::Secondary(SecondaryTenant::new(
tenant_shard_id,
secondary_config,
)),
);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
@@ -661,8 +670,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
total_attached += 1;
}
TenantSlot::Secondary => {
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
TenantSlot::Secondary(state) => {
// We don't need to wait for this individually per-tenant: the
// downloader task will be waited on eventually, this cancel
// is just to encourage it to drop out if it is doing work
// for this tenant right now.
state.cancel.cancel();
shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
}
TenantSlot::InProgress(notify) => {
// InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -845,12 +860,28 @@ impl TenantManager {
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary) => {
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
}
}
pub(crate) fn get_secondary_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
) -> Option<Arc<SecondaryTenant>> {
let locked = self.tenants.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.ok()
.flatten();
match peek_slot {
Some(TenantSlot::Secondary(s)) => Some(s.clone()),
_ => None,
}
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(crate) async fn upsert_location(
&self,
@@ -862,10 +893,15 @@ impl TenantManager {
debug_assert_current_span_has_tenant_id();
info!("configuring tenant location to state {new_location_config:?}");
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
enum FastPathModified {
Attached(Arc<Tenant>),
Secondary(Arc<SecondaryTenant>),
}
// Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
// then we do not need to set the slot to InProgress, we can just call into the
// existng tenant.
let modify_tenant = {
let fast_path_taken = {
let locked = self.tenants.read().unwrap();
let peek_slot =
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -879,12 +915,19 @@ impl TenantManager {
new_location_config.clone(),
)?);
Some(tenant.clone())
Some(FastPathModified::Attached(tenant.clone()))
} else {
// Different generations, fall through to general case
None
}
}
(
LocationMode::Secondary(secondary_conf),
Some(TenantSlot::Secondary(secondary_tenant)),
) => {
secondary_tenant.set_config(secondary_conf);
Some(FastPathModified::Secondary(secondary_tenant.clone()))
}
_ => {
// Not an Attached->Attached transition, fall through to general case
None
@@ -893,34 +936,51 @@ impl TenantManager {
};
// Fast-path continued: having dropped out of the self.tenants lock, do the async
// phase of waiting for flush, before returning.
if let Some(tenant) = modify_tenant {
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
// the caller set `flush`, then flush to remote storage.
if let LocationMode::Attached(AttachedLocationConfig {
generation: _,
attach_mode: AttachmentMode::Stale,
}) = &new_location_config.mode
{
if let Some(flush_timeout) = flush {
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
Ok(Err(e)) => {
return Err(e);
}
Ok(Ok(_)) => return Ok(()),
Err(_) => {
tracing::warn!(
// phase of writing config and/or waiting for flush, before returning.
match fast_path_taken {
Some(FastPathModified::Attached(tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
// the caller set `flush`, then flush to remote storage.
if let LocationMode::Attached(AttachedLocationConfig {
generation: _,
attach_mode: AttachmentMode::Stale,
}) = &new_location_config.mode
{
if let Some(flush_timeout) = flush {
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
Ok(Err(e)) => {
return Err(e);
}
Ok(Ok(_)) => return Ok(()),
Err(_) => {
tracing::warn!(
timeout_ms = flush_timeout.as_millis(),
"Timed out waiting for flush to remote storage, proceeding anyway."
)
}
}
}
}
}
return Ok(());
}
return Ok(());
}
Some(FastPathModified::Secondary(_secondary_tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
return Ok(());
}
None => {
// Proceed with the general case procedure, where we will shutdown & remove any existing
// slot contents and replace with a fresh one
}
};
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
// InProgress value to the slot while we make whatever changes are required. The state for
@@ -929,33 +989,47 @@ impl TenantManager {
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
AttachmentMode::Stale => {
// If we're stale there's not point trying to flush deletions
}
};
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
AttachmentMode::Stale => {
// If we're stale there's not point trying to flush deletions
}
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
}
slot_guard.drop_old_value().expect("We just shut it down");
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
state.shutdown().await;
}
Some(TenantSlot::InProgress(_)) => {
// This should never happen: acquire_slot should error out
// if the contents of a slot were InProgress.
anyhow::bail!("Acquired an InProgress slot, this is a bug.")
}
None => {
// Slot was vacant, nothing needs shutting down.
}
slot_guard.drop_old_value().expect("We just shut it down");
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
@@ -978,7 +1052,9 @@ impl TenantManager {
.map_err(SetNewTenantConfigError::Persist)?;
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => TenantSlot::Secondary,
LocationMode::Secondary(secondary_config) => {
TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config))
}
LocationMode::Attached(_attach_config) => {
let shard_identity = new_location_config.shard;
let tenant = tenant_spawn(
@@ -1091,6 +1167,30 @@ impl TenantManager {
.collect(),
}
}
// Do some synchronous work for all tenant slots in Secondary state. The provided
// callback should be small and fast, as it will be called inside the global
// TenantsMap lock.
pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
where
// TODO: let the callback return a hint to drop out of the loop early
F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
{
let locked = self.tenants.read().unwrap();
let map = match &*locked {
TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
TenantsMap::Open(m) => m,
};
for (tenant_id, slot) in map {
if let TenantSlot::Secondary(state) = slot {
// Only expose secondary tenants that are not currently shutting down
if !state.cancel.is_cancelled() {
func(tenant_id, state)
}
}
}
}
pub(crate) async fn delete_tenant(
&self,
@@ -1205,7 +1305,7 @@ pub(crate) fn get_tenant(
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary) => {
None | Some(TenantSlot::Secondary(_)) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
}
@@ -1257,9 +1357,11 @@ pub(crate) async fn get_active_tenant_with_timeout(
let locked = TENANTS.read().unwrap();
// Resolve TenantId to TenantShardId
let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
)?;
let tenant_shard_id = locked
.resolve_attached_shard(&tenant_id, shard_selector)
.ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
)))?;
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.map_err(GetTenantError::MapState)?;
@@ -1276,7 +1378,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
}
}
}
Some(TenantSlot::Secondary) => {
Some(TenantSlot::Secondary(_)) => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_id,
)))
@@ -1539,10 +1641,8 @@ pub(crate) async fn list_tenants(
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => {
Some((*id, tenant.current_state(), tenant.generation()))
}
TenantSlot::Secondary => None,
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state(), tenant.generation())),
TenantSlot::Secondary(_) => None,
TenantSlot::InProgress(_) => None,
})
.collect())
@@ -1799,11 +1899,7 @@ impl SlotGuard {
fn old_value_is_shutdown(&self) -> bool {
match self.old_value.as_ref() {
Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
Some(TenantSlot::Secondary) => {
// TODO: when adding secondary mode tenants, this will check for shutdown
// in the same way that we do for `Tenant` above
true
}
Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
Some(TenantSlot::InProgress(_)) => {
// A SlotGuard cannot be constructed for a slot that was already InProgress
unreachable!()
@@ -2013,26 +2109,19 @@ where
let mut slot_guard =
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
// The SlotGuard allows us to manipulate the Tenant object without fear of some
// concurrent API request doing something else for the same tenant ID.
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(t)) => Some(t),
_ => None,
};
// allow pageserver shutdown to await for our completion
let (_guard, progress) = completion::channel();
// If the tenant was attached, shut it down gracefully. For secondary
// locations this part is not necessary
match &attached_tenant {
Some(attached_tenant) => {
// The SlotGuard allows us to manipulate the Tenant object without fear of some
// concurrent API request doing something else for the same tenant ID.
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match attached_tenant.shutdown(progress, freeze_and_flush).await {
match tenant.shutdown(progress, freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -2041,11 +2130,19 @@ where
return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
}
}
Some(tenant)
}
None => {
// Nothing to wait on when not attached, proceed.
Some(TenantSlot::Secondary(secondary_state)) => {
tracing::info!("Shutting down in secondary mode");
secondary_state.shutdown().await;
None
}
}
Some(TenantSlot::InProgress(_)) => {
// Acquiring a slot guarantees its old value was not InProgress
unreachable!();
}
None => None,
};
match tenant_cleanup
.await

View File

@@ -229,6 +229,7 @@ use crate::{
tenant::upload_queue::{
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
},
TENANT_HEATMAP_BASENAME,
};
use utils::id::{TenantId, TimelineId};
@@ -818,8 +819,25 @@ impl RemoteTimelineClient {
fn schedule_deletion_of_unlinked0(
self: &Arc<Self>,
upload_queue: &mut UploadQueueInitialized,
with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
) {
// Filter out any layers which were not created by this tenant shard. These are
// layers that originate from some ancestor shard after a split, and may still
// be referenced by other shards. We are free to delete them locally and remove
// them from our index (and would have already done so when we reach this point
// in the code), but we may not delete them remotely.
with_metadata.retain(|(name, meta)| {
let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
&& meta.shard.shard_count == self.tenant_shard_id.shard_count;
if !retain {
tracing::debug!(
"Skipping deletion of ancestor-shard layer {name}, from shard {}",
meta.shard
);
}
retain
});
for (name, meta) in &with_metadata {
info!(
"scheduling deletion of layer {}{} (shard {})",
@@ -1724,11 +1742,11 @@ pub fn remote_index_path(
.expect("Failed to construct path")
}
pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
.expect("Failed to construct path")
RemotePath::from_string(&format!(
"tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
))
.expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name

View File

@@ -1,24 +1,48 @@
mod downloader;
pub mod heatmap;
mod heatmap_uploader;
mod scheduler;
use std::sync::Arc;
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use self::heatmap_uploader::heatmap_uploader_task;
use self::{
downloader::{downloader_task, SecondaryDetail},
heatmap_uploader::heatmap_uploader_task,
};
use super::mgr::TenantManager;
use super::{config::SecondaryLocationConfig, mgr::TenantManager};
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use utils::completion::Barrier;
use utils::{completion::Barrier, sync::gate::Gate};
enum DownloadCommand {
Download(TenantShardId),
}
enum UploadCommand {
Upload(TenantShardId),
}
impl UploadCommand {
fn get_tenant_shard_id(&self) -> &TenantShardId {
match self {
Self::Upload(id) => id,
}
}
}
impl DownloadCommand {
fn get_tenant_shard_id(&self) -> &TenantShardId {
match self {
Self::Download(id) => id,
}
}
}
struct CommandRequest<T> {
payload: T,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
@@ -28,12 +52,73 @@ struct CommandResponse {
result: anyhow::Result<()>,
}
// Whereas [`Tenant`] represents an attached tenant, this type represents the work
// we do for secondary tenant locations: where we are not serving clients or
// ingesting WAL, but we are maintaining a warm cache of layer files.
//
// This type is all about the _download_ path for secondary mode. The upload path
// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists.
//
// This structure coordinates TenantManager and SecondaryDownloader,
// so that the downloader can indicate which tenants it is currently
// operating on, and the manager can indicate when a particular
// secondary tenant should cancel any work in flight.
#[derive(Debug)]
pub(crate) struct SecondaryTenant {
/// Carrying a tenant shard ID simplifies callers such as the downloader
/// which need to organize many of these objects by ID.
tenant_shard_id: TenantShardId,
/// Cancellation token indicates to SecondaryDownloader that it should stop doing
/// any work for this tenant at the next opportunity.
pub(crate) cancel: CancellationToken,
pub(crate) gate: Gate,
detail: std::sync::Mutex<SecondaryDetail>,
}
impl SecondaryTenant {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
config: &SecondaryLocationConfig,
) -> Arc<Self> {
Arc::new(Self {
tenant_shard_id,
// todo: shall we make this a descendent of the
// main cancellation token, or is it sufficient that
// on shutdown we walk the tenants and fire their
// individual cancellations?
cancel: CancellationToken::new(),
gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
})
}
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
// Wait for any secondary downloader work to complete
self.gate.close().await;
}
pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
self.detail.lock().unwrap().config = config.clone();
}
fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
}
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
/// where we want to immediately upload/download for a particular tenant. In normal operation
/// uploads & downloads are autonomous and not driven by this interface.
pub struct SecondaryController {
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
}
impl SecondaryController {
@@ -63,6 +148,13 @@ impl SecondaryController {
self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
.await
}
pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
self.dispatch(
&self.download_req_tx,
DownloadCommand::Download(tenant_shard_id),
)
.await
}
}
pub fn spawn_tasks(
@@ -71,9 +163,37 @@ pub fn spawn_tasks(
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> SecondaryController {
let mgr_clone = tenant_manager.clone();
let storage_clone = remote_storage.clone();
let cancel_clone = cancel.clone();
let bg_jobs_clone = background_jobs_can_start.clone();
let (download_req_tx, download_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
let (upload_req_tx, upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryDownloads,
None,
None,
"secondary tenant downloads",
false,
async move {
downloader_task(
mgr_clone,
storage_clone,
download_req_rx,
bg_jobs_clone,
cancel_clone,
)
.await;
Ok(())
},
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryUploads,
@@ -89,16 +209,26 @@ pub fn spawn_tasks(
background_jobs_can_start,
cancel,
)
.await
.await;
Ok(())
},
);
SecondaryController { upload_req_tx }
SecondaryController {
download_req_tx,
upload_req_tx,
}
}
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
pub fn null_controller() -> SecondaryController {
let (download_req_tx, _download_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
let (upload_req_tx, _upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
SecondaryController { upload_req_tx }
SecondaryController {
upload_req_tx,
download_req_tx,
}
}

View File

@@ -0,0 +1,801 @@
use std::{
collections::{HashMap, HashSet},
pin::Pin,
str::FromStr,
sync::Arc,
time::{Duration, Instant, SystemTime},
};
use crate::{
config::PageServerConf,
metrics::SECONDARY_MODE,
tenant::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
remote_timeline_client::{
index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
};
use super::{
heatmap::HeatMapLayer,
scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
SecondaryTenant,
};
use crate::tenant::{
mgr::TenantManager,
remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
};
use chrono::format::{DelayedFormat, StrftimeItems};
use futures::Future;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{DownloadError, GenericRemoteStorage};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, instrument, Instrument};
use utils::{
backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
};
use super::{
heatmap::{HeatMapTenant, HeatMapTimeline},
CommandRequest, DownloadCommand,
};
/// For each tenant, how long must have passed since the last download_tenant call before
/// calling it again. This is approximately the time by which local data is allowed
/// to fall behind remote data.
///
/// TODO: this should just be a default, and the actual period should be controlled
/// via the heatmap itself
/// `<ttps://github.com/neondatabase/neon/issues/6200>`
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
pub(super) async fn downloader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) {
let concurrency = tenant_manager.get_conf().secondary_download_concurrency;
let generator = SecondaryDownloader {
tenant_manager,
remote_storage,
};
let mut scheduler = Scheduler::new(generator, concurrency);
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("secondary_downloads"))
.await
}
struct SecondaryDownloader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
}
#[derive(Debug, Clone)]
pub(super) struct OnDiskState {
metadata: LayerFileMetadata,
access_time: SystemTime,
}
impl OnDiskState {
fn new(
_conf: &'static PageServerConf,
_tenant_shard_id: &TenantShardId,
_imeline_id: &TimelineId,
_ame: LayerFileName,
metadata: LayerFileMetadata,
access_time: SystemTime,
) -> Self {
Self {
metadata,
access_time,
}
}
}
#[derive(Debug, Clone, Default)]
pub(super) struct SecondaryDetailTimeline {
pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
/// We remember when layers were evicted, to prevent re-downloading them.
pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
}
/// This state is written by the secondary downloader, it is opaque
/// to TenantManager
#[derive(Debug)]
pub(super) struct SecondaryDetail {
pub(super) config: SecondaryLocationConfig,
last_download: Option<Instant>,
next_download: Option<Instant>,
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
}
/// Helper for logging SystemTime
fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
datetime.format("%d/%m/%Y %T")
}
impl SecondaryDetail {
pub(super) fn new(config: SecondaryLocationConfig) -> Self {
Self {
config,
last_download: None,
next_download: None,
timelines: HashMap::new(),
}
}
}
struct PendingDownload {
secondary_state: Arc<SecondaryTenant>,
last_download: Option<Instant>,
target_time: Option<Instant>,
period: Option<Duration>,
}
impl scheduler::PendingJob for PendingDownload {
fn get_tenant_shard_id(&self) -> &TenantShardId {
self.secondary_state.get_tenant_shard_id()
}
}
struct RunningDownload {
barrier: Barrier,
}
impl scheduler::RunningJob for RunningDownload {
fn get_barrier(&self) -> Barrier {
self.barrier.clone()
}
}
struct CompleteDownload {
secondary_state: Arc<SecondaryTenant>,
completed_at: Instant,
}
impl scheduler::Completion for CompleteDownload {
fn get_tenant_shard_id(&self) -> &TenantShardId {
self.secondary_state.get_tenant_shard_id()
}
}
type Scheduler = TenantBackgroundJobs<
SecondaryDownloader,
PendingDownload,
RunningDownload,
CompleteDownload,
DownloadCommand,
>;
#[async_trait::async_trait]
impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
for SecondaryDownloader
{
#[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
fn on_completion(&mut self, completion: CompleteDownload) {
let CompleteDownload {
secondary_state,
completed_at: _completed_at,
} = completion;
tracing::debug!("Secondary tenant download completed");
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
// take priority to run again.
let mut detail = secondary_state.detail.lock().unwrap();
detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
}
async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
let mut result = SchedulingResult {
jobs: Vec::new(),
want_interval: None,
};
// Step 1: identify some tenants that we may work on
let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
self.tenant_manager
.foreach_secondary_tenants(|_id, secondary_state| {
tenants.push(secondary_state.clone());
});
// Step 2: filter out tenants which are not yet elegible to run
let now = Instant::now();
result.jobs = tenants
.into_iter()
.filter_map(|secondary_tenant| {
let (last_download, next_download) = {
let mut detail = secondary_tenant.detail.lock().unwrap();
if !detail.config.warm {
// Downloads are disabled for this tenant
detail.next_download = None;
return None;
}
if detail.next_download.is_none() {
// Initialize with a jitter: this spreads initial downloads on startup
// or mass-attach across our freshen interval.
let jittered_period =
rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
detail.next_download = Some(now.checked_add(jittered_period).expect(
"Using our constant, which is known to be small compared with clock range",
));
}
(detail.last_download, detail.next_download.unwrap())
};
if now < next_download {
Some(PendingDownload {
secondary_state: secondary_tenant,
last_download,
target_time: Some(next_download),
period: Some(DOWNLOAD_FRESHEN_INTERVAL),
})
} else {
None
}
})
.collect();
// Step 3: sort by target execution time to run most urgent first.
result.jobs.sort_by_key(|j| j.target_time);
result
}
fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
let tenant_shard_id = command.get_tenant_shard_id();
let tenant = self
.tenant_manager
.get_secondary_tenant_shard(*tenant_shard_id);
let Some(tenant) = tenant else {
{
return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
}
};
Ok(PendingDownload {
target_time: None,
period: None,
last_download: None,
secondary_state: tenant,
})
}
fn spawn(
&mut self,
job: PendingDownload,
) -> (
RunningDownload,
Pin<Box<dyn Future<Output = CompleteDownload> + Send>>,
) {
let PendingDownload {
secondary_state,
last_download,
target_time,
period,
} = job;
let (completion, barrier) = utils::completion::channel();
let remote_storage = self.remote_storage.clone();
let conf = self.tenant_manager.get_conf();
let tenant_shard_id = *secondary_state.get_tenant_shard_id();
(RunningDownload { barrier }, Box::pin(async move {
let _completion = completion;
match TenantDownloader::new(conf, &remote_storage, &secondary_state)
.download()
.await
{
Err(UpdateError::NoData) => {
tracing::info!("No heatmap found for tenant. This is fine if it is new.");
},
Err(UpdateError::NoSpace) => {
tracing::warn!("Insufficient space while downloading. Will retry later.");
}
Err(UpdateError::Cancelled) => {
tracing::debug!("Shut down while downloading");
},
Err(UpdateError::Deserialize(e)) => {
tracing::error!("Corrupt content while downloading tenant: {e}");
},
Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
tracing::error!("Error while downloading tenant: {e}");
},
Ok(()) => {}
};
// Irrespective of the result, we will reschedule ourselves to run after our usual period.
// If the job had a target execution time, we may check our final execution
// time against that for observability purposes.
if let (Some(target_time), Some(period)) = (target_time, period) {
// Only track execution lag if this isn't our first download: otherwise, it is expected
// that execution will have taken longer than our configured interval, for example
// when starting up a pageserver and
if last_download.is_some() {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = Instant::now().duration_since(target_time);
warn_when_period_overrun(
elapsed,
period,
BackgroundLoopKind::SecondaryDownload,
);
}
}
CompleteDownload {
secondary_state,
completed_at: Instant::now(),
}
}.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
}
}
/// This type is a convenience to group together the various functions involved in
/// freshening a secondary tenant.
struct TenantDownloader<'a> {
conf: &'static PageServerConf,
remote_storage: &'a GenericRemoteStorage,
secondary_state: &'a SecondaryTenant,
}
/// Errors that may be encountered while updating a tenant
#[derive(thiserror::Error, Debug)]
enum UpdateError {
#[error("No remote data found")]
NoData,
#[error("Insufficient local storage space")]
NoSpace,
#[error("Failed to download")]
DownloadError(DownloadError),
#[error(transparent)]
Deserialize(#[from] serde_json::Error),
#[error("Cancelled")]
Cancelled,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl From<DownloadError> for UpdateError {
fn from(value: DownloadError) -> Self {
match &value {
DownloadError::Cancelled => Self::Cancelled,
DownloadError::NotFound => Self::NoData,
_ => Self::DownloadError(value),
}
}
}
impl From<std::io::Error> for UpdateError {
fn from(value: std::io::Error) -> Self {
if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) {
UpdateError::NoSpace
} else {
// An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
UpdateError::Other(anyhow::anyhow!(value))
}
}
}
impl<'a> TenantDownloader<'a> {
fn new(
conf: &'static PageServerConf,
remote_storage: &'a GenericRemoteStorage,
secondary_state: &'a SecondaryTenant,
) -> Self {
Self {
conf,
remote_storage,
secondary_state,
}
}
async fn download(&self) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_id();
// For the duration of a download, we must hold the SecondaryTenant::gate, to ensure
// cover our access to local storage.
let Ok(_guard) = self.secondary_state.gate.enter() else {
// Shutting down
return Ok(());
};
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// Download the tenant's heatmap
let heatmap_bytes = tokio::select!(
bytes = self.download_heatmap() => {bytes?},
_ = self.secondary_state.cancel.cancelled() => return Ok(())
);
let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
// Save the heatmap: this will be useful on restart, allowing us to reconstruct
// layer metadata without having to re-download it.
let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
let heatmap_path_bg = heatmap_path.clone();
tokio::task::spawn_blocking(move || {
tokio::runtime::Handle::current().block_on(async move {
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
})
})
.await
.expect("Blocking task is never aborted")
.maybe_fatal_err(&context_msg)?;
tracing::debug!("Wrote local heatmap to {}", heatmap_path);
// Download the layers in the heatmap
for timeline in heatmap.timelines {
if self.secondary_state.cancel.is_cancelled() {
return Ok(());
}
let timeline_id = timeline.timeline_id;
self.download_timeline(timeline)
.instrument(tracing::info_span!(
"secondary_download_timeline",
tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
%timeline_id
))
.await?;
}
Ok(())
}
async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
debug_assert_current_span_has_tenant_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
// TODO: make download conditional on ETag having changed since last download
// (https://github.com/neondatabase/neon/issues/6199)
tracing::debug!("Downloading heatmap for secondary tenant",);
let heatmap_path = remote_heatmap_path(tenant_shard_id);
let heatmap_bytes = backoff::retry(
|| async {
let download = self
.remote_storage
.download(&heatmap_path)
.await
.map_err(UpdateError::from)?;
let mut heatmap_bytes = Vec::new();
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
Ok(heatmap_bytes)
},
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"download heatmap",
backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
UpdateError::Cancelled
}),
)
.await?;
SECONDARY_MODE.download_heatmap.inc();
Ok(heatmap_bytes)
}
async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
let timeline_path = self
.conf
.timeline_path(tenant_shard_id, &timeline.timeline_id);
// Accumulate updates to the state
let mut touched = Vec::new();
// Clone a view of what layers already exist on disk
let timeline_state = self
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
// Re-acquire detail lock now that we're done with async load from local FS
self.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
let layers_in_heatmap = timeline
.layers
.iter()
.map(|l| &l.name)
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
.on_disk_layers
.iter()
.map(|l| l.0)
.collect::<HashSet<_>>();
// Remove on-disk layers that are no longer present in heatmap
for layer in layers_on_disk.difference(&layers_in_heatmap) {
let local_path = timeline_path.join(layer.to_string());
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)
.maybe_fatal_err("Removing secondary layer")?;
}
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
if self.secondary_state.cancel.is_cancelled() {
return Ok(());
}
// Existing on-disk layers: just update their access time.
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
tracing::debug!("Layer {} is already on disk", layer.name);
if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{
// We already have this layer on disk. Update its access time.
tracing::debug!(
"Access time updated for layer {}: {} -> {}",
layer.name,
strftime(&on_disk.access_time),
strftime(&layer.access_time)
);
touched.push(layer);
}
continue;
} else {
tracing::debug!("Layer {} not present on disk yet", layer.name);
}
// Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
// recently than it was evicted.
if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
if &layer.access_time > evicted_at {
tracing::info!(
"Re-downloading evicted layer {}, accessed at {}, evicted at {}",
layer.name,
strftime(&layer.access_time),
strftime(evicted_at)
);
} else {
tracing::trace!(
"Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
layer.name,
strftime(&layer.access_time),
strftime(evicted_at)
);
continue;
}
}
// Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
let downloaded_bytes = match download_layer_file(
self.conf,
self.remote_storage,
*tenant_shard_id,
timeline.timeline_id,
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
&self.secondary_state.cancel,
)
.await
{
Ok(bytes) => bytes,
Err(e) => {
if let DownloadError::NotFound = e {
// A heatmap might be out of date and refer to a layer that doesn't exist any more.
// This is harmless: continue to download the next layer. It is expected during compaction
// GC.
tracing::debug!(
"Skipped downloading missing layer {}, raced with compaction/gc?",
layer.name
);
continue;
} else {
return Err(e.into());
}
}
};
if downloaded_bytes != layer.metadata.file_size {
let local_path = timeline_path.join(layer.name.to_string());
tracing::warn!(
"Downloaded layer {} with unexpected size {} != {}. Removing download.",
layer.name,
downloaded_bytes,
layer.metadata.file_size
);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
}
SECONDARY_MODE.download_layer.inc();
touched.push(layer)
}
// Write updates to state to record layers we just downloaded or touched.
{
let mut detail = self.secondary_state.detail.lock().unwrap();
let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
for t in touched {
use std::collections::hash_map::Entry;
match timeline_detail.on_disk_layers.entry(t.name.clone()) {
Entry::Occupied(mut v) => {
v.get_mut().access_time = t.access_time;
}
Entry::Vacant(e) => {
e.insert(OnDiskState::new(
self.conf,
tenant_shard_id,
&timeline.timeline_id,
t.name,
LayerFileMetadata::from(&t.metadata),
t.access_time,
));
}
}
}
}
Ok(())
}
}
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
async fn init_timeline_state(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
heatmap: &HeatMapTimeline,
) -> SecondaryDetailTimeline {
let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
let mut detail = SecondaryDetailTimeline::default();
let mut dir = match tokio::fs::read_dir(&timeline_path).await {
Ok(d) => d,
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
let context = format!("Creating timeline directory {timeline_path}");
tracing::info!("{}", context);
tokio::fs::create_dir_all(&timeline_path)
.await
.fatal_err(&context);
// No entries to report: drop out.
return detail;
} else {
on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
}
}
};
// As we iterate through layers found on disk, we will look up their metadata from this map.
// Layers not present in metadata will be discarded.
let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
while let Some(dentry) = dir
.next_entry()
.await
.fatal_err(&format!("Listing {timeline_path}"))
{
let dentry_file_name = dentry.file_name();
let file_name = dentry_file_name.to_string_lossy();
let local_meta = dentry.metadata().await.fatal_err(&format!(
"Read metadata on {}",
dentry.path().to_string_lossy()
));
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
if file_name == METADATA_FILE_NAME {
continue;
}
match LayerFileName::from_str(&file_name) {
Ok(name) => {
let remote_meta = heatmap_metadata.get(&name);
match remote_meta {
Some(remote_meta) => {
// TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
if local_meta.len() != remote_meta.metadata.file_size {
// This should not happen, because we do crashsafe write-then-rename when downloading
// layers, and layers in remote storage are immutable. Remove the local file because
// we cannot trust it.
tracing::warn!(
"Removing local layer {name} with unexpected local size {} != {}",
local_meta.len(),
remote_meta.metadata.file_size
);
} else {
// We expect the access time to be initialized immediately afterwards, when
// the latest heatmap is applied to the state.
detail.on_disk_layers.insert(
name.clone(),
OnDiskState::new(
conf,
tenant_shard_id,
&heatmap.timeline_id,
name,
LayerFileMetadata::from(&remote_meta.metadata),
remote_meta.access_time,
),
);
}
}
None => {
// FIXME: consider some optimization when transitioning from attached to secondary: maybe
// wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise
// we will end up deleting any layers which were created+uploaded more recently than the heatmap.
tracing::info!(
"Removing secondary local layer {} because it's absent in heatmap",
name
);
tokio::fs::remove_file(&dentry.path())
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err(&format!(
"Removing layer {}",
dentry.path().to_string_lossy()
));
}
}
}
Err(_) => {
// Ignore it.
tracing::warn!("Unexpected file in timeline directory: {file_name}");
}
}
}
detail
}

View File

@@ -1,5 +1,6 @@
use std::{
collections::HashMap,
pin::Pin,
sync::{Arc, Weak},
time::{Duration, Instant},
};
@@ -7,35 +8,86 @@ use std::{
use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
config::AttachmentMode,
mgr::TenantManager,
remote_timeline_client::remote_heatmap_path,
span::debug_assert_current_span_has_tenant_id,
tasks::{warn_when_period_overrun, BackgroundLoopKind},
Tenant,
},
};
use futures::Future;
use md5;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::GenericRemoteStorage;
use tokio::task::JoinSet;
use super::{
scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
CommandRequest,
};
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{backoff, completion::Barrier};
use tracing::{info_span, instrument, Instrument};
use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};
use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
use super::{heatmap::HeatMapTenant, UploadCommand};
/// Period between heatmap uploader walking Tenants to look for work to do.
/// If any tenants have a heatmap upload period lower than this, it will be adjusted
/// downward to match.
const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) {
let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
let generator = HeatmapUploader {
tenant_manager,
remote_storage,
cancel: cancel.clone(),
tenants: HashMap::new(),
};
let mut scheduler = Scheduler::new(generator, concurrency);
scheduler
.run(command_queue, background_jobs_can_start, cancel)
.instrument(info_span!("heatmap_uploader"))
.await
}
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
/// handling loop and mutates it as needed: there are no locks here, because that event loop
/// can hold &mut references to this type throughout.
struct HeatmapUploader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
tenants: HashMap<TenantShardId, UploaderTenantState>,
}
struct WriteInProgress {
barrier: Barrier,
}
impl RunningJob for WriteInProgress {
fn get_barrier(&self) -> Barrier {
self.barrier.clone()
}
}
struct UploadPending {
tenant: Arc<Tenant>,
last_digest: Option<md5::Digest>,
target_time: Option<Instant>,
period: Option<Duration>,
}
impl scheduler::PendingJob for UploadPending {
fn get_tenant_shard_id(&self) -> &TenantShardId {
self.tenant.get_tenant_shard_id()
}
}
struct WriteComplete {
@@ -45,6 +97,12 @@ struct WriteComplete {
next_upload: Option<Instant>,
}
impl scheduler::Completion for WriteComplete {
fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
}
/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
/// when we last did a write. We only populate this after doing at least one
/// write for a tenant -- this avoids holding state for tenants that have
@@ -68,267 +126,111 @@ struct UploaderTenantState {
next_upload: Option<Instant>,
}
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
/// handling loop and mutates it as needed: there are no locks here, because that event loop
/// can hold &mut references to this type throughout.
struct HeatmapUploader {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
type Scheduler = TenantBackgroundJobs<
HeatmapUploader,
UploadPending,
WriteInProgress,
WriteComplete,
UploadCommand,
>;
tenants: HashMap<TenantShardId, UploaderTenantState>,
/// Tenants with work to do, for which tasks should be spawned as soon as concurrency
/// limits permit it.
tenants_pending: std::collections::VecDeque<UploadPending>,
/// Tenants for which a task in `tasks` has been spawned.
tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
tasks: JoinSet<()>,
/// Channel for our child tasks to send results to: we use a channel for results rather than
/// just getting task results via JoinSet because we need the channel's recv() "sleep until something
/// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
/// behavior.
task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
concurrent_uploads: usize,
scheduling_interval: Duration,
}
/// The uploader task runs a loop that periodically wakes up and schedules tasks for
/// tenants that require an upload, or handles any commands that have been sent into
/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we
/// spawn.
///
/// Scheduling iterations are somewhat infrequent. However, each one will enqueue
/// all tenants that require an upload, and in between scheduling iterations we will
/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
///
/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
/// we might block waiting on a Tenant.
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
let mut uploader = HeatmapUploader {
tenant_manager,
remote_storage,
cancel: cancel.clone(),
tasks: JoinSet::new(),
tenants: HashMap::new(),
tenants_pending: std::collections::VecDeque::new(),
tenants_uploading: HashMap::new(),
task_result_tx: result_tx,
task_result_rx: result_rx,
concurrent_uploads,
scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
};
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
while !cancel.is_cancelled() {
// Look for new work: this is relatively expensive because we have to go acquire the lock on
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
// require an upload.
uploader.schedule_iteration().await?;
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
// - Check our cancellation token
let next_scheduling_iteration = Instant::now()
.checked_add(uploader.scheduling_interval)
.unwrap_or_else(|| {
tracing::warn!(
"Scheduling interval invalid ({}s), running immediately!",
uploader.scheduling_interval.as_secs_f64()
);
Instant::now()
});
loop {
tokio::select! {
_ = cancel.cancelled() => {
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
tracing::info!("Heatmap uploader joining tasks");
while let Some(_r) = uploader.tasks.join_next().await {};
tracing::info!("Heatmap uploader terminating");
break;
},
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
break;},
cmd = command_queue.recv() => {
tracing::debug!("heatmap_uploader_task: woke for command queue");
let cmd = match cmd {
Some(c) =>c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("Heatmap uploader terminating");
cancel.cancel();
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
uploader.handle_command(payload, response_tx);
},
_ = uploader.process_next_completion() => {
if !cancel.is_cancelled() {
uploader.spawn_pending();
}
}
}
}
}
Ok(())
}
impl HeatmapUploader {
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
#[async_trait::async_trait]
impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
for HeatmapUploader
{
async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
// Cull any entries in self.tenants whose Arc<Tenant> is gone
self.tenants
.retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());
// The priority order of previously scheduled work may be invalidated by current state: drop
// all pending work (it will be re-scheduled if still needed)
self.tenants_pending.clear();
// Used a fixed 'now' through the following loop, for efficiency and fairness.
let now = Instant::now();
// While iterating over the potentially-long list of tenants, we will periodically yield
// to avoid blocking executor.
const YIELD_ITERATIONS: usize = 1000;
let mut result = SchedulingResult {
jobs: Vec::new(),
want_interval: None,
};
// Iterate over tenants looking for work to do.
let tenants = self.tenant_manager.get_attached_active_tenant_shards();
for (i, tenant) in tenants.into_iter().enumerate() {
// Process is shutting down, drop out
if self.cancel.is_cancelled() {
return Ok(());
}
// Skip tenants that already have a write in flight
if self
.tenants_uploading
.contains_key(tenant.get_tenant_shard_id())
{
continue;
}
yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
let period = match tenant.get_heatmap_period() {
None => {
// Heatmaps are disabled for this tenant
return;
}
Some(period) => {
// If any tenant has asked for uploads more frequent than our scheduling interval,
// reduce it to match so that we can keep up. This is mainly useful in testing, where
// we may set rather short intervals.
result.want_interval = match result.want_interval {
None => Some(period),
Some(existing) => Some(std::cmp::min(period, existing)),
};
self.maybe_schedule_upload(&now, tenant);
period
}
};
if i + 1 % YIELD_ITERATIONS == 0 {
tokio::task::yield_now().await;
}
}
// Spawn tasks for as many of our pending tenants as we can.
self.spawn_pending();
Ok(())
}
///
/// Cancellation: this method is cancel-safe.
async fn process_next_completion(&mut self) {
match self.task_result_rx.recv().await {
Some(r) => {
self.on_completion(r);
}
None => {
unreachable!("Result sender is stored on Self");
}
}
}
/// The 'maybe' refers to the tenant's state: whether it is configured
/// for heatmap uploads at all, and whether sufficient time has passed
/// since the last upload.
fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
match tenant.get_heatmap_period() {
None => {
// Heatmaps are disabled for this tenant
// Stale attachments do not upload anything: if we are in this state, there is probably some
// other attachment in mode Single or Multi running on another pageserver, and we don't
// want to thrash and overwrite their heatmap uploads.
if tenant.get_attach_mode() == AttachmentMode::Stale {
return;
}
Some(period) => {
// If any tenant has asked for uploads more frequent than our scheduling interval,
// reduce it to match so that we can keep up. This is mainly useful in testing, where
// we may set rather short intervals.
if period < self.scheduling_interval {
self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
}
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
// with the completion time in on_completion.
let state = self
.tenants
.entry(*tenant.get_tenant_shard_id())
.or_insert_with(|| {
let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
last_digest: None,
}
});
// Decline to do the upload if insufficient time has passed
if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
return;
}
}
// Stale attachments do not upload anything: if we are in this state, there is probably some
// other attachment in mode Single or Multi running on another pageserver, and we don't
// want to thrash and overwrite their heatmap uploads.
if tenant.get_attach_mode() == AttachmentMode::Stale {
return;
}
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
// with the completion time in on_completion.
let state = self
.tenants
.entry(*tenant.get_tenant_shard_id())
.or_insert_with(|| UploaderTenantState {
tenant: Arc::downgrade(&tenant),
last_upload: None,
next_upload: Some(Instant::now()),
last_digest: None,
let last_digest = state.last_digest;
result.jobs.push(UploadPending {
tenant,
last_digest,
target_time: state.next_upload,
period: Some(period),
});
})
.await
.ok();
// Decline to do the upload if insufficient time has passed
if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
return;
}
result
}
let last_digest = state.last_digest;
self.tenants_pending.push_back(UploadPending {
fn spawn(
&mut self,
job: UploadPending,
) -> (
WriteInProgress,
Pin<Box<dyn Future<Output = WriteComplete> + Send>>,
) {
let UploadPending {
tenant,
last_digest,
})
}
target_time,
period,
} = job;
fn spawn_pending(&mut self) {
while !self.tenants_pending.is_empty()
&& self.tenants_uploading.len() < self.concurrent_uploads
{
// unwrap: loop condition includes !is_empty()
let pending = self.tenants_pending.pop_front().unwrap();
self.spawn_upload(pending.tenant, pending.last_digest);
}
}
fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
let remote_storage = self.remote_storage.clone();
let tenant_shard_id = *tenant.get_tenant_shard_id();
let (completion, barrier) = utils::completion::channel();
let result_tx = self.task_result_tx.clone();
self.tasks.spawn(async move {
let tenant_shard_id = *tenant.get_tenant_shard_id();
(WriteInProgress { barrier }, Box::pin(async move {
// Guard for the barrier in [`WriteInProgress`]
let _completion = completion;
@@ -362,22 +264,47 @@ impl HeatmapUploader {
};
let now = Instant::now();
// If the job had a target execution time, we may check our final execution
// time against that for observability purposes.
if let (Some(target_time), Some(period)) = (target_time, period) {
// Elapsed time includes any scheduling lag as well as the execution of the job
let elapsed = now.duration_since(target_time);
warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
}
let next_upload = tenant
.get_heatmap_period()
.and_then(|period| now.checked_add(period));
result_tx
.send(WriteComplete {
WriteComplete {
tenant_shard_id: *tenant.get_tenant_shard_id(),
completed_at: now,
digest,
next_upload,
})
.ok();
});
}
}.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
}
self.tenants_uploading
.insert(tenant_shard_id, WriteInProgress { barrier });
fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
let tenant_shard_id = command.get_tenant_shard_id();
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Starting heatmap write on command");
let tenant = self
.tenant_manager
.get_attached_tenant_shard(*tenant_shard_id, true)
.map_err(|e| anyhow::anyhow!(e))?;
Ok(UploadPending {
// Ignore our state for last digest: this forces an upload even if nothing has changed
last_digest: None,
tenant,
target_time: None,
period: None,
})
}
#[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -389,7 +316,6 @@ impl HeatmapUploader {
digest,
next_upload,
} = completion;
self.tenants_uploading.remove(&tenant_shard_id);
use std::collections::hash_map::Entry;
match self.tenants.entry(tenant_shard_id) {
Entry::Vacant(_) => {
@@ -402,69 +328,6 @@ impl HeatmapUploader {
}
}
}
fn handle_command(
&mut self,
command: UploadCommand,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
) {
match command {
UploadCommand::Upload(tenant_shard_id) => {
// If an upload was ongoing for this tenant, let it finish first.
let barrier = if let Some(writing_state) =
self.tenants_uploading.get(&tenant_shard_id)
{
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap write to complete");
writing_state.barrier.clone()
} else {
// Spawn the upload then immediately wait for it. This will block processing of other commands and
// starting of other background work.
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Starting heatmap write on command");
let tenant = match self
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id, true)
{
Ok(t) => t,
Err(e) => {
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse {
result: Err(e.into()),
}));
return;
}
};
self.spawn_upload(tenant, None);
let writing_state = self
.tenants_uploading
.get(&tenant_shard_id)
.expect("We just inserted this");
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Waiting for heatmap upload to complete");
writing_state.barrier.clone()
};
// This task does no I/O: it only listens for a barrier's completion and then
// sends to the command response channel. It is therefore safe to spawn this without
// any gates/task_mgr hooks.
tokio::task::spawn(async move {
barrier.wait().await;
tracing::info!(
tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
"Heatmap upload complete");
// Drop result of send: we don't care if caller dropped their receiver
drop(response_tx.send(CommandResponse { result: Ok(()) }))
});
}
}
}
}
enum UploadHeatmapOutcome {
@@ -487,7 +350,6 @@ enum UploadHeatmapError {
/// The inner upload operation. This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
async fn upload_tenant_heatmap(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,

View File

@@ -0,0 +1,361 @@
use async_trait;
use futures::Future;
use std::{
collections::HashMap,
marker::PhantomData,
pin::Pin,
time::{Duration, Instant},
};
use pageserver_api::shard::TenantShardId;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use utils::{completion::Barrier, yielding_loop::yielding_loop};
use super::{CommandRequest, CommandResponse};
/// Scheduling interval is the time between calls to JobGenerator::schedule.
/// When we schedule jobs, the job generator may provide a hint of its preferred
/// interval, which we will respect within these intervals.
const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
/// Scheduling helper for background work across many tenants.
///
/// Systems that need to run background work across many tenants may use this type
/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`]
/// implementation to provide the work to execute. This is a simple scheduler that just
/// polls the generator for outstanding work, replacing its queue of pending work with
/// what the generator yields on each call: the job generator can change its mind about
/// the order of jobs between calls. The job generator is notified when jobs complete,
/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement
/// admin APIs).
///
/// For an example see [`crate::tenant::secondary::heatmap_uploader`]
///
/// G: A JobGenerator that this scheduler will poll to find pending jobs
/// PJ: 'Pending Job': type for job descriptors that are ready to run
/// RJ: 'Running Job' type' for jobs that have been spawned
/// C : 'Completion' type that spawned jobs will send when they finish
/// CMD: 'Command' type that the job generator will accept to create jobs on-demand
pub(super) struct TenantBackgroundJobs<G, PJ, RJ, C, CMD>
where
G: JobGenerator<PJ, RJ, C, CMD>,
C: Completion,
PJ: PendingJob,
RJ: RunningJob,
{
generator: G,
/// Ready to run. Will progress to `running` once concurrent limit is satisfied, or
/// be removed on next scheduling pass.
pending: std::collections::VecDeque<PJ>,
/// Tasks currently running in Self::tasks for these tenants. Check this map
/// before pushing more work into pending for the same tenant.
running: HashMap<TenantShardId, RJ>,
tasks: JoinSet<C>,
concurrency: usize,
/// How often we would like schedule_interval to be called.
pub(super) scheduling_interval: Duration,
_phantom: PhantomData<(PJ, RJ, C, CMD)>,
}
#[async_trait::async_trait]
pub(crate) trait JobGenerator<PJ, RJ, C, CMD>
where
C: Completion,
PJ: PendingJob,
RJ: RunningJob,
{
/// Called at each scheduling interval. Return a list of jobs to run, most urgent first.
///
/// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
/// Implementations should take care to yield the executor periodically if running
/// very long loops.
///
/// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
/// jobs is not drained by the next scheduling interval, pending jobs will be cleared
/// and re-generated.
async fn schedule(&mut self) -> SchedulingResult<PJ>;
/// Called when a pending job is ready to be run.
///
/// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it.
fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);
/// Called when a job previously spawned with spawn() transmits its completion
fn on_completion(&mut self, completion: C);
/// Called when a command is received. A job will be spawned immediately if the return
/// value is Some, ignoring concurrency limits and the pending queue.
fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
}
/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
pub(super) struct SchedulingResult<PJ> {
pub(super) jobs: Vec<PJ>,
/// The job generator would like to be called again this soon
pub(super) want_interval: Option<Duration>,
}
/// See [`TenantBackgroundJobs`].
pub(super) trait PendingJob {
fn get_tenant_shard_id(&self) -> &TenantShardId;
}
/// See [`TenantBackgroundJobs`].
pub(super) trait Completion: Send + 'static {
fn get_tenant_shard_id(&self) -> &TenantShardId;
}
/// See [`TenantBackgroundJobs`].
pub(super) trait RunningJob {
fn get_barrier(&self) -> Barrier;
}
impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
where
C: Completion,
PJ: PendingJob,
RJ: RunningJob,
G: JobGenerator<PJ, RJ, C, CMD>,
{
pub(super) fn new(generator: G, concurrency: usize) -> Self {
Self {
generator,
pending: std::collections::VecDeque::new(),
running: HashMap::new(),
tasks: JoinSet::new(),
concurrency,
scheduling_interval: MAX_SCHEDULING_INTERVAL,
_phantom: PhantomData,
}
}
pub(super) async fn run(
&mut self,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) {
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
while !cancel.is_cancelled() {
// Look for new work: this is relatively expensive because we have to go acquire the lock on
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
// require an upload.
self.schedule_iteration(&cancel).await;
if cancel.is_cancelled() {
return;
}
// Schedule some work, if concurrency limit permits it
self.spawn_pending();
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
// - Check our cancellation token
let next_scheduling_iteration = Instant::now()
.checked_add(self.scheduling_interval)
.unwrap_or_else(|| {
tracing::warn!(
"Scheduling interval invalid ({}s)",
self.scheduling_interval.as_secs_f64()
);
// unwrap(): this constant is small, cannot fail to add to time unless
// we are close to the end of the universe.
Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
});
loop {
tokio::select! {
_ = cancel.cancelled() => {
tracing::info!("joining tasks");
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
// It is the callers responsibility to make sure that the tasks they scheduled
// respect an appropriate cancellation token, to shut down promptly. It is only
// safe to wait on joining these tasks because we can see the cancellation token
// has been set.
while let Some(_r) = self.tasks.join_next().await {}
tracing::info!("terminating on cancellation token.");
break;
},
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
tracing::debug!("woke for scheduling interval");
break;},
cmd = command_queue.recv() => {
tracing::debug!("woke for command queue");
let cmd = match cmd {
Some(c) =>c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("terminating on command queue destruction");
cancel.cancel();
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
self.handle_command(payload, response_tx);
},
_ = async {
let completion = self.process_next_completion().await;
match completion {
Some(c) => {
self.generator.on_completion(c);
if !cancel.is_cancelled() {
self.spawn_pending();
}
},
None => {
// Nothing is running, so just wait: expect that this future
// will be dropped when something in the outer select! fires.
cancel.cancelled().await;
}
}
} => {}
}
}
}
}
fn do_spawn(&mut self, job: PJ) {
let tenant_shard_id = *job.get_tenant_shard_id();
let (in_progress, fut) = self.generator.spawn(job);
self.tasks.spawn(fut);
self.running.insert(tenant_shard_id, in_progress);
}
/// For all pending tenants that are elegible for execution, spawn their task.
///
/// Caller provides the spawn operation, we track the resulting execution.
fn spawn_pending(&mut self) {
while !self.pending.is_empty() && self.running.len() < self.concurrency {
// unwrap: loop condition includes !is_empty()
let pending = self.pending.pop_front().unwrap();
self.do_spawn(pending);
}
}
/// For administrative commands: skip the pending queue, ignore concurrency limits
fn spawn_now(&mut self, job: PJ) -> &RJ {
let tenant_shard_id = *job.get_tenant_shard_id();
self.do_spawn(job);
self.running
.get(&tenant_shard_id)
.expect("We just inserted this")
}
/// Wait until the next task completes, and handle its completion
///
/// Cancellation: this method is cancel-safe.
async fn process_next_completion(&mut self) -> Option<C> {
match self.tasks.join_next().await {
Some(r) => {
// We use a channel to drive completions, but also
// need to drain the JoinSet to avoid completed tasks
// accumulating. These calls are 1:1 because every task
// we spawn into this joinset submits is result to the channel.
let completion = r.expect("Panic in background task");
self.running.remove(completion.get_tenant_shard_id());
Some(completion)
}
None => {
// Nothing is running, so we have nothing to wait for. We may drop out: the
// main even loop will call us again after the next time it has run something.
None
}
}
}
/// Convert the command into a pending job, spawn it, and when the spawned
/// job completes, send the result down `response_tx`.
fn handle_command(
&mut self,
cmd: CMD,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
) {
let job = match self.generator.on_command(cmd) {
Ok(j) => j,
Err(e) => {
response_tx.send(CommandResponse { result: Err(e) }).ok();
return;
}
};
let tenant_shard_id = job.get_tenant_shard_id();
let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
barrier
} else {
let running = self.spawn_now(job);
running.get_barrier().clone()
};
// This task does no I/O: it only listens for a barrier's completion and then
// sends to the command response channel. It is therefore safe to spawn this without
// any gates/task_mgr hooks.
tokio::task::spawn(async move {
barrier.wait().await;
response_tx.send(CommandResponse { result: Ok(()) }).ok();
});
}
fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
self.running.get(tenant_shard_id).map(|r| r.get_barrier())
}
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
///
/// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
///
/// This function resets the pending list: it is assumed that the caller may change their mind about
/// which tenants need work between calls to schedule_iteration.
async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
let SchedulingResult {
jobs,
want_interval,
} = self.generator.schedule().await;
// Adjust interval based on feedback from the job generator
if let Some(want_interval) = want_interval {
// Calculation uses second granularity: this scheduler is not intended for high frequency tasks
self.scheduling_interval = Duration::from_secs(std::cmp::min(
std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
MAX_SCHEDULING_INTERVAL.as_secs(),
));
}
// The priority order of previously scheduled work may be invalidated by current state: drop
// all pending work (it will be re-scheduled if still needed)
self.pending.clear();
// While iterating over the potentially-long list of tenants, we will periodically yield
// to avoid blocking executor.
yielding_loop(1000, cancel, jobs.into_iter(), |job| {
// Skip tenants that already have a write in flight
if !self.running.contains_key(job.get_tenant_shard_id()) {
self.pending.push_back(job);
}
})
.await
.ok();
}
}

View File

@@ -320,8 +320,8 @@ impl DeltaLayer {
.metadata()
.context("get file metadata to determine size")?;
// TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
// we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
// This function is never used for constructing layers in a running pageserver,
// so it does not need an accurate TenantShardId.
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
Ok(DeltaLayer {

View File

@@ -278,8 +278,8 @@ impl ImageLayer {
.metadata()
.context("get file metadata to determine size")?;
// TODO(sharding): we should get TenantShardId from path.
// OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
// This function is never used for constructing layers in a running pageserver,
// so it does not need an accurate TenantShardId.
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
Ok(ImageLayer {

View File

@@ -1118,6 +1118,7 @@ impl LayerInner {
tracing::info!("evicted layer after unknown residence period");
}
}
timeline.metrics.evictions.inc();
timeline
.metrics
.resident_physical_size_sub(self.desc.file_size);

View File

@@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind {
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
HeatmapUpload,
SecondaryDownload,
}
impl BackgroundLoopKind {

View File

@@ -373,15 +373,20 @@ pub struct GcInfo {
}
/// An error happened in a get() operation.
#[derive(thiserror::Error)]
pub enum PageReconstructError {
#[derive(thiserror::Error, Debug)]
pub(crate) enum PageReconstructError {
#[error(transparent)]
Other(#[from] anyhow::Error),
#[error("Ancestor LSN wait error: {0}")]
AncestorLsnTimeout(#[from] WaitLsnError),
/// The operation was cancelled
#[error("Cancelled")]
Cancelled,
/// The ancestor of this is being stopped
#[error("ancestor timeline {0} is being stopped")]
AncestorStopping(TimelineId),
/// An error happened replaying WAL records
@@ -402,32 +407,6 @@ enum FlushLayerError {
Other(#[from] anyhow::Error),
}
impl std::fmt::Debug for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
}
Self::WalRedo(err) => err.fmt(f),
}
}
}
impl std::fmt::Display for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
}
Self::WalRedo(err) => err.fmt(f),
}
}
}
#[derive(Clone, Copy)]
pub enum LogicalSizeCalculationCause {
Initial,
@@ -452,6 +431,21 @@ impl std::fmt::Debug for Timeline {
}
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum WaitLsnError {
// Called on a timeline which is shutting down
#[error("Shutdown")]
Shutdown,
// Called on an timeline not in active state or shutting down
#[error("Bad state (not active)")]
BadState,
// Timeout expired while waiting for LSN to catch up with goal.
#[error("{0}")]
Timeout(String),
}
/// Public interface functions
impl Timeline {
/// Get the LSN where this branch was created
@@ -486,7 +480,7 @@ impl Timeline {
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn get(
pub(crate) async fn get(
&self,
key: Key,
lsn: Lsn,
@@ -496,6 +490,11 @@ impl Timeline {
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
}
// This check is debug-only because of the cost of hashing, and because it's a double-check: we
// already checked the key against the shard_identity when looking up the Timeline from
// page_service.
debug_assert!(!self.shard_identity.is_key_disposable(&key));
// XXX: structured stats collection for layer eviction here.
trace!(
"get page request for {}@{} from task kind {:?}",
@@ -629,24 +628,28 @@ impl Timeline {
/// You should call this before any of the other get_* or list_* functions. Calling
/// those functions with an LSN that has been processed yet is an error.
///
pub async fn wait_lsn(
pub(crate) async fn wait_lsn(
&self,
lsn: Lsn,
_ctx: &RequestContext, /* Prepare for use by cancellation */
) -> anyhow::Result<()> {
anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
) -> Result<(), WaitLsnError> {
if self.cancel.is_cancelled() {
return Err(WaitLsnError::Shutdown);
} else if !self.is_active() {
return Err(WaitLsnError::BadState);
}
// This should never be called from the WAL receiver, because that could lead
// to a deadlock.
anyhow::ensure!(
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
"wait_lsn cannot be called in WAL receiver"
);
anyhow::ensure!(
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
"wait_lsn cannot be called in WAL receiver"
);
anyhow::ensure!(
debug_assert!(
task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
"wait_lsn cannot be called in WAL receiver"
);
@@ -660,18 +663,22 @@ impl Timeline {
{
Ok(()) => Ok(()),
Err(e) => {
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
drop(_timer);
let walreceiver_status = self.walreceiver_status();
Err(anyhow::Error::new(e).context({
format!(
use utils::seqwait::SeqWaitError::*;
match e {
Shutdown => Err(WaitLsnError::Shutdown),
Timeout => {
// don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
drop(_timer);
let walreceiver_status = self.walreceiver_status();
Err(WaitLsnError::Timeout(format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
lsn,
self.get_last_record_lsn(),
self.get_disk_consistent_lsn(),
walreceiver_status,
)
}))
)))
}
}
}
}
}
@@ -2224,13 +2231,13 @@ impl Timeline {
return Err(layer_traversal_error(
if cfg!(test) {
format!(
"could not find data for key {} at LSN {}, for request at LSN {}\n{}",
key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
)
} else {
format!(
"could not find data for key {} at LSN {}, for request at LSN {}",
key, cont_lsn, request_lsn
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
)
},
traversal_path,
@@ -2290,11 +2297,12 @@ impl Timeline {
ancestor
.wait_lsn(timeline.ancestor_lsn, ctx)
.await
.with_context(|| {
format!(
"wait for lsn {} on ancestor timeline_id={}",
timeline.ancestor_lsn, ancestor.timeline_id
)
.map_err(|e| match e {
e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
WaitLsnError::Shutdown => PageReconstructError::Cancelled,
e @ WaitLsnError::BadState => {
PageReconstructError::Other(anyhow::anyhow!(e))
}
})?;
timeline_owned = ancestor;
@@ -3054,6 +3062,15 @@ impl Timeline {
for range in &partition.ranges {
let mut key = range.start;
while key < range.end {
if self.shard_identity.is_key_disposable(&key) {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
key = key.next();
continue;
}
let img = match self.get(key, lsn, ctx).await {
Ok(img) => img,
Err(err) => {
@@ -3080,6 +3097,7 @@ impl Timeline {
}
}
};
image_layer_writer.put_image(key, &img).await?;
key = key.next();
}
@@ -3650,7 +3668,15 @@ impl Timeline {
)))
});
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
if !self.shard_identity.is_key_disposable(&key) {
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
} else {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
}
if !new_layers.is_empty() {
fail_point!("after-timeline-compacted-first-L1");
@@ -4205,7 +4231,7 @@ impl Timeline {
.context("Failed to reconstruct a page image:")
{
Ok(img) => img,
Err(e) => return Err(PageReconstructError::from(e)),
Err(e) => return Err(PageReconstructError::WalRedo(e)),
};
if img.len() == page_cache::PAGE_SZ {

View File

@@ -22,6 +22,7 @@ use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use nix::poll::*;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use std::collections::VecDeque;
use std::io;
@@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::Duration;
use std::time::Instant;
use tracing::*;
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};
#[cfg(feature = "testing")]
use std::sync::atomic::{AtomicUsize, Ordering};
#[cfg(feature = "testing")]
use pageserver_api::shard::TenantShardId;
use crate::config::PageServerConf;
use crate::metrics::{
WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
@@ -92,7 +90,7 @@ struct ProcessOutput {
/// records.
///
pub struct PostgresRedoManager {
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
conf: &'static PageServerConf,
last_redo_at: std::sync::Mutex<Option<Instant>>,
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
@@ -186,10 +184,13 @@ impl PostgresRedoManager {
///
/// Create a new PostgresRedoManager.
///
pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
pub fn new(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
) -> PostgresRedoManager {
// The actual process is launched lazily, on first request.
PostgresRedoManager {
tenant_id,
tenant_shard_id,
conf,
last_redo_at: std::sync::Mutex::default(),
redo_process: RwLock::new(None),
@@ -244,8 +245,12 @@ impl PostgresRedoManager {
let timer =
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
let proc = Arc::new(
WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
.context("launch walredo process")?,
WalRedoProcess::launch(
self.conf,
self.tenant_shard_id,
pg_version,
)
.context("launch walredo process")?,
);
timer.observe_duration();
*proc_guard = Some(Arc::clone(&proc));
@@ -638,7 +643,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
struct WalRedoProcess {
#[allow(dead_code)]
conf: &'static PageServerConf,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
// Some() on construction, only becomes None on Drop.
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
@@ -652,10 +657,10 @@ impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
#[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
fn launch(
conf: &'static PageServerConf,
tenant_id: TenantId,
tenant_shard_id: TenantShardId,
pg_version: u32,
) -> anyhow::Result<Self> {
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
@@ -680,7 +685,7 @@ impl WalRedoProcess {
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
.close_fds()
.spawn_no_leak_child(tenant_id)
.spawn_no_leak_child(tenant_shard_id)
.context("spawn process")?;
WAL_REDO_PROCESS_COUNTERS.started.inc();
let mut child = scopeguard::guard(child, |child| {
@@ -741,12 +746,12 @@ impl WalRedoProcess {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
);
Ok(Self {
conf,
tenant_id,
tenant_shard_id,
child: Some(child),
stdin: Mutex::new(ProcessInput {
stdin,
@@ -772,7 +777,7 @@ impl WalRedoProcess {
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
fn apply_wal_records(
&self,
tag: BufferTag,
@@ -966,11 +971,7 @@ impl WalRedoProcess {
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
// TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
let path = self
.conf
.tenant_path(&TenantShardId::unsharded(self.tenant_id))
.join(&filename);
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
@@ -1004,7 +1005,7 @@ impl Drop for WalRedoProcess {
/// Wrapper type around `std::process::Child` which guarantees that the child
/// will be killed and waited-for by this process before being dropped.
struct NoLeakChild {
tenant_id: TenantId,
tenant_id: TenantShardId,
child: Option<Child>,
}
@@ -1023,7 +1024,7 @@ impl DerefMut for NoLeakChild {
}
impl NoLeakChild {
fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result<Self> {
fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
let child = command.spawn()?;
Ok(NoLeakChild {
tenant_id,
@@ -1078,7 +1079,7 @@ impl Drop for NoLeakChild {
Some(child) => child,
None => return,
};
let tenant_id = self.tenant_id;
let tenant_shard_id = self.tenant_id;
// Offload the kill+wait of the child process into the background.
// If someone stops the runtime, we'll leak the child process.
// We can ignore that case because we only stop the runtime on pageserver exit.
@@ -1086,7 +1087,11 @@ impl Drop for NoLeakChild {
tokio::task::spawn_blocking(move || {
// Intentionally don't inherit the tracing context from whoever is dropping us.
// This thread here is going to outlive of our dropper.
let span = tracing::info_span!("walredo", %tenant_id);
let span = tracing::info_span!(
"walredo",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
);
let _entered = span.enter();
Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
})
@@ -1096,11 +1101,11 @@ impl Drop for NoLeakChild {
}
trait NoLeakChildCommandExt {
fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild>;
fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
}
impl NoLeakChildCommandExt for Command {
fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result<NoLeakChild> {
fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
NoLeakChild::spawn(tenant_id, self)
}
}
@@ -1155,6 +1160,7 @@ mod tests {
use crate::repository::Key;
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
use bytes::Bytes;
use pageserver_api::shard::TenantShardId;
use std::str::FromStr;
use utils::{id::TenantId, lsn::Lsn};
@@ -1264,9 +1270,9 @@ mod tests {
let repo_dir = camino_tempfile::tempdir()?;
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_id = TenantId::generate();
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
let manager = PostgresRedoManager::new(conf, tenant_id);
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
Ok(RedoHarness {
_repo_dir: repo_dir,

View File

@@ -35,7 +35,7 @@
#define PageStoreTrace DEBUG5
#define MIN_RECONNECT_INTERVAL_USEC 100
#define MIN_RECONNECT_INTERVAL_USEC 1000
#define MAX_RECONNECT_INTERVAL_USEC 1000000
bool connected = false;

View File

@@ -1091,34 +1091,10 @@ SendProposerElected(Safekeeper *sk)
{
/* safekeeper is empty or no common point, start from the beginning */
sk->startStreamingAt = wp->propTermHistory.entries[0].lsn;
if (sk->startStreamingAt < wp->truncateLsn)
{
/*
* There's a gap between the WAL starting point and a truncateLsn,
* which can't appear in a normal working cluster. That gap means
* that all safekeepers reported that they have persisted WAL up
* to the truncateLsn before, but now current safekeeper tells
* otherwise.
*
* Also we have a special condition here, which is empty
* safekeeper with no history. In combination with a gap, that can
* happen when we introduce a new safekeeper to the cluster. This
* is a rare case, which is triggered manually for now, and should
* be treated with care.
*/
/*
* truncateLsn will not change without ack from current
* safekeeper, and it's aligned to the WAL record, so we can
* safely start streaming from this point.
*/
sk->startStreamingAt = wp->truncateLsn;
wp_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
LSN_FORMAT_ARGS(sk->startStreamingAt));
}
wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" ,
sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries);
/* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */
Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr);
}
else
{
@@ -1141,7 +1117,7 @@ SendProposerElected(Safekeeper *sk)
}
}
Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn);
Assert(sk->startStreamingAt <= wp->availableLsn);
msg.tag = 'e';
msg.term = wp->propTerm;

127
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -288,55 +288,6 @@ files = [
{file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
]
[[package]]
name = "black"
version = "23.3.0"
description = "The uncompromising code formatter."
optional = false
python-versions = ">=3.7"
files = [
{file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"},
{file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"},
{file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"},
{file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"},
{file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"},
{file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"},
{file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"},
{file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"},
{file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"},
{file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"},
{file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"},
{file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"},
{file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"},
{file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"},
{file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"},
{file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"},
{file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"},
{file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"},
{file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"},
{file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"},
{file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"},
{file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"},
{file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"},
{file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"},
{file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"},
]
[package.dependencies]
click = ">=8.0.0"
mypy-extensions = ">=0.4.3"
packaging = ">=22.0"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
d = ["aiohttp (>=3.7.4)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "boto3"
version = "1.34.11"
@@ -1627,17 +1578,6 @@ files = [
{file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"},
]
[[package]]
name = "pathspec"
version = "0.9.0"
description = "Utility library for gitignore style pattern matching of file paths."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
files = [
{file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"},
{file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"},
]
[[package]]
name = "pbr"
version = "5.9.0"
@@ -1649,21 +1589,6 @@ files = [
{file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"},
]
[[package]]
name = "platformdirs"
version = "2.5.2"
description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
optional = false
python-versions = ">=3.7"
files = [
{file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"},
{file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"},
]
[package.extras]
docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"]
test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"]
[[package]]
name = "pluggy"
version = "1.0.0"
@@ -2207,28 +2132,28 @@ pyasn1 = ">=0.1.3"
[[package]]
name = "ruff"
version = "0.0.269"
description = "An extremely fast Python linter, written in Rust."
version = "0.1.11"
description = "An extremely fast Python linter and code formatter, written in Rust."
optional = false
python-versions = ">=3.7"
files = [
{file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"},
{file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"},
{file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"},
{file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"},
{file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"},
{file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"},
{file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"},
{file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"},
{file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"},
{file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"},
{file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"},
{file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"},
{file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"},
{file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"},
{file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"},
{file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"},
{file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"},
{file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"},
{file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"},
]
[[package]]
@@ -2496,16 +2421,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2743,4 +2658,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "8de8b05a9b35a6f76da7d7e3652ddbb521f1eca53fce7b933f537080a9d6eada"
content-hash = "35c237fe6a9278b2dc65b06ed96bde5afb9e393d52c01b00c59acf1df3a8d482"

View File

@@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
return cmd
def black(fix_inplace: bool) -> str:
cmd = "poetry run black"
if not fix_inplace:
cmd += " --diff --check"
def ruff_check(fix_inplace: bool) -> str:
cmd = "poetry run ruff check"
if fix_inplace:
cmd += " --fix"
return cmd
def ruff(fix_inplace: bool) -> str:
cmd = "poetry run ruff"
if fix_inplace:
cmd += " --fix"
def ruff_format(fix_inplace: bool) -> str:
cmd = "poetry run ruff format"
if not fix_inplace:
cmd += " --diff --check"
return cmd
@@ -109,16 +109,16 @@ if __name__ == "__main__":
no_color=args.no_color,
)
check(
name="black",
name="ruff check",
suffix=".py",
cmd=black(fix_inplace=args.fix_inplace),
cmd=ruff_check(fix_inplace=args.fix_inplace),
changed_files=files,
no_color=args.no_color,
)
check(
name="ruff",
name="ruff format",
suffix=".py",
cmd=ruff(fix_inplace=args.fix_inplace),
cmd=ruff_format(fix_inplace=args.fix_inplace),
changed_files=files,
no_color=args.no_color,
)

View File

@@ -5,7 +5,7 @@ edition.workspace = true
license.workspace = true
[features]
default = []
default = ["testing"]
testing = []
[dependencies]
@@ -14,6 +14,7 @@ async-trait.workspace = true
base64.workspace = true
bstr.workspace = true
bytes = { workspace = true, features = ["serde"] }
camino.workspace = true
chrono.workspace = true
clap.workspace = true
consumption_metrics.workspace = true
@@ -35,6 +36,8 @@ metrics.workspace = true
once_cell.workspace = true
opentelemetry.workspace = true
parking_lot.workspace = true
parquet.workspace = true
parquet_derive.workspace = true
pbkdf2 = { workspace = true, features = ["simple", "std"] }
pin-project-lite.workspace = true
postgres_backend.workspace = true
@@ -42,6 +45,7 @@ pq_proto.workspace = true
prometheus.workspace = true
rand.workspace = true
regex.workspace = true
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
reqwest = { workspace = true, features = ["json"] }
reqwest-middleware.workspace = true
reqwest-retry.workspace = true
@@ -80,6 +84,7 @@ smol_str.workspace = true
workspace_hack.workspace = true
[dev-dependencies]
camino-tempfile.workspace = true
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true

View File

@@ -10,6 +10,7 @@ use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::validate_password_and_exchange;
use crate::console::errors::GetAuthInfoError;
use crate::console::AuthSecret;
use crate::context::RequestMonitoring;
use crate::proxy::connect_compute::handle_try_wake;
use crate::proxy::retry::retry_after;
use crate::scram;
@@ -22,12 +23,10 @@ use crate::{
provider::{CachedNodeInfo, ConsoleReqExtra},
Api,
},
metrics::LatencyTimer,
stream, url,
};
use futures::TryFutureExt;
use std::borrow::Cow;
use std::net::IpAddr;
use std::ops::ControlFlow;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
@@ -129,7 +128,6 @@ pub struct ComputeCredentials<T> {
pub struct ComputeUserInfoNoEndpoint {
pub user: SmolStr,
pub peer_addr: IpAddr,
pub cache_key: SmolStr,
}
@@ -151,7 +149,6 @@ impl TryFrom<ClientCredentials> for ComputeUserInfo {
fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
let inner = ComputeUserInfoNoEndpoint {
user: creds.user,
peer_addr: creds.peer_addr,
cache_key: creds.cache_key,
};
match creds.project {
@@ -166,33 +163,34 @@ impl TryFrom<ClientCredentials> for ComputeUserInfo {
///
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
ctx: &mut RequestMonitoring,
api: &impl console::Api,
extra: &ConsoleReqExtra,
creds: ClientCredentials,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the endpoint (project) name.
// We now expect to see a very specific payload in the place of password.
let (info, unauthenticated_password) = match creds.try_into() {
Err(info) => {
let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
.await?;
ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
(res.info, Some(res.keys))
}
Ok(info) => (info, None),
};
info!("fetching user's authentication info");
let allowed_ips = api.get_allowed_ips(extra, &info).await?;
let allowed_ips = api.get_allowed_ips(ctx, &info).await?;
// check allowed list
if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed());
}
let cached_secret = api.get_role_secret(extra, &info).await?;
let cached_secret = api.get_role_secret(ctx, &info).await?;
let secret = cached_secret.clone().unwrap_or_else(|| {
// If we don't have an authentication secret, we mock one to
@@ -202,13 +200,13 @@ async fn auth_quirks(
AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
});
match authenticate_with_secret(
ctx,
secret,
info,
client,
unauthenticated_password,
allow_cleartext,
config,
latency_timer,
)
.await
{
@@ -224,13 +222,13 @@ async fn auth_quirks(
}
async fn authenticate_with_secret(
ctx: &mut RequestMonitoring,
secret: AuthSecret,
info: ComputeUserInfo,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
unauthenticated_password: Option<Vec<u8>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
if let Some(password) = unauthenticated_password {
let auth_outcome = validate_password_and_exchange(&password, secret)?;
@@ -253,38 +251,31 @@ async fn authenticate_with_secret(
// Perform cleartext auth if we're allowed to do that.
// Currently, we use it for websocket connections (latency).
if allow_cleartext {
return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await;
}
// Finally, proceed with the main auth flow (SCRAM-based).
classic::authenticate(info, client, config, latency_timer, secret).await
classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
}
/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
/// only if authentication was successfuly.
async fn auth_and_wake_compute(
ctx: &mut RequestMonitoring,
api: &impl console::Api,
extra: &ConsoleReqExtra,
creds: ClientCredentials,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
let compute_credentials = auth_quirks(
api,
extra,
creds,
client,
allow_cleartext,
config,
latency_timer,
)
.await?;
let compute_credentials = auth_quirks(ctx, api, creds, client, allow_cleartext, config).await?;
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
let wake_res = api
.wake_compute(ctx, extra, &compute_credentials.info)
.await;
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -301,6 +292,8 @@ async fn auth_and_wake_compute(
tokio::time::sleep(wait_duration).await;
};
ctx.set_project(node.aux.clone());
match compute_credentials.keys {
#[cfg(feature = "testing")]
ComputeCredentialKeys::Password(password) => node.config.password(password),
@@ -343,11 +336,11 @@ impl<'a> BackendType<'a, ClientCredentials> {
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(
self,
ctx: &mut RequestMonitoring,
extra: &ConsoleReqExtra,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
use BackendType::*;
@@ -360,13 +353,13 @@ impl<'a> BackendType<'a, ClientCredentials> {
);
let (cache_info, user_info) = auth_and_wake_compute(
ctx,
&*api,
extra,
creds,
client,
allow_cleartext,
config,
latency_timer,
)
.await?;
(cache_info, BackendType::Console(api, user_info))
@@ -380,13 +373,13 @@ impl<'a> BackendType<'a, ClientCredentials> {
);
let (cache_info, user_info) = auth_and_wake_compute(
ctx,
&*api,
extra,
creds,
client,
allow_cleartext,
config,
latency_timer,
)
.await?;
(cache_info, BackendType::Postgres(api, user_info))
@@ -416,13 +409,13 @@ impl<'a> BackendType<'a, ClientCredentials> {
impl BackendType<'_, ComputeUserInfo> {
pub async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra,
ctx: &mut RequestMonitoring,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
use BackendType::*;
match self {
Console(api, creds) => api.get_allowed_ips(extra, creds).await,
Console(api, creds) => api.get_allowed_ips(ctx, creds).await,
#[cfg(feature = "testing")]
Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
Postgres(api, creds) => api.get_allowed_ips(ctx, creds).await,
Link(_) => Ok(Arc::new(vec![])),
#[cfg(test)]
Test(x) => x.get_allowed_ips(),
@@ -433,14 +426,15 @@ impl BackendType<'_, ComputeUserInfo> {
/// The link auth flow doesn't support this, so we return [`None`] in that case.
pub async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
extra: &ConsoleReqExtra,
) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
use BackendType::*;
match self {
Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Console(api, creds) => api.wake_compute(ctx, extra, creds).map_ok(Some).await,
#[cfg(feature = "testing")]
Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Postgres(api, creds) => api.wake_compute(ctx, extra, creds).map_ok(Some).await,
Link(_) => Ok(None),
#[cfg(test)]
Test(x) => x.wake_compute().map(Some),

View File

@@ -1,7 +1,7 @@
//! User credentials used in authentication.
use crate::{
auth::password_hack::parse_endpoint_param, error::UserFacingError,
auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str,
};
use itertools::Itertools;
@@ -44,7 +44,6 @@ pub struct ClientCredentials {
pub project: Option<SmolStr>,
pub cache_key: SmolStr,
pub peer_addr: IpAddr,
}
impl ClientCredentials {
@@ -56,16 +55,21 @@ impl ClientCredentials {
impl ClientCredentials {
pub fn parse(
ctx: &mut RequestMonitoring,
params: &StartupMessageParams,
sni: Option<&str>,
common_names: Option<HashSet<String>>,
peer_addr: IpAddr,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
// Some parameters are stored in the startup message.
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user = get_param("user")?.into();
let user: SmolStr = get_param("user")?.into();
// record the values if we have them
ctx.set_application(params.get("application_name").map(SmolStr::from));
ctx.set_user(user.clone());
ctx.set_endpoint_id(sni.map(SmolStr::from));
// Project name might be passed via PG's command-line options.
let project_option = params
@@ -147,7 +151,6 @@ impl ClientCredentials {
user,
project,
cache_key,
peer_addr,
})
}
}
@@ -219,8 +222,8 @@ mod tests {
fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required.
let options = StartupMessageParams::new([("user", "john_doe")]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
@@ -234,8 +237,8 @@ mod tests {
("database", "world"), // should be ignored
("foo", "bar"), // should be ignored
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
@@ -249,8 +252,8 @@ mod tests {
let sni = Some("foo.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("foo"));
assert_eq!(creds.cache_key, "foo");
@@ -265,8 +268,8 @@ mod tests {
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -280,8 +283,8 @@ mod tests {
("options", "-ckey=1 endpoint=bar -c geqo=off"),
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -298,8 +301,8 @@ mod tests {
),
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert!(creds.project.is_none());
@@ -313,8 +316,8 @@ mod tests {
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert!(creds.project.is_none());
@@ -328,8 +331,8 @@ mod tests {
let sni = Some("baz.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -342,14 +345,14 @@ mod tests {
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com");
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
Ok(())
@@ -363,8 +366,8 @@ mod tests {
let sni = Some("second.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
let mut ctx = RequestMonitoring::test();
let err = ClientCredentials::parse(&mut ctx, &options, sni, common_names)
.expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
@@ -382,8 +385,8 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["example.com".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
let mut ctx = RequestMonitoring::test();
let err = ClientCredentials::parse(&mut ctx, &options, sni, common_names)
.expect_err("should fail");
match err {
UnknownCommonName { cn } => {
@@ -402,8 +405,8 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let mut ctx = RequestMonitoring::test();
let creds = ClientCredentials::parse(&mut ctx, &options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("project"));
assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");

View File

@@ -8,6 +8,7 @@ use std::{net::SocketAddr, sync::Arc};
use futures::future::Either;
use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use proxy::context::RequestMonitoring;
use proxy::proxy::run_until_cancelled;
use tokio::net::TcpListener;
@@ -170,7 +171,16 @@ async fn task_main(
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
let mut ctx =
RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
handle_client(
&mut ctx,
dest_suffix,
tls_config,
tls_server_end_point,
socket,
)
.await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
@@ -236,6 +246,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
async fn handle_client(
ctx: &mut RequestMonitoring,
dest_suffix: Arc<String>,
tls_config: Arc<rustls::ServerConfig>,
tls_server_end_point: TlsServerEndPoint,
@@ -261,5 +272,5 @@ async fn handle_client(
let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = Default::default();
proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
}

View File

@@ -7,6 +7,7 @@ use proxy::console;
use proxy::console::provider::AllowedIpsCache;
use proxy::console::provider::NodeInfoCache;
use proxy::console::provider::RoleSecretCache;
use proxy::context::parquet::ParquetUploadArgs;
use proxy::http;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
@@ -44,6 +45,9 @@ enum AuthBackend {
#[derive(Parser)]
#[command(version = GIT_VERSION, about)]
struct ProxyCliArgs {
/// Name of the region this proxy is deployed in
#[clap(long, default_value_t = String::new())]
region: String,
/// listen for incoming client connections on ip:port
#[clap(short, long, default_value = "127.0.0.1:4432")]
proxy: String,
@@ -133,6 +137,9 @@ struct ProxyCliArgs {
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_ip_check_for_http: bool,
#[clap(flatten)]
parquet_upload: ParquetUploadArgs,
}
#[derive(clap::Args, Clone, Copy, Debug)]
@@ -221,6 +228,11 @@ async fn main() -> anyhow::Result<()> {
));
}
client_tasks.spawn(proxy::context::parquet::worker(
cancellation_token.clone(),
args.parquet_upload,
));
// maintenance tasks. these never return unless there's an error
let mut maintenance_tasks = JoinSet::new();
maintenance_tasks.spawn(proxy::handle_signals(cancellation_token));
@@ -380,6 +392,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http,
endpoint_rps_limit,
// TODO: add this argument
region: args.region.clone(),
}));
Ok(config)

View File

@@ -1,6 +1,7 @@
use crate::{
auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option,
context::RequestMonitoring, error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE,
proxy::neon_option,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
@@ -232,9 +233,9 @@ impl ConnCfg {
/// Connect to a corresponding compute node.
pub async fn connect(
&self,
ctx: &mut RequestMonitoring,
allow_self_signed_compute: bool,
timeout: Duration,
proto: &'static str,
) -> Result<PostgresConnection, ConnectionError> {
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
@@ -268,7 +269,9 @@ impl ConnCfg {
stream,
params,
cancel_closure,
_guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
_guage: NUM_DB_CONNECTIONS_GAUGE
.with_label_values(&[ctx.protocol])
.guard(),
};
Ok(connection)

View File

@@ -21,6 +21,7 @@ pub struct ProxyConfig {
pub require_client_ip: bool,
pub disable_ip_check_for_http: bool,
pub endpoint_rps_limit: Vec<RateBucketInfo>,
pub region: String,
}
#[derive(Debug)]

View File

@@ -6,7 +6,9 @@ use super::messages::MetricsAuxInfo;
use crate::{
auth::backend::ComputeUserInfo,
cache::{timed_lru, TimedLru},
compute, scram,
compute,
context::RequestMonitoring,
scram,
};
use async_trait::async_trait;
use dashmap::DashMap;
@@ -198,10 +200,6 @@ pub mod errors {
/// Extra query params we'd like to pass to the console.
pub struct ConsoleReqExtra {
/// A unique identifier for a connection.
pub session_id: uuid::Uuid,
/// Name of client application, if set.
pub application_name: String,
pub options: Vec<(String, String)>,
}
@@ -263,19 +261,20 @@ pub trait Api {
/// Get the client's auth secret for authentication.
async fn get_role_secret(
&self,
extra: &ConsoleReqExtra,
ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
) -> Result<CachedRoleSecret, errors::GetAuthInfoError>;
async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra,
ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
extra: &ConsoleReqExtra,
creds: &ComputeUserInfo,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;

View File

@@ -6,8 +6,8 @@ use super::{
errors::{ApiError, GetAuthInfoError, WakeComputeError},
AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::console::provider::CachedRoleSecret;
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use crate::{console::provider::CachedRoleSecret, context::RequestMonitoring};
use async_trait::async_trait;
use futures::TryFutureExt;
use thiserror::Error;
@@ -145,7 +145,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
_extra: &ConsoleReqExtra,
_ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
Ok(CachedRoleSecret::new_uncached(
@@ -155,7 +155,7 @@ impl super::Api for Api {
async fn get_allowed_ips(
&self,
_extra: &ConsoleReqExtra,
_ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
@@ -164,6 +164,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
_ctx: &mut RequestMonitoring,
_extra: &ConsoleReqExtra,
_creds: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {

View File

@@ -6,8 +6,11 @@ use super::{
ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, CachedRoleSecret, ConsoleReqExtra,
NodeInfo,
};
use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
use crate::{
context::RequestMonitoring,
metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
};
use async_trait::async_trait;
use futures::TryFutureExt;
use itertools::Itertools;
@@ -49,19 +52,20 @@ impl Api {
async fn do_get_auth_info(
&self,
extra: &ConsoleReqExtra,
ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
) -> Result<AuthInfo, GetAuthInfoError> {
let request_id = uuid::Uuid::new_v4().to_string();
let application_name = ctx.console_application_name();
async {
let request = self
.endpoint
.get("proxy_get_role_secret")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", extra.session_id)])
.query(&[("session_id", ctx.session_id)])
.query(&[
("application_name", extra.application_name.as_str()),
("application_name", application_name.as_str()),
("project", creds.endpoint.as_str()),
("role", creds.inner.user.as_str()),
])
@@ -102,19 +106,21 @@ impl Api {
async fn do_wake_compute(
&self,
ctx: &mut RequestMonitoring,
extra: &ConsoleReqExtra,
creds: &ComputeUserInfo,
) -> Result<NodeInfo, WakeComputeError> {
let request_id = uuid::Uuid::new_v4().to_string();
let application_name = ctx.console_application_name();
async {
let mut request_builder = self
.endpoint
.get("proxy_wake_compute")
.header("X-Request-ID", &request_id)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", extra.session_id)])
.query(&[("session_id", ctx.session_id)])
.query(&[
("application_name", extra.application_name.as_str()),
("application_name", application_name.as_str()),
("project", creds.endpoint.as_str()),
]);
@@ -162,7 +168,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_role_secret(
&self,
extra: &ConsoleReqExtra,
ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
) -> Result<CachedRoleSecret, GetAuthInfoError> {
let ep = creds.endpoint.clone();
@@ -170,7 +176,7 @@ impl super::Api for Api {
if let Some(role_secret) = self.caches.role_secret.get(&(ep.clone(), user.clone())) {
return Ok(role_secret);
}
let auth_info = self.do_get_auth_info(extra, creds).await?;
let auth_info = self.do_get_auth_info(ctx, creds).await?;
let (_, secret) = self
.caches
.role_secret
@@ -183,7 +189,7 @@ impl super::Api for Api {
async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra,
ctx: &mut RequestMonitoring,
creds: &ComputeUserInfo,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
if let Some(allowed_ips) = self.caches.allowed_ips.get(&creds.endpoint) {
@@ -195,7 +201,7 @@ impl super::Api for Api {
ALLOWED_IPS_BY_CACHE_OUTCOME
.with_label_values(&["miss"])
.inc();
let auth_info = self.do_get_auth_info(extra, creds).await?;
let auth_info = self.do_get_auth_info(ctx, creds).await?;
let allowed_ips = Arc::new(auth_info.allowed_ips);
let ep = creds.endpoint.clone();
let user = creds.inner.user.clone();
@@ -209,6 +215,7 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
ctx: &mut RequestMonitoring,
extra: &ConsoleReqExtra,
creds: &ComputeUserInfo,
) -> Result<CachedNodeInfo, WakeComputeError> {
@@ -236,7 +243,7 @@ impl super::Api for Api {
}
}
let node = self.do_wake_compute(extra, creds).await?;
let node = self.do_wake_compute(ctx, extra, creds).await?;
let (_, cached) = self.caches.node_info.insert(key.clone(), node);
info!(key = &*key, "created a cache entry for compute node info");

110
proxy/src/context.rs Normal file
View File

@@ -0,0 +1,110 @@
//! Connection request monitoring contexts
use chrono::Utc;
use once_cell::sync::OnceCell;
use smol_str::SmolStr;
use std::net::IpAddr;
use tokio::sync::mpsc;
use uuid::Uuid;
use crate::{console::messages::MetricsAuxInfo, error::ErrorKind, metrics::LatencyTimer};
pub mod parquet;
static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestMonitoring>> = OnceCell::new();
#[derive(Clone)]
/// Context data for a single request to connect to a database.
///
/// This data should **not** be used for connection logic, only for observability and limiting purposes.
/// All connection logic should instead use strongly typed state machines, not a bunch of Options.
pub struct RequestMonitoring {
pub peer_addr: IpAddr,
pub session_id: Uuid,
pub protocol: &'static str,
first_packet: chrono::DateTime<Utc>,
region: &'static str,
// filled in as they are discovered
project: Option<SmolStr>,
branch: Option<SmolStr>,
endpoint_id: Option<SmolStr>,
user: Option<SmolStr>,
application: Option<SmolStr>,
error_kind: Option<ErrorKind>,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
sender: Option<mpsc::UnboundedSender<RequestMonitoring>>,
pub latency_timer: LatencyTimer,
}
impl RequestMonitoring {
pub fn new(
session_id: Uuid,
peer_addr: IpAddr,
protocol: &'static str,
region: &'static str,
) -> Self {
Self {
peer_addr,
session_id,
protocol,
first_packet: Utc::now(),
region,
project: None,
branch: None,
endpoint_id: None,
user: None,
application: None,
error_kind: None,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
}
}
#[cfg(test)]
pub fn test() -> Self {
RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test")
}
pub fn console_application_name(&self) -> String {
format!(
"{}/{}",
self.application.as_deref().unwrap_or_default(),
self.protocol
)
}
pub fn set_project(&mut self, x: MetricsAuxInfo) {
self.branch = Some(x.branch_id);
self.endpoint_id = Some(x.endpoint_id);
self.project = Some(x.project_id);
}
pub fn set_endpoint_id(&mut self, endpoint_id: Option<SmolStr>) {
self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
}
pub fn set_application(&mut self, app: Option<SmolStr>) {
self.application = app.or_else(|| self.application.clone());
}
pub fn set_user(&mut self, user: SmolStr) {
self.user = Some(user);
}
pub fn log(&mut self) {
if let Some(tx) = self.sender.take() {
let _: Result<(), _> = tx.send(self.clone());
}
}
}
impl Drop for RequestMonitoring {
fn drop(&mut self) {
self.log()
}
}

View File

@@ -0,0 +1,641 @@
use std::sync::Arc;
use anyhow::Context;
use bytes::BytesMut;
use futures::{Stream, StreamExt};
use parquet::{
basic::Compression,
file::{
metadata::RowGroupMetaDataPtr,
properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE},
writer::SerializedFileWriter,
},
record::RecordWriter,
};
use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig};
use tokio::{sync::mpsc, time};
use tokio_util::sync::CancellationToken;
use tracing::{debug, info, Span};
use utils::backoff;
use super::{RequestMonitoring, LOG_CHAN};
#[derive(clap::Args, Clone, Debug)]
pub struct ParquetUploadArgs {
/// Storage location to upload the parquet files to.
/// Encoded as toml (same format as pageservers), eg
/// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}`
#[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)]
parquet_upload_remote_storage: OptRemoteStorageConfig,
/// How many rows to include in a row group
#[clap(long, default_value_t = 8192)]
parquet_upload_row_group_size: usize,
/// How large each column page should be in bytes
#[clap(long, default_value_t = DEFAULT_PAGE_SIZE)]
parquet_upload_page_size: usize,
/// How large the total parquet file should be in bytes
#[clap(long, default_value_t = 100_000_000)]
parquet_upload_size: i64,
/// How long to wait before forcing a file upload
#[clap(long, default_value = "20m", value_parser = humantime::parse_duration)]
parquet_upload_maximum_duration: tokio::time::Duration,
/// What level of compression to use
#[clap(long, default_value_t = Compression::UNCOMPRESSED)]
parquet_upload_compression: Compression,
}
/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get
/// runtime type errors from the value parser we use.
type OptRemoteStorageConfig = Option<RemoteStorageConfig>;
fn remote_storage_from_toml(s: &str) -> anyhow::Result<OptRemoteStorageConfig> {
RemoteStorageConfig::from_toml(&s.parse()?)
}
// Occasional network issues and such can cause remote operations to fail, and
// that's expected. If a upload fails, we log it at info-level, and retry.
// But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
// level instead, as repeated failures can mean a more serious problem. If it
// fails more than FAILED_UPLOAD_RETRIES times, we give up
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
// the parquet crate leaves a lot to be desired...
// what follows is an attempt to write parquet files with minimal allocs.
// complication: parquet is a columnar format, while we want to write in as rows.
// design:
// * we batch up to 1024 rows, then flush them into a 'row group'
// * after each rowgroup write, we check the length of the file and upload to s3 if large enough
#[derive(parquet_derive::ParquetRecordWriter)]
struct RequestData {
region: &'static str,
protocol: &'static str,
/// Must be UTC. The derive macro doesn't like the timezones
timestamp: chrono::NaiveDateTime,
session_id: uuid::Uuid,
peer_addr: String,
username: Option<String>,
application_name: Option<String>,
endpoint_id: Option<String>,
project: Option<String>,
branch: Option<String>,
error: Option<&'static str>,
}
impl From<RequestMonitoring> for RequestData {
fn from(value: RequestMonitoring) -> Self {
Self {
session_id: value.session_id,
peer_addr: value.peer_addr.to_string(),
timestamp: value.first_packet.naive_utc(),
username: value.user.as_deref().map(String::from),
application_name: value.application.as_deref().map(String::from),
endpoint_id: value.endpoint_id.as_deref().map(String::from),
project: value.project.as_deref().map(String::from),
branch: value.branch.as_deref().map(String::from),
protocol: value.protocol,
region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_str()),
}
}
}
/// Parquet request context worker
///
/// It listened on a channel for all completed requests, extracts the data and writes it into a parquet file,
/// then uploads a completed batch to S3
pub async fn worker(
cancellation_token: CancellationToken,
config: ParquetUploadArgs,
) -> anyhow::Result<()> {
let Some(remote_storage_config) = config.parquet_upload_remote_storage else {
tracing::warn!("parquet request upload: no s3 bucket configured");
return Ok(());
};
let (tx, mut rx) = mpsc::unbounded_channel();
LOG_CHAN.set(tx.downgrade()).unwrap();
// setup row stream that will close on cancellation
tokio::spawn(async move {
cancellation_token.cancelled().await;
// dropping this sender will cause the channel to close only once
// all the remaining inflight requests have been completed.
drop(tx);
});
let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
let rx = rx.map(RequestData::from);
let storage =
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?;
let properties = WriterProperties::builder()
.set_data_page_size_limit(config.parquet_upload_page_size)
.set_compression(config.parquet_upload_compression);
let parquet_config = ParquetConfig {
propeties: Arc::new(properties.build()),
rows_per_group: config.parquet_upload_row_group_size,
file_size: config.parquet_upload_size,
max_duration: config.parquet_upload_maximum_duration,
#[cfg(any(test, feature = "testing"))]
test_remote_failures: 0,
};
worker_inner(storage, rx, parquet_config).await
}
struct ParquetConfig {
propeties: WriterPropertiesPtr,
rows_per_group: usize,
file_size: i64,
max_duration: tokio::time::Duration,
#[cfg(any(test, feature = "testing"))]
test_remote_failures: u64,
}
async fn worker_inner(
storage: GenericRemoteStorage,
rx: impl Stream<Item = RequestData>,
config: ParquetConfig,
) -> anyhow::Result<()> {
#[cfg(any(test, feature = "testing"))]
let storage = if config.test_remote_failures > 0 {
GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
} else {
storage
};
let mut rx = std::pin::pin!(rx);
let mut rows = Vec::with_capacity(config.rows_per_group);
let schema = rows.as_slice().schema()?;
let file = BytesWriter::default();
let mut w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
let mut last_upload = time::Instant::now();
let mut len = 0;
while let Some(row) = rx.next().await {
rows.push(row);
let force = last_upload.elapsed() > config.max_duration;
if rows.len() == config.rows_per_group || force {
let rg_meta;
(rows, w, rg_meta) = flush_rows(rows, w).await?;
len += rg_meta.compressed_size();
}
if len > config.file_size || force {
last_upload = time::Instant::now();
let file = upload_parquet(w, len, &storage).await?;
w = SerializedFileWriter::new(file, schema.clone(), config.propeties.clone())?;
len = 0;
}
}
if !rows.is_empty() {
let rg_meta;
(_, w, rg_meta) = flush_rows(rows, w).await?;
len += rg_meta.compressed_size();
}
if !w.flushed_row_groups().is_empty() {
let _: BytesWriter = upload_parquet(w, len, &storage).await?;
}
Ok(())
}
async fn flush_rows(
rows: Vec<RequestData>,
mut w: SerializedFileWriter<BytesWriter>,
) -> anyhow::Result<(
Vec<RequestData>,
SerializedFileWriter<BytesWriter>,
RowGroupMetaDataPtr,
)> {
let span = Span::current();
let (mut rows, w, rg_meta) = tokio::task::spawn_blocking(move || {
let _enter = span.enter();
let mut rg = w.next_row_group()?;
rows.as_slice().write_to_row_group(&mut rg)?;
let rg_meta = rg.close()?;
let size = rg_meta.compressed_size();
let compression = rg_meta.compressed_size() as f64 / rg_meta.total_byte_size() as f64;
debug!(size, compression, "flushed row group to parquet file");
Ok::<_, parquet::errors::ParquetError>((rows, w, rg_meta))
})
.await
.unwrap()?;
rows.clear();
Ok((rows, w, rg_meta))
}
async fn upload_parquet(
w: SerializedFileWriter<BytesWriter>,
len: i64,
storage: &GenericRemoteStorage,
) -> anyhow::Result<BytesWriter> {
let len_uncompressed = w
.flushed_row_groups()
.iter()
.map(|rg| rg.total_byte_size())
.sum::<i64>();
// I don't know how compute intensive this is, although it probably isn't much... better be safe than sorry.
// finish method only available on the fork: https://github.com/apache/arrow-rs/issues/5253
let (mut file, metadata) = tokio::task::spawn_blocking(move || w.finish())
.await
.unwrap()?;
let data = file.buf.split().freeze();
let compression = len as f64 / len_uncompressed as f64;
let size = data.len();
let id = uuid::Uuid::now_v7();
info!(
%id,
rows = metadata.num_rows,
size, compression, "uploading request parquet file"
);
let path = RemotePath::from_string(&format!("requests_{id}.parquet"))?;
backoff::retry(
|| async {
let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
storage.upload(stream, data.len(), &path, None).await
},
|_e| false,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_UPLOAD_MAX_RETRIES,
"request_data_upload",
// we don't want cancellation to interrupt here, so we make a dummy cancel token
backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")),
)
.await
.context("request_data_upload")?;
Ok(file)
}
// why doesn't BytesMut impl io::Write?
#[derive(Default)]
struct BytesWriter {
buf: BytesMut,
}
impl std::io::Write for BytesWriter {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.buf.extend_from_slice(buf);
Ok(buf.len())
}
fn flush(&mut self) -> std::io::Result<()> {
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc};
use camino::Utf8Path;
use clap::Parser;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use parquet::{
basic::{Compression, ZstdLevel},
file::{
properties::{WriterProperties, DEFAULT_PAGE_SIZE},
reader::FileReader,
serialized_reader::SerializedFileReader,
},
};
use rand::{rngs::StdRng, Rng, SeedableRng};
use remote_storage::{
GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config,
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
use tokio::{sync::mpsc, time};
use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData};
#[derive(Parser)]
struct ProxyCliArgs {
#[clap(flatten)]
parquet_upload: ParquetUploadArgs,
}
#[test]
fn default_parser() {
let ProxyCliArgs { parquet_upload } = ProxyCliArgs::parse_from(["proxy"]);
assert_eq!(parquet_upload.parquet_upload_remote_storage, None);
assert_eq!(parquet_upload.parquet_upload_row_group_size, 8192);
assert_eq!(parquet_upload.parquet_upload_page_size, DEFAULT_PAGE_SIZE);
assert_eq!(parquet_upload.parquet_upload_size, 100_000_000);
assert_eq!(
parquet_upload.parquet_upload_maximum_duration,
time::Duration::from_secs(20 * 60)
);
assert_eq!(
parquet_upload.parquet_upload_compression,
Compression::UNCOMPRESSED
);
}
#[test]
fn full_parser() {
let ProxyCliArgs { parquet_upload } = ProxyCliArgs::parse_from([
"proxy",
"--parquet-upload-remote-storage",
"{bucket_name='default',prefix_in_bucket='proxy/',bucket_region='us-east-1',endpoint='http://minio:9000'}",
"--parquet-upload-row-group-size",
"100",
"--parquet-upload-page-size",
"10000",
"--parquet-upload-size",
"10000000",
"--parquet-upload-maximum-duration",
"10m",
"--parquet-upload-compression",
"zstd(5)",
]);
assert_eq!(
parquet_upload.parquet_upload_remote_storage,
Some(RemoteStorageConfig {
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: "default".into(),
bucket_region: "us-east-1".into(),
prefix_in_bucket: Some("proxy/".into()),
endpoint: Some("http://minio:9000".into()),
concurrency_limit: NonZeroUsize::new(
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
)
.unwrap(),
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
})
})
);
assert_eq!(parquet_upload.parquet_upload_row_group_size, 100);
assert_eq!(parquet_upload.parquet_upload_page_size, 10000);
assert_eq!(parquet_upload.parquet_upload_size, 10_000_000);
assert_eq!(
parquet_upload.parquet_upload_maximum_duration,
time::Duration::from_secs(10 * 60)
);
assert_eq!(
parquet_upload.parquet_upload_compression,
Compression::ZSTD(ZstdLevel::try_new(5).unwrap())
);
}
fn generate_request_data(rng: &mut impl Rng) -> RequestData {
RequestData {
session_id: uuid::Builder::from_random_bytes(rng.gen()).into_uuid(),
peer_addr: Ipv4Addr::from(rng.gen::<[u8; 4]>()).to_string(),
timestamp: chrono::NaiveDateTime::from_timestamp_millis(
rng.gen_range(1703862754..1803862754),
)
.unwrap(),
application_name: Some("test".to_owned()),
username: Some(hex::encode(rng.gen::<[u8; 4]>())),
endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())),
project: Some(hex::encode(rng.gen::<[u8; 16]>())),
branch: Some(hex::encode(rng.gen::<[u8; 16]>())),
protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)],
region: "us-east-1",
error: None,
}
}
fn random_stream(len: usize) -> impl Stream<Item = RequestData> + Unpin {
let mut rng = StdRng::from_seed([0x39; 32]);
futures::stream::iter(
std::iter::repeat_with(move || generate_request_data(&mut rng)).take(len),
)
}
async fn run_test(
tmpdir: &Utf8Path,
config: ParquetConfig,
rx: impl Stream<Item = RequestData>,
) -> Vec<(u64, usize, i64)> {
let remote_storage_config = RemoteStorageConfig {
storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
};
let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();
worker_inner(storage, rx, config).await.unwrap();
let mut files = std::fs::read_dir(tmpdir.as_std_path())
.unwrap()
.map(|entry| entry.unwrap().path())
.collect_vec();
files.sort();
files
.into_iter()
.map(|path| std::fs::File::open(tmpdir.as_std_path().join(path)).unwrap())
.map(|file| {
(
file.metadata().unwrap(),
SerializedFileReader::new(file).unwrap().metadata().clone(),
)
})
.map(|(file_meta, parquet_meta)| {
(
file_meta.len(),
parquet_meta.num_row_groups(),
parquet_meta.file_metadata().num_rows(),
)
})
.collect()
}
#[tokio::test]
async fn verify_parquet_no_compression() {
let tmpdir = camino_tempfile::tempdir().unwrap();
let config = ParquetConfig {
propeties: Arc::new(WriterProperties::new()),
rows_per_group: 2_000,
file_size: 1_000_000,
max_duration: time::Duration::from_secs(20 * 60),
test_remote_failures: 0,
};
let rx = random_stream(50_000);
let file_stats = run_test(tmpdir.path(), config, rx).await;
assert_eq!(
file_stats,
[
(1029153, 3, 6000),
(1029075, 3, 6000),
(1029216, 3, 6000),
(1029129, 3, 6000),
(1029250, 3, 6000),
(1029017, 3, 6000),
(1029175, 3, 6000),
(1029247, 3, 6000),
(343124, 1, 2000)
],
);
tmpdir.close().unwrap();
}
#[tokio::test]
async fn verify_parquet_min_compression() {
let tmpdir = camino_tempfile::tempdir().unwrap();
let config = ParquetConfig {
propeties: Arc::new(
WriterProperties::builder()
.set_compression(parquet::basic::Compression::ZSTD(ZstdLevel::default()))
.build(),
),
rows_per_group: 2_000,
file_size: 1_000_000,
max_duration: time::Duration::from_secs(20 * 60),
test_remote_failures: 0,
};
let rx = random_stream(50_000);
let file_stats = run_test(tmpdir.path(), config, rx).await;
// with compression, there are fewer files with more rows per file
assert_eq!(
file_stats,
[
(1166201, 6, 12000),
(1163577, 6, 12000),
(1164641, 6, 12000),
(1168772, 6, 12000),
(196761, 1, 2000)
],
);
tmpdir.close().unwrap();
}
#[tokio::test]
async fn verify_parquet_strong_compression() {
let tmpdir = camino_tempfile::tempdir().unwrap();
let config = ParquetConfig {
propeties: Arc::new(
WriterProperties::builder()
.set_compression(parquet::basic::Compression::ZSTD(
ZstdLevel::try_new(10).unwrap(),
))
.build(),
),
rows_per_group: 2_000,
file_size: 1_000_000,
max_duration: time::Duration::from_secs(20 * 60),
test_remote_failures: 0,
};
let rx = random_stream(50_000);
let file_stats = run_test(tmpdir.path(), config, rx).await;
// with strong compression, the files are smaller
assert_eq!(
file_stats,
[
(1144934, 6, 12000),
(1144941, 6, 12000),
(1144735, 6, 12000),
(1144936, 6, 12000),
(191035, 1, 2000)
],
);
tmpdir.close().unwrap();
}
#[tokio::test]
async fn verify_parquet_unreliable_upload() {
let tmpdir = camino_tempfile::tempdir().unwrap();
let config = ParquetConfig {
propeties: Arc::new(WriterProperties::new()),
rows_per_group: 2_000,
file_size: 1_000_000,
max_duration: time::Duration::from_secs(20 * 60),
test_remote_failures: 2,
};
let rx = random_stream(50_000);
let file_stats = run_test(tmpdir.path(), config, rx).await;
assert_eq!(
file_stats,
[
(1029153, 3, 6000),
(1029075, 3, 6000),
(1029216, 3, 6000),
(1029129, 3, 6000),
(1029250, 3, 6000),
(1029017, 3, 6000),
(1029175, 3, 6000),
(1029247, 3, 6000),
(343124, 1, 2000)
],
);
tmpdir.close().unwrap();
}
#[tokio::test(start_paused = true)]
async fn verify_parquet_regular_upload() {
let tmpdir = camino_tempfile::tempdir().unwrap();
let config = ParquetConfig {
propeties: Arc::new(WriterProperties::new()),
rows_per_group: 2_000,
file_size: 1_000_000,
max_duration: time::Duration::from_secs(60),
test_remote_failures: 2,
};
let (tx, mut rx) = mpsc::unbounded_channel();
tokio::spawn(async move {
for _ in 0..3 {
let mut s = random_stream(3000);
while let Some(r) = s.next().await {
tx.send(r).unwrap();
}
time::sleep(time::Duration::from_secs(70)).await
}
});
let rx = futures::stream::poll_fn(move |cx| rx.poll_recv(cx));
let file_stats = run_test(tmpdir.path(), config, rx).await;
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(515807, 2, 3001), (515585, 2, 3000), (515425, 2, 2999)],
);
tmpdir.close().unwrap();
}
}

View File

@@ -28,3 +28,37 @@ pub trait UserFacingError: fmt::Display {
self.to_string()
}
}
#[derive(Clone)]
pub enum ErrorKind {
/// Wrong password, unknown endpoint, protocol violation, etc...
User,
/// Network error between user and proxy. Not necessarily user error
Disconnect,
/// Proxy self-imposed rate limits
RateLimit,
/// internal errors
Service,
/// Error communicating with control plane
ControlPlane,
/// Error communicating with compute
Compute,
}
impl ErrorKind {
pub fn to_str(&self) -> &'static str {
match self {
ErrorKind::User => "request failed due to user error",
ErrorKind::Disconnect => "client disconnected",
ErrorKind::RateLimit => "request cancelled due to rate limit",
ErrorKind::Service => "internal service error",
ErrorKind::ControlPlane => "non-retryable control plane error",
ErrorKind::Compute => "non-retryable compute error (or exhausted retry capacity)",
}
}
}

View File

@@ -13,6 +13,7 @@ pub mod cancellation;
pub mod compute;
pub mod config;
pub mod console;
pub mod context;
pub mod error;
pub mod http;
pub mod logging;

View File

@@ -115,11 +115,12 @@ pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
.unwrap()
});
#[derive(Clone)]
pub struct LatencyTimer {
// time since the stopwatch was started
start: Option<time::Instant>,
// accumulated time on the stopwatch
accumulated: std::time::Duration,
pub accumulated: std::time::Duration,
// label data
protocol: &'static str,
cache_miss: bool,
@@ -160,7 +161,12 @@ impl LatencyTimer {
self.pool_miss = false;
}
pub fn success(mut self) {
pub fn success(&mut self) {
// stop the stopwatch and record the time that we have accumulated
let start = self.start.take().expect("latency timer should be started");
self.accumulated += start.elapsed();
// success
self.outcome = "success";
}
}

View File

@@ -10,8 +10,9 @@ use crate::{
compute,
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
console::{self, messages::MetricsAuxInfo},
context::RequestMonitoring,
metrics::{
LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
},
protocol2::WithClientIp,
@@ -25,7 +26,7 @@ use itertools::Itertools;
use once_cell::sync::OnceCell;
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
use regex::Regex;
use std::{net::IpAddr, sync::Arc};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_util::sync::CancellationToken;
use tracing::{error, info, info_span, Instrument};
@@ -82,14 +83,16 @@ pub async fn task_main(
info!("accepted postgres client connection");
let mut socket = WithClientIp::new(socket);
let mut peer_addr = peer_addr;
if let Some(ip) = socket.wait_for_addr().await? {
peer_addr = ip;
tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
let mut peer_addr = peer_addr.ip();
if let Some(addr) = socket.wait_for_addr().await? {
peer_addr = addr.ip();
tracing::Span::current().record("peer_addr", &tracing::field::display(addr));
} else if config.require_client_ip {
bail!("missing required client IP");
}
let mut ctx = RequestMonitoring::new(session_id, peer_addr, "tcp", &config.region);
socket
.inner
.set_nodelay(true)
@@ -97,11 +100,10 @@ pub async fn task_main(
handle_client(
config,
&mut ctx,
&cancel_map,
session_id,
socket,
ClientMode::Tcp,
peer_addr.ip(),
endpoint_rate_limiter,
)
.await
@@ -134,13 +136,6 @@ pub enum ClientMode {
/// Abstracts the logic of handling TCP vs WS clients
impl ClientMode {
fn protocol_label(&self) -> &'static str {
match self {
ClientMode::Tcp => "tcp",
ClientMode::Websockets { .. } => "ws",
}
}
fn allow_cleartext(&self) -> bool {
match self {
ClientMode::Tcp => false,
@@ -173,19 +168,18 @@ impl ClientMode {
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config: &'static ProxyConfig,
ctx: &mut RequestMonitoring,
cancel_map: &CancelMap,
session_id: uuid::Uuid,
stream: S,
mode: ClientMode,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
info!(
protocol = mode.protocol_label(),
protocol = ctx.protocol,
"handling interactive connection from client"
);
let proto = mode.protocol_label();
let proto = ctx.protocol;
let _client_gauge = NUM_CLIENT_CONNECTION_GAUGE
.with_label_values(&[proto])
.guard();
@@ -195,20 +189,23 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let tls = config.tls_config.as_ref();
let pause = ctx.latency_timer.pause();
let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
let (mut stream, params) = match do_handshake.await? {
Some(x) => x,
None => return Ok(()), // it's a cancellation request
};
drop(pause);
// Extract credentials which we're going to use for auth.
let creds = {
let hostname = mode.hostname(stream.get_ref());
let common_names = tls.and_then(|tls| tls.common_names.clone());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_names, peer_addr))
.map(|_| auth::ClientCredentials::parse(ctx, &params, hostname, common_names))
.transpose();
match result {
@@ -217,16 +214,19 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
}
};
ctx.set_endpoint_id(creds.get_endpoint());
let client = Client::new(
stream,
creds,
&params,
session_id,
mode.allow_self_signed_compute(config),
endpoint_rate_limiter,
);
cancel_map
.with_session(|session| client.connect_to_db(session, mode, &config.authentication_config))
.with_session(|session| {
client.connect_to_db(ctx, session, mode, &config.authentication_config)
})
.await
}
@@ -348,10 +348,13 @@ async fn prepare_client_connection(
/// Forward bytes in both directions (client <-> compute).
#[tracing::instrument(skip_all)]
pub async fn proxy_pass(
ctx: &mut RequestMonitoring,
client: impl AsyncRead + AsyncWrite + Unpin,
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
) -> anyhow::Result<()> {
ctx.log();
let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
@@ -397,8 +400,6 @@ struct Client<'a, S> {
creds: auth::BackendType<'a, auth::ClientCredentials>,
/// KV-dictionary with PostgreSQL connection params.
params: &'a StartupMessageParams,
/// Unique connection ID.
session_id: uuid::Uuid,
/// Allow self-signed certificates (for testing).
allow_self_signed_compute: bool,
/// Rate limiter for endpoints
@@ -411,7 +412,6 @@ impl<'a, S> Client<'a, S> {
stream: PqStream<Stream<S>>,
creds: auth::BackendType<'a, auth::ClientCredentials>,
params: &'a StartupMessageParams,
session_id: uuid::Uuid,
allow_self_signed_compute: bool,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> Self {
@@ -419,7 +419,6 @@ impl<'a, S> Client<'a, S> {
stream,
creds,
params,
session_id,
allow_self_signed_compute,
endpoint_rate_limiter,
}
@@ -433,6 +432,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
#[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)]
async fn connect_to_db(
self,
ctx: &mut RequestMonitoring,
session: cancellation::Session<'_>,
mode: ClientMode,
config: &'static AuthenticationConfig,
@@ -441,7 +441,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
mut stream,
creds,
params,
session_id,
allow_self_signed_compute,
endpoint_rate_limiter,
} = self;
@@ -455,27 +454,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
}
}
let proto = mode.protocol_label();
let extra = console::ConsoleReqExtra {
session_id, // aka this connection's id
application_name: format!(
"{}/{}",
params.get("application_name").unwrap_or_default(),
proto
),
options: neon_options(params),
};
let mut latency_timer = LatencyTimer::new(proto);
let user = creds.get_user().to_owned();
let auth_result = match creds
.authenticate(
&extra,
&mut stream,
mode.allow_cleartext(),
config,
&mut latency_timer,
)
.authenticate(ctx, &extra, &mut stream, mode.allow_cleartext(), config)
.await
{
Ok(auth_result) => auth_result,
@@ -493,15 +478,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
node_info.allow_self_signed_compute = allow_self_signed_compute;
let aux = node_info.aux.clone();
let mut node = connect_to_compute(
&TcpMechanism { params, proto },
node_info,
&extra,
&creds,
latency_timer,
)
.or_else(|e| stream.throw_error(e))
.await?;
let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &extra, &creds)
.or_else(|e| stream.throw_error(e))
.await?;
prepare_client_connection(&node, session, &mut stream).await?;
// Before proxy passing, forward to compute whatever data is left in the
@@ -510,7 +489,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
// immediately after opening the connection.
let (stream, read_buf) = stream.into_inner();
node.stream.write_all(&read_buf).await?;
proxy_pass(stream, node.stream, aux).await
proxy_pass(ctx, stream, node.stream, aux).await
}
}

View File

@@ -2,7 +2,8 @@ use crate::{
auth,
compute::{self, PostgresConnection},
console::{self, errors::WakeComputeError, Api},
metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
context::RequestMonitoring,
metrics::{bool_to_str, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
proxy::retry::{retry_after, ShouldRetry},
};
use async_trait::async_trait;
@@ -35,15 +36,15 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg
/// Try to connect to the compute node once.
#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
async fn connect_to_compute_once(
ctx: &mut RequestMonitoring,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
proto: &'static str,
) -> Result<PostgresConnection, compute::ConnectionError> {
let allow_self_signed_compute = node_info.allow_self_signed_compute;
node_info
.config
.connect(allow_self_signed_compute, timeout, proto)
.connect(ctx, allow_self_signed_compute, timeout)
.await
}
@@ -54,6 +55,7 @@ pub trait ConnectMechanism {
type Error: From<Self::ConnectError>;
async fn connect_once(
&self,
ctx: &mut RequestMonitoring,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError>;
@@ -64,7 +66,6 @@ pub trait ConnectMechanism {
pub struct TcpMechanism<'a> {
/// KV-dictionary with PostgreSQL connection params.
pub params: &'a StartupMessageParams,
pub proto: &'static str,
}
#[async_trait]
@@ -75,10 +76,11 @@ impl ConnectMechanism for TcpMechanism<'_> {
async fn connect_once(
&self,
ctx: &mut RequestMonitoring,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
connect_to_compute_once(node_info, timeout, self.proto).await
connect_to_compute_once(ctx, node_info, timeout).await
}
fn update_connect_config(&self, config: &mut compute::ConnCfg) {
@@ -123,11 +125,11 @@ fn report_error(e: &WakeComputeError, retry: bool) {
/// This function might update `node_info`, so we take it by `&mut`.
#[tracing::instrument(skip_all)]
pub async fn connect_to_compute<M: ConnectMechanism>(
ctx: &mut RequestMonitoring,
mechanism: &M,
mut node_info: console::CachedNodeInfo,
extra: &console::ConsoleReqExtra,
creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
mut latency_timer: LatencyTimer,
) -> Result<M::Connection, M::Error>
where
M::ConnectError: ShouldRetry + std::fmt::Debug,
@@ -136,9 +138,12 @@ where
mechanism.update_connect_config(&mut node_info.config);
// try once
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
let (config, err) = match mechanism
.connect_once(ctx, &node_info, CONNECT_TIMEOUT)
.await
{
Ok(res) => {
latency_timer.success();
ctx.latency_timer.success();
return Ok(res);
}
Err(e) => {
@@ -147,7 +152,7 @@ where
}
};
latency_timer.cache_miss();
ctx.latency_timer.cache_miss();
let mut num_retries = 1;
@@ -155,9 +160,9 @@ where
info!("compute node's state has likely changed; requesting a wake-up");
let node_info = loop {
let wake_res = match creds {
auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
auth::BackendType::Console(api, creds) => api.wake_compute(ctx, extra, creds).await,
#[cfg(feature = "testing")]
auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
auth::BackendType::Postgres(api, creds) => api.wake_compute(ctx, extra, creds).await,
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
// test backend
@@ -195,9 +200,12 @@ where
// * DNS connection settings haven't quite propagated yet
info!("wake_compute success. attempting to connect");
loop {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
match mechanism
.connect_once(ctx, &node_info, CONNECT_TIMEOUT)
.await
{
Ok(res) => {
latency_timer.success();
ctx.latency_timer.success();
return Ok(res);
}
Err(e) => {

View File

@@ -425,6 +425,7 @@ impl ConnectMechanism for TestConnectMechanism {
async fn connect_once(
&self,
_ctx: &mut RequestMonitoring,
_node_info: &console::CachedNodeInfo,
_timeout: std::time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
@@ -491,11 +492,7 @@ fn helper_create_connect_info(
auth::BackendType<'_, ComputeUserInfo>,
) {
let cache = helper_create_cached_node_info();
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: "TEST".into(),
options: vec![],
};
let extra = console::ConsoleReqExtra { options: vec![] };
let creds = auth::BackendType::Test(mechanism);
(cache, extra, creds)
}
@@ -503,9 +500,10 @@ fn helper_create_connect_info(
#[tokio::test]
async fn connect_to_compute_success() {
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
@@ -514,9 +512,10 @@ async fn connect_to_compute_success() {
#[tokio::test]
async fn connect_to_compute_retry() {
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
@@ -526,9 +525,10 @@ async fn connect_to_compute_retry() {
#[tokio::test]
async fn connect_to_compute_non_retry_1() {
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();
@@ -538,9 +538,10 @@ async fn connect_to_compute_non_retry_1() {
#[tokio::test]
async fn connect_to_compute_non_retry_2() {
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
@@ -551,12 +552,13 @@ async fn connect_to_compute_non_retry_2() {
async fn connect_to_compute_non_retry_3() {
assert_eq!(NUM_RETRIES_CONNECT, 16);
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![
Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();
@@ -566,9 +568,10 @@ async fn connect_to_compute_non_retry_3() {
#[tokio::test]
async fn wake_retry() {
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
.await
.unwrap();
mechanism.verify();
@@ -578,9 +581,10 @@ async fn wake_retry() {
#[tokio::test]
async fn wake_non_retry() {
use ConnectAction::*;
let mut ctx = RequestMonitoring::test();
let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]);
let (cache, extra, creds) = helper_create_connect_info(&mechanism);
connect_to_compute(&mechanism, cache, &extra, &creds, LatencyTimer::new("test"))
connect_to_compute(&mut ctx, &mechanism, cache, &extra, &creds)
.await
.unwrap_err();
mechanism.verify();

View File

@@ -17,6 +17,7 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio_util::task::TaskTracker;
use crate::context::RequestMonitoring;
use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
use crate::rate_limiter::EndpointRateLimiter;
@@ -218,13 +219,14 @@ async fn request_handler(
ws_connections.spawn(
async move {
let mut ctx = RequestMonitoring::new(session_id, peer_addr, "ws", &config.region);
if let Err(e) = websocket::serve_websocket(
websocket,
config,
&mut ctx,
websocket,
&cancel_map,
session_id,
host,
peer_addr,
endpoint_rate_limiter,
)
.await
@@ -238,13 +240,14 @@ async fn request_handler(
// Return the response so the spawned future can continue.
Ok(response)
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
let mut ctx = RequestMonitoring::new(session_id, peer_addr, "http", &config.region);
sql_over_http::handle(
&config.http_config,
&mut ctx,
request,
sni_hostname,
conn_pool,
session_id,
peer_addr,
&config.http_config,
)
.await
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {

View File

@@ -13,7 +13,7 @@ use pq_proto::StartupMessageParams;
use prometheus::{exponential_buckets, register_histogram, Histogram};
use rand::Rng;
use smol_str::SmolStr;
use std::{collections::HashMap, net::IpAddr, pin::pin, sync::Arc, sync::Weak, time::Duration};
use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration};
use std::{
fmt,
task::{ready, Poll},
@@ -28,7 +28,8 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
console,
metrics::{LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
context::RequestMonitoring,
metrics::NUM_DB_CONNECTIONS_GAUGE,
proxy::{connect_compute::ConnectMechanism, neon_options},
usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
};
@@ -309,13 +310,11 @@ impl GlobalConnPool {
pub async fn get(
self: &Arc<Self>,
ctx: &mut RequestMonitoring,
conn_info: ConnInfo,
force_new: bool,
session_id: uuid::Uuid,
peer_addr: IpAddr,
) -> anyhow::Result<Client> {
let mut client: Option<ClientInner> = None;
let mut latency_timer = LatencyTimer::new("http");
let mut hash_valid = false;
let mut endpoint_pool = Weak::new();
@@ -360,23 +359,21 @@ impl GlobalConnPool {
info!(%conn_id, "pool: cached connection '{conn_info}' is closed, opening a new one");
connect_to_compute(
self.proxy_config,
ctx,
&conn_info,
conn_id,
session_id,
latency_timer,
peer_addr,
endpoint_pool.clone(),
)
.await
} else {
info!("pool: reusing connection '{conn_info}'");
client.session.send(session_id)?;
client.session.send(ctx.session_id)?;
tracing::Span::current().record(
"pid",
&tracing::field::display(client.inner.get_process_id()),
);
latency_timer.pool_hit();
latency_timer.success();
ctx.latency_timer.pool_hit();
ctx.latency_timer.success();
return Ok(Client::new(client, conn_info, endpoint_pool).await);
}
} else {
@@ -384,11 +381,9 @@ impl GlobalConnPool {
info!(%conn_id, "pool: opening a new connection '{conn_info}'");
connect_to_compute(
self.proxy_config,
ctx,
&conn_info,
conn_id,
session_id,
latency_timer,
peer_addr,
endpoint_pool.clone(),
)
.await
@@ -483,7 +478,6 @@ impl GlobalConnPool {
struct TokioMechanism<'a> {
pool: Weak<RwLock<EndpointConnPool>>,
conn_info: &'a ConnInfo,
session_id: uuid::Uuid,
conn_id: uuid::Uuid,
idle: Duration,
}
@@ -496,15 +490,16 @@ impl ConnectMechanism for TokioMechanism<'_> {
async fn connect_once(
&self,
ctx: &mut RequestMonitoring,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
connect_to_compute_once(
ctx,
node_info,
self.conn_info,
timeout,
self.conn_id,
self.session_id,
self.pool.clone(),
self.idle,
)
@@ -520,11 +515,9 @@ impl ConnectMechanism for TokioMechanism<'_> {
#[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
async fn connect_to_compute(
config: &config::ProxyConfig,
ctx: &mut RequestMonitoring,
conn_info: &ConnInfo,
conn_id: uuid::Uuid,
session_id: uuid::Uuid,
latency_timer: LatencyTimer,
peer_addr: IpAddr,
pool: Weak<RwLock<EndpointConnPool>>,
) -> anyhow::Result<ClientInner> {
let tls = config.tls_config.as_ref();
@@ -536,12 +529,8 @@ async fn connect_to_compute(
("application_name", APP_NAME),
("options", conn_info.options.as_deref().unwrap_or("")),
]);
let creds = auth::ClientCredentials::parse(
&params,
Some(&conn_info.hostname),
common_names,
peer_addr,
)?;
let creds =
auth::ClientCredentials::parse(ctx, &params, Some(&conn_info.hostname), common_names)?;
let creds =
ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?;
@@ -549,48 +538,48 @@ async fn connect_to_compute(
let console_options = neon_options(&params);
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: APP_NAME.to_string(),
options: console_options,
};
if !config.disable_ip_check_for_http {
let allowed_ips = backend.get_allowed_ips(&extra).await?;
if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) {
let allowed_ips = backend.get_allowed_ips(ctx).await?;
if !check_peer_addr_is_in_list(&ctx.peer_addr, &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed().into());
}
}
let extra = console::ConsoleReqExtra {
options: console_options,
};
let node_info = backend
.wake_compute(&extra)
.wake_compute(ctx, &extra)
.await?
.context("missing cache entry from wake_compute")?;
ctx.set_project(node_info.aux.clone());
crate::proxy::connect_compute::connect_to_compute(
ctx,
&TokioMechanism {
conn_id,
conn_info,
session_id,
pool,
idle: config.http_config.pool_options.idle_timeout,
},
node_info,
&extra,
&backend,
latency_timer,
)
.await
}
async fn connect_to_compute_once(
ctx: &mut RequestMonitoring,
node_info: &console::CachedNodeInfo,
conn_info: &ConnInfo,
timeout: time::Duration,
conn_id: uuid::Uuid,
mut session: uuid::Uuid,
pool: Weak<RwLock<EndpointConnPool>>,
idle: Duration,
) -> Result<ClientInner, tokio_postgres::Error> {
let mut config = (*node_info.config).clone();
let mut session = ctx.session_id;
let (client, mut connection) = config
.user(&conn_info.username)
@@ -601,7 +590,7 @@ async fn connect_to_compute_once(
.await?;
let conn_gauge = NUM_DB_CONNECTIONS_GAUGE
.with_label_values(&["http"])
.with_label_values(&[ctx.protocol])
.guard();
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));

View File

@@ -1,4 +1,3 @@
use std::net::IpAddr;
use std::sync::Arc;
use anyhow::bail;
@@ -14,6 +13,7 @@ use hyper::{Body, HeaderMap, Request};
use serde_json::json;
use serde_json::Map;
use serde_json::Value;
use smol_str::SmolStr;
use tokio_postgres::error::DbError;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
@@ -29,6 +29,7 @@ use utils::http::error::ApiError;
use utils::http::json::json_response;
use crate::config::HttpConfig;
use crate::context::RequestMonitoring;
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
use super::conn_pool::ConnInfo;
@@ -121,6 +122,7 @@ fn json_array_to_pg_array(value: &Value) -> Option<String> {
}
fn get_conn_info(
ctx: &mut RequestMonitoring,
headers: &HeaderMap,
sni_hostname: Option<String>,
) -> Result<ConnInfo, anyhow::Error> {
@@ -146,10 +148,11 @@ fn get_conn_info(
.next()
.ok_or(anyhow::anyhow!("invalid database name"))?;
let username = connection_url.username();
let username = SmolStr::from(connection_url.username());
if username.is_empty() {
return Err(anyhow::anyhow!("missing username"));
}
ctx.set_user(username.clone());
let password = connection_url
.password()
@@ -176,6 +179,9 @@ fn get_conn_info(
}
}
let hostname: SmolStr = hostname.into();
ctx.set_endpoint_id(Some(hostname.clone()));
let pairs = connection_url.query_pairs();
let mut options = Option::None;
@@ -188,9 +194,9 @@ fn get_conn_info(
}
Ok(ConnInfo {
username: username.into(),
username,
dbname: dbname.into(),
hostname: hostname.into(),
hostname,
password: password.into(),
options,
})
@@ -198,23 +204,15 @@ fn get_conn_info(
// TODO: return different http error codes
pub async fn handle(
config: &'static HttpConfig,
ctx: &mut RequestMonitoring,
request: Request<Body>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
session_id: uuid::Uuid,
peer_addr: IpAddr,
config: &'static HttpConfig,
) -> Result<Response<Body>, ApiError> {
let result = tokio::time::timeout(
config.request_timeout,
handle_inner(
config,
request,
sni_hostname,
conn_pool,
session_id,
peer_addr,
),
handle_inner(config, ctx, request, sni_hostname, conn_pool),
)
.await;
let mut response = match result {
@@ -297,11 +295,10 @@ pub async fn handle(
#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
async fn handle_inner(
config: &'static HttpConfig,
ctx: &mut RequestMonitoring,
request: Request<Body>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
session_id: uuid::Uuid,
peer_addr: IpAddr,
) -> anyhow::Result<Response<Body>> {
let _request_gauge = NUM_CONNECTION_REQUESTS_GAUGE
.with_label_values(&["http"])
@@ -311,7 +308,7 @@ async fn handle_inner(
// Determine the destination and connection params
//
let headers = request.headers();
let conn_info = get_conn_info(headers, sni_hostname)?;
let conn_info = get_conn_info(ctx, headers, sni_hostname)?;
// Determine the output options. Default behaviour is 'false'. Anything that is not
// strictly 'true' assumed to be false.
@@ -340,10 +337,12 @@ async fn handle_inner(
let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
let paused = ctx.latency_timer.pause();
let request_content_length = match request.body().size_hint().upper() {
Some(v) => v,
None => MAX_REQUEST_SIZE + 1,
};
drop(paused);
// we don't have a streaming request support yet so this is to prevent OOM
// from a malicious user sending an extremely large request body
@@ -359,9 +358,7 @@ async fn handle_inner(
let body = hyper::body::to_bytes(request.into_body()).await?;
let payload: Payload = serde_json::from_slice(&body)?;
let mut client = conn_pool
.get(conn_info, !allow_pool, session_id, peer_addr)
.await?;
let mut client = conn_pool.get(ctx, conn_info, !allow_pool).await?;
let mut response = Response::builder()
.status(StatusCode::OK)
@@ -449,6 +446,7 @@ async fn handle_inner(
}
};
ctx.log();
let metrics = client.metrics();
// how could this possibly fail

View File

@@ -1,6 +1,7 @@
use crate::{
cancellation::CancelMap,
config::ProxyConfig,
context::RequestMonitoring,
error::io_error,
proxy::{handle_client, ClientMode},
rate_limiter::EndpointRateLimiter,
@@ -12,7 +13,6 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;
use std::{
net::IpAddr,
pin::Pin,
sync::Arc,
task::{ready, Context, Poll},
@@ -130,22 +130,20 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
}
pub async fn serve_websocket(
websocket: HyperWebsocket,
config: &'static ProxyConfig,
ctx: &mut RequestMonitoring,
websocket: HyperWebsocket,
cancel_map: &CancelMap,
session_id: uuid::Uuid,
hostname: Option<String>,
peer_addr: IpAddr,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
handle_client(
config,
ctx,
cancel_map,
session_id,
WebSocketRw::new(websocket),
ClientMode::Websockets { hostname },
peer_addr,
endpoint_rate_limiter,
)
.await?;

View File

@@ -40,22 +40,13 @@ pytest-split = "^0.8.1"
zstandard = "^0.21.0"
[tool.poetry.group.dev.dependencies]
black = "^23.3.0"
mypy = "==1.3.0"
ruff = "^0.0.269"
ruff = "^0.1.11"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 100
extend-exclude = '''
/(
vendor
)/
'''
[tool.mypy]
exclude = "^vendor/"
check_untyped_defs = true
@@ -82,7 +73,9 @@ ignore_missing_imports = true
[tool.ruff]
target-version = "py39"
extend-exclude = ["vendor/"]
ignore = ["E501"]
ignore = [
"E501", # Line too long, we don't want to be too strict about it
]
select = [
"E", # pycodestyle
"F", # Pyflakes
@@ -90,3 +83,4 @@ select = [
"W", # pycodestyle
"B", # bugbear
]
line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.74.0"
channel = "1.75.0"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -2,7 +2,10 @@
//! S3 objects which are either not referenced by any metadata, or are referenced by a
//! control plane tenant/timeline in a deleted state.
use std::{collections::HashMap, sync::Arc};
use std::{
collections::{HashMap, HashSet},
sync::Arc,
};
use anyhow::Context;
use aws_sdk_s3::{
@@ -118,6 +121,13 @@ const S3_CONCURRENCY: usize = 32;
// How many concurrent API requests to make to the console API.
const CONSOLE_CONCURRENCY: usize = 128;
struct ConsoleCache {
/// Set of tenants found in the control plane API
projects: HashMap<TenantId, ProjectData>,
/// Set of tenants for which the control plane API returned 404
not_found: HashSet<TenantId>,
}
async fn find_garbage_inner(
bucket_config: BucketConfig,
console_config: ConsoleConfig,
@@ -143,23 +153,49 @@ async fn find_garbage_inner(
console_projects.len()
);
// TODO(sharding): batch calls into Console so that we only call once for each TenantId,
// rather than checking the same TenantId for multiple TenantShardId
// Because many tenant shards may look up the same TenantId, we maintain a cache.
let console_cache = Arc::new(std::sync::Mutex::new(ConsoleCache {
projects: console_projects,
not_found: HashSet::new(),
}));
// Enumerate Tenants in S3, and check if each one exists in Console
tracing::info!("Finding all tenants in bucket {}...", bucket_config.bucket);
let tenants = stream_tenants(&s3_client, &target);
let tenants_checked = tenants.map_ok(|t| {
let api_client = cloud_admin_api_client.clone();
let console_projects = &console_projects;
let console_cache = console_cache.clone();
async move {
match console_projects.get(&t.tenant_id) {
// Check cache before issuing API call
let project_data = {
let cache = console_cache.lock().unwrap();
let result = cache.projects.get(&t.tenant_id).cloned();
if result.is_none() && cache.not_found.contains(&t.tenant_id) {
return Ok((t, None));
}
result
};
match project_data {
Some(project_data) => Ok((t, Some(project_data.clone()))),
None => api_client
.find_tenant_project(t.tenant_id)
.await
.map_err(|e| anyhow::anyhow!(e))
.map(|r| (t, r)),
None => {
let project_data = api_client
.find_tenant_project(t.tenant_id)
.await
.map_err(|e| anyhow::anyhow!(e));
// Populate cache with result of API call
{
let mut cache = console_cache.lock().unwrap();
if let Ok(Some(project_data)) = &project_data {
cache.projects.insert(t.tenant_id, project_data.clone());
} else if let Ok(None) = &project_data {
cache.not_found.insert(t.tenant_id);
}
}
project_data.map(|r| (t, r))
}
}
}
});

View File

@@ -17,7 +17,9 @@ use utils::id::TenantId;
#[derive(Serialize)]
pub struct MetadataSummary {
count: usize,
tenant_count: usize,
timeline_count: usize,
timeline_shard_count: usize,
with_errors: HashSet<TenantShardTimelineId>,
with_warnings: HashSet<TenantShardTimelineId>,
with_orphans: HashSet<TenantShardTimelineId>,
@@ -87,7 +89,9 @@ impl MinMaxHisto {
impl MetadataSummary {
fn new() -> Self {
Self {
count: 0,
tenant_count: 0,
timeline_count: 0,
timeline_shard_count: 0,
with_errors: HashSet::new(),
with_warnings: HashSet::new(),
with_orphans: HashSet::new(),
@@ -112,7 +116,7 @@ impl MetadataSummary {
}
fn update_data(&mut self, data: &S3TimelineBlobData) {
self.count += 1;
self.timeline_shard_count += 1;
if let BlobDataParseResult::Parsed {
index_part,
index_part_generation: _,
@@ -158,16 +162,20 @@ impl MetadataSummary {
);
format!(
"Timelines: {0}
With errors: {1}
With warnings: {2}
With orphan layers: {3}
"Tenants: {}
Timelines: {}
Timeline-shards: {}
With errors: {}
With warnings: {}
With orphan layers: {}
Index versions: {version_summary}
Timeline size bytes: {4}
Layer size bytes: {5}
Timeline layer count: {6}
Timeline size bytes: {}
Layer size bytes: {}
Timeline layer count: {}
",
self.count,
self.tenant_count,
self.timeline_count,
self.timeline_shard_count,
self.with_errors.len(),
self.with_warnings.len(),
self.with_orphans.len(),
@@ -182,7 +190,7 @@ Timeline layer count: {6}
}
pub fn is_empty(&self) -> bool {
self.count == 0
self.timeline_shard_count == 0
}
}
@@ -233,8 +241,12 @@ pub async fn scan_metadata(
mut tenant_objects: TenantObjectListing,
timelines: Vec<(TenantShardTimelineId, S3TimelineBlobData)>,
) {
summary.tenant_count += 1;
let mut timeline_ids = HashSet::new();
let mut timeline_generations = HashMap::new();
for (ttid, data) in timelines {
timeline_ids.insert(ttid.timeline_id);
// Stash the generation of each timeline, for later use identifying orphan layers
if let BlobDataParseResult::Parsed {
index_part: _index_part,
@@ -252,6 +264,8 @@ pub async fn scan_metadata(
summary.update_analysis(&ttid, &analysis);
}
summary.timeline_count += timeline_ids.len();
// Identifying orphan layers must be done on a tenant-wide basis, because individual
// shards' layers may be referenced by other shards.
//

View File

@@ -54,6 +54,7 @@ postgres_ffi.workspace = true
pq_proto.workspace = true
remote_storage.workspace = true
safekeeper_api.workspace = true
sha2.workspace = true
sd-notify.workspace = true
storage_broker.workspace = true
tokio-stream.workspace = true

View File

@@ -66,12 +66,10 @@ impl FileStorage {
/// Create file storage for a new timeline, but don't persist it yet.
pub fn create_new(
ttid: &TenantTimelineId,
timeline_dir: Utf8PathBuf,
conf: &SafeKeeperConf,
state: SafeKeeperState,
) -> Result<FileStorage> {
let timeline_dir = conf.timeline_dir(ttid);
let store = FileStorage {
timeline_dir,
conf: conf.clone(),
@@ -277,7 +275,8 @@ mod test {
.await
.expect("failed to create timeline dir");
let state = SafeKeeperState::empty();
let storage = FileStorage::create_new(ttid, conf, state.clone())?;
let timeline_dir = conf.timeline_dir(ttid);
let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?;
Ok((storage, state))
}

View File

@@ -0,0 +1,250 @@
use std::sync::Arc;
use anyhow::{bail, Result};
use camino::Utf8PathBuf;
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
use tokio::{
fs::OpenOptions,
io::{AsyncSeekExt, AsyncWriteExt},
};
use tracing::{info, warn};
use utils::{id::TenantTimelineId, lsn::Lsn};
use crate::{
control_file::{FileStorage, Storage},
pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline},
safekeeper::SafeKeeperState,
timeline::{Timeline, TimelineError},
wal_backup::copy_s3_segments,
wal_storage::{wal_file_paths, WalReader},
GlobalTimelines, SafeKeeperConf,
};
// we don't want to have more than 10 segments on disk after copy, because they take space
const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64;
pub struct Request {
pub source: Arc<Timeline>,
pub until_lsn: Lsn,
pub destination_ttid: TenantTimelineId,
}
pub async fn handle_request(request: Request) -> Result<()> {
// TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
// if LSN will point to the middle of a WAL record, timeline will be in "broken" state
match GlobalTimelines::get(request.destination_ttid) {
// timeline already exists. would be good to check that this timeline is the copy
// of the source timeline, but it isn't obvious how to do that
Ok(_) => return Ok(()),
// timeline not found, we are going to create it
Err(TimelineError::NotFound(_)) => {}
// error, probably timeline was deleted
res => {
res?;
}
}
let conf = &GlobalTimelines::get_global_config();
let ttid = request.destination_ttid;
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
let (mem_state, state) = request.source.get_state().await;
let start_lsn = state.timeline_start_lsn;
if start_lsn == Lsn::INVALID {
bail!("timeline is not initialized");
}
let backup_lsn = mem_state.backup_lsn;
{
let commit_lsn = mem_state.commit_lsn;
let flush_lsn = request.source.get_flush_lsn().await;
info!(
"collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}",
start_lsn, backup_lsn, commit_lsn, flush_lsn
);
assert!(backup_lsn >= start_lsn);
assert!(commit_lsn >= start_lsn);
assert!(flush_lsn >= start_lsn);
if request.until_lsn > flush_lsn {
bail!("requested LSN is beyond the end of the timeline");
}
if request.until_lsn < start_lsn {
bail!("requested LSN is before the start of the timeline");
}
if request.until_lsn > commit_lsn {
warn!("copy_timeline WAL is not fully committed");
}
if backup_lsn < request.until_lsn && request.until_lsn.0 - backup_lsn.0 > MAX_BACKUP_LAG {
// we have a lot of segments that are not backed up. we can try to wait here until
// segments will be backed up to remote storage, but it's not clear how long to wait
bail!("too many segments are not backed up");
}
}
let wal_seg_size = state.server.wal_seg_size as usize;
if wal_seg_size == 0 {
bail!("wal_seg_size is not set");
}
let first_segment = start_lsn.segment_number(wal_seg_size);
let last_segment = request.until_lsn.segment_number(wal_seg_size);
let new_backup_lsn = {
// we can't have new backup_lsn greater than existing backup_lsn or start of the last segment
let max_backup_lsn = backup_lsn.min(Lsn(last_segment * wal_seg_size as u64));
if max_backup_lsn <= start_lsn {
// probably we are starting from the first segment, which was not backed up yet.
// note that start_lsn can be in the middle of the segment
start_lsn
} else {
// we have some segments backed up, so we will assume all WAL below max_backup_lsn is backed up
assert!(max_backup_lsn.segment_offset(wal_seg_size) == 0);
max_backup_lsn
}
};
// all previous segments will be copied inside S3
let first_ondisk_segment = new_backup_lsn.segment_number(wal_seg_size);
assert!(first_ondisk_segment <= last_segment);
assert!(first_ondisk_segment >= first_segment);
copy_s3_segments(
wal_seg_size,
&request.source.ttid,
&request.destination_ttid,
first_segment,
first_ondisk_segment,
)
.await?;
copy_disk_segments(
conf,
&state,
wal_seg_size,
&request.source.ttid,
new_backup_lsn,
request.until_lsn,
&tli_dir_path,
)
.await?;
let mut new_state = SafeKeeperState::new(
&request.destination_ttid,
state.server.clone(),
vec![],
request.until_lsn,
start_lsn,
);
new_state.timeline_start_lsn = start_lsn;
new_state.peer_horizon_lsn = request.until_lsn;
new_state.backup_lsn = new_backup_lsn;
let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?;
file_storage.persist(&new_state).await?;
// now we have a ready timeline in a temp directory
validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
Ok(())
}
async fn copy_disk_segments(
conf: &SafeKeeperConf,
persisted_state: &SafeKeeperState,
wal_seg_size: usize,
source_ttid: &TenantTimelineId,
start_lsn: Lsn,
end_lsn: Lsn,
tli_dir_path: &Utf8PathBuf,
) -> Result<()> {
let mut wal_reader = WalReader::new(
conf.workdir.clone(),
conf.timeline_dir(source_ttid),
persisted_state,
start_lsn,
true,
)?;
let mut buf = [0u8; MAX_SEND_SIZE];
let first_segment = start_lsn.segment_number(wal_seg_size);
let last_segment = end_lsn.segment_number(wal_seg_size);
for segment in first_segment..=last_segment {
let segment_start = segment * wal_seg_size as u64;
let segment_end = segment_start + wal_seg_size as u64;
let copy_start = segment_start.max(start_lsn.0);
let copy_end = segment_end.min(end_lsn.0);
let copy_start = copy_start - segment_start;
let copy_end = copy_end - segment_start;
let wal_file_path = {
let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?;
if segment == last_segment {
partial
} else {
normal
}
};
write_segment(
&mut buf,
&wal_file_path,
wal_seg_size as u64,
copy_start,
copy_end,
&mut wal_reader,
)
.await?;
}
Ok(())
}
async fn write_segment(
buf: &mut [u8],
file_path: &Utf8PathBuf,
wal_seg_size: u64,
from: u64,
to: u64,
reader: &mut WalReader,
) -> Result<()> {
assert!(from <= to);
assert!(to <= wal_seg_size);
let mut file = OpenOptions::new()
.create(true)
.write(true)
.open(&file_path)
.await?;
// maybe fill with zeros, as in wal_storage.rs?
file.set_len(wal_seg_size).await?;
file.seek(std::io::SeekFrom::Start(from)).await?;
let mut bytes_left = to - from;
while bytes_left > 0 {
let len = bytes_left as usize;
let len = len.min(buf.len());
let len = reader.read(&mut buf[..len]).await?;
file.write_all(&buf[..len]).await?;
bytes_left -= len as u64;
}
file.flush().await?;
file.sync_all().await?;
Ok(())
}

View File

@@ -7,13 +7,16 @@ use std::io::Read;
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::bail;
use anyhow::Result;
use camino::Utf8Path;
use chrono::{DateTime, Utc};
use postgres_ffi::XLogSegNo;
use postgres_ffi::MAX_SEND_SIZE;
use serde::Deserialize;
use serde::Serialize;
use sha2::{Digest, Sha256};
use utils::id::NodeId;
use utils::id::TenantTimelineId;
use utils::id::{TenantId, TimelineId};
@@ -25,6 +28,7 @@ use crate::safekeeper::TermHistory;
use crate::SafeKeeperConf;
use crate::send_wal::WalSenderState;
use crate::wal_storage::WalReader;
use crate::GlobalTimelines;
/// Various filters that influence the resulting JSON output.
@@ -300,3 +304,56 @@ fn build_config(config: SafeKeeperConf) -> Config {
wal_backup_enabled: config.wal_backup_enabled,
}
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct TimelineDigestRequest {
pub from_lsn: Lsn,
pub until_lsn: Lsn,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineDigest {
pub sha256: String,
}
pub async fn calculate_digest(
tli: &Arc<crate::timeline::Timeline>,
request: TimelineDigestRequest,
) -> Result<TimelineDigest> {
if request.from_lsn > request.until_lsn {
bail!("from_lsn is greater than until_lsn");
}
let conf = GlobalTimelines::get_global_config();
let (_, persisted_state) = tli.get_state().await;
if persisted_state.timeline_start_lsn > request.from_lsn {
bail!("requested LSN is before the start of the timeline");
}
let mut wal_reader = WalReader::new(
conf.workdir.clone(),
tli.timeline_dir.clone(),
&persisted_state,
request.from_lsn,
true,
)?;
let mut hasher = Sha256::new();
let mut buf = [0u8; MAX_SEND_SIZE];
let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize;
while bytes_left > 0 {
let bytes_to_read = std::cmp::min(buf.len(), bytes_left);
let bytes_read = wal_reader.read(&mut buf[..bytes_to_read]).await?;
if bytes_read == 0 {
bail!("wal_reader.read returned 0 bytes");
}
hasher.update(&buf[..bytes_read]);
bytes_left -= bytes_read;
}
let digest = hasher.finalize();
let digest = hex::encode(digest);
Ok(TimelineDigest { sha256: digest })
}

View File

@@ -2,7 +2,7 @@ use hyper::{Body, Request, Response, StatusCode, Uri};
use once_cell::sync::Lazy;
use postgres_ffi::WAL_SEGMENT_SIZE;
use safekeeper_api::models::SkTimelineInfo;
use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fmt;
@@ -14,19 +14,21 @@ use tokio::fs::File;
use tokio::io::AsyncReadExt;
use tokio_util::sync::CancellationToken;
use utils::failpoint_support::failpoints_handler;
use utils::http::request::parse_query_param;
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::info_span;
use tracing::{info_span, Instrument};
use utils::http::endpoint::{request_span, ChannelWriter};
use crate::debug_dump::TimelineDigestRequest;
use crate::receive_wal::WalReceiverState;
use crate::safekeeper::Term;
use crate::safekeeper::{ServerInfo, TermLsn};
use crate::send_wal::WalSenderState;
use crate::timeline::PeerInfo;
use crate::{debug_dump, pull_timeline};
use crate::{copy_timeline, debug_dump, pull_timeline};
use crate::timelines_global_map::TimelineDeleteForceResult;
use crate::GlobalTimelines;
@@ -204,6 +206,56 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
json_response(StatusCode::OK, resp)
}
async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let request_data: TimelineCopyRequest = json_request(&mut request).await?;
let ttid = TenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "source_timeline_id")?,
);
let source = GlobalTimelines::get(ttid)?;
copy_timeline::handle_request(copy_timeline::Request{
source,
until_lsn: request_data.until_lsn,
destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id),
})
.instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let ttid = TenantTimelineId::new(
parse_request_param(&request, "tenant_id")?,
parse_request_param(&request, "timeline_id")?,
);
check_permission(&request, Some(ttid.tenant_id))?;
let from_lsn: Option<Lsn> = parse_query_param(&request, "from_lsn")?;
let until_lsn: Option<Lsn> = parse_query_param(&request, "until_lsn")?;
let request = TimelineDigestRequest {
from_lsn: from_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!(
"from_lsn is required"
)))?,
until_lsn: until_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!(
"until_lsn is required"
)))?,
};
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
let response = debug_dump::calculate_digest(&tli, request)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, response)
}
/// Download a file from the timeline directory.
// TODO: figure out a better way to copy files between safekeepers
async fn timeline_files_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -472,11 +524,18 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
"/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
|r| request_span(r, timeline_files_handler),
)
.post(
"/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
|r| request_span(r, timeline_copy_handler),
)
// for tests
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
request_span(r, record_safekeeper_info)
})
.get("/v1/debug_dump", |r| request_span(r, dump_debug_handler))
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| {
request_span(r, timeline_digest_handler)
})
}
#[cfg(test)]

View File

@@ -16,6 +16,7 @@ mod auth;
pub mod broker;
pub mod control_file;
pub mod control_file_upgrade;
pub mod copy_timeline;
pub mod debug_dump;
pub mod handler;
pub mod http;

View File

@@ -1,16 +1,24 @@
use std::sync::Arc;
use camino::Utf8PathBuf;
use camino_tempfile::Utf8TempDir;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use anyhow::{bail, Context, Result};
use tokio::io::AsyncWriteExt;
use tracing::info;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use utils::{
id::{TenantId, TenantTimelineId, TimelineId},
lsn::Lsn,
};
use crate::{
control_file, debug_dump,
http::routes::TimelineStatus,
timeline::{Timeline, TimelineError},
wal_storage::{self, Storage},
GlobalTimelines,
GlobalTimelines, SafeKeeperConf,
};
/// Info about timeline on safekeeper ready for reporting.
@@ -91,7 +99,7 @@ pub async fn handle_request(request: Request) -> Result<Response> {
async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response> {
let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
info!(
"Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
"pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}",
ttid,
host,
status.commit_lsn,
@@ -121,14 +129,14 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
if dump.timelines.len() != 1 {
bail!(
"Expected to fetch single timeline, got {} timelines",
"expected to fetch single timeline, got {} timelines",
dump.timelines.len()
);
}
let timeline = dump.timelines.into_iter().next().unwrap();
let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!(
"Timeline {} doesn't have disk content",
"timeline {} doesn't have disk content",
ttid
))?;
@@ -155,29 +163,12 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
filenames.insert(0, "safekeeper.control".to_string());
info!(
"Downloading {} files from safekeeper {}",
"downloading {} files from safekeeper {}",
filenames.len(),
host
);
// Creating temp directory for a new timeline. It needs to be
// located on the same filesystem as the rest of the timelines.
// conf.workdir is usually /storage/safekeeper/data
// will try to transform it into /storage/safekeeper/tmp
let temp_base = conf
.workdir
.parent()
.ok_or(anyhow::anyhow!("workdir has no parent"))?
.join("tmp");
tokio::fs::create_dir_all(&temp_base).await?;
let tli_dir = camino_tempfile::Builder::new()
.suffix("_temptli")
.prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
.tempdir_in(temp_base)?;
let tli_dir_path = tli_dir.path().to_path_buf();
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
// Note: some time happens between fetching list of files and fetching files themselves.
// It's possible that some files will be removed from safekeeper and we will fail to fetch them.
@@ -201,47 +192,105 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
// TODO: fsync?
// Let's create timeline from temp directory and verify that it's correct
let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?;
info!(
"finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
ttid, commit_lsn, flush_lsn
);
assert!(status.commit_lsn <= status.flush_lsn);
let control_path = tli_dir_path.join("safekeeper.control");
// Finally, load the timeline.
let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?;
Ok(Response {
safekeeper_host: host,
})
}
/// Create temp directory for a new timeline. It needs to be located on the same
/// filesystem as the rest of the timelines. It will be automatically deleted when
/// Utf8TempDir goes out of scope.
pub async fn create_temp_timeline_dir(
conf: &SafeKeeperConf,
ttid: TenantTimelineId,
) -> Result<(Utf8TempDir, Utf8PathBuf)> {
// conf.workdir is usually /storage/safekeeper/data
// will try to transform it into /storage/safekeeper/tmp
let temp_base = conf
.workdir
.parent()
.ok_or(anyhow::anyhow!("workdir has no parent"))?
.join("tmp");
tokio::fs::create_dir_all(&temp_base).await?;
let tli_dir = camino_tempfile::Builder::new()
.suffix("_temptli")
.prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id))
.tempdir_in(temp_base)?;
let tli_dir_path = tli_dir.path().to_path_buf();
Ok((tli_dir, tli_dir_path))
}
/// Do basic validation of a temp timeline, before moving it to the global map.
pub async fn validate_temp_timeline(
conf: &SafeKeeperConf,
ttid: TenantTimelineId,
path: &Utf8PathBuf,
) -> Result<(Lsn, Lsn)> {
let control_path = path.join("safekeeper.control");
let control_store = control_file::FileStorage::load_control_file(control_path)?;
if control_store.server.wal_seg_size == 0 {
bail!("wal_seg_size is not set");
}
let wal_store =
wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?;
let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?;
let commit_lsn = status.commit_lsn;
let commit_lsn = control_store.commit_lsn;
let flush_lsn = wal_store.flush_lsn();
info!(
"Finished downloading timeline {}, commit_lsn={}, flush_lsn={}",
ttid, commit_lsn, flush_lsn
);
assert!(status.commit_lsn <= status.flush_lsn);
Ok((commit_lsn, flush_lsn))
}
/// Move timeline from a temp directory to the main storage, and load it to the global map.
/// This operation is done under a lock to prevent bugs if several concurrent requests are
/// trying to load the same timeline. Note that it doesn't guard against creating the
/// timeline with the same ttid, but no one should be doing this anyway.
pub async fn load_temp_timeline(
conf: &SafeKeeperConf,
ttid: TenantTimelineId,
tmp_path: &Utf8PathBuf,
) -> Result<Arc<Timeline>> {
// Take a lock to prevent concurrent loadings
let load_lock = GlobalTimelines::loading_lock().await;
let guard = load_lock.lock().await;
if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) {
bail!("timeline already exists, cannot overwrite it")
}
// Move timeline dir to the correct location
let timeline_path = conf.timeline_dir(&ttid);
info!(
"Moving timeline {} from {} to {}",
ttid, tli_dir_path, timeline_path
"moving timeline {} from {} to {}",
ttid, tmp_path, timeline_path
);
tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?;
tokio::fs::rename(tli_dir_path, &timeline_path).await?;
tokio::fs::rename(tmp_path, &timeline_path).await?;
let tli = GlobalTimelines::load_timeline(ttid)
let tli = GlobalTimelines::load_timeline(&guard, ttid)
.await
.context("Failed to load timeline after copy")?;
info!(
"Loaded timeline {}, flush_lsn={}",
"loaded timeline {}, flush_lsn={}",
ttid,
tli.get_flush_lsn().await
);
Ok(Response {
safekeeper_host: host,
})
Ok(tli)
}

View File

@@ -141,7 +141,8 @@ impl SharedState {
// We don't want to write anything to disk, because we may have existing timeline there.
// These functions should not change anything on disk.
let control_store = control_file::FileStorage::create_new(ttid, conf, state)?;
let timeline_dir = conf.timeline_dir(ttid);
let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?;
let wal_store =
wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?;
let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?;

View File

@@ -21,8 +21,12 @@ struct GlobalTimelinesState {
timelines: HashMap<TenantTimelineId, Arc<Timeline>>,
wal_backup_launcher_tx: Option<Sender<TenantTimelineId>>,
conf: Option<SafeKeeperConf>,
load_lock: Arc<tokio::sync::Mutex<TimelineLoadLock>>,
}
// Used to prevent concurrent timeline loading.
pub struct TimelineLoadLock;
impl GlobalTimelinesState {
/// Get configuration, which must be set once during init.
fn get_conf(&self) -> &SafeKeeperConf {
@@ -63,6 +67,7 @@ static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
timelines: HashMap::new(),
wal_backup_launcher_tx: None,
conf: None,
load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)),
})
});
@@ -174,8 +179,16 @@ impl GlobalTimelines {
Ok(())
}
/// Take a lock for timeline loading.
pub async fn loading_lock() -> Arc<tokio::sync::Mutex<TimelineLoadLock>> {
TIMELINES_STATE.lock().unwrap().load_lock.clone()
}
/// Load timeline from disk to the memory.
pub async fn load_timeline(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
pub async fn load_timeline<'a>(
_guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>,
ttid: TenantTimelineId,
) -> Result<Arc<Timeline>> {
let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies();
match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) {

View File

@@ -7,7 +7,7 @@ use tokio::task::JoinHandle;
use utils::id::NodeId;
use std::cmp::min;
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
@@ -531,3 +531,62 @@ pub async fn read_object(
Ok(Box::pin(reader))
}
/// Copy segments from one timeline to another. Used in copy_timeline.
pub async fn copy_s3_segments(
wal_seg_size: usize,
src_ttid: &TenantTimelineId,
dst_ttid: &TenantTimelineId,
from_segment: XLogSegNo,
to_segment: XLogSegNo,
) -> Result<()> {
const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024;
let storage = REMOTE_STORAGE
.get()
.expect("failed to get remote storage")
.as_ref()
.unwrap();
let relative_dst_path =
Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string());
let remote_path = RemotePath::new(&relative_dst_path)?;
let files = storage.list_files(Some(&remote_path)).await?;
let uploaded_segments = &files
.iter()
.filter_map(|file| file.object_name().map(ToOwned::to_owned))
.collect::<HashSet<_>>();
debug!(
"these segments have already been uploaded: {:?}",
uploaded_segments
);
let relative_src_path =
Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string());
for segno in from_segment..to_segment {
if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 {
info!("copied all segments from {} until {}", from_segment, segno);
}
let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size);
if uploaded_segments.contains(&segment_name) {
continue;
}
debug!("copying segment {}", segment_name);
let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;
storage.copy_object(&from, &to).await?;
}
info!(
"finished copying segments from {} until {}",
from_segment, to_segment
);
Ok(())
}

View File

@@ -728,7 +728,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
}
/// Helper returning full path to WAL segment file and its .partial brother.
fn wal_file_paths(
pub fn wal_file_paths(
timeline_dir: &Utf8Path,
segno: XLogSegNo,
wal_seg_size: usize,

View File

@@ -63,7 +63,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
If those files already exist, we will overwrite them.
Returns basepath for files with captured output.
"""
assert type(cmd) is list
assert isinstance(cmd, list)
base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
basepath = os.path.join(capture_dir, base)
stdout_filename = basepath + ".stdout"

View File

@@ -6,5 +6,5 @@ set -euox pipefail
echo 'Reformatting Rust code'
cargo fmt
echo 'Reformatting Python code'
poetry run ruff --fix test_runner scripts
poetry run black test_runner scripts
poetry run ruff check --fix test_runner scripts
poetry run ruff format test_runner scripts

View File

@@ -1166,8 +1166,8 @@ class AbstractNeonCli(abc.ABC):
If `local_binpath` is true, then we are invoking a test utility
"""
assert type(arguments) == list
assert type(self.COMMAND) == str
assert isinstance(arguments, list)
assert isinstance(self.COMMAND, str)
if local_binpath:
# Test utility
@@ -3108,6 +3108,28 @@ class SafekeeperHttpClient(requests.Session):
assert isinstance(res_json, dict)
return res_json
def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]):
res = self.post(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy",
json=body,
)
res.raise_for_status()
def timeline_digest(
self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn
) -> Dict[str, Any]:
res = self.get(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest",
params={
"from_lsn": str(from_lsn),
"until_lsn": str(until_lsn),
},
)
res.raise_for_status()
res_json = res.json()
assert isinstance(res_json, dict)
return res_json
def timeline_create(
self,
tenant_id: TenantId,

View File

@@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload")
self.verbose_error(res)
def tenant_secondary_download(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download")
self.verbose_error(res)
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
assert "tenant_id" not in config.keys()
res = self.put(
@@ -361,9 +365,9 @@ class PageserverHttpClient(requests.Session):
assert isinstance(res, dict)
assert TenantId(res["id"]) == tenant_id
size = res["size"]
assert type(size) == int
assert isinstance(size, int)
inputs = res["inputs"]
assert type(inputs) is dict
assert isinstance(inputs, dict)
return (size, inputs)
def tenant_size_debug(self, tenant_id: TenantId) -> str:

View File

@@ -42,9 +42,10 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare):
# Please do not alter the label for the query, as it is used to identify it.
# Labels for ClickBench queries match the labels in ClickBench reports
# on https://benchmark.clickhouse.com/ (the DB size may differ).
#
# Disable auto formatting for the list of queries so that it's easier to read
# fmt: off
QUERIES: Tuple[LabelledQuery, ...] = (
# Disable `black` formatting for the list of queries so that it's easier to read
# fmt: off
### ClickBench queries:
LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"),
LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"),
@@ -96,8 +97,8 @@ QUERIES: Tuple[LabelledQuery, ...] = (
# LabelledQuery("NQ0", r"..."),
# LabelledQuery("NQ1", r"..."),
# ...
# fmt: on
)
# fmt: on
EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)"

View File

@@ -32,8 +32,7 @@ def pg_compare(request) -> PgCompare:
else:
assert (
len(x) == 2
), f"request param ({request.param}) should have a format of \
`neon_{{safekeepers_enable_fsync}}`"
), f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`"
# `NeonCompare` interface
neon_env_builder = request.getfixturevalue("neon_env_builder")

View File

@@ -194,12 +194,13 @@ def test_fully_custom_config(positive_env: NeonEnv):
assert set(our_tenant_config.effective_config.keys()) == set(
fully_custom_config.keys()
), "ensure we cover all config options"
assert {
k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
for k in fully_custom_config.keys()
} == {
k: True for k in fully_custom_config.keys()
}, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
assert (
{
k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k]
for k in fully_custom_config.keys()
}
== {k: True for k in fully_custom_config.keys()}
), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything"
ps_http.tenant_detach(tenant_id)
env.pageserver.tenant_attach(tenant_id, config=fully_custom_config)

View File

@@ -186,9 +186,7 @@ def test_backward_compatibility(
else:
raise
assert (
not breaking_changes_allowed
), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
@check_ondisk_data_compatibility_if_enabled
@@ -247,9 +245,7 @@ def test_forward_compatibility(
else:
raise
assert (
not breaking_changes_allowed
), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path):

View File

@@ -2,7 +2,6 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft
# Restart nodes with WAL end having specially crafted shape, like last record
# crossing segment boundary, to test decoding issues.

View File

@@ -102,9 +102,7 @@ def test_basic_eviction(
), f"Did not expect to find {local_layer} layer after evicting"
empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
assert (
not empty_layers
), f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}"
evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id)
assert (

View File

@@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)
ps_http = env.pageserver.http_client()

View File

@@ -145,8 +145,7 @@ def expect_updated_msg_lsn(
last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"])
assert (
prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn
), f"the last received message's LSN {last_msg_lsn} hasn't been updated \
compared to the previous message's LSN {prev_msg_lsn}"
), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}"
return last_msg_lsn

View File

@@ -254,7 +254,9 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
metadata_summary = S3Scrubber(
neon_env_builder.test_output_dir, neon_env_builder
).scan_metadata()
assert metadata_summary["count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline
assert metadata_summary["timeline_count"] == 1
assert metadata_summary["timeline_shard_count"] == 1
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]

Some files were not shown because too many files have changed in this diff Show More