Compare commits


1 Commit

Author: Anastasia Lubennikova
SHA1: efaf2c6663
Message: RFC Merged compute image
Date: 2024-04-02 16:12:12 +01:00
283 changed files with 8128 additions and 15966 deletions

View File

@@ -22,7 +22,6 @@
!s3_scrubber/
!safekeeper/
!storage_broker/
!storage_controller/
!trace/
!vendor/postgres-*/
!workspace_hack/

.envrc (1 line changed)
View File

@@ -1 +0,0 @@
use flake . --impure

View File

@@ -150,7 +150,7 @@ runs:
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
# and to keep files on the host to upload them to the database
time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"
time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
# Generate redirect
cat <<EOF > ${WORKDIR}/index.html

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
outputs:
dsn:
description: 'Created Branch DSN (for main database)'

View File

@@ -13,7 +13,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
runs:
using: "composite"

View File

@@ -13,7 +13,7 @@ inputs:
default: 15
api_host:
desctiption: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
provisioner:
desctiption: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
desctiption: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
runs:
using: "composite"

View File

@@ -18,7 +18,6 @@ on:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: false
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -21,7 +21,6 @@ defaults:
concurrency:
group: build-build-tools-image-${{ inputs.image-tag }}
cancel-in-progress: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}

View File

@@ -477,7 +477,6 @@ jobs:
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
@@ -736,7 +735,7 @@ jobs:
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/setup-buildx-action@v3
- uses: docker/login-action@v3
with:
@@ -793,7 +792,7 @@ jobs:
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/setup-buildx-action@v3
with:
# Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -866,7 +865,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.28.1
VM_BUILDER_VERSION: v0.23.2
steps:
- name: Checkout
@@ -1134,6 +1133,8 @@ jobs:
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \

View File

@@ -28,9 +28,7 @@ jobs:
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
env:
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
COMMIT_SHA: ${{ github.sha }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LAST_BUILD_TOOLS_SHA=$(

View File

@@ -20,7 +20,6 @@ defaults:
concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false
permissions: {}

.gitignore (vendored; 4 lines changed)
View File

@@ -23,7 +23,3 @@ compaction-suite-results.*
# pgindent typedef lists
*.list
# nix dev env
.direnv
.devenv

View File

@@ -1,5 +1,5 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/storage_controller @neondatabase/storage
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/remote_storage/ @neondatabase/storage

Cargo.lock (generated; 544 lines changed)

File diff suppressed because it is too large.

View File

@@ -3,7 +3,7 @@ resolver = "2"
members = [
"compute_tools",
"control_plane",
"control_plane/storcon_cli",
"control_plane/attachment_service",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
"proxy",
"safekeeper",
"storage_broker",
"storage_controller",
"s3_scrubber",
"workspace_hack",
"trace",
@@ -44,7 +43,6 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
@@ -98,7 +96,7 @@ http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.13.0"
hyper-tungstenite = "0.11"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -107,8 +105,7 @@ lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
measured = { version = "0.0.13", features=["default", "lasso"] }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -157,12 +154,11 @@ socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
# https://github.com/nical/rust_debug/pull/4
svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.3"
test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"

View File

@@ -58,12 +58,6 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
&& mv protoc/include/google /usr/local/include/google \
&& rm -rf protoc.zip protoc
# s5cmd
ENV S5CMD_VERSION=2.2.2
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
&& chmod +x s5cmd \
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM
ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \

View File

@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
ifndef DISABLE_HOMEBREW
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# Use -C option so that when PostgreSQL "make install" installs the

View File

@@ -818,15 +818,9 @@ impl ComputeNode {
Client::connect(zenith_admin_connstr.as_str(), NoTls)
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
// Disable forwarding so that users don't get a cloud_admin role
let mut func = || {
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
Ok::<_, anyhow::Error>(())
};
func().context("apply_config setup cloud_admin")?;
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connstring with expected name
@@ -838,29 +832,24 @@ impl ComputeNode {
};
// Disable DDL forwarding because control plane already knows about these roles/databases.
client
.simple_query("SET neon.forward_ddl = false")
.context("apply_config SET neon.forward_ddl = false")?;
client.simple_query("SET neon.forward_ddl = false")?;
// Proceed with post-startup configuration. Note, that order of operations is important.
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
handle_roles(spec, &mut client).context("apply_config handle_roles")?;
handle_databases(spec, &mut client).context("apply_config handle_databases")?;
handle_role_deletions(spec, connstr.as_str(), &mut client)
.context("apply_config handle_role_deletions")?;
create_neon_superuser(spec, &mut client)?;
cleanup_instance(&mut client)?;
handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?;
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
handle_grants(
spec,
&mut client,
connstr.as_str(),
self.has_feature(ComputeFeature::AnonExtension),
)
.context("apply_config handle_grants")?;
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
create_availability_check_data(&mut client)
.context("apply_config create_availability_check_data")?;
)?;
handle_extensions(spec, &mut client)?;
handle_extension_neon(&mut client)?;
create_availability_check_data(&mut client)?;
// 'Close' connection
drop(client);
@@ -868,7 +857,7 @@ impl ComputeNode {
// Run migrations separately to not hold up cold starts
thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations")
handle_migrations(&mut client)
});
Ok(())
}
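
For context, the error-labelling pattern in this hunk can be shown in isolation. A minimal sketch, assuming only the anyhow crate (step_one/step_two are hypothetical stand-ins for the simple_query calls): group several fallible calls in a closure so a single .context() call labels the whole group, while a standalone call carries its own label.

use anyhow::{Context, Result};

// Hypothetical stand-ins for fallible steps such as client.simple_query(...).
fn step_one() -> Result<()> { Ok(()) }
fn step_two() -> Result<()> { Ok(()) }

fn apply_settings() -> Result<()> {
    // Several steps share one context label via a closure...
    let group = || {
        step_one()?;
        step_two()?;
        Ok::<_, anyhow::Error>(())
    };
    group().context("apply_settings: initial setup")?;

    // ...while an individual step can get its own label.
    step_two().context("apply_settings: step_two")?;
    Ok(())
}

fn main() -> Result<()> {
    apply_settings()
}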

View File

@@ -6,8 +6,8 @@ use std::path::Path;
use anyhow::Result;
use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -92,27 +92,6 @@ pub fn write_postgres_conf(
}
}
if cfg!(target_os = "linux") {
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
// disabled), then the control plane has enabled swap and we should set
// dynamic_shared_memory_type = 'mmap'.
//
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
// ignore any errors - they may be expected to occur under certain situations (e.g. when
// not running in Linux).
.unwrap_or_else(|_| String::new());
if overcommit_memory_contents.trim() == "2" {
let opt = GenericOption {
name: "dynamic_shared_memory_type".to_owned(),
value: Some("mmap".to_owned()),
vartype: "enum".to_owned(),
};
write!(file, "{}", opt.to_pg_setting())?;
}
}
// If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?;

View File

@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
format!("'{}'", res)
}
pub trait GenericOptionExt {
trait GenericOptionExt {
fn to_pg_option(&self) -> String;
fn to_pg_setting(&self) -> String;
}

View File

@@ -2,7 +2,7 @@ use std::fs::File;
use std::path::Path;
use std::str::FromStr;
use anyhow::{anyhow, bail, Context, Result};
use anyhow::{anyhow, bail, Result};
use postgres::config::Config;
use postgres::{Client, NoTls};
use reqwest::StatusCode;
@@ -698,8 +698,7 @@ pub fn handle_grants(
// it is important to run this after all grants
if enable_anon_extension {
handle_extension_anon(spec, &db.owner, &mut db_client, false)
.context("handle_grants handle_extension_anon")?;
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
}
}
@@ -814,36 +813,28 @@ $$;"#,
// Add new migrations below.
];
let mut func = || {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
let query = "SELECT id FROM neon_migration.migration_id";
let row = client
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
query = "SELECT id FROM neon_migration.migration_id";
let row = client.query_one(query, &[])?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
client
.simple_query(query)
.context("handle_migrations begin")?;
query = "BEGIN";
client.simple_query(query)?;
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
@@ -851,9 +842,7 @@ $$;"#,
info!("Skip migration id={}", current_migration);
} else {
info!("Running migration:\n{}\n", migration);
client.simple_query(migration).with_context(|| {
format!("handle_migrations current_migration={}", current_migration)
})?;
client.simple_query(migration)?;
}
current_migration += 1;
}
@@ -861,14 +850,10 @@ $$;"#,
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client
.simple_query(&setval)
.context("handle_migrations update id")?;
client.simple_query(&setval)?;
let query = "COMMIT";
client
.simple_query(query)
.context("handle_migrations commit")?;
query = "COMMIT";
client.simple_query(query)?;
info!(
"Ran {} migrations",

View File

@@ -1,5 +1,5 @@
[package]
name = "storage_controller"
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true
@@ -25,7 +25,6 @@ git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
itertools.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
@@ -45,8 +44,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,15 +1,12 @@
use std::sync::Arc;
use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::local_env::LocalEnv;
use futures::StreamExt;
use hyper::{Method, StatusCode};
use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
use postgres_connection::parse_host_port;
use serde::{Deserialize, Serialize};
use tokio_util::sync::CancellationToken;
use tracing::{info_span, Instrument};
use utils::{
backoff::{self},
id::{NodeId, TenantId},
@@ -19,30 +16,16 @@ use crate::service::Config;
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
pub(crate) const API_CONCURRENCY: usize = 32;
struct UnshardedComputeHookTenant {
// Which node is this tenant attached to
node_id: NodeId,
// Must hold this lock to send a notification.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
// Must hold this lock to send a notification. The contents represent
// the last successfully sent notification, and are used to coalesce multiple
// updates by only sending when there is a chance since our last successful send.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
enum ComputeHookTenant {
Unsharded(UnshardedComputeHookTenant),
Unsharded(NodeId),
Sharded(ShardedComputeHookTenant),
}
@@ -54,20 +37,9 @@ impl ComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
send_lock: Arc::default(),
})
} else {
Self::Unsharded(UnshardedComputeHookTenant {
node_id,
send_lock: Arc::default(),
})
}
}
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
match self {
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
Self::Unsharded(node_id)
}
}
@@ -80,8 +52,8 @@ impl ComputeHookTenant {
node_id: NodeId,
) {
match self {
Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
unsharded_tenant.node_id = node_id
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
*existing_node_id = node_id
}
Self::Sharded(sharded_tenant)
if sharded_tenant.stripe_size == stripe_size
@@ -108,14 +80,14 @@ impl ComputeHookTenant {
}
}
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
/// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
@@ -148,44 +120,14 @@ pub(crate) enum NotifyError {
Fatal(StatusCode),
}
enum MaybeSendResult {
// Please send this request while holding the lock, and if you succeed then write
// the request into the lock.
Transmit(
(
ComputeHookNotifyRequest,
tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
),
),
// Something requires sending, but you must wait for a current sender then call again
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
// Nothing requires sending
Noop,
}
impl ComputeHookTenant {
fn maybe_send(
&self,
tenant_id: TenantId,
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
) -> MaybeSendResult {
let locked = match lock {
Some(already_locked) => already_locked,
None => {
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
};
locked
}
};
let request = match self {
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
node_id: unsharded_tenant.node_id,
node_id: *node_id,
}],
stripe_size: None,
}),
@@ -209,25 +151,12 @@ impl ComputeHookTenant {
// Sharded tenant doesn't yet have information for all its shards
tracing::info!(
"ComputeHookTenant::maybe_send: not enough shards ({}/{})",
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
sharded_tenant.shards.len(),
sharded_tenant.shard_count.count()
);
None
}
};
match request {
None => {
// Not yet ready to emit a notification
tracing::info!("Tenant isn't yet ready to emit a notification");
MaybeSendResult::Noop
}
Some(request) if Some(&request) == locked.as_ref() => {
// No change from the last value successfully sent
MaybeSendResult::Noop
}
Some(request) => MaybeSendResult::Transmit((request, locked)),
}
}
}
@@ -237,19 +166,8 @@ impl ComputeHookTenant {
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
// Concurrency limiter, so that we do not overload the cloud control plane when updating
// large numbers of tenants (e.g. when failing over after a node failure)
api_concurrency: tokio::sync::Semaphore,
// This lock is only used in testing enviroments, to serialize calls into neon_lock
neon_local_lock: tokio::sync::Mutex<()>,
// We share a client across all notifications to enable connection re-use etc when
// sending large numbers of notifications
client: reqwest::Client,
}
impl ComputeHook {
@@ -259,30 +177,18 @@ impl ComputeHook {
.clone()
.map(|jwt| format!("Bearer {}", jwt));
let client = reqwest::ClientBuilder::new()
.timeout(NOTIFY_REQUEST_TIMEOUT)
.build()
.expect("Failed to construct HTTP client");
Self {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
client,
}
}
/// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local(
&self,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
@@ -299,7 +205,7 @@ impl ComputeHook {
} = reconfigure_request;
let compute_pageservers = shards
.iter()
.into_iter()
.map(|shard| {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
@@ -311,10 +217,10 @@ impl ComputeHook {
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), *stripe_size)
.reconfigure(compute_pageservers.clone(), stripe_size)
.await?;
}
}
@@ -324,11 +230,12 @@ impl ComputeHook {
async fn do_notify_iteration(
&self,
client: &reqwest::Client,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let req = self.client.request(Method::PUT, url);
let req = client.request(Method::PUT, url);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
@@ -391,21 +298,12 @@ impl ComputeHook {
async fn do_notify(
&self,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
// We hold these semaphore units across all retries, rather than only across each
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
// time out waiting for a semaphore.
let _units = self
.api_concurrency
.acquire()
.await
// Interpret closed semaphore as shutdown
.map_err(|_| NotifyError::ShuttingDown)?;
let client = reqwest::Client::new();
backoff::retry(
|| self.do_notify_iteration(url, reconfigure_request, cancel),
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|e| {
matches!(
e,
@@ -422,149 +320,6 @@ impl ComputeHook {
.and_then(|x| x)
}
/// Synchronous phase: update the per-tenant state for the next intended notification
fn notify_prepare(
&self,
tenant_shard_id: TenantShardId,
node_id: NodeId,
stripe_size: ShardStripeSize,
) -> MaybeSendResult {
let mut state_locked = self.state.lock().unwrap();
use std::collections::hash_map::Entry;
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
tenant.maybe_send(tenant_shard_id.tenant_id, None)
}
async fn notify_execute(
&self,
maybe_send_result: MaybeSendResult,
tenant_shard_id: TenantShardId,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
// Process result: we may get an update to send, or we may have to wait for a lock
// before trying again.
let (request, mut send_lock_guard) = match maybe_send_result {
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::AwaitLock(send_lock) => {
let send_locked = tokio::select! {
guard = send_lock.lock_owned() => {guard},
_ = cancel.cancelled() => {
return Err(NotifyError::ShuttingDown)
}
};
// Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
// we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
// try_lock.
let state_locked = self.state.lock().unwrap();
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
return Ok(());
};
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
MaybeSendResult::AwaitLock(_) => {
unreachable!("We supplied lock guard")
}
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
}
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
};
let result = if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, &request, cancel).await
} else {
self.do_notify_local(&request).await.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
};
if result.is_ok() {
// Before dropping the send lock, stash the request we just sent so that
// subsequent callers can avoid redundantly re-sending the same thing.
*send_lock_guard = Some(request);
}
result
}
/// Infallible synchronous fire-and-forget version of notify(), that sends its results to
/// a channel. Something should consume the channel and arrange to try notifying again
/// if something failed.
pub(super) fn notify_background(
self: &Arc<Self>,
notifications: Vec<(TenantShardId, NodeId, ShardStripeSize)>,
result_tx: tokio::sync::mpsc::Sender<Result<(), (TenantShardId, NotifyError)>>,
cancel: &CancellationToken,
) {
let mut maybe_sends = Vec::new();
for (tenant_shard_id, node_id, stripe_size) in notifications {
let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size);
maybe_sends.push((tenant_shard_id, maybe_send_result))
}
let this = self.clone();
let cancel = cancel.clone();
tokio::task::spawn(async move {
// Construct an async stream of futures to invoke the compute notify function: we do this
// in order to subsequently use .buffered() on the stream to execute with bounded parallelism. The
// ComputeHook semaphore already limits concurrency, but this way we avoid constructing+polling lots of futures which
// would mostly just be waiting on that semaphore.
let mut stream = futures::stream::iter(maybe_sends)
.map(|(tenant_shard_id, maybe_send_result)| {
let this = this.clone();
let cancel = cancel.clone();
async move {
this
.notify_execute(maybe_send_result, tenant_shard_id, &cancel)
.await.map_err(|e| (tenant_shard_id, e))
}.instrument(info_span!(
"notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()
))
})
.buffered(API_CONCURRENCY);
loop {
tokio::select! {
next = stream.next() => {
match next {
Some(r) => {
result_tx.send(r).await.ok();
},
None => {
tracing::info!("Finished sending background compute notifications");
break;
}
}
},
_ = cancel.cancelled() => {
tracing::info!("Shutdown while running background compute notifications");
break;
}
};
}
});
}
/// Call this to notify the compute (postgres) tier of new pageservers to use
/// for a tenant. notify() is called by each shard individually, and this function
/// will decide whether an update to the tenant is sent. An update is sent on the
@@ -588,9 +343,42 @@ impl ComputeHook {
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let maybe_send_result = self.notify_prepare(tenant_shard_id, node_id, stripe_size);
self.notify_execute(maybe_send_result, tenant_shard_id, cancel)
.await
let mut locked = self.state.lock().await;
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};
if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, reconfigure_request, cancel)
.await
} else {
self.do_notify_local(reconfigure_request)
.await
.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
}
}
}
@@ -614,22 +402,21 @@ pub(crate) mod tests {
NodeId(1),
);
// An unsharded tenant is always ready to emit a notification, but won't
// send the same one twice
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 1);
assert!(request.stripe_size.is_none());
// Simulate successful send
*guard = Some(request);
drop(guard);
// Try asking again: this should be a no-op
let send_result = tenant_state.maybe_send(tenant_id, None);
assert!(matches!(send_result, MaybeSendResult::Noop));
// An unsharded tenant is always ready to emit a notification
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
1
);
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size
.is_none());
// Writing the first shard of a multi-sharded situation (i.e. in a split)
// resets the tenant state and puts it in an non-notifying state (need to
@@ -643,10 +430,7 @@ pub(crate) mod tests {
ShardStripeSize(32768),
NodeId(1),
);
assert!(matches!(
tenant_state.maybe_send(tenant_id, None),
MaybeSendResult::Noop
));
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
// Writing the second shard makes it ready to notify
tenant_state.update(
@@ -659,16 +443,22 @@ pub(crate) mod tests {
NodeId(1),
);
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 2);
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
// Simulate successful send
*guard = Some(request);
drop(guard);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
2
);
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size,
Some(ShardStripeSize(32768))
);
Ok(())
}
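
The core idea introduced for ComputeHookTenant above — remember the last successfully sent notification under a lock, and skip a send that would repeat it — can be sketched independently. This is a simplified illustration, assuming tokio and a toy Request type, not the storage controller's actual API:

use std::sync::Arc;
use tokio::sync::Mutex;

#[derive(Clone, PartialEq, Debug)]
struct Request(String);

/// Returns true if `next` was (notionally) transmitted, false if it was coalesced away.
async fn maybe_send(last_sent: Arc<Mutex<Option<Request>>>, next: Request) -> bool {
    let mut guard = last_sent.lock().await;
    if guard.as_ref() == Some(&next) {
        // Identical to the last successful send: nothing to do.
        return false;
    }
    // ... perform the real send here; record it only once the send succeeds,
    // so a failed attempt is retried rather than coalesced away.
    *guard = Some(next);
    true
}

#[tokio::main]
async fn main() {
    let state = Arc::new(Mutex::new(None));
    assert!(maybe_send(state.clone(), Request("a".into())).await);
    assert!(!maybe_send(state.clone(), Request("a".into())).await); // coalesced
    assert!(maybe_send(state, Request("b".into())).await);
}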

View File

@@ -184,19 +184,6 @@ impl HeartbeaterTask {
}
}
}
tracing::info!(
"Heartbeat round complete for {} nodes, {} offline",
new_state.len(),
new_state
.values()
.filter(|s| match s {
PageserverState::Available { .. } => {
false
}
PageserverState::Offline => true,
})
.count()
);
let mut deltas = Vec::new();
let now = Instant::now();

View File

@@ -8,7 +8,6 @@ use futures::Future;
use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
@@ -45,19 +44,15 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware;
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
neon_metrics: NeonMetrics,
allowlist_routes: Vec<Uri>,
}
impl HttpState {
pub fn new(
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> Self {
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
let allowlist_routes = ["/status", "/ready", "/metrics"]
.iter()
.map(|v| v.parse().unwrap())
@@ -65,7 +60,6 @@ impl HttpState {
Self {
service,
auth,
neon_metrics: NeonMetrics::new(build_info),
allowlist_routes,
}
}
@@ -405,15 +399,6 @@ async fn handle_tenant_describe(
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}
async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
json_response(StatusCode::OK, service.tenant_list())
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -427,10 +412,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let nodes = state.service.node_list().await?;
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
json_response(StatusCode::OK, api_nodes)
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -522,18 +504,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
}
async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state.service.tenant_import(tenant_id).await?,
)
}
async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -620,17 +590,9 @@ where
.await
}
/// Check if the required scope is held in the request's token, or if the request has
/// a token with 'admin' scope then always permit it.
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
match crate::auth::check_permission(claims, required_scope) {
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
Ok(()) => Ok(()),
Err(_) => Err(e),
},
Ok(()) => Ok(()),
}
crate::auth::check_permission(claims, required_scope)
})
}
@@ -690,11 +652,10 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
})
}
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let state = get_state(&req);
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
let payload = crate::metrics::METRICS_REGISTRY.encode();
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, TEXT_FORMAT)
@@ -723,7 +684,6 @@ where
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware())
@@ -740,7 +700,7 @@ pub fn make_router(
}
router
.data(Arc::new(HttpState::new(service, auth, build_info)))
.data(Arc::new(HttpState::new(service, auth)))
.get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
})
@@ -771,13 +731,6 @@ pub fn make_router(
.post("/debug/v1/node/:node_id/drop", |r| {
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
})
.post("/debug/v1/tenant/:tenant_id/import", |r| {
named_request_span(
r,
handle_tenant_import,
RequestName("debug_v1_tenant_import"),
)
})
.get("/debug/v1/tenant", |r| {
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
})
@@ -840,9 +793,6 @@ pub fn make_router(
RequestName("control_v1_tenant_describe"),
)
})
.get("/control/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
})
.put("/control/v1/tenant/:tenant_id/policy", |r| {
named_request_span(
r,

View File

@@ -14,7 +14,7 @@ mod reconciler;
mod scheduler;
mod schema;
pub mod service;
mod tenant_shard;
mod tenant_state;
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
struct Sequence(u64);

View File

@@ -1,23 +1,18 @@
use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc;
use std::time::Duration;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::{
Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION);
@@ -55,7 +50,7 @@ struct Cli {
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
database_url: Option<String>,
@@ -66,10 +61,6 @@ struct Cli {
/// Grace period before marking unresponsive pageserver offline
#[arg(long)]
max_unavailable_interval: Option<humantime::Duration>,
/// Maximum number of reconcilers that may run in parallel
#[arg(long)]
reconciler_concurrency: Option<usize>,
}
enum StrictMode {
@@ -167,8 +158,6 @@ fn main() -> anyhow::Result<()> {
std::process::exit(1);
}));
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
tokio::runtime::Builder::new_current_thread()
// We use spawn_blocking for database operations, so require approximately
// as many blocking threads as we will open database connections.
@@ -200,11 +189,6 @@ async fn async_main() -> anyhow::Result<()> {
args.listen
);
let build_info = BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
};
let strict_mode = if args.dev {
StrictMode::Dev
} else {
@@ -249,14 +233,9 @@ async fn async_main() -> anyhow::Result<()> {
.max_unavailable_interval
.map(humantime::Duration::into)
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
reconciler_concurrency: args
.reconciler_concurrency
.unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
};
// After loading secrets & config, but before starting anything else, apply database migrations
Persistence::await_connection(&secrets.database_url, Duration::from_secs(5)).await?;
migration_run(&secrets.database_url)
.await
.context("Running database migrations")?;
@@ -271,7 +250,7 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth, build_info)
let router = make_router(service.clone(), auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();

View File

@@ -8,8 +8,10 @@
//! The rest of the code defines label group types and deals with converting outer types to labels.
//!
use bytes::Bytes;
use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
use metrics::NeonMetrics;
use measured::{
label::{LabelValue, StaticLabelSet},
FixedCardinalityLabel, MetricGroup,
};
use once_cell::sync::Lazy;
use std::sync::Mutex;
@@ -24,15 +26,13 @@ pub fn preinitialize_metrics() {
pub(crate) struct StorageControllerMetrics {
pub(crate) metrics_group: StorageControllerMetricGroup,
encoder: Mutex<measured::text::BufferedTextEncoder>,
encoder: Mutex<measured::text::TextEncoder>,
}
#[derive(measured::MetricGroup)]
#[metric(new())]
pub(crate) struct StorageControllerMetricGroup {
/// Count of how many times we spawn a reconcile task
pub(crate) storage_controller_reconcile_spawn: measured::Counter,
/// Reconciler tasks completed, broken down by success/failure/cancelled
pub(crate) storage_controller_reconcile_complete:
measured::CounterVec<ReconcileCompleteLabelGroupSet>,
@@ -43,9 +43,7 @@ pub(crate) struct StorageControllerMetricGroup {
/// HTTP request status counters for handled requests
pub(crate) storage_controller_http_request_status:
measured::CounterVec<HttpRequestStatusLabelGroupSet>,
/// HTTP request handler latency across all status codes
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_http_request_latency:
measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
@@ -57,7 +55,6 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_pageserver_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -69,7 +66,6 @@ pub(crate) struct StorageControllerMetricGroup {
/// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
/// node id, request name and method. This include both successful and unsuccessful
/// requests.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_passthrough_request_latency:
measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,
@@ -78,34 +74,76 @@ pub(crate) struct StorageControllerMetricGroup {
measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,
/// Latency of database queries, broken down by operation.
#[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
pub(crate) storage_controller_database_query_latency:
measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
}
impl StorageControllerMetrics {
pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
pub(crate) fn encode(&self) -> Bytes {
let mut encoder = self.encoder.lock().unwrap();
neon_metrics
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
self.metrics_group
.collect_group_into(&mut *encoder)
.unwrap_or_else(|infallible| match infallible {});
self.metrics_group.collect_into(&mut *encoder);
encoder.finish()
}
}
impl Default for StorageControllerMetrics {
fn default() -> Self {
let mut metrics_group = StorageControllerMetricGroup::new();
metrics_group
.storage_controller_reconcile_complete
.init_all_dense();
Self {
metrics_group,
encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
metrics_group: StorageControllerMetricGroup::new(),
encoder: Mutex::new(measured::text::TextEncoder::new()),
}
}
}
impl StorageControllerMetricGroup {
pub(crate) fn new() -> Self {
Self {
storage_controller_reconcile_spawn: measured::Counter::new(),
storage_controller_reconcile_complete: measured::CounterVec::new(
ReconcileCompleteLabelGroupSet {
status: StaticLabelSet::new(),
},
),
storage_controller_schedule_optimization: measured::Counter::new(),
storage_controller_http_request_status: measured::CounterVec::new(
HttpRequestStatusLabelGroupSet {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
status: StaticLabelSet::new(),
},
),
storage_controller_http_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_pageserver_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_pageserver_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_passthrough_request_error: measured::CounterVec::new(
PageserverRequestLabelGroupSet {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
},
),
storage_controller_passthrough_request_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
storage_controller_database_query_error: measured::CounterVec::new(
DatabaseQueryErrorLabelGroupSet {
operation: StaticLabelSet::new(),
error_type: StaticLabelSet::new(),
},
),
storage_controller_database_query_latency: measured::HistogramVec::new(
measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
),
}
}
}
@@ -119,7 +157,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestStatusLabelGroupSet)]
pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
pub(crate) status: StatusCode,
@@ -128,21 +166,40 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
#[derive(measured::LabelGroup)]
#[label(set = HttpRequestLatencyLabelGroupSet)]
pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for HttpRequestLatencyLabelGroupSet {
fn default() -> Self {
Self {
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup, Clone)]
#[label(set = PageserverRequestLabelGroupSet)]
pub(crate) struct PageserverRequestLabelGroup<'a> {
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) pageserver_id: &'a str,
#[label(dynamic_with = lasso::ThreadedRodeo, default)]
#[label(dynamic_with = lasso::ThreadedRodeo)]
pub(crate) path: &'a str,
pub(crate) method: Method,
}
impl Default for PageserverRequestLabelGroupSet {
fn default() -> Self {
Self {
pageserver_id: lasso::ThreadedRodeo::new(),
path: lasso::ThreadedRodeo::new(),
method: StaticLabelSet::new(),
}
}
}
#[derive(measured::LabelGroup)]
#[label(set = DatabaseQueryErrorLabelGroupSet)]
pub(crate) struct DatabaseQueryErrorLabelGroup {
@@ -156,7 +213,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
pub(crate) operation: DatabaseOperation,
}
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[derive(FixedCardinalityLabel)]
pub(crate) enum ReconcileOutcome {
#[label(rename = "ok")]
Success,
@@ -164,7 +221,7 @@ pub(crate) enum ReconcileOutcome {
Cancel,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
#[derive(FixedCardinalityLabel, Clone)]
pub(crate) enum Method {
Get,
Put,
@@ -189,12 +246,11 @@ impl From<hyper::Method> for Method {
}
}
#[derive(Clone, Copy)]
pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);
impl LabelValue for StatusCode {
fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0.as_u16() as i64)
v.write_int(self.0.as_u16() as u64)
}
}
@@ -212,7 +268,7 @@ impl FixedCardinalityLabel for StatusCode {
}
}
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[derive(FixedCardinalityLabel)]
pub(crate) enum DatabaseErrorLabel {
Query,
Connection,
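
As a side note on the histogram configuration above: the diff repeatedly uses exponential_buckets(0.1, 2.0) with 5 buckets. Assuming the usual start/factor semantics for exponential buckets, the thresholds work out as below (a plain illustration, not the measured crate's implementation):

/// Thresholds that start at `start` seconds and grow by `factor` each bucket.
fn exponential_buckets(start: f64, factor: f64, count: usize) -> Vec<f64> {
    (0..count).map(|i| start * factor.powi(i as i32)).collect()
}

fn main() {
    // With (0.1, 2.0) and 5 buckets: [0.1, 0.2, 0.4, 0.8, 1.6] seconds.
    println!("{:?}", exponential_buckets(0.1, 2.0, 5));
}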

View File

@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
TenantLocateResponseShard,
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
},
shard::TenantShardId,
};
@@ -257,19 +256,6 @@ impl Node {
)
.await
}
/// Generate the simplified API-friendly description of a node's state
pub(crate) fn describe(&self) -> NodeDescribeResponse {
NodeDescribeResponse {
id: self.id,
availability: self.availability.into(),
scheduling: self.scheduling,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}
}
}
impl std::fmt::Display for Node {

View File

@@ -1,14 +1,13 @@
use pageserver_api::{
models::{
LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
TimelineCreateRequest, TimelineInfo,
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
},
shard::TenantShardId,
};
use pageserver_client::mgmt_api::{Client, Result};
use reqwest::StatusCode;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::id::{NodeId, TimelineId};
/// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
/// controller to collect metrics in a non-intrusive manner.
@@ -89,18 +88,6 @@ impl PageserverClient {
)
}
pub(crate) async fn tenant_scan_remote_storage(
&self,
tenant_id: TenantId,
) -> Result<TenantScanRemoteStorageResponse> {
measured_request!(
"tenant_scan_remote_storage",
crate::metrics::Method::Get,
&self.node_id_label,
self.inner.tenant_scan_remote_storage(tenant_id).await
)
}
pub(crate) async fn tenant_secondary_download(
&self,
tenant_id: TenantShardId,

View File

@@ -2,7 +2,6 @@ pub(crate) mod split_state;
use std::collections::HashMap;
use std::str::FromStr;
use std::time::Duration;
use std::time::Instant;
use self::split_state::SplitState;
use camino::Utf8Path;
@@ -80,7 +79,7 @@ pub(crate) enum DatabaseError {
Logical(String),
}
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
#[derive(measured::FixedCardinalityLabel, Clone)]
pub(crate) enum DatabaseOperation {
InsertNode,
UpdateNode,
@@ -145,31 +144,6 @@ impl Persistence {
}
}
/// A helper for use during startup, where we would like to tolerate concurrent restarts of the
/// database and the storage controller, therefore the database might not be available right away
pub async fn await_connection(
database_url: &str,
timeout: Duration,
) -> Result<(), diesel::ConnectionError> {
let started_at = Instant::now();
loop {
match PgConnection::establish(database_url) {
Ok(_) => {
tracing::info!("Connected to database.");
return Ok(());
}
Err(e) => {
if started_at.elapsed() > timeout {
return Err(e);
} else {
tracing::info!("Database not yet available, waiting... ({e})");
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
}
}
}
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where
@@ -179,7 +153,9 @@ impl Persistence {
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_latency;
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op });
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
operation: op.clone(),
});
let res = self.with_conn(func).await;
@@ -720,7 +696,7 @@ impl Persistence {
}
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::tenant_shards)]
pub(crate) struct TenantShardPersistence {
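
The `_timer` guard in the with_measured_conn hunk above follows the usual RAII timing pattern: the measurement is taken when the guard goes out of scope, so every return path is covered. A minimal standalone sketch, with println! standing in for the histogram observation:

use std::time::Instant;

struct ScopeTimer {
    label: &'static str,
    start: Instant,
}

impl ScopeTimer {
    fn start(label: &'static str) -> Self {
        Self { label, start: Instant::now() }
    }
}

impl Drop for ScopeTimer {
    fn drop(&mut self) {
        // The real code records into a latency histogram; this just prints.
        println!("{} took {:?}", self.label, self.start.elapsed());
    }
}

fn main() {
    let _timer = ScopeTimer::start("query");
    // ... do the measured work; the duration is reported on every exit path.
}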

View File

@@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
/// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
@@ -48,15 +48,11 @@ pub(super) struct Reconciler {
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
/// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
pub(crate) compute_notify_failure: bool,
/// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many
/// we will spawn.
pub(crate) _resource_units: ReconcileUnits,
/// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantShard that
/// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for
/// example when a pageserver node goes offline, or the PlacementPolicy for
/// the tenant is changed.
@@ -70,20 +66,7 @@ pub(super) struct Reconciler {
pub(crate) persistence: Arc<Persistence>,
}
/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
pub(crate) struct ReconcileUnits {
_sem_units: tokio::sync::OwnedSemaphorePermit,
}
impl ReconcileUnits {
pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self {
Self {
_sem_units: sem_units,
}
}
}
/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)]
@@ -767,10 +750,7 @@ impl Reconciler {
// It is up to the caller whether they want to drop out on this error, but they don't have to:
// in general we should avoid letting unavailability of the cloud control plane stop us from
// making progress.
if !matches!(e, NotifyError::ShuttingDown) {
tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
}
tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
// Set this flag so that in our ReconcileResult we will set the flag on the shard that it
// needs to retry at some point.
self.compute_notify_failure = true;

View File

@@ -1,4 +1,4 @@
use crate::{node::Node, tenant_shard::TenantShard};
use crate::{node::Node, tenant_state::TenantState};
use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize;
use std::collections::HashMap;
@@ -27,7 +27,7 @@ pub enum MaySchedule {
#[derive(Serialize)]
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize,
/// Whether this node is currently eligible to have new shards scheduled (this is derived
@@ -84,21 +84,7 @@ impl std::ops::Add for AffinityScore {
}
}
/// Hint for whether this is a sincere attempt to schedule, or a speculative
/// check for where we _would_ schedule (done during optimization)
#[derive(Debug)]
pub(crate) enum ScheduleMode {
Normal,
Speculative,
}
impl Default for ScheduleMode {
fn default() -> Self {
Self::Normal
}
}
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling
// it for many shards in the same tenant.
#[derive(Debug, Default)]
pub(crate) struct ScheduleContext {
@@ -107,8 +93,6 @@ pub(crate) struct ScheduleContext {
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,
pub(crate) mode: ScheduleMode,
}
impl ScheduleContext {
@@ -163,7 +147,7 @@ impl Scheduler {
pub(crate) fn consistency_check<'a>(
&self,
nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantShard>,
shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes {
@@ -345,34 +329,27 @@ impl Scheduler {
scores.sort_by_key(|i| (i.1, i.2, i.0));
if scores.is_empty() {
// After applying constraints, no pageservers were left.
if !matches!(context.mode, ScheduleMode::Speculative) {
// If this was not a speculative attempt, log details to understand why we couldn't
// schedule: this may help an engineer understand if some nodes are marked offline
// in a way that's preventing progress.
// After applying constraints, no pageservers were left. We log some detail about
// the state of nodes to help understand why this happened. This is not logged as an error because
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
for (node_id, node) in &self.nodes {
tracing::info!(
"Scheduling failure, while excluding {hard_exclude:?}, node states:"
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule != MaySchedule::No,
node.shard_count
);
for (node_id, node) in &self.nodes {
tracing::info!(
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule != MaySchedule::No,
node.shard_count
);
}
}
return Err(ScheduleError::ImpossibleConstraint);
}
// Lowest score wins
let node_id = scores.first().unwrap().0;
if !matches!(context.mode, ScheduleMode::Speculative) {
tracing::info!(
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
);
}
// Note that we do not update shard count here to reflect the scheduling: that
// is IntentState's job when the scheduled location is used.
@@ -421,7 +398,7 @@ pub(crate) mod test_utils {
mod tests {
use super::*;
use crate::tenant_shard::IntentState;
use crate::tenant_state::IntentState;
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2);

View File

@@ -7,7 +7,6 @@ use std::{
use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence,
reconciler::ReconcileUnits,
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
};
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
@@ -23,7 +22,7 @@ use utils::{
generation::Generation,
id::NodeId,
seqwait::{SeqWait, SeqWaitError},
sync::gate::GateGuard,
sync::gate::Gate,
};
use crate::{
@@ -38,18 +37,12 @@ use crate::{
};
/// Serialization helper
fn read_last_error<S, T>(v: &std::sync::Mutex<Option<T>>, serializer: S) -> Result<S::Ok, S::Error>
fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
T: std::fmt::Display,
T: Clone + std::fmt::Display,
{
serializer.collect_str(
&v.lock()
.unwrap()
.as_ref()
.map(|e| format!("{e}"))
.unwrap_or("".to_string()),
)
serializer.collect_str(&v.lock().unwrap())
}
/// In-memory state for a particular tenant shard.
@@ -57,7 +50,7 @@ where
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)]
pub(crate) struct TenantShard {
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
@@ -102,10 +95,6 @@ pub(crate) struct TenantShard {
/// reconciliation, and timeline creation.
pub(crate) splitting: SplitState,
/// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag
/// is set. This flag is cleared when the tenant is popped off the delay queue.
pub(crate) delayed_reconcile: bool,
/// Optionally wait for reconciliation to complete up to a particular
/// sequence number.
#[serde(skip)]
@@ -117,19 +106,15 @@ pub(crate) struct TenantShard {
#[serde(skip)]
pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
/// The most recent error from a reconcile on this tenant. This is a nested Arc
/// because:
/// - ReconcileWaiters need to Arc-clone the overall object to read it later
/// - ReconcileWaitError needs to use an `Arc<ReconcileError>` because we can construct
/// many waiters for one shard, and the underlying error types are not Clone.
/// The most recent error from a reconcile on this tenant
/// TODO: generalize to an array of recent events
/// TODO: use an ArcSwap instead of mutex for faster reads?
#[serde(serialize_with = "read_last_error")]
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
#[serde(serialize_with = "read_mutex_content")]
pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
/// If we have a pending compute notification that for some reason we weren't able to send,
/// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
/// and trigger a Reconciler run. This is the mechanism by which compute notifications are included in the scope
/// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry
/// sending it. This is the mechanism by which compute notifications are included in the scope
/// of state that we publish externally in an eventually consistent way.
pub(crate) pending_compute_notification: bool,
@@ -303,18 +288,18 @@ pub(crate) struct ReconcilerWaiter {
seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
error: std::sync::Arc<std::sync::Mutex<String>>,
seq: Sequence,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum ReconcileWaitError {
pub enum ReconcileWaitError {
#[error("Timeout waiting for shard {0}")]
Timeout(TenantShardId),
#[error("shutting down")]
Shutdown,
#[error("Reconcile error on shard {0}: {1}")]
Failed(TenantShardId, Arc<ReconcileError>),
Failed(TenantShardId, String),
}
#[derive(Eq, PartialEq, Debug)]
@@ -352,8 +337,7 @@ impl ReconcilerWaiter {
SeqWaitError::Timeout => unreachable!()
})?;
return Err(ReconcileWaitError::Failed(self.tenant_shard_id,
self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone()))
return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
}
}
@@ -369,19 +353,8 @@ pub(crate) struct ReconcilerHandle {
cancel: CancellationToken,
}
pub(crate) enum ReconcileNeeded {
/// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler
/// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it)
No,
/// shard has a reconciler running, and its intent hasn't changed since that one was
/// spawned: wait for the existing reconciler rather than spawning a new one.
WaitExisting(ReconcilerWaiter),
/// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`]
Yes,
}
/// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantShard.
/// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult {
pub(crate) sequence: Sequence,
/// On errors, `observed` should be treated as an incomplete description
@@ -394,7 +367,7 @@ pub(crate) struct ReconcileResult {
pub(crate) generation: Option<Generation>,
pub(crate) observed: ObservedState,
/// Set [`TenantShard::pending_compute_notification`] from this flag
/// Set [`TenantState::pending_compute_notification`] from this flag
pub(crate) pending_compute_notification: bool,
}
@@ -406,7 +379,7 @@ impl ObservedState {
}
}
impl TenantShard {
impl TenantState {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
shard: ShardIdentity,
@@ -423,7 +396,6 @@ impl TenantShard {
reconciler: None,
splitting: SplitState::Idle,
sequence: Sequence(1),
delayed_reconcile: false,
waiter: Arc::new(SeqWait::new(Sequence(0))),
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
@@ -859,10 +831,16 @@ impl TenantShard {
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn get_reconcile_needed(
pub(crate) fn maybe_reconcile(
&mut self,
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
pageservers: &Arc<HashMap<NodeId, Node>>,
) -> ReconcileNeeded {
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
persistence: &Arc<Persistence>,
gate: &Gate,
cancel: &CancellationToken,
) -> Option<ReconcilerWaiter> {
// If there are any ambiguous observed states, and the nodes they refer to are available,
// we should reconcile to clean them up.
let mut dirty_observed = false;
@@ -884,8 +862,8 @@ impl TenantShard {
active_nodes_dirty || dirty_observed || self.pending_compute_notification;
if !do_reconcile {
tracing::debug!("Not dirty, no reconciliation needed.");
return ReconcileNeeded::No;
tracing::info!("Not dirty, no reconciliation needed.");
return None;
}
// If we are currently splitting, then never start a reconciler task: the splitting logic
@@ -893,7 +871,7 @@ impl TenantShard {
// up top, so that we only log this message if we would otherwise have done a reconciliation.
if !matches!(self.splitting, SplitState::Idle) {
tracing::info!("Refusing to reconcile, splitting in progress");
return ReconcileNeeded::No;
return None;
}
// Reconcile already in flight for the current sequence?
@@ -903,7 +881,7 @@ impl TenantShard {
"Reconciliation already in progress for sequence {:?}",
self.sequence,
);
return ReconcileNeeded::WaitExisting(ReconcilerWaiter {
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
@@ -922,67 +900,10 @@ impl TenantShard {
// We only reach this point if there is work to do and we're going to skip
// doing it: warn it obvious why this tenant isn't doing what it ought to.
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
return ReconcileNeeded::No;
return None;
}
}
ReconcileNeeded::Yes
}
/// Ensure the sequence number is set to a value where waiting for this value will make us wait
/// for the next reconcile: i.e. it is ahead of all completed or running reconcilers.
///
/// Constructing a ReconcilerWaiter with the resulting sequence number gives the property
/// that the waiter will not complete until some future Reconciler is constructed and run.
fn ensure_sequence_ahead(&mut self) {
// Find the highest sequence for which a Reconciler has previously run or is currently
// running
let max_seen = std::cmp::max(
self.reconciler
.as_ref()
.map(|r| r.sequence)
.unwrap_or(Sequence(0)),
std::cmp::max(self.waiter.load(), self.error_waiter.load()),
);
if self.sequence <= max_seen {
self.sequence = max_seen.next();
}
}
/// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
///
/// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
/// you would like to wait until one gets spawned in the background.
pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
self.ensure_sequence_ahead();
ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
error: self.last_error.clone(),
seq: self.sequence,
}
}
#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn spawn_reconciler(
&mut self,
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
pageservers: &Arc<HashMap<NodeId, Node>>,
compute_hook: &Arc<ComputeHook>,
service_config: &service::Config,
persistence: &Arc<Persistence>,
units: ReconcileUnits,
gate_guard: GateGuard,
cancel: &CancellationToken,
) -> Option<ReconcilerWaiter> {
// Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
// doing our sequence's work.
let old_handle = self.reconciler.take();
// Build list of nodes from which the reconciler should detach
let mut detach = Vec::new();
for node_id in self.observed.locations.keys() {
@@ -998,9 +919,18 @@ impl TenantShard {
}
}
// Reconcile in flight for a stale sequence? Our sequence's task will wait for it before
// doing our sequence's work.
let old_handle = self.reconciler.take();
let Ok(gate_guard) = gate.enter() else {
// Shutting down, don't start a reconciler
return None;
};
// Advance the sequence before spawning a reconciler, so that sequence waiters
// can distinguish between before+after the reconcile completes.
self.ensure_sequence_ahead();
self.sequence = self.sequence.next();
let reconciler_cancel = cancel.child_token();
let reconciler_intent = TargetState::from_intent(pageservers, &self.intent);
@@ -1015,7 +945,6 @@ impl TenantShard {
compute_hook: compute_hook.clone(),
service_config: service_config.clone(),
_gate_guard: gate_guard,
_resource_units: units,
cancel: reconciler_cancel.clone(),
persistence: persistence.clone(),
compute_notify_failure: false,
@@ -1082,18 +1011,16 @@ impl TenantShard {
status: outcome_label,
});
// Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might
// try and schedule more work in response to our result.
let result = ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
};
result_tx.send(result).ok();
result_tx
.send(ReconcileResult {
sequence: reconcile_seq,
result,
tenant_shard_id: reconciler.tenant_shard_id,
generation: reconciler.generation,
observed: reconciler.observed,
pending_compute_notification: reconciler.compute_notify_failure,
})
.ok();
}
.instrument(reconciler_span),
);
@@ -1162,13 +1089,6 @@ impl TenantShard {
&self.scheduling_policy
}
pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) {
// Ordering: always set last_error before advancing sequence, so that sequence
// waiters are guaranteed to see a Some value when they see an error.
*(self.last_error.lock().unwrap()) = Some(Arc::new(error));
self.error_waiter.advance(sequence);
}
pub(crate) fn from_persistent(
tsp: TenantShardPersistence,
intent: IntentState,
@@ -1191,7 +1111,6 @@ impl TenantShard {
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
delayed_reconcile: false,
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
})
}
@@ -1224,7 +1143,7 @@ pub(crate) mod tests {
use super::*;
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
let tenant_id = TenantId::generate();
let shard_number = ShardNumber(0);
let shard_count = ShardCount::new(1);
@@ -1234,7 +1153,7 @@ pub(crate) mod tests {
shard_number,
shard_count,
};
TenantShard::new(
TenantState::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
@@ -1246,7 +1165,7 @@ pub(crate) mod tests {
)
}
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantState> {
let tenant_id = TenantId::generate();
(0..shard_count.count())
@@ -1258,7 +1177,7 @@ pub(crate) mod tests {
shard_number,
shard_count,
};
TenantShard::new(
TenantState::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
@@ -1283,24 +1202,24 @@ pub(crate) mod tests {
let mut scheduler = Scheduler::new(nodes.values());
let mut context = ScheduleContext::default();
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_state
.schedule(&mut scheduler, &mut context)
.expect("we have enough nodes, scheduling should work");
// Expect to initially be schedule on to different nodes
assert_eq!(tenant_shard.intent.secondary.len(), 1);
assert!(tenant_shard.intent.attached.is_some());
assert_eq!(tenant_state.intent.secondary.len(), 1);
assert!(tenant_state.intent.attached.is_some());
let attached_node_id = tenant_shard.intent.attached.unwrap();
let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
let attached_node_id = tenant_state.intent.attached.unwrap();
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_shard.intent.demote_attached(attached_node_id);
let changed = tenant_state.intent.demote_attached(attached_node_id);
assert!(changed);
assert!(tenant_shard.intent.attached.is_none());
assert_eq!(tenant_shard.intent.secondary.len(), 2);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes
@@ -1310,18 +1229,18 @@ pub(crate) mod tests {
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
// Scheduling the node should promote the still-available secondary node to attached
tenant_shard
tenant_state
.schedule(&mut scheduler, &mut context)
.expect("active nodes are available");
assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
// The original attached node should have been retained as a secondary
assert_eq!(
*tenant_shard.intent.secondary.iter().last().unwrap(),
*tenant_state.intent.secondary.iter().last().unwrap(),
attached_node_id
);
tenant_shard.intent.clear(&mut scheduler);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
@@ -1331,48 +1250,48 @@ pub(crate) mod tests {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard.observed.locations.insert(
tenant_state.observed.locations.insert(
NodeId(3),
ObservedStateLocation {
conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedMulti,
generation: Some(2),
secondary_conf: None,
shard_number: tenant_shard.shard.number.0,
shard_count: tenant_shard.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0,
shard_number: tenant_state.shard.number.0,
shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(),
}),
},
);
tenant_shard.observed.locations.insert(
tenant_state.observed.locations.insert(
NodeId(2),
ObservedStateLocation {
conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedStale,
generation: Some(1),
secondary_conf: None,
shard_number: tenant_shard.shard.number.0,
shard_count: tenant_shard.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0,
shard_number: tenant_state.shard.number.0,
shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(),
}),
},
);
tenant_shard.intent_from_observed(&mut scheduler);
tenant_state.intent_from_observed(&mut scheduler);
// The highest generationed attached location gets used as attached
assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
// Other locations get used as secondary
assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
tenant_shard.intent.clear(&mut scheduler);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
@@ -1381,23 +1300,23 @@ pub(crate) mod tests {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
// In pause mode, schedule() shouldn't do anything
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
assert!(tenant_shard
tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause;
assert!(tenant_state
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(tenant_shard.intent.all_pageservers().is_empty());
assert!(tenant_state.intent.all_pageservers().is_empty());
// In active mode, schedule() works
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
assert!(tenant_shard
tenant_state.scheduling_policy = ShardSchedulingPolicy::Active;
assert!(tenant_state
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(!tenant_shard.intent.all_pageservers().is_empty());
assert!(!tenant_state.intent.all_pageservers().is_empty());
tenant_shard.intent.clear(&mut scheduler);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
@@ -1510,7 +1429,7 @@ pub(crate) mod tests {
fn optimize_til_idle(
nodes: &HashMap<NodeId, Node>,
scheduler: &mut Scheduler,
shards: &mut [TenantShard],
shards: &mut [TenantState],
) {
let mut loop_n = 0;
loop {

View File

@@ -86,10 +86,7 @@ where
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
));
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file {
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd
}
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val);
}
}
cmd
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)

View File

@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
@@ -417,54 +419,6 @@ async fn handle_tenant(
println!("{} {:?}", t.id, t.state);
}
}
Some(("import", import_match)) => {
let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
let storage_controller = StorageController::from_env(env);
let create_response = storage_controller.tenant_import(tenant_id).await?;
let shard_zero = create_response
.shards
.first()
.expect("Import response omitted shards");
let attached_pageserver_id = shard_zero.node_id;
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
println!(
"Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
);
let timelines = pageserver
.http_client
.list_timelines(shard_zero.shard_id)
.await?;
// Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
let main_timeline = timelines
.iter()
.find(|t| t.ancestor_timeline_id.is_none())
.expect("No timelines found")
.timeline_id;
let mut branch_i = 0;
for timeline in timelines.iter() {
let branch_name = if timeline.timeline_id == main_timeline {
"main".to_string()
} else {
branch_i += 1;
format!("branch_{branch_i}")
};
println!(
"Importing timeline {tenant_id}/{} as branch {branch_name}",
timeline.timeline_id
);
env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
}
}
Some(("create", create_match)) => {
let tenant_conf: HashMap<_, _> = create_match
.get_many::<String>("config")
@@ -1106,6 +1060,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
}
Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let storage_controller = StorageController::from_env(env);
storage_controller
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"),
@@ -1279,7 +1248,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
for (_k, node) in cplane.endpoints {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
eprintln!("postgres stop failed: {e:#}");
}
}
@@ -1465,7 +1434,6 @@ fn cli() -> Command {
.subcommand(
Command::new("timeline")
.about("Manage timelines")
.arg_required_else_help(true)
.subcommand(Command::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
@@ -1528,8 +1496,6 @@ fn cli() -> Command {
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
.about("Import a tenant that is present in remote storage, and create branches for its timelines"))
)
.subcommand(
Command::new("pageserver")
@@ -1549,6 +1515,12 @@ fn cli() -> Command {
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
Command::new("storage_controller")

View File

@@ -129,7 +129,6 @@ pub struct PageServerConf {
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
pub(crate) get_impl: Option<String>,
}
impl Default for PageServerConf {
@@ -142,7 +141,6 @@ impl Default for PageServerConf {
http_auth_type: AuthType::Trust,
virtual_file_io_engine: None,
get_vectored_impl: None,
get_impl: None,
}
}
}
@@ -158,7 +156,6 @@ pub struct SafekeeperConf {
pub remote_storage: Option<String>,
pub backup_threads: Option<u32>,
pub auth_enabled: bool,
pub listen_addr: Option<String>,
}
impl Default for SafekeeperConf {
@@ -172,7 +169,6 @@ impl Default for SafekeeperConf {
remote_storage: None,
backup_threads: None,
auth_enabled: false,
listen_addr: None,
}
}
}

View File

@@ -92,7 +92,6 @@ impl PageServerNode {
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
} = &self.conf;
let id = format!("id={}", id);
@@ -112,11 +111,6 @@ impl PageServerNode {
} else {
String::new()
};
let get_impl = if let Some(get_impl) = get_impl {
format!("get_impl='{get_impl}'")
} else {
String::new()
};
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -130,7 +124,6 @@ impl PageServerNode {
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
get_impl,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -441,11 +434,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -564,11 +552,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
switch_to_aux_file_v2: settings
.remove("switch_to_aux_file_v2")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
}
};

View File

@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: reqwest::Client,
pub listen_addr: String,
pub http_base_url: String,
}
impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
listen_addr.clone()
} else {
"127.0.0.1".to_string()
};
SafekeeperNode {
id: conf.id,
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: reqwest::Client::new(),
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
listen_addr,
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
}
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -118,8 +111,8 @@ impl SafekeeperNode {
);
io::stdout().flush().unwrap();
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id;
let datadir = self.datadir_path();
@@ -146,7 +139,7 @@ impl SafekeeperNode {
availability_zone,
];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
}
if !self.conf.sync {

View File

@@ -472,16 +472,6 @@ impl StorageController {
.await
}
#[instrument(skip(self))]
pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
self.dispatch::<(), TenantCreateResponse>(
Method::POST,
format!("debug/v1/tenant/{tenant_id}/import"),
None,
)
.await
}
#[instrument(skip(self))]
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(

View File

@@ -1,23 +0,0 @@
[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -1,681 +0,0 @@
use std::{collections::HashMap, str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
use hyper::{Method, StatusCode};
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::Url;
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
#[derive(Subcommand, Debug)]
enum Command {
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
/// since pageservers auto-register when they start up
NodeRegister {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
listen_pg_addr: String,
#[arg(long)]
listen_pg_port: u16,
#[arg(long)]
listen_http_addr: String,
#[arg(long)]
listen_http_port: u16,
},
/// Modify a node's configuration in the storage controller
NodeConfigure {
#[arg(long)]
node_id: NodeId,
/// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
/// manually mark a node offline
#[arg(long)]
availability: Option<NodeAvailabilityArg>,
/// Scheduling policy controls whether tenant shards may be scheduled onto this node.
#[arg(long)]
scheduling: Option<NodeSchedulingPolicy>,
},
/// Modify a tenant's policies in the storage controller
TenantPolicy {
#[arg(long)]
tenant_id: TenantId,
/// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
/// or is in the normal attached state with N secondary locations (`attached:N`)
#[arg(long)]
placement: Option<PlacementPolicyArg>,
/// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
/// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
/// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
/// unavailable, and are only for use in emergencies.
#[arg(long)]
scheduling: Option<ShardSchedulingPolicyArg>,
},
/// List nodes known to the storage controller
Nodes {},
/// List tenants known to the storage controller
Tenants {},
/// Create a new tenant in the storage controller, and by extension on pageservers.
TenantCreate {
#[arg(long)]
tenant_id: TenantId,
},
/// Delete a tenant in the storage controller, and by extension on pageservers.
TenantDelete {
#[arg(long)]
tenant_id: TenantId,
},
/// Split an existing tenant into a higher number of shards than its current shard count.
TenantShardSplit {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
shard_count: u8,
/// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
#[arg(long)]
stripe_size: Option<u32>,
},
/// Migrate the attached location for a tenant shard to a specific pageserver.
TenantShardMigrate {
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
config: String,
},
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
/// alternative to the storage controller's scheduling optimization behavior.
TenantScatter {
#[arg(long)]
tenant_id: TenantId,
},
/// Print details about a particular tenant, including all its shards' states.
TenantDescribe {
#[arg(long)]
tenant_id: TenantId,
},
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
}
#[derive(Parser)]
#[command(
author,
version,
about,
long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
#[arg(long)]
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
api: Url,
#[arg(long)]
/// JWT token for authenticating with storage controller. Depending on the API used, this
/// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[command(subcommand)]
command: Command,
}
#[derive(Debug, Clone)]
struct PlacementPolicyArg(PlacementPolicy);
impl FromStr for PlacementPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"detached" => Ok(Self(PlacementPolicy::Detached)),
"secondary" => Ok(Self(PlacementPolicy::Secondary)),
_ if s.starts_with("attached:") => {
let mut splitter = s.split(':');
let _prefix = splitter.next().unwrap();
match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
None => Err(anyhow::anyhow!(
"Invalid format '{s}', a valid example is 'attached:1'"
)),
}
}
_ => Err(anyhow::anyhow!(
"Unknown placement policy '{s}', try detached,secondary,attached:<n>"
)),
}
}
}
#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
impl FromStr for ShardSchedulingPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self(ShardSchedulingPolicy::Active)),
"essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
"pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
"stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
_ => Err(anyhow::anyhow!(
"Unknown scheduling policy '{s}', try active,essential,pause,stop"
)),
}
}
}
#[derive(Debug, Clone)]
struct NodeAvailabilityArg(NodeAvailabilityWrapper);
impl FromStr for NodeAvailabilityArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self(NodeAvailabilityWrapper::Active)),
"offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: hyper::Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
let mut trimmed = cli.api.to_string();
trimmed.pop();
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
match cli.command {
Command::NodeRegister {
node_id,
listen_pg_addr,
listen_pg_port,
listen_http_addr,
listen_http_port,
} => {
storcon_client
.dispatch::<_, ()>(
Method::POST,
"control/v1/node".to_string(),
Some(NodeRegisterRequest {
node_id,
listen_pg_addr,
listen_pg_port,
listen_http_addr,
listen_http_port,
}),
)
.await?;
}
Command::TenantCreate { tenant_id } => {
vps_client
.tenant_create(&TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters::default(),
placement_policy: Some(PlacementPolicy::Attached(1)),
config: TenantConfig::default(),
})
.await?;
}
Command::TenantDelete { tenant_id } => {
let status = vps_client
.tenant_delete(TenantShardId::unsharded(tenant_id))
.await?;
tracing::info!("Delete status: {}", status);
}
Command::Nodes {} => {
let resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
for node in resp {
table.add_row([
format!("{}", node.id),
node.listen_http_addr,
format!("{:?}", node.scheduling),
format!("{:?}", node.availability),
]);
}
println!("{table}");
}
Command::NodeConfigure {
node_id,
availability,
scheduling,
} => {
let req = NodeConfigureRequest {
node_id,
availability: availability.map(|a| a.0),
scheduling,
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/node/{node_id}/config"),
Some(req),
)
.await?;
}
Command::Tenants {} => {
let resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
"ShardCount",
"StripeSize",
"Placement",
"Scheduling",
]);
for tenant in resp {
let shard_zero = tenant.shards.into_iter().next().unwrap();
table.add_row([
format!("{}", tenant.tenant_id),
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
format!("{:?}", tenant.stripe_size),
format!("{:?}", tenant.policy),
format!("{:?}", shard_zero.scheduling_policy),
]);
}
println!("{table}");
}
Command::TenantPolicy {
tenant_id,
placement,
scheduling,
} => {
let req = TenantPolicyRequest {
scheduling: scheduling.map(|s| s.0),
placement: placement.map(|p| p.0),
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/policy"),
Some(req),
)
.await?;
}
Command::TenantShardSplit {
tenant_id,
shard_count,
stripe_size,
} => {
let req = TenantShardSplitRequest {
new_shard_count: shard_count,
new_stripe_size: stripe_size.map(ShardStripeSize),
};
let response = storcon_client
.dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(req),
)
.await?;
println!(
"Split tenant {} into {} shards: {}",
tenant_id,
shard_count,
response
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
Command::TenantShardMigrate {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
tenant_shard_id,
node_id: node,
};
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(req),
)
.await?;
}
Command::TenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;
vps_client
.tenant_config(&TenantConfigRequest {
tenant_id,
config: tenant_conf,
})
.await?;
}
Command::TenantScatter { tenant_id } => {
// Find the shards
let locate_response = storcon_client
.dispatch::<(), TenantLocateResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await?;
let shards = locate_response.shards;
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
let shard_count = shards.len();
for s in shards {
let entry = node_to_shards.entry(s.node_id).or_default();
entry.push(s.shard_id);
}
// Load list of available nodes
let nodes_resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
for node in nodes_resp {
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
node_to_shards.entry(node.id).or_default();
}
}
let max_shard_per_node = shard_count / node_to_shards.len();
loop {
let mut migrate_shard = None;
for shards in node_to_shards.values_mut() {
if shards.len() > max_shard_per_node {
// Pick the emptiest
migrate_shard = Some(shards.pop().unwrap());
}
}
let Some(migrate_shard) = migrate_shard else {
break;
};
// Pick the emptiest node to migrate to
let mut destinations = node_to_shards
.iter()
.map(|(k, v)| (k, v.len()))
.collect::<Vec<_>>();
destinations.sort_by_key(|i| i.1);
let (destination_node, destination_count) = *destinations.first().unwrap();
if destination_count + 1 > max_shard_per_node {
// Even the emptiest destination doesn't have space: we're done
break;
}
let destination_node = *destination_node;
node_to_shards
.get_mut(&destination_node)
.unwrap()
.push(migrate_shard);
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{migrate_shard}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id: migrate_shard,
node_id: destination_node,
}),
)
.await?;
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
}
// Spread the shards across the nodes
}
Command::TenantDescribe { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let shards = describe_response.shards;
let mut table = comfy_table::Table::new();
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
for shard in shards {
let secondary = shard
.node_secondary
.iter()
.map(|n| format!("{}", n))
.collect::<Vec<_>>()
.join(",");
let mut status_parts = Vec::new();
if shard.is_reconciling {
status_parts.push("reconciling");
}
if shard.is_pending_compute_notification {
status_parts.push("pending_compute");
}
if shard.is_splitting {
status_parts.push("splitting");
}
let status = status_parts.join(",");
table.add_row([
format!("{}", shard.tenant_shard_id),
shard
.node_attached
.map(|n| format!("{}", n))
.unwrap_or(String::new()),
secondary,
shard.last_error,
status,
]);
}
println!("{table}");
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await;
match describe_response {
Ok(describe) => {
if matches!(describe.policy, PlacementPolicy::Secondary) {
// Fine: it's already known to controller in secondary mode: calling
// again to put it into secondary mode won't cause problems.
} else {
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
}
}
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
// Fine: this tenant isn't known to the storage controller yet.
}
Err(e) => {
// Unexpected API error
return Err(e.into());
}
}
vps_client
.location_config(
TenantShardId::unsharded(tenant_id),
pageserver_api::models::LocationConfig {
mode: pageserver_api::models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: 0,
shard_count: 0,
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
tenant_conf: TenantConfig::default(),
},
None,
true,
)
.await?;
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let secondary_ps_id = describe_response
.shards
.first()
.unwrap()
.node_secondary
.first()
.unwrap();
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
loop {
let (status, progress) = vps_client
.tenant_secondary_download(
TenantShardId::unsharded(tenant_id),
Some(Duration::from_secs(10)),
)
.await?;
println!(
"Progress: {}/{} layers, {}/{} bytes",
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match status {
StatusCode::OK => {
println!("Download complete");
break;
}
StatusCode::ACCEPTED => {
// Loop
}
_ => {
anyhow::bail!("Unexpected download status: {status}");
}
}
}
}
}
Ok(())
}

View File

@@ -2,8 +2,8 @@
# see https://diesel.rs/guides/configuring-diesel-cli
[print_schema]
file = "storage_controller/src/schema.rs"
file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory]
dir = "storage_controller/migrations"
dir = "control_plane/attachment_service/migrations"

View File

@@ -0,0 +1,54 @@
## Merged compute image
https://github.com/neondatabase/neon/issues/6685
### Motivation:
It's hard to manage compute pools for 3 Postgres versions.
(We have a compute image for each version of Postgres: currently that's 3 for neonVM and 3 for k8s pods; eventually, we will have only neonVMs.)
We can try putting all Postgres versions into a single image, which should dramatically improve pool usage.
### TODO
#### Compute code changes:
1. Create the merged compute image https://github.com/neondatabase/neon/pull/6808
2. Pass the compute version in the spec from the control plane.
3. Change the path to Postgres in `compute_ctl`; it is currently not specified explicitly.
`compute_ctl` has `pgbin` and `pgdata` arguments, but today they are used only in tests (a rough sketch follows this list).
4. Make changes to the custom_extension code to fix path handling.
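As a rough illustration of item 3, path selection could key off the version carried in the compute spec. This is only a sketch with assumed paths and a made-up helper name, not the actual `compute_ctl` code:
```rust
use std::path::PathBuf;

/// Hypothetical helper: pick versioned Postgres paths inside a merged image.
/// The /usr/local/v{14,15,16}/bin layout and the pgdata location are assumptions.
fn postgres_paths(pg_version: u32) -> (PathBuf, PathBuf) {
    let pgbin = PathBuf::from(format!("/usr/local/v{pg_version}/bin/postgres"));
    let pgdata = PathBuf::from(format!("/var/db/postgres/v{pg_version}/pgdata"));
    (pgbin, pgdata)
}

fn main() {
    // In the real flow the version would come from the spec sent by the control plane.
    let spec_pg_version = 16;
    let (pgbin, pgdata) = postgres_paths(spec_pg_version);
    println!("would start {} with data dir {}", pgbin.display(), pgdata.display());
}
```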
#### Control-plane changes:
1. Pass the compute version in the spec from the control plane.
2. Remove the old VM pool management logic.
#### Prewarm changes:
Currently, for pooled VMs, we prewarm Postgres to improve cold start speed:
```
// If this is a pooled VM, prewarm before starting HTTP server and becoming
// available for binding. Prewarming helps Postgres start quicker later,
// because QEMU will already have it's memory allocated from the host, and
// the necessary binaries will already be cached.
```
Prewarm = initdb + start postgres + rm pgdata
Q: How should we do prewarm if we don't know in advance what version of Postgres will be used?
I see two options:
- Use versioned pgdata directories and run the prewarm operations for all versions present in the image (sketched below).
- Choose a "default_version" for each pooled VM and prewarm only that version. Try to start the compute in a pooled VM with a matching version; if none exists, spin up the compute in any available VM. Start will be slower because it is not prewarmed.
#### Extensions support
To support the merged compute image (an image containing all supported versions of Postgres),
we need to offload extensions from the image. We can implement this using the "custom extensions" mechanism.
Custom extensions changes:
1. We need to move all extensions from the main compute image to the build-custom-extensions repo.
2. We need to generate a spec for all public extensions and pass it to the compute image.
The spec contains information about the files in the extension and their paths,
as well as the contents of the control file. Currently it is set manually per user, for the few users that rely on "rare" custom extensions, so we need to improve spec passing.
For public extensions, we can embed this spec into the compute image: take the artifact from the build-custom-extension CI step and put it into the compute image (a rough shape of such a spec is sketched after this list).
3. We need to test the performance of extension downloading and ensure that it doesn't affect cold starts (with the proxy the speed should be fine).
4. Note that in this task we are not trying to solve the extension versioning issue; we assume that all extensions stay mapped to compute images 1-1, as they are now.
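For illustration only, a per-extension spec entry could carry roughly the following information; the field names below are invented for this sketch and are not the actual spec format:
```rust
/// Hypothetical shape of a public-extension spec entry embedded in the compute image:
/// which files the extension ships, where they live, and the control file contents.
#[derive(Debug)]
struct ExtensionSpec {
    name: String,
    /// archive path in the extension store (assumed layout)
    archive: String,
    /// files to install, relative to the Postgres install dir
    files: Vec<String>,
    /// verbatim contents of the .control file
    control: String,
}

fn main() {
    let spec = ExtensionSpec {
        name: "pg_hypothetical".to_string(),
        archive: "v16/extensions/pg_hypothetical.tar.zst".to_string(),
        files: vec![
            "lib/pg_hypothetical.so".to_string(),
            "share/extension/pg_hypothetical--1.0.sql".to_string(),
        ],
        control: "default_version = '1.0'\nmodule_pathname = '$libdir/pg_hypothetical'".to_string(),
    };
    println!("{spec:#?}");
}
```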
#### Test changes:
- This is general functionality and will be covered by e2e tests.
- We will need to add a test for extensions to ensure that they are available for every new compute version. We don't need to run the extension regression tests here; just ensure that `CREATE EXTENSION ext;` works.

View File

@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
Neon storage broker, providing messaging between safekeepers and pageservers.
[storage_broker.md](./storage_broker.md)
`storage_controller`:
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
managing a many-sharded tenant as a single entity.
`/control_plane`:
Local control plane.

View File

@@ -1,150 +0,0 @@
# Storage Controller
## Concepts
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
the underlying details of how data is spread across multiple nodes.
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
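A minimal sketch of that reconciliation-loop idea, with hypothetical simplified types standing in for the controller's intent and observed state (this is not the actual implementation):
```rust
use std::collections::HashMap;

// Hypothetical, simplified stand-ins for the controller's intent/observed state.
#[derive(Clone, Debug, PartialEq)]
enum Location { Attached, Secondary }

#[derive(Default)]
struct Intent { locations: HashMap<u64, Location> }   // node id -> desired mode

#[derive(Default)]
struct Observed { locations: HashMap<u64, Location> } // node id -> last known mode

/// One pass of the reconcile loop: make the outside world match the intent.
fn reconcile(intent: &Intent, observed: &mut Observed) -> usize {
    let mut changes = 0;
    // Configure every location the intent wants but the world doesn't have yet.
    for (node, want) in &intent.locations {
        if observed.locations.get(node) != Some(want) {
            // In the real controller this would be a pageserver location_config call.
            observed.locations.insert(*node, want.clone());
            changes += 1;
        }
    }
    // Detach locations that are observed but no longer intended.
    observed.locations.retain(|node, _| {
        let keep = intent.locations.contains_key(node);
        if !keep { changes += 1; }
        keep
    });
    changes
}

fn main() {
    let mut intent = Intent::default();
    intent.locations.insert(1, Location::Attached);
    intent.locations.insert(2, Location::Secondary);

    let mut observed = Observed::default();
    observed.locations.insert(3, Location::Attached); // stale location to detach

    let changes = reconcile(&intent, &mut observed);
    println!("applied {changes} changes; observed now matches intent");
}
```
The real controller tracks much more per shard (sequence numbers, waiters, last errors), but the description above boils down to this compare-and-converge shape.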
## APIs
The storage controller's HTTP server implements four logically separate APIs:
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that's where clients expect to find it on a pageserver.
- `/control/v1/...` path is the storage controller's own API, which enables operations such as registering and managing pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
to ensure data safety with generation numbers.
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as the pageservers' APIs).
See the `http.rs` file in the source for where the HTTP APIs are implemented.
## Database
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice that lets most operations be served efficiently from memory rather than reading from the database. A useful metaphor is that we persist objects such as tenants and nodes, but we do not persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and rebuilt on startup.
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The `diesel` crate is used for defining models & migrations.
Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller's database.
### Diesel tip: migrations
If you need to modify the database schema, here's how to create a migration:
- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary and automatically run at startup after it is deployed, so once you've committed a migration no further steps are needed.
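For illustration only, the SQL files for a hypothetical migration generated this way could look like the following (the table and column names are made up, not part of the real schema):
```
-- migrations/2024-05-01-000000_add_example_flag/up.sql
ALTER TABLE nodes ADD COLUMN example_flag BOOLEAN NOT NULL DEFAULT FALSE;

-- migrations/2024-05-01-000000_add_example_flag/down.sql
ALTER TABLE nodes DROP COLUMN example_flag;
```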
## storcon_cli
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
only necessary for debugging, but may also be used to manage nodes (e.g. marking a node as offline).
`storcon_cli --help` includes details on commands.
# Deploying
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
part of a self-hosted system.
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
reference when figuring out deployment._
## Database
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
Set the URL to the database using the `--database-url` CLI option.
There is no need to run migrations manually: the storage controller automatically applies migrations
when it starts up.
## Configure pageservers to use the storage controller
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters (see the example `pageserver.toml` fragment at the end of this section).
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
with the storage controller when it starts up. See the example below for the format of this file.
### Example `metadata.json`
```
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
```
- `port` and `host` refer to the pageserver's _postgres_ port and host; these must be accessible from wherever postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP API; this must be accessible from where the storage controller runs.
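### Example `pageserver.toml` fragment
A minimal sketch of the settings from step 1, with placeholder values; adjust the address and token for your deployment:
```
control_plane_api = 'http://storage-controller.localdomain:1234/upcall/v1/'
control_plane_api_token = '<JWT token>'
```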
## Handle compute notifications
The storage controller independently moves tenant attachments between pageservers in response to
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premises Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
    node_id: NodeId,
    shard_number: ShardNumber,
}

struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
    stripe_size: Option<ShardStripeSize>,
    shards: Vec<ComputeHookNotifyRequestShard>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstr` to a comma-separated list of pageserver connection strings, one per shard, ordered according to the `shards` list. The `node_id` of each shard must be converted to the address and port of that pageserver node.
- if `stripe_size` is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds. (A minimal sketch of this handling follows the example body below.)
### Example notification body
```
{
  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
  "stripe_size": 32768,
  "shards": [
    {"node_id": 344, "shard_number": 0},
    {"node_id": 722, "shard_number": 1}
  ]
}
```
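For illustration only, here is a minimal sketch (not Neon's implementation) of turning such a notification into the postgres settings described above. It assumes `serde`/`serde_json` for parsing and a hypothetical `node_connstr` map from pageserver node id to connection string, which your deployment would have to provide; the HTTP transport, SIGHUP delivery, and auth handling are left out.
```
use std::collections::HashMap;

use serde::Deserialize;

#[derive(Deserialize)]
struct NotifyShard {
    node_id: u64,
    shard_number: u8,
}

#[derive(Deserialize)]
struct NotifyRequest {
    tenant_id: String,
    stripe_size: Option<u64>,
    shards: Vec<NotifyShard>,
}

/// Turn a notification into the postgres settings to apply. After writing
/// these settings into the compute's configuration, send SIGHUP to postgres
/// and only then return 200 to the storage controller.
fn settings_for(
    req: &NotifyRequest,
    node_connstr: &HashMap<u64, String>,
) -> Result<Vec<(String, String)>, String> {
    // Collect one connection string per shard, ordered by shard number.
    let mut shards: Vec<(u8, &str)> = Vec::with_capacity(req.shards.len());
    for s in &req.shards {
        let conn = node_connstr
            .get(&s.node_id)
            .ok_or_else(|| format!("unknown pageserver node {}", s.node_id))?;
        shards.push((s.shard_number, conn.as_str()));
    }
    shards.sort_by_key(|(n, _)| *n);

    let connstr: String = shards
        .iter()
        .map(|(_, c)| *c)
        .collect::<Vec<_>>()
        .join(",");

    let mut settings = vec![("neon.pageserver_connstr".to_string(), connstr)];
    if let Some(stripe_size) = req.stripe_size {
        settings.push(("neon.stripe_size".to_string(), stripe_size.to_string()));
    }
    Ok(settings)
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // The example notification body from above, with hypothetical node addresses.
    let body = r#"{
        "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
        "stripe_size": 32768,
        "shards": [
            {"node_id": 344, "shard_number": 0},
            {"node_id": 722, "shard_number": 1}
        ]
    }"#;
    let req: NotifyRequest = serde_json::from_str(body)?;

    let node_connstr = HashMap::from([
        (344u64, "postgresql://no_user@pageserver-1.localdomain:6400".to_string()),
        (722u64, "postgresql://no_user@pageserver-2.localdomain:6400".to_string()),
    ]);

    println!("reconfiguring compute for tenant {}", req.tenant_id);
    for (key, value) in settings_for(&req, &node_connstr)? {
        println!("{key}='{value}'");
    }
    Ok(())
}
```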

477
flake.lock generated
View File

@@ -1,477 +0,0 @@
{
"nodes": {
"cachix": {
"inputs": {
"devenv": "devenv_2",
"flake-compat": [
"devenv",
"flake-compat"
],
"nixpkgs": [
"devenv",
"nixpkgs"
],
"pre-commit-hooks": [
"devenv",
"pre-commit-hooks"
]
},
"locked": {
"lastModified": 1712055811,
"narHash": "sha256-7FcfMm5A/f02yyzuavJe06zLa9hcMHsagE28ADcmQvk=",
"owner": "cachix",
"repo": "cachix",
"rev": "02e38da89851ec7fec3356a5c04bc8349cae0e30",
"type": "github"
},
"original": {
"owner": "cachix",
"repo": "cachix",
"type": "github"
}
},
"devenv": {
"inputs": {
"cachix": "cachix",
"flake-compat": "flake-compat_2",
"nix": "nix_2",
"nixpkgs": "nixpkgs_2",
"pre-commit-hooks": "pre-commit-hooks"
},
"locked": {
"lastModified": 1714390914,
"narHash": "sha256-W5DFIifCjGYJXJzLU3RpqBeqes4zrf0Sr/6rwzTygPU=",
"owner": "cachix",
"repo": "devenv",
"rev": "34e6461fd76b5f51ad5f8214f5cf22c4cd7a196e",
"type": "github"
},
"original": {
"owner": "cachix",
"repo": "devenv",
"type": "github"
}
},
"devenv_2": {
"inputs": {
"flake-compat": [
"devenv",
"cachix",
"flake-compat"
],
"nix": "nix",
"nixpkgs": "nixpkgs",
"poetry2nix": "poetry2nix",
"pre-commit-hooks": [
"devenv",
"cachix",
"pre-commit-hooks"
]
},
"locked": {
"lastModified": 1708704632,
"narHash": "sha256-w+dOIW60FKMaHI1q5714CSibk99JfYxm0CzTinYWr+Q=",
"owner": "cachix",
"repo": "devenv",
"rev": "2ee4450b0f4b95a1b90f2eb5ffea98b90e48c196",
"type": "github"
},
"original": {
"owner": "cachix",
"ref": "python-rewrite",
"repo": "devenv",
"type": "github"
}
},
"flake-compat": {
"flake": false,
"locked": {
"lastModified": 1673956053,
"narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
"owner": "edolstra",
"repo": "flake-compat",
"rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"flake-compat_2": {
"flake": false,
"locked": {
"lastModified": 1696426674,
"narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
"owner": "edolstra",
"repo": "flake-compat",
"rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"flake-parts": {
"inputs": {
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1712014858,
"narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
"type": "github"
},
"original": {
"id": "flake-parts",
"type": "indirect"
}
},
"flake-utils": {
"inputs": {
"systems": "systems"
},
"locked": {
"lastModified": 1689068808,
"narHash": "sha256-6ixXo3wt24N/melDWjq70UuHQLxGV8jZvooRanIHXw0=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "919d646de7be200f3bf08cb76ae1f09402b6f9b4",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"flake-utils_2": {
"inputs": {
"systems": "systems_2"
},
"locked": {
"lastModified": 1710146030,
"narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"gitignore": {
"inputs": {
"nixpkgs": [
"devenv",
"pre-commit-hooks",
"nixpkgs"
]
},
"locked": {
"lastModified": 1709087332,
"narHash": "sha256-HG2cCnktfHsKV0s4XW83gU3F57gaTljL9KNSuG6bnQs=",
"owner": "hercules-ci",
"repo": "gitignore.nix",
"rev": "637db329424fd7e46cf4185293b9cc8c88c95394",
"type": "github"
},
"original": {
"owner": "hercules-ci",
"repo": "gitignore.nix",
"type": "github"
}
},
"nix": {
"inputs": {
"flake-compat": "flake-compat",
"nixpkgs": [
"devenv",
"cachix",
"devenv",
"nixpkgs"
],
"nixpkgs-regression": "nixpkgs-regression"
},
"locked": {
"lastModified": 1712911606,
"narHash": "sha256-BGvBhepCufsjcUkXnEEXhEVjwdJAwPglCC2+bInc794=",
"owner": "domenkozar",
"repo": "nix",
"rev": "b24a9318ea3f3600c1e24b4a00691ee912d4de12",
"type": "github"
},
"original": {
"owner": "domenkozar",
"ref": "devenv-2.21",
"repo": "nix",
"type": "github"
}
},
"nix-github-actions": {
"inputs": {
"nixpkgs": [
"devenv",
"cachix",
"devenv",
"poetry2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1688870561,
"narHash": "sha256-4UYkifnPEw1nAzqqPOTL2MvWtm3sNGw1UTYTalkTcGY=",
"owner": "nix-community",
"repo": "nix-github-actions",
"rev": "165b1650b753316aa7f1787f3005a8d2da0f5301",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "nix-github-actions",
"type": "github"
}
},
"nix_2": {
"inputs": {
"flake-compat": [
"devenv",
"flake-compat"
],
"nixpkgs": [
"devenv",
"nixpkgs"
],
"nixpkgs-regression": "nixpkgs-regression_2"
},
"locked": {
"lastModified": 1712911606,
"narHash": "sha256-BGvBhepCufsjcUkXnEEXhEVjwdJAwPglCC2+bInc794=",
"owner": "domenkozar",
"repo": "nix",
"rev": "b24a9318ea3f3600c1e24b4a00691ee912d4de12",
"type": "github"
},
"original": {
"owner": "domenkozar",
"ref": "devenv-2.21",
"repo": "nix",
"type": "github"
}
},
"nixpkgs": {
"locked": {
"lastModified": 1692808169,
"narHash": "sha256-x9Opq06rIiwdwGeK2Ykj69dNc2IvUH1fY55Wm7atwrE=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "9201b5ff357e781bf014d0330d18555695df7ba8",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs-lib": {
"locked": {
"dir": "lib",
"lastModified": 1711703276,
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"type": "github"
},
"original": {
"dir": "lib",
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs-regression": {
"locked": {
"lastModified": 1643052045,
"narHash": "sha256-uGJ0VXIhWKGXxkeNnq4TvV3CIOkUJ3PAoLZ3HMzNVMw=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "215d4d0fd80ca5163643b03a33fde804a29cc1e2",
"type": "github"
},
"original": {
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "215d4d0fd80ca5163643b03a33fde804a29cc1e2",
"type": "github"
}
},
"nixpkgs-regression_2": {
"locked": {
"lastModified": 1643052045,
"narHash": "sha256-uGJ0VXIhWKGXxkeNnq4TvV3CIOkUJ3PAoLZ3HMzNVMw=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "215d4d0fd80ca5163643b03a33fde804a29cc1e2",
"type": "github"
},
"original": {
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "215d4d0fd80ca5163643b03a33fde804a29cc1e2",
"type": "github"
}
},
"nixpkgs-stable": {
"locked": {
"lastModified": 1710695816,
"narHash": "sha256-3Eh7fhEID17pv9ZxrPwCLfqXnYP006RKzSs0JptsN84=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "614b4613980a522ba49f0d194531beddbb7220d3",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-23.11",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs_2": {
"locked": {
"lastModified": 1713361204,
"narHash": "sha256-TA6EDunWTkc5FvDCqU3W2T3SFn0gRZqh6D/hJnM02MM=",
"owner": "cachix",
"repo": "devenv-nixpkgs",
"rev": "285676e87ad9f0ca23d8714a6ab61e7e027020c6",
"type": "github"
},
"original": {
"owner": "cachix",
"ref": "rolling",
"repo": "devenv-nixpkgs",
"type": "github"
}
},
"nixpkgs_3": {
"locked": {
"lastModified": 1714314149,
"narHash": "sha256-yNAevSKF4krRWacmLUsLK7D7PlfuY3zF0lYnGYNi9vQ=",
"owner": "nixos",
"repo": "nixpkgs",
"rev": "cf8cc1201be8bc71b7cbbbdaf349b22f4f99c7ae",
"type": "github"
},
"original": {
"owner": "nixos",
"ref": "nixpkgs-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"poetry2nix": {
"inputs": {
"flake-utils": "flake-utils",
"nix-github-actions": "nix-github-actions",
"nixpkgs": [
"devenv",
"cachix",
"devenv",
"nixpkgs"
]
},
"locked": {
"lastModified": 1692876271,
"narHash": "sha256-IXfZEkI0Mal5y1jr6IRWMqK8GW2/f28xJenZIPQqkY0=",
"owner": "nix-community",
"repo": "poetry2nix",
"rev": "d5006be9c2c2417dafb2e2e5034d83fabd207ee3",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "poetry2nix",
"type": "github"
}
},
"pre-commit-hooks": {
"inputs": {
"flake-compat": [
"devenv",
"flake-compat"
],
"flake-utils": "flake-utils_2",
"gitignore": "gitignore",
"nixpkgs": [
"devenv",
"nixpkgs"
],
"nixpkgs-stable": "nixpkgs-stable"
},
"locked": {
"lastModified": 1713775815,
"narHash": "sha256-Wu9cdYTnGQQwtT20QQMg7jzkANKQjwBD9iccfGKkfls=",
"owner": "cachix",
"repo": "pre-commit-hooks.nix",
"rev": "2ac4dcbf55ed43f3be0bae15e181f08a57af24a4",
"type": "github"
},
"original": {
"owner": "cachix",
"repo": "pre-commit-hooks.nix",
"type": "github"
}
},
"root": {
"inputs": {
"devenv": "devenv",
"flake-parts": "flake-parts",
"nixpkgs": "nixpkgs_3"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
}
},
"root": "root",
"version": 7
}

View File

@@ -1,58 +0,0 @@
{
inputs = {
nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable";
devenv.url = "github:cachix/devenv";
};
outputs = inputs @ {
flake-parts,
nixpkgs,
...
}:
flake-parts.lib.mkFlake {inherit inputs;} {
imports = [
inputs.devenv.flakeModule
];
systems = nixpkgs.lib.systems.flakeExposed;
perSystem = {
config,
self',
inputs',
pkgs,
system,
lib,
...
}: {
devenv.shells.default = {
packages = with pkgs; [
rustup
postgresql_16
protobuf_26
icu74
pkg-config
bison
flex
openssl
libiconv
readline
zlib
curl
];
env.DISABLE_HOMEBREW = "1";
scripts = {
neonmake = {
description = "Build Neon";
exec =
if (pkgs.stdenv.isDarwin)
then "make -j`sysctl -n hw.logicalcpu` -s"
else "make -j`nproc` -s";
};
};
};
};
};
}

View File

@@ -10,13 +10,11 @@ libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
measured-process.workspace = true
[dev-dependencies]
rand = "0.8"

View File

@@ -7,19 +7,14 @@
//! use significantly less memory than this, but can only approximate the cardinality.
use std::{
hash::{BuildHasher, BuildHasherDefault, Hash},
sync::atomic::AtomicU8,
collections::HashMap,
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
};
use measured::{
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
metric::{
group::{Encoding, MetricValue},
name::MetricNameEncoder,
Metric, MetricType, MetricVec,
},
text::TextEncoder,
LabelGroup,
use prometheus::{
core::{self, Describer},
proto, Opts,
};
use twox_hash::xxh3;
@@ -98,25 +93,203 @@ macro_rules! register_hll {
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
pub struct HyperLogLogState<const N: usize> {
shards: [AtomicU8; N],
#[derive(Clone)]
pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
}
impl<const N: usize> Default for HyperLogLogState<N> {
fn default() -> Self {
#[allow(clippy::declare_interior_mutable_const)]
const ZERO: AtomicU8 = AtomicU8::new(0);
Self { shards: [ZERO; N] }
struct HyperLogLogVecCore<const N: usize> {
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
pub desc: core::Desc,
pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> MetricType for HyperLogLogState<N> {
type Metadata = ();
impl<const N: usize> HyperLogLogVec<N> {
/// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<const N: usize> HyperLogLogState<N> {
impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check exist first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn take_sample(&self) -> [u8; N] {
self.shards.each_ref().map(|x| {
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition,
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
x.swap(0, std::sync::atomic::Ordering::Relaxed)
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
})
}
}
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
for HyperLogLogState<N>
{
fn write_type(
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
enc.write_type(&name, measured::text::MetricType::Gauge)
fn make_label_pairs(
desc: &core::Desc,
label_values: &[&str],
) -> prometheus::Result<Vec<proto::LabelPair>> {
if desc.variable_labels.len() != label_values.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: desc.variable_labels.len(),
got: label_values.len(),
});
}
fn collect_into(
&self,
_: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
struct I64(i64);
impl LabelValue for I64 {
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0)
}
}
struct HllShardLabel {
hll_shard: i64,
}
impl LabelGroup for HllShardLabel {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const LE: &LabelName = LabelName::from_str("hll_shard");
v.write_value(LE, &I64(self.hll_shard));
}
}
self.take_sample()
.into_iter()
.enumerate()
.try_for_each(|(hll_shard, val)| {
enc.write_metric_value(
name.by_ref(),
labels.by_ref().compose_with(HllShardLabel {
hll_shard: hll_shard as i64,
}),
MetricValue::Int(val as i64),
)
})
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
if total_len == 0 {
return Ok(vec![]);
}
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use measured::{label::StaticLabelSet, FixedCardinalityLabel};
use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec;
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "x")]
enum Label {
A,
B,
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
let mut metrics = vec![];
hll.core
.children
.read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
}
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
// cannot go through the `hll.collect_family_into` interface yet...
// need to see if I can fix the conflicting impls problem in measured.
(
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
)
}
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
let mut buckets = [0.0; 32];
for &sample in samples {
for (i, m) in sample.into_iter().enumerate() {
buckets[i] = f64::max(buckets[i], m as f64);
for metric in metrics.chunks_exact(32) {
if filter(&metric[0]) {
for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
}
}
@@ -238,7 +437,7 @@ mod tests {
}
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new();
@@ -246,20 +445,18 @@ mod tests {
for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::A))
.measure(&x.to_bits());
hll.with_label_values(&["a"]).measure(&x.to_bits());
}
for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::B))
.measure(&x.to_bits());
hll.with_label_values(&["b"]).measure(&x.to_bits());
}
let merge = &set_a | &set_b;
let (a, b) = collect(&hll);
let len = get_cardinality(&[a, b]);
let len_a = get_cardinality(&[a]);
let len_b = get_cardinality(&[b]);
let metrics = collect(&hll);
let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
}

View File

@@ -4,17 +4,6 @@
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use measured::{
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
metric::{
counter::CounterState,
gauge::GaugeState,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
FixedCardinalityLabel, LabelGroup, MetricGroup,
};
use once_cell::sync::Lazy;
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -22,7 +11,6 @@ use prometheus::core::{
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
use prometheus::Registry;
pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
mod hll;
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
pub use hll::{HyperLogLog, HyperLogLogVec};
#[cfg(target_os = "linux")]
pub mod more_process_metrics;
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
INTERNAL_REGISTRY.register(c)
}
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];
pub struct BuildInfo {
pub revision: &'static str,
pub build_tag: &'static str,
}
// todo: allow label group without the set
impl LabelGroup for BuildInfo {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const REVISION: &LabelName = LabelName::from_str("revision");
v.write_value(REVISION, &self.revision);
const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
v.write_value(BUILD_TAG, &self.build_tag);
}
}
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
GaugeState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
enc.write_help(&name, "Build/version information")?;
GaugeState::write_type(&name, enc)?;
GaugeState {
count: std::sync::atomic::AtomicI64::new(1),
}
.collect_into(&(), self, name, enc)
}
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics {
#[cfg(target_os = "linux")]
#[metric(namespace = "process")]
#[metric(init = measured_process::ProcessCollector::for_self())]
process: measured_process::ProcessCollector,
#[metric(namespace = "libmetrics")]
#[metric(init = LibMetrics::new(build_info))]
libmetrics: LibMetrics,
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct LibMetrics {
#[metric(init = build_info)]
build_info: BuildInfo,
#[metric(flatten)]
rusage: Rusage,
serve_count: CollectionCounter,
}
fn write_gauge<Enc: Encoding>(
x: i64,
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Enc,
) -> Result<(), Enc::Err> {
enc.write_metric_value(name, labels, MetricValue::Int(x))
}
#[derive(Default)]
struct Rusage;
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "io_operation")]
enum IoOp {
Read,
Write,
}
impl<T: Encoding> MetricGroup<T> for Rusage
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
let ru = get_rusage_stats();
enc.write_help(
DISK_IO,
"Bytes written and read from disk, grouped by the operation (read|write)",
)?;
GaugeState::write_type(DISK_IO, enc)?;
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
GaugeState::write_type(MAXRSS, enc)?;
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
Ok(())
}
}
#[derive(Default)]
struct CollectionCounter(CounterState);
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
where
CounterState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
self.0.inc();
enc.write_help(&name, "Number of metric requests made")?;
self.0.collect_into(&(), NoLabels, name, enc)
}
}
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
let metric = register_int_gauge_vec!(
"libmetrics_build_info",
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
.expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1);
}
const BYTES_IN_BLOCK: i64 = 512;
// Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -250,22 +117,14 @@ const BYTES_IN_BLOCK: i64 = 512;
fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES
.with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
DISK_IO_BYTES
.with_label_values(&["write"])
.set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
// On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
#[cfg(target_os = "macos")]
{
MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
}
#[cfg(not(target_os = "macos"))]
{
MAXRSS_KB.set(rusage_stats.ru_maxrss);
}
MAXRSS_KB.set(rusage_stats.ru_maxrss);
}
fn get_rusage_stats() -> libc::rusage {
@@ -292,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
}
}};
}
/// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
@@ -330,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<GenericCounterPair<P>> {
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
@@ -346,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}
@@ -430,171 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
pub trait CounterPairAssoc {
const INC_NAME: &'static MetricName;
const DEC_NAME: &'static MetricName;
const INC_HELP: &'static str;
const DEC_HELP: &'static str;
type LabelGroupSet: LabelGroupSet;
}
pub struct CounterPairVec<A: CounterPairAssoc> {
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
where
A::LabelGroupSet: Default,
{
fn default() -> Self {
Self {
vec: Default::default(),
}
}
}
impl<A: CounterPairAssoc> CounterPairVec<A> {
pub fn guard(
&self,
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
) -> MeasuredCounterPairGuard<'_, A> {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).inc.inc();
MeasuredCounterPairGuard { vec: &self.vec, id }
}
pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).inc.inc();
}
pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).dec.inc();
}
pub fn remove_metric(
&self,
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
) -> Option<MeasuredCounterPairState> {
let id = self.vec.with_labels(labels);
self.vec.remove_metric(id)
}
}
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
where
T: ::measured::metric::group::Encoding,
A: CounterPairAssoc,
::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
// write decrement first to avoid a race condition where inc - dec < 0
T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
self.vec
.collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
self.vec
.collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
Ok(())
}
}
#[derive(MetricGroup, Default)]
pub struct MeasuredCounterPairState {
pub inc: CounterState,
pub dec: CounterState,
}
impl measured::metric::MetricType for MeasuredCounterPairState {
type Metadata = ();
}
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
id: measured::metric::LabelId<A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
fn drop(&mut self) {
self.vec.get_metric(self.id).dec.inc();
}
}
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
struct Inc<T>(T);
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
struct Dec<T>(T);
impl<T: Encoding> Encoding for Inc<T> {
type Err = T::Err;
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Inc<T>,
) -> Result<(), T::Err> {
self.inc.collect_into(metadata, labels, name, &mut enc.0)
}
}
impl<T: Encoding> Encoding for Dec<T> {
type Err = T::Err;
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
/// Write the dec counter to the encoder
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Dec<T>,
) -> Result<(), T::Err> {
self.dec.collect_into(metadata, labels, name, &mut enc.0)
}
}

View File

@@ -2,9 +2,9 @@ use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
/// in [`attachment_service::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use utils::id::NodeId;
use crate::{
models::{ShardParameters, TenantConfig},
@@ -68,27 +68,12 @@ pub struct TenantLocateResponse {
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy,
pub config: TenantConfig,
}
#[derive(Serialize, Deserialize)]
pub struct NodeDescribeResponse {
pub id: NodeId,
pub availability: NodeAvailabilityWrapper,
pub scheduling: NodeSchedulingPolicy,
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,
@@ -104,8 +89,6 @@ pub struct TenantDescribeResponseShard {
pub is_pending_compute_notification: bool,
/// A shard split is currently underway
pub is_splitting: bool,
pub scheduling_policy: ShardSchedulingPolicy,
}
/// Explicitly migrating a particular shard is a low level operation
@@ -120,7 +103,7 @@ pub struct TenantShardMigrateRequest {
/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct UtilizationScore(pub u64);
impl UtilizationScore {
@@ -129,7 +112,7 @@ impl UtilizationScore {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
#[derive(Serialize, Clone, Copy)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
// Normal, happy state
@@ -152,7 +135,7 @@ impl Eq for NodeAvailability {}
// This wrapper provides serde functionality and it should only be used to
// communicate with external callers which don't know or care about the
// utilisation score of the pageserver it is targeting.
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
#[derive(Serialize, Deserialize, Clone)]
pub enum NodeAvailabilityWrapper {
Active,
Offline,
@@ -178,6 +161,21 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
}
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
// This is used when parsing node configuration requests from neon-local.
// Assume the worst possible utilisation score
// and let it get updated via the heartbeats.
"active" => Ok(Self::Active(UtilizationScore::worst())),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum ShardSchedulingPolicy {
// Normal mode: the tenant's scheduled locations may be updated at will, including
@@ -204,7 +202,7 @@ impl Default for ShardSchedulingPolicy {
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,

View File

@@ -1,6 +1,5 @@
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::BufMut;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
@@ -22,107 +21,15 @@ pub struct Key {
pub field6: u32,
}
/// The storage key size.
pub const KEY_SIZE: usize = 18;
/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
/// See [`Key::to_i128`] for more information on the encoding.
pub const METADATA_KEY_SIZE: usize = 16;
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
/// The (reserved) key prefix of relation sizes.
pub const RELATION_SIZE_PREFIX: u8 = 0x61;
/// The key prefix of AUX file keys.
pub const AUX_KEY_PREFIX: u8 = 0x62;
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
}
impl Key {
/// Check if the key falls in the range of metadata keys.
pub const fn is_metadata_key(&self) -> bool {
self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
}
/// Encode a metadata key to a storage key.
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
assert!(is_metadata_key_slice(key), "key not in metadata key range");
Key {
field1: key[0],
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
field5: key[11],
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
}
}
/// Encode a metadata key to a storage key.
pub fn from_metadata_key(key: &[u8]) -> Self {
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
}
/// Extract a metadata key to a writer. The result should always be 16 bytes.
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
writer.put_u8(self.field1);
assert!(self.field2 <= 0xFFFF);
writer.put_u16(self.field2 as u16);
writer.put_u32(self.field3);
writer.put_u32(self.field4);
writer.put_u8(self.field5);
writer.put_u32(self.field6);
}
/// Get the range of metadata keys.
pub fn metadata_key_range() -> Range<Self> {
Key {
field1: METADATA_KEY_BEGIN_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: METADATA_KEY_END_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
/// Get the range of aux keys.
pub fn metadata_aux_key_range() -> Range<Self> {
Key {
field1: AUX_KEY_PREFIX,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}..Key {
field1: AUX_KEY_PREFIX + 1,
field2: 0,
field3: 0,
field4: 0,
field5: 0,
field6: 0,
}
}
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
/// As long as Neon does not support tablespace (because of lack of access to local file system),
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
pub fn to_i128(&self) -> i128 {
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0x7F) as i128) << 120)
(((self.field1 & 0xf) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
| ((self.field4 as i128) << 40)
@@ -132,7 +39,7 @@ impl Key {
pub const fn from_i128(x: i128) -> Self {
Key {
field1: ((x >> 120) & 0x7F) as u8,
field1: ((x >> 120) & 0xf) as u8,
field2: ((x >> 104) & 0xFFFF) as u32,
field3: (x >> 72) as u32,
field4: (x >> 40) as u32,
@@ -141,11 +48,11 @@ impl Key {
}
}
pub const fn next(&self) -> Key {
pub fn next(&self) -> Key {
self.add(1)
}
pub const fn add(&self, x: u32) -> Key {
pub fn add(&self, x: u32) -> Key {
let mut key = *self;
let r = key.field6.overflowing_add(x);
@@ -174,8 +81,6 @@ impl Key {
key
}
/// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::from_metadata_key`] instead.
pub fn from_slice(b: &[u8]) -> Self {
Key {
field1: b[0],
@@ -187,8 +92,6 @@ impl Key {
}
}
/// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
/// Use [`Key::extract_metadata_key_to_writer`] instead.
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
buf[0] = self.field1;
BE::write_u32(&mut buf[1..5], self.field2);
@@ -572,14 +475,12 @@ pub const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
// AUX_FILES currently stores only data for logical replication (slots etc), and
// we don't preserve these on a branch because safekeepers can't follow timeline
// switch (and generally it likely should be optional), so ignore these.
#[inline(always)]
pub fn is_inherited_key(key: Key) -> bool {
!NON_INHERITED_RANGE.contains(&key)
key != AUX_FILES_KEY
}
#[inline(always)]
@@ -655,14 +556,11 @@ impl std::str::FromStr for Key {
mod tests {
use std::str::FromStr;
use crate::key::is_metadata_key_slice;
use crate::key::Key;
use rand::Rng;
use rand::SeedableRng;
use super::AUX_KEY_PREFIX;
#[test]
fn display_fromstr_bijection() {
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -678,16 +576,4 @@ mod tests {
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
}
#[test]
fn test_metadata_keys() {
let mut metadata_key = vec![AUX_KEY_PREFIX];
metadata_key.extend_from_slice(&[0xFF; 15]);
let encoded_key = Key::from_metadata_key(&metadata_key);
let mut output_key = Vec::new();
encoded_key.extract_metadata_key_to_writer(&mut output_key);
assert_eq!(metadata_key, output_key);
assert!(encoded_key.is_metadata_key());
assert!(is_metadata_key_slice(&metadata_key));
}
}

View File

@@ -15,13 +15,7 @@ pub struct KeySpace {
}
impl KeySpace {
/// Create a key space with a single range.
pub fn single(key_range: Range<Key>) -> Self {
Self {
ranges: vec![key_range],
}
}
///
/// Partition a key space into roughly chunks of roughly 'target_size' bytes
/// in each partition.
///
@@ -70,10 +64,6 @@ impl KeySpace {
KeyPartitioning { parts }
}
pub fn is_empty(&self) -> bool {
self.total_size() == 0
}
/// Merge another keyspace into the current one.
/// Note: the keyspaces must not overlap (enforced via assertions)
pub fn merge(&mut self, other: &KeySpace) {
@@ -104,13 +94,12 @@ impl KeySpace {
/// Remove all keys in `other` from `self`.
/// This can involve splitting or removing of existing ranges.
/// Returns the removed keyspace
pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
let (self_start, self_end) = match (self.start(), self.end()) {
(Some(start), Some(end)) => (start, end),
_ => {
// self is empty
return KeySpace::default();
return;
}
};
@@ -123,37 +112,30 @@ impl KeySpace {
.skip_while(|range| self_start >= range.end)
.take_while(|range| self_end > range.start);
let mut removed_accum = KeySpaceRandomAccum::new();
for range in other_ranges {
while let Some(overlap_at) = self.overlaps_at(range) {
let overlapped = self.ranges[overlap_at].clone();
if overlapped.start < range.start && overlapped.end <= range.end {
// Higher part of the range is completely overlapped.
removed_accum.add_range(range.start..self.ranges[overlap_at].end);
self.ranges[overlap_at].end = range.start;
}
if overlapped.start >= range.start && overlapped.end > range.end {
// Lower part of the range is completely overlapped.
removed_accum.add_range(self.ranges[overlap_at].start..range.end);
self.ranges[overlap_at].start = range.end;
}
if overlapped.start < range.start && overlapped.end > range.end {
// Middle part of the range is overlapped.
removed_accum.add_range(range.clone());
self.ranges[overlap_at].end = range.start;
self.ranges
.insert(overlap_at + 1, range.end..overlapped.end);
}
if overlapped.start >= range.start && overlapped.end <= range.end {
// Whole range is overlapped
removed_accum.add_range(self.ranges[overlap_at].clone());
self.ranges.remove(overlap_at);
}
}
}
removed_accum.to_keyspace()
}
pub fn start(&self) -> Option<Key> {
@@ -188,11 +170,6 @@ impl KeySpace {
pub fn overlaps(&self, range: &Range<Key>) -> bool {
self.overlaps_at(range).is_some()
}
/// Check if the keyspace contains a key
pub fn contains(&self, key: &Key) -> bool {
self.overlaps(&(*key..key.next()))
}
}
///
@@ -576,16 +553,7 @@ mod tests {
Key::from_i128(11)..Key::from_i128(13),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(2)..Key::from_i128(3),
Key::from_i128(6)..Key::from_i128(7),
Key::from_i128(11)..Key::from_i128(12),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -615,17 +583,7 @@ mod tests {
Key::from_i128(14)..Key::from_i128(17),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(3)..Key::from_i128(5),
Key::from_i128(8)..Key::from_i128(10),
Key::from_i128(14)..Key::from_i128(15),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -652,11 +610,7 @@ mod tests {
Key::from_i128(15)..Key::from_i128(17),
],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace::default();
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![
@@ -683,17 +637,7 @@ mod tests {
let key_space2 = KeySpace {
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
};
let removed = key_space1.remove_overlapping_with(&key_space2);
let removed_expected = KeySpace {
ranges: vec![
Key::from_i128(9)..Key::from_i128(10),
Key::from_i128(12)..Key::from_i128(15),
Key::from_i128(17)..Key::from_i128(19),
],
};
assert_eq!(removed, removed_expected);
key_space1.remove_overlapping_with(&key_space2);
assert_eq!(
key_space1.ranges,
vec![

View File

@@ -20,7 +20,6 @@ use utils::{
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
serde_system_time,
};
use crate::controller_api::PlacementPolicy;
@@ -303,7 +302,6 @@ pub struct TenantConfig {
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub switch_to_aux_file_v2: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -430,7 +428,6 @@ pub struct StatusResponse {
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
#[serde(skip_serializing_if = "Option::is_none")]
pub tenant_id: Option<TenantShardId>,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -749,18 +746,10 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
pub pid: u32,
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
pub kind: Cow<'static, str>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub process: Option<WalRedoManagerProcessStatus>,
pub pid: Option<u32>,
}
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
@@ -769,7 +758,11 @@ pub struct WalRedoManagerStatus {
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded.
pub heatmap_mtime: Option<serde_system_time::SystemTime>,
#[serde(
serialize_with = "opt_ser_rfc3339_millis",
deserialize_with = "opt_deser_rfc3339_millis"
)]
pub heatmap_mtime: Option<SystemTime>,
/// The number of layers currently on-disk
pub layers_downloaded: usize,
@@ -782,15 +775,27 @@ pub struct SecondaryProgress {
pub bytes_total: u64,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantScanRemoteStorageShard {
pub tenant_shard_id: TenantShardId,
pub generation: Option<u32>,
fn opt_ser_rfc3339_millis<S: serde::Serializer>(
ts: &Option<SystemTime>,
serializer: S,
) -> Result<S::Ok, S::Error> {
match ts {
Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
None => serializer.serialize_none(),
}
}
#[derive(Serialize, Deserialize, Debug, Default)]
pub struct TenantScanRemoteStorageResponse {
pub shards: Vec<TenantScanRemoteStorageShard>,
fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
match s {
None => Ok(None),
Some(s) => humantime::parse_rfc3339(&s)
.map_err(serde::de::Error::custom)
.map(Some),
}
}
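// A hedged sketch (not part of this change) of how these helpers attach to a field and what
// the wire format looks like; the `ExampleWithMtime` struct and the sample value are illustrative.
#[derive(serde::Serialize, serde::Deserialize)]
struct ExampleWithMtime {
    #[serde(
        serialize_with = "opt_ser_rfc3339_millis",
        deserialize_with = "opt_deser_rfc3339_millis"
    )]
    mtime: Option<SystemTime>,
}
// Serializes as e.g. {"mtime":"2024-02-21T10:02:59.000Z"}; a `None` value round-trips as null.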
pub mod virtual_file {
@@ -860,72 +865,39 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
}
}
// In the V2 protocol version, a GetPage request contains two LSN values:
//
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
// "get the latest version present". It's used by the primary server, which knows that no one else
// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
//
// not_modified_since: Hint to the pageserver that the client knows that the page has not been
// modified between 'not_modified_since' and the request LSN. It's always correct to set
// 'not_modified_since' equal to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
// request without waiting for 'request_lsn' to arrive.
//
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
// standby to request a page at a particular non-latest LSN, and also include the
// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
// difference in the responses between V1 and V2.
//
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
// maps the old format requests to the new format.
//
#[derive(Clone, Copy)]
pub enum PagestreamProtocolVersion {
V1,
V2,
}
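// A hedged sketch (not part of this change) of how a client might fill in the two LSNs of the
// V2 protocol; the helper name and the replay-LSN plumbing are assumptions, only the request
// struct and its fields come from the code above.
fn example_getpage_request(
    rel: RelTag,
    blkno: u32,
    standby_replay_lsn: Option<Lsn>,
    last_modified_lsn: Lsn,
) -> PagestreamGetPageRequest {
    match standby_replay_lsn {
        // Primary: no one else writes WAL, so ask for the latest version, but still pass a
        // real 'not_modified_since' hint so the pageserver can avoid waiting for WAL.
        None => PagestreamGetPageRequest {
            request_lsn: Lsn::MAX,
            not_modified_since: last_modified_lsn,
            rel,
            blkno,
        },
        // Standby: pin the request to the current replay LSN.
        Some(replay_lsn) => PagestreamGetPageRequest {
            request_lsn: replay_lsn,
            not_modified_since: last_modified_lsn,
            rel,
            blkno,
        },
    }
}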
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
pub blkno: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub dbnode: u32,
}
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetSlruSegmentRequest {
pub request_lsn: Lsn,
pub not_modified_since: Lsn,
pub latest: bool,
pub lsn: Lsn,
pub kind: u8,
pub segno: u32,
}
@@ -972,16 +944,14 @@ pub struct TenantHistorySize {
}
impl PagestreamFeMessage {
/// Serialize a compute -> pageserver message. This is currently only used in testing
/// tools. Always uses protocol version 2.
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -990,8 +960,8 @@ impl PagestreamFeMessage {
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1000,8 +970,8 @@ impl PagestreamFeMessage {
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
@@ -1011,15 +981,15 @@ impl PagestreamFeMessage {
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}
Self::GetSlruSegment(req) => {
bytes.put_u8(4);
bytes.put_u64(req.request_lsn.0);
bytes.put_u64(req.not_modified_since.0);
bytes.put_u8(u8::from(req.latest));
bytes.put_u64(req.lsn.0);
bytes.put_u8(req.kind);
bytes.put_u32(req.segno);
}
@@ -1028,40 +998,18 @@ impl PagestreamFeMessage {
bytes.into()
}
pub fn parse<R: std::io::Read>(
body: &mut R,
protocol_version: PagestreamProtocolVersion,
) -> anyhow::Result<PagestreamFeMessage> {
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.read_u8()?;
let (request_lsn, not_modified_since) = match protocol_version {
PagestreamProtocolVersion::V2 => (
Lsn::from(body.read_u64::<BigEndian>()?),
Lsn::from(body.read_u64::<BigEndian>()?),
),
PagestreamProtocolVersion::V1 => {
// In the old protocol, each message starts with a boolean 'latest' flag,
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
// 'not_modified_since', used in the new protocol version.
let latest = body.read_u8()? != 0;
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
if latest {
(Lsn::MAX, request_lsn) // get latest version
} else {
(request_lsn, request_lsn) // get version at specified LSN
}
}
};
// The rest of the messages are the same between V1 and V2
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1070,8 +1018,8 @@ impl PagestreamFeMessage {
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1080,8 +1028,8 @@ impl PagestreamFeMessage {
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
@@ -1091,14 +1039,14 @@ impl PagestreamFeMessage {
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?,
})),
4 => Ok(PagestreamFeMessage::GetSlruSegment(
PagestreamGetSlruSegmentRequest {
request_lsn,
not_modified_since,
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
kind: body.read_u8()?,
segno: body.read_u32::<BigEndian>()?,
},
@@ -1226,8 +1174,8 @@ mod tests {
// Test serialization/deserialization of PagestreamFeMessage
let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1236,8 +1184,8 @@ mod tests {
},
}),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(4),
latest: false,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1246,8 +1194,8 @@ mod tests {
},
}),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
@@ -1257,16 +1205,14 @@ mod tests {
blkno: 7,
}),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
request_lsn: Lsn(4),
not_modified_since: Lsn(3),
latest: true,
lsn: Lsn(4),
dbnode: 7,
}),
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed =
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
.unwrap();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
assert!(msg == reconstructed);
}
}

View File

@@ -1,4 +1,4 @@
use utils::serde_system_time::SystemTime;
use std::time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
@@ -21,9 +21,28 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(
serialize_with = "ser_rfc3339_millis",
deserialize_with = "deser_rfc3339_millis"
)]
pub captured_at: SystemTime,
}
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
///
/// Instead of a newtype, use this because a newtype would require handling deserializing values
@@ -50,9 +69,7 @@ mod tests {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime(
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
};
let s = serde_json::to_string(&doc).unwrap();

View File

@@ -5,93 +5,15 @@ use crate::{
models::ShardParameters,
};
use hex::FromHex;
use postgres_ffi::relfile_utils::INIT_FORKNUM;
use serde::{Deserialize, Serialize};
use utils::id::TenantId;
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
///
/// This module contains a variety of types used to represent the concept of sharding
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
/// we provide a summary here.
///
/// Types used to describe shards:
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
/// a shard suffix.
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
/// tenant, such as layer files.
/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
/// four hex digits. An unsharded tenant is `0000`.
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
///
/// Types used to describe the parameters for data distribution in a sharded tenant:
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
/// multiple shards. Its value is given in 8kiB pages.
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
/// always zero: this is provided for future upgrades that might introduce different
/// data distribution schemes.
///
/// Examples:
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
/// - In a tenant with 4 shards, each shard has ShardCount(4) and ShardNumber(i) where i is in 0..=3,
/// and their slugs are 0004, 0104, 0204, and 0304.
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
/// Formatting helper, for generating the `shard_id` label in traces.
struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
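// A hedged sketch (not part of this change) of the slug encoding described above. The
// `"..".parse()` for TenantId and the tuple-struct constructor for ShardCount are assumed to be
// usable here; outside this module the count field is private.
fn example_slug_encoding() {
    let tenant_id: TenantId = "072f1291a5310026820b2fe4b2968934".parse().unwrap();
    let sharded = TenantShardId {
        tenant_id,
        shard_number: ShardNumber(1),
        shard_count: ShardCount(2),
    };
    // The second shard of a two-shard tenant carries the four-hex-digit slug...
    assert_eq!(sharded.to_string(), "072f1291a5310026820b2fe4b2968934-0102");
    // ...while an unsharded tenant renders exactly like a bare TenantId.
    assert_eq!(
        TenantShardId::unsharded(tenant_id).to_string(),
        "072f1291a5310026820b2fe4b2968934"
    );
}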
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
@@ -116,7 +38,6 @@ impl ShardCount {
self.0
}
///
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
@@ -132,6 +53,33 @@ impl ShardNumber {
pub const MAX: Self = Self(u8::MAX);
}
/// TenantShardId identifies the units of work for the Pageserver.
///
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
///
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// Historically, tenants could not have multiple shards, and were identified
/// by TenantId. To support this, TenantShardId has a special legacy
/// mode where `shard_count` is equal to zero: this represents a single-sharded
/// tenant which should be written as a TenantId with no suffix.
///
/// The human-readable encoding of TenantShardId, such as used in API URLs,
/// is both forward and backward compatible: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
///
/// Note that the binary encoding is _not_ backward compatible, because
/// at the time sharding is introduced, there are no existing binary structures
/// containing TenantId that we need to handle.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self {
Self {
@@ -163,13 +111,10 @@ impl TenantShardId {
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool {
pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
}
@@ -205,6 +150,9 @@ impl TenantShardId {
}
}
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
@@ -274,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
}
}
/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
@@ -288,9 +246,6 @@ impl ShardIndex {
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
@@ -358,8 +313,6 @@ impl Serialize for TenantShardId {
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0;
@@ -437,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// The ShardIdentity contains the information needed for one member of map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError {
#[error("Invalid shard count")]
@@ -476,9 +439,6 @@ impl ShardIdentity {
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0)
}
@@ -527,8 +487,6 @@ impl ShardIdentity {
}
/// Return true if the key should be ingested by this shard
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool {
assert!(!self.is_broken());
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -538,28 +496,8 @@ impl ShardIdentity {
}
}
/// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
///
/// When we fail to read a forknum block, this function tells us whether we may ignore the error
/// as a symptom of that issue.
pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
return false;
}
let mut hash = murmurhash32(key.field4);
hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
// The key may be affected by issue #7454: it is an initfork and it would not
// have mapped to shard 0 until we fixed that issue.
mapped_shard != ShardNumber(0)
}
/// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split.
///
/// Shards _may_ drop keys which return false here, but are not obliged to.
/// data store, e.g. during compaction after a split
pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -585,7 +523,7 @@ impl ShardIdentity {
/// Convenience for checking if this identity is the 0th shard in a tenant,
/// for special cases on shard 0 such as ingesting relation sizes.
pub fn is_shard_zero(&self) -> bool {
pub fn is_zero(&self) -> bool {
self.number == ShardNumber(0)
}
}
@@ -668,13 +606,7 @@ fn key_is_shard0(key: &Key) -> bool {
// relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations.
//
// The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
// type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
// because they must be included in basebackups.
let is_initfork = key.field5 == INIT_FORKNUM;
!is_rel_block_key(key) || is_initfork
!is_rel_block_key(key)
}
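// A hedged sketch (not part of this change) of the stripe-to-shard mapping applied to keys that
// are not pinned to shard 0; it mirrors the hash combination used in `is_key_buggy_forknum`
// above, but the free-function form and name are illustrative, not the crate's API.
fn example_stripe_to_shard(key: &Key, count: ShardCount, stripe_size: ShardStripeSize) -> ShardNumber {
    let mut hash = murmurhash32(key.field4); // relnode
    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0)); // stripe number of the block
    ShardNumber((hash % count.0 as u32) as u8)
}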
/// Provide the same result as the function in postgres `hashfn.h` with the same name

View File

@@ -118,9 +118,7 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
// Likewise for these, although the assumption that these don't change is a little more iffy.
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
pub use v14::bindings::{PageHeaderData, XLogRecord};
pub use v14::xlog_utils::{
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
pub use v14::bindings::{CheckPoint, ControlFileData};

View File

@@ -4,9 +4,7 @@ use log::*;
use postgres::types::PgLsn;
use postgres::Client;
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use postgres_ffi::{
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
@@ -264,21 +262,11 @@ fn craft_internal<C: postgres::GenericClient>(
intermediate_lsns.insert(0, initial_lsn);
}
    // Some records may not be flushed, e.g. non-transactional logical messages. Flush now.
    // Some records may not be flushed, e.g. non-transactional logical messages.
//
// If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
// returns the position just after the page header on the next page. That's where the next
// record will be inserted. But the page header hasn't actually been written to the WAL
// yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
// error. Because of that, if the insert location is just after a page header, back off to
// previous page boundary.
let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
} else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
}
client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
// because pg_current_wal_insert_lsn skips page headers.
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
Ok(intermediate_lsns)
}
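// A worked example (not part of this change) of the back-off above, with assumed header sizes:
// XLOG_BLCKSZ = 8192, short page header = 24 bytes, long page header = 40 bytes.
fn example_flush_backoff() {
    // 0x0200_0018 sits 24 bytes past an 8 KiB page boundary, i.e. right after a short page
    // header, so the flush position backs off to the boundary at 0x0200_0000.
    let mut lsn: u64 = 0x0200_0018;
    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
    }
    assert_eq!(lsn, 0x0200_0000);
}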
@@ -332,49 +320,38 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
client.execute("CREATE table t(x int)", &[])?;
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We
// will use carefully-sized logical messages to advance WAL insert location such
// that there is just enough space on the page for the XLOG_SWITCH record.
loop {
// We start with measuring how much WAL it takes for one logical message,
// considering all alignments and headers.
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
        // We will use a logical message as the padding. We start by detecting how much WAL
// it takes for one logical message, considering all alignments and headers.
let base_wal_advance = {
let before_lsn = client.pg_current_wal_insert_lsn()?;
            // A small non-empty message bigger than a few bytes is more likely than an empty
// message to have the same format as the big padding message.
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
&[],
)?;
let after_lsn = client.pg_current_wal_insert_lsn()?;
// Did the record cross a page boundary? If it did, start over. Crossing a
// page boundary adds to the apparent size of the record because of the page
// header, which throws off the calculation.
if u64::from(before_lsn) / XLOG_BLCKSZ as u64
!= u64::from(after_lsn) / XLOG_BLCKSZ as u64
{
continue;
}
// base_size is the size of a logical message without the payload
let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
// Is there enough space on the page for another logical message and an
// XLOG_SWITCH? If not, start over.
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
continue;
}
// We will write another logical message, such that after the logical message
// record, there will be space for exactly one XLOG_SWITCH. How large should
// the logical message's payload be? An XLOG_SWITCH record has no data => its
// size is exactly XLOG_SIZE_OF_XLOG_RECORD.
let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
break;
// The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
(u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
+ XLOG_SIZE_OF_XLOG_RECORD
};
let mut remaining_lsn =
XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
if remaining_lsn < base_wal_advance {
remaining_lsn += XLOG_BLCKSZ;
}
let repeats = 10 + remaining_lsn - base_wal_advance;
info!(
"current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
client.pg_current_wal_insert_lsn()?,
remaining_lsn,
base_wal_advance,
repeats
);
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,

View File

@@ -21,13 +21,11 @@ use std::{
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
pin::Pin,
str::FromStr,
sync::Arc,
time::{Duration, SystemTime},
};
use anyhow::{bail, Context};
use aws_sdk_s3::types::StorageClass;
use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes;
@@ -136,11 +134,6 @@ impl RemotePath {
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
self.0.strip_prefix(&p.0)
}
pub fn add_trailing_slash(&self) -> Self {
        // Unwrap safety: inputs are guaranteed to be valid UTF-8
Self(format!("{}/", self.0).try_into().unwrap())
}
}
/// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -164,21 +157,47 @@ pub struct Listing {
/// providing basic CRUD operations for storage files.
#[allow(async_fn_in_trait)]
pub trait RemoteStorage: Send + Sync + 'static {
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
///
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
    /// to the absolute root of the bucket.
///
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
    /// returned in `keys`.
///
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
    /// unlimited-size buckets, as the full list of objects is allocated into a monolithic data structure.
/// Lists all top level subdirectories for a given prefix
    /// Note: here we assume that if a prefix is passed, it was obtained via remote_object_id,
    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS),
    /// so this method doesn't need to.
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::WithDelimiter, None, cancel)
.await?
.prefixes;
Ok(result)
}
/// Lists all files in directory "recursively"
/// (not really recursively, because AWS has a flat namespace)
    /// Note: This is subtly different from list_prefixes,
/// because it is for listing files instead of listing
/// names sharing common prefixes.
/// For example,
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
/// whereas,
/// list_prefixes("foo/bar/") = ["cat", "dog"]
/// See `test_real_s3.rs` for more details.
///
/// max_keys limits max number of keys returned; None means unlimited.
async fn list_files(
&self,
prefix: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
let result = self
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
.await?
.keys;
Ok(result)
}
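    // A hedged, free-standing usage sketch (not part of this trait or this change) of the two
    // listing modes; the prefix path, `storage` handle, and `cancel` token are placeholders.
    async fn example_listing(storage: &impl RemoteStorage, cancel: &CancellationToken) -> Result<(), DownloadError> {
        let prefix = RemotePath::from_string("tenants/some-tenant/timelines").unwrap();
        // With a delimiter: top-level "directories" land in `prefixes`, direct children in `keys`.
        let with_delim = storage
            .list(Some(&prefix), ListingMode::WithDelimiter, None, cancel)
            .await?;
        let _timeline_dirs = with_delim.prefixes;
        // Without a delimiter: everything under the prefix comes back flat, capped at 1000 keys here.
        let flat = storage
            .list(Some(&prefix), ListingMode::NoDelimiter, NonZeroU32::new(1000), cancel)
            .await?;
        let _all_objects = flat.keys;
        Ok(())
    }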
async fn list(
&self,
prefix: Option<&RemotePath>,
@@ -317,6 +336,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
// A function for listing all the files in a "directory"
// Example:
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
//
// max_keys limits max number of keys returned; None means unlimited.
pub async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
}
}
    // Lists the common *prefixes*, if any, of the given files.
// Example:
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
pub async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
match self {
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
}
}
/// See [`RemoteStorage::upload`]
pub async fn upload(
&self,
@@ -511,16 +565,6 @@ impl GenericRemoteStorage {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>);
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
fn from(arr: [(&str, &str); N]) -> Self {
let map: HashMap<String, String> = arr
.iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect();
Self(map)
}
}
/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {
@@ -565,7 +609,6 @@ pub struct S3Config {
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
pub concurrency_limit: NonZeroUsize,
pub max_keys_per_list_response: Option<i32>,
pub upload_storage_class: Option<StorageClass>,
}
impl Debug for S3Config {
@@ -694,18 +737,6 @@ impl RemoteStorageConfig {
endpoint,
concurrency_limit,
max_keys_per_list_response,
upload_storage_class: toml
.get("upload_storage_class")
.map(|prefix_in_bucket| -> anyhow::Result<_> {
let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
let storage_class = StorageClass::from_str(&s).expect("infallible");
#[allow(deprecated)]
if matches!(storage_class, StorageClass::Unknown(_)) {
bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
}
Ok(storage_class)
})
.transpose()?,
})
}
(_, _, _, Some(_), None) => {

View File

@@ -5,9 +5,11 @@
//! volume is mounted to the local FS.
use std::{
collections::HashSet,
borrow::Cow,
future::Future,
io::ErrorKind,
num::NonZeroU32,
pin::Pin,
time::{Duration, SystemTime, UNIX_EPOCH},
};
@@ -20,11 +22,11 @@ use tokio::{
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tokio_util::{io::ReaderStream, sync::CancellationToken};
use utils::crashsafe::path_with_suffix_extension;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
use crate::{
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
use super::{RemoteStorage, StorageMetadata};
@@ -91,47 +93,7 @@ impl LocalFs {
#[cfg(test)]
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
use std::{future::Future, pin::Pin};
fn get_all_files<'a, P>(
directory_path: P,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
if file_type.is_symlink() {
tracing::debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
paths.extend(get_all_files(&entry_path).await?.into_iter())
} else {
paths.push(entry_path);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
}
} else {
Ok(Vec::new())
}
})
}
Ok(get_all_files(&self.storage_root)
Ok(get_all_files(&self.storage_root, true)
.await?
.into_iter()
.map(|path| {
@@ -158,14 +120,6 @@ impl LocalFs {
// S3 object list prefixes can be arbitrary strings, but when reading
// the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone();
// If there's no trailing slash, we have to start looking from one above: even if
// `initial_dir` is a directory, we should still list any prefixes in the parent
// that start with the same string.
if !full_path.to_string().ends_with('/') {
initial_dir.pop();
}
loop {
// Did we make it to the root?
if initial_dir.parent().is_none() {
@@ -341,66 +295,61 @@ impl RemoteStorage for LocalFs {
let op = async {
let mut result = Listing::default();
// Filter out directories: in S3 directories don't exist, only the keys within them do.
let keys = self
.list_recursive(prefix)
if let ListingMode::NoDelimiter = mode {
let keys = self
.list_recursive(prefix)
.await
.map_err(DownloadError::Other)?;
result.keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
return Ok(result);
}
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
};
let prefixes_to_filter = get_all_files(path.as_ref(), false)
.await
.map_err(DownloadError::Other)?;
let keys = keys
.into_iter()
.filter(|k| {
let path = k.with_base(&self.storage_root);
!path.is_dir()
})
.collect();
if let ListingMode::NoDelimiter = mode {
result.keys = keys;
} else {
let mut prefixes = HashSet::new();
for key in keys {
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
let relative_key = if let Some(prefix) = prefix {
let mut prefix = prefix.clone();
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we
// end up with full file/dir names.
let prefix_full_local_path = prefix.with_base(&self.storage_root);
let has_slash = prefix.0.to_string().ends_with('/');
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
prefix
} else {
prefix.0.pop();
prefix
};
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
} else {
key
};
let relative_key = format!("{}", relative_key);
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
let first_part = relative_key
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
.next()
.unwrap()
.to_owned();
prefixes.insert(first_part);
} else {
result
.keys
.push(RemotePath::from_string(&relative_key).unwrap());
}
// filter out empty directories to mirror s3 behavior.
for prefix in prefixes_to_filter {
if prefix.is_dir()
&& is_directory_empty(&prefix)
.await
.map_err(DownloadError::Other)?
{
continue;
}
let stripped = prefix
.strip_prefix(&self.storage_root)
.context("Failed to strip prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
);
if prefix.is_dir() {
result.prefixes.push(stripped);
} else {
result.keys.push(stripped);
}
result.prefixes = prefixes
.into_iter()
.map(|s| RemotePath::from_string(&s).unwrap())
.collect();
}
if let Some(max_keys) = max_keys {
result.keys.truncate(max_keys.get() as usize);
}
Ok(result)
};
@@ -611,6 +560,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
path_with_suffix_extension(original_path, "metadata")
}
fn get_all_files<'a, P>(
directory_path: P,
recursive: bool,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Utf8Path> + Send + Sync + 'a,
{
Box::pin(async move {
let directory_path = directory_path.as_ref();
if directory_path.exists() {
if directory_path.is_dir() {
let mut paths = Vec::new();
let mut dir_contents = fs::read_dir(directory_path).await?;
while let Some(dir_entry) = dir_contents.next_entry().await? {
let file_type = dir_entry.file_type().await?;
let entry_path =
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
anyhow::Error::msg(format!(
"non-Unicode path: {}",
pb.to_string_lossy()
))
})?;
if file_type.is_symlink() {
debug!("{entry_path:?} is a symlink, skipping")
} else if file_type.is_dir() {
if recursive {
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
} else {
paths.push(entry_path)
}
} else {
paths.push(entry_path);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
}
} else {
Ok(Vec::new())
}
})
}
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
let target_dir = match target_file_path.parent() {
Some(parent_dir) => parent_dir,
@@ -930,18 +923,13 @@ mod fs_tests {
// No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?;
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
let child_sibling =
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
let listing = storage
.list(None, ListingMode::NoDelimiter, None, &cancel)
.await?;
assert!(listing.prefixes.is_empty());
assert_eq!(
listing.keys.into_iter().collect::<HashSet<_>>(),
HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
);
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
// Delimiter: should only go one deep
let listing = storage
@@ -954,25 +942,7 @@ mod fs_tests {
);
assert!(listing.keys.is_empty());
// Delimiter & prefix with a trailing slash
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(
listing.keys,
[RemotePath::from_string("uncle").unwrap()].to_vec()
);
assert_eq!(
listing.prefixes,
[RemotePath::from_string("parent").unwrap()].to_vec()
);
// Delimiter and prefix without a trailing slash
// Delimiter & prefix
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -981,66 +951,12 @@ mod fs_tests {
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
assert_eq!(
listing.prefixes,
[RemotePath::from_string("grandparent").unwrap()].to_vec()
);
// Delimiter and prefix that's partway through a path component
let listing = storage
.list(
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
assert_eq!(
listing.prefixes,
[RemotePath::from_string("grandparent").unwrap()].to_vec()
);
Ok(())
}
#[tokio::test]
async fn list_part_component() -> anyhow::Result<()> {
// No delimiter: should recursively list everything
let (storage, cancel) = create_storage()?;
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
// a freeform prefix.
let _child_a =
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
let _child_b =
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
// Delimiter and prefix that's partway through a path component
let listing = storage
.list(
Some(
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?;
assert_eq!(listing.keys, [].to_vec());
let mut found_prefixes = listing.prefixes.clone();
found_prefixes.sort();
assert_eq!(
found_prefixes,
[
RemotePath::from_string("tenant").unwrap(),
RemotePath::from_string("tenant-01").unwrap(),
]
.to_vec()
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
.to_vec()
);
assert_eq!(listing.keys, [uncle.clone()].to_vec());
Ok(())
}

View File

@@ -30,7 +30,7 @@ use aws_sdk_s3::{
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
@@ -62,7 +62,6 @@ pub struct S3Bucket {
bucket_name: String,
prefix_in_bucket: Option<String>,
max_keys_per_list_response: Option<i32>,
upload_storage_class: Option<StorageClass>,
concurrency_limiter: ConcurrencyLimiter,
// Per-request timeout. Accessible for tests.
pub timeout: Duration,
@@ -155,7 +154,6 @@ impl S3Bucket {
max_keys_per_list_response: aws_config.max_keys_per_list_response,
prefix_in_bucket,
concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
upload_storage_class: aws_config.upload_storage_class.clone(),
timeout,
})
}
@@ -180,7 +178,10 @@ impl S3Bucket {
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
let path_string = path.get_path().as_str();
let path_string = path
.get_path()
.as_str()
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
match &self.prefix_in_bucket {
Some(prefix) => prefix.clone() + "/" + path_string,
None => path_string.to_string(),
@@ -470,11 +471,16 @@ impl RemoteStorage for S3Bucket {
        // Use the passed prefix, or fall back to the prefix_in_bucket value if it is not set.
let list_prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
.or_else(|| {
self.prefix_in_bucket.clone().map(|mut s| {
s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
s
})
.or_else(|| self.prefix_in_bucket.clone())
.map(|mut p| {
                // required to end with a separator,
                // otherwise the request will return only the entry of the prefix itself
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});
let _permit = self.permit(kind, cancel).await?;
@@ -543,15 +549,11 @@ impl RemoteStorage for S3Bucket {
}
}
// S3 gives us prefixes like "foo/", we return them like "foo"
result.prefixes.extend(prefixes.iter().filter_map(|o| {
Some(
self.s3_object_to_relative_path(
o.prefix()?
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
),
)
}));
result.prefixes.extend(
prefixes
.iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
);
continuation_token = match response.next_continuation_token {
Some(new_token) => Some(new_token),
@@ -584,7 +586,6 @@ impl RemoteStorage for S3Bucket {
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.set_metadata(metadata.map(|m| m.0))
.set_storage_class(self.upload_storage_class.clone())
.content_length(from_size_bytes.try_into()?)
.body(bytes_stream)
.send();
@@ -636,7 +637,6 @@ impl RemoteStorage for S3Bucket {
.copy_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.set_storage_class(self.upload_storage_class.clone())
.copy_source(copy_source)
.send();
@@ -894,7 +894,6 @@ impl RemoteStorage for S3Bucket {
.copy_object()
.bucket(self.bucket_name.clone())
.key(key)
.set_storage_class(self.upload_storage_class.clone())
.copy_source(&source_id)
.send();
@@ -1051,22 +1050,22 @@ mod tests {
Some("/test/prefix/"),
];
let expected_outputs = [
vec!["", "some/path", "some/path/"],
vec!["/", "/some/path", "/some/path/"],
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
vec![
"test/prefix/",
"test/prefix/some/path",
"test/prefix/some/path/",
"test/prefix/some/path",
],
];
@@ -1078,7 +1077,6 @@ mod tests {
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response: Some(5),
upload_storage_class: None,
};
let storage =
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");

View File

@@ -107,6 +107,27 @@ impl UnreliableWrapper {
type VoidStorage = crate::LocalFs;
impl RemoteStorage for UnreliableWrapper {
async fn list_prefixes(
&self,
prefix: Option<&RemotePath>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_prefixes(prefix, cancel).await
}
async fn list_files(
&self,
folder: Option<&RemotePath>,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Vec<RemotePath>, DownloadError> {
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
.map_err(DownloadError::Other)?;
self.inner.list_files(folder, max_keys, cancel).await
}
async fn list(
&self,
prefix: Option<&RemotePath>,

View File

@@ -1,6 +1,5 @@
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::ListingMode;
use remote_storage::RemotePath;
use std::sync::Arc;
use std::{collections::HashSet, num::NonZeroU32};
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
.context("common_prefix construction")?;
let root_remote_prefixes = test_client
.list(None, ListingMode::WithDelimiter, None, &cancel)
.await?
.prefixes
.list_prefixes(None, &cancel)
.await
.context("client list root prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
);
let nested_remote_prefixes = test_client
.list(
Some(&base_prefix.add_trailing_slash()),
ListingMode::WithDelimiter,
None,
&cancel,
)
.await?
.prefixes
.list_prefixes(Some(&base_prefix), &cancel)
.await
.context("client list nested prefixes failure")?
.into_iter()
.collect::<HashSet<_>>();
let remote_only_prefixes = nested_remote_prefixes
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
///
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
/// Then performs the following queries:
/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
#[tokio::test]
async fn list_no_delimiter_works(
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
) -> anyhow::Result<()> {
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
let ctx = match ctx {
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
let base_prefix =
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
let root_files = test_client
.list(None, ListingMode::NoDelimiter, None, &cancel)
.list_files(None, None, &cancel)
.await
.context("client list root files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>();
assert_eq!(
root_files,
ctx.remote_blobs.clone(),
"remote storage list on root mismatches with the uploads."
"remote storage list_files on root mismatches with the uploads."
);
// Test that max_keys limit works. In total there are about 21 files (see
// upload_simple_remote_data call in test_real_s3.rs).
let limited_root_files = test_client
.list(
None,
ListingMode::NoDelimiter,
Some(NonZeroU32::new(2).unwrap()),
&cancel,
)
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
.await
.context("client list root files failure")?;
assert_eq!(limited_root_files.keys.len(), 2);
assert_eq!(limited_root_files.len(), 2);
let nested_remote_files = test_client
.list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
.list_files(Some(&base_prefix), None, &cancel)
.await
.context("client list nested files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>();
let trim_remote_blobs: HashSet<_> = ctx
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
.collect();
assert_eq!(
nested_remote_files, trim_remote_blobs,
"remote storage list on subdirrectory mismatches with the uploads."
"remote storage list_files on subdirrectory mismatches with the uploads."
);
Ok(())
}
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
let prefixes = ctx
.client
.list(None, ListingMode::WithDelimiter, None, &cancel)
.await?
.prefixes;
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
assert_eq!(prefixes.len(), 1);

View File

@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -132,6 +134,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
}
}
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not identical: list_prefixes returns the top-level "directory" names under a prefix,
// whereas list_files returns the full keys of the files themselves.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(AzureWithSimpleTestBlobs),
Disabled,
@@ -142,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();

View File

@@ -12,8 +12,8 @@ use anyhow::Context;
use camino::Utf8Path;
use futures_util::StreamExt;
use remote_storage::{
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
RemoteStorageKind, S3Config,
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
S3Config,
};
use test_context::test_context;
use test_context::AsyncTestContext;
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
client: &Arc<GenericRemoteStorage>,
cancel: &CancellationToken,
) -> anyhow::Result<HashSet<RemotePath>> {
Ok(
retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
.await
.context("list root files failure")?
.keys
.into_iter()
.collect::<HashSet<_>>(),
)
Ok(retry(|| client.list_files(None, None, cancel))
.await
.context("list root files failure")?
.into_iter()
.collect::<HashSet<_>>())
}
let cancel = CancellationToken::new();
@@ -222,6 +219,7 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -250,6 +248,7 @@ struct S3WithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -297,6 +296,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
}
}
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
// However, they are not identical: list_prefixes returns the top-level "directory" names under a prefix,
// whereas list_files returns the full keys of the files themselves.
// See `RemoteStorage::list_files` documentation for more details
enum MaybeEnabledStorageWithSimpleTestBlobs {
Enabled(S3WithSimpleTestBlobs),
Disabled,
@@ -307,6 +310,7 @@ struct S3WithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -380,7 +384,6 @@ fn create_s3_client(
endpoint: None,
concurrency_limit: NonZeroUsize::new(100).unwrap(),
max_keys_per_list_response,
upload_storage_class: None,
}),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};

View File

@@ -22,7 +22,6 @@ camino.workspace = true
chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}

View File

@@ -1,21 +0,0 @@
//! Wrapper around `std::env::var` for parsing environment variables.
use std::{fmt::Display, str::FromStr};
pub fn var<V, E>(varname: &str) -> Option<V>
where
V: FromStr<Err = E>,
E: Display,
{
match std::env::var(varname) {
Ok(s) => Some(
s.parse()
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
.unwrap(),
),
Err(std::env::VarError::NotPresent) => None,
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {varname} is not unicode")
}
}
}

View File

@@ -34,8 +34,6 @@ pub enum Generation {
/// scenarios where pageservers might otherwise issue conflicting writes to
/// remote storage
impl Generation {
pub const MAX: Self = Self::Valid(u32::MAX);
/// Create a new Generation that represents a legacy key format with
/// no generation suffix
pub fn none() -> Self {

View File

@@ -63,7 +63,6 @@ pub mod measured_stream;
pub mod serde_percent;
pub mod serde_regex;
pub mod serde_system_time;
pub mod pageserver_feedback;
@@ -90,10 +89,6 @@ pub mod yielding_loop;
pub mod zstd;
pub mod env;
pub mod poison;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -1,121 +0,0 @@
//! Protect a piece of state from reuse after it is left in an inconsistent state.
//!
//! # Example
//!
//! ```
//! # tokio_test::block_on(async {
//! use utils::poison::Poison;
//! use std::time::Duration;
//!
//! struct State {
//! clean: bool,
//! }
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
//!
//! let mut mutex_guard = state.lock().await;
//! let mut poison_guard = mutex_guard.check_and_arm()?;
//! let state = poison_guard.data_mut();
//! state.clean = false;
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
//! tokio::time::sleep(Duration::from_secs(10)).await;
//! state.clean = true;
//! poison_guard.disarm();
//! # Ok::<(), utils::poison::Error>(())
//! # });
//! ```
use tracing::warn;
pub struct Poison<T> {
what: &'static str,
state: State,
data: T,
}
#[derive(Clone, Copy)]
enum State {
Clean,
Armed,
Poisoned { at: chrono::DateTime<chrono::Utc> },
}
impl<T> Poison<T> {
    /// We log `what` at `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
pub fn new(what: &'static str, data: T) -> Self {
Self {
what,
state: State::Clean,
data,
}
}
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
match self.state {
State::Clean => {
self.state = State::Armed;
Ok(Guard(self))
}
State::Armed => unreachable!("transient state"),
State::Poisoned { at } => Err(Error::Poisoned {
what: self.what,
at,
}),
}
}
}
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
/// Once modifications are done, use [`Self::disarm`].
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
pub struct Guard<'a, T>(&'a mut Poison<T>);
impl<'a, T> Guard<'a, T> {
pub fn data(&self) -> &T {
&self.0.data
}
pub fn data_mut(&mut self) -> &mut T {
&mut self.0.data
}
pub fn disarm(self) {
match self.0.state {
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
State::Armed => {
self.0.state = State::Clean;
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
impl<'a, T> Drop for Guard<'a, T> {
fn drop(&mut self) {
match self.0.state {
State::Clean => {
// set by disarm()
}
State::Armed => {
// still armed => poison it
let at = chrono::Utc::now();
self.0.state = State::Poisoned { at };
warn!(at=?at, "poisoning {}", self.0.what);
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("poisoned at {at}: {what}")]
Poisoned {
what: &'static str,
at: chrono::DateTime<chrono::Utc>,
},
}
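
The module doc above only shows the disarm path. The following sketch, written against the same API as shown in this hunk, illustrates what a caller sees once a `Guard` is dropped while still armed: the next `check_and_arm` returns `Error::Poisoned`. The `State` struct is the same illustrative type as in the doc example.

```rust
// Sketch of the poisoned path, assuming the `Poison` API shown above.
use utils::poison::{Error, Poison};

struct State {
    clean: bool,
}

fn main() {
    let mut poison = Poison::new("mystate", State { clean: true });

    {
        // Arm the guard, mutate the state, then drop the guard without disarming.
        let mut guard = poison.check_and_arm().unwrap();
        guard.data_mut().clean = false;
        // `guard` is dropped here while still armed, which poisons the state
        // (and emits a `tracing` warning).
    }

    // Subsequent attempts to use the state fail with Error::Poisoned.
    match poison.check_and_arm() {
        Err(Error::Poisoned { what, at }) => {
            eprintln!("refusing to reuse {what}, poisoned at {at}");
        }
        _ => unreachable!("the state was poisoned above"),
    }
}
```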

View File

@@ -182,18 +182,6 @@ where
}
}
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
let internal = self.internal.lock().unwrap();
let cnt = internal.current.cnt_value();
drop(internal);
if cnt >= num {
Ok(())
} else {
Err(cnt)
}
}
/// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived.
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {

View File

@@ -1,55 +0,0 @@
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct SystemTime(
#[serde(
deserialize_with = "deser_rfc3339_millis",
serialize_with = "ser_rfc3339_millis"
)]
pub std::time::SystemTime,
);
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
ts: &std::time::SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
#[cfg(test)]
mod tests {
use super::*;
/// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
fn to_millisecond_precision(time: SystemTime) -> SystemTime {
match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
Ok(duration) => {
let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
SystemTime(
std::time::SystemTime::UNIX_EPOCH
+ std::time::Duration::from_millis(total_millis),
)
}
Err(_) => time,
}
}
#[test]
fn test_serialize_deserialize() {
let input = SystemTime(std::time::SystemTime::now());
let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
let serialized = serde_json::to_string(&input).unwrap();
assert_eq!(expected_serialized, serialized);
let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
assert_eq!(to_millisecond_precision(input), deserialized);
}
}
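
As a usage note, a short sketch of embedding this wrapper in a serde struct. The `BranchInfo` struct and its fields are illustrative only, and it assumes `serde` (with derive) and `serde_json` are available, as in the test above.

```rust
// Sketch only: `BranchInfo` is a made-up type, not part of the codebase.
use utils::serde_system_time::SystemTime;

#[derive(serde::Serialize, serde::Deserialize)]
struct BranchInfo {
    name: String,
    // Serialized as an RFC3339 timestamp with millisecond precision,
    // e.g. "2024-04-02T16:12:12.000Z".
    created_at: SystemTime,
}

fn main() {
    let info = BranchInfo {
        name: "main".to_string(),
        created_at: SystemTime(std::time::SystemTime::now()),
    };
    let json = serde_json::to_string(&info).unwrap();
    println!("{json}");

    let roundtrip: BranchInfo = serde_json::from_str(&json).unwrap();
    assert_eq!(roundtrip.name, "main");
}
```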

View File

@@ -192,14 +192,6 @@ impl<T> OnceCell<T> {
}
}
/// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
/// initialized.
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let inner = self.inner.get_mut().unwrap();
inner.take_and_deinit()
}
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
pub fn initializer_count(&self) -> usize {
self.initializers.load(Ordering::Relaxed)
@@ -254,23 +246,15 @@ impl<'a, T> Guard<'a, T> {
    /// The permit will be on a semaphore that is part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
self.0
.take_and_deinit()
.expect("guard is not created unless value has been initialized")
}
}
impl<T> Inner<T> {
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let value = self.value.take()?;
let mut swapped = Inner::default();
let sem = swapped.init_semaphore.clone();
// acquire and forget right away, moving the control over to InitPermit
sem.try_acquire().expect("we just created this").forget();
let permit = InitPermit(sem);
std::mem::swap(self, &mut swapped);
Some((value, permit))
std::mem::swap(&mut *self.0, &mut swapped);
swapped
.value
.map(|v| (v, InitPermit(sem)))
.expect("guard is not created unless value has been initialized")
}
}
@@ -279,13 +263,6 @@ impl<T> Inner<T> {
/// On drop, this type will return the permit.
pub struct InitPermit(Arc<tokio::sync::Semaphore>);
impl std::fmt::Debug for InitPermit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let ptr = Arc::as_ptr(&self.0) as *const ();
f.debug_tuple("InitPermit").field(&ptr).finish()
}
}
impl Drop for InitPermit {
fn drop(&mut self) {
assert_eq!(
@@ -582,22 +559,4 @@ mod tests {
assert_eq!(*target.get().unwrap(), 11);
}
#[tokio::test]
async fn take_and_deinit_on_mut() {
use std::convert::Infallible;
let mut target = OnceCell::<u32>::default();
assert!(target.take_and_deinit().is_none());
target
.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
.await
.unwrap();
let again = target.take_and_deinit();
assert!(matches!(again, Some((42, _))), "{again:?}");
assert!(target.take_and_deinit().is_none());
}
}
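
To complement the `&mut self` test above, a sketch of the `Guard`-based `take_and_deinit` path, mirroring the calls used in these tests. The import path is an assumption (it is not visible in this hunk), and `tokio` with the `macros`/`rt` features is assumed for the async runtime.

```rust
// Sketch only: the module path below is assumed, not shown in this hunk.
use std::convert::Infallible;
use utils::sync::heavier_once_cell::OnceCell;

#[tokio::main]
async fn main() {
    let cell = OnceCell::<u32>::default();

    // Initialize the cell; the factory receives the init permit and must
    // return it together with the value.
    cell.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
        .await
        .unwrap();

    // Taking the value out through a `Guard` deinitializes the cell and
    // returns an `InitPermit`, so a later `get_or_init` can initialize again.
    let (value, _permit) = cell.get().unwrap().take_and_deinit();
    assert_eq!(value, 42);
    assert!(cell.get().is_none());
}
```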

View File

@@ -70,7 +70,6 @@ tokio-stream.workspace = true
tokio-util.workspace = true
toml_edit = { workspace = true, features = [ "serde" ] }
tracing.workspace = true
twox-hash.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true

View File

@@ -27,50 +27,30 @@
//!
//! # Reference Numbers
//!
//! 2024-04-15 on i3en.3xlarge
//! 2024-03-20 on i3en.3xlarge
//!
//! ```text
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
//! short/1 time: [26.483 µs 26.614 µs 26.767 µs]
//! short/2 time: [32.223 µs 32.465 µs 32.767 µs]
//! short/4 time: [47.203 µs 47.583 µs 47.984 µs]
//! short/8 time: [89.135 µs 89.612 µs 90.139 µs]
//! short/16 time: [190.12 µs 191.52 µs 192.88 µs]
//! short/32 time: [380.96 µs 382.63 µs 384.20 µs]
//! short/64 time: [736.86 µs 741.07 µs 745.03 µs]
//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms]
//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs]
//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs]
//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs]
//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs]
//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms]
//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms]
//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms]
//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms]
//! ```
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use pageserver::{
config::PageServerConf,
walrecord::NeonWalRecord,
walredo::{PostgresRedoManager, ProcessKind},
};
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
sync::Arc,
@@ -80,39 +60,33 @@ use tokio::{sync::Barrier, task::JoinSet};
use utils::{id::TenantId, lsn::Lsn};
fn bench(c: &mut Criterion) {
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-short"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("short");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::short_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| {
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
});
},
);
}
{
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
for nclients in nclients {
let mut group = c.benchmark_group("medium");
group.bench_with_input(
BenchmarkId::from_parameter(nclients),
&nclients,
|b, nclients| {
let redo_work = Arc::new(Request::medium_input());
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
},
);
}
}
}
@@ -120,16 +94,10 @@ criterion::criterion_group!(benches, bench);
criterion::criterion_main!(benches);
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
fn bench_impl(
process_kind: ProcessKind,
redo_work: Arc<Request>,
n_redos: u64,
nclients: u64,
) -> Duration {
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
conf.walredo_process_kind = process_kind;
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
let conf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
@@ -145,40 +113,25 @@ fn bench_impl(
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
let manager = Arc::new(manager);
// divide the amount of work equally among the clients.
let nredos_per_client = n_redos / nclients;
for _ in 0..nclients {
rt.block_on(async {
tasks.spawn(client(
Arc::clone(&manager),
Arc::clone(&start),
Arc::clone(&redo_work),
nredos_per_client,
// divide the amount of work equally among the clients
n_redos / nclients,
))
});
}
let elapsed = rt.block_on(async move {
let mut total_wallclock_time = Duration::ZERO;
rt.block_on(async move {
let mut total_wallclock_time = std::time::Duration::from_millis(0);
while let Some(res) = tasks.join_next().await {
total_wallclock_time += res.unwrap();
}
total_wallclock_time
});
// consistency check to ensure process kind setting worked
if nredos_per_client > 0 {
assert_eq!(
manager
.status()
.process
.map(|p| p.kind)
.expect("the benchmark work causes a walredo process to be spawned"),
std::borrow::Cow::Borrowed(process_kind.into())
);
}
elapsed
})
}
async fn client(

View File

@@ -128,12 +128,12 @@ impl Client {
pub async fn timeline_info(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
force_await_logical_size: ForceAwaitLogicalSize,
) -> Result<pageserver_api::models::TimelineInfo> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
self.mgmt_api_endpoint
);
@@ -151,11 +151,11 @@ impl Client {
pub async fn keyspace(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<pageserver_api::models::partitioning::Partitioning> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace",
"{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace",
self.mgmt_api_endpoint
);
self.get(&uri)
@@ -243,19 +243,6 @@ impl Client {
Ok(())
}
pub async fn tenant_scan_remote_storage(
&self,
tenant_id: TenantId,
) -> Result<TenantScanRemoteStorageResponse> {
let uri = format!(
"{}/v1/tenant/{tenant_id}/scan_remote_storage",
self.mgmt_api_endpoint
);
let response = self.request(Method::GET, &uri, ()).await?;
let body = response.json().await.map_err(Error::ReceiveBody)?;
Ok(body)
}
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
self.request(Method::PUT, &uri, req).await?;
@@ -292,7 +279,7 @@ impl Client {
lazy: bool,
) -> Result<()> {
let req_body = TenantLocationConfigRequest {
tenant_id: None,
tenant_id: Some(tenant_shard_id),
config,
};

View File

@@ -60,7 +60,7 @@ impl Client {
) -> anyhow::Result<PagestreamClient> {
let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = self
.client
.copy_both_simple(&format!("pagestream_v2 {tenant_id} {timeline_id}"))
.copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
.await?;
let Client {
cancel_on_client_drop,

View File

@@ -11,6 +11,7 @@ default = []
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }

View File

@@ -180,7 +180,7 @@ where
match top.deref_mut() {
LazyLoadLayer::Unloaded(ref mut l) => {
let fut = l.load_keys(this.ctx);
this.load_future.set(Some(Box::pin(fut)));
this.load_future.set(Some(fut));
continue;
}
LazyLoadLayer::Loaded(ref mut entries) => {

View File

@@ -3,6 +3,7 @@
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
use async_trait::async_trait;
use futures::Future;
use pageserver_api::{key::Key, keyspace::key_range_size};
use std::ops::Range;
@@ -140,16 +141,18 @@ pub trait CompactionLayer<K: CompactionKey + ?Sized> {
fn is_delta(&self) -> bool;
}
#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
where
Self: 'a;
/// Return all keys in this delta layer.
fn load_keys<'a>(
async fn load_keys<'a>(
&self,
ctx: &E::RequestContext,
) -> impl Future<Output = anyhow::Result<Vec<Self::DeltaEntry<'_>>>> + Send;
) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
}
pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}
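
For readers unfamiliar with the two styles being swapped in this hunk, here is a self-contained sketch contrasting an `#[async_trait]` method with the hand-written return-position `impl Future` form. The `KeySource*` traits and `InMemory` type are made up for illustration and are not part of the compaction crate; `anyhow`, `async-trait`, and `tokio` are assumed as dependencies.

```rust
// Illustrative only: these traits are not the real `CompactionDeltaLayer`.
use std::future::Future;

// Style 1: the `async_trait` macro, which boxes the returned future.
#[async_trait::async_trait]
trait KeySourceBoxed {
    async fn load_keys(&self) -> anyhow::Result<Vec<u64>>;
}

// Style 2: return-position `impl Future`, as in the hunk above; this avoids
// the boxing but requires spelling out the `Send` bound explicitly.
trait KeySourceUnboxed {
    fn load_keys(&self) -> impl Future<Output = anyhow::Result<Vec<u64>>> + Send;
}

struct InMemory(Vec<u64>);

#[async_trait::async_trait]
impl KeySourceBoxed for InMemory {
    async fn load_keys(&self) -> anyhow::Result<Vec<u64>> {
        Ok(self.0.clone())
    }
}

impl KeySourceUnboxed for InMemory {
    fn load_keys(&self) -> impl Future<Output = anyhow::Result<Vec<u64>>> + Send {
        let keys = self.0.clone();
        async move { Ok(keys) }
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let src = InMemory(vec![1, 2, 3]);
    // Both forms are called the same way; UFCS disambiguates the method name.
    assert_eq!(KeySourceBoxed::load_keys(&src).await?, vec![1, 2, 3]);
    assert_eq!(KeySourceUnboxed::load_keys(&src).await?, vec![1, 2, 3]);
    Ok(())
}
```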

Some files were not shown because too many files have changed in this diff.