Compare commits

..

2 Commits

Author SHA1 Message Date
Vadim Kharitonov
4e17ce654e Revert timescaledb to 2.10.1 for pg14 and pg15 2023-12-06 15:26:11 +01:00
Vadim Kharitonov
bf9ac7d721 Revert "[Compute] Update timescaledb to 2.13.0"
This reverts commit 66ea98f2e5.
2023-12-06 15:25:08 +01:00
164 changed files with 3681 additions and 6813 deletions

View File

@@ -199,10 +199,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v3
@@ -1101,10 +1097,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v3

View File

@@ -142,10 +142,6 @@ jobs:
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
@@ -242,20 +238,6 @@ jobs:
options: --init
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:

3
.gitignore vendored
View File

@@ -18,6 +18,3 @@ test_output/
*.o
*.so
*.Po
# pgindent typedef lists
*.list

156
Cargo.lock generated
View File

@@ -44,12 +44,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "allocator-api2"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
[[package]]
name = "android_system_properties"
version = "0.1.5"
@@ -184,7 +178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
dependencies = [
"concurrent-queue",
"event-listener 2.5.3",
"event-listener",
"futures-core",
]
@@ -205,13 +199,11 @@ dependencies = [
[[package]]
name = "async-lock"
version = "3.2.0"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c"
checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b"
dependencies = [
"event-listener 4.0.0",
"event-listener-strategy",
"pin-project-lite",
"event-listener",
]
[[package]]
@@ -694,9 +686,9 @@ dependencies = [
[[package]]
name = "azure_core"
version = "0.18.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd"
checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467"
dependencies = [
"async-trait",
"base64 0.21.1",
@@ -704,10 +696,8 @@ dependencies = [
"dyn-clone",
"futures",
"getrandom 0.2.11",
"hmac",
"http-types",
"log",
"once_cell",
"paste",
"pin-project",
"quick-xml",
@@ -716,7 +706,6 @@ dependencies = [
"rustc_version",
"serde",
"serde_json",
"sha2",
"time",
"url",
"uuid",
@@ -724,9 +713,9 @@ dependencies = [
[[package]]
name = "azure_identity"
version = "0.18.1"
version = "0.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8"
checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9"
dependencies = [
"async-lock",
"async-trait",
@@ -736,6 +725,7 @@ dependencies = [
"oauth2",
"pin-project",
"serde",
"serde_json",
"time",
"tz-rs",
"url",
@@ -744,18 +734,21 @@ dependencies = [
[[package]]
name = "azure_storage"
version = "0.18.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1"
checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97"
dependencies = [
"RustyXML",
"async-lock",
"async-trait",
"azure_core",
"bytes",
"futures",
"hmac",
"log",
"serde",
"serde_derive",
"serde_json",
"sha2",
"time",
"url",
"uuid",
@@ -763,14 +756,13 @@ dependencies = [
[[package]]
name = "azure_storage_blobs"
version = "0.18.0"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872"
checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4"
dependencies = [
"RustyXML",
"azure_core",
"azure_storage",
"azure_svc_blobstorage",
"bytes",
"futures",
"log",
@@ -782,22 +774,6 @@ dependencies = [
"uuid",
]
[[package]]
name = "azure_svc_blobstorage"
version = "0.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389"
dependencies = [
"azure_core",
"bytes",
"futures",
"log",
"once_cell",
"serde",
"serde_json",
"time",
]
[[package]]
name = "backtrace"
version = "0.3.67"
@@ -914,7 +890,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
dependencies = [
"memchr",
"once_cell",
"regex-automata 0.1.10",
"regex-automata",
"serde",
]
@@ -1704,27 +1680,6 @@ version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "event-listener"
version = "4.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae"
dependencies = [
"concurrent-queue",
"parking",
"pin-project-lite",
]
[[package]]
name = "event-listener-strategy"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3"
dependencies = [
"event-listener 4.0.0",
"pin-project-lite",
]
[[package]]
name = "fail"
version = "0.5.1"
@@ -2087,10 +2042,6 @@ name = "hashbrown"
version = "0.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
dependencies = [
"ahash",
"allocator-api2",
]
[[package]]
name = "hashlink"
@@ -2582,7 +2533,7 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
dependencies = [
"regex-automata 0.1.10",
"regex-automata",
]
[[package]]
@@ -2608,9 +2559,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "memchr"
version = "2.6.4"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memoffset"
@@ -2683,14 +2634,14 @@ dependencies = [
[[package]]
name = "mio"
version = "0.8.10"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9"
dependencies = [
"libc",
"log",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys 0.48.0",
"windows-sys 0.45.0",
]
[[package]]
@@ -3103,7 +3054,6 @@ dependencies = [
"humantime-serde",
"hyper",
"itertools",
"md5",
"metrics",
"nix 0.26.2",
"num-traits",
@@ -3694,7 +3644,7 @@ dependencies = [
"serde_json",
"sha2",
"smol_str",
"socket2 0.5.5",
"socket2 0.5.3",
"sync_wrapper",
"task-local-extensions",
"thiserror",
@@ -3718,9 +3668,9 @@ dependencies = [
[[package]]
name = "quick-xml"
version = "0.31.0"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956"
dependencies = [
"memchr",
"serde",
@@ -3860,14 +3810,13 @@ dependencies = [
[[package]]
name = "regex"
version = "1.10.2"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"regex-syntax 0.7.2",
]
[[package]]
@@ -3879,17 +3828,6 @@ dependencies = [
"regex-syntax 0.6.29",
]
[[package]]
name = "regex-automata"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax 0.8.2",
]
[[package]]
name = "regex-syntax"
version = "0.6.29"
@@ -3898,9 +3836,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "regex-syntax"
version = "0.8.2"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
[[package]]
name = "relative-path"
@@ -3926,7 +3864,6 @@ dependencies = [
"bytes",
"camino",
"camino-tempfile",
"futures",
"futures-util",
"http-types",
"hyper",
@@ -4354,7 +4291,6 @@ dependencies = [
"tokio-io-timeout",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"toml_edit",
"tracing",
"url",
@@ -4795,9 +4731,9 @@ dependencies = [
[[package]]
name = "socket2"
version = "0.5.5"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877"
dependencies = [
"libc",
"windows-sys 0.48.0",
@@ -5144,18 +5080,18 @@ dependencies = [
[[package]]
name = "tokio"
version = "1.34.0"
version = "1.28.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
dependencies = [
"backtrace",
"autocfg",
"bytes",
"libc",
"mio",
"num_cpus",
"pin-project-lite",
"signal-hook-registry",
"socket2 0.5.5",
"socket2 0.4.9",
"tokio-macros",
"windows-sys 0.48.0",
]
@@ -5172,9 +5108,9 @@ dependencies = [
[[package]]
name = "tokio-macros"
version = "2.2.0"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
dependencies = [
"proc-macro2",
"quote",
@@ -5209,7 +5145,7 @@ dependencies = [
"pin-project-lite",
"postgres-protocol",
"postgres-types",
"socket2 0.5.5",
"socket2 0.5.3",
"tokio",
"tokio-util",
]
@@ -5278,16 +5214,13 @@ dependencies = [
[[package]]
name = "tokio-util"
version = "0.7.10"
version = "0.7.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
dependencies = [
"bytes",
"futures-core",
"futures-io",
"futures-sink",
"futures-util",
"hashbrown 0.14.0",
"pin-project-lite",
"tokio",
"tracing",
@@ -6283,8 +6216,7 @@ dependencies = [
"prost",
"rand 0.8.5",
"regex",
"regex-automata 0.4.3",
"regex-syntax 0.8.2",
"regex-syntax 0.7.2",
"reqwest",
"ring 0.16.20",
"rustls",

View File

@@ -38,10 +38,10 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
azure_storage_blobs = "0.18"
azure_core = "0.16"
azure_identity = "0.16"
azure_storage = "0.16"
azure_storage_blobs = "0.16"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
@@ -109,7 +109,7 @@ pin-project-lite = "0.2"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
regex = "1.10.2"
regex = "1.4"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
reqwest-middleware = "0.2.0"
@@ -149,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
tokio-rustls = "0.24"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
tokio-util = { version = "0.7", features = ["io"] }
toml = "0.7"
toml_edit = "0.19"
tonic = {version = "0.9", features = ["tls", "tls-roots"]}

View File

@@ -395,7 +395,7 @@ RUN case "${PG_VERSION}" in \
*) \
export TIMESCALEDB_VERSION=2.13.0 \
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
;; \
;; \s
esac && \
apt-get update && \
apt-get install -y cmake && \

View File

@@ -260,44 +260,6 @@ distclean:
fmt:
./pre-commit.py --fix-inplace
postgres-%-pg-bsd-indent: postgres-%
+@echo "Compiling pg_bsd_indent"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
# Create typedef list for the core. Note that generally it should be combined with
# buildfarm one to cover platform specific stuff.
# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
postgres-%-typedefs.list: postgres-%
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
# Indent postgres. See src/tools/pgindent/README for details.
.PHONY: postgres-%-pgindent
postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
+@echo merge with buildfarm typedef to cover all platforms
+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
REL_16_STABLE list misses PGSemaphoreData
# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+@echo note: you might want to run it on selected files/dirs instead.
INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
rm -f pg*.BAK
# Indent pgxn/neon.
.PHONY: pgindent
neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
.PHONY: setup-pre-commit-hook
setup-pre-commit-hook:
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit

View File

@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
IF NOT EXISTS (
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
THEN
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
IF array_length(roles, 1) IS NOT NULL THEN
EXECUTE format('GRANT neon_superuser TO %s',
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));

View File

@@ -193,11 +193,16 @@ impl Escaping for PgIdent {
/// Build a list of existing Postgres roles
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
let postgres_roles = xact
.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
.query(
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
&[],
)?
.iter()
.map(|row| Role {
name: row.get("rolname"),
encrypted_password: row.get("rolpassword"),
replication: Some(row.get("rolreplication")),
bypassrls: Some(row.get("rolbypassrls")),
options: None,
})
.collect();

View File

@@ -252,6 +252,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
let action = if let Some(r) = pg_role {
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|| !r.bypassrls.unwrap_or(false)
|| !r.replication.unwrap_or(false)
{
RoleAction::Update
} else if let Some(pg_pwd) = &r.encrypted_password {
@@ -283,22 +285,14 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
match action {
RoleAction::None => {}
RoleAction::Update => {
// This can be run on /every/ role! Not just ones created through the console.
// This means that if you add some funny ALTER here that adds a permission,
// this will get run even on user-created roles! This will result in different
// behavior before and after a spec gets reapplied. The below ALTER as it stands
// now only grants LOGIN and changes the password. Please do not allow this branch
// to do anything silly.
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
let mut query: String =
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
}
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
name.pg_quote()
);
info!("role create query: '{}'", &query);

View File

@@ -201,12 +201,6 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
// TODO(sharding): make this shard-aware
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
let valid = tenant_state.generation == req_tenant.gen;
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {})",
req_tenant.id,
req_tenant.gen,
tenant_state.generation
);
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,
valid,
@@ -256,13 +250,6 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
tenant_state.pageserver = attach_req.node_id;
let generation = tenant_state.generation;
tracing::info!(
"handle_attach_hook: tenant {} set generation {}, pageserver {}",
attach_req.tenant_id,
tenant_state.generation,
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
);
locked.save().await.map_err(ApiError::InternalServerError)?;
json_response(

View File

@@ -168,7 +168,7 @@ fn print_timelines_tree(
info: t.clone(),
children: BTreeSet::new(),
name: timeline_name_mappings
.remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
.remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
},
)
})

View File

@@ -407,7 +407,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
};
let request = models::TenantCreateRequest {
@@ -505,7 +504,6 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
}
};

View File

@@ -165,7 +165,7 @@ pub fn migrate_tenant(
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i.tenant_id == tenant_id);
.any(|i| i == tenant_id);
if !found {
continue;
}

View File

@@ -207,6 +207,8 @@ pub struct DeltaOp {
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
pub replication: Option<bool>,
pub bypassrls: Option<bool>,
pub options: GenericOptions,
}

View File

@@ -3,11 +3,8 @@
//! Otherwise, we might not see all metrics registered via
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use once_cell::sync::Lazy;
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
};
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
@@ -135,137 +132,3 @@ fn get_rusage_stats() -> libc::rusage {
rusage.assume_init()
}
}
/// Registers two labelled integer counter vectors in the default registry and bundles them
/// into an [`IntCounterPairVec`].
///
/// `$NAME1`/`$HELP1` describe the "increment" vector and `$NAME2`/`$HELP2` the "decrement"
/// vector; both share `$LABELS_NAMES`. Both registrations are always attempted. If either one
/// fails the macro evaluates to `Err`, and the first vector's error takes precedence.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair_vec {
    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr, $LABELS_NAMES:expr $(,)?) => {{
        let inc = $crate::register_int_counter_vec!($NAME1, $HELP1, $LABELS_NAMES);
        let dec = $crate::register_int_counter_vec!($NAME2, $HELP2, $LABELS_NAMES);
        inc.and_then(|inc| dec.map(|dec| $crate::IntCounterPairVec::new(inc, dec)))
    }};
}
/// Registers two integer counters in the default registry and bundles them into an
/// [`IntCounterPair`].
///
/// `$NAME1`/`$HELP1` describe the "increment" counter and `$NAME2`/`$HELP2` the "decrement"
/// counter. Both registrations are always attempted. If either one fails the macro evaluates
/// to `Err`, and the first counter's error takes precedence.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
    ($NAME1:expr, $HELP1:expr, $NAME2:expr, $HELP2:expr $(,)?) => {{
        let inc = $crate::register_int_counter!($NAME1, $HELP1);
        let dec = $crate::register_int_counter!($NAME2, $HELP2);
        inc.and_then(|inc| dec.map(|dec| $crate::IntCounterPair::new(inc, dec)))
    }};
}
/// A pair of [`GenericCounterVec`]s. Like a [`GenericGaugeVec`] but will always observe changes.
///
/// Looking up a set of label values yields a [`GenericCounterPair`] built from the matching
/// counter of each vector.
pub struct GenericCounterPairVec<P: Atomic> {
// Vector supplying the counter that is bumped on every increment.
inc: GenericCounterVec<P>,
// Vector supplying the counter that is bumped on every decrement.
dec: GenericCounterVec<P>,
}
/// A pair of [`GenericCounter`]s. Like a [`GenericGauge`] but will always observe changes.
///
/// Increments and decrements are recorded in two separate counters rather than in a single
/// value: incrementing bumps `inc`, decrementing bumps `dec`.
pub struct GenericCounterPair<P: Atomic> {
// Counter bumped by `inc`, `inc_by`, `guard` and `guard_by`.
inc: GenericCounter<P>,
// Counter bumped by `dec`, `dec_by` and by the guards when they are dropped.
dec: GenericCounter<P>,
}
impl<P: Atomic> GenericCounterPairVec<P> {
pub fn new(inc: GenericCounterVec<P>, dec: GenericCounterVec<P>) -> Self {
Self { inc, dec }
}
/// `get_metric_with_label_values` returns the [`GenericCounterPair<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`GenericCounterPair<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
})
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> GenericCounterPair<P> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<P: Atomic> GenericCounterPair<P> {
pub fn new(inc: GenericCounter<P>, dec: GenericCounter<P>) -> Self {
Self { inc, dec }
}
/// Increment the gauge by 1, returning a guard that decrements by 1 on drop.
pub fn guard(&self) -> GenericCounterPairGuard<P> {
self.inc.inc();
GenericCounterPairGuard(self.dec.clone())
}
/// Increment the gauge by n, returning a guard that decrements by n on drop.
pub fn guard_by(&self, n: P::T) -> GenericCounterPairGuardBy<P> {
self.inc.inc_by(n);
GenericCounterPairGuardBy(self.dec.clone(), n)
}
/// Increase the gauge by 1.
#[inline]
pub fn inc(&self) {
self.inc.inc();
}
/// Decrease the gauge by 1.
#[inline]
pub fn dec(&self) {
self.dec.inc();
}
/// Add the given value to the gauge. (The value can be
/// negative, resulting in a decrement of the gauge.)
#[inline]
pub fn inc_by(&self, v: P::T) {
self.inc.inc_by(v);
}
/// Subtract the given value from the gauge. (The value can be
/// negative, resulting in an increment of the gauge.)
#[inline]
pub fn dec_by(&self, v: P::T) {
self.dec.inc_by(v);
}
}
/// Guard handed out by [`GenericCounterPair::guard`].
///
/// Wraps the pair's "decrement" counter and bumps it by 1 when the guard is dropped.
pub struct GenericCounterPairGuard<P: Atomic>(GenericCounter<P>);

impl<P: Atomic> Drop for GenericCounterPairGuard<P> {
    fn drop(&mut self) {
        let Self(dec) = self;
        dec.inc();
    }
}
/// Guard handed out by [`GenericCounterPair::guard_by`].
///
/// Wraps the pair's "decrement" counter together with an amount, and adds that amount to
/// the counter when the guard is dropped.
pub struct GenericCounterPairGuardBy<P: Atomic>(GenericCounter<P>, P::T);

impl<P: Atomic> Drop for GenericCounterPairGuardBy<P> {
    fn drop(&mut self) {
        let Self(dec, amount) = self;
        dec.inc_by(*amount);
    }
}
/// A pair of [`IntCounterVec`]s. Like an [`IntGaugeVec`] but will always observe changes.
pub type IntCounterPairVec = GenericCounterPairVec<AtomicU64>;
/// A pair of [`IntCounter`]s. Like an [`IntGauge`] but will always observe changes.
pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that records a decrement of 1 (by bumping the pair's
/// "decrement" counter) when it is dropped.
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;

View File

@@ -237,7 +237,6 @@ pub struct TenantConfig {
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub heatmap_period: Option<String>,
}
/// A flattened analog of a `pageserver::tenant::LocationMode`, which
@@ -324,7 +323,6 @@ impl TenantConfigRequest {
#[derive(Debug, Deserialize)]
pub struct TenantAttachRequest {
#[serde(default)]
pub config: TenantAttachConfig,
#[serde(default)]
pub generation: Option<u32>,
@@ -332,7 +330,7 @@ pub struct TenantAttachRequest {
/// Newtype to enforce deny_unknown_fields on TenantConfig for
/// its usage inside `TenantAttachRequest`.
#[derive(Debug, Serialize, Deserialize, Default)]
#[derive(Debug, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct TenantAttachConfig {
#[serde(flatten)]
@@ -358,7 +356,7 @@ pub enum TenantAttachmentStatus {
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
pub id: TenantShardId,
pub id: TenantId,
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
pub state: TenantState,
/// Sum of the size of all layer files.
@@ -370,7 +368,7 @@ pub struct TenantInfo {
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
pub tenant_id: TenantShardId,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub ancestor_timeline_id: Option<TimelineId>,
@@ -386,9 +384,6 @@ pub struct TimelineInfo {
/// The LSN that we are advertizing to safekeepers
pub remote_consistent_lsn_visible: Lsn,
/// The LSN from the start of the root timeline (never changes)
pub initdb_lsn: Lsn,
pub current_logical_size: u64,
pub current_logical_size_is_accurate: bool,
@@ -827,7 +822,7 @@ mod tests {
fn test_tenantinfo_serde() {
// Test serialization/deserialization of TenantInfo
let original_active = TenantInfo {
id: TenantShardId::unsharded(TenantId::generate()),
id: TenantId::generate(),
state: TenantState::Active,
current_physical_size: Some(42),
attachment_status: TenantAttachmentStatus::Attached,
@@ -844,7 +839,7 @@ mod tests {
});
let original_broken = TenantInfo {
id: TenantShardId::unsharded(TenantId::generate()),
id: TenantId::generate(),
state: TenantState::Broken {
reason: "reason".into(),
backtrace: "backtrace info".into(),

View File

@@ -73,33 +73,19 @@ impl TenantShardId {
)
}
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
}
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:02x}{:02x}",
self.0.shard_number.0, self.0.shard_count.0
)
pub fn shard_slug(&self) -> String {
format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Display for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.shard_count != ShardCount(0) {
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
write!(
f,
"{}-{:02x}{:02x}",
self.tenant_id, self.shard_number.0, self.shard_count.0
)
} else {
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
// is distinct from the normal single shard case (shard count == 1).
@@ -425,12 +411,6 @@ impl ShardIdentity {
String::new()
}
}
/// Convenience for checking if this identity is the 0th shard in a tenant,
/// for special cases on shard 0 such as ingesting relation sizes.
pub fn is_zero(&self) -> bool {
self.number == ShardNumber(0)
}
}
impl Serialize for ShardIndex {

View File

@@ -16,11 +16,10 @@ aws-credential-types.workspace = true
bytes.workspace = true
camino.workspace = true
hyper = { workspace = true, features = ["stream"] }
futures.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
tokio-util = { workspace = true, features = ["compat"] }
tokio-util.workspace = true
toml_edit.workspace = true
tracing.workspace = true
scopeguard.workspace = true

View File

@@ -1,24 +1,21 @@
//! Azure Blob Storage wrapper
use std::borrow::Cow;
use std::collections::HashMap;
use std::env;
use std::num::NonZeroU32;
use std::pin::Pin;
use std::sync::Arc;
use std::{borrow::Cow, io::Cursor};
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::RetryOptions;
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
use bytes::Bytes;
use futures::stream::Stream;
use futures_util::StreamExt;
use http_types::StatusCode;
use tokio::io::AsyncRead;
use tracing::debug;
use crate::s3_bucket::RequestKind;
@@ -52,8 +49,7 @@ impl AzureBlobStorage {
StorageCredentials::token_credential(Arc::new(token_credential))
};
// we have an outer retry
let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());
let builder = ClientBuilder::new(account, credentials);
let client = builder.container_client(azure_config.container_name.to_owned());
@@ -120,8 +116,7 @@ impl AzureBlobStorage {
let mut metadata = HashMap::new();
// TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563
let mut bufs = Vec::new();
let mut buf = Vec::new();
while let Some(part) = response.next().await {
let part = part.map_err(to_download_error)?;
if let Some(blob_meta) = part.blob.metadata {
@@ -132,10 +127,10 @@ impl AzureBlobStorage {
.collect()
.await
.map_err(|e| DownloadError::Other(e.into()))?;
bufs.push(data);
buf.extend_from_slice(&data.slice(..));
}
Ok(Download {
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
download_stream: Box::pin(Cursor::new(buf)),
metadata: Some(StorageMetadata(metadata)),
})
}
@@ -222,10 +217,9 @@ impl RemoteStorage for AzureBlobStorage {
}
Ok(res)
}
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
@@ -233,12 +227,13 @@ impl RemoteStorage for AzureBlobStorage {
let _permit = self.permit(RequestKind::Put).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
Box::pin(from);
let from = NonSeekableStream::new(from, data_size_bytes);
let body = azure_core::Body::SeekableStream(Box::new(from));
// TODO FIX THIS UGLY HACK and don't buffer the entire object
// into RAM here, but use the streaming interface. For that,
// we'd have to change the interface though...
// https://github.com/neondatabase/neon/issues/5563
let mut buf = Vec::with_capacity(data_size_bytes);
tokio::io::copy(&mut from, &mut buf).await?;
let body = azure_core::Body::Bytes(buf.into());
let mut builder = blob_client.put_block_blob(body);
@@ -271,12 +266,17 @@ impl RemoteStorage for AzureBlobStorage {
let mut builder = blob_client.get();
let range: Range = if let Some(end_exclusive) = end_exclusive {
(start_inclusive..end_exclusive).into()
if let Some(end_exclusive) = end_exclusive {
builder = builder.range(Range::new(start_inclusive, end_exclusive));
} else {
(start_inclusive..).into()
};
builder = builder.range(range);
// Open ranges are not supported by the SDK so we work around
// by setting the upper limit extremely high (but high enough
// to still be representable by signed 64 bit integers).
// TODO remove workaround once the SDK adds open range support
// https://github.com/Azure/azure-sdk-for-rust/issues/1438
let end_exclusive = u64::MAX / 4;
builder = builder.range(Range::new(start_inclusive, end_exclusive));
}
self.download_for_builder(builder).await
}
@@ -312,153 +312,3 @@ impl RemoteStorage for AzureBlobStorage {
Ok(())
}
}
pin_project_lite::pin_project! {
/// Hack to work around not being able to stream once with azure sdk.
///
/// Azure sdk clones streams around with the assumption that they are like
/// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
/// that. For example, for an `index_part.json` we just have a single chunk of [`Bytes`]
/// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
/// seekable, but it is easier to simply retry the whole request.
///
/// The value moves through three states: `Initial` (as constructed), `Actual` (the first clone
/// of an `Initial`, and the only readable state) and `Cloned` (every further clone, which keeps
/// only the length).
#[project = NonSeekableStreamProj]
enum NonSeekableStream<S> {
/// A stream wrapper's initial form.
///
/// The mutex exists so that the reader can be moved out through `&self` when cloning. If the
/// sdk changes to do fewer than one clone before the first request, then this must be changed.
Initial {
// `Some` until the first clone takes the reader out, `None` afterwards.
inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
// Total length of the body, as passed to the constructor.
len: usize,
},
/// The actually readable variant, produced by cloning the Initial variant.
///
/// The sdk currently always clones once, even without retry policy.
Actual {
#[pin]
inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
// Total length of the body, copied from the `Initial` value this was cloned from.
len: usize,
// Set on the first read attempt; `reset` is only accepted while this is still false.
read_any: bool,
},
/// Most likely unneeded, but left to make life easier, in case more clones are added.
///
/// Carries no stream, so it can be neither read nor reset.
Cloned {
// Length of the value this was cloned from.
len_was: usize,
}
}
}
impl<S> NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
{
    /// Wraps `inner` in its `Initial` state.
    ///
    /// The byte stream is adapted into a futures-style reader and parked inside a mutex so
    /// that the first `clone()` can move it out through a shared reference. `len` is the
    /// total body length reported to the sdk.
    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
        use tokio_util::compat::TokioAsyncReadCompatExt;

        let reader = tokio_util::io::StreamReader::new(inner).compat();
        NonSeekableStream::Initial {
            inner: std::sync::Mutex::new(Some(reader)),
            len,
        }
    }
}
impl<S> std::fmt::Debug for NonSeekableStream<S> {
    /// Prints the variant name together with its length; the wrapped stream is never shown.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Every variant is rendered the same way: `<Variant> { len: <n> }`.
        let (variant, len) = match self {
            Self::Initial { len, .. } => ("Initial", len),
            Self::Actual { len, .. } => ("Actual", len),
            Self::Cloned { len_was, .. } => ("Cloned", len_was),
        };
        f.debug_struct(variant).field("len", len).finish()
    }
}
impl<S> futures::io::AsyncRead for NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>>,
{
    /// Reads from the wrapped stream when in the `Actual` state; any other state is an error.
    fn poll_read(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &mut [u8],
    ) -> std::task::Poll<std::io::Result<usize>> {
        if let NonSeekableStreamProj::Actual {
            inner, read_any, ..
        } = self.project()
        {
            // Remember that a read was attempted: from now on `reset` must be refused.
            *read_any = true;
            return inner.poll_read(cx, buf);
        }

        // The `Initial` state is deliberately not readable: keeping the reader behind the
        // mutex is much simpler when its contents are never polled in place. Should an sdk
        // upgrade change the cloning behaviour, reading from `Initial` has to be added here.
        //
        // Having `{self:?}` in the message would be nice, but there is no obvious way to
        // get back from the projection to `self`.
        std::task::Poll::Ready(Err(std::io::Error::new(
            std::io::ErrorKind::Other,
            "cloned or initial values cannot be read",
        )))
    }
}
impl<S> Clone for NonSeekableStream<S> {
/// Weird clone implementation exists to support the sdk doing cloning before issuing the first
/// request, see type documentation.
fn clone(&self) -> Self {
use NonSeekableStream::*;
match self {
Initial { inner, len } => {
if let Some(inner) = inner.lock().unwrap().take() {
Actual {
inner,
len: *len,
read_any: false,
}
} else {
Self::Cloned { len_was: *len }
}
}
Actual { len, .. } => Cloned { len_was: *len },
Cloned { len_was } => Cloned { len_was: *len_was },
}
}
}
#[async_trait::async_trait]
impl<S> azure_core::SeekableStream for NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
{
    /// Succeeds only while nothing has been consumed yet: on an `Initial` value that still
    /// owns its reader, or on an `Actual` value that has not been read from. Everything else
    /// is reported as an I/O error, because the underlying stream cannot be rewound.
    async fn reset(&mut self) -> azure_core::error::Result<()> {
        use NonSeekableStream::*;

        let reason = match self {
            Initial { inner, .. } => {
                if inner.get_mut().unwrap().is_none() {
                    "reset after first clone is not supported"
                } else {
                    return Ok(());
                }
            }
            Actual {
                read_any: false, ..
            } => return Ok(()),
            Actual { .. } => "reset after reading is not supported",
            Cloned { .. } => "reset after second clone is not supported",
        };

        let cause = std::io::Error::new(std::io::ErrorKind::Other, reason);
        Err(azure_core::error::Error::new(
            azure_core::error::ErrorKind::Io,
            cause,
        ))
    }

    // Note: it is not documented whether this should be the total or the remaining length;
    // reporting the total passes the tests.
    fn len(&self) -> usize {
        use NonSeekableStream::*;

        match self {
            Initial { len, .. } | Actual { len, .. } | Cloned { len_was: len } => *len,
        }
    }
}

View File

@@ -19,10 +19,8 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::A
use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes;
use futures::stream::Stream;
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tokio::{io, sync::Semaphore};
use toml_edit::Item;
use tracing::info;
@@ -181,7 +179,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// Streams the local file contents into the remote storage entry.
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
// S3 PUT request requires the content length to be specified,
// otherwise it starts to fail with the concurrent connection count increasing.
data_size_bytes: usize,
@@ -208,7 +206,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
}
pub struct Download {
pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
/// Extra key-value data, associated with the current remote file.
pub metadata: Option<StorageMetadata>,
}
@@ -302,7 +300,7 @@ impl GenericRemoteStorage {
pub async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
@@ -400,7 +398,7 @@ impl GenericRemoteStorage {
/// this path is used for the remote object id conversion only.
pub async fn upload_storage_object(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
from_size_bytes: usize,
to: &RemotePath,
) -> anyhow::Result<()> {

View File

@@ -7,14 +7,11 @@
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};
use anyhow::{bail, ensure, Context};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use futures::stream::Stream;
use tokio::{
fs,
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
use tokio_util::io::ReaderStream;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
@@ -102,35 +99,27 @@ impl LocalFs {
};
// If we were given a directory, we may use it as our starting point.
// Otherwise, we must go up to the first ancestor dir that exists. This is because
// Otherwise, we must go up to the parent directory. This is because
// S3 object list prefixes can be arbitrary strings, but when reading
// the local filesystem we need a directory to start calling read_dir on.
let mut initial_dir = full_path.clone();
loop {
// Did we make it to the root?
if initial_dir.parent().is_none() {
anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
}
match fs::metadata(initial_dir.clone()).await {
Ok(meta) if meta.is_dir() => {
// We found a directory, break
break;
}
Ok(_meta) => {
match fs::metadata(full_path.clone()).await {
Ok(meta) => {
if !meta.is_dir() {
// It's not a directory: strip back to the parent
initial_dir.pop();
}
Err(e) if e.kind() == ErrorKind::NotFound => {
// It's not a file that exists: strip the prefix back to the parent directory
initial_dir.pop();
}
Err(e) => {
// Unexpected I/O error
anyhow::bail!(e)
}
}
Err(e) if e.kind() == ErrorKind::NotFound => {
// It's not a file that exists: strip the prefix back to the parent directory
initial_dir.pop();
}
Err(e) => {
// Unexpected I/O error
anyhow::bail!(e)
}
}
// Note that Utf8PathBuf starts_with only considers full path segments, but
// object prefixes are arbitrary strings, so we need the strings for doing
// starts_with later.
@@ -222,7 +211,7 @@ impl RemoteStorage for LocalFs {
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
data_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
@@ -255,12 +244,9 @@ impl RemoteStorage for LocalFs {
);
let from_size_bytes = data_size_bytes as u64;
let data = tokio_util::io::StreamReader::new(data);
let data = std::pin::pin!(data);
let mut buffer_to_read = data.take(from_size_bytes);
// alternatively we could just write the bytes to a file, but local_fs is a testing utility
let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
.await
.with_context(|| {
format!(
@@ -314,7 +300,7 @@ impl RemoteStorage for LocalFs {
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let source = ReaderStream::new(
let source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
@@ -354,14 +340,16 @@ impl RemoteStorage for LocalFs {
}
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let mut source = tokio::fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?;
let mut source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?,
);
source
.seek(io::SeekFrom::Start(start_inclusive))
.await
@@ -375,13 +363,11 @@ impl RemoteStorage for LocalFs {
Ok(match end_exclusive {
Some(end_exclusive) => Download {
metadata,
download_stream: Box::pin(ReaderStream::new(
source.take(end_exclusive - start_inclusive),
)),
download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
},
None => Download {
metadata,
download_stream: Box::pin(ReaderStream::new(source)),
download_stream: Box::pin(source),
},
})
} else {
@@ -481,9 +467,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};
async fn read_and_assert_remote_file_contents(
@@ -493,7 +477,7 @@ mod fs_tests {
remote_storage_path: &RemotePath,
expected_metadata: Option<&StorageMetadata>,
) -> anyhow::Result<String> {
let download = storage
let mut download = storage
.download(remote_storage_path)
.await
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -502,9 +486,13 @@ mod fs_tests {
"Unexpected metadata returned for the downloaded file"
);
let contents = aggregate(download.download_stream).await?;
String::from_utf8(contents).map_err(anyhow::Error::new)
let mut contents = String::new();
download
.download_stream
.read_to_string(&mut contents)
.await
.context("Failed to read remote file contents into string")?;
Ok(contents)
}
#[tokio::test]
@@ -533,26 +521,25 @@ mod fs_tests {
let storage = create_storage()?;
let id = RemotePath::new(Utf8Path::new("dummy"))?;
let content = Bytes::from_static(b"12345");
let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));
let content = std::io::Cursor::new(b"12345");
// Check that you get an error if the size parameter doesn't match the actual
// size of the stream.
storage
.upload(content(), 0, &id, None)
.upload(Box::new(content.clone()), 0, &id, None)
.await
.expect_err("upload with zero size succeeded");
storage
.upload(content(), 4, &id, None)
.upload(Box::new(content.clone()), 4, &id, None)
.await
.expect_err("upload with too short size succeeded");
storage
.upload(content(), 6, &id, None)
.upload(Box::new(content.clone()), 6, &id, None)
.await
.expect_err("upload with too large size succeeded");
// Correct size is 5, this should succeed.
storage.upload(content(), 5, &id, None).await?;
storage.upload(Box::new(content), 5, &id, None).await?;
Ok(())
}
@@ -600,7 +587,7 @@ mod fs_tests {
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
let first_part_download = storage
let mut first_part_download = storage
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
assert!(
@@ -608,13 +595,21 @@ mod fs_tests {
"No metadata should be returned for no metadata upload"
);
let first_part_remote = aggregate(first_part_download.download_stream).await?;
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut first_part_download.download_stream,
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
first_part_local, first_part_remote,
first_part_local,
first_part_remote.as_slice(),
"First part bytes should be returned when requested"
);
let second_part_download = storage
let mut second_part_download = storage
.download_byte_range(
&upload_target,
first_part_local.len() as u64,
@@ -626,9 +621,17 @@ mod fs_tests {
"No metadata should be returned for no metadata upload"
);
let second_part_remote = aggregate(second_part_download.download_stream).await?;
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut second_part_download.download_stream,
&mut second_part_remote,
)
.await?;
second_part_remote.flush().await?;
let second_part_remote = second_part_remote.into_inner().into_inner();
assert_eq!(
second_part_local, second_part_remote,
second_part_local,
second_part_remote.as_slice(),
"Second part bytes should be returned when requested"
);
@@ -718,10 +721,17 @@ mod fs_tests {
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, _) = uploaded_bytes.split_at(3);
let partial_download_with_metadata = storage
let mut partial_download_with_metadata = storage
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut partial_download_with_metadata.download_stream,
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
first_part_local,
first_part_remote.as_slice(),
@@ -797,16 +807,16 @@ mod fs_tests {
)
})?;
let file = tokio_util::io::ReaderStream::new(file);
storage.upload(file, size, &relative_path, metadata).await?;
storage
.upload(Box::new(file), size, &relative_path, metadata)
.await?;
Ok(relative_path)
}
async fn create_file_for_upload(
path: &Utf8Path,
contents: &str,
) -> anyhow::Result<(fs::File, usize)> {
) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
std::fs::create_dir_all(path.parent().unwrap())?;
let mut file_for_writing = std::fs::OpenOptions::new()
.write(true)
@@ -816,7 +826,7 @@ mod fs_tests {
drop(file_for_writing);
let file_size = path.metadata()?.len() as usize;
Ok((
fs::OpenOptions::new().read(true).open(&path).await?,
io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
file_size,
))
}
@@ -830,16 +840,4 @@ mod fs_tests {
files.sort_by(|a, b| a.0.cmp(&b.0));
Ok(files)
}
/// Drains `stream` and returns all of its chunks concatenated into one buffer.
///
/// Stops at the first chunk that is an error and returns that error.
async fn aggregate(
    stream: impl Stream<Item = std::io::Result<Bytes>>,
) -> anyhow::Result<Vec<u8>> {
    use futures::stream::StreamExt;

    let mut chunks = std::pin::pin!(stream);
    let mut collected = Vec::new();
    loop {
        match chunks.next().await {
            Some(chunk) => collected.extend_from_slice(&chunk?[..]),
            None => return Ok(collected),
        }
    }
}
}

View File

@@ -4,14 +4,9 @@
//! allowing multiple api users to independently work with the same S3 bucket, if
//! their bucket prefixes are both specified and different.
use std::{
borrow::Cow,
pin::Pin,
sync::Arc,
task::{Context, Poll},
};
use std::{borrow::Cow, sync::Arc};
use anyhow::Context as _;
use anyhow::Context;
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
@@ -33,10 +28,11 @@ use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio::io::{self, AsyncRead};
use tokio_util::io::ReaderStream;
use tracing::debug;
use super::StorageMetadata;
use crate::{
@@ -67,7 +63,7 @@ struct GetObjectRequest {
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
tracing::debug!(
debug!(
"Creating s3 remote storage for S3 bucket {}",
aws_config.bucket_name
);
@@ -229,15 +225,12 @@ impl S3Bucket {
match get_object {
Ok(object_output) => {
let metadata = object_output.metadata().cloned().map(StorageMetadata);
let body = object_output.body;
let body = ByteStreamAsStream::from(body);
let body = PermitCarrying::new(permit, body);
let body = TimedDownload::new(started_at, body);
Ok(Download {
metadata,
download_stream: Box::pin(body),
download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
started_at,
RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
))),
})
}
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -250,55 +243,29 @@ impl S3Bucket {
}
}
pin_project_lite::pin_project! {
struct ByteStreamAsStream {
#[pin]
inner: aws_smithy_types::byte_stream::ByteStream
}
}
impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
ByteStreamAsStream { inner }
}
}
impl Stream for ByteStreamAsStream {
type Item = std::io::Result<Bytes>;
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
// this does the std::io::ErrorKind::Other conversion
self.project().inner.poll_next(cx).map_err(|x| x.into())
}
// cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
// sense and Stream::size_hint does not really
}
pin_project_lite::pin_project! {
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
struct PermitCarrying<S> {
struct RatelimitedAsyncRead<S> {
permit: tokio::sync::OwnedSemaphorePermit,
#[pin]
inner: S,
}
}
impl<S> PermitCarrying<S> {
impl<S: AsyncRead> RatelimitedAsyncRead<S> {
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
Self { permit, inner }
RatelimitedAsyncRead { permit, inner }
}
}
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
type Item = <S as Stream>::Item;
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
self.project().inner.poll_next(cx)
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
fn poll_read(
self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut io::ReadBuf<'_>,
) -> std::task::Poll<std::io::Result<()>> {
let this = self.project();
this.inner.poll_read(cx, buf)
}
}
@@ -318,7 +285,7 @@ pin_project_lite::pin_project! {
}
}
impl<S> TimedDownload<S> {
impl<S: AsyncRead> TimedDownload<S> {
fn new(started_at: std::time::Instant, inner: S) -> Self {
TimedDownload {
started_at,
@@ -328,26 +295,25 @@ impl<S> TimedDownload<S> {
}
}
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
type Item = <S as Stream>::Item;
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
use std::task::ready;
impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
fn poll_read(
self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut io::ReadBuf<'_>,
) -> std::task::Poll<std::io::Result<()>> {
let this = self.project();
let before = buf.filled().len();
let read = std::task::ready!(this.inner.poll_read(cx, buf));
let res = ready!(this.inner.poll_next(cx));
match &res {
Some(Ok(_)) => {}
Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
None => *this.outcome = metrics::AttemptOutcome::Ok,
let read_eof = buf.filled().len() == before;
match read {
Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
Ok(()) => { /* still in progress */ }
Err(_) => *this.outcome = AttemptOutcome::Err,
}
Poll::Ready(res)
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
std::task::Poll::Ready(read)
}
}
@@ -437,7 +403,7 @@ impl RemoteStorage for S3Bucket {
async fn upload(
&self,
from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
from_size_bytes: usize,
to: &RemotePath,
metadata: Option<StorageMetadata>,
@@ -447,7 +413,7 @@ impl RemoteStorage for S3Bucket {
let started_at = start_measuring_requests(kind);
let body = Body::wrap_stream(from);
let body = Body::wrap_stream(ReaderStream::new(from));
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
let res = self

View File

@@ -1,8 +1,6 @@
//! This module provides a wrapper around a real RemoteStorage implementation that
//! causes the first N attempts at each upload or download operation to fail. For
//! testing purposes.
use bytes::Bytes;
use futures::stream::Stream;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Mutex;
@@ -110,7 +108,7 @@ impl RemoteStorage for UnreliableWrapper {
async fn upload(
&self,
data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
// S3 PUT request requires the content length to be specified,
// otherwise it starts to fail with the concurrent connection count increasing.
data_size_bytes: usize,

View File

@@ -7,9 +7,7 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use bytes::Bytes;
use camino::Utf8Path;
use futures::stream::Stream;
use once_cell::sync::OnceCell;
use remote_storage::{
AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -182,14 +180,23 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let data1 = "remote blob data1".as_bytes();
let data1_len = data1.len();
let data2 = "remote blob data2".as_bytes();
let data2_len = data2.len();
let data3 = "remote blob data3".as_bytes();
let data3_len = data3.len();
ctx.client
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
.await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
ctx.client
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
.await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
.await?;
ctx.client.delete_objects(&[path1, path2]).await?;
@@ -212,56 +219,53 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
let data = "remote blob data here".as_bytes();
let data_len = data.len() as u64;
let (data, len) = wrap_stream(orig.clone());
ctx.client.upload(data, len, &path, None).await?;
async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
let mut buf = Vec::new();
tokio::io::copy_buf(
&mut tokio_util::io::StreamReader::new(dl.download_stream),
&mut buf,
)
ctx.client
.upload(std::io::Cursor::new(data), data.len(), &path, None)
.await?;
async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
let mut buf = Vec::new();
tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
Ok(buf)
}
// Normal download request
let dl = ctx.client.download(&path).await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig);
assert_eq!(buf, data);
// Full range (end specified)
let dl = ctx
.client
.download_byte_range(&path, 0, Some(len as u64))
.download_byte_range(&path, 0, Some(data_len))
.await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig);
assert_eq!(buf, data);
// partial range (end specified)
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig[4..10]);
assert_eq!(buf, data[4..10]);
// partial range (end beyond real end)
let dl = ctx
.client
.download_byte_range(&path, 8, Some(len as u64 * 100))
.download_byte_range(&path, 8, Some(data_len * 100))
.await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig[8..]);
assert_eq!(buf, data[8..]);
// Partial range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig[4..]);
assert_eq!(buf, data[4..]);
// Full range (end unspecified)
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
let buf = download_and_compare(dl).await?;
assert_eq!(&buf, &orig);
assert_eq!(buf, data);
debug!("Cleanup: deleting file at path {path:?}");
ctx.client
@@ -500,8 +504,11 @@ async fn upload_azure_data(
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client.upload(data, len, &blob_path, None).await?;
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
});
@@ -582,8 +589,11 @@ async fn upload_simple_azure_data(
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client.upload(data, len, &blob_path, None).await?;
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>(blob_path)
});
@@ -612,32 +622,3 @@ async fn upload_simple_azure_data(
ControlFlow::Continue(uploaded_blobs)
}
}
// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
// to binary
/// Turns borrowed-or-owned bytes into a one-chunk upload stream plus its length.
///
/// Static slices are wrapped without copying; owned vectors are converted into [`Bytes`].
fn upload_stream(
    content: std::borrow::Cow<'static, [u8]>,
) -> (
    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
    usize,
) {
    use std::borrow::Cow;

    wrap_stream(match content {
        Cow::Borrowed(slice) => Bytes::from_static(slice),
        Cow::Owned(buf) => Bytes::from(buf),
    })
}
/// Returns a stream that yields `content` as its single item, together with `content`'s
/// length in bytes.
fn wrap_stream(
    content: bytes::Bytes,
) -> (
    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
    usize,
) {
    let size = content.len();
    let single_chunk = futures::stream::once(futures::future::ready(Ok(content)));
    (single_chunk, size)
}

View File

@@ -7,9 +7,7 @@ use std::sync::Arc;
use std::time::UNIX_EPOCH;
use anyhow::Context;
use bytes::Bytes;
use camino::Utf8Path;
use futures::stream::Stream;
use once_cell::sync::OnceCell;
use remote_storage::{
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -178,14 +176,23 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
.with_context(|| "RemotePath conversion")?;
let (data, len) = upload_stream("remote blob data1".as_bytes().into());
ctx.client.upload(data, len, &path1, None).await?;
let data1 = "remote blob data1".as_bytes();
let data1_len = data1.len();
let data2 = "remote blob data2".as_bytes();
let data2_len = data2.len();
let data3 = "remote blob data3".as_bytes();
let data3_len = data3.len();
ctx.client
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
.await?;
let (data, len) = upload_stream("remote blob data2".as_bytes().into());
ctx.client.upload(data, len, &path2, None).await?;
ctx.client
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
.await?;
let (data, len) = upload_stream("remote blob data3".as_bytes().into());
ctx.client.upload(data, len, &path3, None).await?;
ctx.client
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
.await?;
ctx.client.delete_objects(&[path1, path2]).await?;
@@ -425,9 +432,11 @@ async fn upload_s3_data(
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, data_len) =
upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client.upload(data, data_len, &blob_path, None).await?;
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
});
@@ -508,9 +517,11 @@ async fn upload_simple_s3_data(
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
debug!("Creating remote item {i} at path {blob_path:?}");
let (data, data_len) =
upload_stream(format!("remote blob data {i}").into_bytes().into());
task_client.upload(data, data_len, &blob_path, None).await?;
let data = format!("remote blob data {i}").into_bytes();
let data_len = data.len();
task_client
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
.await?;
Ok::<_, anyhow::Error>(blob_path)
});
@@ -539,30 +550,3 @@ async fn upload_simple_s3_data(
ControlFlow::Continue(uploaded_blobs)
}
}
/// Builds an upload stream (and its byte length) from borrowed-or-owned content.
///
/// Borrowed `'static` slices are converted with `Bytes::from_static`, owned
/// vectors with `Bytes::from`; the resulting `Bytes` is then handed to
/// `wrap_stream`, which produces the stream/length pair.
fn upload_stream(
    content: std::borrow::Cow<'static, [u8]>,
) -> (
    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
    usize,
) {
    let bytes = match content {
        std::borrow::Cow::Borrowed(slice) => Bytes::from_static(slice),
        std::borrow::Cow::Owned(buf) => Bytes::from(buf),
    };
    wrap_stream(bytes)
}
/// Wraps `content` into a stream that yields it exactly once as `Ok(content)`.
///
/// Returns the stream together with `content.len()`, measured before the
/// buffer is moved into the stream.
fn wrap_stream(
    content: bytes::Bytes,
) -> (
    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
    usize,
) {
    let byte_count = content.len();
    let single_item = futures::stream::once(futures::future::ready(Ok(content)));
    (single_item, byte_count)
}

View File

@@ -1,14 +1,16 @@
use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion(TaskTrackerToken);
pub struct Completion(mpsc::Sender<()>);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
pub struct Barrier(TaskTracker);
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
impl Default for Barrier {
fn default() -> Self {
@@ -19,7 +21,7 @@ impl Default for Barrier {
impl Barrier {
pub async fn wait(self) {
self.0.wait().await;
self.0.lock().await.recv().await;
}
pub async fn maybe_wait(barrier: Option<Barrier>) {
@@ -31,7 +33,8 @@ impl Barrier {
impl PartialEq for Barrier {
fn eq(&self, other: &Self) -> bool {
TaskTracker::ptr_eq(&self.0, &other.0)
// we don't use dyn so this is good
Arc::ptr_eq(&self.0, &other.0)
}
}
@@ -39,10 +42,8 @@ impl Eq for Barrier {}
/// Create new Guard and Barrier pair.
pub fn channel() -> (Completion, Barrier) {
let tracker = TaskTracker::new();
// otherwise wait never exits
tracker.close();
let token = tracker.token();
(Completion(token), Barrier(tracker))
let (tx, rx) = mpsc::channel::<()>(1);
let rx = Mutex::new(rx);
let rx = Arc::new(rx);
(Completion(tx), Barrier(rx))
}

View File

@@ -1,7 +1,6 @@
use std::str::FromStr;
use anyhow::Context;
use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, EnumVariantNames};
@@ -25,48 +24,16 @@ impl LogFormat {
}
}
/// One counter per tracing level, held as separate fields so that a counter
/// can be picked by matching on the level.
struct TracingEventCountMetric {
// Counter for `tracing::Level::ERROR` events.
error: IntCounter,
// Counter for `tracing::Level::WARN` events.
warn: IntCounter,
// Counter for `tracing::Level::INFO` events.
info: IntCounter,
// Counter for `tracing::Level::DEBUG` events.
debug: IntCounter,
// Counter for `tracing::Level::TRACE` events.
trace: IntCounter,
}
static TRACING_EVENT_COUNT_METRIC: Lazy<TracingEventCountMetric> = Lazy::new(|| {
let vec = metrics::register_int_counter_vec!(
static TRACING_EVENT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
metrics::register_int_counter_vec!(
"libmetrics_tracing_event_count",
"Number of tracing events, by level",
&["level"]
)
.expect("failed to define metric");
TracingEventCountMetric::new(vec)
.expect("failed to define metric")
});
impl TracingEventCountMetric {
fn new(vec: IntCounterVec) -> Self {
Self {
error: vec.with_label_values(&["error"]),
warn: vec.with_label_values(&["warn"]),
info: vec.with_label_values(&["info"]),
debug: vec.with_label_values(&["debug"]),
trace: vec.with_label_values(&["trace"]),
}
}
fn inc_for_level(&self, level: tracing::Level) {
let counter = match level {
tracing::Level::ERROR => &self.error,
tracing::Level::WARN => &self.warn,
tracing::Level::INFO => &self.info,
tracing::Level::DEBUG => &self.debug,
tracing::Level::TRACE => &self.trace,
};
counter.inc();
}
}
struct TracingEventCountLayer(&'static TracingEventCountMetric);
struct TracingEventCountLayer(&'static metrics::IntCounterVec);
impl<S> tracing_subscriber::layer::Layer<S> for TracingEventCountLayer
where
@@ -77,7 +44,15 @@ where
event: &tracing::Event<'_>,
_ctx: tracing_subscriber::layer::Context<'_, S>,
) {
self.0.inc_for_level(*event.metadata().level());
let level = event.metadata().level();
let level = match *level {
tracing::Level::ERROR => "error",
tracing::Level::WARN => "warn",
tracing::Level::INFO => "info",
tracing::Level::DEBUG => "debug",
tracing::Level::TRACE => "trace",
};
self.0.with_label_values(&[level]).inc();
}
}
@@ -131,9 +106,7 @@ pub fn init(
};
log_layer.with_filter(rust_log_env_filter())
});
let r = r.with(
TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
);
let r = r.with(TracingEventCountLayer(&TRACING_EVENT_COUNT).with_filter(rust_log_env_filter()));
match tracing_error_layer_enablement {
TracingErrorLayerEnablement::EnableWithRustLogFilter => r
.with(tracing_error::ErrorLayer::default().with_filter(rust_log_env_filter()))
@@ -284,14 +257,14 @@ impl std::fmt::Debug for SecretString {
mod tests {
use metrics::{core::Opts, IntCounterVec};
use crate::logging::{TracingEventCountLayer, TracingEventCountMetric};
use super::TracingEventCountLayer;
#[test]
fn tracing_event_count_metric() {
let counter_vec =
IntCounterVec::new(Opts::new("testmetric", "testhelp"), &["level"]).unwrap();
let metric = Box::leak(Box::new(TracingEventCountMetric::new(counter_vec.clone())));
let layer = TracingEventCountLayer(metric);
let counter_vec = Box::leak(Box::new(counter_vec)); // make it 'static
let layer = TracingEventCountLayer(counter_vec);
use tracing_subscriber::prelude::*;
tracing::subscriber::with_default(tracing_subscriber::registry().with(layer), || {

View File

@@ -1,10 +1,10 @@
//!
//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
//! similar to a lock, but it allows readers to "hold on" to an old value of RCU
//! without blocking writers, and allows writing a new value without blocking
//! readers. When you update the value, the new value is immediately visible
//! without blocking writers, and allows writing a new value without blocking
//! readers. When you update the value, the new value is immediately visible
//! to new readers, but the update waits until all existing readers have
//! finished, so that on return, no one sees the old value anymore.
//! finished, so that no one sees the old value anymore.
//!
//! This implementation isn't wait-free; it uses an RwLock that is held for a
//! short duration when the value is read or updated.
@@ -26,7 +26,6 @@
//! Increment the value by one, and wait for old readers to finish:
//!
//! ```
//! # async fn dox() {
//! # let rcu = utils::simple_rcu::Rcu::new(1);
//! let write_guard = rcu.lock_for_write();
//!
@@ -37,17 +36,15 @@
//!
//! // Concurrent reads and writes are now possible again. Wait for all the readers
//! // that still observe the old value to finish.
//! waitlist.wait().await;
//! # }
//! waitlist.wait();
//! ```
//!
#![warn(missing_docs)]
use std::ops::Deref;
use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
use std::sync::{Arc, Weak};
use std::sync::{RwLock, RwLockWriteGuard};
use tokio::sync::watch;
use std::sync::{Mutex, RwLock, RwLockWriteGuard};
///
/// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -71,21 +68,22 @@ struct RcuCell<V> {
value: V,
/// A dummy channel. We never send anything to this channel. The point is
/// that when the RcuCell is dropped, any subscribed Receivers will be notified
/// that when the RcuCell is dropped, any cloned Senders will be notified
/// that the channel is closed. Updaters can use this to wait out until the
/// RcuCell has been dropped, i.e. until the old value is no longer in use.
///
/// We never send anything to this, we just need to hold onto it so that the
/// Receivers will be notified when it's dropped.
watch: watch::Sender<()>,
/// We never do anything with the receiver, we just need to hold onto it so
/// that the Senders will be notified when it's dropped. But because it's
/// not Sync, we need a Mutex on it.
watch: (SyncSender<()>, Mutex<Receiver<()>>),
}
impl<V> RcuCell<V> {
fn new(value: V) -> Self {
let (watch_sender, _) = watch::channel(());
let (watch_sender, watch_receiver) = sync_channel(0);
RcuCell {
value,
watch: watch_sender,
watch: (watch_sender, Mutex::new(watch_receiver)),
}
}
}
@@ -143,10 +141,10 @@ impl<V> Deref for RcuReadGuard<V> {
///
/// Write guard returned by `write`
///
/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
/// held for a short duration!
/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
/// it should only be held for a short duration!
///
/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
/// Calling `store` consumes the guard, making new reads and new writes possible
/// again.
///
pub struct RcuWriteGuard<'a, V> {
@@ -181,7 +179,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
// the watches for any that do.
self.inner.old_cells.retain(|weak| {
if let Some(cell) = weak.upgrade() {
watches.push(cell.watch.subscribe());
watches.push(cell.watch.0.clone());
true
} else {
false
@@ -195,20 +193,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
///
/// List of readers who can still see old values.
///
pub struct RcuWaitList(Vec<watch::Receiver<()>>);
pub struct RcuWaitList(Vec<SyncSender<()>>);
impl RcuWaitList {
///
/// Wait for old readers to finish.
///
pub async fn wait(mut self) {
pub fn wait(mut self) {
// after all the old_cells are no longer in use, we're done
for w in self.0.iter_mut() {
// This will block until the Receiver is closed. That happens when
// the RcuCell is dropped.
#[allow(clippy::single_match)]
match w.changed().await {
Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
match w.send(()) {
Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
Err(_) => {
// closed, which means that the cell has been dropped, and
// its value is no longer in use
@@ -222,10 +220,11 @@ impl RcuWaitList {
mod tests {
use super::*;
use std::sync::{Arc, Mutex};
use std::thread::{sleep, spawn};
use std::time::Duration;
#[tokio::test]
async fn two_writers() {
#[test]
fn two_writers() {
let rcu = Rcu::new(1);
let read1 = rcu.read();
@@ -249,35 +248,33 @@ mod tests {
assert_eq!(*read1, 1);
let log = Arc::new(Mutex::new(Vec::new()));
// Wait for the old readers to finish in separate tasks.
// Wait for the old readers to finish in separate threads.
let log_clone = Arc::clone(&log);
let task2 = tokio::spawn(async move {
wait2.wait().await;
let thread2 = spawn(move || {
wait2.wait();
log_clone.lock().unwrap().push("wait2 done");
});
let log_clone = Arc::clone(&log);
let task3 = tokio::spawn(async move {
wait3.wait().await;
let thread3 = spawn(move || {
wait3.wait();
log_clone.lock().unwrap().push("wait3 done");
});
// without this sleep the test can pass by accident if the writer is slow
tokio::time::sleep(Duration::from_millis(100)).await;
sleep(Duration::from_millis(500));
// Release first reader. This allows first write to finish, but calling
// wait() on the 'task3' would still block.
// wait() on the second one would still block.
log.lock().unwrap().push("dropping read1");
drop(read1);
task2.await.unwrap();
thread2.join().unwrap();
assert!(!task3.is_finished());
tokio::time::sleep(Duration::from_millis(100)).await;
sleep(Duration::from_millis(500));
// Release second reader, and finish second writer.
log.lock().unwrap().push("dropping read2");
drop(read2);
task3.await.unwrap();
thread3.join().unwrap();
assert_eq!(
log.lock().unwrap().as_slice(),

View File

@@ -30,32 +30,18 @@ async fn warn_if_stuck<Fut: std::future::Future>(
let mut fut = std::pin::pin!(fut);
let mut warned = false;
let ret = loop {
loop {
match tokio::time::timeout(warn_period, &mut fut).await {
Ok(ret) => break ret,
Ok(ret) => return ret,
Err(_) => {
tracing::warn!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
warned = true;
}
}
};
// If we emitted a warning for slowness, also emit a message when we complete, so that
// someone debugging a shutdown can know for sure whether we have moved past this operation.
if warned {
tracing::info!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"completed, after taking longer than expected"
)
}
ret
}
#[derive(Debug)]

View File

@@ -436,9 +436,9 @@ mod tests {
event_mask: 0,
}),
expected_messages: vec![
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160001, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
vec![
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
@@ -478,7 +478,7 @@ mod tests {
// walproposer will panic when it finishes sync_safekeepers
std::panic::catch_unwind(|| wp.start()).unwrap_err();
// validate the resulting LSN
assert_eq!(receiver.try_recv(), Ok(1337));
assert_eq!(receiver.recv()?, 1337);
Ok(())
// drop() will free up resources here
}

View File

@@ -36,7 +36,6 @@ humantime.workspace = true
humantime-serde.workspace = true
hyper.workspace = true
itertools.workspace = true
md5.workspace = true
nix.workspace = true
# hack to get the number of worker threads tokio uses
num_cpus = { version = "1.15" }

View File

@@ -23,7 +23,6 @@ use tracing::*;
use tokio_tar::{Builder, EntryType, Header};
use crate::context::RequestContext;
use crate::pgdatadir_mapping::Version;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -175,7 +174,7 @@ where
] {
for segno in self
.timeline
.list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
.list_slru_segments(kind, self.lsn, self.ctx)
.await?
{
self.add_slru_segment(kind, segno).await?;
@@ -193,7 +192,7 @@ where
// Otherwise only include init forks of unlogged relations.
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
@@ -268,7 +267,7 @@ where
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
.get_rel_size(src, self.lsn, false, self.ctx)
.await?;
// If the relation is empty, create an empty file
@@ -289,7 +288,7 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
.get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
.await?;
segment_data.extend_from_slice(&img[..]);
}
@@ -311,7 +310,7 @@ where
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
.get_slru_segment_size(slru, segno, self.lsn, self.ctx)
.await?;
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -353,7 +352,7 @@ where
let relmap_img = if has_relmap_file {
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
.await?;
ensure!(
@@ -400,7 +399,7 @@ where
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.await?
.is_empty()
{

View File

@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use pageserver::tenant::TenantSharedResources;
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
@@ -425,6 +425,7 @@ fn start_pageserver(
let tenant_manager = Arc::new(tenant_manager);
BACKGROUND_RUNTIME.spawn({
let init_done_rx = init_done_rx;
let shutdown_pageserver = shutdown_pageserver.clone();
let drive_init = async move {
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -504,17 +505,6 @@ fn start_pageserver(
}
});
let secondary_controller = if let Some(remote_storage) = &remote_storage {
secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
)
} else {
secondary::null_controller()
};
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
// is still accessible even if background task is not configured as long as remote storage has
@@ -544,7 +534,6 @@ fn start_pageserver(
broker_client.clone(),
disk_usage_eviction_state,
deletion_queue.new_client(),
secondary_controller,
)
.context("Failed to initialize router state")?,
);
@@ -571,6 +560,7 @@ fn start_pageserver(
}
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let background_jobs_barrier = background_jobs_barrier;
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.

View File

@@ -70,10 +70,6 @@ pub mod defaults {
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
///
/// Default built-in configuration file.
///
@@ -86,7 +82,6 @@ pub mod defaults {
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
# initial superuser role name to use when creating a new tenant
@@ -106,8 +101,6 @@ pub mod defaults {
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
[tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -124,8 +117,6 @@ pub mod defaults {
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
#gc_feedback = false
#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
[remote_storage]
"#
@@ -224,13 +215,6 @@ pub struct PageServerConf {
/// If true, pageserver will make best-effort to operate without a control plane: only
/// for use in major incidents.
pub control_plane_emergency_mode: bool,
/// How many heatmap uploads may be done concurrently: lower values implicitly deprioritize
/// heatmap uploads vs. other remote storage operations.
pub heatmap_upload_concurrency: usize,
/// Maximum number of WAL records to be ingested and committed at the same time
pub ingest_batch_size: u64,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -309,10 +293,6 @@ struct PageServerConfigBuilder {
control_plane_api: BuilderValue<Option<Url>>,
control_plane_api_token: BuilderValue<Option<SecretString>>,
control_plane_emergency_mode: BuilderValue<bool>,
heatmap_upload_concurrency: BuilderValue<usize>,
ingest_batch_size: BuilderValue<u64>,
}
impl Default for PageServerConfigBuilder {
@@ -381,10 +361,6 @@ impl Default for PageServerConfigBuilder {
control_plane_api: Set(None),
control_plane_api_token: Set(None),
control_plane_emergency_mode: Set(false),
heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
}
}
}
@@ -525,14 +501,6 @@ impl PageServerConfigBuilder {
self.control_plane_emergency_mode = BuilderValue::Set(enabled)
}
/// Sets how many heatmap uploads may be done concurrently.
pub fn heatmap_upload_concurrency(&mut self, value: usize) {
    self.heatmap_upload_concurrency = BuilderValue::Set(value);
}
/// Sets the maximum number of WAL records to be ingested and committed at the same time.
pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
    self.ingest_batch_size = BuilderValue::Set(ingest_batch_size);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let concurrent_tenant_size_logical_size_queries = self
.concurrent_tenant_size_logical_size_queries
@@ -627,12 +595,6 @@ impl PageServerConfigBuilder {
control_plane_emergency_mode: self
.control_plane_emergency_mode
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
heatmap_upload_concurrency: self
.heatmap_upload_concurrency
.ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
ingest_batch_size: self
.ingest_batch_size
.ok_or(anyhow!("missing ingest_batch_size"))?,
})
}
}
@@ -866,11 +828,8 @@ impl PageServerConf {
},
"control_plane_emergency_mode" => {
builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
},
"heatmap_upload_concurrency" => {
builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
},
"ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -937,8 +896,6 @@ impl PageServerConf {
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
}
}
}
@@ -1163,9 +1120,7 @@ background_task_maximum_delay = '334 s'
)?,
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
control_plane_emergency_mode: false
},
"Correct defaults should be used when no config values are provided"
);
@@ -1222,9 +1177,7 @@ background_task_maximum_delay = '334 s'
background_task_maximum_delay: Duration::from_secs(334),
control_plane_api: None,
control_plane_api_token: None,
control_plane_emergency_mode: false,
heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
ingest_batch_size: 100,
control_plane_emergency_mode: false
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -3,7 +3,7 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -256,6 +256,8 @@ async fn calculate_synthetic_size_worker(
info!("calculate_synthetic_size_worker stopped");
};
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
loop {
let started_at = Instant::now();
@@ -267,25 +269,26 @@ async fn calculate_synthetic_size_worker(
}
};
for (tenant_shard_id, tenant_state) in tenants {
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
if !tenant_shard_id.is_zero() {
// We only send consumption metrics from shard 0, so don't waste time calculating
// synthetic size on other shards.
continue;
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
// which turns out is really handy to understand the system.
if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
if let Some(PageReconstructError::Cancelled) =
e.downcast_ref::<PageReconstructError>()
{
return Ok(());
}
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
}
}
let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
continue;
};
// there is never any reason to exit calculate_synthetic_size_worker following any
// return value -- we don't need to care about shutdown because no tenant is found when
// pageserver is shut down.
calculate_and_log(&tenant, cancel, ctx).await;
}
crate::tenant::tasks::warn_when_period_overrun(
@@ -296,7 +299,7 @@ async fn calculate_synthetic_size_worker(
let res = tokio::time::timeout_at(
started_at + synthetic_size_calculation_interval,
cancel.cancelled(),
task_mgr::shutdown_token().cancelled(),
)
.await;
if res.is_ok() {
@@ -304,31 +307,3 @@ async fn calculate_synthetic_size_worker(
}
}
}
/// Runs the synthetic size calculation for `tenant` and logs a failure, if any.
///
/// Nothing is returned: success is silent, and errors that merely indicate
/// cancellation or a stopping ancestor are swallowed without logging.
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
    let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

    // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
    // We can put in some prioritization for consumption metrics.
    // Same for the loop that fetches computed metrics.
    // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
    // which turns out is really handy to understand the system.
    let e = match tenant.calculate_synthetic_size(cause, cancel, ctx).await {
        Ok(_) => return,
        Err(e) => e,
    };

    // this error can be returned if timeline is shutting down, but it does not
    // mean the synthetic size worker should terminate. we do not need any checks
    // in this function because `mgr::get_tenant` will error out after shutdown has
    // progressed to shutting down tenants.
    match e.downcast_ref::<PageReconstructError>() {
        Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_)) => {}
        _ => {
            let tenant_shard_id = tenant.tenant_shard_id();
            error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
        }
    }
}

View File

@@ -2,6 +2,7 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
use pageserver_api::shard::ShardNumber;
use std::{sync::Arc, time::SystemTime};
use utils::{
id::{TenantId, TimelineId},
@@ -197,12 +198,12 @@ pub(super) async fn collect_all_metrics(
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
if state != TenantState::Active || !id.is_zero() {
if state != TenantState::Active {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.ok()
.map(|tenant| (id.tenant_id, tenant))
.map(|tenant| (id, tenant))
}
});
@@ -228,6 +229,11 @@ where
while let Some((tenant_id, tenant)) = tenants.next().await {
let mut tenant_resident_size = 0;
// Sharded tenants report all consumption metrics from shard zero
if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
continue;
}
for timeline in tenant.list_timelines() {
let timeline_id = timeline.timeline_id;

View File

@@ -312,18 +312,7 @@ impl ListWriter {
for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
if attached_gen.previous() == tenant_list.generation {
info!(
seq=%s, tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
old_gen=?tenant_list.generation, new_gen=?attached_gen,
"Updating gen on recovered list");
tenant_list.generation = *attached_gen;
} else {
info!(
seq=%s, tenant_id=%tenant_shard_id.tenant_id,
shard_id=%tenant_shard_id.shard_slug(),
old_gen=?tenant_list.generation, new_gen=?attached_gen,
"Encountered stale generation on recovered list");
}
}
}

View File

@@ -42,6 +42,7 @@
// reading these fields. We use the Debug impl for semi-structured logging, though.
use std::{
collections::HashMap,
sync::Arc,
time::{Duration, SystemTime},
};
@@ -124,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
_storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
cancel: CancellationToken,
) {
@@ -148,14 +149,8 @@ async fn disk_usage_eviction_task(
let start = Instant::now();
async {
let res = disk_usage_eviction_task_iteration(
state,
task_config,
storage,
tenants_dir,
&cancel,
)
.await;
let res =
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
match res {
Ok(()) => {}
@@ -186,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -274,9 +268,8 @@ struct LayerCount {
count: usize,
}
pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
_storage: &GenericRemoteStorage,
usage_pre: U,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
@@ -328,16 +321,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// Walk through the list of candidates, until we have accumulated enough layers to get
// us back under the pressure threshold. 'usage_planned' is updated so that it tracks
// how much disk space would be used after evicting all the layers up to the current
// point in the list.
// point in the list. The layers are collected in 'batched', grouped per timeline.
//
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
let mut warned = None;
let mut usage_planned = usage_pre;
let mut evicted_amount = 0;
for (i, (partition, candidate)) in candidates.iter().enumerate() {
let mut max_batch_size = 0;
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
if !usage_planned.has_pressure() {
debug!(
no_candidates_evicted = i,
@@ -346,13 +339,25 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
break;
}
if partition == &MinResidentSizePartition::Below && warned.is_none() {
if partition == MinResidentSizePartition::Below && warned.is_none() {
warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
warned = Some(usage_planned);
}
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
evicted_amount += 1;
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
// tasks to evict all seen layers until we have evicted enough
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
// The semaphore will later be used to limit eviction concurrency, and it can express at
// most u32::MAX permits. It is unlikely that we would have u32::MAX layers to evict,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
}
}
let usage_planned = match warned {
@@ -367,79 +372,100 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
};
debug!(?usage_planned, "usage planned");
// phase2: evict layers
// phase2: evict victims batched by timeline
let mut js = tokio::task::JoinSet::new();
let limit = 1000;
let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
let mut consumed_all = false;
// ratelimit to 1k files or any higher max batch size
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
for (timeline, batch) in batched {
let tenant_shard_id = timeline.tenant_shard_id;
let timeline_id = timeline.timeline_id;
let batch_size =
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
let evict_layers = async move {
loop {
let next = if js.len() >= limit || consumed_all {
js.join_next().await
} else if !js.is_empty() {
// opportunistically consume at most one ready result for each newly spawned eviction
futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
} else {
None
};
// I dislike naming of `available_permits` but it means current total amount of permits
// because permits can be added
assert!(batch_size as usize <= limit.available_permits());
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
usage_assumed.add_available_bytes(file_size);
debug!(%timeline_id, "evicting batch for timeline");
let evict = {
let limit = limit.clone();
let cancel = cancel.clone();
async move {
let mut evicted_bytes = 0;
let mut evictions_failed = LayerCount::default();
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
// semaphore closing means cancelled
return (evicted_bytes, evictions_failed);
};
let results = timeline.evict_layers(&batch).await;
match results {
Ok(results) => {
assert_eq!(results.len(), batch.len());
for (result, layer) in results.into_iter().zip(batch.iter()) {
let file_size = layer.layer_desc().file_size;
match result {
Some(Ok(())) => {
evicted_bytes += file_size;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
None => {
assert!(cancel.is_cancelled());
}
}
}
}
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
Err(e) => {
warn!("failed to evict batch: {:#}", e);
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
(evicted_bytes, evictions_failed)
}
if consumed_all && js.is_empty() {
break;
}
// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
consumed_all = true;
continue;
};
js.spawn(async move {
let rtc = candidate.timeline.remote_client.as_ref().expect(
"holding the witness, all timelines must have a remote timeline client",
);
let file_size = candidate.layer.layer_desc().file_size;
candidate
.layer
.evict_and_wait(rtc)
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
tokio::task::yield_now().await;
}
.instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));
js.spawn(evict);
// spawning thousands of these is essentially blocking, so give the already spawned
// tasks a chance to make progress
tokio::task::yield_now().await;
}
let join_all = async move {
// After the evictions, `usage_assumed` is the post-eviction usage,
// according to internal accounting.
let mut usage_assumed = usage_pre;
let mut evictions_failed = LayerCount::default();
while let Some(res) = js.join_next().await {
match res {
Ok((evicted_bytes, failed)) => {
usage_assumed.add_available_bytes(evicted_bytes);
evictions_failed.file_sizes += failed.file_sizes;
evictions_failed.count += failed.count;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
(usage_assumed, evictions_failed)
};
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = evict_layers => { tuple },
tuple = join_all => { tuple },
_ = cancel.cancelled() => {
// dropping joinset will abort all pending evict_and_waits and that is fine, our
// requests will still stand
// close the semaphore to stop any pending acquires
limit.close();
return Ok(IterationOutcome::Cancelled);
}
};

View File

@@ -84,6 +84,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: Get tenant status
responses:
@@ -180,6 +181,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: Get timelines for tenant
responses:
@@ -230,6 +232,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -335,6 +338,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -397,6 +401,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -464,6 +469,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
@@ -517,6 +523,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Schedules attach operation to happen in the background for the given tenant.
@@ -624,6 +631,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: flush_ms
in: query
required: false
@@ -716,6 +724,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: detach_ignored
in: query
required: false
@@ -775,6 +784,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Remove tenant data (including all corresponding timelines) from pageserver's memory.
@@ -823,6 +833,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Schedules an operation that attempts to load a tenant from the local disk and
@@ -879,6 +890,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: |
Calculate tenant's synthetic size
@@ -921,6 +933,7 @@ paths:
required: true
schema:
type: string
format: hex
- name: inputs_only
in: query
required: false
@@ -990,6 +1003,7 @@ paths:
required: true
schema:
type: string
format: hex
post:
description: |
Create a timeline. Returns new timeline id on success.\
@@ -1123,6 +1137,7 @@ paths:
application/json:
schema:
type: string
format: hex
"400":
description: Malformed tenant create request
content:
@@ -1219,6 +1234,7 @@ paths:
required: true
schema:
type: string
format: hex
get:
description: |
Returns tenant's config description: specific config overrides a tenant has
@@ -1324,6 +1340,7 @@ components:
properties:
new_tenant_id:
type: string
format: hex
generation:
type: integer
description: Attachment generation number.
@@ -1352,6 +1369,7 @@ components:
properties:
tenant_id:
type: string
format: hex
TenantLocationConfigRequest:
type: object
required:
@@ -1359,6 +1377,7 @@ components:
properties:
tenant_id:
type: string
format: hex
mode:
type: string
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
@@ -1405,8 +1424,6 @@ components:
type: integer
trace_read_requests:
type: boolean
heatmap_period:
type: integer
TenantConfigResponse:
type: object
properties:
@@ -1429,6 +1446,7 @@ components:
format: hex
tenant_id:
type: string
format: hex
last_record_lsn:
type: string
format: hex

View File

@@ -42,7 +42,6 @@ use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::CompactFlags;
@@ -76,11 +75,9 @@ pub struct State {
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
}
impl State {
#[allow(clippy::too_many_arguments)]
pub fn new(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
@@ -89,7 +86,6 @@ impl State {
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
.iter()
@@ -104,7 +100,6 @@ impl State {
broker_client,
disk_usage_eviction_state,
deletion_queue_client,
secondary_controller,
})
}
@@ -141,6 +136,11 @@ impl From<PageReconstructError> for ApiError {
fn from(pre: PageReconstructError) -> ApiError {
match pre {
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
PageReconstructError::NeedsDownload(_, _) => {
// This shouldn't happen, because we use a RequestContext that requests to
// download any missing layer files on-demand.
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
}
PageReconstructError::Cancelled => {
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
}
@@ -319,7 +319,6 @@ async fn build_timeline_info_common(
ctx: &RequestContext,
) -> anyhow::Result<TimelineInfo> {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
let initdb_lsn = timeline.initdb_lsn;
let last_record_lsn = timeline.get_last_record_lsn();
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
let guard = timeline.last_received_wal.lock().unwrap();
@@ -353,14 +352,14 @@ async fn build_timeline_info_common(
let walreceiver_status = timeline.walreceiver_status();
let info = TimelineInfo {
tenant_id: timeline.tenant_shard_id,
// TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id
tenant_id: timeline.tenant_shard_id.tenant_id,
timeline_id: timeline.timeline_id,
ancestor_timeline_id,
ancestor_lsn,
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
remote_consistent_lsn: remote_consistent_lsn_projected,
remote_consistent_lsn_visible,
initdb_lsn,
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
@@ -481,15 +480,15 @@ async fn timeline_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let response_data = async {
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let tenant = mgr::get_tenant(tenant_id, true)?;
let timelines = tenant.list_timelines();
let mut response_data = Vec::with_capacity(timelines.len());
@@ -508,9 +507,7 @@ async fn timeline_list_handler(
}
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
}
.instrument(info_span!("timeline_list",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()))
.instrument(info_span!("timeline_list", %tenant_id))
.await?;
json_response(StatusCode::OK, response_data)
@@ -520,17 +517,17 @@ async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size: Option<bool> =
parse_query_param(&request, "include-non-incremental-logical-size")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
// Logical size calculation needs downloading.
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline_info = async {
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let tenant = mgr::get_tenant(tenant_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, false)
@@ -547,10 +544,7 @@ async fn timeline_detail_handler(
Ok::<_, ApiError>(timeline_info)
}
.instrument(info_span!("timeline_detail",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
%timeline_id))
.instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
.await?;
json_response(StatusCode::OK, timeline_info)
@@ -560,15 +554,8 @@ async fn get_lsn_by_timestamp_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
)));
}
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let version: Option<u8> = parse_query_param(&request, "version")?;
@@ -580,7 +567,7 @@ async fn get_lsn_by_timestamp_handler(
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
.await?;
@@ -615,15 +602,8 @@ async fn get_timestamp_of_lsn_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
if !tenant_shard_id.is_zero() {
// Requires SLRU contents, which are only stored on shard zero
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
)));
}
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -633,7 +613,7 @@ async fn get_timestamp_of_lsn_handler(
.map_err(ApiError::BadRequest)?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
match result {
@@ -825,11 +805,11 @@ async fn tenant_status(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant_info = async {
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let tenant = mgr::get_tenant(tenant_id, false)?;
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
@@ -839,15 +819,13 @@ async fn tenant_status(
let state = tenant.current_state();
Result::<_, ApiError>::Ok(TenantInfo {
id: tenant_shard_id,
id: tenant_id,
state: state.clone(),
current_physical_size: Some(current_physical_size),
attachment_status: state.attachment_status(),
})
}
.instrument(info_span!("tenant_status_handler",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()))
.instrument(info_span!("tenant_status_handler", %tenant_id))
.await?;
json_response(StatusCode::OK, tenant_info)
@@ -866,7 +844,7 @@ async fn tenant_delete_handler(
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
.instrument(info_span!("tenant_delete_handler",
tenant_id = %tenant_shard_id.tenant_id,
shard = %tenant_shard_id.shard_slug()
shard = tenant_shard_id.shard_slug()
))
.await?;
@@ -890,20 +868,14 @@ async fn tenant_size_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
let headers = request.headers();
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
if !tenant_shard_id.is_zero() {
return Err(ApiError::BadRequest(anyhow!(
"Size calculations are only available on shard zero"
)));
}
let tenant = mgr::get_tenant(tenant_id, true)?;
// this can be long operation
let inputs = tenant
@@ -955,7 +927,7 @@ async fn tenant_size_handler(
json_response(
StatusCode::OK,
TenantHistorySize {
id: tenant_shard_id.tenant_id,
id: tenant_id,
size: sizes.as_ref().map(|x| x.total_size),
segment_sizes: sizes.map(|x| x.segments),
inputs,
@@ -967,14 +939,14 @@ async fn layer_map_info_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let reset: LayerAccessStatsReset =
parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let layer_map_info = timeline.layer_map_info(reset).await;
json_response(StatusCode::OK, layer_map_info)
@@ -984,12 +956,13 @@ async fn layer_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let downloaded = timeline
.download_layer(layer_file_name)
.await
@@ -1000,7 +973,7 @@ async fn layer_download_handler(
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
None => json_response(
StatusCode::BAD_REQUEST,
format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
),
}
}
@@ -1009,12 +982,12 @@ async fn evict_timeline_layer_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let layer_file_name = get_request_param(&request, "layer_file_name")?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let evicted = timeline
.evict_layer(layer_file_name)
.await
@@ -1025,7 +998,7 @@ async fn evict_timeline_layer_handler(
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
None => json_response(
StatusCode::BAD_REQUEST,
format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
),
}
}
@@ -1157,10 +1130,10 @@ async fn get_tenant_config_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
let tenant = mgr::get_tenant(tenant_id, false)?;
let response = HashMap::from([
(
@@ -1220,7 +1193,7 @@ async fn put_tenant_location_config_handler(
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach",
tenant_id = %tenant_shard_id.tenant_id,
shard = %tenant_shard_id.shard_slug()
shard = tenant_shard_id.shard_slug()
))
.await
{
@@ -1254,9 +1227,9 @@ async fn handle_tenant_break(
r: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test".to_owned()).await;
@@ -1297,15 +1270,14 @@ async fn timeline_gc_handler(
mut request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let wait_task_done =
mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
@@ -1320,9 +1292,9 @@ async fn timeline_compact_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1330,14 +1302,14 @@ async fn timeline_compact_handler(
}
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
.await
}
@@ -1346,9 +1318,9 @@ async fn timeline_checkpoint_handler(
request: Request<Body>,
cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let mut flags = EnumSet::empty();
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
@@ -1356,7 +1328,7 @@ async fn timeline_checkpoint_handler(
}
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
timeline
.freeze_and_flush()
.await
@@ -1368,7 +1340,7 @@ async fn timeline_checkpoint_handler(
json_response(StatusCode::OK, ())
}
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
.await
}
@@ -1376,12 +1348,12 @@ async fn timeline_download_remote_layers_handler_post(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
match timeline.spawn_download_all_remote_layers(body).await {
Ok(st) => json_response(StatusCode::ACCEPTED, st),
Err(st) => json_response(StatusCode::CONFLICT, st),
@@ -1392,11 +1364,11 @@ async fn timeline_download_remote_layers_handler_get(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let info = timeline
.get_download_all_remote_layers_task_info()
.context("task never started since last pageserver process start")
@@ -1442,9 +1414,9 @@ async fn getpage_at_lsn_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
struct Key(crate::repository::Key);
@@ -1463,7 +1435,7 @@ async fn getpage_at_lsn_handler(
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let page = timeline.get(key.0, lsn, &ctx).await?;
@@ -1475,7 +1447,7 @@ async fn getpage_at_lsn_handler(
.unwrap(),
)
}
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
.await
}
@@ -1483,9 +1455,9 @@ async fn timeline_collect_keyspace(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
check_permission(&request, Some(tenant_id))?;
struct Partitioning {
keys: crate::keyspace::KeySpace,
@@ -1554,7 +1526,7 @@ async fn timeline_collect_keyspace(
async {
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
@@ -1563,15 +1535,15 @@ async fn timeline_collect_keyspace(
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
}
.instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
.await
}
async fn active_timeline_of_active_tenant(
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> {
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
let tenant = mgr::get_tenant(tenant_id, true)?;
tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))
@@ -1593,7 +1565,7 @@ async fn always_panic_handler(
async fn disk_usage_eviction_run(
mut r: Request<Body>,
cancel: CancellationToken,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
check_permission(&r, None)?;
@@ -1630,41 +1602,48 @@ async fn disk_usage_eviction_run(
freed_bytes: 0,
};
let (tx, rx) = tokio::sync::oneshot::channel();
let state = get_state(&r);
let Some(storage) = state.remote_storage.as_ref() else {
if state.remote_storage.as_ref().is_none() {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"remote storage not configured, cannot run eviction iteration"
)));
};
}
let state = state.disk_usage_eviction_state.clone();
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state, storage, usage, &cancel,
)
.await;
let cancel = CancellationToken::new();
let child_cancel = cancel.clone();
let _g = cancel.drop_guard();
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
crate::task_mgr::spawn(
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::DiskUsageEviction,
None,
None,
"ondemand disk usage eviction",
false,
async move {
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state,
usage,
&child_cancel,
)
.await;
let res = res.map_err(ApiError::InternalServerError)?;
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
json_response(StatusCode::OK, res)
}
let _ = tx.send(res);
Ok(())
}
.in_current_span(),
);
async fn secondary_upload_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
state
.secondary_controller
.upload_tenant(tenant_shard_id)
.await
.map_err(ApiError::InternalServerError)?;
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
json_response(StatusCode::OK, response)
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1841,25 +1820,23 @@ pub fn make_router(
})
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
.get("/v1/tenant/:tenant_shard_id", |r| {
api_handler(r, tenant_status)
})
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
.delete("/v1/tenant/:tenant_shard_id", |r| {
api_handler(r, tenant_delete_handler)
})
.get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
api_handler(r, tenant_size_handler)
})
.put("/v1/tenant/config", |r| {
api_handler(r, update_tenant_config_handler)
})
.get("/v1/tenant/:tenant_shard_id/config", |r| {
.get("/v1/tenant/:tenant_id/config", |r| {
api_handler(r, get_tenant_config_handler)
})
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
api_handler(r, put_tenant_location_config_handler)
})
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
.get("/v1/tenant/:tenant_id/timeline", |r| {
api_handler(r, timeline_list_handler)
})
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
@@ -1880,74 +1857,67 @@ pub fn make_router(
.post("/v1/tenant/:tenant_id/ignore", |r| {
api_handler(r, tenant_ignore_handler)
})
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
})
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp",
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|r| api_handler(r, get_lsn_by_timestamp_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
|r| api_handler(r, get_timestamp_of_lsn_handler),
)
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
api_handler(r, timeline_gc_handler)
})
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
testing_api_handler("run timeline compaction", r, timeline_compact_handler)
})
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|r| api_handler(r, timeline_gc_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
)
.put(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_post),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|r| api_handler(r, timeline_download_remote_layers_handler_get),
)
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_delete_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
api_handler(r, layer_map_info_handler)
})
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
|r| api_handler(r, layer_map_info_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, layer_download_handler),
)
.delete(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|r| api_handler(r, evict_timeline_layer_handler),
)
.post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| {
api_handler(r, secondary_upload_handler)
})
.put("/v1/disk_usage_eviction/run", |r| {
api_handler(r, disk_usage_eviction_run)
})
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.put("/v1/tenant/:tenant_shard_id/break", |r| {
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
.post("/v1/tracing/event", |r| {
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
})
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
})
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
)
.get(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
"/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
)
.any(handler_404))

View File

@@ -2,8 +2,9 @@
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
//! a neon Timeline.
//!
use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::task::{self, Poll};
use anyhow::{bail, ensure, Context, Result};
use async_compression::tokio::bufread::ZstdDecoder;
@@ -12,8 +13,7 @@ use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use nix::NixPath;
use tokio::fs::{File, OpenOptions};
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
use tokio_tar::Archive;
use tokio_tar::Builder;
use tokio_tar::HeaderMode;
@@ -21,7 +21,6 @@ use tracing::*;
use walkdir::WalkDir;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::Timeline;
@@ -313,17 +312,13 @@ async fn import_wal(
waldecoder.feed_bytes(&buf);
let mut nrecords = 0;
let mut modification = tline.begin_modification(last_lsn);
let mut modification = tline.begin_modification(endpoint);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
modification.commit(ctx).await?;
WAL_INGEST.records_committed.inc();
last_lsn = lsn;
nrecords += 1;
@@ -453,15 +448,13 @@ pub async fn import_wal_from_tar(
waldecoder.feed_bytes(&bytes[offset..]);
let mut modification = tline.begin_modification(last_lsn);
let mut modification = tline.begin_modification(end_lsn);
let mut decoded = DecodedWALRecord::default();
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
.await?;
modification.commit(ctx).await?;
WAL_INGEST.records_committed.inc();
last_lsn = lsn;
debug!("imported record at {} (end {})", lsn, end_lsn);
@@ -636,16 +629,70 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
Ok(Bytes::from(buf))
}
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tmp_path)
.await
.with_context(|| format!("tempfile creation {tmp_path}"))?;
/// A growable in-memory byte sink whose `AsyncWrite` implementation periodically
/// hands control back to the async executor.
///
/// The yield decision is taken once per write attempt, so the number of yields is
/// bounded from above by the number of times `poll_write` is called; the size of the
/// individual chunks does not add further yields. This is deliberate: the sink is
/// meant to give the executor breathing room between units of CPU-intensive buffer
/// preparation. By the time a write is issued the whole chunk has already been
/// prepared, so splitting up the memcopy itself would gain nothing.
struct YieldingVec {
    /// Running byte count consulted by `should_yield` to decide when a yield is due.
    yield_budget: usize,
    /// Destination of all written bytes.
    buf: Vec<u8>,
}

impl YieldingVec {
    /// Creates an empty sink with a zeroed yield budget.
    fn new() -> Self {
        Self {
            buf: Vec::new(),
            yield_budget: 0,
        }
    }

    /// Decides whether a write of `add_buf_len` bytes should yield before proceeding.
    ///
    /// Returns `true` when appending the bytes would carry the buffer length into a
    /// later `YIELD_DIST`-sized bucket than the one the yield budget currently sits
    /// in. Whenever the budget lags behind the prospective length it is advanced by
    /// `add_buf_len`, so that retrying the same write right after a yield goes through.
    fn should_yield(&mut self, add_buf_len: usize) -> bool {
        // Kept small so that we are a good async citizen and yield repeatedly, but
        // coarse enough that a stream of tiny writes does not yield on every call.
        const YIELD_DIST: usize = 1024;

        let prospective_len = self.buf.len() + add_buf_len;
        let crosses_boundary = prospective_len / YIELD_DIST > self.yield_budget / YIELD_DIST;
        if prospective_len > self.yield_budget {
            self.yield_budget += add_buf_len;
        }
        crosses_boundary
    }
}
/// Writes append to the in-memory buffer; flush and shutdown are no-ops.
impl AsyncWrite for YieldingVec {
    /// Appends `buf` in full, unless `should_yield` asks for a yield first.
    ///
    /// When yielding, the task's waker is invoked before `Poll::Pending` is returned,
    /// so the task is rescheduled immediately rather than waiting for an external event.
    fn poll_write(
        self: Pin<&mut Self>,
        cx: &mut task::Context<'_>,
        buf: &[u8],
    ) -> Poll<std::io::Result<usize>> {
        let this = self.get_mut();
        if !this.should_yield(buf.len()) {
            this.buf.extend_from_slice(buf);
            return Poll::Ready(Ok(buf.len()));
        }
        // Self-wake: nothing external will wake us, we only want to give way briefly.
        cx.waker().wake_by_ref();
        Poll::Pending
    }

    /// Nothing is buffered beyond the `Vec` itself, so flushing completes immediately.
    fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
        Poll::Ready(Ok(()))
    }

    /// Shutdown has no work to do and completes immediately.
    fn poll_shutdown(
        self: Pin<&mut Self>,
        _cx: &mut task::Context<'_>,
    ) -> Poll<std::io::Result<()>> {
        Poll::Ready(Ok(()))
    }
}
pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
let mut paths = Vec::new();
for entry in WalkDir::new(pgdata_path) {
let entry = entry?;
@@ -660,7 +707,7 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Resu
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
YieldingVec::new(),
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
@@ -678,14 +725,13 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Resu
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
let compressed = zstd.into_inner();
let compressed_len = compressed.buf.len();
const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
}
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
Ok(compressed.buf)
}
pub async fn extract_tar_zst(

View File

@@ -2,10 +2,9 @@ use enum_map::EnumMap;
use metrics::metric_vec_duration::DurationResultObserver;
use metrics::{
register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
register_int_gauge, register_int_gauge_vec, register_uint_gauge, register_uint_gauge_vec,
Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPairVec,
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
register_int_counter, register_int_counter_vec, register_int_gauge, register_int_gauge_vec,
register_uint_gauge, register_uint_gauge_vec, Counter, CounterVec, GaugeVec, Histogram,
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
@@ -286,63 +285,6 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
},
});
/// Metrics describing how each victim search (`find_victim`) in the page cache ended.
pub(crate) mod page_cache_eviction_metrics {
    use std::num::NonZeroUsize;

    use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;

    /// How a single victim search ended, together with the number of loop
    /// iterations it took to get there.
    #[derive(Clone, Copy)]
    pub(crate) enum Outcome {
        /// A slot was found that held no cached page.
        FoundSlotUnused { iters: NonZeroUsize },
        /// A slot was found and its previous mapping was removed.
        FoundSlotEvicted { iters: NonZeroUsize },
        /// The search gave up after exceeding the iteration limit.
        ItersExceeded { iters: NonZeroUsize },
    }

    static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
        register_int_counter_vec!(
            "pageserver_page_cache_find_victim_iters_total",
            "Counter for the number of iterations in the find_victim loop",
            &["outcome"],
        )
        .expect("failed to define a metric")
    });

    static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
        register_int_counter_vec!(
            "pageserver_page_cache_find_victim_calls",
            // The space before the line continuation is required: `\` swallows the
            // newline and all leading whitespace of the following line.
            "Incremented at the end of each find_victim() call. \
             Filter by outcome to get e.g., eviction rate.",
            &["outcome"]
        )
        .expect("failed to define a metric")
    });

    /// Records one finished victim search.
    ///
    /// Adds the iteration count to the per-outcome iteration counter and bumps the
    /// per-outcome call counter. `Outcome::ItersExceeded` is additionally reported
    /// as a page cache error of kind `EvictIterLimit`.
    pub(crate) fn observe(outcome: Outcome) {
        // Resolve the labelled counters once per outcome and cache them in statics,
        // so that the label lookup is not repeated on every call.
        macro_rules! dry {
            ($label:literal, $iters:expr) => {{
                static LABEL: &str = $label;
                static ITERS_TOTAL: Lazy<IntCounter> =
                    Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
                static CALLS: Lazy<IntCounter> =
                    Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
                ITERS_TOTAL.inc_by(($iters.get()) as u64);
                CALLS.inc();
            }};
        }
        match outcome {
            Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
            Outcome::FoundSlotEvicted { iters } => {
                dry!("found_evicted", iters)
            }
            Outcome::ItersExceeded { iters } => {
                dry!("err_iters_exceeded", iters);
                super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
            }
        }
    }
}
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_page_cache_acquire_pinned_slot_seconds",
@@ -352,6 +294,14 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
.expect("failed to define a metric")
});
/// Total number of iterations spent in the page cache's `find_victim` loop,
/// exported as `pageserver_page_cache_find_victim_iters_total` (no labels).
pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_page_cache_find_victim_iters_total",
"Counter for the number of iterations in the find_victim loop",
)
.expect("failed to define a metric")
});
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"page_cache_errors_total",
@@ -651,7 +601,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
"pageserver_evictions_with_low_residence_duration",
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
Residence duration is determined using the `residence_duration_data_source`.",
&["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
&["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
)
.expect("failed to define a metric")
});
@@ -715,16 +665,10 @@ impl EvictionsWithLowResidenceDurationBuilder {
}
}
fn build(
&self,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
) -> EvictionsWithLowResidenceDuration {
fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
.get_metric_with_label_values(&[
tenant_id,
shard_id,
timeline_id,
self.data_source,
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
@@ -755,24 +699,21 @@ impl EvictionsWithLowResidenceDuration {
pub fn change_threshold(
&mut self,
tenant_id: &str,
shard_id: &str,
timeline_id: &str,
new_threshold: Duration,
) {
if new_threshold == self.threshold {
return;
}
let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
self.data_source,
new_threshold,
)
.build(tenant_id, shard_id, timeline_id);
let mut with_new =
EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
.build(tenant_id, timeline_id);
std::mem::swap(self, &mut with_new);
with_new.remove(tenant_id, shard_id, timeline_id);
with_new.remove(tenant_id, timeline_id);
}
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
let Some(_counter) = self.counter.take() else {
return;
};
@@ -781,7 +722,6 @@ impl EvictionsWithLowResidenceDuration {
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
self.data_source,
&threshold,
@@ -834,7 +774,6 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
)]
pub(crate) enum StorageIoOperation {
Open,
OpenAfterReplace,
Close,
CloseByReplace,
Read,
@@ -848,7 +787,6 @@ impl StorageIoOperation {
pub fn as_str(&self) -> &'static str {
match self {
StorageIoOperation::Open => "open",
StorageIoOperation::OpenAfterReplace => "open-after-replace",
StorageIoOperation::Close => "close",
StorageIoOperation::CloseByReplace => "close-by-replace",
StorageIoOperation::Read => "read",
@@ -903,25 +841,6 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
/// Metrics for the virtual file descriptor cache.
pub(crate) mod virtual_file_descriptor_cache {
use super::*;
/// Gauge for the maximum number of open file descriptors in the cache.
pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_virtual_file_descriptor_cache_size_max",
"Maximum number of open file descriptors in the cache."
)
.unwrap()
});
// SIZE_CURRENT is not exported as its own metric; derive it like so:
// ```
// sum(pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$"})
// - ignoring(operation)
// sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"})
// ```
}
#[derive(Debug)]
struct GlobalAndPerTimelineHistogram {
global: Histogram,
@@ -1248,52 +1167,6 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
}
});
/// Counters describing WAL record ingestion.
pub(crate) struct WalIngestMetrics {
/// Number of WAL records received from safekeepers.
pub(crate) records_received: IntCounter,
/// Number of WAL records which resulted in writes to pageserver storage.
pub(crate) records_committed: IntCounter,
/// Number of WAL records filtered out due to sharding.
pub(crate) records_filtered: IntCounter,
}
/// Process-wide [`WalIngestMetrics`]; the counters are registered on first access.
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
records_received: register_int_counter!(
"pageserver_wal_ingest_records_received",
"Number of WAL records received from safekeepers"
)
.expect("failed to define a metric"),
records_committed: register_int_counter!(
"pageserver_wal_ingest_records_committed",
"Number of WAL records which resulted in writes to pageserver storage"
)
.expect("failed to define a metric"),
records_filtered: register_int_counter!(
"pageserver_wal_ingest_records_filtered",
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
});
/// Metrics for heatmap uploads performed for secondary mode.
pub(crate) struct SecondaryModeMetrics {
/// Number of heatmaps written to remote storage by attached tenants.
pub(crate) upload_heatmap: IntCounter,
/// Number of failures writing a heatmap to remote storage.
pub(crate) upload_heatmap_errors: IntCounter,
/// Time to build and upload a heatmap, including any waiting inside the S3 client.
pub(crate) upload_heatmap_duration: Histogram,
}
/// Process-wide [`SecondaryModeMetrics`]; the metrics are registered on first access.
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants"
)
.expect("failed to define a metric"),
upload_heatmap_errors: register_int_counter!(
"pageserver_secondary_upload_heatmap_errors",
"Failures writing heatmap to remote storage"
)
.expect("failed to define a metric"),
upload_heatmap_duration: register_histogram!(
"pageserver_secondary_upload_heatmap_duration",
"Time to build and upload a heatmap, including any waiting inside the S3 client"
)
.expect("failed to define a metric"),
});
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,
@@ -1344,16 +1217,25 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("Failed to register tenant_task_events metric")
});
/// Paired start/finish counters for acquire calls on the background loop
/// concurrency-limiting semaphore, labelled by `task`.
///
/// Registers both `pageserver_background_loop_semaphore_wait_start_count` and
/// `pageserver_background_loop_semaphore_wait_finish_count`.
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
/// Counts started acquire calls on the background loop concurrency-limiting
/// semaphore, labelled by `task`.
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_start_count",
"Counter for background loop concurrency-limiting semaphore acquire calls started",
&["task"],
)
.unwrap()
});
/// Counts finished acquire calls on the background loop concurrency-limiting
/// semaphore, labelled by `task`.
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
Lazy::new(|| {
register_int_counter_vec!(
"pageserver_background_loop_semaphore_wait_finish_count",
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
&["task"],
)
.unwrap()
});
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
@@ -1627,7 +1509,6 @@ impl StorageTimeMetrics {
#[derive(Debug)]
pub struct TimelineMetrics {
tenant_id: String,
shard_id: String,
timeline_id: String,
pub flush_time_histo: StorageTimeMetrics,
pub compact_time_histo: StorageTimeMetrics,
@@ -1648,12 +1529,11 @@ pub struct TimelineMetrics {
impl TimelineMetrics {
pub fn new(
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_id = format!("{}", tenant_shard_id.shard_slug());
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let flush_time_histo =
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
@@ -1690,12 +1570,11 @@ impl TimelineMetrics {
let evictions = EVICTIONS
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
.build(&tenant_id, &shard_id, &timeline_id);
let evictions_with_low_residence_duration =
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
TimelineMetrics {
tenant_id,
shard_id,
timeline_id,
flush_time_histo,
compact_time_histo,
@@ -1741,7 +1620,6 @@ impl Drop for TimelineMetrics {
fn drop(&mut self) {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
@@ -1755,7 +1633,7 @@ impl Drop for TimelineMetrics {
self.evictions_with_low_residence_duration
.write()
.unwrap()
.remove(tenant_id, shard_id, timeline_id);
.remove(tenant_id, timeline_id);
// The following metrics are born outside of the TimelineMetrics lifecycle but still
// removed at the end of it. The idea is to have the metrics outlive the
@@ -2216,8 +2094,6 @@ pub fn preinitialize_metrics() {
// Tenant manager stats
Lazy::force(&TENANT_MANAGER);
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()

View File

@@ -28,7 +28,7 @@
//! Page cache maps from a cache key to a buffer slot.
//! The cache key uniquely identifies the piece of data that is being cached.
//!
//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
//!
//! The cache key for **immutable file** pages is [`FileId`] and a block number.
@@ -83,15 +83,13 @@ use std::{
use anyhow::Context;
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use utils::{id::TimelineId, lsn::Lsn};
use crate::{
context::RequestContext,
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
repository::Key,
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -152,13 +150,7 @@ enum CacheKey {
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
struct MaterializedPageHashKey {
/// Why is this TenantShardId rather than TenantId?
///
/// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However,
/// this is not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this
/// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
/// special-cased in some other way.
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
}
@@ -382,7 +374,7 @@ impl PageCache {
/// returned page.
pub async fn lookup_materialized_page(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: &Key,
lsn: Lsn,
@@ -399,7 +391,7 @@ impl PageCache {
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_shard_id,
tenant_id,
timeline_id,
key: *key,
},
@@ -440,7 +432,7 @@ impl PageCache {
///
pub async fn memorize_materialized_page(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
key: Key,
lsn: Lsn,
@@ -448,7 +440,7 @@ impl PageCache {
) -> anyhow::Result<()> {
let cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_shard_id,
tenant_id,
timeline_id,
key,
},
@@ -905,10 +897,8 @@ impl PageCache {
// Note that just yielding to tokio during iteration without such
// priority boosting is likely counter-productive. We'd just give more opportunities
// for B to bump usage count, further starving A.
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::ItersExceeded {
iters: iters.try_into().unwrap(),
},
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::EvictIterLimit,
);
anyhow::bail!("exceeded evict iter limit");
}
@@ -919,18 +909,8 @@ impl PageCache {
// remove mapping for old buffer
self.remove_mapping(old_key);
inner.key = None;
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
iters: iters.try_into().unwrap(),
},
);
} else {
page_cache_eviction_metrics::observe(
page_cache_eviction_metrics::Outcome::FoundSlotUnused {
iters: iters.try_into().unwrap(),
},
);
}
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
return Ok((slot_idx, inner));
}
}

View File

@@ -53,7 +53,7 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::{rel_block_to_key, Version};
use crate::pgdatadir_mapping::rel_block_to_key;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -67,9 +67,9 @@ use crate::trace::Tracer;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
// How long we may wait for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
// How long we may block waiting for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
/// Read the end of a tar archive.
///
@@ -747,7 +747,7 @@ impl PageServerHandler {
.await?;
let exists = timeline
.get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
.get_rel_exists(req.rel, lsn, req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -766,9 +766,7 @@ impl PageServerHandler {
Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
.await?;
let n_blocks = timeline
.get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
.await?;
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
n_blocks,
@@ -787,13 +785,7 @@ impl PageServerHandler {
.await?;
let total_blocks = timeline
.get_db_size(
DEFAULTTABLESPACE_OID,
req.dbnode,
Version::Lsn(lsn),
req.latest,
ctx,
)
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
@@ -824,7 +816,7 @@ impl PageServerHandler {
let key = rel_block_to_key(req.rel, req.blkno);
let page = if timeline.get_shard_identity().is_key_local(&key) {
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
} else {
// The Tenant shard we looked up at connection start does not hold this particular
@@ -861,7 +853,7 @@ impl PageServerHandler {
// the GateGuard was already held over the whole connection.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
};

View File

@@ -11,7 +11,7 @@ use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::{ensure, Context};
use anyhow::Context;
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -147,7 +147,6 @@ impl Timeline {
{
DatadirModification {
tline: self,
pending_lsns: Vec::new(),
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
@@ -164,7 +163,7 @@ impl Timeline {
&self,
tag: RelTag,
blknum: BlockNumber,
version: Version<'_>,
lsn: Lsn,
latest: bool,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
@@ -174,20 +173,17 @@ impl Timeline {
));
}
let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
if blknum >= nblocks {
debug!(
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
tag,
blknum,
version.get_lsn(),
nblocks
tag, blknum, lsn, nblocks
);
return Ok(ZERO_PAGE.clone());
}
let key = rel_block_to_key(tag, blknum);
version.get(self, key, ctx).await
self.get(key, lsn, ctx).await
}
// Get size of a database in blocks
@@ -195,16 +191,16 @@ impl Timeline {
&self,
spcnode: Oid,
dbnode: Oid,
version: Version<'_>,
lsn: Lsn,
latest: bool,
ctx: &RequestContext,
) -> Result<usize, PageReconstructError> {
let mut total_blocks = 0;
let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;
let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
for rel in rels {
let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
total_blocks += n_blocks as usize;
}
Ok(total_blocks)
@@ -214,7 +210,7 @@ impl Timeline {
pub async fn get_rel_size(
&self,
tag: RelTag,
version: Version<'_>,
lsn: Lsn,
latest: bool,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
@@ -224,12 +220,12 @@ impl Timeline {
));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(nblocks);
}
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
&& !self.get_rel_exists(tag, version, latest, ctx).await?
&& !self.get_rel_exists(tag, lsn, latest, ctx).await?
{
// FIXME: Postgres sometimes calls smgrcreate() to create
// FSM, and smgrnblocks() on it immediately afterwards,
@@ -239,7 +235,7 @@ impl Timeline {
}
let key = rel_size_to_key(tag);
let mut buf = version.get(self, key, ctx).await?;
let mut buf = self.get(key, lsn, ctx).await?;
let nblocks = buf.get_u32_le();
if latest {
@@ -250,7 +246,7 @@ impl Timeline {
// latest=true, then it cannot cause cache corruption, because with latest=true
// the pageserver chooses max(request_lsn, last_written_lsn) and so the cached value will be
// associated with the most recent value of LSN.
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
self.update_cached_rel_size(tag, lsn, nblocks);
}
Ok(nblocks)
}
@@ -259,7 +255,7 @@ impl Timeline {
pub async fn get_rel_exists(
&self,
tag: RelTag,
version: Version<'_>,
lsn: Lsn,
_latest: bool,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
@@ -270,12 +266,12 @@ impl Timeline {
}
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
return Ok(true);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -295,12 +291,12 @@ impl Timeline {
&self,
spcnode: Oid,
dbnode: Oid,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashSet<RelTag>, PageReconstructError> {
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
match RelDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -336,11 +332,11 @@ impl Timeline {
&self,
kind: SlruKind,
segno: u32,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
let key = slru_segment_size_to_key(kind, segno);
let mut buf = version.get(self, key, ctx).await?;
let mut buf = self.get(key, lsn, ctx).await?;
Ok(buf.get_u32_le())
}
@@ -349,12 +345,12 @@ impl Timeline {
&self,
kind: SlruKind,
segno: u32,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<bool, PageReconstructError> {
// fetch directory listing
let key = slru_dir_to_key(kind);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => {
@@ -505,11 +501,11 @@ impl Timeline {
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
) -> Result<T, PageReconstructError> {
for segno in self
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
.await?
{
let nblocks = self
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
.await?;
for blknum in (0..nblocks).rev() {
let clog_page = self
@@ -535,13 +531,13 @@ impl Timeline {
pub async fn list_slru_segments(
&self,
kind: SlruKind,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<HashSet<u32>, PageReconstructError> {
// fetch directory entry
let key = slru_dir_to_key(kind);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.segments),
Err(e) => Err(PageReconstructError::from(e)),
@@ -552,12 +548,12 @@ impl Timeline {
&self,
spcnode: Oid,
dbnode: Oid,
version: Version<'_>,
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
let key = relmap_file_key(spcnode, dbnode);
let buf = version.get(self, key, ctx).await?;
let buf = self.get(key, lsn, ctx).await?;
Ok(buf)
}
@@ -656,10 +652,7 @@ impl Timeline {
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
.await?
{
for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
if self.cancel.is_cancelled() {
return Err(CalculateLogicalSizeError::Cancelled);
}
@@ -699,7 +692,7 @@ impl Timeline {
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
.list_rels(spcnode, dbnode, lsn, ctx)
.await?
.into_iter()
.collect();
@@ -806,39 +799,18 @@ pub struct DatadirModification<'a> {
/// in the state in 'tline' yet.
pub tline: &'a Timeline,
/// Current LSN of the modification
lsn: Lsn,
/// Lsn assigned by begin_modification
pub lsn: Lsn,
// The modifications are not applied directly to the underlying key-value store.
// The put-functions add the modifications here, and they are flushed to the
// underlying key-value store by the 'finish' function.
pending_lsns: Vec<Lsn>,
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
pending_deletions: Vec<(Range<Key>, Lsn)>,
pending_updates: HashMap<Key, Value>,
pending_deletions: Vec<Range<Key>>,
pending_nblocks: i64,
}
impl<'a> DatadirModification<'a> {
/// Get the current LSN of this modification.
///
/// Returns a copy of `self.lsn`.
pub fn get_lsn(&self) -> Lsn {
self.lsn
}
/// Advance the current LSN of this modification.
///
/// # Errors
///
/// Fails if `lsn` is older than the current LSN.
///
/// When `lsn` is strictly newer, the LSN being left behind is appended to
/// `pending_lsns` before the switch. Setting the same LSN again changes nothing.
pub fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
    let current = self.lsn;
    ensure!(
        lsn >= current,
        "setting an older lsn {} than {} is not allowed",
        lsn,
        current
    );
    if lsn > current {
        // Record the LSN we are moving away from, then adopt the new one.
        self.pending_lsns.push(current);
        self.lsn = lsn;
    }
    Ok(())
}
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -850,7 +822,10 @@ impl<'a> DatadirModification<'a> {
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory
self.init_aux_dir()?;
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
xids: HashSet::new(),
@@ -958,7 +933,10 @@ impl<'a> DatadirModification<'a> {
self.put(DBDIR_KEY, Value::Image(buf.into()));
// Create AuxFilesDirectory as well
self.init_aux_dir()?;
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
files: HashMap::new(),
})?;
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
}
if r.is_none() {
// Create RelDirectory
@@ -1012,9 +990,11 @@ impl<'a> DatadirModification<'a> {
dbnode: Oid,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let req_lsn = self.tline.get_last_record_lsn();
let total_blocks = self
.tline
.get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
.get_db_size(spcnode, dbnode, req_lsn, true, ctx)
.await?;
// Remove entry from dbdir
@@ -1103,11 +1083,8 @@ impl<'a> DatadirModification<'a> {
ctx: &RequestContext,
) -> anyhow::Result<()> {
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
if self
.tline
.get_rel_exists(rel, Version::Modified(self), true, ctx)
.await?
{
let last_lsn = self.tline.get_last_record_lsn();
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
let size_key = rel_size_to_key(rel);
// Fetch the old size first
let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1284,14 +1261,6 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
/// Queue an empty `AuxFilesDirectory` image under `AUX_FILES_KEY`.
///
/// # Errors
///
/// Propagates a failure to serialize the directory.
pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
    let empty_dir = AuxFilesDirectory {
        files: HashMap::new(),
    };
    let serialized = AuxFilesDirectory::ser(&empty_dir)?;
    self.put(AUX_FILES_KEY, Value::Image(Bytes::from(serialized)));
    Ok(())
}
pub async fn put_file(
&mut self,
path: &str,
@@ -1352,23 +1321,17 @@ impl<'a> DatadirModification<'a> {
let writer = self.tline.writer().await;
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
for (key, values) in self.pending_updates.drain() {
for (lsn, value) in values {
if is_rel_block_key(&key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, lsn, &value, ctx).await?;
} else {
retained_pending_updates
.entry(key)
.or_default()
.push((lsn, value));
}
let mut retained_pending_updates = HashMap::new();
for (key, value) in self.pending_updates.drain() {
if is_rel_block_key(&key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value, ctx).await?;
} else {
retained_pending_updates.insert(key, value);
}
}
self.pending_updates = retained_pending_updates;
self.pending_updates.extend(retained_pending_updates);
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1385,28 +1348,18 @@ impl<'a> DatadirModification<'a> {
///
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
let writer = self.tline.writer().await;
let lsn = self.lsn;
let pending_nblocks = self.pending_nblocks;
self.pending_nblocks = 0;
if !self.pending_updates.is_empty() {
writer.put_batch(&self.pending_updates, ctx).await?;
self.pending_updates.clear();
for (key, value) in self.pending_updates.drain() {
writer.put(key, lsn, &value, ctx).await?;
}
for key_range in self.pending_deletions.drain(..) {
writer.delete(key_range, lsn).await?;
}
if !self.pending_deletions.is_empty() {
writer.delete_batch(&self.pending_deletions).await?;
self.pending_deletions.clear();
}
self.pending_lsns.push(self.lsn);
for pending_lsn in self.pending_lsns.drain(..) {
// Ideally, we should be able to call writer.finish_write() only once
// with the highest LSN. However, the last_record_lsn variable in the
// timeline keeps track of the latest LSN and the immediate previous LSN
// so we need to record every LSN to not leave a gap between them.
writer.finish_write(pending_lsn);
}
writer.finish_write(lsn);
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1415,86 +1368,40 @@ impl<'a> DatadirModification<'a> {
Ok(())
}
/// Number of buffered operations: entries in `pending_updates` plus entries in
/// `pending_deletions`.
pub(crate) fn len(&self) -> usize {
    let updates = self.pending_updates.len();
    let deletions = self.pending_deletions.len();
    updates + deletions
}
// Internal helper functions to batch the modifications
async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
// Have we already updated the same key? Read the latest pending updated
// Have we already updated the same key? Read the pending updated
// version in that case.
//
// Note: we don't check pending_deletions. It is an error to request a
// value that has been removed, deletion only avoids leaking storage.
if let Some(values) = self.pending_updates.get(&key) {
if let Some((_, value)) = values.last() {
return if let Value::Image(img) = value {
Ok(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
Err(PageReconstructError::from(anyhow::anyhow!(
"unexpected pending WAL record"
)))
};
if let Some(value) = self.pending_updates.get(&key) {
if let Value::Image(img) = value {
Ok(img.clone())
} else {
// Currently, we never need to read back a WAL record that we
// inserted in the same "transaction". All the metadata updates
// work directly with Images, and we never need to read actual
// data pages. We could handle this if we had to, by calling
// the walredo manager, but let's keep it simple for now.
Err(PageReconstructError::from(anyhow::anyhow!(
"unexpected pending WAL record"
)))
}
} else {
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn, ctx).await
}
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
self.tline.get(key, lsn, ctx).await
}
fn put(&mut self, key: Key, val: Value) {
let values = self.pending_updates.entry(key).or_default();
// Replace the previous value if it exists at the same lsn
if let Some((last_lsn, last_value)) = values.last_mut() {
if *last_lsn == self.lsn {
*last_value = val;
return;
}
}
values.push((self.lsn, val));
self.pending_updates.insert(key, val);
}
fn delete(&mut self, key_range: Range<Key>) {
trace!("DELETE {}-{}", key_range.start, key_range.end);
self.pending_deletions.push((key_range, self.lsn));
}
}
/// This enum facilitates accessing either a committed key from the timeline at a
/// specific LSN, or the latest uncommitted key from a pending modification.
///
/// During WAL ingestion, the records from multiple LSNs may be batched in the same
/// modification before being flushed to the timeline. Hence, the routines in WalIngest
/// need to look up the keys in the modification first before looking them up in the
/// timeline to not miss the latest updates.
#[derive(Clone, Copy)]
pub enum Version<'a> {
/// Read from the timeline at the given LSN.
Lsn(Lsn),
/// Read through the given pending modification.
Modified(&'a DatadirModification<'a>),
}
impl<'a> Version<'a> {
async fn get(
&self,
timeline: &Timeline,
key: Key,
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
match self {
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
Version::Modified(modification) => modification.get(key, ctx).await,
}
}
fn get_lsn(&self) -> Lsn {
match self {
Version::Lsn(lsn) => *lsn,
Version::Modified(modification) => modification.lsn,
}
self.pending_deletions.push(key_range);
}
}
@@ -1856,13 +1763,6 @@ const AUX_FILES_KEY: Key = Key {
// Reverse mappings for a few Keys.
// These are needed by WAL redo manager.
/// Returns `true` for every key except `AUX_FILES_KEY`.
///
/// AUX_FILES currently stores only data for logical replication (slots etc), and
/// we don't preserve these on a branch because safekeepers can't follow timeline
/// switch (and generally it likely should be optional), so ignore these.
pub fn is_inherited_key(key: Key) -> bool {
key != AUX_FILES_KEY
}
pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
Ok(match key.field1 {
0x00 => (

View File

@@ -42,7 +42,6 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use futures::FutureExt;
use pageserver_api::shard::TenantShardId;
use tokio::runtime::Runtime;
use tokio::task::JoinHandle;
use tokio::task_local;
@@ -52,7 +51,7 @@ use tracing::{debug, error, info, warn};
use once_cell::sync::Lazy;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
use crate::shutdown_pageserver;
@@ -258,9 +257,6 @@ pub enum TaskKind {
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
/// See [`crate::tenant::secondary`].
SecondaryUploads,
// Initial logical size calculation
InitialLogicalSizeCalculation,
@@ -321,7 +317,7 @@ struct PageServerTask {
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
mutable: Mutex<MutableTaskState>,
@@ -333,7 +329,7 @@ struct PageServerTask {
pub fn spawn<F>(
runtime: &tokio::runtime::Handle,
kind: TaskKind,
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
name: &str,
shutdown_process_on_error: bool,
@@ -349,7 +345,7 @@ where
kind,
name: name.to_string(),
cancel: cancel.clone(),
tenant_shard_id,
tenant_id,
timeline_id,
mutable: Mutex::new(MutableTaskState { join_handle: None }),
});
@@ -428,28 +424,28 @@ async fn task_finish(
Ok(Err(err)) => {
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
}
}
Err(err) => {
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_id, task.timeline_id, err
);
}
}
@@ -471,11 +467,11 @@ async fn task_finish(
///
/// Or to shut down all tasks for given timeline:
///
/// shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))
/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
///
pub async fn shutdown_tasks(
kind: Option<TaskKind>,
tenant_shard_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
) {
let mut victim_tasks = Vec::new();
@@ -484,35 +480,35 @@ pub async fn shutdown_tasks(
let tasks = TASKS.lock().unwrap();
for task in tasks.values() {
if (kind.is_none() || Some(task.kind) == kind)
&& (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
&& (tenant_id.is_none() || task.tenant_id == tenant_id)
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
{
task.cancel.cancel();
victim_tasks.push((
Arc::clone(task),
task.kind,
task.tenant_shard_id,
task.tenant_id,
task.timeline_id,
));
}
}
}
let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();
let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
let join_handle = {
let mut task_mut = task.mutable.lock().unwrap();
task_mut.join_handle.take()
};
if let Some(mut join_handle) = join_handle {
if log_all {
if tenant_shard_id.is_none() {
if tenant_id.is_none() {
// there are quite few of these
info!(name = task.name, kind = ?task_kind, "stopping global task");
} else {
// warn to catch these in tests; there shouldn't be any
warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
}
}
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
@@ -521,13 +517,12 @@ pub async fn shutdown_tasks(
{
// allow some time to elapse before logging to cut down the number of log
// lines.
info!("waiting for task {} to shut down", task.name);
info!("waiting for {} to shut down", task.name);
// we never handled this return value, but:
// - we don't deschedule which would lead to is_cancelled
// - panics are already logged (is_panicked)
// - task errors are already logged in the wrapper
let _ = join_handle.await;
info!("task {} completed", task.name);
}
} else {
// Possibly one of:

View File

@@ -12,6 +12,7 @@
//!
use anyhow::{bail, Context};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
@@ -68,7 +69,6 @@ use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
pub use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::InitializationOrder;
@@ -144,7 +144,6 @@ pub mod storage_layer;
pub mod config;
pub mod delete;
pub mod mgr;
pub mod secondary;
pub mod tasks;
pub mod upload_queue;
@@ -609,7 +608,7 @@ impl Tenant {
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::Attach,
Some(tenant_shard_id),
Some(tenant_shard_id.tenant_id),
None,
"attach tenant",
false,
@@ -1918,7 +1917,7 @@ impl Tenant {
//
// this will additionally shutdown and await all timeline tasks.
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await;
// Wait for any in-flight operations to complete
self.gate.close().await;
@@ -2115,14 +2114,6 @@ impl Tenant {
.attach_mode
.clone()
}
/// Returns a reference to this tenant's `TenantShardId`.
pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
&self.tenant_shard_id
}
/// Returns this tenant's `Generation` (a copy of `self.generation`).
pub(crate) fn get_generation(&self) -> Generation {
self.generation
}
}
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2261,18 +2252,6 @@ impl Tenant {
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
/// Effective heatmap period for this tenant.
///
/// Uses the tenant-level `heatmap_period` override when one is set, otherwise the
/// value from the default tenant configuration. A zero period yields `None`.
pub fn get_heatmap_period(&self) -> Option<Duration> {
    let configured = self.tenant_conf.read().unwrap().tenant_conf.heatmap_period;
    let period = configured.unwrap_or(self.conf.default_tenant_conf.heatmap_period);
    // A zero duration is reported as "no period".
    Some(period).filter(|p| !p.is_zero())
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
// Don't hold self.timelines.lock() during the notifies.
@@ -2970,10 +2949,10 @@ impl Tenant {
};
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
// temporary directory for basebackup files for the given timeline.
let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
let pgdata_path = path_with_suffix_extension(
timelines_path.join(format!("basebackup-{timeline_id}")),
self.conf
.timelines_path(&self.tenant_shard_id)
.join(format!("basebackup-{timeline_id}")),
TEMP_FILE_SUFFIX,
);
@@ -3004,43 +2983,31 @@ impl Tenant {
)
.await
.context("download initdb tar")?;
let buf_read =
BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
let buf_read = Box::pin(BufReader::new(initdb_tar_zst));
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
.await
.context("extract initdb tar")?;
tokio::fs::remove_file(&initdb_tar_zst_path)
.await
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// If something else already removed the file, ignore the error
Ok(())
} else {
Err(e)
}
})
.with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
if initdb_tar_zst_path.exists() {
tokio::fs::remove_file(&initdb_tar_zst_path)
.await
.context("tempfile removal")?;
}
} else {
// Init a temporary repo to get bootstrap data; this creates a directory in the `initdb_path` path
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
// Upload the created data dir to S3
if let Some(storage) = &self.remote_storage {
let temp_path = timelines_path.join(format!(
"{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
let (pgdata_zstd, tar_zst_size) =
import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
let pgdata_zstd = Bytes::from(pgdata_zstd);
backoff::retry(
|| async {
self::remote_timeline_client::upload_initdb_dir(
storage,
&self.tenant_shard_id.tenant_id,
&timeline_id,
pgdata_zstd.try_clone().await?,
tar_zst_size,
pgdata_zstd.clone(),
)
.await
},
@@ -3052,18 +3019,6 @@ impl Tenant {
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
)
.await?;
tokio::fs::remove_file(&temp_path)
.await
.or_else(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
// If something else already removed the file, ignore the error
Ok(())
} else {
Err(e)
}
})
.with_context(|| format!("tempfile removal {temp_path}"))?;
}
}
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
@@ -3715,7 +3670,6 @@ pub(crate) mod harness {
tenant_conf.evictions_low_residence_duration_metric_threshold,
),
gc_feedback: Some(tenant_conf.gc_feedback),
heatmap_period: Some(tenant_conf.heatmap_period),
}
}
}

View File

@@ -46,8 +46,6 @@ pub mod defaults {
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -336,11 +334,6 @@ pub struct TenantConf {
#[serde(with = "humantime_serde")]
pub evictions_low_residence_duration_metric_threshold: Duration,
pub gc_feedback: bool,
/// If non-zero, the period between uploads of a heatmap from attached tenants. This
/// may be disabled if a Tenant will not have secondary locations: only secondary
/// locations will use the heatmap uploaded by attached locations.
pub heatmap_period: Duration,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -421,11 +414,6 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_feedback: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(with = "humantime_serde")]
#[serde(default)]
pub heatmap_period: Option<Duration>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -494,7 +482,6 @@ impl TenantConfOpt {
.evictions_low_residence_duration_metric_threshold
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
heatmap_period: self.heatmap_period.unwrap_or(global_conf.heatmap_period),
}
}
}
@@ -532,7 +519,6 @@ impl Default for TenantConf {
)
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
heatmap_period: Duration::ZERO,
}
}
}

View File

@@ -77,10 +77,8 @@ async fn create_remote_delete_mark(
let data: &[u8] = &[];
backoff::retry(
|| async {
let data = bytes::Bytes::from_static(data);
let stream = futures::stream::once(futures::future::ready(Ok(data)));
remote_storage
.upload(stream, 0, &remote_mark_path, None)
.upload(data, 0, &remote_mark_path, None)
.await
},
|_e| false,
@@ -463,7 +461,7 @@ impl DeleteTenantFlow {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
Some(tenant_shard_id.tenant_id),
None,
"tenant_delete",
false,
@@ -550,7 +548,7 @@ impl DeleteTenantFlow {
// we encounter an InProgress marker, yield the barrier it contains and wait on it.
let barrier = {
let mut locked = tenants.write().unwrap();
let removed = locked.remove(tenant.tenant_shard_id);
let removed = locked.remove(&tenant.tenant_shard_id.tenant_id);
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)

View File

@@ -98,6 +98,33 @@ pub(crate) enum TenantsMap {
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
}
/// Helper for mapping shard-unaware functions to a sharding-aware map.
///
/// Looks at the slots whose keys fall within `TenantShardId::tenant_range(*tenant_id)`
/// and returns the single matching `(key, slot)` pair, or `None` if there is no match.
///
/// # Panics
///
/// Panics (via `todo!`) if more than one slot matches: callers have a shard-naive view
/// of the world in which only one slot can exist for a `TenantId` at a time.
///
/// TODO(sharding): all users of this must be made shard-aware.
fn exactly_one_or_none<'a>(
    map: &'a BTreeMap<TenantShardId, TenantSlot>,
    tenant_id: &TenantId,
) -> Option<(&'a TenantShardId, &'a TenantSlot)> {
    let mut matching = map.range(TenantShardId::tenant_range(*tenant_id));

    // No slot at all for this tenant: nothing to return.
    let first = matching.next()?;

    match matching.next() {
        // Exactly one matching slot
        None => Some(first),
        Some(_second) => {
            // Multiple shards for this tenant: cannot handle this yet.
            // TODO(sharding): callers of get() should be shard-aware.
            todo!("Attaching multiple shards in teh same tenant to the same pageserver")
        }
    }
}
pub(crate) enum TenantsMapRemoveResult {
Occupied(TenantSlot),
Vacant,
@@ -120,11 +147,12 @@ impl TenantsMap {
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
/// None is returned.
pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
m.get(tenant_shard_id).and_then(|slot| slot.get_attached())
// TODO(sharding): callers of get() should be shard-aware.
exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached())
}
}
}
@@ -176,19 +204,25 @@ impl TenantsMap {
///
/// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
/// slot if the enclosed tenant is shutdown.
pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult {
use std::collections::btree_map::Entry;
match self {
TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
Entry::Occupied(entry) => match entry.get() {
TenantSlot::InProgress(barrier) => {
TenantsMapRemoveResult::InProgress(barrier.clone())
}
_ => TenantsMapRemoveResult::Occupied(entry.remove()),
},
Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
},
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
match key {
Some(key) => match m.entry(key) {
Entry::Occupied(entry) => match entry.get() {
TenantSlot::InProgress(barrier) => {
TenantsMapRemoveResult::InProgress(barrier.clone())
}
_ => TenantsMapRemoveResult::Occupied(entry.remove()),
},
Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
},
None => TenantsMapRemoveResult::Vacant,
}
}
}
}
@@ -788,16 +822,14 @@ pub(crate) async fn set_new_tenant_config(
new_tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> Result<(), SetNewTenantConfigError> {
// Legacy API: does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
info!("configuring tenant {tenant_id}");
let tenant = get_tenant(tenant_shard_id, true)?;
let tenant = get_tenant(tenant_id, true)?;
// This is a legacy API that only operates on attached tenants: the preferred
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
.await
@@ -807,12 +839,6 @@ pub(crate) async fn set_new_tenant_config(
}
impl TenantManager {
/// Convenience function so that anyone with a TenantManager can get at the global configuration, without
/// having to pass it around everywhere as a separate object.
///
/// Returns the `&'static PageServerConf` held in `self.conf`.
pub(crate) fn get_conf(&self) -> &'static PageServerConf {
self.conf
}
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub(crate) fn get_attached_tenant_shard(
@@ -1093,20 +1119,6 @@ impl TenantManager {
Ok(())
}
/// Collect every tenant that is both attached and active.
///
/// Returns an empty vector while the tenants map is still `Initializing`.
pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
    let guard = self.tenants.read().unwrap();
    let slots = match &*guard {
        TenantsMap::Initializing => return Vec::new(),
        TenantsMap::Open(slots) | TenantsMap::ShuttingDown(slots) => slots,
    };

    slots
        .values()
        // Keep only slots that hold an attached tenant ...
        .filter_map(|slot| slot.get_attached())
        // ... and of those, only the active ones.
        .filter(|tenant| tenant.is_active())
        .cloned()
        .collect()
}
}
#[derive(Debug, thiserror::Error)]
@@ -1131,11 +1143,14 @@ pub(crate) enum GetTenantError {
///
/// This method is cancel-safe.
pub(crate) fn get_tenant(
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
active_only: bool,
) -> Result<Arc<Tenant>, GetTenantError> {
let locked = TENANTS.read().unwrap();
// TODO(sharding): make all callers of get_tenant shard-aware
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
match peek_slot {
@@ -1147,18 +1162,14 @@ pub(crate) fn get_tenant(
TenantState::Active => Ok(Arc::clone(tenant)),
_ => {
if active_only {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
Err(GetTenantError::NotActive(tenant_id))
} else {
Ok(Arc::clone(tenant))
}
}
},
Some(TenantSlot::InProgress(_)) => {
Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
}
None | Some(TenantSlot::Secondary) => {
Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
}
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
}
}
@@ -1531,8 +1542,7 @@ pub(crate) enum TenantMapListError {
///
/// Get list of tenants, for the mgmt API
///
pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
{
pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
let tenants = TENANTS.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1540,10 +1550,12 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
};
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
TenantSlot::Secondary => None,
TenantSlot::InProgress(_) => None,
})
// TODO(sharding): make callers of this function shard-aware
.map(|(k, v)| (k.tenant_id, v))
.collect())
}
@@ -1925,7 +1937,7 @@ fn tenant_map_acquire_slot_impl(
METRICS.tenant_slot_writes.inc();
let mut locked = tenants.write().unwrap();
let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
let _guard = span.enter();
let m = match &mut *locked {
@@ -2077,20 +2089,22 @@ use {
};
pub(crate) async fn immediate_gc(
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
cancel: CancellationToken,
ctx: &RequestContext,
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
let guard = TENANTS.read().unwrap();
let tenant = guard
.get(&tenant_shard_id)
.get(&tenant_id)
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_shard_id}"))
.with_context(|| format!("tenant {tenant_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;
// TODO(sharding): make callers of this function shard-aware
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
@@ -2102,9 +2116,9 @@ pub(crate) async fn immediate_gc(
task_mgr::spawn(
&tokio::runtime::Handle::current(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
Some(tenant_id),
Some(timeline_id),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
false,
async move {
fail::fail_point!("immediate_gc_task_pre");

View File

@@ -180,7 +180,7 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
pub(crate) mod download;
mod download;
pub mod index;
mod upload;
@@ -254,9 +254,6 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
/// Default buffer size when interfacing with [`tokio::fs::File`].
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
pub enum MaybeDeletedIndexPart {
IndexPart(IndexPart),
Deleted(IndexPart),
@@ -1223,7 +1220,7 @@ impl RemoteTimelineClient {
task_mgr::spawn(
&self.runtime,
TaskKind::RemoteUploadTask,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"remote upload",
false,
@@ -1604,23 +1601,6 @@ impl RemoteTimelineClient {
}
}
}
/// Looks up the upload queue's latest known metadata for each of the given layers.
///
/// The result has one entry per input layer, in input order; an entry is `None` when the
/// layer is not present in the queue's `latest_files`.
///
/// Fails if the upload queue is stopped or has not been initialized.
pub(crate) fn get_layers_metadata(
    &self,
    layers: Vec<LayerFileName>,
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
    let guard = self.upload_queue.lock().unwrap();
    let UploadQueue::Initialized(initialized) = &*guard else {
        anyhow::bail!("queue is in state {}", guard.as_str())
    };

    Ok(layers
        .into_iter()
        .map(|layer| initialized.latest_files.get(&layer).cloned())
        .collect())
}
}
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -1676,13 +1656,6 @@ pub fn remote_index_path(
.expect("Failed to construct path")
}
/// File name of a tenant's heatmap object in remote storage.
pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";

/// Builds the remote storage path of the heatmap object belonging to `tenant_shard_id`.
///
/// # Panics
///
/// Panics if [`RemotePath::from_string`] rejects the formatted key.
pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
    let key = format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}");
    RemotePath::from_string(&key).expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {

View File

@@ -75,11 +75,12 @@ pub async fn download_layer_file<'a>(
let (mut destination_file, bytes_amount) = download_retry(
|| async {
let destination_file = tokio::fs::File::create(&temp_file_path)
// TODO: this doesn't use the cached fd for some reason?
let mut destination_file = fs::File::create(&temp_file_path)
.await
.with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
.map_err(DownloadError::Other)?;
let download = storage
let mut download = storage
.download(&remote_path)
.await
.with_context(|| {
@@ -89,14 +90,9 @@ pub async fn download_layer_file<'a>(
})
.map_err(DownloadError::Other)?;
let mut destination_file =
tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
let bytes_amount = tokio::time::timeout(
MAX_DOWNLOAD_DURATION,
tokio::io::copy_buf(&mut reader, &mut destination_file),
tokio::io::copy(&mut download.download_stream, &mut destination_file),
)
.await
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
@@ -107,8 +103,6 @@ pub async fn download_layer_file<'a>(
})
.map_err(DownloadError::Other)?;
let destination_file = destination_file.into_inner();
Ok((destination_file, bytes_amount))
},
&format!("download {remote_path:?}"),
@@ -226,22 +220,20 @@ async fn do_download_index_part(
index_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> {
use futures::stream::StreamExt;
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
let index_part_bytes = download_retry_forever(
|| async {
let index_part_download = storage.download(&remote_path).await?;
let mut index_part_download = storage.download(&remote_path).await?;
let mut index_part_bytes = Vec::new();
let mut stream = std::pin::pin!(index_part_download.download_stream);
while let Some(chunk) = stream.next().await {
let chunk = chunk
.with_context(|| format!("download index part at {remote_path:?}"))
.map_err(DownloadError::Other)?;
index_part_bytes.extend_from_slice(&chunk[..]);
}
tokio::io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| format!("download index part at {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok(index_part_bytes)
},
&format!("download {remote_path:?}"),
@@ -402,13 +394,11 @@ pub(crate) async fn download_initdb_tar_zst(
.with_context(|| format!("timeline dir creation {timeline_path}"))
.map_err(DownloadError::Other)?;
}
let temp_path = timeline_path.join(format!(
"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
));
let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
let file = download_retry(
|| async {
let file = OpenOptions::new()
let mut file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
@@ -418,17 +408,13 @@ pub(crate) async fn download_initdb_tar_zst(
.with_context(|| format!("tempfile creation {temp_path}"))
.map_err(DownloadError::Other)?;
let download = storage.download(&remote_path).await?;
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
let mut download = storage.download(&remote_path).await?;
tokio::io::copy_buf(&mut download, &mut writer)
tokio::io::copy(&mut download.download_stream, &mut file)
.await
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
.map_err(DownloadError::Other)?;
let mut file = writer.into_inner();
file.seek(std::io::SeekFrom::Start(0))
.await
.with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
@@ -440,10 +426,10 @@ pub(crate) async fn download_initdb_tar_zst(
)
.await
.map_err(|e| {
// Do a best-effort attempt at deleting the temporary file upon encountering an error.
// We don't have async here nor do we want to pile on any extra errors.
if let Err(e) = std::fs::remove_file(&temp_path) {
if e.kind() != std::io::ErrorKind::NotFound {
if temp_path.exists() {
// Do a best-effort attempt at deleting the temporary file upon encountering an error.
// We don't have async here nor do we want to pile on any extra errors.
if let Err(e) = std::fs::remove_file(&temp_path) {
warn!("error deleting temporary file {temp_path}: {e}");
}
}

View File

@@ -1,11 +1,12 @@
//! Helper functions to upload files to remote storage with a RemoteStorage
use anyhow::{bail, Context};
use bytes::Bytes;
use camino::Utf8Path;
use fail::fail_point;
use pageserver_api::shard::TenantShardId;
use std::io::ErrorKind;
use tokio::fs::{self, File};
use tokio::fs;
use super::Generation;
use crate::{
@@ -40,15 +41,11 @@ pub(super) async fn upload_index_part<'a>(
.to_s3_bytes()
.context("serialize index part file into bytes")?;
let index_part_size = index_part_bytes.len();
let index_part_bytes = bytes::Bytes::from(index_part_bytes);
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
storage
.upload_storage_object(
futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
index_part_size,
&remote_path,
)
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
}
@@ -104,10 +101,8 @@ pub(super) async fn upload_timeline_layer<'a>(
let fs_size = usize::try_from(fs_size)
.with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
storage
.upload(reader, fs_size, &storage_path, None)
.upload(source_file, fs_size, &storage_path, None)
.await
.with_context(|| format!("upload layer from local path '{source_path}'"))?;
@@ -119,16 +114,16 @@ pub(crate) async fn upload_initdb_dir(
storage: &GenericRemoteStorage,
tenant_id: &TenantId,
timeline_id: &TimelineId,
initdb_tar_zst: File,
size: u64,
initdb_dir: Bytes,
) -> anyhow::Result<()> {
tracing::trace!("uploading initdb dir");
let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
let size = initdb_dir.len();
let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));
let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
storage
.upload_storage_object(file, size as usize, &remote_path)
.upload_storage_object(bytes, size, &remote_path)
.await
.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
}

View File

@@ -1,104 +0,0 @@
pub mod heatmap;
mod heatmap_uploader;
use std::sync::Arc;
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use self::heatmap_uploader::heatmap_uploader_task;
use super::mgr::TenantManager;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use utils::completion::Barrier;
/// Commands accepted by the heatmap uploader task.
enum UploadCommand {
/// Request a heatmap upload for the given tenant shard.
Upload(TenantShardId),
}
/// A command of type `T` bundled with the channel on which its outcome is reported back.
struct CommandRequest<T> {
/// The command to execute.
payload: T,
/// One-shot channel on which the handler sends the [`CommandResponse`].
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
}
/// Reply to a [`CommandRequest`].
struct CommandResponse {
/// Outcome of the command, handed back verbatim to the caller of
/// `SecondaryController::dispatch`.
result: anyhow::Result<()>,
}
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
/// where we want to immediately upload/download for a particular tenant. In normal operation
/// uploads & downloads are autonomous and not driven by this interface.
pub struct SecondaryController {
/// Sending half of the command queue; [`spawn_tasks`] hands the receiving half to the
/// heatmap uploader task.
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
}
impl SecondaryController {
    /// Sends `payload` down `queue` and waits for the receiving task to report the outcome.
    ///
    /// Fails with "Receiver shut down" if the queue's receiver is gone, and with
    /// "Request dropped" if the request was discarded without a response being sent.
    async fn dispatch<T>(
        &self,
        queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
        payload: T,
    ) -> anyhow::Result<()> {
        let (response_tx, response_rx) = tokio::sync::oneshot::channel();
        let request = CommandRequest {
            payload,
            response_tx,
        };

        if queue.send(request).await.is_err() {
            return Err(anyhow::anyhow!("Receiver shut down"));
        }

        match response_rx.await {
            Ok(response) => response.result,
            Err(_) => Err(anyhow::anyhow!("Request dropped")),
        }
    }

    /// Asks the heatmap uploader to upload the heatmap of `tenant_shard_id`, returning once
    /// the uploader has responded.
    pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
        let command = UploadCommand::Upload(tenant_shard_id);
        self.dispatch(&self.upload_req_tx, command).await
    }
}
/// Starts the background heatmap uploader task and returns the controller used to send it
/// commands.
///
/// The command queue between the controller and the task is bounded to 16 pending requests.
pub fn spawn_tasks(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
) -> SecondaryController {
    let (command_tx, command_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

    let uploader = async move {
        heatmap_uploader_task(
            tenant_manager,
            remote_storage,
            command_rx,
            background_jobs_can_start,
            cancel,
        )
        .await
    };

    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::SecondaryUploads,
        None,
        None,
        "heatmap uploads",
        false,
        uploader,
    );

    SecondaryController {
        upload_req_tx: command_tx,
    }
}
/// Builds a [`SecondaryController`] that is connected to nothing, for running with remote
/// storage disabled.
pub fn null_controller() -> SecondaryController {
    // The receiving half is discarded, so nothing will ever service this queue.
    let (upload_req_tx, _) = tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
    SecondaryController { upload_req_tx }
}

View File

@@ -1,64 +0,0 @@
use std::time::SystemTime;
use crate::tenant::{
remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
use utils::{generation::Generation, id::TimelineId};
/// Serializable heatmap of one tenant: the uploading location's generation plus one entry
/// per timeline.
#[derive(Serialize, Deserialize)]
pub(super) struct HeatMapTenant {
/// Generation of the attached location that uploaded the heatmap: this is not required
/// for correctness, but acts as a hint to secondary locations in order to detect thrashing
/// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
pub(super) generation: Generation,
/// Per-timeline heatmaps of this tenant.
pub(super) timelines: Vec<HeatMapTimeline>,
}
/// Heatmap entry for a single timeline: its id and the layers recorded for it.
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTimeline {
// Serialized through the id's `Display`/`FromStr` implementations.
#[serde_as(as = "DisplayFromStr")]
pub(super) timeline_id: TimelineId,
/// Layers recorded for this timeline.
pub(super) layers: Vec<HeatMapLayer>,
}
/// Heatmap entry for a single layer file.
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
/// File name of the layer.
pub(super) name: LayerFileName,
/// Index metadata recorded for the layer.
pub(super) metadata: IndexLayerMetadata,
// Serialized as whole seconds (`TimestampSeconds<i64>`).
#[serde_as(as = "TimestampSeconds<i64>")]
pub(super) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}
impl HeatMapLayer {
pub(crate) fn new(
name: LayerFileName,
metadata: IndexLayerMetadata,
access_time: SystemTime,
) -> Self {
Self {
name,
metadata,
access_time,
}
}
}
impl HeatMapTimeline {
pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
Self {
timeline_id,
layers,
}
}
}

View File

@@ -1,582 +0,0 @@
use std::{
collections::HashMap,
sync::{Arc, Weak},
time::{Duration, Instant},
};
use crate::{
metrics::SECONDARY_MODE,
tenant::{
config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
},
};
use md5;
use pageserver_api::shard::TenantShardId;
use remote_storage::GenericRemoteStorage;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{backoff, completion::Barrier};
use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
/// Period between heatmap uploader walking Tenants to look for work to do.
/// If any tenants have a heatmap upload period lower than this, it will be adjusted
/// downward to match.
const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
/// Lower bound for the scheduling interval: when a tenant's heatmap period shortens the
/// interval, it is never reduced below this value.
const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
/// Bookkeeping for a tenant whose upload task has been spawned and has not yet been
/// reported as complete.
struct WriteInProgress {
/// Barrier that can be awaited to learn when the upload task has finished: the task
/// holds the matching completion guard for its whole lifetime.
barrier: Barrier,
}
/// A tenant that has been selected for an upload but for which no task has been spawned yet.
struct UploadPending {
/// The tenant to upload a heatmap for.
tenant: Arc<Tenant>,
/// Digest of the last successfully uploaded heatmap, if any; passed to the upload so
/// that an unchanged heatmap can be skipped.
last_digest: Option<md5::Digest>,
}
/// Result message sent by an upload task to the uploader's event loop when it finishes.
struct WriteComplete {
/// Tenant shard the upload task was run for.
tenant_shard_id: TenantShardId,
/// When the upload attempt finished.
completed_at: Instant,
/// Digest to remember for the tenant: the new digest after a successful upload,
/// otherwise the digest the task was started with.
digest: Option<md5::Digest>,
/// When the next upload is due; `None` if the tenant has no heatmap period or the
/// deadline could not be computed.
next_upload: Option<Instant>,
}
/// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
/// when we last did a write. We only populate this after doing at least one
/// write for a tenant -- this avoids holding state for tenants that have
/// uploads disabled.
struct UploaderTenantState {
// This Weak only exists to enable culling idle instances of this type
// when the Tenant has been deallocated.
tenant: Weak<Tenant>,
/// Digest of the serialized heatmap that we last successfully uploaded
///
/// md5 is generally a bad hash. We use it because it's convenient for interop with AWS S3's ETag,
/// which is also an md5sum.
last_digest: Option<md5::Digest>,
/// When the last upload attempt completed (may have been successful or failed)
last_upload: Option<Instant>,
/// When should we next do an upload? None means never.
///
/// Entries whose `next_upload` is `None` are removed at the start of each scheduling
/// iteration.
next_upload: Option<Instant>,
}
/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
/// handling loop and mutates it as needed: there are no locks here, because that event loop
/// can hold &mut references to this type throughout.
struct HeatmapUploader {
/// Source of the attached, active tenants that are inspected on each scheduling iteration.
tenant_manager: Arc<TenantManager>,
/// Remote storage that heatmaps are uploaded to.
remote_storage: GenericRemoteStorage,
/// Token checked to stop scheduling work when shutting down.
cancel: CancellationToken,
/// Per-tenant upload bookkeeping, keyed by tenant shard.
tenants: HashMap<TenantShardId, UploaderTenantState>,
/// Tenants with work to do, for which tasks should be spawned as soon as concurrency
/// limits permit it.
tenants_pending: std::collections::VecDeque<UploadPending>,
/// Tenants for which a task in `tasks` has been spawned.
tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
/// The spawned upload tasks; joined on shutdown.
tasks: JoinSet<()>,
/// Channel for our child tasks to send results to: we use a channel for results rather than
/// just getting task results via JoinSet because we need the channel's recv() "sleep until something
/// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
/// behavior.
task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
/// Maximum number of upload tasks that `spawn_pending` keeps in flight at once.
concurrent_uploads: usize,
/// Current period between scheduling iterations; starts at `DEFAULT_SCHEDULING_INTERVAL`
/// and is lowered when a tenant asks for more frequent uploads.
scheduling_interval: Duration,
}
/// The uploader task runs a loop that periodically wakes up and schedules tasks for
/// tenants that require an upload, or handles any commands that have been sent into
/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we
/// spawn.
///
/// Scheduling iterations are somewhat infrequent. However, each one will enqueue
/// all tenants that require an upload, and in between scheduling iterations we will
/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
///
/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
/// we might block waiting on a Tenant.
///
/// The loop does not start until `background_jobs_can_start` has been released. The function
/// returns `Ok(())` once `cancel` has fired, and propagates any error returned by a
/// scheduling iteration.
pub(super) async fn heatmap_uploader_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
let mut uploader = HeatmapUploader {
tenant_manager,
remote_storage,
cancel: cancel.clone(),
tasks: JoinSet::new(),
tenants: HashMap::new(),
tenants_pending: std::collections::VecDeque::new(),
tenants_uploading: HashMap::new(),
task_result_tx: result_tx,
task_result_rx: result_rx,
concurrent_uploads,
scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
};
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
// Outer loop: one pass per scheduling iteration, until cancelled.
while !cancel.is_cancelled() {
// Look for new work: this is relatively expensive because we have to go acquire the lock on
// the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
// require an upload.
uploader.schedule_iteration().await?;
// Between scheduling iterations, we will:
// - Drain any complete tasks and spawn pending tasks
// - Handle incoming administrative commands
// - Check our cancellation token
// The deadline is computed after schedule_iteration so that any reduction of
// `scheduling_interval` made during that iteration is already taken into account.
let next_scheduling_iteration = Instant::now()
.checked_add(uploader.scheduling_interval)
.unwrap_or_else(|| {
tracing::warn!(
"Scheduling interval invalid ({}s), running immediately!",
uploader.scheduling_interval.as_secs_f64()
);
Instant::now()
});
// Inner loop: react to events until the next scheduling iteration is due.
loop {
tokio::select! {
_ = cancel.cancelled() => {
// We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
tracing::info!("Heatmap uploader joining tasks");
while let Some(_r) = uploader.tasks.join_next().await {};
tracing::info!("Heatmap uploader terminating");
// Leaves the inner loop; the outer loop condition then sees the cancellation.
break;
},
_ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
break;},
cmd = command_queue.recv() => {
tracing::debug!("heatmap_uploader_task: woke for command queue");
let cmd = match cmd {
Some(c) =>c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("Heatmap uploader terminating");
cancel.cancel();
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
uploader.handle_command(payload, response_tx);
},
_ = uploader.process_next_completion() => {
// A finished upload frees a concurrency slot: start the next pending upload,
// unless we are shutting down.
if !cancel.is_cancelled() {
uploader.spawn_pending();
}
}
}
}
}
Ok(())
}
impl HeatmapUploader {
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
///
/// Drops stale per-tenant state and previously pending work, then walks the attached, active
/// tenants and enqueues those that are due for an upload, and finally spawns as many of the
/// pending uploads as the concurrency limit allows. Returns early with `Ok(())` if
/// cancellation is observed while walking the tenants.
async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
    // Cull any entries in self.tenants whose Arc<Tenant> is gone
    self.tenants
        .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());

    // The priority order of previously scheduled work may be invalidated by current state: drop
    // all pending work (it will be re-scheduled if still needed)
    self.tenants_pending.clear();

    // Used a fixed 'now' through the following loop, for efficiency and fairness.
    let now = Instant::now();

    // While iterating over the potentially-long list of tenants, we will periodically yield
    // to avoid blocking executor.
    const YIELD_ITERATIONS: usize = 1000;

    // Iterate over tenants looking for work to do.
    let tenants = self.tenant_manager.get_attached_active_tenant_shards();
    for (i, tenant) in tenants.into_iter().enumerate() {
        // Yield after every YIELD_ITERATIONS tenants. The check sits at the top of the loop
        // body so that the `continue` below cannot bypass it. (The previous form,
        // `i + 1 % YIELD_ITERATIONS == 0`, parsed as `i + (1 % YIELD_ITERATIONS) == 0` and
        // therefore never yielded.)
        if i > 0 && i % YIELD_ITERATIONS == 0 {
            tokio::task::yield_now().await;
        }

        // Process is shutting down, drop out
        if self.cancel.is_cancelled() {
            return Ok(());
        }

        // Skip tenants that already have a write in flight
        if self
            .tenants_uploading
            .contains_key(tenant.get_tenant_shard_id())
        {
            continue;
        }

        self.maybe_schedule_upload(&now, tenant);
    }

    // Spawn tasks for as many of our pending tenants as we can.
    self.spawn_pending();

    Ok(())
}
/// Waits for the next result from an upload task and records it via `on_completion`.
///
/// Cancellation: this method is cancel-safe.
async fn process_next_completion(&mut self) {
    let Some(completion) = self.task_result_rx.recv().await else {
        unreachable!("Result sender is stored on Self");
    };
    self.on_completion(completion);
}
/// The 'maybe' refers to the tenant's state: whether it is configured
/// for heatmap uploads at all, and whether sufficient time has passed
/// since the last upload.
///
/// `now` is the fixed timestamp of the current scheduling iteration. When the tenant is due,
/// it is appended to `tenants_pending`; no task is spawned here.
fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
    let Some(period) = tenant.get_heatmap_period() else {
        // Heatmaps are disabled for this tenant
        return;
    };

    // If any tenant has asked for uploads more frequent than our scheduling interval,
    // reduce it to match so that we can keep up. This is mainly useful in testing, where
    // we may set rather short intervals.
    if period < self.scheduling_interval {
        self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
    }

    // Stale attachments do not upload anything: if we are in this state, there is probably some
    // other attachment in mode Single or Multi running on another pageserver, and we don't
    // want to thrash and overwrite their heatmap uploads.
    if tenant.get_attach_mode() == AttachmentMode::Stale {
        return;
    }

    // Create an entry in self.tenants if one doesn't already exist: this will later be updated
    // with the completion time in on_completion.
    let state = self
        .tenants
        .entry(*tenant.get_tenant_shard_id())
        .or_insert_with(|| UploaderTenantState {
            tenant: Arc::downgrade(&tenant),
            last_upload: None,
            // A newly tracked tenant is due immediately. This must be the iteration's fixed
            // `now`: a fresh `Instant::now()` is later than `now`, which made the check below
            // decline the very first upload until the next scheduling iteration.
            next_upload: Some(*now),
            last_digest: None,
        });

    // Decline to do the upload if insufficient time has passed
    if matches!(state.next_upload, Some(next_upload) if next_upload > *now) {
        return;
    }

    let last_digest = state.last_digest;
    self.tenants_pending.push_back(UploadPending {
        tenant,
        last_digest,
    })
}
fn spawn_pending(&mut self) {
while !self.tenants_pending.is_empty()
&& self.tenants_uploading.len() < self.concurrent_uploads
{
// unwrap: loop condition includes !is_empty()
let pending = self.tenants_pending.pop_front().unwrap();
self.spawn_upload(pending.tenant, pending.last_digest);
}
}
/// Spawns a task that uploads the heatmap of `tenant` and registers the tenant in
/// `tenants_uploading`.
///
/// The task records metrics for the attempt and, when done, reports a [`WriteComplete`]
/// on the result channel. `last_digest` is carried over unchanged unless a new heatmap was
/// uploaded.
fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
    let tenant_shard_id = *tenant.get_tenant_shard_id();
    let remote_storage = self.remote_storage.clone();
    let result_tx = self.task_result_tx.clone();
    let (completion, barrier) = utils::completion::channel();

    self.tasks.spawn(async move {
        // Guard for the barrier in [`WriteInProgress`]
        let _completion = completion;

        let started_at = Instant::now();
        let outcome = upload_tenant_heatmap(remote_storage, &tenant, last_digest).await;
        let digest = match outcome {
            Ok(UploadHeatmapOutcome::Uploaded(new_digest)) => {
                SECONDARY_MODE
                    .upload_heatmap_duration
                    .observe(started_at.elapsed().as_secs_f64());
                SECONDARY_MODE.upload_heatmap.inc();
                Some(new_digest)
            }
            Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_digest,
            Err(UploadHeatmapError::Upload(e)) => {
                tracing::warn!(
                    "Failed to upload heatmap for tenant {}: {e:#}",
                    tenant.get_tenant_shard_id(),
                );
                SECONDARY_MODE
                    .upload_heatmap_duration
                    .observe(started_at.elapsed().as_secs_f64());
                SECONDARY_MODE.upload_heatmap_errors.inc();
                last_digest
            }
            Err(UploadHeatmapError::Cancelled) => {
                tracing::info!("Cancelled heatmap upload, shutting down");
                last_digest
            }
        };

        let completed_at = Instant::now();
        let next_upload = match tenant.get_heatmap_period() {
            Some(period) => completed_at.checked_add(period),
            None => None,
        };

        // A failed send is ignored.
        let report = WriteComplete {
            tenant_shard_id: *tenant.get_tenant_shard_id(),
            completed_at,
            digest,
            next_upload,
        };
        result_tx.send(report).ok();
    });

    self.tenants_uploading
        .insert(tenant_shard_id, WriteInProgress { barrier });
}
/// Records the result of a finished upload task: the tenant is no longer counted as
/// uploading, and its tracked state (if it still exists) is updated with the completion
/// time, digest and next deadline.
#[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
fn on_completion(&mut self, completion: WriteComplete) {
    tracing::debug!("Heatmap upload completed");

    let WriteComplete {
        tenant_shard_id,
        completed_at,
        digest,
        next_upload,
    } = completion;

    self.tenants_uploading.remove(&tenant_shard_id);

    // If the tenant state was dropped in the meantime, there is nothing to update.
    if let Some(state) = self.tenants.get_mut(&tenant_shard_id) {
        state.last_upload = Some(completed_at);
        state.last_digest = digest;
        state.next_upload = next_upload;
    }
}
/// Handles an administrative command and arranges for `response_tx` to be answered.
///
/// For an upload command: if an upload for the tenant shard is already in flight, the
/// response is sent once that upload finishes; otherwise a new upload is spawned right away
/// (with no previous digest) and the response is sent when it finishes. If the tenant shard
/// cannot be looked up as an attached, active tenant, the lookup error is sent back instead.
fn handle_command(
    &mut self,
    command: UploadCommand,
    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
) {
    let UploadCommand::Upload(tenant_shard_id) = command;

    let in_flight = self
        .tenants_uploading
        .get(&tenant_shard_id)
        .map(|writing_state| writing_state.barrier.clone());

    let barrier = match in_flight {
        // If an upload was ongoing for this tenant, let it finish first.
        Some(barrier) => {
            tracing::info!(
                tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
                "Waiting for heatmap write to complete");
            barrier
        }
        // Spawn the upload then immediately wait for it. This will block processing of other commands and
        // starting of other background work.
        None => {
            tracing::info!(
                tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
                "Starting heatmap write on command");
            let lookup = self
                .tenant_manager
                .get_attached_tenant_shard(tenant_shard_id, true);
            let tenant = match lookup {
                Ok(tenant) => tenant,
                Err(e) => {
                    // Drop result of send: we don't care if caller dropped their receiver
                    drop(response_tx.send(CommandResponse {
                        result: Err(e.into()),
                    }));
                    return;
                }
            };
            self.spawn_upload(tenant, None);

            let barrier = self
                .tenants_uploading
                .get(&tenant_shard_id)
                .expect("We just inserted this")
                .barrier
                .clone();
            tracing::info!(
                tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
                "Waiting for heatmap upload to complete");
            barrier
        }
    };

    // This task does no I/O: it only listens for a barrier's completion and then
    // sends to the command response channel. It is therefore safe to spawn this without
    // any gates/task_mgr hooks.
    tokio::task::spawn(async move {
        barrier.wait().await;

        tracing::info!(
            tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
            "Heatmap upload complete");

        // Drop result of send: we don't care if caller dropped their receiver
        drop(response_tx.send(CommandResponse { result: Ok(()) }))
    });
}
}
/// Non-error results of a single heatmap upload attempt.
enum UploadHeatmapOutcome {
/// We successfully wrote to remote storage, with this digest.
Uploaded(md5::Digest),
/// We did not upload because the heatmap digest was unchanged since the last upload
NoChange,
/// We skipped the upload for some reason, such as tenant/timeline not ready
Skipped,
}
/// Ways in which a single heatmap upload attempt can fail.
#[derive(thiserror::Error, Debug)]
enum UploadHeatmapError {
/// The upload was abandoned because the tenant is shutting down.
#[error("Cancelled")]
Cancelled,
/// Any other failure, such as a serialization or remote storage error.
#[error(transparent)]
Upload(#[from] anyhow::Error),
}
/// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
/// of the object we would have uploaded.
///
/// Builds a [`HeatMapTenant`] from every timeline of `tenant`, serializes it to JSON and writes
/// it to `remote_storage` at the tenant's heatmap path, retrying failed uploads.
///
/// # Returns
///
/// - `Ok(UploadHeatmapOutcome::Uploaded(digest))` after a successful write; `digest` is the MD5
///   of the uploaded bytes and is meant to be passed back as `last_digest` next time.
/// - `Ok(UploadHeatmapOutcome::NoChange)` if the serialized heatmap hashes to `last_digest`.
/// - `Ok(UploadHeatmapOutcome::Skipped)` if the tenant has no generation or any timeline could
///   not produce a heatmap.
///
/// # Errors
///
/// - [`UploadHeatmapError::Cancelled`] if the tenant gate is already closed, or the upload
///   failed while the tenant's cancellation token was cancelled.
/// - [`UploadHeatmapError::Upload`] for serialization failures and for upload failures that
///   persisted through the retries.
#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
async fn upload_tenant_heatmap(
    remote_storage: GenericRemoteStorage,
    tenant: &Arc<Tenant>,
    last_digest: Option<md5::Digest>,
) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
    debug_assert_current_span_has_tenant_id();

    let generation = tenant.get_generation();
    if generation.is_none() {
        // We do not expect this: generations were implemented before heatmap uploads.  However,
        // handle it so that we don't have to make the generation in the heatmap an Option<>
        // (Generation::none is not serializable)
        tracing::warn!("Skipping heatmap upload for tenant with generation==None");
        return Ok(UploadHeatmapOutcome::Skipped);
    }

    let mut heatmap = HeatMapTenant {
        timelines: Vec::new(),
        generation,
    };

    // Snapshot the timeline map so that the std mutex is not held across the awaits below.
    let timelines = tenant.timelines.lock().unwrap().clone();

    let tenant_cancel = tenant.cancel.clone();

    // Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
    // when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
    // in remote storage.
    let _guard = match tenant.gate.enter() {
        Ok(g) => g,
        Err(_) => {
            tracing::info!("Skipping heatmap upload for tenant which is shutting down");
            return Err(UploadHeatmapError::Cancelled);
        }
    };

    // A single timeline that is not ready makes the whole tenant heatmap incomplete, so we skip
    // this round entirely rather than uploading a partial heatmap.
    for (timeline_id, timeline) in timelines {
        let Some(heatmap_timeline) = timeline.generate_heatmap().await else {
            tracing::debug!(
                "Skipping heatmap upload because timeline {timeline_id} is not ready"
            );
            return Ok(UploadHeatmapOutcome::Skipped);
        };
        heatmap.timelines.push(heatmap_timeline);
    }

    // Serialize the heatmap
    let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
    let size = bytes.len();

    // Drop out early if nothing changed since our last upload
    let digest = md5::compute(&bytes);
    if Some(digest) == last_digest {
        return Ok(UploadHeatmapOutcome::NoChange);
    }

    // Convert to `Bytes` once, outside the retry loop: cloning a `Bytes` only bumps a reference
    // count, whereas cloning the `Vec<u8>` on every attempt would copy the whole heatmap again.
    let payload = bytes::Bytes::from(bytes);

    let path = remote_heatmap_path(tenant.get_tenant_shard_id());

    // Write the heatmap.
    tracing::debug!("Uploading {size} byte heatmap to {path}");
    if let Err(e) = backoff::retry(
        || async {
            let stream = futures::stream::once(futures::future::ready(Ok(payload.clone())));
            remote_storage
                .upload_storage_object(stream, size, &path)
                .await
        },
        // No error is treated as permanent: every failure is eligible for retry.
        |_| false,
        3,
        u32::MAX,
        "Uploading heatmap",
        backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
    )
    .await
    {
        // Distinguish "we were asked to stop" from a genuine upload failure, so that callers
        // do not report shutdown as an error.
        if tenant_cancel.is_cancelled() {
            return Err(UploadHeatmapError::Cancelled);
        } else {
            return Err(e.into());
        }
    }

    tracing::info!("Successfully uploaded {size} byte heatmap to {path}");

    Ok(UploadHeatmapOutcome::Uploaded(digest))
}

View File

@@ -4,7 +4,7 @@ pub mod delta_layer;
mod filename;
pub mod image_layer;
mod inmemory_layer;
pub(crate) mod layer;
mod layer;
mod layer_desc;
use crate::context::{AccessStatsBehavior, RequestContext};

View File

@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
use std::ops::Range;
use tokio::sync::{RwLock, RwLockWriteGuard};
use tokio::sync::RwLock;
use super::{DeltaLayerWriter, ResidentLayer};
@@ -252,37 +252,10 @@ impl InMemoryLayer {
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
self.put_value_locked(&mut inner, key, lsn, val, ctx).await
}
pub async fn put_values(
&self,
values: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> Result<()> {
let mut inner = self.inner.write().await;
self.assert_writable();
for (key, vals) in values {
for (lsn, val) in vals {
self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
.await?;
}
}
Ok(())
}
async fn put_value_locked(
&self,
locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
key: Key,
lsn: Lsn,
val: &Value,
ctx: &RequestContext,
) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
let inner: &mut _ = &mut *self.inner.write().await;
self.assert_writable();
let off = {
// Avoid doing allocations for "small" values.
@@ -291,7 +264,7 @@ impl InMemoryLayer {
let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
buf.clear();
val.ser_into(&mut buf)?;
locked_inner
inner
.file
.write_blob(
&buf,
@@ -302,7 +275,7 @@ impl InMemoryLayer {
.await?
};
let vec_map = locked_inner.index.entry(key).or_default();
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
if old.is_some() {
// We already had an entry for this LSN. That's odd..
@@ -318,10 +291,6 @@ impl InMemoryLayer {
Ok(())
}
/// Placeholder: the `_key_ranges` argument is ignored and `Ok(())` is always returned.
pub async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
Ok(())
}
/// Make the layer non-writeable. Only call once.
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive

View File

@@ -222,8 +222,8 @@ impl Layer {
///
/// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
/// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
pub(crate) fn delete_on_drop(&self) {
self.0.delete_on_drop();
pub(crate) fn garbage_collect_on_drop(&self) {
self.0.garbage_collect_on_drop();
}
/// Return data needed to reconstruct given page at LSN.
@@ -331,10 +331,10 @@ impl Layer {
Ok(())
}
/// Waits until this layer has been dropped (and if needed, local file deletion and remote
/// Waits until this layer has been dropped (and if needed, local garbage collection and remote
/// deletion scheduling has completed).
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
/// separately.
#[cfg(feature = "testing")]
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
@@ -423,8 +423,8 @@ struct LayerInner {
/// Initialization and deinitialization are done while holding a permit.
inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
/// Do we want to delete locally and remotely this when `LayerInner` is dropped
wanted_deleted: AtomicBool,
/// Do we want to garbage collect this when `LayerInner` is dropped
wanted_garbage_collected: AtomicBool,
/// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
/// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
@@ -438,6 +438,10 @@ struct LayerInner {
version: AtomicUsize,
/// Allow subscribing to when the layer actually gets evicted.
///
/// If in future we need to implement "wait until layer instances are gone and done", carrying
/// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
/// method for "wait_gc" which will wait for this to be closed.
status: tokio::sync::broadcast::Sender<Status>,
/// Counter for exponential backoff with the download
@@ -479,14 +483,14 @@ enum Status {
impl Drop for LayerInner {
fn drop(&mut self) {
if !*self.wanted_deleted.get_mut() {
if !*self.wanted_garbage_collected.get_mut() {
// should we try to evict if the last wish was for eviction?
// feels like there's some hazard of overcrowding near shutdown near by, but we don't
// run drops during shutdown (yet)
return;
}
let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
let path = std::mem::take(&mut self.path);
let file_name = self.layer_desc().filename();
@@ -513,8 +517,8 @@ impl Drop for LayerInner {
false
}
Err(e) => {
tracing::error!("failed to remove wanted deleted layer: {e}");
LAYER_IMPL_METRICS.inc_delete_removes_failed();
tracing::error!("failed to remove garbage collected layer: {e}");
LAYER_IMPL_METRICS.inc_gc_removes_failed();
false
}
};
@@ -536,15 +540,15 @@ impl Drop for LayerInner {
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
} else {
LAYER_IMPL_METRICS.inc_completed_deletes();
LAYER_IMPL_METRICS.inc_completed_gcs();
}
}
} else {
// no need to nag that timeline is gone: under normal situation on
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
}
});
}
@@ -579,7 +583,7 @@ impl LayerInner {
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
access_stats,
wanted_deleted: AtomicBool::new(false),
wanted_garbage_collected: AtomicBool::new(false),
wanted_evicted: AtomicBool::new(false),
inner,
version: AtomicUsize::new(version),
@@ -590,13 +594,16 @@ impl LayerInner {
}
}
fn delete_on_drop(&self) {
let res =
self.wanted_deleted
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
fn garbage_collect_on_drop(&self) {
let res = self.wanted_garbage_collected.compare_exchange(
false,
true,
Ordering::Release,
Ordering::Relaxed,
);
if res.is_ok() {
LAYER_IMPL_METRICS.inc_started_deletes();
LAYER_IMPL_METRICS.inc_started_gcs();
}
}
@@ -664,10 +671,6 @@ impl LayerInner {
// disable any scheduled but not yet running eviction deletions for this
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
// count cancellations, which currently remain largely unexpected
let init_cancelled =
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));
@@ -676,8 +679,6 @@ impl LayerInner {
.upgrade()
.ok_or_else(|| DownloadError::TimelineShutdown)?;
// FIXME: grab a gate
let can_ever_evict = timeline.remote_client.as_ref().is_some();
// check if we really need to be downloaded; could have been already downloaded by a
@@ -738,8 +739,6 @@ impl LayerInner {
tracing::info!(waiters, "completing the on-demand download for other tasks");
}
scopeguard::ScopeGuard::into_inner(init_cancelled);
Ok((ResidentOrWantedEvicted::Resident(res), permit))
};
@@ -837,7 +836,7 @@ impl LayerInner {
crate::task_mgr::spawn(
&tokio::runtime::Handle::current(),
crate::task_mgr::TaskKind::RemoteDownloadTask,
Some(self.desc.tenant_shard_id),
Some(self.desc.tenant_shard_id.tenant_id),
Some(self.desc.timeline_id),
&task_name,
false,
@@ -868,13 +867,14 @@ impl LayerInner {
match res {
(Ok(()), _) => {
// our caller is cancellation safe so this is fine; if someone
// else requests the layer, they'll find it already downloaded.
// else requests the layer, they'll find it already downloaded
// or redownload.
//
// See counter [`LayerImplMetrics::inc_init_needed_no_download`]
//
// FIXME(#6028): however, could be that we should consider marking the
// layer for eviction? alas, cannot: because only DownloadedLayer will
// handle that.
// however, could be that we should consider marking the layer
// for eviction? alas, cannot: because only DownloadedLayer
// will handle that.
tracing::info!("layer file download completed after requester had cancelled");
LAYER_IMPL_METRICS.inc_download_completed_without_requester();
},
(Err(e), _) => {
// our caller is cancellation safe, but we might be racing with
@@ -994,15 +994,12 @@ impl LayerInner {
/// `DownloadedLayer` is being dropped, so it calls this method.
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
let delete = self.wanted_deleted.load(Ordering::Acquire);
let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
let evict = self.wanted_evicted.load(Ordering::Acquire);
let can_evict = self.have_remote_client;
if delete {
// do nothing now, only in LayerInner::drop -- this was originally implemented because
// we could had already scheduled the deletion at the time.
//
// FIXME: this is not true anymore, we can safely evict wanted deleted files.
if gc {
// do nothing now, only in LayerInner::drop
} else if can_evict && evict {
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
@@ -1017,7 +1014,7 @@ impl LayerInner {
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
let _g = span.entered();
// if LayerInner is already dropped here, do nothing because the delete on drop
// if LayerInner is already dropped here, do nothing because the garbage collection
// has already run while we were in queue
let Some(this) = this.upgrade() else {
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
@@ -1408,38 +1405,36 @@ impl From<ResidentLayer> for Layer {
}
}
use metrics::IntCounter;
use metrics::{IntCounter, IntCounterVec};
pub(crate) struct LayerImplMetrics {
struct LayerImplMetrics {
started_evictions: IntCounter,
completed_evictions: IntCounter,
cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
cancelled_evictions: IntCounterVec,
started_deletes: IntCounter,
completed_deletes: IntCounter,
failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
started_gcs: IntCounter,
completed_gcs: IntCounter,
failed_gcs: IntCounterVec,
rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
rare_counters: IntCounterVec,
}
impl Default for LayerImplMetrics {
fn default() -> Self {
use enum_map::Enum;
// reminder: these will be pageserver_layer_* with "_total" suffix
let started_evictions = metrics::register_int_counter!(
"pageserver_layer_started_evictions",
"Evictions started in the Layer implementation"
)
.unwrap();
let completed_evictions = metrics::register_int_counter!(
"pageserver_layer_completed_evictions",
"Evictions completed in the Layer implementation"
let evictions = metrics::register_int_counter_vec!(
"pageserver_layer_evictions_count",
"Evictions started and completed in the Layer implementation",
&["state"]
)
.unwrap();
let started_evictions = evictions
.get_metric_with_label_values(&["started"])
.unwrap();
let completed_evictions = evictions
.get_metric_with_label_values(&["completed"])
.unwrap();
let cancelled_evictions = metrics::register_int_counter_vec!(
"pageserver_layer_cancelled_evictions_count",
"Different reasons for evictions to have been cancelled or failed",
@@ -1447,36 +1442,24 @@ impl Default for LayerImplMetrics {
)
.unwrap();
let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let reason = EvictionCancelled::from_usize(i);
let s = reason.as_str();
cancelled_evictions.with_label_values(&[s])
}));
let started_deletes = metrics::register_int_counter!(
"pageserver_layer_started_deletes",
"Deletions on drop pending in the Layer implementation"
)
.unwrap();
let completed_deletes = metrics::register_int_counter!(
"pageserver_layer_completed_deletes",
"Deletions on drop completed in the Layer implementation"
// reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
let gcs = metrics::register_int_counter_vec!(
"pageserver_layer_gcs_count",
"Garbage collections started and completed in the Layer implementation",
&["state"]
)
.unwrap();
let failed_deletes = metrics::register_int_counter_vec!(
"pageserver_layer_failed_deletes_count",
"Different reasons for deletions on drop to have failed",
let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();
let failed_gcs = metrics::register_int_counter_vec!(
"pageserver_layer_failed_gcs_count",
"Different reasons for garbage collections to have failed",
&["reason"]
)
.unwrap();
let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let reason = DeleteFailed::from_usize(i);
let s = reason.as_str();
failed_deletes.with_label_values(&[s])
}));
let rare_counters = metrics::register_int_counter_vec!(
"pageserver_layer_assumed_rare_count",
"Times unexpected or assumed rare event happened",
@@ -1484,29 +1467,16 @@ impl Default for LayerImplMetrics {
)
.unwrap();
let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let event = RareEvent::from_usize(i);
let s = event.as_str();
rare_counters.with_label_values(&[s])
}));
let inits_cancelled = metrics::register_int_counter!(
"pageserver_layer_inits_cancelled_count",
"Times Layer initialization was cancelled",
)
.unwrap();
Self {
started_evictions,
completed_evictions,
cancelled_evictions,
started_deletes,
completed_deletes,
failed_deletes,
started_gcs,
completed_gcs,
failed_gcs,
rare_counters,
inits_cancelled,
}
}
}
@@ -1519,33 +1489,57 @@ impl LayerImplMetrics {
self.completed_evictions.inc();
}
fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
self.cancelled_evictions[reason].inc()
self.cancelled_evictions
.get_metric_with_label_values(&[reason.as_str()])
.unwrap()
.inc()
}
fn inc_started_deletes(&self) {
self.started_deletes.inc();
fn inc_started_gcs(&self) {
self.started_gcs.inc();
}
fn inc_completed_deletes(&self) {
self.completed_deletes.inc();
fn inc_completed_gcs(&self) {
self.completed_gcs.inc();
}
fn inc_deletes_failed(&self, reason: DeleteFailed) {
self.failed_deletes[reason].inc();
fn inc_gcs_failed(&self, reason: GcFailed) {
self.failed_gcs
.get_metric_with_label_values(&[reason.as_str()])
.unwrap()
.inc();
}
/// Counted separately from failed layer deletes because we will complete the layer deletion
/// attempt regardless of failure to delete local file.
fn inc_delete_removes_failed(&self) {
self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
/// Counted separately from failed gcs because we will complete the gc attempt regardless of
/// failure to delete local file.
fn inc_gc_removes_failed(&self) {
self.rare_counters
.get_metric_with_label_values(&["gc_remove_failed"])
.unwrap()
.inc();
}
/// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
/// Expected rare because requires a race with `evict_blocking` and
/// `get_or_maybe_download`.
fn inc_retried_get_or_maybe_download(&self) {
self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
self.rare_counters
.get_metric_with_label_values(&["retried_gomd"])
.unwrap()
.inc();
}
/// Expected rare because cancellations are unexpected, and failures are unexpected
/// Expected rare because cancellations are unexpected
fn inc_download_completed_without_requester(&self) {
self.rare_counters
.get_metric_with_label_values(&["download_completed_without"])
.unwrap()
.inc();
}
/// Expected rare because cancellations are unexpected
fn inc_download_failed_without_requester(&self) {
self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
self.rare_counters
.get_metric_with_label_values(&["download_failed_without"])
.unwrap()
.inc();
}
/// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1553,30 +1547,37 @@ impl LayerImplMetrics {
/// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
/// Option.
fn inc_raced_wanted_evicted_accesses(&self) {
self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
self.rare_counters
.get_metric_with_label_values(&["raced_wanted_evicted"])
.unwrap()
.inc();
}
/// These are only expected for [`Self::inc_init_cancelled`] amount when
/// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
/// running with remote storage.
fn inc_init_needed_no_download(&self) {
self.rare_counters[RareEvent::InitWithoutDownload].inc();
self.rare_counters
.get_metric_with_label_values(&["init_needed_no_download"])
.unwrap()
.inc();
}
/// Expected rare because all layer files should be readable and good
fn inc_permanent_loading_failures(&self) {
self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
self.rare_counters
.get_metric_with_label_values(&["permanent_loading_failure"])
.unwrap()
.inc();
}
fn inc_broadcast_lagged(&self) {
self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
}
fn inc_init_cancelled(&self) {
self.inits_cancelled.inc()
self.rare_counters
.get_metric_with_label_values(&["broadcast_lagged"])
.unwrap()
.inc();
}
}
#[derive(enum_map::Enum)]
enum EvictionCancelled {
LayerGone,
TimelineGone,
@@ -1605,47 +1606,19 @@ impl EvictionCancelled {
}
}
#[derive(enum_map::Enum)]
enum DeleteFailed {
enum GcFailed {
TimelineGone,
DeleteSchedulingFailed,
}
impl DeleteFailed {
impl GcFailed {
fn as_str(&self) -> &'static str {
match self {
DeleteFailed::TimelineGone => "timeline_gone",
DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
GcFailed::TimelineGone => "timeline_gone",
GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
}
}
}
/// Events counted by the `pageserver_layer_assumed_rare_count` metric, one counter per variant.
/// The label value used for each variant comes from `RareEvent::as_str`.
#[derive(enum_map::Enum)]
enum RareEvent {
// Removing the local file of a layer wanted deleted failed during drop.
RemoveOnDropFailed,
// `get_or_maybe_download` had to retry after racing with `evict_blocking`.
RetriedGetOrMaybeDownload,
// A download failed after its requester was no longer waiting for it.
DownloadFailedWithoutRequester,
// The Weak in `ResidentOrWantedEvicted::WantedEvicted` was successfully upgraded.
UpgradedWantedEvicted,
// Layer initialization completed without needing a download.
InitWithoutDownload,
// A layer file could not be loaded; all layer files are expected to be readable.
PermanentLoadingFailure,
// Counted by `inc_broadcast_lagged`.
EvictAndWaitLagged,
}
impl RareEvent {
fn as_str(&self) -> &'static str {
use RareEvent::*;
match self {
RemoveOnDropFailed => "remove_on_drop_failed",
RetriedGetOrMaybeDownload => "retried_gomd",
DownloadFailedWithoutRequester => "download_failed_without",
UpgradedWantedEvicted => "raced_wanted_evicted",
InitWithoutDownload => "init_needed_no_download",
PermanentLoadingFailure => "permanent_loading_failure",
EvictAndWaitLagged => "broadcast_lagged",
}
}
}
pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
once_cell::sync::Lazy::new(LayerImplMetrics::default);

View File

@@ -63,10 +63,12 @@ pub(crate) async fn concurrent_background_tasks_rate_limit(
_ctx: &RequestContext,
cancel: &CancellationToken,
) -> Result<impl Drop, RateLimitError> {
let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
.with_label_values(&[loop_kind.as_static_str()])
.guard();
.inc();
scopeguard::defer!(
crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
);
tokio::select! {
permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
match permit {
@@ -85,13 +87,13 @@ pub fn start_background_loops(
tenant: &Arc<Tenant>,
background_jobs_can_start: Option<&completion::Barrier>,
) {
let tenant_shard_id = tenant.tenant_shard_id;
let tenant_id = tenant.tenant_shard_id.tenant_id;
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,
Some(tenant_shard_id),
Some(tenant_id),
None,
&format!("compactor for tenant {tenant_shard_id}"),
&format!("compactor for tenant {tenant_id}"),
false,
{
let tenant = Arc::clone(tenant);
@@ -103,7 +105,7 @@ pub fn start_background_loops(
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
compaction_loop(tenant, cancel)
.instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
.await;
Ok(())
}
@@ -112,9 +114,9 @@ pub fn start_background_loops(
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::GarbageCollector,
Some(tenant_shard_id),
Some(tenant_id),
None,
&format!("garbage collector for tenant {tenant_shard_id}"),
&format!("garbage collector for tenant {tenant_id}"),
false,
{
let tenant = Arc::clone(tenant);
@@ -126,7 +128,7 @@ pub fn start_background_loops(
_ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
};
gc_loop(tenant, cancel)
.instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
.await;
Ok(())
}

View File

@@ -29,7 +29,7 @@ use tokio::{
};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::sync::gate::Gate;
use utils::{id::TenantTimelineId, sync::gate::Gate};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::ops::{Deref, Range};
@@ -66,7 +66,7 @@ use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
use pageserver_api::reltag::RelTag;
@@ -77,7 +77,7 @@ use postgres_ffi::to_pg_timestamp;
use utils::{
completion,
generation::Generation,
id::TimelineId,
id::{TenantId, TimelineId},
lsn::{AtomicLsn, Lsn, RecordLsn},
seqwait::SeqWait,
simple_rcu::{Rcu, RcuReadGuard},
@@ -98,9 +98,8 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -378,6 +377,9 @@ pub enum PageReconstructError {
#[error(transparent)]
Other(#[from] anyhow::Error),
/// The operation would require downloading a layer that is missing locally.
NeedsDownload(TenantTimelineId, LayerFileName),
/// The operation was cancelled
Cancelled,
@@ -406,6 +408,14 @@ impl std::fmt::Debug for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
write!(
f,
"layer {}/{} needs download",
tenant_timeline_id,
layer_file_name.file_name()
)
}
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -419,6 +429,14 @@ impl std::fmt::Display for PageReconstructError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
match self {
Self::Other(err) => err.fmt(f),
Self::NeedsDownload(tenant_timeline_id, layer_file_name) => {
write!(
f,
"layer {}/{} needs download",
tenant_timeline_id,
layer_file_name.file_name()
)
}
Self::Cancelled => write!(f, "cancelled"),
Self::AncestorStopping(timeline_id) => {
write!(f, "ancestor timeline {timeline_id} is being stopped")
@@ -460,7 +478,7 @@ impl Timeline {
.map(|ancestor| ancestor.timeline_id)
}
/// Lock and get timeline's GC cutoff
/// Lock and get timeline's GC cutoff
pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
}
@@ -908,7 +926,7 @@ impl Timeline {
tracing::debug!("Waiting for WalReceiverManager...");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
)
.await;
@@ -959,7 +977,7 @@ impl Timeline {
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
)
.await;
@@ -977,7 +995,12 @@ impl Timeline {
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await;
task_mgr::shutdown_tasks(
None,
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
)
.await;
// Finally wait until any gate-holders are complete
self.gate.close().await;
@@ -1100,9 +1123,8 @@ impl Timeline {
Ok(Some(true))
}
/// Evict just one layer.
///
/// Returns `Ok(None)` in the case where the layer could not be found by its `layer_file_name`.
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
@@ -1113,17 +1135,109 @@ impl Timeline {
return Ok(None);
};
let rtc = self
let Some(local_layer) = local_layer.keep_resident().await? else {
return Ok(Some(false));
};
let local_layer: Layer = local_layer.into();
let remote_client = self
.remote_client
.as_ref()
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
match local_layer.evict_and_wait(rtc).await {
Ok(()) => Ok(Some(true)),
Err(EvictionError::NotFound) => Ok(Some(false)),
Err(EvictionError::Downloaded) => Ok(Some(false)),
let results = self
.evict_layer_batch(remote_client, &[local_layer])
.await?;
assert_eq!(results.len(), 1);
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
match result {
None => anyhow::bail!("task_mgr shutdown requested"),
Some(Ok(())) => Ok(Some(true)),
Some(Err(e)) => Err(anyhow::Error::new(e)),
}
}
/// Evict a batch of layers.
///
/// Enters the timeline gate for the duration of the call and requires the timeline to have a
/// remote client; the actual work is delegated to `evict_layer_batch`, whose per-layer result
/// vector is returned unchanged.
///
/// Fails with "Shutting down" if the gate can no longer be entered, and with
/// "timeline must have RemoteTimelineClient" if no remote client is configured.
pub(crate) async fn evict_layers(
    &self,
    layers_to_evict: &[Layer],
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
    // The guard must stay bound (not `_`) so the gate is held until we return.
    let Ok(_gate) = self.gate.enter() else {
        return Err(anyhow::anyhow!("Shutting down"));
    };

    let remote_client = self
        .remote_client
        .as_ref()
        .context("timeline must have RemoteTimelineClient")?;

    let results = self.evict_layer_batch(remote_client, layers_to_evict).await;
    results
}
/// Evict multiple layers at once, continuing through errors.
///
/// The `remote_client` should be this timeline's `self.remote_client`.
/// We make the caller provide it so that they are responsible for handling the case
/// where someone wants to evict the layer but no remote storage is configured.
///
/// Returns either `Err()` or `Ok(results)` where `results.len() == layers_to_evict.len()`.
/// If `Err()` is returned, no eviction was attempted.
/// Each position of `Ok(results)` corresponds to the layer in `layers_to_evict`.
/// Meaning of each `result[i]`:
/// - `Some(Err(...))` if layer replacement failed for some reason
/// - replacement failed for an expectable reason (e.g., layer removed by GC before we grabbed all locks)
/// - `Some(Ok(()))` if everything went well.
/// - `None` if no eviction attempt was made for the layer because `cancel.is_cancelled() == true`.
async fn evict_layer_batch(
&self,
remote_client: &Arc<RemoteTimelineClient>,
layers_to_evict: &[Layer],
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
{
// to avoid racing with detach and delete_timeline
let state = self.current_state();
anyhow::ensure!(
state == TimelineState::Active,
"timeline is not active but {state:?}"
);
}
let mut results = Vec::with_capacity(layers_to_evict.len());
for _ in 0..layers_to_evict.len() {
results.push(None);
}
let mut js = tokio::task::JoinSet::new();
for (i, l) in layers_to_evict.iter().enumerate() {
js.spawn({
let l = l.to_owned();
let remote_client = remote_client.clone();
async move { (i, l.evict_and_wait(&remote_client).await) }
});
}
let join = async {
while let Some(next) = js.join_next().await {
match next {
Ok((i, res)) => results[i] = Some(res),
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => { /* already logged */ }
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
}
};
tokio::select! {
_ = self.cancel.cancelled() => {},
_ = join => {}
}
assert_eq!(results.len(), layers_to_evict.len());
Ok(results)
}
}
/// Number of times we will compute partition within a checkpoint distance.
@@ -1200,20 +1314,16 @@ impl Timeline {
&self.conf.default_tenant_conf,
);
// TODO(sharding): make evictions state shard aware
// (https://github.com/neondatabase/neon/issues/5953)
let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
let timeline_id_str = self.timeline_id.to_string();
self.metrics
.evictions_with_low_residence_duration
.write()
.unwrap()
.change_threshold(
&tenant_id_str,
&shard_id_str,
&timeline_id_str,
new_threshold,
);
.change_threshold(&tenant_id_str, &timeline_id_str, new_threshold);
}
}
@@ -1285,7 +1395,7 @@ impl Timeline {
ancestor_lsn: metadata.ancestor_lsn(),
metrics: TimelineMetrics::new(
&tenant_shard_id,
&tenant_shard_id.tenant_id,
&timeline_id,
crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
"mtime",
@@ -1386,7 +1496,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
@@ -1445,7 +1555,6 @@ impl Timeline {
max_lsn_wal_lag,
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
availability_zone: self.conf.availability_zone.clone(),
ingest_batch_size: self.conf.ingest_batch_size,
},
broker_client,
ctx,
@@ -1738,7 +1847,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::InitialLogicalSizeCalculation,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"initial size calculation",
false,
@@ -1911,7 +2020,7 @@ impl Timeline {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::OndemandLogicalSizeCalculation,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"ondemand logical size calculation",
false,
@@ -2057,55 +2166,6 @@ impl Timeline {
None
}
/// Build the heatmap for this timeline: a hint from the primary location to
/// secondary locations, indicating which layers are currently on-disk on the
/// primary.
///
/// Returns `None` when uploading a heatmap doesn't make sense for this
/// timeline: there is no remote client, or layer metadata could not be
/// obtained (e.g. while shutting down or initializing). The caller should
/// treat that as a cue to skip heatmap uploading for this timeline.
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
    let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;

    let remote_client = self.remote_client.as_ref()?;

    let resident_names: Vec<_> = eviction_info
        .resident_layers
        .iter()
        .map(|resident| resident.layer.layer_desc().filename())
        .collect();

    // Getting metadata only fails on Timeline in bad state.
    let remote_metadata = remote_client.get_layers_metadata(resident_names).ok()?;

    // Pair every resident layer with its remote metadata; layers without
    // remote metadata are left out of the heatmap.
    let heatmap_layers = eviction_info
        .resident_layers
        .into_iter()
        .zip(remote_metadata)
        .filter_map(|(resident, metadata)| {
            let metadata = metadata?;
            Some(HeatMapLayer::new(
                resident.layer.layer_desc().filename(),
                IndexLayerMetadata::from(metadata),
                resident.last_activity_ts,
            ))
        });

    Some(HeatMapTimeline::new(
        self.timeline_id,
        heatmap_layers.collect(),
    ))
}
}
type TraversalId = String;
@@ -2219,7 +2279,7 @@ impl Timeline {
}
// Recurse into ancestor if needed
if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
trace!(
"going into ancestor {}, cont_lsn is {}",
timeline.ancestor_lsn,
@@ -2401,7 +2461,13 @@ impl Timeline {
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
// We should look at the key to determine if it's a cacheable object
let (lsn, read_guard) = cache
.lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
.lookup_materialized_page(
self.tenant_shard_id.tenant_id,
self.timeline_id,
key,
lsn,
ctx,
)
.await?;
let img = Bytes::from(read_guard.to_vec());
Some((lsn, img))
@@ -2452,37 +2518,13 @@ impl Timeline {
Ok(())
}
/// Write a whole batch of values into a single layer.
///
/// The target layer is obtained from `get_layer_for_write` using the first
/// LSN found in the batch. A batch that contains no LSN at all is a no-op.
async fn put_values(
    &self,
    values: &HashMap<Key, Vec<(Lsn, Value)>>,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    // Pick the first LSN in the batch to get the layer to write to.
    let first_lsn = values
        .values()
        .find_map(|lsns| lsns.first().map(|(lsn, _)| *lsn));

    if let Some(lsn) = first_lsn {
        let layer = self.get_layer_for_write(lsn).await?;
        layer.put_values(values, ctx).await?;
    }
    Ok(())
}
/// Forward a tombstone for `key_range` at `lsn` to the layer that
/// `get_layer_for_write` returns for that LSN.
async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
    self.get_layer_for_write(lsn)
        .await?
        .put_tombstone(key_range, lsn)
        .await?;
    Ok(())
}
/// Forward a batch of tombstones to a single layer, chosen by the LSN of the
/// first entry in the batch. An empty batch is a no-op.
async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
    let (_, first_lsn) = match tombstones.first() {
        Some(entry) => entry,
        // Nothing to write.
        None => return Ok(()),
    };
    let layer = self.get_layer_for_write(*first_lsn).await?;
    layer.put_tombstones(tombstones).await?;
    Ok(())
}
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.metrics.last_record_gauge.set(new_lsn.0 as i64);
@@ -3167,7 +3209,7 @@ impl DurationRecorder {
#[derive(Default)]
struct CompactLevel0Phase1StatsBuilder {
version: Option<u64>,
tenant_id: Option<TenantShardId>,
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
read_lock_acquisition_micros: DurationRecorder,
read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
@@ -3184,7 +3226,7 @@ struct CompactLevel0Phase1StatsBuilder {
#[derive(serde::Serialize)]
struct CompactLevel0Phase1Stats {
version: u64,
tenant_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
read_lock_acquisition_micros: RecordedDuration,
read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
@@ -3703,7 +3745,7 @@ impl Timeline {
let ctx = ctx.attached_child();
let mut stats = CompactLevel0Phase1StatsBuilder {
version: Some(2),
tenant_id: Some(self.tenant_shard_id),
tenant_id: Some(self.tenant_shard_id.tenant_id),
timeline_id: Some(self.timeline_id),
..Default::default()
};
@@ -3929,7 +3971,7 @@ impl Timeline {
// for details. This will block until the old value is no longer in use.
//
// The GC cutoff should only ever move forwards.
let waitlist = {
{
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
ensure!(
*write_guard <= new_gc_cutoff,
@@ -3937,9 +3979,8 @@ impl Timeline {
*write_guard,
new_gc_cutoff
);
write_guard.store_and_unlock(new_gc_cutoff)
};
waitlist.wait().await;
write_guard.store_and_unlock(new_gc_cutoff).wait();
}
info!("GC starting");
@@ -4165,7 +4206,7 @@ impl Timeline {
let cache = page_cache::get();
if let Err(e) = cache
.memorize_materialized_page(
self.tenant_shard_id,
self.tenant_shard_id.tenant_id,
self.timeline_id,
key,
last_rec_lsn,
@@ -4209,7 +4250,7 @@ impl Timeline {
let task_id = task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::DownloadAllRemoteLayers,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
"download all remote layers task",
false,
@@ -4501,22 +4542,10 @@ impl<'a> TimelineWriter<'a> {
self.tl.put_value(key, lsn, value, ctx).await
}
/// Write a batch of values by delegating to the timeline's `put_values`.
///
/// `batch` maps each key to a list of `(Lsn, Value)` pairs. Any error from the
/// timeline is returned unchanged.
pub async fn put_batch(
&self,
batch: &HashMap<Key, Vec<(Lsn, Value)>>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.tl.put_values(batch, ctx).await
}
/// Delete `key_range` at `lsn` by delegating to the timeline's `put_tombstone`.
/// Any error from the timeline is returned unchanged.
pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
self.tl.put_tombstone(key_range, lsn).await
}
/// Delete a batch of `(key range, LSN)` entries by delegating to the
/// timeline's `put_tombstones`. Any error from the timeline is returned
/// unchanged.
pub async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
self.tl.put_tombstones(batch).await
}
/// Track the end of the latest digested WAL record.
/// Remember the (end of) last valid WAL record remembered in the timeline.
///
@@ -4582,7 +4611,7 @@ mod tests {
.await
.unwrap();
let rtc = timeline
let rc = timeline
.remote_client
.clone()
.expect("just configured this");
@@ -4595,12 +4624,16 @@ mod tests {
.expect("should had been resident")
.drop_eviction_guard();
let first = async { layer.evict_and_wait(&rtc).await };
let second = async { layer.evict_and_wait(&rtc).await };
let batch = [layer];
let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
let (first, second) = tokio::join!(first, second);
let res = layer.keep_resident().await;
let (first, second) = (only_one(first), only_one(second));
let res = batch[0].keep_resident().await;
assert!(matches!(res, Ok(None)), "{res:?}");
match (first, second) {
@@ -4621,6 +4654,14 @@ mod tests {
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
/// Test helper: unwrap the single result of a one-element batch.
///
/// # Panics
///
/// Panics if `input` does not contain exactly one element, or if that element
/// is `None`.
fn only_one<T>(mut input: Vec<Option<T>>) -> T {
    assert_eq!(1, input.len());
    let sole = input.pop().expect("length just checked");
    sole.expect("no cancellation")
}
async fn find_some_layer(timeline: &Timeline) -> Layer {
let layers = timeline.layers.read().await;
let desc = layers

View File

@@ -43,7 +43,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(timeline.tenant_shard_id),
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.timeline_id),
)
.await;
@@ -71,7 +71,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(
None,
Some(timeline.tenant_shard_id),
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.timeline_id),
)
.await;
@@ -528,7 +528,7 @@ impl DeleteTimelineFlow {
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
Some(tenant_shard_id.tenant_id),
Some(timeline_id),
"timeline_delete",
false,

View File

@@ -60,7 +60,7 @@ impl Timeline {
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Eviction,
Some(self.tenant_shard_id),
Some(self.tenant_shard_id.tenant_id),
Some(self.timeline_id),
&format!(
"layer eviction for {}/{}",
@@ -212,21 +212,11 @@ impl Timeline {
// Gather layers for eviction.
// NB: all the checks can be invalidated as soon as we release the layer map lock.
// We don't want to hold the layer map lock during eviction.
// So, we just need to deal with this.
let remote_client = match self.remote_client.as_ref() {
Some(c) => c,
None => {
error!("no remote storage configured, cannot evict layers");
return ControlFlow::Continue(());
}
};
let mut js = tokio::task::JoinSet::new();
{
let candidates: Vec<_> = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
let mut candidates = Vec::new();
for hist_layer in layers.iter_historic_layers() {
let hist_layer = guard.get_from_desc(&hist_layer);
@@ -272,49 +262,54 @@ impl Timeline {
continue;
}
};
let layer = guard.drop_eviction_guard();
if no_activity_for > p.threshold {
let remote_client = remote_client.clone();
// this could cause a lot of allocations in some cases
js.spawn(async move { layer.evict_and_wait(&remote_client).await });
stats.candidates += 1;
candidates.push(guard.drop_eviction_guard())
}
}
candidates
};
stats.candidates = candidates.len();
let remote_client = match self.remote_client.as_ref() {
None => {
error!(
num_candidates = candidates.len(),
"no remote storage configured, cannot evict layers"
);
return ControlFlow::Continue(());
}
Some(c) => c,
};
let join_all = async move {
while let Some(next) = js.join_next().await {
match next {
Ok(Ok(())) => stats.evicted += 1,
Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
/* already logged */
stats.errors += 1;
}
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
}
let results = match self.evict_layer_batch(remote_client, &candidates).await {
Err(pre_err) => {
stats.errors += candidates.len();
error!("could not do any evictions: {pre_err:#}");
return ControlFlow::Continue(());
}
stats
Ok(results) => results,
};
tokio::select! {
stats = join_all => {
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 {
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");
assert_eq!(results.len(), candidates.len());
for result in results {
match result {
None => {
stats.skipped_for_shutdown += 1;
}
Some(Ok(())) => {
stats.evicted += 1;
}
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
}
}
_ = cancel.cancelled() => {
// just drop the joinset to "abort"
}
}
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 {
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");
}
ControlFlow::Continue(())
}
@@ -348,7 +343,7 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) {
Ok(t) => t,
Err(_) => {
return ControlFlow::Break(());

View File

@@ -243,7 +243,7 @@ impl LayerManager {
// map index without actually rebuilding the index.
updates.remove_historic(desc);
mapping.remove(layer);
layer.delete_on_drop();
layer.garbage_collect_on_drop();
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {

View File

@@ -30,7 +30,6 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
connection_manager_loop_step, ConnectionManagerState,
};
use pageserver_api::shard::TenantShardId;
use std::future::Future;
use std::num::NonZeroU64;
use std::ops::ControlFlow;
@@ -42,7 +41,7 @@ use tokio::sync::watch;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TimelineId;
use utils::id::TenantTimelineId;
use self::connection_manager::ConnectionManagerStatus;
@@ -58,12 +57,10 @@ pub struct WalReceiverConf {
pub max_lsn_wal_lag: NonZeroU64,
pub auth_token: Option<Arc<String>>,
pub availability_zone: Option<String>,
pub ingest_batch_size: u64,
}
pub struct WalReceiver {
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
timeline: TenantTimelineId,
manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
}
@@ -74,7 +71,7 @@ impl WalReceiver {
mut broker_client: BrokerClientChannel,
ctx: &RequestContext,
) -> Self {
let tenant_shard_id = timeline.tenant_shard_id;
let tenant_id = timeline.tenant_shard_id.tenant_id;
let timeline_id = timeline.timeline_id;
let walreceiver_ctx =
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
@@ -84,9 +81,9 @@ impl WalReceiver {
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverManager,
Some(timeline.tenant_shard_id),
Some(tenant_id),
Some(timeline_id),
&format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
&format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
false,
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -120,12 +117,11 @@ impl WalReceiver {
*loop_status.write().unwrap() = None;
Ok(())
}
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
.instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
);
Self {
tenant_shard_id,
timeline_id,
timeline: TenantTimelineId::new(tenant_id, timeline_id),
manager_status,
}
}
@@ -133,8 +129,8 @@ impl WalReceiver {
pub async fn stop(self) {
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_shard_id),
Some(self.timeline_id),
Some(self.timeline.tenant_id),
Some(self.timeline.timeline_id),
)
.await;
}

View File

@@ -411,7 +411,6 @@ impl ConnectionManagerState {
let node_id = new_sk.safekeeper_id;
let connect_timeout = self.conf.wal_connect_timeout;
let ingest_batch_size = self.conf.ingest_batch_size;
let timeline = Arc::clone(&self.timeline);
let ctx = ctx.detached_child(
TaskKind::WalReceiverConnectionHandler,
@@ -431,7 +430,6 @@ impl ConnectionManagerState {
connect_timeout,
ctx,
node_id,
ingest_batch_size,
)
.await;
@@ -1347,7 +1345,6 @@ mod tests {
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
auth_token: None,
availability_zone: None,
ingest_batch_size: 1,
},
wal_connection: None,
wal_stream_candidates: HashMap::new(),

View File

@@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument};
use super::TaskStateUpdate;
use crate::{
context::RequestContext,
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST},
metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS},
task_mgr,
task_mgr::TaskKind,
task_mgr::WALRECEIVER_RUNTIME,
@@ -106,7 +106,6 @@ impl From<WalDecodeError> for WalReceiverError {
/// Open a connection to the given safekeeper and receive WAL, sending back progress
/// messages as we go.
#[allow(clippy::too_many_arguments)]
pub(super) async fn handle_walreceiver_connection(
timeline: Arc<Timeline>,
wal_source_connconf: PgConnectionConfig,
@@ -115,7 +114,6 @@ pub(super) async fn handle_walreceiver_connection(
connect_timeout: Duration,
ctx: RequestContext,
node: NodeId,
ingest_batch_size: u64,
) -> Result<(), WalReceiverError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -165,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection(
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverConnectionPoller,
Some(timeline.tenant_shard_id),
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.timeline_id),
"walreceiver connection",
false,
@@ -307,9 +305,7 @@ pub(super) async fn handle_walreceiver_connection(
{
let mut decoded = DecodedWALRecord::default();
let mut modification = timeline.begin_modification(startlsn);
let mut uncommitted_records = 0;
let mut filtered_records = 0;
let mut modification = timeline.begin_modification(endlsn);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
@@ -318,40 +314,14 @@ pub(super) async fn handle_walreceiver_connection(
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}
// Ingest the records without immediately committing them.
let ingested = walingest
walingest
.ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
.await
.with_context(|| format!("could not ingest record at {lsn}"))?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}
fail_point!("walreceiver-after-ingest");
last_rec_lsn = lsn;
// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
uncommitted_records += 1;
if uncommitted_records >= ingest_batch_size {
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);
modification.commit(&ctx).await?;
uncommitted_records = 0;
filtered_records = 0;
}
}
// Commit the remaining records.
if uncommitted_records > 0 {
WAL_INGEST
.records_committed
.inc_by(uncommitted_records - filtered_records);
modification.commit(&ctx).await?;
}
}

View File

@@ -288,9 +288,6 @@ impl VirtualFile {
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
// where our caller doesn't get to use the returned VirtualFile before its
// slot gets re-used by someone else.
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| open_options.open(path))?;
@@ -314,9 +311,6 @@ impl VirtualFile {
timeline_id,
};
// TODO: Under pressure, it's likely the slot will get re-used and
// the underlying file closed before they get around to using it.
// => https://github.com/neondatabase/neon/issues/6065
slot_guard.file.replace(file);
Ok(vfile)
@@ -427,12 +421,9 @@ impl VirtualFile {
// now locked in write-mode. Find a free slot to put it in.
let (handle, mut slot_guard) = open_files.find_victim_slot();
// Re-open the physical file.
// NB: we use StorageIoOperation::OpenAfterReplace for this to distinguish this
// case from StorageIoOperation::Open. This helps with identifying thrashing
// of the virtual file descriptor cache.
// Open the physical file
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::OpenAfterReplace)
.get(StorageIoOperation::Open)
.observe_closure_duration(|| self.open_options.open(&self.path))?;
// Perform the requested operation on it
@@ -654,7 +645,6 @@ pub fn init(num_slots: usize) {
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
panic!("virtual_file::init called twice");
}
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}
const TEST_MAX_FILE_DESCRIPTORS: usize = 10;

View File

@@ -21,7 +21,6 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoke Rust code.
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -31,7 +30,6 @@ use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
@@ -47,18 +45,19 @@ use postgres_ffi::TransactionId;
use postgres_ffi::BLCKSZ;
use utils::lsn::Lsn;
pub struct WalIngest {
shard: ShardIdentity,
pub struct WalIngest<'a> {
timeline: &'a Timeline,
checkpoint: CheckPoint,
checkpoint_modified: bool,
}
impl WalIngest {
impl<'a> WalIngest<'a> {
pub async fn new(
timeline: &Timeline,
timeline: &'a Timeline,
startpoint: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<WalIngest> {
ctx: &'_ RequestContext,
) -> anyhow::Result<WalIngest<'a>> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -66,7 +65,7 @@ impl WalIngest {
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
Ok(WalIngest {
shard: *timeline.get_shard_identity(),
timeline,
checkpoint,
checkpoint_modified: false,
})
@@ -80,8 +79,6 @@ impl WalIngest {
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
/// relations/pages that the record affects.
///
/// This function returns `true` if the record was ingested, and `false` if it was filtered out
///
pub async fn ingest_record(
&mut self,
recdata: Bytes,
@@ -89,13 +86,9 @@ impl WalIngest {
modification: &mut DatadirModification<'_>,
decoded: &mut DecodedWALRecord,
ctx: &RequestContext,
) -> anyhow::Result<bool> {
WAL_INGEST.records_received.inc();
let pg_version = modification.tline.pg_version;
let prev_len = modification.len();
modification.set_lsn(lsn)?;
decode_wal_record(recdata, decoded, pg_version)?;
) -> anyhow::Result<()> {
modification.lsn = lsn;
decode_wal_record(recdata, decoded, self.timeline.pg_version)?;
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
@@ -132,9 +125,9 @@ impl WalIngest {
}
pg_constants::RM_DBASE_ID => {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
debug!(%info, %pg_version, "handle RM_DBASE_ID");
debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");
if pg_version == 14 {
if self.timeline.pg_version == 14 {
if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
let createdb = XlCreateDatabase::decode(&mut buf);
debug!("XLOG_DBASE_CREATE v14");
@@ -150,7 +143,7 @@ impl WalIngest {
.await?;
}
}
} else if pg_version == 15 {
} else if self.timeline.pg_version == 15 {
if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -170,7 +163,7 @@ impl WalIngest {
.await?;
}
}
} else if pg_version == 16 {
} else if self.timeline.pg_version == 16 {
if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
@@ -362,33 +355,6 @@ impl WalIngest {
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
let key = rel_block_to_key(rel, blk.blkno);
let key_is_local = self.shard.is_key_local(&key);
tracing::debug!(
lsn=%lsn,
key=%key,
"ingest: shard decision {} (checkpoint={})",
if !key_is_local { "drop" } else { "keep" },
self.checkpoint_modified
);
if !key_is_local {
if self.shard.is_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
self.observe_decoded_block(modification, blk, ctx).await?;
}
continue;
}
self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
.await?;
}
@@ -401,28 +367,11 @@ impl WalIngest {
self.checkpoint_modified = false;
}
// Note that at this point this record is only cached in the modification
// until commit() is called to flush the data into the repository and update
// the latest LSN.
// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit(ctx).await?;
Ok(modification.len() > prev_len)
}
/// Do not store this block, but observe it for the purposes of updating our relation size state.
async fn observe_decoded_block(
&mut self,
modification: &mut DatadirModification<'_>,
blk: &DecodedBkpBlock,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
self.handle_rel_extend(modification, rel, blk.blkno, ctx)
.await
Ok(())
}
async fn ingest_decoded_block(
@@ -451,10 +400,8 @@ impl WalIngest {
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
&& (decoded.xl_info == pg_constants::XLOG_FPI
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)?
// do not materialize null pages because they will most likely be replaced with real data soon
&& blk.bimg_len != 0
// compression of WAL is not yet supported: fall back to storing the original WAL record
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
{
// Extract page image from FPI record
let img_len = blk.bimg_len as usize;
@@ -505,7 +452,7 @@ impl WalIngest {
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
match modification.tline.pg_version {
match self.timeline.pg_version {
14 => {
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -729,7 +676,7 @@ impl WalIngest {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -810,11 +757,10 @@ impl WalIngest {
let mut new_heap_blkno: Option<u32> = None;
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
let pg_version = modification.tline.pg_version;
assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
match pg_version {
match self.timeline.pg_version {
16 => {
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
@@ -877,7 +823,7 @@ impl WalIngest {
}
_ => bail!(
"Neon RMGR has no known compatibility with PostgreSQL version {}",
pg_version
self.timeline.pg_version
),
}
@@ -900,7 +846,7 @@ impl WalIngest {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?;
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
new_vm_blk = None;
@@ -978,14 +924,16 @@ impl WalIngest {
let src_db_id = rec.src_db_id;
let src_tablespace_id = rec.src_tablespace_id;
// Creating a database is implemented by copying the template (aka. source) database.
// To copy all the relations, we need to ask for the state as of the same LSN, but we
// cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for
// the last valid LSN to advance up to it. So we use the previous record's LSN in the
// get calls instead.
let req_lsn = modification.tline.get_last_record_lsn();
let rels = modification
.tline
.list_rels(
src_tablespace_id,
src_db_id,
Version::Modified(modification),
ctx,
)
.list_rels(src_tablespace_id, src_db_id, req_lsn, ctx)
.await?;
debug!("ingest_xlog_dbase_create: {} rels", rels.len());
@@ -993,12 +941,7 @@ impl WalIngest {
// Copy relfilemap
let filemap = modification
.tline
.get_relmap_file(
src_tablespace_id,
src_db_id,
Version::Modified(modification),
ctx,
)
.get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx)
.await?;
modification
.put_relmap_file(tablespace_id, db_id, filemap, ctx)
@@ -1012,7 +955,7 @@ impl WalIngest {
let nblocks = modification
.tline
.get_rel_size(src_rel, Version::Modified(modification), true, ctx)
.get_rel_size(src_rel, req_lsn, true, ctx)
.await?;
let dst_rel = RelTag {
spcnode: tablespace_id,
@@ -1030,13 +973,7 @@ impl WalIngest {
let content = modification
.tline
.get_rel_page_at_lsn(
src_rel,
blknum,
Version::Modified(modification),
true,
ctx,
)
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx)
.await?;
modification.put_rel_page_image(dst_rel, blknum, content)?;
num_blocks_copied += 1;
@@ -1107,7 +1044,7 @@ impl WalIngest {
modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?;
fsm_physical_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
@@ -1129,7 +1066,7 @@ impl WalIngest {
modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?;
vm_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?;
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
@@ -1202,9 +1139,10 @@ impl WalIngest {
dbnode: xnode.dbnode,
relnode: xnode.relnode,
};
let last_lsn = self.timeline.get_last_record_lsn();
if modification
.tline
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.get_rel_exists(rel, last_lsn, true, ctx)
.await?
{
self.put_rel_drop(modification, rel, ctx).await?;
@@ -1258,9 +1196,10 @@ impl WalIngest {
// will block waiting for the last valid LSN to advance up to
// it. So we use the previous record's LSN in the get calls
// instead.
let req_lsn = modification.tline.get_last_record_lsn();
for segno in modification
.tline
.list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx)
.list_slru_segments(SlruKind::Clog, req_lsn, ctx)
.await?
{
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1472,6 +1411,20 @@ impl WalIngest {
Ok(())
}
/// Return the size of `rel` in blocks as of `lsn`, reporting zero blocks for
/// a relation that does not exist at that LSN.
async fn get_relsize(
    &mut self,
    rel: RelTag,
    lsn: Lsn,
    ctx: &RequestContext,
) -> anyhow::Result<BlockNumber> {
    let exists = self.timeline.get_rel_exists(rel, lsn, true, ctx).await?;
    if !exists {
        // Nothing to measure: report an empty relation.
        return Ok(0);
    }
    let nblocks = self.timeline.get_rel_size(rel, lsn, true, ctx).await?;
    Ok(nblocks)
}
async fn handle_rel_extend(
&mut self,
modification: &mut DatadirModification<'_>,
@@ -1483,6 +1436,7 @@ impl WalIngest {
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: it would be nice to be more explicit about it
let last_lsn = modification.lsn;
// Get current size and put rel creation if rel doesn't exist
//
@@ -1490,14 +1444,11 @@ impl WalIngest {
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
let old_nblocks = if let Some(nblocks) = modification
.tline
.get_cached_rel_size(&rel, modification.get_lsn())
{
let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
nblocks
} else if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
} else if !self
.timeline
.get_rel_exists(rel, last_lsn, true, ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
@@ -1507,25 +1458,15 @@ impl WalIngest {
.context("Relation Error")?;
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
};
if new_nblocks > old_nblocks {
//info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
modification.put_rel_extend(rel, new_nblocks, ctx).await?;
let mut key = rel_block_to_key(rel, blknum);
// fill the gap with zeros
for gap_blknum in old_nblocks..blknum {
key.field6 = gap_blknum;
if self.shard.get_shard_number(&key) != self.shard.number {
continue;
}
modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
}
}
@@ -1563,9 +1504,10 @@ impl WalIngest {
// Check if the relation exists. We implicitly create relations on first
// record.
// TODO: would be nice if to be more explicit about it
let old_nblocks = if !modification
.tline
.get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx)
let last_lsn = self.timeline.get_last_record_lsn();
let old_nblocks = if !self
.timeline
.get_slru_segment_exists(kind, segno, last_lsn, ctx)
.await?
{
// create it with 0 size initially, the logic below will extend it
@@ -1574,9 +1516,8 @@ impl WalIngest {
.await?;
0
} else {
modification
.tline
.get_slru_segment_size(kind, segno, Version::Modified(modification), ctx)
self.timeline
.get_slru_segment_size(kind, segno, last_lsn, ctx)
.await?
};
@@ -1599,26 +1540,6 @@ impl WalIngest {
}
}
async fn get_relsize(
modification: &DatadirModification<'_>,
rel: RelTag,
ctx: &RequestContext,
) -> anyhow::Result<BlockNumber> {
let nblocks = if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), true, ctx)
.await?
{
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), true, ctx)
.await?
};
Ok(nblocks)
}
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
@@ -1644,7 +1565,10 @@ mod tests {
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result<WalIngest> {
async fn init_walingest_test<'a>(
tline: &'a Timeline,
ctx: &RequestContext,
) -> Result<WalIngest<'a>> {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
@@ -1689,29 +1613,29 @@ mod tests {
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
1
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.await?,
3
);
@@ -1719,46 +1643,46 @@ mod tests {
// Check page contents at each LSN
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 2")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG("foo blk 2 at 5")
);
@@ -1774,19 +1698,19 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx)
.await?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx)
.await?,
TEST_IMG("foo blk 1 at 4")
);
@@ -1794,13 +1718,13 @@ mod tests {
// should still see the truncated block with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.await?,
3
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG("foo blk 2 at 5")
);
@@ -1813,7 +1737,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
.await?,
0
);
@@ -1826,19 +1750,19 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
.await?,
2
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx)
.await?,
ZERO_PAGE
);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx)
.await?,
TEST_IMG("foo blk 1")
);
@@ -1851,21 +1775,21 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
.await?,
1501
);
for blk in 2..1500 {
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx)
.await?,
ZERO_PAGE
);
}
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx)
.await?,
TEST_IMG("foo blk 1500")
);
@@ -1892,13 +1816,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
1
);
@@ -1911,7 +1835,7 @@ mod tests {
// Check that rel is not visible anymore
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx)
.await?,
false
);
@@ -1929,13 +1853,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx)
.await?,
1
);
@@ -1968,24 +1892,24 @@ mod tests {
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx)
.await?,
false
);
assert!(tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx)
.await
.is_err());
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx)
.await?,
relsize
);
@@ -1996,7 +1920,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2013,7 +1937,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx)
.await?,
1
);
@@ -2023,7 +1947,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2032,7 +1956,7 @@ mod tests {
// should still see all blocks with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx)
.await?,
relsize
);
@@ -2041,7 +1965,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2061,13 +1985,13 @@ mod tests {
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
.await?,
relsize
);
@@ -2077,7 +2001,7 @@ mod tests {
let data = format!("foo blk {} at {}", blkno, lsn);
assert_eq!(
tline
.get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx)
.await?,
TEST_IMG(&data)
);
@@ -2110,9 +2034,7 @@ mod tests {
assert_current_logical_size(&tline, Lsn(lsn));
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE + 1
);
@@ -2124,9 +2046,7 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE
);
assert_current_logical_size(&tline, Lsn(lsn));
@@ -2139,9 +2059,7 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
RELSEG_SIZE - 1
);
assert_current_logical_size(&tline, Lsn(lsn));
@@ -2157,9 +2075,7 @@ mod tests {
.await?;
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx)
.await?,
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
size as BlockNumber
);
@@ -2195,7 +2111,7 @@ mod tests {
let path = "test_data/sk_wal_segment_from_pgbench";
let wal_segment_path = format!("{path}/000000010000000000000001.zst");
let startpoint = Lsn::from_hex("14AEC08").unwrap();
let _endpoint = Lsn::from_hex("1FFFF98").unwrap();
let endpoint = Lsn::from_hex("1FFFF98").unwrap();
// Bootstrap a real timeline. We can't use create_test_timeline because
// it doesn't create a real checkpoint, and Walingest::new tries to parse
@@ -2234,7 +2150,7 @@ mod tests {
let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
.await
.unwrap();
let mut modification = tline.begin_modification(startpoint);
let mut modification = tline.begin_modification(endpoint);
let mut decoded = DecodedWALRecord::default();
println!("decoding {} bytes", bytes.len() - xlogoff);
@@ -2248,7 +2164,6 @@ mod tests {
.await
.unwrap();
}
modification.commit(&ctx).await.unwrap();
}
let duration = started_at.elapsed();

View File

@@ -41,17 +41,6 @@ libwalproposer.a: $(WALPROP_OBJS)
rm -f $@
$(AR) $(AROPT) $@ $^
# needs vars:
# FIND_TYPEDEF pointing to find_typedef
# INDENT pointing to pg_bsd_indent
# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name:
# pgindent will pick it up as pg_bsd_indent path).
.PHONY: pgindent
pgindent:
+@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir)
$(FIND_TYPEDEF) . > neon.typedefs
INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

View File

@@ -41,7 +41,7 @@ static char *ConsoleURL = NULL;
static bool ForwardDDL = true;
/* Curl structures for sending the HTTP requests */
static CURL *CurlHandle;
static CURL * CurlHandle;
static struct curl_slist *ContentHeader = NULL;
/*
@@ -54,7 +54,7 @@ typedef enum
{
Op_Set, /* An upsert: Either a creation or an alter */
Op_Delete,
} OpType;
} OpType;
typedef struct
{
@@ -62,7 +62,7 @@ typedef struct
Oid owner;
char old_name[NAMEDATALEN];
OpType type;
} DbEntry;
} DbEntry;
typedef struct
{
@@ -70,7 +70,7 @@ typedef struct
char old_name[NAMEDATALEN];
const char *password;
OpType type;
} RoleEntry;
} RoleEntry;
/*
* We keep one of these for each subtransaction in a stack. When a subtransaction
@@ -82,10 +82,10 @@ typedef struct DdlHashTable
struct DdlHashTable *prev_table;
HTAB *db_table;
HTAB *role_table;
} DdlHashTable;
} DdlHashTable;
static DdlHashTable RootTable;
static DdlHashTable *CurrentDdlTable = &RootTable;
static DdlHashTable * CurrentDdlTable = &RootTable;
static void
PushKeyValue(JsonbParseState **state, char *key, char *value)
@@ -199,7 +199,7 @@ typedef struct
{
char str[ERROR_SIZE];
size_t size;
} ErrorString;
} ErrorString;
static size_t
ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
@@ -478,7 +478,7 @@ NeonXactCallback(XactEvent event, void *arg)
static bool
RoleIsNeonSuperuser(const char *role_name)
{
return strcmp(role_name, "neon_superuser") == 0;
return strcmp(role_name, "neon_superuser") == 0;
}
static void
@@ -509,7 +509,6 @@ HandleCreateDb(CreatedbStmt *stmt)
if (downer && downer->arg)
{
const char *owner_name = defGetString(downer);
if (RoleIsNeonSuperuser(owner_name))
elog(ERROR, "can't create a database with owner neon_superuser");
entry->owner = get_role_oid(owner_name, false);
@@ -537,7 +536,6 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
if (!found)
memset(entry->old_name, 0, sizeof(entry->old_name));
const char *new_owner = get_rolespec_name(stmt->newowner);
if (RoleIsNeonSuperuser(new_owner))
elog(ERROR, "can't alter owner to neon_superuser");
entry->owner = get_role_oid(new_owner, false);
@@ -635,7 +633,6 @@ HandleAlterRole(AlterRoleStmt *stmt)
DefElem *dpass = NULL;
ListCell *option;
const char *role_name = stmt->role->rolename;
if (RoleIsNeonSuperuser(role_name))
elog(ERROR, "can't ALTER neon_superuser");

View File

@@ -25,81 +25,79 @@
#include <curl/curl.h>
static int extension_server_port = 0;
static int extension_server_port = 0;
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
/*
* to download all SQL (and data) files for an extension:
* curl -X POST http://localhost:8080/extension_server/postgis
* it covers two possible extension files layouts:
* 1. extension_name--version--platform.sql
* 2. extension_name/extension_name--version.sql
* extension_name/extra_files.csv
* to download specific library file:
* curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
*/
// to download all SQL (and data) files for an extension:
// curl -X POST http://localhost:8080/extension_server/postgis
// it covers two possible extension files layouts:
// 1. extension_name--version--platform.sql
// 2. extension_name/extension_name--version.sql
// extension_name/extra_files.csv
//
// to download specific library file:
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
static bool
neon_download_extension_file_http(const char *filename, bool is_library)
{
CURL *curl;
CURLcode res;
char *compute_ctl_url;
char *postdata;
bool ret = false;
CURL *curl;
CURLcode res;
char *compute_ctl_url;
char *postdata;
bool ret = false;
if ((curl = curl_easy_init()) == NULL)
{
elog(ERROR, "Failed to initialize curl handle");
}
if ((curl = curl_easy_init()) == NULL)
{
elog(ERROR, "Failed to initialize curl handle");
}
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
extension_server_port, filename, is_library ? "?is_library=true" : "");
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
extension_server_port, filename, is_library ? "?is_library=true" : "");
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
if (curl)
{
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if (res == CURLE_OK)
{
ret = true;
}
else
{
/* Don't error here because postgres will try to find the file */
/* and will fail with some proper error message if it's not found. */
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
}
if (curl)
{
/* Perform the request, res will get the return code */
res = curl_easy_perform(curl);
/* Check for errors */
if (res == CURLE_OK)
{
ret = true;
}
else
{
// Don't error here because postgres will try to find the file
// and will fail with some proper error message if it's not found.
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
}
/* always cleanup */
curl_easy_cleanup(curl);
}
/* always cleanup */
curl_easy_cleanup(curl);
}
return ret;
return ret;
}
void
pg_init_extension_server()
void pg_init_extension_server()
{
/* Port to connect to compute_ctl on localhost */
/* to request extension files. */
DefineCustomIntVariable("neon.extension_server_port",
"connection string to the compute_ctl",
NULL,
&extension_server_port,
0, 0, INT_MAX,
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
// Port to connect to compute_ctl on localhost
// to request extension files.
DefineCustomIntVariable("neon.extension_server_port",
"connection string to the compute_ctl",
NULL,
&extension_server_port,
0, 0, INT_MAX,
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
/* set download_extension_file_hook */
prev_download_extension_file_hook = download_extension_file_hook;
download_extension_file_hook = neon_download_extension_file_http;
// set download_extension_file_hook
prev_download_extension_file_hook = download_extension_file_hook;
download_extension_file_hook = neon_download_extension_file_http;
}

View File

@@ -67,34 +67,32 @@
typedef struct FileCacheEntry
{
BufferTag key;
uint32 hash;
uint32 hash;
uint32 offset;
uint32 access_count;
uint32 bitmap[BLOCKS_PER_CHUNK / 32];
dlist_node lru_node; /* LRU list node */
uint32 bitmap[BLOCKS_PER_CHUNK/32];
dlist_node lru_node; /* LRU list node */
} FileCacheEntry;
typedef struct FileCacheControl
{
uint64 generation; /* generation is needed to handle correct hash
* reenabling */
uint32 size; /* size of cache file in chunks */
uint32 used; /* number of used chunks */
uint32 limit; /* shared copy of lfc_size_limit */
uint64 hits;
uint64 misses;
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
uint64 generation; /* generation is needed to handle correct hash reenabling */
uint32 size; /* size of cache file in chunks */
uint32 used; /* number of used chunks */
uint32 limit; /* shared copy of lfc_size_limit */
uint64 hits;
uint64 misses;
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement algorithm */
} FileCacheControl;
static HTAB *lfc_hash;
static int lfc_desc = 0;
static HTAB* lfc_hash;
static int lfc_desc = 0;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
static char *lfc_path;
static FileCacheControl *lfc_ctl;
static int lfc_max_size;
static int lfc_size_limit;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
@@ -102,7 +100,7 @@ static shmem_request_hook_type prev_shmem_request_hook;
#define LFC_ENABLED() (lfc_ctl->limit != 0)
void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
/*
* Local file cache is optional and Neon can work without it.
@@ -111,10 +109,9 @@ void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
* All cache content should be invalidated to avoid reading of stale or corrupted data
*/
static void
lfc_disable(char const *op)
lfc_disable(char const* op)
{
int fd;
int fd;
elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
/* Invalidate hash */
@@ -123,7 +120,7 @@ lfc_disable(char const *op)
if (LFC_ENABLED())
{
HASH_SEQ_STATUS status;
FileCacheEntry *entry;
FileCacheEntry* entry;
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
@@ -138,24 +135,16 @@ lfc_disable(char const *op)
if (lfc_desc > 0)
{
/*
* If the reason of error is ENOSPC, then truncation of file may
* help to reclaim some space
*/
int rc = ftruncate(lfc_desc, 0);
/* If the reason of error is ENOSPC, then truncation of file may help to reclaim some space */
int rc = ftruncate(lfc_desc, 0);
if (rc < 0)
elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
}
}
/*
* We need to use unlink to to avoid races in LFC write, because it is not
* protectedby
*/
/* We need to use unlink to to avoid races in LFC write, because it is not protectedby */
unlink(lfc_path);
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
if (fd < 0)
elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path);
else
@@ -181,15 +170,13 @@ lfc_maybe_disabled(void)
static bool
lfc_ensure_opened(void)
{
bool enabled = !lfc_maybe_disabled();
bool enabled = !lfc_maybe_disabled();
/* Open cache file if not done yet */
if (lfc_desc <= 0 && enabled)
{
lfc_desc = BasicOpenFile(lfc_path, O_RDWR);
if (lfc_desc < 0)
{
if (lfc_desc < 0) {
lfc_disable("open");
return false;
}
@@ -200,7 +187,7 @@ lfc_ensure_opened(void)
static void
lfc_shmem_startup(void)
{
bool found;
bool found;
static HASHCTL info;
if (prev_shmem_startup_hook)
@@ -210,22 +197,17 @@ lfc_shmem_startup(void)
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
if (!found)
{
int fd;
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
int fd;
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
info.keysize = sizeof(BufferTag);
info.entrysize = sizeof(FileCacheEntry);
/*
* lfc_size+1 because we add new element to hash table before eviction
* of victim
*/
lfc_hash = ShmemInitHash("lfc_hash",
lfc_size + 1, lfc_size + 1,
/* lfc_size+1 because we add new element to hash table before eviction of victim */
lfc_size+1, lfc_size+1,
&info,
HASH_ELEM | HASH_BLOBS);
lfc_ctl->generation = 0;
@@ -237,7 +219,7 @@ lfc_shmem_startup(void)
dlist_init(&lfc_ctl->lru);
/* Recreate file cache on restart */
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
if (fd < 0)
{
elog(WARNING, "Failed to create local file cache %s: %m", lfc_path);
@@ -260,7 +242,7 @@ lfc_shmem_request(void)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry)));
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
RequestNamedLWLockTranche("lfc_lock", 1);
}
@@ -268,11 +250,9 @@ static bool
is_normal_backend(void)
{
/*
* Stats collector detach shared memory, so we should not try to access
* shared memory here. Parallel workers first assign default value (0), so
* not perform truncation in parallel workers. The Postmaster can handle
* SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL),
* but has no PGPROC.
* Stats collector detach shared memory, so we should not try to access shared memory here.
* Parallel workers first assign default value (0), so not perform truncation in parallel workers.
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
*/
return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
}
@@ -291,7 +271,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
static void
lfc_change_limit_hook(int newval, void *extra)
{
uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
if (!is_normal_backend())
return;
@@ -303,15 +283,11 @@ lfc_change_limit_hook(int newval, void *extra)
while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
{
/*
* Shrink cache by throwing away least recently accessed chunks and
* returning their space to file system
*/
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
elog(LOG, "Failed to punch hole in file: %m");
#endif
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -338,7 +314,7 @@ lfc_init(void)
"Maximal size of Neon local file cache",
NULL,
&lfc_max_size,
0, /* disabled by default */
0, /* disabled by default */
0,
INT_MAX,
PGC_POSTMASTER,
@@ -351,7 +327,7 @@ lfc_init(void)
"Current limit for size of Neon local file cache",
NULL,
&lfc_size_limit,
0, /* disabled by default */
0, /* disabled by default */
0,
INT_MAX,
PGC_SIGHUP,
@@ -391,18 +367,18 @@ lfc_init(void)
bool
lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
{
BufferTag tag;
FileCacheEntry *entry;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
bool found = false;
uint32 hash;
BufferTag tag;
FileCacheEntry* entry;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
bool found = false;
uint32 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
CopyNRelFileInfoToBufTag(tag, rinfo);
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_SHARED);
@@ -421,13 +397,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
void
lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
{
BufferTag tag;
FileCacheEntry *entry;
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
uint32 hash;
BufferTag tag;
FileCacheEntry* entry;
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
uint32 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -462,10 +438,9 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
*/
if (entry->bitmap[chunk_offs >> 5] == 0)
{
bool has_remaining_pages;
bool has_remaining_pages;
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
{
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
if (entry->bitmap[i] != 0)
{
has_remaining_pages = true;
@@ -474,8 +449,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
}
/*
* Put the entry at the position that is first to be reclaimed when we
* have no cached pages remaining in the chunk
* Put the entry at the position that is first to be reclaimed when
* we have no cached pages remaining in the chunk
*/
if (!has_remaining_pages)
{
@@ -501,16 +476,16 @@ bool
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
char *buffer)
{
BufferTag tag;
FileCacheEntry *entry;
ssize_t rc;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
bool result = true;
uint32 hash;
uint64 generation;
uint32 entry_offset;
BufferTag tag;
FileCacheEntry* entry;
ssize_t rc;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
bool result = true;
uint32 hash;
uint64 generation;
uint32 entry_offset;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
if (!lfc_ensure_opened())
@@ -518,7 +493,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
CopyNRelFileInfoToBufTag(tag, rinfo);
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
hash = get_hash_value(lfc_hash, &tag);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -545,7 +520,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
LWLockRelease(lfc_lock);
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
lfc_disable("read");
@@ -576,29 +551,30 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* If cache is full then evict some other page.
*/
void
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
#if PG_MAJORVERSION_NUM < 16
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
char *buffer)
#else
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
const void *buffer)
#endif
{
BufferTag tag;
FileCacheEntry *entry;
ssize_t rc;
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
uint32 hash;
uint64 generation;
uint32 entry_offset;
BufferTag tag;
FileCacheEntry* entry;
ssize_t rc;
bool found;
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
uint32 hash;
uint64 generation;
uint32 entry_offset;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
if (!lfc_ensure_opened())
return;
tag.forkNum = forkNum;
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
CopyNRelFileInfoToBufTag(tag, rinfo);
hash = get_hash_value(lfc_hash, &tag);
@@ -614,36 +590,24 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
if (found)
{
/*
* Unlink entry from LRU list to pin it for the duration of IO
* operation
*/
/* Unlink entry from LRU list to pin it for the duration of IO operation */
if (entry->access_count++ == 0)
dlist_delete(&entry->lru_node);
}
else
{
/*
* We have two choices if all cache pages are pinned (i.e. used in IO
* operations):
*
* 1) Wait until some of this operation is completed and pages is
* unpinned.
*
* 2) Allocate one more chunk, so that specified cache size is more
* recommendation than hard limit.
*
* As far as probability of such event (that all pages are pinned) is
* considered to be very very small: there are should be very large
* number of concurrent IO operations and them are limited by
* max_connections, we prefer not to complicate code and use second
* approach.
* We have two choices if all cache pages are pinned (i.e. used in IO operations):
* 1. Wait until some of this operation is completed and pages is unpinned
* 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit.
* As far as probability of such event (that all pages are pinned) is considered to be very very small:
* there are should be very large number of concurrent IO operations and them are limited by max_connections,
* we prefer not to complicate code and use second approach.
*/
if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
{
/* Cache overflow: evict least recently used chunk */
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
Assert(victim->access_count == 0);
entry->offset = victim->offset; /* grab victim's chunk */
hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -652,8 +616,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
else
{
lfc_ctl->used += 1;
entry->offset = lfc_ctl->size++; /* allocate new chunk at end
* of file */
entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
}
entry->access_count = 1;
entry->hash = hash;
@@ -665,7 +628,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void
lfc_ctl->writes += 1;
LWLockRelease(lfc_lock);
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
lfc_disable("write");
@@ -702,13 +665,13 @@ Datum
neon_get_lfc_stats(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
NeonGetStatsCtx *fctx;
NeonGetStatsCtx* fctx;
MemoryContext oldcontext;
TupleDesc tupledesc;
Datum result;
HeapTuple tuple;
char const *key;
uint64 value;
char const* key;
uint64 value;
Datum values[NUM_NEON_GET_STATS_COLS];
bool nulls[NUM_NEON_GET_STATS_COLS];
@@ -720,7 +683,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx));
fctx = (NeonGetStatsCtx*) palloc(sizeof(NeonGetStatsCtx));
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);
@@ -741,7 +704,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
funcctx = SRF_PERCALL_SETUP();
/* Get the saved state */
fctx = (NeonGetStatsCtx *) funcctx->user_fctx;
fctx = (NeonGetStatsCtx*) funcctx->user_fctx;
switch (funcctx->call_cntr)
{
@@ -829,9 +792,9 @@ local_cache_pages(PG_FUNCTION_ARGS)
if (SRF_IS_FIRSTCALL())
{
HASH_SEQ_STATUS status;
FileCacheEntry *entry;
uint32 n_pages = 0;
HASH_SEQ_STATUS status;
FileCacheEntry* entry;
uint32 n_pages = 0;
funcctx = SRF_FIRSTCALL_INIT();
@@ -888,7 +851,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
for (int i = 0; i < BLOCKS_PER_CHUNK/32; i++)
n_pages += pg_popcount32(entry->bitmap[i]);
}
}
@@ -907,11 +870,10 @@ local_cache_pages(PG_FUNCTION_ARGS)
if (n_pages != 0)
{
/*
* Scan through all the cache entries, saving the relevant fields
* in the fctx->record structure.
* Scan through all the cache entries, saving the relevant fields in the
* fctx->record structure.
*/
uint32 n = 0;
uint32 n = 0;
hash_seq_init(&status, lfc_hash);
while ((entry = hash_seq_search(&status)) != NULL)
{
@@ -919,7 +881,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
{
if (entry->bitmap[i >> 5] & (1 << (i & 31)))
{
fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
fctx->record[n].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));

View File

@@ -69,9 +69,9 @@ int max_reconnect_attempts = 60;
typedef struct
{
LWLockId lock;
pg_atomic_uint64 update_counter;
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
LWLockId lock;
pg_atomic_uint64 update_counter;
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
} PagestoreShmemState;
#if PG_VERSION_NUM >= 150000
@@ -83,7 +83,7 @@ static PagestoreShmemState *pagestore_shared;
static uint64 pagestore_local_counter = 0;
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
static bool pageserver_flush(void);
static void pageserver_disconnect(void);
@@ -91,43 +91,43 @@ static void pageserver_disconnect(void);
static bool
PagestoreShmemIsValid()
{
return pagestore_shared && UsedShmemSegAddr;
return pagestore_shared && UsedShmemSegAddr;
}
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
}
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
if (!PagestoreShmemIsValid())
return;
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
LWLockRelease(pagestore_shared->lock);
if(!PagestoreShmemIsValid())
return;
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
LWLockRelease(pagestore_shared->lock);
}
static bool
CheckConnstringUpdated()
{
if (!PagestoreShmemIsValid())
return false;
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
if(!PagestoreShmemIsValid())
return false;
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
}
static void
ReloadConnstring()
{
if (!PagestoreShmemIsValid())
return;
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
LWLockRelease(pagestore_shared->lock);
if(!PagestoreShmemIsValid())
return;
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
LWLockRelease(pagestore_shared->lock);
}
static bool
@@ -141,20 +141,21 @@ pageserver_connect(int elevel)
Assert(!connected);
if (CheckConnstringUpdated())
{
ReloadConnstring();
}
if(CheckConnstringUpdated())
{
ReloadConnstring();
}
/*
* Connect using the connection string we got from the
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
* variable was set, use that as the password.
*
* The connection options are parsed in the order they're given, so when
* we set the password before the connection string, the connection string
* can override the password from the env variable. Seems useful, although
* we don't currently use that capability anywhere.
* The connection options are parsed in the order they're given, so
* when we set the password before the connection string, the
* connection string can override the password from the env variable.
* Seems useful, although we don't currently use that capability
* anywhere.
*/
n = 0;
if (neon_auth_token)
@@ -197,9 +198,9 @@ pageserver_connect(int elevel)
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
MyLatch, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
NULL, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
while (PQisBusy(pageserver_conn))
@@ -264,7 +265,6 @@ retry:
if (!PQconsumeInput(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
neon_log(LOG, "could not get response from pageserver: %s", msg);
pfree(msg);
return -1;
@@ -305,15 +305,15 @@ pageserver_disconnect(void)
}
static bool
pageserver_send(NeonRequest *request)
pageserver_send(NeonRequest * request)
{
StringInfoData req_buff;
if (CheckConnstringUpdated())
{
pageserver_disconnect();
ReloadConnstring();
}
if(CheckConnstringUpdated())
{
pageserver_disconnect();
ReloadConnstring();
}
/* If the connection was lost for some reason, reconnect */
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
@@ -326,12 +326,10 @@ pageserver_send(NeonRequest *request)
/*
* If pageserver is stopped, the connections from compute node are broken.
* The compute node doesn't notice that immediately, but it will cause the
* next request to fail, usually on the next query. That causes
* user-visible errors if pageserver is restarted, or the tenant is moved
* from one pageserver to another. See
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
* connection in case of failure.
* The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
* That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
* See https://github.com/neondatabase/neon/issues/1138
* So try to reestablish connection in case of failure.
*/
if (!connected)
{
@@ -355,7 +353,6 @@ pageserver_send(NeonRequest *request)
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
pfree(msg);
@@ -413,8 +410,7 @@ pageserver_receive(void)
}
else if (rc == -2)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
char* msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
}
@@ -448,7 +444,6 @@ pageserver_flush(void)
if (PQflush(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
pfree(msg);
@@ -476,47 +471,46 @@ check_neon_id(char **newval, void **extra, GucSource source)
static Size
PagestoreShmemSize(void)
{
return sizeof(PagestoreShmemState);
return sizeof(PagestoreShmemState);
}
static bool
PagestoreShmemInit(void)
{
bool found;
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pagestore_shared = ShmemInitStruct("libpagestore shared state",
PagestoreShmemSize(),
&found);
if (!found)
{
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
AssignPageserverConnstring(page_server_connstring, NULL);
}
LWLockRelease(AddinShmemInitLock);
return found;
bool found;
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
pagestore_shared = ShmemInitStruct("libpagestore shared state",
PagestoreShmemSize(),
&found);
if(!found)
{
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
AssignPageserverConnstring(page_server_connstring, NULL);
}
LWLockRelease(AddinShmemInitLock);
return found;
}
static void
pagestore_shmem_startup_hook(void)
{
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
if(prev_shmem_startup_hook)
prev_shmem_startup_hook();
PagestoreShmemInit();
PagestoreShmemInit();
}
static void
pagestore_shmem_request(void)
{
#if PG_VERSION_NUM >= 150000
if (prev_shmem_request_hook)
prev_shmem_request_hook();
if(prev_shmem_request_hook)
prev_shmem_request_hook();
#endif
RequestAddinShmemSpace(PagestoreShmemSize());
RequestNamedLWLockTranche("neon_libpagestore", 1);
RequestAddinShmemSpace(PagestoreShmemSize());
RequestNamedLWLockTranche("neon_libpagestore", 1);
}
static void
@@ -526,7 +520,7 @@ pagestore_prepare_shmem(void)
prev_shmem_request_hook = shmem_request_hook;
shmem_request_hook = pagestore_shmem_request;
#else
pagestore_shmem_request();
pagestore_shmem_request();
#endif
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = pagestore_shmem_startup_hook;
@@ -538,7 +532,7 @@ pagestore_prepare_shmem(void)
void
pg_init_libpagestore(void)
{
pagestore_prepare_shmem();
pagestore_prepare_shmem();
DefineCustomStringVariable("neon.pageserver_connstring",
"connection string to the page server",
@@ -613,10 +607,7 @@ pg_init_libpagestore(void)
neon_log(PageStoreTrace, "libpagestore already loaded");
page_server = &api;
/*
* Retrieve the auth token to use when connecting to pageserver and
* safekeepers
*/
/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
neon_auth_token = getenv("NEON_AUTH_TOKEN");
if (neon_auth_token)
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");

View File

@@ -48,11 +48,9 @@ _PG_init(void)
pg_init_extension_server();
/*
* Important: This must happen after other parts of the extension are
* loaded, otherwise any settings to GUCs that were set before the
* extension was loaded will be removed.
*/
// Important: This must happen after other parts of the extension
// are loaded, otherwise any settings to GUCs that were set before
// the extension was loaded will be removed.
EmitWarningsOnPlaceholders("neon");
}

View File

@@ -32,7 +32,7 @@ extern void pg_init_extension_server(void);
* block_id; false otherwise.
*/
extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
extern uint64 BackpressureThrottlingTime(void);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

View File

@@ -59,7 +59,7 @@
#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
#else /* major version >= 16 */
#else /* major version >= 16 */
#define USE_RELFILELOCATOR
@@ -109,4 +109,4 @@
#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
#endif
#endif /* NEON_PGVERSIONCOMPAT_H */
#endif //NEON_PGVERSIONCOMPAT_H

View File

@@ -40,13 +40,13 @@ typedef enum
T_NeonGetPageResponse,
T_NeonErrorResponse,
T_NeonDbSizeResponse,
} NeonMessageTag;
} NeonMessageTag;
/* base struct for c-style inheritance */
typedef struct
{
NeonMessageTag tag;
} NeonMessage;
} NeonMessage;
#define messageTag(m) (((const NeonMessage *)(m))->tag)
@@ -67,27 +67,27 @@ typedef struct
NeonMessageTag tag;
bool latest; /* if true, request latest page version */
XLogRecPtr lsn; /* request page version @ this LSN */
} NeonRequest;
} NeonRequest;
typedef struct
{
NeonRequest req;
NRelFileInfo rinfo;
ForkNumber forknum;
} NeonExistsRequest;
} NeonExistsRequest;
typedef struct
{
NeonRequest req;
NRelFileInfo rinfo;
ForkNumber forknum;
} NeonNblocksRequest;
} NeonNblocksRequest;
typedef struct
{
NeonRequest req;
Oid dbNode;
} NeonDbSizeRequest;
} NeonDbSizeRequest;
typedef struct
{
@@ -95,31 +95,31 @@ typedef struct
NRelFileInfo rinfo;
ForkNumber forknum;
BlockNumber blkno;
} NeonGetPageRequest;
} NeonGetPageRequest;
/* supertype of all the Neon*Response structs below */
typedef struct
{
NeonMessageTag tag;
} NeonResponse;
} NeonResponse;
typedef struct
{
NeonMessageTag tag;
bool exists;
} NeonExistsResponse;
} NeonExistsResponse;
typedef struct
{
NeonMessageTag tag;
uint32 n_blocks;
} NeonNblocksResponse;
} NeonNblocksResponse;
typedef struct
{
NeonMessageTag tag;
char page[FLEXIBLE_ARRAY_MEMBER];
} NeonGetPageResponse;
} NeonGetPageResponse;
#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))
@@ -127,18 +127,18 @@ typedef struct
{
NeonMessageTag tag;
int64 db_size;
} NeonDbSizeResponse;
} NeonDbSizeResponse;
typedef struct
{
NeonMessageTag tag;
char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
* message */
} NeonErrorResponse;
} NeonErrorResponse;
extern StringInfoData nm_pack_request(NeonRequest *msg);
extern NeonResponse *nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage *msg);
extern StringInfoData nm_pack_request(NeonRequest * msg);
extern NeonResponse * nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage * msg);
/*
* API
@@ -146,20 +146,20 @@ extern char *nm_to_string(NeonMessage *msg);
typedef struct
{
bool (*send) (NeonRequest *request);
bool (*send) (NeonRequest * request);
NeonResponse *(*receive) (void);
bool (*flush) (void);
} page_server_api;
} page_server_api;
extern void prefetch_on_ps_disconnect(void);
extern page_server_api *page_server;
extern page_server_api * page_server;
extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern bool seqscan_prefetch_enabled;
extern int seqscan_prefetch_distance;
extern int seqscan_prefetch_distance;
extern char *neon_timeline;
extern char *neon_tenant;
extern bool wal_redo;
@@ -194,14 +194,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, char *buffer);
XLogRecPtr request_lsn, bool request_latest, char *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
#else
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, void *buffer);
XLogRecPtr request_lsn, bool request_latest, void *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
#endif

View File

@@ -101,21 +101,21 @@ typedef enum
UNLOGGED_BUILD_PHASE_1,
UNLOGGED_BUILD_PHASE_2,
UNLOGGED_BUILD_NOT_PERMANENT
} UnloggedBuildPhase;
} UnloggedBuildPhase;
static SMgrRelation unlogged_build_rel = NULL;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
/*
* Prefetch implementation:
*
*
* Prefetch is performed locally by each backend.
*
* There can be up to readahead_buffer_size active IO requests registered at
* any time. Requests using smgr_prefetch are sent to the pageserver, but we
* don't wait on the response. Requests using smgr_read are either read from
* the buffer, or (if that's not possible) we wait on the response to arrive -
* this also will allow us to receive other prefetched pages.
* this also will allow us to receive other prefetched pages.
* Each request is immediately written to the output buffer of the pageserver
* connection, but may not be flushed if smgr_prefetch is used: pageserver
* flushes sent requests on manual flush, or every neon.flush_output_after
@@ -139,7 +139,7 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
/*
* State machine:
*
*
* not in hash : in hash
* :
* UNUSED ------> REQUESTED --> RECEIVED
@@ -150,34 +150,30 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
* +----------------+------------+
* :
*/
typedef enum PrefetchStatus
{
PRFS_UNUSED = 0, /* unused slot */
PRFS_REQUESTED, /* request was written to the sendbuffer to
* PS, but not necessarily flushed. all fields
* except response valid */
PRFS_RECEIVED, /* all fields valid */
PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still
* valid */
typedef enum PrefetchStatus {
PRFS_UNUSED = 0, /* unused slot */
PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not
* necessarily flushed.
* all fields except response valid */
PRFS_RECEIVED, /* all fields valid */
PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */
} PrefetchStatus;
typedef struct PrefetchRequest
{
BufferTag buftag; /* must be first entry in the struct */
typedef struct PrefetchRequest {
BufferTag buftag; /* must be first entry in the struct */
XLogRecPtr effective_request_lsn;
XLogRecPtr actual_request_lsn;
NeonResponse *response; /* may be null */
NeonResponse *response; /* may be null */
PrefetchStatus status;
uint64 my_ring_index;
} PrefetchRequest;
/* prefetch buffer lookup hash table */
typedef struct PrfHashEntry
{
typedef struct PrfHashEntry {
PrefetchRequest *slot;
uint32 status;
uint32 hash;
uint32 status;
uint32 hash;
} PrfHashEntry;
#define SH_PREFIX prfh
@@ -201,42 +197,36 @@ typedef struct PrfHashEntry
/*
* PrefetchState maintains the state of (prefetch) getPage@LSN requests.
* It maintains a (ring) buffer of in-flight requests and responses.
*
*
* We maintain several indexes into the ring buffer:
* ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
*
*
* ring_unused points to the first unused slot of the buffer
* ring_receive is the next request that is to be received
* ring_last is the oldest received entry in the buffer
*
*
* Apart from being an entry in the ring buffer of prefetch requests, each
* PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
*/
typedef struct PrefetchState
{
MemoryContext bufctx; /* context for prf_buffer[].response
* allocations */
MemoryContext errctx; /* context for prf_buffer[].response
* allocations */
MemoryContext hashctx; /* context for prf_buffer */
typedef struct PrefetchState {
MemoryContext bufctx; /* context for prf_buffer[].response allocations */
MemoryContext errctx; /* context for prf_buffer[].response allocations */
MemoryContext hashctx; /* context for prf_buffer */
/* buffer indexes */
uint64 ring_unused; /* first unused slot */
uint64 ring_flush; /* next request to flush */
uint64 ring_receive; /* next slot that is to receive a response */
uint64 ring_last; /* min slot with a response value */
uint64 ring_unused; /* first unused slot */
uint64 ring_flush; /* next request to flush */
uint64 ring_receive; /* next slot that is to receive a response */
uint64 ring_last; /* min slot with a response value */
/* metrics / statistics */
int n_responses_buffered; /* count of PS responses not yet in
* buffers */
int n_requests_inflight; /* count of PS requests considered in
* flight */
int n_unused; /* count of buffers < unused, > last, that are
* also unused */
int n_responses_buffered; /* count of PS responses not yet in buffers */
int n_requests_inflight; /* count of PS requests considered in flight */
int n_unused; /* count of buffers < unused, > last, that are also unused */
/* the buffers */
prfh_hash *prf_hash;
PrefetchRequest prf_buffer[]; /* prefetch buffers */
prfh_hash *prf_hash;
PrefetchRequest prf_buffer[]; /* prefetch buffers */
} PrefetchState;
PrefetchState *MyPState;
@@ -274,10 +264,10 @@ static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
static bool
compact_prefetch_buffers(void)
{
uint64 empty_ring_index = MyPState->ring_last;
uint64 search_ring_index = MyPState->ring_receive;
int n_moved = 0;
uint64 empty_ring_index = MyPState->ring_last;
uint64 search_ring_index = MyPState->ring_receive;
int n_moved = 0;
if (MyPState->ring_receive == MyPState->ring_last)
return false;
@@ -292,14 +282,15 @@ compact_prefetch_buffers(void)
}
/*
* Here we have established: slots < search_ring_index have an unknown
* state (not scanned) slots >= search_ring_index and <= empty_ring_index
* are unused slots > empty_ring_index are in use, or outside our buffer's
* range. ... unless search_ring_index <= ring_last
*
* Here we have established:
* slots < search_ring_index have an unknown state (not scanned)
* slots >= search_ring_index and <= empty_ring_index are unused
* slots > empty_ring_index are in use, or outside our buffer's range.
* ... unless search_ring_index <= ring_last
*
* Therefore, there is a gap of at least one unused items between
* search_ring_index and empty_ring_index (both inclusive), which grows as
* we hit more unused items while moving backwards through the array.
* search_ring_index and empty_ring_index (both inclusive), which grows as we hit
* more unused items while moving backwards through the array.
*/
while (search_ring_index > MyPState->ring_last)
@@ -339,10 +330,7 @@ compact_prefetch_buffers(void)
/* empty the moved slot */
source_slot->status = PRFS_UNUSED;
source_slot->buftag = (BufferTag)
{
0
};
source_slot->buftag = (BufferTag) {0};
source_slot->response = NULL;
source_slot->my_ring_index = 0;
source_slot->effective_request_lsn = 0;
@@ -352,8 +340,8 @@ compact_prefetch_buffers(void)
}
/*
* Only when we've moved slots we can expect trailing unused slots, so
* only then we clean up trailing unused slots.
* Only when we've moved slots we can expect trailing unused slots,
* so only then we clean up trailing unused slots.
*/
if (n_moved > 0)
{
@@ -370,9 +358,10 @@ readahead_buffer_resize(int newsize, void *extra)
uint64 end,
nfree = newsize;
PrefetchState *newPState;
Size newprfs_size = offsetof(PrefetchState, prf_buffer) +
(sizeof(PrefetchRequest) * newsize);
Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
sizeof(PrefetchRequest) * newsize
);
/* don't try to re-initialize if we haven't initialized yet */
if (MyPState == NULL)
return;
@@ -399,12 +388,12 @@ readahead_buffer_resize(int newsize, void *extra)
newPState->ring_receive = newsize;
newPState->ring_flush = newsize;
/*
/*
* Copy over the prefetches.
*
*
* We populate the prefetch array from the end; to retain the most recent
* prefetches, but this has the benefit of only needing to do one
* iteration on the dataset, and trivial compaction.
* prefetches, but this has the benefit of only needing to do one iteration
* on the dataset, and trivial compaction.
*/
for (end = MyPState->ring_unused - 1;
end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
@@ -412,7 +401,7 @@ readahead_buffer_resize(int newsize, void *extra)
{
PrefetchRequest *slot = GetPrfSlot(end);
PrefetchRequest *newslot;
bool found;
bool found;
if (slot->status == PRFS_UNUSED)
continue;
@@ -475,11 +464,10 @@ consume_prefetch_responses(void)
static void
prefetch_cleanup_trailing_unused(void)
{
uint64 ring_index;
uint64 ring_index;
PrefetchRequest *slot;
while (MyPState->ring_last < MyPState->ring_receive)
{
while (MyPState->ring_last < MyPState->ring_receive) {
ring_index = MyPState->ring_last;
slot = GetPrfSlot(ring_index);
@@ -493,7 +481,7 @@ prefetch_cleanup_trailing_unused(void)
/*
* Wait for slot of ring_index to have received its response.
* The caller is responsible for making sure the request buffer is flushed.
*
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
@@ -525,7 +513,7 @@ prefetch_wait_for(uint64 ring_index)
/*
* Read the response of a prefetch request into its slot.
*
*
* The caller is responsible for making sure that the request for this buffer
* was flushed to the PageServer.
*
@@ -565,7 +553,7 @@ prefetch_read(PrefetchRequest *slot)
/*
* Disconnect hook - drop prefetches when the connection drops
*
*
* If we don't remove the failed prefetches, we'd be serving incorrect
* data to the smgr.
*/
@@ -576,7 +564,7 @@ prefetch_on_ps_disconnect(void)
while (MyPState->ring_receive < MyPState->ring_unused)
{
PrefetchRequest *slot;
uint64 ring_index = MyPState->ring_receive;
uint64 ring_index = MyPState->ring_receive;
slot = GetPrfSlot(ring_index);
@@ -606,7 +594,7 @@ prefetch_set_unused(uint64 ring_index)
PrefetchRequest *slot = GetPrfSlot(ring_index);
if (ring_index < MyPState->ring_last)
return; /* Should already be unused */
return; /* Should already be unused */
Assert(MyPState->ring_unused > ring_index);
@@ -637,11 +625,7 @@ prefetch_set_unused(uint64 ring_index)
/* run cleanup if we're holding back ring_last */
if (MyPState->ring_last == ring_index)
prefetch_cleanup_trailing_unused();
/*
* ... and try to store the buffered responses more compactly if > 12.5%
* of the buffer is gaps
*/
/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
else if (ReceiveBufferNeedsCompaction())
compact_prefetch_buffers();
}
@@ -649,7 +633,7 @@ prefetch_set_unused(uint64 ring_index)
static void
prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
{
bool found;
bool found;
NeonGetPageRequest request = {
.req.tag = T_NeonGetPageRequest,
.req.latest = false,
@@ -667,22 +651,21 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
}
else
{
XLogRecPtr lsn = neon_get_request_lsn(
&request.req.latest,
BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum
);
XLogRecPtr lsn = neon_get_request_lsn(
&request.req.latest,
BufTagGetNRelFileInfo(slot->buftag),
slot->buftag.forkNum,
slot->buftag.blockNum
);
/*
* Note: effective_request_lsn is potentially higher than the
* requested LSN, but still correct:
*
* Note: effective_request_lsn is potentially higher than the requested
* LSN, but still correct:
*
* We know there are no changes between the actual requested LSN and
* the value of effective_request_lsn: If there were, the page would
* have been in cache and evicted between those LSN values, which then
* would have had to result in a larger request LSN for this page.
*
* have been in cache and evicted between those LSN values, which
* then would have had to result in a larger request LSN for this page.
*
* It is possible that a concurrent backend loads the page, modifies
* it and then evicts it again, but the LSN of that eviction cannot be
* smaller than the current WAL insert/redo pointer, which is already
@@ -719,7 +702,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
* prefetch_register_buffer() - register and prefetch buffer
*
* Register that we may want the contents of BufferTag in the near future.
*
*
* If force_latest and force_lsn are not NULL, those values are sent to the
* pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
* to fill in these values manually.
@@ -731,14 +714,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
static uint64
prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
{
uint64 ring_index;
uint64 ring_index;
PrefetchRequest req;
PrefetchRequest *slot;
PrfHashEntry *entry;
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
req.buftag = tag;
Retry:
Retry:
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
if (entry != NULL)
@@ -758,10 +741,7 @@ Retry:
*/
if (force_latest && force_lsn)
{
/*
* if we want the latest version, any effective_request_lsn <
* request lsn is OK
*/
/* if we want the latest version, any effective_request_lsn < request lsn is OK */
if (*force_latest)
{
if (*force_lsn > slot->effective_request_lsn)
@@ -772,11 +752,7 @@ Retry:
}
}
/*
* if we don't want the latest version, only accept requests with
* the exact same LSN
*/
/* if we don't want the latest version, only accept requests with the exact same LSN */
else
{
if (*force_lsn != slot->effective_request_lsn)
@@ -823,8 +799,7 @@ Retry:
*/
if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
{
uint64 cleanup_index = MyPState->ring_last;
uint64 cleanup_index = MyPState->ring_last;
slot = GetPrfSlot(cleanup_index);
Assert(slot->status != PRFS_UNUSED);
@@ -839,10 +814,7 @@ Retry:
}
else
{
/*
* We have the slot for ring_last, so that must still be in
* progress
*/
/* We have the slot for ring_last, so that must still be in progress */
switch (slot->status)
{
case PRFS_REQUESTED:
@@ -861,8 +833,8 @@ Retry:
}
/*
* The next buffer pointed to by `ring_unused` is now definitely empty, so
* we can insert the new request to it.
* The next buffer pointed to by `ring_unused` is now definitely empty,
* so we can insert the new request to it.
*/
ring_index = MyPState->ring_unused;
slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
@@ -888,10 +860,7 @@ Retry:
{
if (!page_server->flush())
{
/*
* Prefetch set is reset in case of error, so we should try to
* register our request once again
*/
/* Prefetch set is reset in case of error, so we should try to register our request once again */
goto Retry;
}
MyPState->ring_flush = MyPState->ring_unused;
@@ -903,10 +872,8 @@ Retry:
static NeonResponse *
page_server_request(void const *req)
{
NeonResponse *resp;
do
{
NeonResponse* resp;
do {
while (!page_server->send((NeonRequest *) req) || !page_server->flush());
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
@@ -918,7 +885,7 @@ page_server_request(void const *req)
StringInfoData
nm_pack_request(NeonRequest *msg)
nm_pack_request(NeonRequest * msg)
{
StringInfoData s;
@@ -1034,7 +1001,7 @@ nm_unpack_response(StringInfo s)
/* XXX: should be varlena */
memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
pq_getmsgend(s);
Assert(msg_resp->tag == T_NeonGetPageResponse);
resp = (NeonResponse *) msg_resp;
@@ -1090,7 +1057,7 @@ nm_unpack_response(StringInfo s)
/* dump to json for debugging / error reporting purposes */
char *
nm_to_string(NeonMessage *msg)
nm_to_string(NeonMessage * msg)
{
StringInfoData s;
@@ -1219,7 +1186,7 @@ nm_to_string(NeonMessage *msg)
* directly because it skips the logging if the LSN is new enough.
*/
static XLogRecPtr
log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno,
Page page, bool page_std)
{
PGAlignedBlock copied_buffer;
@@ -1242,10 +1209,11 @@ PageIsEmptyHeapPage(char *buffer)
}
static void
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
#if PG_MAJORVERSION_NUM < 16
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
char *buffer, bool force)
#else
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
const char *buffer, bool force)
#endif
{
XLogRecPtr lsn = PageGetLSN((Page) buffer);
@@ -1345,23 +1313,24 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
void
neon_init(void)
{
Size prfs_size;
Size prfs_size;
if (MyPState != NULL)
return;
prfs_size = offsetof(PrefetchState, prf_buffer) +
sizeof(PrefetchRequest) * readahead_buffer_size;
prfs_size = offsetof(PrefetchState, prf_buffer) + (
sizeof(PrefetchRequest) * readahead_buffer_size
);
MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
MyPState->n_unused = readahead_buffer_size;
MyPState->bufctx = SlabContextCreate(TopMemoryContext,
"NeonSMGR/prefetch",
SLAB_DEFAULT_BLOCK_SIZE * 17,
PS_GETPAGERESPONSE_SIZE);
MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/errors",
ALLOCSET_DEFAULT_SIZES);
MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
@@ -1601,14 +1570,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
/*
* Newly created relation is empty, remember that in the relsize cache.
*
* Note that in REDO, this is called to make sure the relation fork
* exists, but it does not truncate the relation. So, we can only update
* the relsize if it didn't exist before.
*
* Note that in REDO, this is called to make sure the relation fork exists,
* but it does not truncate the relation. So, we can only update the
* relsize if it didn't exist before.
*
* Also, in redo, we must make sure to update the cached size of the
* relation, as that is the primary source of truth for REDO's file length
* considerations, and as file extension isn't (perfectly) logged, we need
* to take care of that before we hit file size checks.
* relation, as that is the primary source of truth for REDO's
* file length considerations, and as file extension isn't (perfectly)
* logged, we need to take care of that before we hit file size checks.
*
* FIXME: This is currently not just an optimization, but required for
* correctness. Postgres can call smgrnblocks() on the newly-created
@@ -1684,7 +1653,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
#endif
{
XLogRecPtr lsn;
BlockNumber n_blocks = 0;
BlockNumber n_blocks = 0;
switch (reln->smgr_relpersistence)
{
@@ -1725,10 +1694,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
}
/*
* Usually Postgres doesn't extend a relation by more than one page at a
* time (which would leave holes). But this rule is violated in PG-15, where
* CreateAndCopyRelationData calls smgrextend for the destination relation
* using the size of the source relation.
* Usually Postgres doesn't extend a relation by more than one page at a time
* (which would leave holes). But this rule is violated in PG-15, where CreateAndCopyRelationData
* calls smgrextend for the destination relation using the size of the source relation.
*/
n_blocks = neon_nblocks(reln, forkNum);
while (n_blocks < blkno)
@@ -1749,13 +1717,11 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
#endif
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
* to the relation metadata update now.
* smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
* An smgr_write() call will come for the buffer later, after it has been initialized
* with the real page contents, and it is eventually evicted from the buffer cache.
* But we need a valid LSN to the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
@@ -1814,9 +1780,9 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("cannot extend file \"%s\" beyond %u blocks",
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
errmsg("cannot extend file \"%s\" beyond %u blocks",
relpath(reln->smgr_rlocator, forkNum),
InvalidBlockNumber)));
/* Don't log any pages if we're not allowed to do so. */
if (!XLogInsertAllowed())
@@ -1898,12 +1864,12 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
bool
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
BufferTag tag;
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
switch (reln->smgr_relpersistence)
{
case 0: /* probably shouldn't happen, but ignore it */
case 0: /* probably shouldn't happen, but ignore it */
case RELPERSISTENCE_PERMANENT:
break;
@@ -1918,9 +1884,10 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
return false;
tag.forkNum = forknum;
tag.blockNum = blocknum;
tag = (BufferTag) {
.forkNum = forknum,
.blockNum = blocknum
};
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
ring_index = prefetch_register_buffer(tag, NULL, NULL);
@@ -1973,21 +1940,23 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
* Although this function is defined in the neon extension, it is also called directly by neon_test_utils.
* To avoid breaking those tests at runtime, please keep the function signatures in sync.
*/
void
#if PG_MAJORVERSION_NUM < 16
void PGDLLEXPORT
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, char *buffer)
#else
void PGDLLEXPORT
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, void *buffer)
#endif
{
NeonResponse *resp;
BufferTag buftag;
uint64 ring_index;
PrfHashEntry *entry;
PrefetchRequest *slot;
BufferTag buftag =
{
buftag = (BufferTag) {
.forkNum = forkNum,
.blockNum = blkno,
};
@@ -1996,11 +1965,12 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
/*
* The redo process does not lock pages that it needs to replay but are
* not in the shared buffers, so a concurrent process may request the page
* after redo has decided it won't redo that page and updated the LwLSN
* for that page. If we're in hot standby we need to take care that we
* don't return until after REDO has finished replaying up to that LwLSN,
* as the page should have been locked up to that point.
* not in the shared buffers, so a concurrent process may request the
* page after redo has decided it won't redo that page and updated the
* LwLSN for that page.
* If we're in hot standby we need to take care that we don't return
* until after REDO has finished replaying up to that LwLSN, as the page
* should have been locked up to that point.
*
* See also the description on neon_redo_read_buffer_filter below.
*
@@ -2008,7 +1978,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* concurrent failed read IOs. Those IOs should never have a request_lsn
* that is as large as the WAL record we're currently replaying, if it
* weren't for the behaviour of the LwLsn cache that uses the highest
* value of the LwLsn cache when the entry is not found.
* value of the LwLsn cache when the entry is not found.
*/
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
XLogWaitForReplayOf(request_lsn);
@@ -2026,14 +1996,12 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
ring_index = slot->my_ring_index;
pgBufferUsage.prefetch.hits += 1;
}
else /* the current prefetch LSN is not large
* enough, so drop the prefetch */
else /* the current prefetch LSN is not large enough, so drop the prefetch */
{
/*
* We can't drop cache for not-yet-received requested items. It is
* unlikely this happens, but it can happen if prefetch distance
* is large enough and a backend didn't consume all prefetch
* requests.
* unlikely this happens, but it can happen if prefetch distance is
* large enough and a backend didn't consume all prefetch requests.
*/
if (slot->status == PRFS_REQUESTED)
{
@@ -2060,11 +2028,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
else
{
/*
* Empty our reference to the prefetch buffer's hash entry. When
* we wait for prefetches, the entry reference is invalidated by
* potential updates to the hash, and when we reconnect to the
* pageserver the prefetch we're waiting for may be dropped, in
* which case we need to retry and take the branch above.
* Empty our reference to the prefetch buffer's hash entry.
* When we wait for prefetches, the entry reference is invalidated by
* potential updates to the hash, and when we reconnect to the
* pageserver the prefetch we're waiting for may be dropped,
* in which case we need to retry and take the branch above.
*/
entry = NULL;
}
@@ -2112,10 +2080,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
* neon_read() -- Read the specified block from a relation.
*/
void
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
#if PG_MAJORVERSION_NUM < 16
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
char *buffer)
#else
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
void *buffer)
#endif
{
bool latest;
@@ -2250,10 +2219,11 @@ hexdump_page(char *page)
* use mdextend().
*/
void
neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
#if PG_MAJORVERSION_NUM < 16
neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
char *buffer, bool skipFsync)
#else
neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
const void *buffer, bool skipFsync)
#endif
{
XLogRecPtr lsn;
@@ -2757,7 +2727,6 @@ static void
neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
{
BlockNumber relsize;
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{
@@ -2770,11 +2739,11 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
else
{
/*
* Size was not cached. We populate the cache now, with the size of
* the relation measured after this WAL record is applied.
* Size was not cached. We populate the cache now, with the size of the
* relation measured after this WAL record is applied.
*
* This length is later reused when we open the smgr to read the
* block, which is fine and expected.
* This length is later reused when we open the smgr to read the block,
* which is fine and expected.
*/
NeonResponse *response;
@@ -2794,7 +2763,7 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
Assert(response->tag == T_NeonNblocksResponse);
nbresponse = (NeonNblocksResponse *) response;
relsize = Max(nbresponse->n_blocks, blkno + 1);
relsize = Max(nbresponse->n_blocks, blkno+1);
set_cached_relsize(rinfo, forknum, relsize);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
@@ -2836,7 +2805,7 @@ get_fsm_physical_block(BlockNumber heapblk)
/*
* Return whether we can skip the redo for this block.
*
*
* The conditions for skipping the IO are:
*
* - The block is not in the shared buffers, and
@@ -2875,7 +2844,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
XLogRecPtr end_recptr = record->EndRecPtr;
NRelFileInfo rinfo;
ForkNumber forknum;
BlockNumber blkno;
BlockNumber blkno;
BufferTag tag;
uint32 hash;
LWLock *partitionLock;
@@ -2894,8 +2863,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
/*
* Out of an abundance of caution, we always run redo on shared catalogs,
* regardless of whether the block is stored in shared buffers. See also
* this function's top comment.
* regardless of whether the block is stored in shared buffers.
* See also this function's top comment.
*/
if (!OidIsValid(NInfoGetDbOid(rinfo)))
return false;
@@ -2921,9 +2890,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
/* In both cases set lwlsn past this WAL record */
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
/*
* we don't have the buffer in memory, update lwLsn past this record, also
* evict page from file cache
/* we don't have the buffer in memory, update lwLsn past this record,
* also evict page from file cache
*/
if (no_redo_needed)
lfc_evict(rinfo, forknum, blkno);

View File

@@ -178,7 +178,7 @@ WalProposerFree(WalProposer *wp)
if (wp->propTermHistory.entries != NULL)
pfree(wp->propTermHistory.entries);
wp->propTermHistory.entries = NULL;
pfree(wp);
}
@@ -275,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
wp->config->safekeeper_connection_timeout))
{
walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
ShutdownConnection(sk);
}
}
@@ -395,7 +395,7 @@ ResetConnection(Safekeeper *sk)
* https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
*/
walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
sk->host, sk->port, wp->api.conn_error_message(sk));
/*
* Even though the connection failed, we still need to clean up the
@@ -489,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
*/
case SS_OFFLINE:
walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
sk->host, sk->port);
sk->host, sk->port);
break; /* actually unreachable, but prevents
* -Wimplicit-fallthrough */
@@ -525,7 +525,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
*/
case SS_VOTING:
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk->state));
sk->port, FormatSafekeeperState(sk->state));
ResetConnection(sk);
return;
@@ -554,7 +554,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
*/
case SS_IDLE:
walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
sk->port, FormatSafekeeperState(sk->state));
sk->port, FormatSafekeeperState(sk->state));
ResetConnection(sk);
return;
@@ -580,7 +580,7 @@ HandleConnectionEvent(Safekeeper *sk)
{
case WP_CONN_POLLING_OK:
walprop_log(LOG, "connected with node %s:%s", sk->host,
sk->port);
sk->port);
sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);
/*
@@ -604,7 +604,7 @@ HandleConnectionEvent(Safekeeper *sk)
case WP_CONN_POLLING_FAILED:
walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
sk->host, sk->port, wp->api.conn_error_message(sk));
/*
* If connecting failed, we don't want to restart the connection
@@ -641,7 +641,7 @@ SendStartWALPush(Safekeeper *sk)
if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
{
walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
sk->host, sk->port, wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return;
}
@@ -678,7 +678,7 @@ RecvStartWALPushResult(Safekeeper *sk)
case WP_EXEC_FAILED:
walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
sk->host, sk->port, wp->api.conn_error_message(sk));
sk->host, sk->port, wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return;
@@ -689,7 +689,7 @@ RecvStartWALPushResult(Safekeeper *sk)
*/
case WP_EXEC_UNEXPECTED_SUCCESS:
walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
sk->host, sk->port);
sk->host, sk->port);
ShutdownConnection(sk);
return;
}
@@ -758,8 +758,8 @@ RecvAcceptorGreeting(Safekeeper *sk)
{
/* Another compute with higher term is running. */
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
sk->greetResponse.term, wp->propTerm);
sk->host, sk->port,
sk->greetResponse.term, wp->propTerm);
}
/*
@@ -817,11 +817,11 @@ RecvVoteResponse(Safekeeper *sk)
return;
walprop_log(LOG,
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));
/*
* In case of acceptor rejecting our vote, bail out, but only if either it
@@ -832,8 +832,8 @@ RecvVoteResponse(Safekeeper *sk)
(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
{
walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
sk->host, sk->port,
sk->voteResponse.term, wp->propTerm);
sk->host, sk->port,
sk->voteResponse.term, wp->propTerm);
}
Assert(sk->voteResponse.term == wp->propTerm);
@@ -877,10 +877,10 @@ HandleElectedProposer(WalProposer *wp)
if (wp->truncateLsn < wp->propEpochStartLsn)
{
walprop_log(LOG,
"start recovery because truncateLsn=%X/%X is not "
"equal to epochStartLsn=%X/%X",
LSN_FORMAT_ARGS(wp->truncateLsn),
LSN_FORMAT_ARGS(wp->propEpochStartLsn));
"start recovery because truncateLsn=%X/%X is not "
"equal to epochStartLsn=%X/%X",
LSN_FORMAT_ARGS(wp->truncateLsn),
LSN_FORMAT_ARGS(wp->propEpochStartLsn));
/* Perform recovery */
if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
walprop_log(FATAL, "Failed to recover state");
@@ -990,9 +990,9 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
{
walprop_log(WARNING,
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
LSN_FORMAT_ARGS(wp->timelineStartLsn),
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
LSN_FORMAT_ARGS(wp->timelineStartLsn),
LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
}
wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
}
@@ -1038,11 +1038,11 @@ DetermineEpochStartLsn(WalProposer *wp)
wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;
walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
wp->quorum,
wp->propTerm,
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
LSN_FORMAT_ARGS(wp->truncateLsn));
wp->quorum,
wp->propTerm,
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
LSN_FORMAT_ARGS(wp->truncateLsn));
/*
* Ensure the basebackup we are running (at RedoStartLsn) matches LSN
@@ -1070,18 +1070,18 @@ DetermineEpochStartLsn(WalProposer *wp)
walprop_shared->mineLastElectedTerm)))
{
walprop_log(PANIC,
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
LSN_FORMAT_ARGS(wp->propEpochStartLsn),
LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
}
}
walprop_shared->mineLastElectedTerm = wp->propTerm;
}
/*
* WalProposer has just elected itself and initialized history, so we can
* call election callback. Usually it updates truncateLsn to fetch WAL for
* logical replication.
* WalProposer has just elected itself and initialized history, so
* we can call election callback. Usually it updates truncateLsn to
* fetch WAL for logical replication.
*/
wp->api.after_election(wp);
}
@@ -1155,8 +1155,8 @@ SendProposerElected(Safekeeper *sk)
sk->startStreamingAt = wp->truncateLsn;
walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
LSN_FORMAT_ARGS(sk->startStreamingAt));
sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
LSN_FORMAT_ARGS(sk->startStreamingAt));
}
}
else
@@ -1190,8 +1190,8 @@ SendProposerElected(Safekeeper *sk)
lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
walprop_log(LOG,
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));
resetStringInfo(&sk->outbuf);
pq_sendint64_le(&sk->outbuf, msg.tag);
@@ -1355,11 +1355,11 @@ SendAppendRequests(Safekeeper *sk)
PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);
walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
req->endLsn - req->beginLsn,
LSN_FORMAT_ARGS(req->beginLsn),
LSN_FORMAT_ARGS(req->endLsn),
LSN_FORMAT_ARGS(req->commitLsn),
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
req->endLsn - req->beginLsn,
LSN_FORMAT_ARGS(req->beginLsn),
LSN_FORMAT_ARGS(req->endLsn),
LSN_FORMAT_ARGS(req->commitLsn),
LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);
resetStringInfo(&sk->outbuf);
@@ -1398,8 +1398,8 @@ SendAppendRequests(Safekeeper *sk)
case PG_ASYNC_WRITE_FAIL:
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
sk->host, sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return false;
default:
@@ -1438,17 +1438,17 @@ RecvAppendResponses(Safekeeper *sk)
break;
walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
sk->appendResponse.term,
LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
sk->host, sk->port);
sk->appendResponse.term,
LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
sk->host, sk->port);
if (sk->appendResponse.term > wp->propTerm)
{
/* Another compute with higher term is running. */
walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
sk->host, sk->port,
sk->appendResponse.term, wp->propTerm);
sk->host, sk->port,
sk->appendResponse.term, wp->propTerm);
}
readAnything = true;
@@ -1493,7 +1493,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
/* read value length */
rf->currentClusterSize = pq_getmsgint64(reply_message);
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
rf->currentClusterSize);
rf->currentClusterSize);
}
else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
{
@@ -1501,7 +1501,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
/* read value length */
rf->last_received_lsn = pq_getmsgint64(reply_message);
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
LSN_FORMAT_ARGS(rf->last_received_lsn));
LSN_FORMAT_ARGS(rf->last_received_lsn));
}
else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
{
@@ -1509,7 +1509,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
/* read value length */
rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
}
else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
{
@@ -1517,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
/* read value length */
rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
}
else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
{
@@ -1530,7 +1530,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
/* Copy because timestamptz_to_str returns a static buffer */
replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
rf->replytime, replyTimeStr);
rf->replytime, replyTimeStr);
pfree(replyTimeStr);
}
@@ -1700,8 +1700,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)
case PG_ASYNC_READ_FAIL:
walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return false;
}
@@ -1740,7 +1740,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
if (tag != anymsg->tag)
{
walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
sk->port, FormatSafekeeperState(sk->state));
sk->port, FormatSafekeeperState(sk->state));
ResetConnection(sk);
return false;
}
@@ -1816,8 +1816,8 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
if (!wp->api.conn_blocking_write(sk, msg, msg_size))
{
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
sk->host, sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return false;
}
@@ -1863,8 +1863,8 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
return false;
case PG_ASYNC_WRITE_FAIL:
walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
sk->host, sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
ShutdownConnection(sk);
return false;
default:
@@ -1902,8 +1902,8 @@ AsyncFlush(Safekeeper *sk)
return false;
case -1:
walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
sk->host, sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
sk->host, sk->port, FormatSafekeeperState(sk->state),
wp->api.conn_error_message(sk));
ResetConnection(sk);
return false;
default:
@@ -2008,7 +2008,7 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
* and then an assertion that's guaranteed to fail.
*/
walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
Assert(events_ok_for_state);
}
}
@@ -2111,7 +2111,7 @@ FormatEvents(WalProposer *wp, uint32 events)
if (events & (~all_flags))
{
walprop_log(WARNING, "Event formatting found unexpected component %d",
events & (~all_flags));
events & (~all_flags));
return_str[6] = '*';
return_str[7] = '\0';
}

View File

@@ -356,8 +356,7 @@ typedef struct Safekeeper
/* postgres-specific fields */
#ifndef WALPROPOSER_LIB
#ifndef WALPROPOSER_LIB
/*
* postgres protocol connection to the WAL acceptor
*
@@ -375,18 +374,17 @@ typedef struct Safekeeper
* Position in wait event set. Equal to -1 if no event
*/
int eventPos;
#endif
#endif
/* WalProposer library specifics */
#ifdef WALPROPOSER_LIB
#ifdef WALPROPOSER_LIB
/*
* Buffer for incoming messages. Usually Rust vector is stored here.
* Caller is responsible for freeing the buffer.
*/
StringInfoData inbuf;
#endif
#endif
} Safekeeper;
/* Re-exported PostgresPollingStatusType */
@@ -474,7 +472,7 @@ typedef struct walproposer_api
WalProposerConnStatusType (*conn_status) (Safekeeper *sk);
/* Start the connection, aka PQconnectStart. */
void (*conn_connect_start) (Safekeeper *sk);
void (*conn_connect_start) (Safekeeper *sk);
/* Poll an asynchronous connection, aka PQconnectPoll. */
WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);
@@ -492,7 +490,7 @@ typedef struct walproposer_api
void (*conn_finish) (Safekeeper *sk);
/*
* Try to read CopyData message from the safekeeper, aka PQgetCopyData.
* Try to read CopyData message from the safekeeper, aka PQgetCopyData.
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
@@ -512,7 +510,7 @@ typedef struct walproposer_api
void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
/* Allocate WAL reader. */
void (*wal_reader_allocate) (Safekeeper *sk);
void (*wal_reader_allocate) (Safekeeper *sk);
/* Deallocate event set. */
void (*free_event_set) (WalProposer *wp);
@@ -574,7 +572,7 @@ typedef struct walproposer_api
/*
* Called right after the proposer was elected, but before it started
* recovery and sent ProposerElected message to the safekeepers.
*
*
* Used by logical replication to update truncateLsn.
*/
void (*after_election) (WalProposer *wp);
@@ -628,10 +626,10 @@ typedef struct WalProposerConfig
uint64 systemId;
/* Will be passed to safekeepers in greet request. */
TimeLineID pgTimeline;
TimeLineID pgTimeline;
#ifdef WALPROPOSER_LIB
void *callback_data;
void *callback_data;
#endif
} WalProposerConfig;
@@ -712,11 +710,10 @@ extern void WalProposerPoll(WalProposer *wp);
extern void WalProposerFree(WalProposer *wp);
#define WPEVENT 1337 /* special log level for walproposer internal
* events */
#define WPEVENT 1337 /* special log level for walproposer internal events */
#ifdef WALPROPOSER_LIB
extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...);
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
#else
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)

View File

@@ -9,9 +9,8 @@
#include "utils/datetime.h"
#include "miscadmin.h"
void
ExceptionalCondition(const char *conditionName,
const char *fileName, int lineNumber)
void ExceptionalCondition(const char *conditionName,
const char *fileName, int lineNumber)
{
fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
fileName, lineNumber, conditionName);
@@ -170,18 +169,17 @@ timestamptz_to_str(TimestampTz t)
bool
TimestampDifferenceExceeds(TimestampTz start_time,
TimestampTz stop_time,
int msec)
TimestampTz stop_time,
int msec)
{
TimestampTz diff = stop_time - start_time;
return (diff >= msec * INT64CONST(1000));
}
void
WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...)
WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...)
{
char buf[1024];
char buf[1024];
va_list args;
fmt = _(fmt);

View File

@@ -637,8 +637,8 @@ walprop_connect_start(Safekeeper *sk)
*/
sk->conn = palloc(sizeof(WalProposerConn));
sk->conn->pg_conn = pg_conn;
sk->conn->is_nonblocking = false; /* connections always start in
* blocking mode */
sk->conn->is_nonblocking = false; /* connections always start in blocking
* mode */
sk->conn->recvbuf = NULL;
}
@@ -1291,11 +1291,10 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
/*
* Apart from walproposer, basebackup LSN page is also written out by
* postgres itself which writes WAL only in pages, and in basebackup it is
* inherently dummy (only safekeepers have historic WAL). Update WAL
* buffers here to avoid dummy page overwriting correct one we download
* here. Ugly, but alternatives are about the same ugly. We won't need
* that if we switch to on-demand WAL download from safekeepers, without
* writing to disk.
* inherently dummy (only safekeepers have historic WAL). Update WAL buffers
* here to avoid dummy page overwriting correct one we download here. Ugly,
* but alternatives are about the same ugly. We won't need that if we switch
* to on-demand WAL download from safekeepers, without writing to disk.
*
* https://github.com/neondatabase/neon/issues/5749
*/
@@ -1682,17 +1681,17 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
static void
walprop_pg_after_election(WalProposer *wp)
{
FILE *f;
XLogRecPtr lrRestartLsn;
FILE* f;
XLogRecPtr lrRestartLsn;
/* We don't need to do anything in syncSafekeepers mode. */
/* We don't need to do anything in syncSafekeepers mode.*/
if (wp->config->syncSafekeepers)
return;
/*
* If there are active logical replication subscriptions, we need to provide
* enough WAL for their WAL senders based on the position of their
* replication slots.
* If there are active logical replication subscriptions, we need
* to provide enough WAL for their WAL senders based on the position
* of their replication slots.
*/
f = fopen("restart.lsn", "rb");
if (f != NULL && !wp->config->syncSafekeepers)
@@ -1701,12 +1700,8 @@ walprop_pg_after_election(WalProposer *wp)
fclose(f);
if (lrRestartLsn != InvalidXLogRecPtr)
{
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
/*
* start from the beginning of the segment to fetch page headers
* verified by XLogReader
*/
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
/* start from the beginning of the segment to fetch page headers verified by XLogReader */
lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
}

121
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
[[package]]
name = "aiohttp"
@@ -98,18 +98,18 @@ speedups = ["Brotli", "aiodns", "brotlicffi"]
[[package]]
name = "aiopg"
version = "1.4.0"
version = "1.3.4"
description = "Postgres integration with asyncio."
optional = false
python-versions = ">=3.7"
python-versions = ">=3.6"
files = [
{file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"},
{file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"},
{file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"},
{file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"},
]
[package.dependencies]
async-timeout = ">=3.0,<5.0"
psycopg2-binary = ">=2.9.5"
psycopg2-binary = ">=2.8.4"
[package.extras]
sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
@@ -160,71 +160,64 @@ pluggy = ">=0.4.0"
[[package]]
name = "async-timeout"
version = "4.0.3"
version = "4.0.2"
description = "Timeout context manager for asyncio programs"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.6"
files = [
{file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
{file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
{file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
{file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
]
[[package]]
name = "asyncpg"
version = "0.29.0"
version = "0.27.0"
description = "An asyncio PostgreSQL driver"
optional = false
python-versions = ">=3.8.0"
python-versions = ">=3.7.0"
files = [
{file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"},
{file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"},
{file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"},
{file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"},
{file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"},
{file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"},
{file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"},
{file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"},
{file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"},
{file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"},
{file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"},
{file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"},
{file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"},
{file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"},
{file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"},
{file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"},
{file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"},
{file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"},
{file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"},
{file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"},
{file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"},
{file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"},
{file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"},
{file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"},
{file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"},
{file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"},
{file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"},
{file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"},
{file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"},
{file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"},
{file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"},
{file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"},
{file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"},
{file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"},
{file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"},
{file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"},
{file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"},
{file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"},
{file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"},
{file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"},
{file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"},
{file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"},
{file = "asyncpg-0.27.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:20b596d8d074f6f695c13ffb8646d0b6bb1ab570ba7b0cfd349b921ff03cfc1e"},
{file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a6206210c869ebd3f4eb9e89bea132aefb56ff3d1b7dd7e26b102b17e27bbb1"},
{file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7a94c03386bb95456b12c66026b3a87d1b965f0f1e5733c36e7229f8f137747"},
{file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bfc3980b4ba6f97138b04f0d32e8af21d6c9fa1f8e6e140c07d15690a0a99279"},
{file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9654085f2b22f66952124de13a8071b54453ff972c25c59b5ce1173a4283ffd9"},
{file = "asyncpg-0.27.0-cp310-cp310-win32.whl", hash = "sha256:879c29a75969eb2722f94443752f4720d560d1e748474de54ae8dd230bc4956b"},
{file = "asyncpg-0.27.0-cp310-cp310-win_amd64.whl", hash = "sha256:ab0f21c4818d46a60ca789ebc92327d6d874d3b7ccff3963f7af0a21dc6cff52"},
{file = "asyncpg-0.27.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:18f77e8e71e826ba2d0c3ba6764930776719ae2b225ca07e014590545928b576"},
{file = "asyncpg-0.27.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2232d4625c558f2aa001942cac1d7952aa9f0dbfc212f63bc754277769e1ef2"},
{file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a3a4ff43702d39e3c97a8786314123d314e0f0e4dabc8367db5b665c93914de"},
{file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccddb9419ab4e1c48742457d0c0362dbdaeb9b28e6875115abfe319b29ee225d"},
{file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:768e0e7c2898d40b16d4ef7a0b44e8150db3dd8995b4652aa1fe2902e92c7df8"},
{file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609054a1f47292a905582a1cfcca51a6f3f30ab9d822448693e66fdddde27920"},
{file = "asyncpg-0.27.0-cp311-cp311-win32.whl", hash = "sha256:8113e17cfe236dc2277ec844ba9b3d5312f61bd2fdae6d3ed1c1cdd75f6cf2d8"},
{file = "asyncpg-0.27.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb71211414dd1eeb8d31ec529fe77cff04bf53efc783a5f6f0a32d84923f45cf"},
{file = "asyncpg-0.27.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4750f5cf49ed48a6e49c6e5aed390eee367694636c2dcfaf4a273ca832c5c43c"},
{file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:eca01eb112a39d31cc4abb93a5aef2a81514c23f70956729f42fb83b11b3483f"},
{file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5710cb0937f696ce303f5eed6d272e3f057339bb4139378ccecafa9ee923a71c"},
{file = "asyncpg-0.27.0-cp37-cp37m-win_amd64.whl", hash = "sha256:71cca80a056ebe19ec74b7117b09e650990c3ca535ac1c35234a96f65604192f"},
{file = "asyncpg-0.27.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4bb366ae34af5b5cabc3ac6a5347dfb6013af38c68af8452f27968d49085ecc0"},
{file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16ba8ec2e85d586b4a12bcd03e8d29e3d99e832764d6a1d0b8c27dbbe4a2569d"},
{file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d20dea7b83651d93b1eb2f353511fe7fd554752844523f17ad30115d8b9c8cd6"},
{file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e56ac8a8237ad4adec97c0cd4728596885f908053ab725e22900b5902e7f8e69"},
{file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bf21ebf023ec67335258e0f3d3ad7b91bb9507985ba2b2206346de488267cad0"},
{file = "asyncpg-0.27.0-cp38-cp38-win32.whl", hash = "sha256:69aa1b443a182b13a17ff926ed6627af2d98f62f2fe5890583270cc4073f63bf"},
{file = "asyncpg-0.27.0-cp38-cp38-win_amd64.whl", hash = "sha256:62932f29cf2433988fcd799770ec64b374a3691e7902ecf85da14d5e0854d1ea"},
{file = "asyncpg-0.27.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fddcacf695581a8d856654bc4c8cfb73d5c9df26d5f55201722d3e6a699e9629"},
{file = "asyncpg-0.27.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7d8585707ecc6661d07367d444bbaa846b4e095d84451340da8df55a3757e152"},
{file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:975a320baf7020339a67315284a4d3bf7460e664e484672bd3e71dbd881bc692"},
{file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2232ebae9796d4600a7819fc383da78ab51b32a092795f4555575fc934c1c89d"},
{file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:88b62164738239f62f4af92567b846a8ef7cf8abf53eddd83650603de4d52163"},
{file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eb4b2fdf88af4fb1cc569781a8f933d2a73ee82cd720e0cb4edabbaecf2a905b"},
{file = "asyncpg-0.27.0-cp39-cp39-win32.whl", hash = "sha256:8934577e1ed13f7d2d9cea3cc016cc6f95c19faedea2c2b56a6f94f257cea672"},
{file = "asyncpg-0.27.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b6499de06fe035cf2fa932ec5617ed3f37d4ebbf663b655922e105a484a6af9"},
{file = "asyncpg-0.27.0.tar.gz", hash = "sha256:720986d9a4705dd8a40fdf172036f5ae787225036a7eb46e704c45aa8f62c054"},
]
[package.dependencies]
async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""}
[package.extras]
docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"]
dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4,<5.1.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"]
docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"]
[[package]]
name = "attrs"
@@ -2483,16 +2476,6 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2714,4 +2697,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"
content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"

View File

@@ -4,10 +4,6 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true
[features]
default = []
testing = []
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
@@ -61,7 +57,6 @@ thiserror.workspace = true
tls-listener.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
@@ -74,12 +69,13 @@ webpki-roots.workspace = true
x509-parser.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres-protocol.workspace = true
smol_str.workspace = true
workspace_hack.workspace = true
tokio-util.workspace = true
[dev-dependencies]
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true
postgres-protocol.workspace = true

View File

@@ -62,9 +62,6 @@ pub enum AuthErrorImpl {
Please add it to the allowed list in the Neon console."
)]
IpAddressNotAllowed,
#[error("Too many connections to this endpoint. Please try again later.")]
TooManyConnections,
}
#[derive(Debug, Error)]
@@ -83,10 +80,6 @@ impl AuthError {
pub fn ip_address_not_allowed() -> Self {
AuthErrorImpl::IpAddressNotAllowed.into()
}
pub fn too_many_connections() -> Self {
AuthErrorImpl::TooManyConnections.into()
}
}
impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -109,7 +102,6 @@ impl UserFacingError for AuthError {
MissingEndpointName => self.to_string(),
Io(_) => "Internal error".to_string(),
IpAddressNotAllowed => self.to_string(),
TooManyConnections => self.to_string(),
}
}
}

View File

@@ -3,11 +3,9 @@ mod hacks;
mod link;
pub use link::LinkAuthError;
use smol_str::SmolStr;
use tokio_postgres::config::AuthKeys;
use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::validate_password_and_exchange;
use crate::console::errors::GetAuthInfoError;
use crate::console::provider::AuthInfo;
use crate::console::AuthSecret;
@@ -26,12 +24,31 @@ use crate::{
};
use futures::TryFutureExt;
use std::borrow::Cow;
use std::net::IpAddr;
use std::ops::ControlFlow;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{error, info, warn};
/// The outcome of an authentication attempt that succeeded.
pub struct AuthSuccess<T> {
    /// Whether [`pq_proto::BeMessage::AuthenticationOk`] was already sent to the client.
    pub reported_auth_ok: bool,
    /// The payload that counts as the positive result of authentication.
    pub value: T,
}

impl<T> AuthSuccess<T> {
    /// Transforms the contained value with `f`, in the spirit of
    /// [`std::option::Option::map`], turning an [`AuthSuccess<T>`] into an
    /// [`AuthSuccess<R>`].
    ///
    /// The `reported_auth_ok` flag is carried over unchanged.
    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
        let Self {
            reported_auth_ok,
            value,
        } = self;

        AuthSuccess {
            reported_auth_ok,
            value: f(value),
        }
    }
}
/// This type serves two purposes:
///
/// * When `T` is `()`, it's just a regular auth backend selector
@@ -44,11 +61,9 @@ pub enum BackendType<'a, T> {
/// Current Cloud API (V2).
Console(Cow<'a, console::provider::neon::Api>, T),
/// Local mock of Cloud API (V2).
#[cfg(feature = "testing")]
Postgres(Cow<'a, console::provider::mock::Api>, T),
/// Authentication via a web browser.
Link(Cow<'a, url::ApiUrl>),
#[cfg(test)]
/// Test backend.
Test(&'a dyn TestBackend),
}
@@ -63,10 +78,8 @@ impl std::fmt::Display for BackendType<'_, ()> {
use BackendType::*;
match self {
Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
#[cfg(feature = "testing")]
Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
#[cfg(test)]
Test(_) => fmt.debug_tuple("Test").finish(),
}
}
@@ -79,10 +92,8 @@ impl<T> BackendType<'_, T> {
use BackendType::*;
match self {
Console(c, x) => Console(Cow::Borrowed(c), x),
#[cfg(feature = "testing")]
Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
Link(c) => Link(Cow::Borrowed(c)),
#[cfg(test)]
Test(x) => Test(*x),
}
}
@@ -96,10 +107,8 @@ impl<'a, T> BackendType<'a, T> {
use BackendType::*;
match self {
Console(c, x) => Console(c, f(x)),
#[cfg(feature = "testing")]
Postgres(c, x) => Postgres(c, f(x)),
Link(c) => Link(c),
#[cfg(test)]
Test(x) => Test(x),
}
}
@@ -112,87 +121,51 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
use BackendType::*;
match self {
Console(c, x) => x.map(|x| Console(c, x)),
#[cfg(feature = "testing")]
Postgres(c, x) => x.map(|x| Postgres(c, x)),
Link(c) => Ok(Link(c)),
#[cfg(test)]
Test(x) => Ok(Test(x)),
}
}
}
/// Pairs the identity of a compute user with the credential material
/// obtained for that user.
///
/// `T` is the type of the credential material and is chosen by the caller.
pub struct ComputeCredentials<T> {
/// Which endpoint and user these credentials belong to.
pub info: ComputeUserInfo,
/// The credential material itself.
pub keys: T,
}
/// User details of a connection for which no endpoint name is attached.
pub struct ComputeUserInfoNoEndpoint {
/// Name of the user.
pub user: SmolStr,
/// IP address of the connecting peer.
pub peer_addr: IpAddr,
/// NOTE(review): presumably a key for cache lookups related to this
/// connection; its exact meaning is not visible here, confirm against
/// the code that produces it.
pub cache_key: SmolStr,
}
/// User details of a connection together with the endpoint it targets.
pub struct ComputeUserInfo {
/// Name of the endpoint.
pub endpoint: SmolStr,
/// The remaining, endpoint-independent user details.
pub inner: ComputeUserInfoNoEndpoint,
}
pub enum ComputeCredentialKeys {
#[cfg(feature = "testing")]
pub enum ComputeCredentials {
Password(Vec<u8>),
AuthKeys(AuthKeys),
}
impl TryFrom<ClientCredentials> for ComputeUserInfo {
// user name
type Error = ComputeUserInfoNoEndpoint;
fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
let inner = ComputeUserInfoNoEndpoint {
user: creds.user,
peer_addr: creds.peer_addr,
cache_key: creds.cache_key,
};
match creds.project {
None => Err(inner),
Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }),
}
}
}
/// True to its name, this function encapsulates our current auth trade-offs.
/// Here, we choose the appropriate auth flow based on circumstances.
///
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
async fn auth_quirks_creds(
api: &impl console::Api,
extra: &ConsoleReqExtra,
creds: ClientCredentials,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the endpoint (project) name.
// We now expect to see a very specific payload in the place of password.
let (info, unauthenticated_password) = match creds.try_into() {
Err(info) => {
let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
(res.info, Some(res.keys))
}
Ok(info) => (info, None),
let maybe_success = if creds.project.is_none() {
// Password will be checked by the compute node later.
Some(hacks::password_hack(creds, client, latency_timer).await?)
} else {
None
};
// Password hack should set the project name.
// TODO: make `creds.project` more type-safe.
assert!(creds.project.is_some());
info!("fetching user's authentication info");
// TODO(anna): this will slow down both "hacks" below; we probably need a cache.
let AuthInfo {
secret,
allowed_ips,
} = api.get_auth_info(extra, &info).await?;
} = api.get_auth_info(extra, creds).await?;
// check allowed list
if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed());
}
let secret = secret.unwrap_or_else(|| {
@@ -200,49 +173,36 @@ async fn auth_quirks(
// prevent malicious probing (possible due to missing protocol steps).
// This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it");
AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
});
if let Some(password) = unauthenticated_password {
let auth_outcome = validate_password_and_exchange(&password, secret)?;
let keys = match auth_outcome {
crate::sasl::Outcome::Success(key) => key,
crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.inner.user));
}
};
// we have authenticated the password
client.write_message_noflush(&pq_proto::BeMessage::AuthenticationOk)?;
return Ok(ComputeCredentials { info, keys });
if let Some(success) = maybe_success {
return Ok(success);
}
// -- the remaining flows are self-authenticating --
// Perform cleartext auth if we're allowed to do that.
// Currently, we use it for websocket connections (latency).
if allow_cleartext {
return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
// Password will be checked by the compute node later.
return hacks::cleartext_hack(client, latency_timer).await;
}
// Finally, proceed with the main auth flow (SCRAM-based).
classic::authenticate(info, client, config, latency_timer, secret).await
classic::authenticate(creds, client, config, latency_timer, secret).await
}
/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
/// only if authentication was successful.
async fn auth_and_wake_compute(
/// True to its name, this function encapsulates our current auth trade-offs.
/// Here, we choose the appropriate auth flow based on circumstances.
async fn auth_quirks(
api: &impl console::Api,
extra: &ConsoleReqExtra,
creds: ClientCredentials,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
let compute_credentials = auth_quirks(
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
let auth_stuff = auth_quirks_creds(
api,
extra,
creds,
@@ -255,7 +215,7 @@ async fn auth_and_wake_compute(
let mut num_retries = 0;
let mut node = loop {
let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
let wake_res = api.wake_compute(extra, creds).await;
match handle_try_wake(wake_res, num_retries) {
Err(e) => {
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -272,27 +232,27 @@ async fn auth_and_wake_compute(
tokio::time::sleep(wait_duration).await;
};
match compute_credentials.keys {
#[cfg(feature = "testing")]
ComputeCredentialKeys::Password(password) => node.config.password(password),
ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
match auth_stuff.value {
ComputeCredentials::Password(password) => node.config.password(password),
ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
};
Ok((node, compute_credentials.info))
Ok(AuthSuccess {
reported_auth_ok: auth_stuff.reported_auth_ok,
value: node,
})
}
impl<'a> BackendType<'a, ClientCredentials> {
impl BackendType<'_, ClientCredentials<'_>> {
/// Get compute endpoint name from the credentials.
pub fn get_endpoint(&self) -> Option<SmolStr> {
pub fn get_endpoint(&self) -> Option<String> {
use BackendType::*;
match self {
Console(_, creds) => creds.project.clone(),
#[cfg(feature = "testing")]
Postgres(_, creds) => creds.project.clone(),
Link(_) => Some("link".into()),
#[cfg(test)]
Test(_) => Some("test".into()),
Link(_) => Some("link".to_owned()),
Test(_) => Some("test".to_owned()),
}
}
@@ -301,11 +261,9 @@ impl<'a> BackendType<'a, ClientCredentials> {
use BackendType::*;
match self {
Console(_, creds) => &creds.user,
#[cfg(feature = "testing")]
Postgres(_, creds) => &creds.user,
Console(_, creds) => creds.user,
Postgres(_, creds) => creds.user,
Link(_) => "link",
#[cfg(test)]
Test(_) => "test",
}
}
@@ -313,25 +271,26 @@ impl<'a> BackendType<'a, ClientCredentials> {
/// Authenticate the client via the requested backend, possibly using credentials.
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
pub async fn authenticate(
self,
extra: &ConsoleReqExtra,
&mut self,
extra: &ConsoleReqExtra<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
allow_cleartext: bool,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
use BackendType::*;
let res = match self {
Console(api, creds) => {
info!(
user = &*creds.user,
user = creds.user,
project = creds.project(),
"performing authentication using the console"
);
let (cache_info, user_info) = auth_and_wake_compute(
&*api,
let api = api.as_ref();
auth_quirks(
api,
extra,
creds,
client,
@@ -339,19 +298,18 @@ impl<'a> BackendType<'a, ClientCredentials> {
config,
latency_timer,
)
.await?;
(cache_info, BackendType::Console(api, user_info))
.await?
}
#[cfg(feature = "testing")]
Postgres(api, creds) => {
info!(
user = &*creds.user,
user = creds.user,
project = creds.project(),
"performing authentication using a local postgres instance"
);
let (cache_info, user_info) = auth_and_wake_compute(
&*api,
let api = api.as_ref();
auth_quirks(
api,
extra,
creds,
client,
@@ -359,21 +317,16 @@ impl<'a> BackendType<'a, ClientCredentials> {
config,
latency_timer,
)
.await?;
(cache_info, BackendType::Postgres(api, user_info))
.await?
}
// NOTE: this auth backend doesn't use client credentials.
Link(url) => {
info!("performing link authentication");
let node_info = link::authenticate(&url, client).await?;
(
CachedNodeInfo::new_uncached(node_info),
BackendType::Link(url),
)
link::authenticate(url, client)
.await?
.map(CachedNodeInfo::new_uncached)
}
#[cfg(test)]
Test(_) => {
unreachable!("this function should never be called in the test backend")
}
@@ -382,20 +335,16 @@ impl<'a> BackendType<'a, ClientCredentials> {
info!("user successfully authenticated");
Ok(res)
}
}
impl BackendType<'_, ComputeUserInfo> {
pub async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
use BackendType::*;
match self {
Console(api, creds) => api.get_allowed_ips(extra, creds).await,
#[cfg(feature = "testing")]
Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
Link(_) => Ok(Arc::new(vec![])),
#[cfg(test)]
Test(x) => x.get_allowed_ips(),
}
}
@@ -404,16 +353,14 @@ impl BackendType<'_, ComputeUserInfo> {
/// The link auth flow doesn't support this, so we return [`None`] in that case.
pub async fn wake_compute(
&self,
extra: &ConsoleReqExtra,
extra: &ConsoleReqExtra<'_>,
) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
use BackendType::*;
match self {
Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
#[cfg(feature = "testing")]
Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
Link(_) => Ok(None),
#[cfg(test)]
Test(x) => x.wake_compute().map(Some),
}
}

View File

@@ -1,6 +1,6 @@
use super::{ComputeCredentials, ComputeUserInfo};
use super::{AuthSuccess, ComputeCredentials};
use crate::{
auth::{self, backend::ComputeCredentialKeys, AuthFlow},
auth::{self, AuthFlow, ClientCredentials},
compute,
config::AuthenticationConfig,
console::AuthSecret,
@@ -12,15 +12,14 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
pub(super) async fn authenticate(
creds: ComputeUserInfo,
creds: &ClientCredentials<'_>,
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
secret: AuthSecret,
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
let flow = AuthFlow::new(client);
let scram_keys = match secret {
#[cfg(feature = "testing")]
AuthSecret::Md5(_) => {
info!("auth endpoint chooses MD5");
return Err(auth::AuthError::bad_auth_method("MD5"));
@@ -54,7 +53,7 @@ pub(super) async fn authenticate(
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*creds.inner.user));
return Err(auth::AuthError::auth_failed(creds.user));
}
};
@@ -65,9 +64,9 @@ pub(super) async fn authenticate(
}
};
Ok(ComputeCredentials {
info: creds,
keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
Ok(AuthSuccess {
reported_auth_ok: false,
value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
scram_keys,
)),
})

View File

@@ -1,11 +1,7 @@
use super::{
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
};
use super::{AuthSuccess, ComputeCredentials};
use crate::{
auth::{self, AuthFlow},
console::AuthSecret,
auth::{self, AuthFlow, ClientCredentials},
proxy::LatencyTimer,
sasl,
stream::{self, Stream},
};
use tokio::io::{AsyncRead, AsyncWrite};
@@ -15,42 +11,35 @@ use tracing::{info, warn};
/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
/// These properties are beneficial for serverless JS workers, so we
/// use this mechanism for websocket connections.
pub async fn authenticate_cleartext(
info: ComputeUserInfo,
pub async fn cleartext_hack(
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
latency_timer: &mut LatencyTimer,
secret: AuthSecret,
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
warn!("cleartext auth flow override is enabled, proceeding");
// pause the timer while we communicate with the client
let _paused = latency_timer.pause();
let auth_outcome = AuthFlow::new(client)
.begin(auth::CleartextPassword(secret))
let password = AuthFlow::new(client)
.begin(auth::CleartextPassword)
.await?
.authenticate()
.await?;
let keys = match auth_outcome {
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(&*info.inner.user));
}
};
Ok(ComputeCredentials { info, keys })
// Report tentative success; compute node will check the password anyway.
Ok(AuthSuccess {
reported_auth_ok: false,
value: ComputeCredentials::Password(password),
})
}
/// Workaround for clients which don't provide an endpoint (project) name.
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
/// and passwords are not yet validated (we don't know how to validate them!)
pub async fn password_hack_no_authentication(
info: ComputeUserInfoNoEndpoint,
/// Very similar to [`cleartext_hack`], but there's a specific password format.
pub async fn password_hack(
creds: &mut ClientCredentials<'_>,
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
latency_timer: &mut LatencyTimer,
) -> auth::Result<ComputeCredentials<Vec<u8>>> {
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
warn!("project not specified, resorting to the password hack auth flow");
// pause the timer while we communicate with the client
@@ -59,17 +48,15 @@ pub async fn password_hack_no_authentication(
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
.await?
.get_password()
.authenticate()
.await?;
info!(project = &*payload.endpoint, "received missing parameter");
info!(project = &payload.endpoint, "received missing parameter");
creds.project = Some(payload.endpoint);
// Report tentative success; compute node will check the password anyway.
Ok(ComputeCredentials {
info: ComputeUserInfo {
inner: info,
endpoint: payload.endpoint,
},
keys: payload.password,
Ok(AuthSuccess {
reported_auth_ok: false,
value: ComputeCredentials::Password(payload.password),
})
}

View File

@@ -1,3 +1,4 @@
use super::AuthSuccess;
use crate::{
auth, compute,
console::{self, provider::NodeInfo},
@@ -56,7 +57,7 @@ pub fn new_psql_session_id() -> String {
pub(super) async fn authenticate(
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<NodeInfo> {
) -> auth::Result<AuthSuccess<NodeInfo>> {
let psql_session_id = new_psql_session_id();
let span = info_span!("link", psql_session_id = &psql_session_id);
let greeting = hello_message(link_uri, &psql_session_id);
@@ -101,9 +102,12 @@ pub(super) async fn authenticate(
config.password(password.as_ref());
}
Ok(NodeInfo {
config,
aux: db_info.aux,
allow_self_signed_compute: false, // caller may override
Ok(AuthSuccess {
reported_auth_ok: true,
value: NodeInfo {
config,
aux: db_info.aux,
allow_self_signed_compute: false, // caller may override
},
})
}

View File

@@ -3,12 +3,14 @@
use crate::{
auth::password_hack::parse_endpoint_param,
error::UserFacingError,
proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI},
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::{collections::HashSet, net::IpAddr};
use std::{
collections::HashSet,
net::{IpAddr, SocketAddr},
};
use thiserror::Error;
use tracing::{info, warn};
@@ -22,7 +24,7 @@ pub enum ClientCredsParseError {
SNI ('{}') and project option ('{}').",
.domain, .option,
)]
InconsistentProjectNames { domain: SmolStr, option: SmolStr },
InconsistentProjectNames { domain: String, option: String },
#[error(
"Common name inferred from SNI ('{}') is not known",
@@ -31,7 +33,7 @@ pub enum ClientCredsParseError {
UnknownCommonName { cn: String },
#[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
MalformedProjectName(SmolStr),
MalformedProjectName(String),
}
impl UserFacingError for ClientCredsParseError {}
@@ -39,34 +41,34 @@ impl UserFacingError for ClientCredsParseError {}
/// Various client credentials which we use for authentication.
/// Note that we don't store any kind of client key or password here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ClientCredentials {
pub user: SmolStr,
pub struct ClientCredentials<'a> {
pub user: &'a str,
// TODO: this is a severe misnomer! We should think of a new name ASAP.
pub project: Option<SmolStr>,
pub project: Option<String>,
pub cache_key: SmolStr,
pub peer_addr: IpAddr,
pub cache_key: String,
pub peer_addr: SocketAddr,
}
impl ClientCredentials {
impl ClientCredentials<'_> {
#[inline]
pub fn project(&self) -> Option<&str> {
self.project.as_deref()
}
}
impl ClientCredentials {
impl<'a> ClientCredentials<'a> {
pub fn parse(
params: &StartupMessageParams,
params: &'a StartupMessageParams,
sni: Option<&str>,
common_names: Option<HashSet<String>>,
peer_addr: IpAddr,
peer_addr: SocketAddr,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
// Some parameters are stored in the startup message.
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user = get_param("user")?.into();
let user = get_param("user")?;
// Project name might be passed via PG's command-line options.
let project_option = params
@@ -80,7 +82,7 @@ impl ClientCredentials {
.at_most_one()
.ok()?
})
.map(|name| name.into());
.map(|name| name.to_string());
let project_from_domain = if let Some(sni_str) = sni {
if let Some(cn) = common_names {
@@ -119,7 +121,7 @@ impl ClientCredentials {
}
.transpose()?;
info!(%user, project = project.as_deref(), "credentials");
info!(user, project = project.as_deref(), "credentials");
if sni.is_some() {
info!("Connection with sni");
NUM_CONNECTION_ACCEPTED_BY_SNI
@@ -140,9 +142,8 @@ impl ClientCredentials {
let cache_key = format!(
"{}{}",
project.as_deref().unwrap_or(""),
neon_options_str(params)
)
.into();
neon_options(params).unwrap_or("".to_string())
);
Ok(Self {
user,
@@ -205,10 +206,10 @@ fn project_name_valid(name: &str) -> bool {
name.chars().all(|c| c.is_alphanumeric() || c == '-')
}
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<SmolStr> {
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
sni.strip_suffix(common_name)?
.strip_suffix('.')
.map(SmolStr::from)
.map(str::to_owned)
}
#[cfg(test)]
@@ -220,7 +221,7 @@ mod tests {
fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required.
let options = StartupMessageParams::new([("user", "john_doe")]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
@@ -235,7 +236,7 @@ mod tests {
("database", "world"), // should be ignored
("foo", "bar"), // should be ignored
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
@@ -250,7 +251,7 @@ mod tests {
let sni = Some("foo.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -266,7 +267,7 @@ mod tests {
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -281,7 +282,7 @@ mod tests {
("options", "-ckey=1 endpoint=bar -c geqo=off"),
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -299,7 +300,7 @@ mod tests {
),
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert!(creds.project.is_none());
@@ -314,7 +315,7 @@ mod tests {
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
]);
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert!(creds.project.is_none());
@@ -329,7 +330,7 @@ mod tests {
let sni = Some("baz.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -343,13 +344,13 @@ mod tests {
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com");
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
@@ -364,7 +365,7 @@ mod tests {
let sni = Some("second.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
.expect_err("should fail");
match err {
@@ -383,7 +384,7 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["example.com".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
.expect_err("should fail");
match err {
@@ -403,10 +404,13 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
assert_eq!(creds.project.as_deref(), Some("project"));
assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");
assert_eq!(
creds.cache_key,
"projectneon_endpoint_type:read_write neon_lsn:0/2"
);
Ok(())
}

View File

@@ -1,9 +1,8 @@
//! Main authentication flow.
use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
use super::{AuthErrorImpl, PasswordHackPayload};
use crate::{
config::TlsServerEndPoint,
console::AuthSecret,
sasl, scram,
stream::{PqStream, Stream},
};
@@ -51,7 +50,7 @@ impl AuthMethod for PasswordHack {
/// Use clear-text password auth called `password` in docs
/// <https://www.postgresql.org/docs/current/auth-password.html>
pub struct CleartextPassword(pub AuthSecret);
pub struct CleartextPassword;
impl AuthMethod for CleartextPassword {
#[inline(always)]
@@ -99,7 +98,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
/// Perform user authentication. Raise an error in case authentication failed.
pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
pub async fn authenticate(self) -> super::Result<PasswordHackPayload> {
let msg = self.stream.read_password_message().await?;
let password = msg
.strip_suffix(&[0])
@@ -118,19 +117,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
/// Perform user authentication. Raise an error in case authentication failed.
pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
pub async fn authenticate(self) -> super::Result<Vec<u8>> {
let msg = self.stream.read_password_message().await?;
let password = msg
.strip_suffix(&[0])
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
let outcome = validate_password_and_exchange(password, self.state.0)?;
if let sasl::Outcome::Success(_) = &outcome {
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
}
Ok(outcome)
Ok(password.to_vec())
}
}
@@ -159,49 +152,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
))
.await?;
if let sasl::Outcome::Success(_) = &outcome {
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
}
Ok(outcome)
}
}
/// Validate a cleartext `password` against the stored `secret` and derive the
/// credentials that are later handed to the compute node.
///
/// * `AuthSecret::Md5` (compiled only with the `testing` feature): the password
///   is accepted as-is and returned as `ComputeCredentialKeys::Password`.
/// * `AuthSecret::Scram`: a SCRAM-SHA-256 client is built from the password and
///   run against the stored secret via `crate::scram::exchange`, so this
///   function plays both the client and the server side of the exchange.
///
/// Returns `Outcome::Failure(reason)` when the exchange reports a failure and
/// `Outcome::Success(keys)` otherwise. An `Err` from the exchange itself is
/// propagated with `?`.
pub(super) fn validate_password_and_exchange(
    password: &[u8],
    secret: AuthSecret,
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
    match secret {
        // test only
        #[cfg(feature = "testing")]
        AuthSecret::Md5(_) => {
            let keys = ComputeCredentialKeys::Password(password.to_vec());
            Ok(sasl::Outcome::Success(keys))
        }
        // perform scram authentication as both client and server to validate the keys
        AuthSecret::Scram(scram_secret) => {
            use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};

            let exchanged = crate::scram::exchange(
                &scram_secret,
                ScramSha256::new(password, ChannelBinding::unsupported()),
                crate::config::TlsServerEndPoint::Undefined,
            )?;

            Ok(match exchanged {
                // Re-wrap the failure so the payload type matches the return type.
                sasl::Outcome::Failure(reason) => sasl::Outcome::Failure(reason),
                sasl::Outcome::Success(client_key) => {
                    // Pair the freshly derived client key with the stored server key.
                    let scram_keys = crate::compute::ScramKeys {
                        client_key: client_key.as_bytes(),
                        server_key: scram_secret.server_key.as_bytes(),
                    };
                    sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys(
                        tokio_postgres::config::AuthKeys::ScramSha256(scram_keys),
                    ))
                }
            })
        }
    }
}

View File

@@ -4,10 +4,9 @@
//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
use bstr::ByteSlice;
use smol_str::SmolStr;
pub struct PasswordHackPayload {
pub endpoint: SmolStr,
pub endpoint: String,
pub password: Vec<u8>,
}
@@ -19,7 +18,7 @@ impl PasswordHackPayload {
if let Some((endpoint, password)) = bytes.split_once_str(sep) {
let endpoint = endpoint.to_str().ok()?;
return Some(Self {
endpoint: parse_endpoint_param(endpoint)?.into(),
endpoint: parse_endpoint_param(endpoint)?.to_owned(),
password: password.to_owned(),
});
}

View File

@@ -8,7 +8,6 @@ use std::{net::SocketAddr, sync::Arc};
use futures::future::Either;
use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use proxy::proxy::run_until_cancelled;
use tokio::net::TcpListener;
use anyhow::{anyhow, bail, ensure, Context};
@@ -21,7 +20,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken;
use utils::{project_git_version, sentry_init::init_sentry};
use tracing::{error, info, Instrument};
use tracing::{error, info, warn, Instrument};
project_git_version!(GIT_VERSION);
@@ -152,39 +151,63 @@ async fn task_main(
// will be inherited by all accepted client sockets.
socket2::SockRef::from(&listener).set_keepalive(true)?;
let connections = tokio_util::task::task_tracker::TaskTracker::new();
let mut connections = tokio::task::JoinSet::new();
while let Some(accept_result) =
run_until_cancelled(listener.accept(), &cancellation_token).await
{
let (socket, peer_addr) = accept_result?;
loop {
tokio::select! {
accept_result = listener.accept() => {
let (socket, peer_addr) = accept_result?;
let session_id = uuid::Uuid::new_v4();
let tls_config = Arc::clone(&tls_config);
let dest_suffix = Arc::clone(&dest_suffix);
let session_id = uuid::Uuid::new_v4();
let tls_config = Arc::clone(&tls_config);
let dest_suffix = Arc::clone(&dest_suffix);
connections.spawn(
async move {
socket
.set_nodelay(true)
.context("failed to set socket option")?;
connections.spawn(
async move {
socket
.set_nodelay(true)
.context("failed to set socket option")?;
info!(%peer_addr, "serving");
handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
info!(%peer_addr, "serving");
handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
error!("per-client task finished with an error: {e:#}");
})
.instrument(tracing::info_span!("handle_client", ?session_id))
);
}
.unwrap_or_else(|e| {
// Acknowledge that the task has finished with an error.
error!("per-client task finished with an error: {e:#}");
})
.instrument(tracing::info_span!("handle_client", ?session_id)),
);
// Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
// If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
// This only counts for this loop and it will be enabled again on next `select!`.
//
// Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
// When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
// not get called again, even if there are more connections to remove.
Some(res) = connections.join_next() => {
if let Err(e) = res {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
}
}
_ = cancellation_token.cancelled() => {
drop(listener);
break;
}
}
}
connections.close();
drop(listener);
connections.wait().await;
// Drain connections
info!("waiting for all client connections to finish");
while let Some(res) = connections.join_next().await {
if let Err(e) = res {
if !e.is_panic() && !e.is_cancelled() {
warn!("unexpected error from joined connection task: {e:?}");
}
}
}
info!("all client connections have finished");
Ok(())
}

View File

@@ -7,8 +7,6 @@ use proxy::console;
use proxy::console::provider::AllowedIpsCache;
use proxy::console::provider::NodeInfoCache;
use proxy::http;
use proxy::rate_limiter::EndpointRateLimiter;
use proxy::rate_limiter::RateBucketInfo;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::usage_metrics;
@@ -16,7 +14,6 @@ use anyhow::bail;
use proxy::config::{self, ProxyConfig};
use proxy::serverless;
use std::pin::pin;
use std::sync::Arc;
use std::{borrow::Cow, net::SocketAddr};
use tokio::net::TcpListener;
use tokio::task::JoinSet;
@@ -33,7 +30,6 @@ use clap::{Parser, ValueEnum};
// Selector for the authentication backend. `ValueEnum` is derived so that a
// variant can be parsed from an argument value.
//
// NOTE: plain `//` comments are used on purpose; `///` doc comments on the
// variants would be picked up by clap as help text for the possible values.
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackend {
    // `build_config` maps this to `auth::BackendType::Console`, backed by
    // `console::provider::neon::Api`.
    Console,
    // Only compiled with the `testing` feature. `build_config` backs it with
    // `console::provider::mock::Api`, created from the parsed `auth_endpoint` URL.
    #[cfg(feature = "testing")]
    Postgres,
    // Presumably selects the link auth flow; its construction is not visible
    // in this part of the file (TODO confirm).
    Link,
}
@@ -115,12 +111,6 @@ struct ProxyCliArgs {
/// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error.
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
rate_limiter_timeout: tokio::time::Duration,
/// Endpoint rate limiter max number of requests per second.
///
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
/// Can be given multiple times for different bucket sizes.
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)]
endpoint_rps_limit: Vec<RateBucketInfo>,
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
#[clap(long, default_value_t = 100)]
initial_limit: usize,
@@ -163,8 +153,6 @@ async fn main() -> anyhow::Result<()> {
let proxy_listener = TcpListener::bind(proxy_address).await?;
let cancellation_token = CancellationToken::new();
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
// client facing tasks. these will exit on error or on cancellation
// cancellation returns Ok(())
let mut client_tasks = JoinSet::new();
@@ -172,7 +160,6 @@ async fn main() -> anyhow::Result<()> {
config,
proxy_listener,
cancellation_token.clone(),
endpoint_rate_limiter.clone(),
));
// TODO: rename the argument to something like serverless.
@@ -186,7 +173,6 @@ async fn main() -> anyhow::Result<()> {
config,
serverless_listener,
cancellation_token.clone(),
endpoint_rate_limiter.clone(),
));
}
@@ -303,7 +289,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let api = console::provider::neon::Api::new(endpoint, caches, locks);
auth::BackendType::Console(Cow::Owned(api), ())
}
#[cfg(feature = "testing")]
AuthBackend::Postgres => {
let url = args.auth_endpoint.parse()?;
let api = console::provider::mock::Api::new(url);
@@ -321,10 +306,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let authentication_config = AuthenticationConfig {
scram_protocol_timeout: args.scram_protocol_timeout,
};
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
let config = Box::leak(Box::new(ProxyConfig {
tls_config,
auth_backend,
@@ -334,35 +315,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
authentication_config,
require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http,
endpoint_rps_limit,
}));
Ok(config)
}
#[cfg(test)]
mod tests {
    use std::time::Duration;

    use clap::Parser;
    use proxy::rate_limiter::RateBucketInfo;

    /// Passing `--endpoint-rps-limit` twice must produce one bucket per
    /// occurrence, in the order the flags were given.
    #[test]
    fn parse_endpoint_rps_limit() {
        let expected = vec![
            RateBucketInfo::new(100, Duration::from_secs(1)),
            RateBucketInfo::new(20, Duration::from_secs(30)),
        ];

        let args = super::ProxyCliArgs::parse_from([
            "proxy",
            "--endpoint-rps-limit",
            "100@1s",
            "--endpoint-rps-limit",
            "20@30s",
        ]);

        assert_eq!(args.endpoint_rps_limit, expected);
    }
}

View File

@@ -1,13 +1,9 @@
use crate::{
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::errors::WakeComputeError,
error::UserFacingError,
proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE},
auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
error::UserFacingError, proxy::is_neon_param,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use metrics::IntCounterPairGuard;
use pq_proto::StartupMessageParams;
use std::{io, net::SocketAddr, time::Duration};
use thiserror::Error;
@@ -227,8 +223,6 @@ pub struct PostgresConnection {
pub params: std::collections::HashMap<String, String>,
/// Query cancellation token.
pub cancel_closure: CancelClosure,
_guage: IntCounterPairGuard,
}
impl ConnCfg {
@@ -237,7 +231,6 @@ impl ConnCfg {
&self,
allow_self_signed_compute: bool,
timeout: Duration,
proto: &'static str,
) -> Result<PostgresConnection, ConnectionError> {
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
@@ -271,7 +264,6 @@ impl ConnCfg {
stream,
params,
cancel_closure,
_guage: NUM_DB_CONNECTIONS_GAUGE.with_label_values(&[proto]).guard(),
};
Ok(connection)
@@ -283,7 +275,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
#[allow(unstable_name_collisions)]
let options: String = params
.options_raw()?
.filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
.filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();

View File

@@ -1,4 +1,4 @@
use crate::{auth, rate_limiter::RateBucketInfo};
use crate::auth;
use anyhow::{bail, ensure, Context, Ok};
use rustls::{sign, Certificate, PrivateKey};
use sha2::{Digest, Sha256};
@@ -20,7 +20,6 @@ pub struct ProxyConfig {
pub authentication_config: AuthenticationConfig,
pub require_client_ip: bool,
pub disable_ip_check_for_http: bool,
pub endpoint_rps_limit: Vec<RateBucketInfo>,
}
#[derive(Debug)]

View File

@@ -1,10 +1,9 @@
#[cfg(feature = "testing")]
pub mod mock;
pub mod neon;
use super::messages::MetricsAuxInfo;
use crate::{
auth::backend::ComputeUserInfo,
auth::ClientCredentials,
cache::{timed_lru, TimedLru},
compute, scram,
};
@@ -196,28 +195,16 @@ pub mod errors {
}
/// Extra query params we'd like to pass to the console.
pub struct ConsoleReqExtra {
pub struct ConsoleReqExtra<'a> {
/// A unique identifier for a connection.
pub session_id: uuid::Uuid,
/// Name of client application, if set.
pub application_name: String,
pub options: Vec<(String, String)>,
}
impl ConsoleReqExtra {
// https://swagger.io/docs/specification/serialization/ DeepObject format
// paramName[prop1]=value1&paramName[prop2]=value2&....
pub fn options_as_deep_object(&self) -> Vec<(String, String)> {
self.options
.iter()
.map(|(k, v)| (format!("options[{}]", k), v.to_string()))
.collect()
}
pub application_name: Option<&'a str>,
pub options: Option<&'a str>,
}
/// Auth secret which is managed by the cloud.
pub enum AuthSecret {
#[cfg(feature = "testing")]
/// Md5 hash of user's password.
Md5([u8; 16]),
@@ -259,21 +246,21 @@ pub trait Api {
/// Get the client's auth secret for authentication.
async fn get_auth_info(
&self,
extra: &ConsoleReqExtra,
creds: &ComputeUserInfo,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<AuthInfo, errors::GetAuthInfoError>;
async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra,
creds: &ComputeUserInfo,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
extra: &ConsoleReqExtra,
creds: &ComputeUserInfo,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}

View File

@@ -6,7 +6,7 @@ use super::{
errors::{ApiError, GetAuthInfoError, WakeComputeError},
AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUrl};
use async_trait::async_trait;
use futures::TryFutureExt;
use thiserror::Error;
@@ -47,7 +47,7 @@ impl Api {
async fn do_get_auth_info(
&self,
creds: &ComputeUserInfo,
creds: &ClientCredentials<'_>,
) -> Result<AuthInfo, GetAuthInfoError> {
let (secret, allowed_ips) = async {
// Perhaps we could persist this connection, but then we'd have to
@@ -60,7 +60,7 @@ impl Api {
let secret = match get_execute_postgres_query(
&client,
"select rolpassword from pg_catalog.pg_authid where rolname = $1",
&[&&*creds.inner.user],
&[&creds.user],
"rolpassword",
)
.await?
@@ -71,14 +71,14 @@ impl Api {
secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
}
None => {
warn!("user '{}' does not exist", creds.inner.user);
warn!("user '{}' does not exist", creds.user);
None
}
};
let allowed_ips = match get_execute_postgres_query(
&client,
"select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
&[&creds.endpoint.as_str()],
&[&creds.project.clone().unwrap_or_default().as_str()],
"allowed_ips",
)
.await?
@@ -144,16 +144,16 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn get_auth_info(
&self,
_extra: &ConsoleReqExtra,
creds: &ComputeUserInfo,
_extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<AuthInfo, GetAuthInfoError> {
self.do_get_auth_info(creds).await
}
async fn get_allowed_ips(
&self,
_extra: &ConsoleReqExtra,
creds: &ComputeUserInfo,
_extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
}
@@ -161,8 +161,8 @@ impl super::Api for Api {
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,
_extra: &ConsoleReqExtra,
_creds: &ComputeUserInfo,
_extra: &ConsoleReqExtra<'_>,
_creds: &ClientCredentials,
) -> Result<CachedNodeInfo, WakeComputeError> {
self.do_wake_compute()
.map_ok(CachedNodeInfo::new_uncached)

Some files were not shown because too many files have changed in this diff Show More