Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-23 21:30:36 +00:00

Compare commits: release-43 ... release-43 (50 commits)
| SHA1 |
|---|
| a46e77b476 |
| a92702b01e |
| 8ff3253f20 |
| 04b82c92a7 |
| e5bf423e68 |
| 0ba4cae491 |
| df1f8e13c4 |
| e640bc7dba |
| cf024de202 |
| e1a564ace2 |
| f5b9af6ac7 |
| 5e98855d80 |
| 699049b8f3 |
| 2c544343e0 |
| 193e60e2b8 |
| 1bbd6cae24 |
| 65f48c7002 |
| d9d8e9afc7 |
| 7914eaf1e6 |
| 37fdbc3aaa |
| 7aa1e58301 |
| f2892d3798 |
| b492cedf51 |
| 880663f6bc |
| e89e41f8ba |
| f9401fdd31 |
| b7ffe24426 |
| 52718bb8ff |
| 10c77cb410 |
| 31be301ef3 |
| a3c7d400b4 |
| 7501ca6efb |
| 987c9aaea0 |
| 7fab731f65 |
| 483caa22c6 |
| da5e03b0d8 |
| be885370f6 |
| bc1020f965 |
| 61fe9d360d |
| f60e49fe8e |
| c48918d329 |
| bad686bb71 |
| 85d08581ed |
| c7f1143e57 |
| 7403d55013 |
| 12f02523a4 |
| 207c527270 |
| eae49ff598 |
| e6b2f89fec |
| 1d81e70d60 |
.github/workflows/build_and_test.yml (vendored): 8 changes
@@ -199,6 +199,10 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done

       - name: Checkout
         uses: actions/checkout@v3

@@ -1097,6 +1101,10 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done

       - name: Checkout
         uses: actions/checkout@v3
.github/workflows/neon_extra_builds.yml (vendored): 18 changes
@@ -142,6 +142,10 @@ jobs:
           #
           git config --global --add safe.directory ${{ github.workspace }}
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done

       - name: Checkout
         uses: actions/checkout@v4

@@ -238,6 +242,20 @@ jobs:
       options: --init

     steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+          for r in 14 15 16; do
+            git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+            git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+          done
+
       - name: Checkout
         uses: actions/checkout@v4
         with:
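Editor's note on the step above: recent git refuses to operate on a repository owned by a different uid unless the directory is registered under `safe.directory`, and the step registers both the host-side and in-container spellings of the workspace path. A minimal sketch of the same registration driven from Rust instead of a workflow step; the workspace path and version list here are illustrative, not taken from the repo:

use std::process::Command;

fn mark_safe(dir: &str) -> std::io::Result<()> {
    // Equivalent to: git config --global --add safe.directory <dir>
    let status = Command::new("git")
        .args(["config", "--global", "--add", "safe.directory", dir])
        .status()?;
    assert!(status.success(), "git config failed for {dir}");
    Ok(())
}

fn main() -> std::io::Result<()> {
    let workspace = "/workspace"; // hypothetical checkout path
    mark_safe(workspace)?;
    for v in [14, 15, 16] {
        mark_safe(&format!("{workspace}/vendor/postgres-v{v}"))?;
    }
    Ok(())
}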
.gitignore (vendored): 3 changes
@@ -18,3 +18,6 @@ test_output/
 *.o
 *.so
 *.Po
+
+# pgindent typedef lists
+*.list
Cargo.lock (generated): 78 changes
@@ -44,6 +44,12 @@ dependencies = [
  "memchr",
 ]

+[[package]]
+name = "allocator-api2"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"
+
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"

@@ -890,7 +896,7 @@ checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5"
 dependencies = [
  "memchr",
  "once_cell",
- "regex-automata",
+ "regex-automata 0.1.10",
  "serde",
 ]

@@ -2042,6 +2048,10 @@ name = "hashbrown"
 version = "0.14.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
+dependencies = [
+ "ahash",
+ "allocator-api2",
+]

 [[package]]
 name = "hashlink"

@@ -2533,7 +2543,7 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
 dependencies = [
- "regex-automata",
+ "regex-automata 0.1.10",
 ]

 [[package]]

@@ -2559,9 +2569,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "memchr"
-version = "2.5.0"
+version = "2.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"

 [[package]]
 name = "memoffset"

@@ -2634,14 +2644,14 @@ dependencies = [

 [[package]]
 name = "mio"
-version = "0.8.6"
+version = "0.8.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9"
+checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09"
 dependencies = [
  "libc",
  "log",
  "wasi 0.11.0+wasi-snapshot-preview1",
- "windows-sys 0.45.0",
+ "windows-sys 0.48.0",
 ]

 [[package]]

@@ -3644,7 +3654,7 @@ dependencies = [
  "serde_json",
  "sha2",
  "smol_str",
- "socket2 0.5.3",
+ "socket2 0.5.5",
  "sync_wrapper",
  "task-local-extensions",
  "thiserror",

@@ -3810,13 +3820,14 @@ dependencies = [

 [[package]]
 name = "regex"
-version = "1.8.2"
+version = "1.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax 0.7.2",
+ "regex-automata 0.4.3",
+ "regex-syntax 0.8.2",
 ]

@@ -3828,6 +3839,17 @@ dependencies = [
  "regex-syntax 0.6.29",
 ]

+[[package]]
+name = "regex-automata"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax 0.8.2",
+]
+
 [[package]]
 name = "regex-syntax"
 version = "0.6.29"

@@ -3836,9 +3858,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"

 [[package]]
 name = "regex-syntax"
-version = "0.7.2"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"

 [[package]]
 name = "relative-path"

@@ -3864,6 +3886,7 @@ dependencies = [
  "bytes",
  "camino",
  "camino-tempfile",
+ "futures",
  "futures-util",
  "http-types",
  "hyper",

@@ -4291,6 +4314,7 @@ dependencies = [
  "tokio-io-timeout",
  "tokio-postgres",
  "tokio-stream",
  "tokio-util",
  "toml_edit",
  "tracing",
  "url",

@@ -4731,9 +4755,9 @@ dependencies = [

 [[package]]
 name = "socket2"
-version = "0.5.3"
+version = "0.5.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877"
+checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9"
 dependencies = [
  "libc",
  "windows-sys 0.48.0",

@@ -5080,18 +5104,18 @@ dependencies = [

 [[package]]
 name = "tokio"
-version = "1.28.1"
+version = "1.34.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0aa32867d44e6f2ce3385e89dceb990188b8bb0fb25b0cf576647a6f98ac5105"
+checksum = "d0c014766411e834f7af5b8f4cf46257aab4036ca95e9d2c144a10f59ad6f5b9"
 dependencies = [
- "autocfg",
  "backtrace",
  "bytes",
  "libc",
  "mio",
  "num_cpus",
  "pin-project-lite",
  "signal-hook-registry",
- "socket2 0.4.9",
+ "socket2 0.5.5",
  "tokio-macros",
  "windows-sys 0.48.0",
 ]

@@ -5108,9 +5132,9 @@ dependencies = [

 [[package]]
 name = "tokio-macros"
-version = "2.1.0"
+version = "2.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
+checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b"
 dependencies = [
  "proc-macro2",
  "quote",

@@ -5145,7 +5169,7 @@ dependencies = [
  "pin-project-lite",
  "postgres-protocol",
  "postgres-types",
- "socket2 0.5.3",
+ "socket2 0.5.5",
  "tokio",
  "tokio-util",
 ]

@@ -5214,13 +5238,16 @@ dependencies = [

 [[package]]
 name = "tokio-util"
-version = "0.7.8"
+version = "0.7.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d"
+checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
 dependencies = [
  "bytes",
  "futures-core",
+ "futures-io",
  "futures-sink",
+ "futures-util",
+ "hashbrown 0.14.0",
  "pin-project-lite",
  "tokio",
  "tracing",

@@ -6216,7 +6243,8 @@ dependencies = [
  "prost",
  "rand 0.8.5",
  "regex",
- "regex-syntax 0.7.2",
+ "regex-automata 0.4.3",
+ "regex-syntax 0.8.2",
  "reqwest",
  "ring 0.16.20",
  "rustls",
Cargo.toml

@@ -109,7 +109,7 @@ pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-regex = "1.4"
+regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"

@@ -149,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7", features = ["io"] }
+tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
Makefile: 38 changes
@@ -260,6 +260,44 @@ distclean:
 fmt:
 	./pre-commit.py --fix-inplace

+postgres-%-pg-bsd-indent: postgres-%
+	+@echo "Compiling pg_bsd_indent"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
+
+# Create typedef list for the core. Note that generally it should be combined with
+# buildfarm one to cover platform specific stuff.
+# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
+postgres-%-typedefs.list: postgres-%
+	$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
+
+# Indent postgres. See src/tools/pgindent/README for details.
+.PHONY: postgres-%-pgindent
+postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
+	+@echo merge with buildfarm typedef to cover all platforms
+	+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
+		REL_16_STABLE list misses PGSemaphoreData
+	# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
+	#	cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
+		cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+	+@echo note: you might want to run it on selected files/dirs instead.
+	INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
+		$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
+		--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
+	rm -f pg*.BAK
+
+# Indent pgxn/neon.
+.PHONY: neon-pgindent
+neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
+		INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
+		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
+		-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
+
+
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
@@ -274,7 +274,13 @@ fn main() -> Result<()> {
             let mut state = compute.state.lock().unwrap();
             state.error = Some(format!("{:?}", err));
             state.status = ComputeStatus::Failed;
-            drop(state);
+            // Notify others that Postgres failed to start. In case of configuring the
+            // empty compute, it's likely that API handler is still waiting for compute
+            // state change. With this we will notify it that compute is in Failed state,
+            // so control plane will know about it earlier and record proper error instead
+            // of timeout.
+            compute.state_changed.notify_all();
+            drop(state); // unlock
             delay_exit = true;
             None
         }
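The point of the hunk above is that the failure is now published before the lock is released: the status update and the `notify_all()` happen while the mutex is still held, so a waiting API handler wakes up and observes `Failed` instead of timing out. A minimal sketch of the publish-then-notify pattern with the standard library; `ComputeStatus` here is a simplified stand-in for the real type:

use std::sync::{Arc, Condvar, Mutex};

#[derive(Clone, Copy, PartialEq, Debug)]
enum ComputeStatus { Init, Failed }

fn main() {
    let state = Arc::new((Mutex::new(ComputeStatus::Init), Condvar::new()));

    let waiter = {
        let state = Arc::clone(&state);
        std::thread::spawn(move || {
            let (lock, cvar) = &*state;
            let mut status = lock.lock().unwrap();
            // Wait until the writer publishes a terminal status.
            while *status != ComputeStatus::Failed {
                status = cvar.wait(status).unwrap();
            }
        })
    };

    let (lock, cvar) = &*state;
    let mut status = lock.lock().unwrap();
    *status = ComputeStatus::Failed;
    // Notify while the update is already visible; then release the lock.
    cvar.notify_all();
    drop(status);

    waiter.join().unwrap();
}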
@@ -22,7 +22,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

 use remote_storage::{DownloadError, RemotePath};
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 			IF NOT EXISTS (
 				SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
 			THEN
-				CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
+				CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
 				IF array_length(roles, 1) IS NOT NULL THEN
 					EXECUTE format('GRANT neon_superuser TO %s',
 						array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
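The guarded `CREATE ROLE` above (now carrying `BYPASSRLS`) only fires when the role does not already exist, which keeps re-applied specs idempotent. A hedged sketch of driving the same guarded statement from the `postgres` crate; the connection string is illustrative and the snippet assumes a reachable server:

use postgres::{Client, NoTls};

fn main() -> Result<(), postgres::Error> {
    // Connection parameters are illustrative.
    let mut client = Client::connect("host=localhost user=postgres", NoTls)?;
    // Idempotent role creation: the DO block mirrors the guarded CREATE ROLE above.
    client.batch_execute(
        "DO $$
         BEGIN
             IF NOT EXISTS (
                 SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
             THEN
                 CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS
                     IN ROLE pg_read_all_data, pg_write_all_data;
             END IF;
         END
         $$;",
    )?;
    Ok(())
}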
@@ -277,6 +277,17 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }

 impl ComputeNode {
+    /// Check that compute node has corresponding feature enabled.
+    pub fn has_feature(&self, feature: ComputeFeature) -> bool {
+        let state = self.state.lock().unwrap();
+
+        if let Some(s) = state.pspec.as_ref() {
+            s.spec.features.contains(&feature)
+        } else {
+            false
+        }
+    }
+
     pub fn set_status(&self, status: ComputeStatus) {
         let mut state = self.state.lock().unwrap();
         state.status = status;
@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
     let postgres_roles = xact
-        .query(
-            "SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
-            &[],
-        )?
+        .query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
         .iter()
         .map(|row| Role {
             name: row.get("rolname"),
             encrypted_password: row.get("rolpassword"),
-            replication: Some(row.get("rolreplication")),
-            bypassrls: Some(row.get("rolbypassrls")),
             options: None,
         })
         .collect();
@@ -252,8 +252,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
         let action = if let Some(r) = pg_role {
             if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
                 || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-                || !r.bypassrls.unwrap_or(false)
-                || !r.replication.unwrap_or(false)
             {
                 RoleAction::Update
             } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -285,14 +283,22 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
             match action {
                 RoleAction::None => {}
                 RoleAction::Update => {
-                    let mut query: String =
-                        format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
+                    // This can be run on /every/ role! Not just ones created through the console.
+                    // This means that if you add some funny ALTER here that adds a permission,
+                    // this will get run even on user-created roles! This will result in different
+                    // behavior before and after a spec gets reapplied. The below ALTER as it stands
+                    // now only grants LOGIN and changes the password. Please do not allow this branch
+                    // to do anything silly.
+                    let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
                     query.push_str(&role.to_pg_options());
                     xact.execute(query.as_str(), &[])?;
                 }
                 RoleAction::Create => {
+                    // This branch only runs when roles are created through the console, so it is
+                    // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
+                    // from neon_superuser.
                     let mut query: String = format!(
-                        "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                        "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
                         name.pg_quote()
                     );
                     info!("role create query: '{}'", &query);
@@ -201,6 +201,12 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
     // TODO(sharding): make this shard-aware
     if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
         let valid = tenant_state.generation == req_tenant.gen;
+        tracing::info!(
+            "handle_validate: {}(gen {}): valid={valid} (latest {})",
+            req_tenant.id,
+            req_tenant.gen,
+            tenant_state.generation
+        );
         response.tenants.push(ValidateResponseTenant {
             id: req_tenant.id,
             valid,

@@ -250,6 +256,13 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
     tenant_state.pageserver = attach_req.node_id;
     let generation = tenant_state.generation;

+    tracing::info!(
+        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
+        attach_req.tenant_id,
+        tenant_state.generation,
+        attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+    );
+
     locked.save().await.map_err(ApiError::InternalServerError)?;

     json_response(
@@ -519,6 +519,7 @@ impl Endpoint {
             skip_pg_catalog_updates: self.skip_pg_catalog_updates,
             format_version: 1.0,
             operation_uuid: None,
+            features: vec![],
             cluster: Cluster {
                 cluster_id: None, // project ID: not used
                 name: None,       // project name: not used
@@ -26,6 +26,13 @@ pub struct ComputeSpec {
     // but we don't use it for anything. Serde will ignore missing fields when
     // deserializing it.
     pub operation_uuid: Option<String>,
+
+    /// Compute features to enable. These feature flags are provided, when we
+    /// know all the details about client's compute, so they cannot be used
+    /// to change `Empty` compute behavior.
+    #[serde(default)]
+    pub features: Vec<ComputeFeature>,
+
     /// Expected cluster state at the end of transition process.
     pub cluster: Cluster,
     pub delta_operations: Option<Vec<DeltaOp>>,

@@ -68,6 +75,19 @@ pub struct ComputeSpec {
     pub remote_extensions: Option<RemoteExtSpec>,
 }

+/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeFeature {
+    // XXX: Add more feature flags here.
+
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically all unknown to enum flags are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
+    #[serde(other)]
+    UnknownFeature,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
     pub public_extensions: Option<Vec<String>>,
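The `#[serde(other)]` attribute on `UnknownFeature` above is what makes the feature list forward-compatible: a flag the enum does not know yet deserializes to the catch-all variant instead of failing the whole spec. A self-contained sketch of the mechanism; the enum and flag names here are stand-ins, not taken from the repo:

use serde::Deserialize;

#[derive(Deserialize, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
enum Feature {
    FastImport,
    // Catch-all: any unrecognized variant string maps here instead of erroring.
    #[serde(other)]
    Unknown,
}

fn main() {
    let parsed: Vec<Feature> =
        serde_json::from_str(r#"["fast_import", "flag_from_the_future"]"#).unwrap();
    assert_eq!(parsed, vec![Feature::FastImport, Feature::Unknown]);
}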
@@ -187,8 +207,6 @@ pub struct DeltaOp {
 pub struct Role {
     pub name: PgIdent,
     pub encrypted_password: Option<String>,
-    pub replication: Option<bool>,
-    pub bypassrls: Option<bool>,
     pub options: GenericOptions,
 }
@@ -229,7 +247,10 @@ mod tests {
     #[test]
     fn parse_spec_file() {
         let file = File::open("tests/cluster_spec.json").unwrap();
-        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        // Features list defaults to empty vector.
+        assert!(spec.features.is_empty());
     }

     #[test]
@@ -241,4 +262,22 @@ mod tests {
         ob.insert("unknown_field_123123123".into(), "hello".into());
         let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
     }
+
+    #[test]
+    fn parse_unknown_features() {
+        // Test that unknown feature flags do not cause any errors.
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+
+        // Add unknown feature flags.
+        let features = vec!["foo_bar_feature", "baz_feature"];
+        ob.insert("features".into(), features.into());
+
+        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
+
+        assert!(spec.features.len() == 2);
+        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
+        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
+    }
 }
@@ -140,3 +140,7 @@ impl Key {
         })
     }
 }
+
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
@@ -323,6 +323,7 @@ impl TenantConfigRequest {

 #[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
+    #[serde(default)]
     pub config: TenantAttachConfig,
     #[serde(default)]
     pub generation: Option<u32>,

@@ -330,7 +331,7 @@ pub struct TenantAttachRequest {

 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
     #[serde(flatten)]
@@ -1,5 +1,6 @@
 use std::{ops::RangeInclusive, str::FromStr};

+use crate::key::{is_rel_block_key, Key};
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
 use thiserror;

@@ -72,19 +73,28 @@ impl TenantShardId {
         )
     }

-    pub fn shard_slug(&self) -> String {
-        format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
+        ShardSlug(self)
     }
 }

+/// Formatting helper
+struct ShardSlug<'a>(&'a TenantShardId);
+
+impl<'a> std::fmt::Display for ShardSlug<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{:02x}{:02x}",
+            self.0.shard_number.0, self.0.shard_count.0
+        )
+    }
+}
+
 impl std::fmt::Display for TenantShardId {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         if self.shard_count != ShardCount(0) {
-            write!(
-                f,
-                "{}-{:02x}{:02x}",
-                self.tenant_id, self.shard_number.0, self.shard_count.0
-            )
+            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
         } else {
             // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
             // is distinct from the normal single shard case (shard count == 1).
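The rewrite above swaps a `String`-returning `shard_slug` for a borrowing helper whose `Display` impl writes straight into the caller's formatter, so no intermediate allocation happens until someone actually formats it. The pattern in isolation, with types simplified for the sketch:

use std::fmt;

struct ShardId { number: u8, count: u8 }

// Borrowing newtype whose Display impl writes directly to the formatter.
struct Slug<'a>(&'a ShardId);

impl fmt::Display for Slug<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:02x}{:02x}", self.0.number, self.0.count)
    }
}

impl ShardId {
    fn slug(&self) -> impl fmt::Display + '_ {
        Slug(self)
    }
}

fn main() {
    let id = ShardId { number: 1, count: 4 };
    // No String is built until the caller formats the value.
    assert_eq!(format!("tenant-{}", id.slug()), "tenant-0104");
}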
@@ -302,6 +312,8 @@ pub struct ShardStripeSize(pub u32);
 pub struct ShardLayout(u8);

 const LAYOUT_V1: ShardLayout = ShardLayout(1);
+/// ShardIdentity uses a magic layout value to indicate if it is unusable
+const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);

 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);

@@ -310,10 +322,10 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 /// to resolve a key to a shard, and then check whether that shard is ==self.
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardIdentity {
-    pub layout: ShardLayout,
     pub number: ShardNumber,
     pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
+    stripe_size: ShardStripeSize,
+    layout: ShardLayout,
 }

 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
@@ -339,6 +351,22 @@ impl ShardIdentity {
         }
     }

+    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
+    /// which are constructed in code paths that don't have access to proper configuration.
+    ///
+    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
+    /// Enforcement is via assertions, to avoid making our interface fallible for this
+    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
+    /// state, and by extension to avoid trying to do any page->shard resolution.
+    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            number,
+            count,
+            layout: LAYOUT_BROKEN,
+            stripe_size: DEFAULT_STRIPE_SIZE,
+        }
+    }
+
     pub fn is_unsharded(&self) -> bool {
         self.number == ShardNumber(0) && self.count == ShardCount(0)
     }
@@ -365,6 +393,39 @@ impl ShardIdentity {
         })
     }

+    fn is_broken(&self) -> bool {
+        self.layout == LAYOUT_BROKEN
+    }
+
+    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
+        assert!(!self.is_broken());
+        key_to_shard_number(self.count, self.stripe_size, key)
+    }
+
+    /// Return true if the key should be ingested by this shard
+    pub fn is_key_local(&self, key: &Key) -> bool {
+        assert!(!self.is_broken());
+        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
+            true
+        } else {
+            key_to_shard_number(self.count, self.stripe_size, key) == self.number
+        }
+    }
+
+    pub fn shard_slug(&self) -> String {
+        if self.count > ShardCount(0) {
+            format!("-{:02x}{:02x}", self.number.0, self.count.0)
+        } else {
+            String::new()
+        }
+    }
+
+    /// Convenience for checking if this identity is the 0th shard in a tenant,
+    /// for special cases on shard 0 such as ingesting relation sizes.
+    pub fn is_zero(&self) -> bool {
+        self.number == ShardNumber(0)
+    }
 }

 impl Serialize for ShardIndex {
@@ -438,6 +499,65 @@ impl<'de> Deserialize<'de> for ShardIndex {
     }
 }

+/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
+/// in order to be able to serve basebackup requests without peer communication).
+fn key_is_shard0(key: &Key) -> bool {
+    // To decide what to shard out to shards >0, we apply a simple rule that only
+    // relation pages are distributed to shards other than shard zero. Everything else gets
+    // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
+    // requests, and any request other than those for particular blocks in relations.
+    //
+    // In this condition:
+    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
+    //   all metadata.
+    // - field6 is set to -1 for relation size pages.
+    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+fn murmurhash32(mut h: u32) -> u32 {
+    h ^= h >> 16;
+    h = h.wrapping_mul(0x85ebca6b);
+    h ^= h >> 13;
+    h = h.wrapping_mul(0xc2b2ae35);
+    h ^= h >> 16;
+    h
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+fn hash_combine(mut a: u32, mut b: u32) -> u32 {
+    b = b.wrapping_add(0x9e3779b9);
+    b = b.wrapping_add(a << 6);
+    b = b.wrapping_add(a >> 2);
+
+    a ^= b;
+    a
+}
+
+/// Where a Key is to be distributed across shards, select the shard. This function
+/// does not account for keys that should be broadcast across shards.
+///
+/// The hashing in this function must exactly match what we do in postgres smgr
+/// code. The resulting distribution of pages is intended to preserve locality within
+/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
+/// distributing data pseudo-randomly.
+///
+/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
+/// and will be handled at higher levels when shards are split.
+fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
+    // Fast path for un-sharded tenants or broadcast keys
+    if count < ShardCount(2) || key_is_shard0(key) {
+        return ShardNumber(0);
+    }
+
+    // relNode
+    let mut hash = murmurhash32(key.field4);
+    // blockNum/stripe size
+    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
+
+    ShardNumber((hash % count.0 as u32) as u8)
+}
+
 #[cfg(test)]
 mod tests {
     use std::str::FromStr;
@@ -609,4 +729,29 @@ mod tests {

         Ok(())
     }
+
+    // These are only smoke tests to spot check that our implementation doesn't
+    // deviate from a few examples values: not aiming to validate the overall
+    // hashing algorithm.
+    #[test]
+    fn murmur_hash() {
+        assert_eq!(murmurhash32(0), 0);
+
+        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
+    }
+
+    #[test]
+    fn shard_mapping() {
+        let key = Key {
+            field1: 0x00,
+            field2: 0x67f,
+            field3: 0x5,
+            field4: 0x400c,
+            field5: 0x00,
+            field6: 0x7d06,
+        };
+
+        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
+        assert_eq!(shard, ShardNumber(8));
+    }
 }
@@ -289,10 +289,10 @@ impl FeStartupPacket {
         // We shouldn't advance `buf` as probably full message is not there yet,
         // so can't directly use Bytes::get_u32 etc.
         let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
         // which is less readable
         #[allow(clippy::manual_range_contains)]
-        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+        if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
             return Err(ProtocolError::Protocol(format!(
                 "invalid startup packet message length {}",
                 len

@@ -975,4 +975,10 @@ mod tests {
         let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
         assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
     }
+
+    #[test]
+    fn parse_fe_startup_packet_regression() {
+        let data = [0, 0, 0, 7, 0, 0, 0, 0];
+        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
+    }
 }
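The minimum accepted length moves from 4 to 8 because a PostgreSQL startup packet begins with two u32 fields, the length (which counts itself) followed by the protocol version or a special request code, so any length below 8 cannot even hold the header. That is exactly what the regression test's `[0, 0, 0, 7, ...]` packet exercises. A small sketch of the framing:

fn main() {
    // len (4 bytes, includes itself) + protocol version (4 bytes) = 8 minimum.
    let len: u32 = 8;
    let version: u32 = 196608; // protocol 3.0, i.e. (3 << 16) | 0
    let mut packet = Vec::new();
    packet.extend_from_slice(&len.to_be_bytes());
    packet.extend_from_slice(&version.to_be_bytes());
    assert_eq!(packet.len(), 8);

    let parsed_len = u32::from_be_bytes(packet[0..4].try_into().unwrap()) as usize;
    assert!(parsed_len >= 8, "anything shorter cannot hold the header");
}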
@@ -16,10 +16,11 @@ aws-credential-types.workspace = true
 bytes.workspace = true
 camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
+futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
-tokio-util.workspace = true
+tokio-util = { workspace = true, features = ["compat"] }
 toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
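The new `compat` feature is what lets the Azure code below bridge the two async I/O ecosystems: `tokio_util::io::StreamReader` turns a `Stream` of `Bytes` into a `tokio::io::AsyncRead`, and `.compat()` adapts that into the `futures::io::AsyncRead` the Azure SDK expects. A minimal sketch of the same adaptation, assuming tokio with the `macros` and `rt` features for the runtime:

use bytes::Bytes;
use futures::io::AsyncReadExt as _; // futures-flavoured read_to_end
use tokio_util::compat::TokioAsyncReadCompatExt as _;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let chunks = futures::stream::iter(vec![
        Ok::<Bytes, std::io::Error>(Bytes::from_static(b"hello ")),
        Ok(Bytes::from_static(b"world")),
    ]);

    // Stream<Item = io::Result<Bytes>> -> tokio AsyncRead -> futures AsyncRead
    let reader = tokio_util::io::StreamReader::new(chunks);
    let mut compat = reader.compat();

    let mut out = Vec::new();
    compat.read_to_end(&mut out).await?;
    assert_eq!(out, b"hello world");
    Ok(())
}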
@@ -1,21 +1,24 @@
 //! Azure Blob Storage wrapper

+use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::sync::Arc;
-use std::{borrow::Cow, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
+use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use bytes::Bytes;
+use futures::stream::Stream;
 use futures_util::StreamExt;
 use http_types::StatusCode;
-use tokio::io::AsyncRead;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;

@@ -49,7 +52,8 @@ impl AzureBlobStorage {
             StorageCredentials::token_credential(Arc::new(token_credential))
         };

-        let builder = ClientBuilder::new(account, credentials);
+        // we have an outer retry
+        let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());

         let client = builder.container_client(azure_config.container_name.to_owned());

@@ -116,7 +120,8 @@ impl AzureBlobStorage {
         let mut metadata = HashMap::new();
         // TODO give proper streaming response instead of buffering into RAM
         // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::new();
+
+        let mut bufs = Vec::new();
         while let Some(part) = response.next().await {
             let part = part.map_err(to_download_error)?;
             if let Some(blob_meta) = part.blob.metadata {

@@ -127,10 +132,10 @@ impl AzureBlobStorage {
                 .collect()
                 .await
                 .map_err(|e| DownloadError::Other(e.into()))?;
-            buf.extend_from_slice(&data.slice(..));
+            bufs.push(data);
         }
         Ok(Download {
-            download_stream: Box::pin(Cursor::new(buf)),
+            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
             metadata: Some(StorageMetadata(metadata)),
         })
     }
@@ -217,9 +222,10 @@ impl RemoteStorage for AzureBlobStorage {
         }
         Ok(res)
     }

     async fn upload(
         &self,
-        mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,

@@ -227,13 +233,12 @@ impl RemoteStorage for AzureBlobStorage {
         let _permit = self.permit(RequestKind::Put).await;
         let blob_client = self.client.blob_client(self.relative_path_to_name(to));

-        // TODO FIX THIS UGLY HACK and don't buffer the entire object
-        // into RAM here, but use the streaming interface. For that,
-        // we'd have to change the interface though...
-        // https://github.com/neondatabase/neon/issues/5563
-        let mut buf = Vec::with_capacity(data_size_bytes);
-        tokio::io::copy(&mut from, &mut buf).await?;
-        let body = azure_core::Body::Bytes(buf.into());
+        let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+            Box::pin(from);
+
+        let from = NonSeekableStream::new(from, data_size_bytes);
+
+        let body = azure_core::Body::SeekableStream(Box::new(from));

         let mut builder = blob_client.put_block_blob(body);
@@ -312,3 +317,153 @@ impl RemoteStorage for AzureBlobStorage {
         Ok(())
     }
 }
+
+pin_project_lite::pin_project! {
+    /// Hack to work around not being able to stream once with azure sdk.
+    ///
+    /// Azure sdk clones streams around with the assumption that they are like
+    /// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
+    /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
+    /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
+    /// seekable, but we can also just re-try the request easier.
+    #[project = NonSeekableStreamProj]
+    enum NonSeekableStream<S> {
+        /// A stream wrappers initial form.
+        ///
+        /// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
+        /// clone before first request, then this must be changed.
+        Initial {
+            inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
+            len: usize,
+        },
+        /// The actually readable variant, produced by cloning the Initial variant.
+        ///
+        /// The sdk currently always clones once, even without retry policy.
+        Actual {
+            #[pin]
+            inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
+            len: usize,
+            read_any: bool,
+        },
+        /// Most likely unneeded, but left to make life easier, in case more clones are added.
+        Cloned {
+            len_was: usize,
+        }
+    }
+}
+
+impl<S> NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+{
+    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
+        use tokio_util::compat::TokioAsyncReadCompatExt;
+
+        let inner = tokio_util::io::StreamReader::new(inner).compat();
+        let inner = Some(inner);
+        let inner = std::sync::Mutex::new(inner);
+        NonSeekableStream::Initial { inner, len }
+    }
+}
+
+impl<S> std::fmt::Debug for NonSeekableStream<S> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
+            Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
+            Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
+        }
+    }
+}
+
+impl<S> futures::io::AsyncRead for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>>,
+{
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut [u8],
+    ) -> std::task::Poll<std::io::Result<usize>> {
+        match self.project() {
+            NonSeekableStreamProj::Actual {
+                inner, read_any, ..
+            } => {
+                *read_any = true;
+                inner.poll_read(cx, buf)
+            }
+            // NonSeekableStream::Initial does not support reading because it is just much easier
+            // to have the mutex in place where one does not poll the contents, or that's how it
+            // seemed originally. If there is a version upgrade which changes the cloning, then
+            // that support needs to be hacked in.
+            //
+            // including {self:?} into the message would be useful, but unsure how to unproject.
+            _ => std::task::Poll::Ready(Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "cloned or initial values cannot be read",
+            ))),
+        }
+    }
+}
+
+impl<S> Clone for NonSeekableStream<S> {
+    /// Weird clone implementation exists to support the sdk doing cloning before issuing the first
+    /// request, see type documentation.
+    fn clone(&self) -> Self {
+        use NonSeekableStream::*;
+
+        match self {
+            Initial { inner, len } => {
+                if let Some(inner) = inner.lock().unwrap().take() {
+                    Actual {
+                        inner,
+                        len: *len,
+                        read_any: false,
+                    }
+                } else {
+                    Self::Cloned { len_was: *len }
+                }
+            }
+            Actual { len, .. } => Cloned { len_was: *len },
+            Cloned { len_was } => Cloned { len_was: *len_was },
+        }
+    }
+}
+
+#[async_trait::async_trait]
+impl<S> azure_core::SeekableStream for NonSeekableStream<S>
+where
+    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
+{
+    async fn reset(&mut self) -> azure_core::error::Result<()> {
+        use NonSeekableStream::*;
+
+        let msg = match self {
+            Initial { inner, .. } => {
+                if inner.get_mut().unwrap().is_some() {
+                    return Ok(());
+                } else {
+                    "reset after first clone is not supported"
+                }
+            }
+            Actual { read_any, .. } if !*read_any => return Ok(()),
+            Actual { .. } => "reset after reading is not supported",
+            Cloned { .. } => "reset after second clone is not supported",
+        };
+        Err(azure_core::error::Error::new(
+            azure_core::error::ErrorKind::Io,
+            std::io::Error::new(std::io::ErrorKind::Other, msg),
+        ))
+    }
+
+    // Note: it is not documented if this should be the total or remaining length, total passes the
+    // tests.
+    fn len(&self) -> usize {
+        use NonSeekableStream::*;
+        match self {
+            Initial { len, .. } => *len,
+            Actual { len, .. } => *len,
+            Cloned { len_was, .. } => *len_was,
+        }
+    }
+}
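The state machine above leans on one observed SDK behavior, stated in its doc comments: the body is cloned exactly once before it is read. `Initial` therefore hands its reader to the first clone (becoming `Actual`) through a `Mutex<Option<_>>`, and any further clone degrades to an unreadable `Cloned`. The move-on-first-clone trick in isolation, as a sketch:

use std::sync::Mutex;

enum OneShot<T> {
    Initial(Mutex<Option<T>>),
    Actual(T),
    Spent,
}

impl<T> Clone for OneShot<T> {
    fn clone(&self) -> Self {
        match self {
            // The first clone steals the payload; later clones find it gone.
            OneShot::Initial(slot) => match slot.lock().unwrap().take() {
                Some(v) => OneShot::Actual(v),
                None => OneShot::Spent,
            },
            _ => OneShot::Spent,
        }
    }
}

fn main() {
    let original = OneShot::Initial(Mutex::new(Some(String::from("reader"))));
    let first = original.clone();
    let second = original.clone();
    assert!(matches!(first, OneShot::Actual(_)));
    assert!(matches!(second, OneShot::Spent));
}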
@@ -19,8 +19,10 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};

+use bytes::Bytes;
+use futures::stream::Stream;
 use serde::{Deserialize, Serialize};
-use tokio::{io, sync::Semaphore};
+use tokio::sync::Semaphore;
 use toml_edit::Item;
 use tracing::info;

@@ -179,7 +181,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
     /// Streams the local file contents into remote into the remote storage entry.
     async fn upload(
         &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         // S3 PUT request requires the content length to be specified,
         // otherwise it starts to fail with the concurrent connection count increasing.
         data_size_bytes: usize,

@@ -206,7 +208,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
 }

 pub struct Download {
-    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
+    pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
     /// Extra key-value data, associated with the current remote file.
     pub metadata: Option<StorageMetadata>,
 }

@@ -300,7 +302,7 @@ impl GenericRemoteStorage {

     pub async fn upload(
         &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,

@@ -398,7 +400,7 @@ impl GenericRemoteStorage {
     /// this path is used for the remote object id conversion only.
     pub async fn upload_storage_object(
         &self,
-        from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
     ) -> anyhow::Result<()> {
@@ -7,11 +7,14 @@
 use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

 use anyhow::{bail, ensure, Context};
+use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
+use futures::stream::Stream;
 use tokio::{
     fs,
     io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
+use tokio_util::io::ReaderStream;
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -99,27 +102,35 @@ impl LocalFs {
         };

         // If we were given a directory, we may use it as our starting point.
-        // Otherwise, we must go up to the parent directory. This is because
+        // Otherwise, we must go up to the first ancestor dir that exists. This is because
         // S3 object list prefixes can be arbitrary strings, but when reading
         // the local filesystem we need a directory to start calling read_dir on.
         let mut initial_dir = full_path.clone();
-        match fs::metadata(full_path.clone()).await {
-            Ok(meta) => {
-                if !meta.is_dir() {
+
+        loop {
+            // Did we make it to the root?
+            if initial_dir.parent().is_none() {
+                anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
+            }
+
+            match fs::metadata(initial_dir.clone()).await {
+                Ok(meta) if meta.is_dir() => {
+                    // We found a directory, break
+                    break;
+                }
+                Ok(_meta) => {
+                    // It's not a directory: strip back to the parent
+                    initial_dir.pop();
+                }
+                Err(e) if e.kind() == ErrorKind::NotFound => {
                     // It's not a file that exists: strip the prefix back to the parent directory
                     initial_dir.pop();
                 }
+                Err(e) => {
+                    // Unexpected I/O error
+                    anyhow::bail!(e)
+                }
             }
-            Err(e) if e.kind() == ErrorKind::NotFound => {
-                // It's not a file that exists: strip the prefix back to the parent directory
-                initial_dir.pop();
-            }
-            Err(e) => {
-                // Unexpected I/O error
-                anyhow::bail!(e)
-            }
         }

         // Note that Utf8PathBuf starts_with only considers full path segments, but
         // object prefixes are arbitrary strings, so we need the strings for doing
         // starts_with later.
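The loop above replaces a single `pop()` with a walk up the tree, since an arbitrary listing prefix may name several path components that do not exist. The same walk against std's filesystem API, as a standalone sketch (the starting path is illustrative, and unlike the original this version returns `None` at the root instead of erroring):

use std::path::PathBuf;

/// Walk up from `start` to the first component that is an existing directory.
fn first_existing_ancestor(start: PathBuf) -> Option<PathBuf> {
    let mut dir = start;
    loop {
        match std::fs::metadata(&dir) {
            Ok(meta) if meta.is_dir() => return Some(dir),
            // A plain file, or nothing at all: strip back to the parent.
            _ => {
                if !dir.pop() {
                    return None; // ran out of ancestors
                }
            }
        }
    }
}

fn main() {
    // A made-up prefix resolves to the nearest real directory, if any.
    let found = first_existing_ancestor(PathBuf::from("/tmp/definitely/missing/prefix"));
    println!("{found:?}");
}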
@@ -211,7 +222,7 @@ impl RemoteStorage for LocalFs {

     async fn upload(
         &self,
-        data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
         data_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,

@@ -244,9 +255,12 @@ impl RemoteStorage for LocalFs {
         );

         let from_size_bytes = data_size_bytes as u64;
+        let data = tokio_util::io::StreamReader::new(data);
+        let data = std::pin::pin!(data);
         let mut buffer_to_read = data.take(from_size_bytes);

-        let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
+        // alternatively we could just write the bytes to a file, but local_fs is a testing utility
+        let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
             .await
             .with_context(|| {
                 format!(
@@ -300,7 +314,7 @@ impl RemoteStorage for LocalFs {
     async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
         let target_path = from.with_base(&self.storage_root);
         if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let source = io::BufReader::new(
+            let source = ReaderStream::new(
                 fs::OpenOptions::new()
                     .read(true)
                     .open(&target_path)

@@ -340,16 +354,14 @@ impl RemoteStorage for LocalFs {
         }
         let target_path = from.with_base(&self.storage_root);
         if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-            let mut source = io::BufReader::new(
-                fs::OpenOptions::new()
-                    .read(true)
-                    .open(&target_path)
-                    .await
-                    .with_context(|| {
-                        format!("Failed to open source file {target_path:?} to use in the download")
-                    })
-                    .map_err(DownloadError::Other)?,
-            );
+            let mut source = tokio::fs::OpenOptions::new()
+                .read(true)
+                .open(&target_path)
+                .await
+                .with_context(|| {
+                    format!("Failed to open source file {target_path:?} to use in the download")
+                })
+                .map_err(DownloadError::Other)?;
             source
                 .seek(io::SeekFrom::Start(start_inclusive))
                 .await

@@ -363,11 +375,13 @@ impl RemoteStorage for LocalFs {
             Ok(match end_exclusive {
                 Some(end_exclusive) => Download {
                     metadata,
-                    download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                    download_stream: Box::pin(ReaderStream::new(
+                        source.take(end_exclusive - start_inclusive),
+                    )),
                 },
                 None => Download {
                     metadata,
-                    download_stream: Box::pin(source),
+                    download_stream: Box::pin(ReaderStream::new(source)),
                 },
             })
         } else {
@@ -467,7 +481,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
|
||||
use bytes::Bytes;
|
||||
use camino_tempfile::tempdir;
|
||||
use futures_util::Stream;
|
||||
use std::{collections::HashMap, io::Write};
|
||||
|
||||
async fn read_and_assert_remote_file_contents(
|
||||
@@ -477,7 +493,7 @@ mod fs_tests {
|
||||
remote_storage_path: &RemotePath,
|
||||
expected_metadata: Option<&StorageMetadata>,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut download = storage
|
||||
let download = storage
|
||||
.download(remote_storage_path)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
|
||||
@@ -486,13 +502,9 @@ mod fs_tests {
|
||||
"Unexpected metadata returned for the downloaded file"
|
||||
);
|
||||
|
||||
let mut contents = String::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_string(&mut contents)
|
||||
.await
|
||||
.context("Failed to read remote file contents into string")?;
|
||||
Ok(contents)
|
||||
let contents = aggregate(download.download_stream).await?;
|
||||
|
||||
String::from_utf8(contents).map_err(anyhow::Error::new)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -521,25 +533,26 @@ mod fs_tests {
|
||||
let storage = create_storage()?;
|
||||
|
||||
let id = RemotePath::new(Utf8Path::new("dummy"))?;
|
||||
let content = std::io::Cursor::new(b"12345");
|
||||
let content = Bytes::from_static(b"12345");
|
||||
let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));
|
||||
|
||||
// Check that you get an error if the size parameter doesn't match the actual
|
||||
// size of the stream.
|
||||
storage
|
||||
.upload(Box::new(content.clone()), 0, &id, None)
|
||||
.upload(content(), 0, &id, None)
|
||||
.await
|
||||
.expect_err("upload with zero size succeeded");
|
||||
storage
|
||||
.upload(Box::new(content.clone()), 4, &id, None)
|
||||
.upload(content(), 4, &id, None)
|
||||
.await
|
||||
.expect_err("upload with too short size succeeded");
|
||||
storage
|
||||
.upload(Box::new(content.clone()), 6, &id, None)
|
||||
.upload(content(), 6, &id, None)
|
||||
.await
|
||||
.expect_err("upload with too large size succeeded");
|
||||
|
||||
// Correct size is 5, this should succeed.
|
||||
storage.upload(Box::new(content), 5, &id, None).await?;
|
||||
storage.upload(content(), 5, &id, None).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -587,7 +600,7 @@ mod fs_tests {
         let uploaded_bytes = dummy_contents(upload_name).into_bytes();
         let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-        let mut first_part_download = storage
+        let first_part_download = storage
             .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
             .await?;
         assert!(
@@ -595,21 +608,13 @@ mod fs_tests {
             "No metadata should be returned for no metadata upload"
         );

-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut first_part_download.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(first_part_download.download_stream).await?;
         assert_eq!(
-            first_part_local,
-            first_part_remote.as_slice(),
+            first_part_local, first_part_remote,
             "First part bytes should be returned when requested"
         );

-        let mut second_part_download = storage
+        let second_part_download = storage
             .download_byte_range(
                 &upload_target,
                 first_part_local.len() as u64,
@@ -621,17 +626,9 @@ mod fs_tests {
             "No metadata should be returned for no metadata upload"
         );

-        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut second_part_download.download_stream,
-            &mut second_part_remote,
-        )
-        .await?;
-        second_part_remote.flush().await?;
-        let second_part_remote = second_part_remote.into_inner().into_inner();
+        let second_part_remote = aggregate(second_part_download.download_stream).await?;
         assert_eq!(
-            second_part_local,
-            second_part_remote.as_slice(),
+            second_part_local, second_part_remote,
             "Second part bytes should be returned when requested"
         );

@@ -721,17 +718,10 @@ mod fs_tests {
         let uploaded_bytes = dummy_contents(upload_name).into_bytes();
         let (first_part_local, _) = uploaded_bytes.split_at(3);

-        let mut partial_download_with_metadata = storage
+        let partial_download_with_metadata = storage
             .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
             .await?;
-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        io::copy(
-            &mut partial_download_with_metadata.download_stream,
-            &mut first_part_remote,
-        )
-        .await?;
-        first_part_remote.flush().await?;
-        let first_part_remote = first_part_remote.into_inner().into_inner();
+        let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
         assert_eq!(
             first_part_local,
             first_part_remote.as_slice(),
@@ -807,16 +797,16 @@ mod fs_tests {
             )
         })?;

-        storage
-            .upload(Box::new(file), size, &relative_path, metadata)
-            .await?;
+        let file = tokio_util::io::ReaderStream::new(file);
+
+        storage.upload(file, size, &relative_path, metadata).await?;
         Ok(relative_path)
     }

     async fn create_file_for_upload(
         path: &Utf8Path,
         contents: &str,
-    ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
+    ) -> anyhow::Result<(fs::File, usize)> {
         std::fs::create_dir_all(path.parent().unwrap())?;
         let mut file_for_writing = std::fs::OpenOptions::new()
             .write(true)
@@ -826,7 +816,7 @@ mod fs_tests {
         drop(file_for_writing);
         let file_size = path.metadata()?.len() as usize;
         Ok((
-            io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
+            fs::OpenOptions::new().read(true).open(&path).await?,
             file_size,
         ))
     }
@@ -840,4 +830,16 @@ mod fs_tests {
         files.sort_by(|a, b| a.0.cmp(&b.0));
         Ok(files)
     }
+
+    async fn aggregate(
+        stream: impl Stream<Item = std::io::Result<Bytes>>,
+    ) -> anyhow::Result<Vec<u8>> {
+        use futures::stream::StreamExt;
+        let mut out = Vec::new();
+        let mut stream = std::pin::pin!(stream);
+        while let Some(res) = stream.next().await {
+            out.extend_from_slice(&res?[..]);
+        }
+        Ok(out)
+    }
 }
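The upload path now takes a `Stream` of `Bytes` instead of an `AsyncRead`, and `tokio_util::io::ReaderStream` (used above for the test file) is the standard bridge from one to the other. A hedged standalone sketch of that conversion:

    // Sketch: turn any AsyncRead (here a tokio file) into the
    // Stream<Item = std::io::Result<Bytes>> shape the new upload() expects.
    use tokio::fs::File;
    use tokio_util::io::ReaderStream;

    async fn file_as_upload_stream(path: &str) -> std::io::Result<(ReaderStream<File>, usize)> {
        let file = File::open(path).await?;
        let len = file.metadata().await?.len() as usize;
        Ok((ReaderStream::new(file), len))
    }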
@@ -4,9 +4,14 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::{borrow::Cow, sync::Arc};
+use std::{
+    borrow::Cow,
+    pin::Pin,
+    sync::Arc,
+    task::{Context, Poll},
+};

-use anyhow::Context;
+use anyhow::Context as _;
 use aws_config::{
     environment::credentials::EnvironmentVariableCredentialsProvider,
     imds::credentials::ImdsCredentialsProvider,
@@ -28,11 +33,10 @@ use aws_smithy_async::rt::sleep::TokioSleep;

 use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
 use bytes::Bytes;
+use futures::stream::Stream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
-use tokio::io::{self, AsyncRead};
-use tokio_util::io::ReaderStream;
-use tracing::debug;

 use super::StorageMetadata;
 use crate::{
@@ -63,7 +67,7 @@ struct GetObjectRequest {
 impl S3Bucket {
     /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
     pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
-        debug!(
+        tracing::debug!(
             "Creating s3 remote storage for S3 bucket {}",
             aws_config.bucket_name
         );
@@ -225,12 +229,15 @@ impl S3Bucket {
         match get_object {
             Ok(object_output) => {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);

+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
+
                 Ok(Download {
                     metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
-                    ))),
+                    download_stream: Box::pin(body),
                 })
             }
             Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -243,29 +250,55 @@ impl S3Bucket {
     }
 }

+pin_project_lite::pin_project! {
+    struct ByteStreamAsStream {
+        #[pin]
+        inner: aws_smithy_types::byte_stream::ByteStream
+    }
+}
+
+impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
+    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
+        ByteStreamAsStream { inner }
+    }
+}
+
+impl Stream for ByteStreamAsStream {
+    type Item = std::io::Result<Bytes>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // this does the std::io::ErrorKind::Other conversion
+        self.project().inner.poll_next(cx).map_err(|x| x.into())
+    }
+
+    // cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
+    // sense and Stream::size_hint does not really
+}
+
 pin_project_lite::pin_project! {
     /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct RatelimitedAsyncRead<S> {
+    struct PermitCarrying<S> {
         permit: tokio::sync::OwnedSemaphorePermit,
         #[pin]
         inner: S,
     }
 }

-impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+impl<S> PermitCarrying<S> {
     fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        RatelimitedAsyncRead { permit, inner }
+        Self { permit, inner }
     }
 }

-impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        this.inner.poll_read(cx, buf)
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }

@@ -285,7 +318,7 @@ pin_project_lite::pin_project! {
     }
 }

-impl<S: AsyncRead> TimedDownload<S> {
+impl<S> TimedDownload<S> {
     fn new(started_at: std::time::Instant, inner: S) -> Self {
         TimedDownload {
             started_at,
@@ -295,25 +328,26 @@ impl<S: AsyncRead> TimedDownload<S> {
     }
 }

-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        use std::task::ready;
+
         let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));
-
-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
+        let res = ready!(this.inner.poll_next(cx));
+        match &res {
+            Some(Ok(_)) => {}
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok,
         }

-        std::task::Poll::Ready(read)
+        Poll::Ready(res)
     }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
 }

@@ -378,7 +412,7 @@ impl RemoteStorage for S3Bucket {
         let empty = Vec::new();
         let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);

-        tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+        tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

         for object in keys {
             let object_path = object.key().expect("response does not contain a key");
@@ -403,7 +437,7 @@ impl RemoteStorage for S3Bucket {

     async fn upload(
         &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
@@ -413,7 +447,7 @@ impl RemoteStorage for S3Bucket {

         let started_at = start_measuring_requests(kind);

-        let body = Body::wrap_stream(ReaderStream::new(from));
+        let body = Body::wrap_stream(from);
         let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));

         let res = self
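The S3 download now layers three `Stream` adapters (`ByteStreamAsStream`, `PermitCarrying`, `TimedDownload`) where it previously nested `AsyncRead` wrappers. All three follow the same pin_project_lite shape; a minimal pass-through version of that pattern, with an illustrative name:

    use std::pin::Pin;
    use std::task::{Context, Poll};

    use futures::stream::Stream;

    pin_project_lite::pin_project! {
        // Illustrative pass-through adapter; the real wrappers add a held
        // permit (PermitCarrying) or outcome bookkeeping (TimedDownload).
        struct PassThrough<S> {
            #[pin]
            inner: S,
        }
    }

    impl<S: Stream> Stream for PassThrough<S> {
        type Item = S::Item;

        fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
            // Delegate; a real adapter would observe items or hold state here.
            self.project().inner.poll_next(cx)
        }

        fn size_hint(&self) -> (usize, Option<usize>) {
            self.inner.size_hint()
        }
    }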
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operation to fail. For
 //! testing purposes.
+use bytes::Bytes;
+use futures::stream::Stream;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -108,7 +110,7 @@ impl RemoteStorage for UnreliableWrapper {

     async fn upload(
         &self,
-        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         // S3 PUT request requires the content length to be specified,
         // otherwise it starts to fail with the concurrent connection count increasing.
         data_size_bytes: usize,
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
     AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -180,23 +182,14 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;

-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;

     ctx.client.delete_objects(&[path1, path2]).await?;

@@ -219,53 +212,56 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
     let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;

-    let data = "remote blob data here".as_bytes();
-    let data_len = data.len() as u64;
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());

-    ctx.client
-        .upload(std::io::Cursor::new(data), data.len(), &path, None)
-        .await?;
+    let (data, len) = wrap_stream(orig.clone());

-    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+    ctx.client.upload(data, len, &path, None).await?;
+
+    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
         let mut buf = Vec::new();
-        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
+        tokio::io::copy_buf(
+            &mut tokio_util::io::StreamReader::new(dl.download_stream),
+            &mut buf,
+        )
+        .await?;
         Ok(buf)
     }
     // Normal download request
     let dl = ctx.client.download(&path).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

     // Full range (end specified)
     let dl = ctx
         .client
-        .download_byte_range(&path, 0, Some(data_len))
+        .download_byte_range(&path, 0, Some(len as u64))
         .await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

     // partial range (end specified)
     let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..10]);
+    assert_eq!(&buf, &orig[4..10]);

     // partial range (end beyond real end)
     let dl = ctx
         .client
-        .download_byte_range(&path, 8, Some(data_len * 100))
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
         .await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[8..]);
+    assert_eq!(&buf, &orig[8..]);

     // Partial range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 4, None).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..]);
+    assert_eq!(&buf, &orig[4..]);

     // Full range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 0, None).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);

     debug!("Cleanup: deleting file at path {path:?}");
     ctx.client
@@ -504,11 +500,8 @@ async fn upload_azure_data(
             let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;

             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
@@ -589,11 +582,8 @@ async fn upload_simple_azure_data(
             .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
         debug!("Creating remote item {i} at path {blob_path:?}");

-        let data = format!("remote blob data {i}").into_bytes();
-        let data_len = data.len();
-        task_client
-            .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-            .await?;
+        let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+        task_client.upload(data, len, &blob_path, None).await?;

         Ok::<_, anyhow::Error>(blob_path)
     });
@@ -622,3 +612,32 @@ async fn upload_simple_azure_data(
         ControlFlow::Continue(uploaded_blobs)
     }
 }
+
+// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
+// to binary
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
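`download_and_compare` above uses `tokio_util::io::StreamReader`, the inverse of `ReaderStream`, to read a byte stream back through the `AsyncBufRead` interface. A hedged standalone sketch (the helper name is assumed):

    use bytes::Bytes;
    use futures::stream::Stream;
    use tokio_util::io::StreamReader;

    // Collect a download stream into memory; the stream's error type must be
    // std::io::Error, which the download streams in this diff already use.
    async fn collect(
        stream: impl Stream<Item = std::io::Result<Bytes>>,
    ) -> std::io::Result<Vec<u8>> {
        let mut buf = Vec::new();
        tokio::io::copy_buf(&mut StreamReader::new(Box::pin(stream)), &mut buf).await?;
        Ok(buf)
    }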
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -176,23 +178,14 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;

-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;

-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;

     ctx.client.delete_objects(&[path1, path2]).await?;

@@ -432,11 +425,9 @@ async fn upload_s3_data(
             let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");

-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;

             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
@@ -517,11 +508,9 @@ async fn upload_simple_s3_data(
             .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
         debug!("Creating remote item {i} at path {blob_path:?}");

-        let data = format!("remote blob data {i}").into_bytes();
-        let data_len = data.len();
-        task_client
-            .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-            .await?;
+        let (data, data_len) =
+            upload_stream(format!("remote blob data {i}").into_bytes().into());
+        task_client.upload(data, data_len, &blob_path, None).await?;

         Ok::<_, anyhow::Error>(blob_path)
     });
@@ -550,3 +539,30 @@ async fn upload_simple_s3_data(
         ControlFlow::Continue(uploaded_blobs)
     }
 }
+
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
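`wrap_stream` yields a single-chunk stream; the API only requires that the declared length equal the total bytes across all chunks. A hedged multi-chunk variant (the name is illustrative):

    use bytes::Bytes;
    use futures::stream::{self, Stream};

    // Multi-chunk variant of wrap_stream; the length must match the sum of the
    // chunk lengths or the size checks exercised by these tests will fail.
    fn wrap_chunks(chunks: Vec<Bytes>) -> (impl Stream<Item = std::io::Result<Bytes>>, usize) {
        let len = chunks.iter().map(|c| c.len()).sum();
        (stream::iter(chunks.into_iter().map(Ok)), len)
    }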
@@ -1,16 +1,14 @@
-use std::sync::Arc;
-
-use tokio::sync::{mpsc, Mutex};
+use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};

 /// While a reference is kept around, the associated [`Barrier::wait`] will wait.
 ///
 /// Can be cloned, moved and kept around in futures as "guard objects".
 #[derive(Clone)]
-pub struct Completion(mpsc::Sender<()>);
+pub struct Completion(TaskTrackerToken);

 /// Barrier will wait until all clones of [`Completion`] have been dropped.
 #[derive(Clone)]
-pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
+pub struct Barrier(TaskTracker);

 impl Default for Barrier {
     fn default() -> Self {
@@ -21,7 +19,7 @@ impl Default for Barrier {

 impl Barrier {
     pub async fn wait(self) {
-        self.0.lock().await.recv().await;
+        self.0.wait().await;
     }

     pub async fn maybe_wait(barrier: Option<Barrier>) {
@@ -33,8 +31,7 @@ impl Barrier {

 impl PartialEq for Barrier {
     fn eq(&self, other: &Self) -> bool {
-        // we don't use dyn so this is good
-        Arc::ptr_eq(&self.0, &other.0)
+        TaskTracker::ptr_eq(&self.0, &other.0)
     }
 }

@@ -42,8 +39,10 @@ impl Eq for Barrier {}

 /// Create new Guard and Barrier pair.
 pub fn channel() -> (Completion, Barrier) {
-    let (tx, rx) = mpsc::channel::<()>(1);
-    let rx = Mutex::new(rx);
-    let rx = Arc::new(rx);
-    (Completion(tx), Barrier(rx))
+    let tracker = TaskTracker::new();
+    // otherwise wait never exits
+    tracker.close();
+
+    let token = tracker.token();
+    (Completion(token), Barrier(tracker))
 }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn generation_gt() {
|
||||
// Important that a None generation compares less than a valid one, during upgrades from
|
||||
// pre-generation systems.
|
||||
assert!(Generation::none() < Generation::new(0));
|
||||
assert!(Generation::none() < Generation::new(1));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
 //!
 //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
 //! similar to a lock, but it allows readers to "hold on" to an old value of RCU
-//! without blocking writers, and allows writing a new values without blocking
-//! readers. When you update the new value, the new value is immediately visible
+//! without blocking writers, and allows writing a new value without blocking
+//! readers. When you update the value, the new value is immediately visible
 //! to new readers, but the update waits until all existing readers have
-//! finishe, so that no one sees the old value anymore.
+//! finished, so that on return, no one sees the old value anymore.
 //!
 //! This implementation isn't wait-free; it uses an RwLock that is held for a
 //! short duration when the value is read or updated.
@@ -26,6 +26,7 @@
 //! Increment the value by one, and wait for old readers to finish:
 //!
 //! ```
+//! # async fn dox() {
 //! # let rcu = utils::simple_rcu::Rcu::new(1);
 //! let write_guard = rcu.lock_for_write();
 //!
@@ -36,15 +37,17 @@
 //!
 //! // Concurrent reads and writes are now possible again. Wait for all the readers
 //! // that still observe the old value to finish.
-//! waitlist.wait();
+//! waitlist.wait().await;
+//! # }
 //! ```
 //!
 #![warn(missing_docs)]

 use std::ops::Deref;
-use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
 use std::sync::{Arc, Weak};
-use std::sync::{Mutex, RwLock, RwLockWriteGuard};
+use std::sync::{RwLock, RwLockWriteGuard};
+
+use tokio::sync::watch;

 ///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -68,22 +71,21 @@ struct RcuCell<V> {
     value: V,

     /// A dummy channel. We never send anything to this channel. The point is
-    /// that when the RcuCell is dropped, any cloned Senders will be notified
+    /// that when the RcuCell is dropped, any subscribed Receivers will be notified
     /// that the channel is closed. Updaters can use this to wait out until the
     /// RcuCell has been dropped, i.e. until the old value is no longer in use.
     ///
-    /// We never do anything with the receiver, we just need to hold onto it so
-    /// that the Senders will be notified when it's dropped. But because it's
-    /// not Sync, we need a Mutex on it.
-    watch: (SyncSender<()>, Mutex<Receiver<()>>),
+    /// We never send anything to this, we just need to hold onto it so that the
+    /// Receivers will be notified when it's dropped.
+    watch: watch::Sender<()>,
 }

 impl<V> RcuCell<V> {
     fn new(value: V) -> Self {
-        let (watch_sender, watch_receiver) = sync_channel(0);
+        let (watch_sender, _) = watch::channel(());
         RcuCell {
             value,
-            watch: (watch_sender, Mutex::new(watch_receiver)),
+            watch: watch_sender,
         }
     }
 }
@@ -141,10 +143,10 @@ impl<V> Deref for RcuReadGuard<V> {
 ///
 /// Write guard returned by `write`
 ///
-/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
-/// it should only be held for a short duration!
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
+/// held for a short duration!
 ///
-/// Calling `store` consumes the guard, making new reads and new writes possible
+/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
 /// again.
 ///
 pub struct RcuWriteGuard<'a, V> {
@@ -179,7 +181,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
         // the watches for any that do.
         self.inner.old_cells.retain(|weak| {
             if let Some(cell) = weak.upgrade() {
-                watches.push(cell.watch.0.clone());
+                watches.push(cell.watch.subscribe());
                 true
             } else {
                 false
@@ -193,20 +195,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
 ///
 /// List of readers who can still see old values.
 ///
-pub struct RcuWaitList(Vec<SyncSender<()>>);
+pub struct RcuWaitList(Vec<watch::Receiver<()>>);

 impl RcuWaitList {
     ///
     /// Wait for old readers to finish.
     ///
-    pub fn wait(mut self) {
+    pub async fn wait(mut self) {
         // after all the old_cells are no longer in use, we're done
         for w in self.0.iter_mut() {
             // This will block until the Receiver is closed. That happens when
             // the RcuCell is dropped.
             #[allow(clippy::single_match)]
-            match w.send(()) {
-                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
+            match w.changed().await {
+                Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
                 Err(_) => {
                     // closed, which means that the cell has been dropped, and
                     // its value is no longer in use
@@ -220,11 +222,10 @@ impl RcuWaitList {
 mod tests {
     use super::*;
     use std::sync::{Arc, Mutex};
-    use std::thread::{sleep, spawn};
     use std::time::Duration;

-    #[test]
-    fn two_writers() {
+    #[tokio::test]
+    async fn two_writers() {
         let rcu = Rcu::new(1);

         let read1 = rcu.read();
@@ -248,33 +249,35 @@ mod tests {
         assert_eq!(*read1, 1);

         let log = Arc::new(Mutex::new(Vec::new()));
-        // Wait for the old readers to finish in separate threads.
+        // Wait for the old readers to finish in separate tasks.
         let log_clone = Arc::clone(&log);
-        let thread2 = spawn(move || {
-            wait2.wait();
+        let task2 = tokio::spawn(async move {
+            wait2.wait().await;
             log_clone.lock().unwrap().push("wait2 done");
         });
         let log_clone = Arc::clone(&log);
-        let thread3 = spawn(move || {
-            wait3.wait();
+        let task3 = tokio::spawn(async move {
+            wait3.wait().await;
             log_clone.lock().unwrap().push("wait3 done");
         });

         // without this sleep the test can pass on accident if the writer is slow
-        sleep(Duration::from_millis(500));
+        tokio::time::sleep(Duration::from_millis(100)).await;

         // Release first reader. This allows first write to finish, but calling
-        // wait() on the second one would still block.
+        // wait() on the 'task3' would still block.
         log.lock().unwrap().push("dropping read1");
         drop(read1);
-        thread2.join().unwrap();
+        task2.await.unwrap();

-        sleep(Duration::from_millis(500));
+        assert!(!task3.is_finished());
+
+        tokio::time::sleep(Duration::from_millis(100)).await;

         // Release second reader, and finish second writer.
         log.lock().unwrap().push("dropping read2");
         drop(read2);
-        thread3.join().unwrap();
+        task3.await.unwrap();

         assert_eq!(
             log.lock().unwrap().as_slice(),
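The rewrite leans on one documented property of `tokio::sync::watch`: once the `Sender` is dropped, any `Receiver::changed()` call resolves with an error. That property in isolation:

    use tokio::sync::watch;

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = watch::channel(());
        let waiter = tokio::spawn(async move {
            // Err(..) means the Sender (the RcuCell analogue) was dropped.
            assert!(rx.changed().await.is_err());
        });
        drop(tx); // dropping the cell releases every waiter
        waiter.await.unwrap();
    }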
@@ -402,15 +402,11 @@ fn start_pageserver(
     let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
     let (init_done_tx, init_done_rx) = utils::completion::channel();

-    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
-
     let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

     let order = pageserver::InitializationOrder {
         initial_tenant_load_remote: Some(init_done_tx),
         initial_tenant_load: Some(init_remote_done_tx),
-        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: Some(init_logical_size_done_tx),
         background_jobs_can_start: background_jobs_barrier.clone(),
     };

@@ -429,7 +425,6 @@ fn start_pageserver(
     let tenant_manager = Arc::new(tenant_manager);

     BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx;
         let shutdown_pageserver = shutdown_pageserver.clone();
         let drive_init = async move {
             // NOTE: unlike many futures in pageserver, this one is cancellation-safe
@@ -464,7 +459,7 @@ fn start_pageserver(
         });

         let WaitForPhaseResult {
-            timeout_remaining: timeout,
+            timeout_remaining: _timeout,
             skipped: init_load_skipped,
         } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

@@ -472,26 +467,6 @@ fn start_pageserver(

         scopeguard::ScopeGuard::into_inner(guard);

-        let guard = scopeguard::guard_on_success((), |_| {
-            tracing::info!("Cancelled before initial logical sizes completed")
-        });
-
-        let logical_sizes_done = std::pin::pin!(async {
-            init_logical_size_done_rx.wait().await;
-            startup_checkpoint(
-                started_startup_at,
-                "initial_logical_sizes",
-                "Initial logical sizes completed",
-            );
-        });
-
-        let WaitForPhaseResult {
-            timeout_remaining: _,
-            skipped: logical_sizes_skipped,
-        } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
-
-        scopeguard::ScopeGuard::into_inner(guard);
-
         // allow background jobs to start: we either completed prior stages, or they reached timeout
         // and were skipped. It is important that we do not let them block background jobs indefinitely,
         // because things like consumption metrics for billing are blocked by this barrier.
@@ -514,9 +489,6 @@ fn start_pageserver(
         if let Some(f) = init_load_skipped {
             f.await;
         }
-        if let Some(f) = logical_sizes_skipped {
-            f.await;
-        }
         scopeguard::ScopeGuard::into_inner(guard);

         startup_checkpoint(started_startup_at, "complete", "Startup complete");
@@ -587,7 +559,6 @@ fn start_pageserver(
     }

     if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-        let background_jobs_barrier = background_jobs_barrier;
         let metrics_ctx = RequestContext::todo_child(
             TaskKind::MetricsCollection,
             // This task itself shouldn't download anything.
@@ -281,12 +281,18 @@ async fn calculate_synthetic_size_worker(
                 // By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
                 // which turns out is really handy to understand the system.
                 if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await {
-                    if let Some(PageReconstructError::Cancelled) =
-                        e.downcast_ref::<PageReconstructError>()
-                    {
-                        return Ok(());
+                    // this error can be returned if timeline is shutting down, but it does not
+                    // mean the synthetic size worker should terminate. we do not need any checks
+                    // in this function because `mgr::get_tenant` will error out after shutdown has
+                    // progressed to shutting down tenants.
+                    let is_cancelled = matches!(
+                        e.downcast_ref::<PageReconstructError>(),
+                        Some(PageReconstructError::Cancelled)
+                    );
+
+                    if !is_cancelled {
+                        error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                     }
-                    error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
                 }
             }
         }
@@ -299,7 +305,7 @@ async fn calculate_synthetic_size_worker(

         let res = tokio::time::timeout_at(
             started_at + synthetic_size_calculation_interval,
-            task_mgr::shutdown_token().cancelled(),
+            cancel.cancelled(),
         )
         .await;
         if res.is_ok() {
@@ -2,6 +2,7 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
+use pageserver_api::shard::ShardNumber;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
     id::{TenantId, TimelineId},
@@ -228,6 +229,11 @@ where
     while let Some((tenant_id, tenant)) = tenants.next().await {
         let mut tenant_resident_size = 0;

+        // Sharded tenants report all consumption metrics from shard zero
+        if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
+            continue;
+        }
+
         for timeline in tenant.list_timelines() {
             let timeline_id = timeline.timeline_id;

@@ -351,7 +357,12 @@ impl TimelineSnapshot {

         let current_exact_logical_size = {
             let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
-            let size = span.in_scope(|| t.get_current_logical_size(ctx));
+            let size = span.in_scope(|| {
+                t.get_current_logical_size(
+                    crate::tenant::timeline::GetLogicalSizePriority::Background,
+                    ctx,
+                )
+            });
             match size {
                 // Only send timeline logical size when it is fully calculated.
                 CurrentLogicalSize::Exact(ref size) => Some(size.into()),
@@ -312,7 +312,18 @@ impl ListWriter {
                     for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
                         if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
                             if attached_gen.previous() == tenant_list.generation {
+                                info!(
+                                    seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                                    shard_id=%tenant_shard_id.shard_slug(),
+                                    old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                                    "Updating gen on recovered list");
                                 tenant_list.generation = *attached_gen;
+                            } else {
+                                info!(
+                                    seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+                                    shard_id=%tenant_shard_id.shard_slug(),
+                                    old_gen=?tenant_list.generation, new_gen=?attached_gen,
+                                    "Encountered stale generation on recovered list");
                             }
                         }
                     }
@@ -338,7 +338,8 @@ async fn build_timeline_info_common(
         Lsn(0) => None,
         lsn @ Lsn(_) => Some(lsn),
     };
-    let current_logical_size = timeline.get_current_logical_size(ctx);
+    let current_logical_size =
+        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
     let current_physical_size = Some(timeline.layer_size_sum().await);
     let state = timeline.current_state();
     let remote_consistent_lsn_projected = timeline
@@ -708,6 +709,26 @@ async fn tenant_detach_handler(
     json_response(StatusCode::OK, ())
 }

+async fn tenant_reset_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);
+    state
+        .tenant_manager
+        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn tenant_load_handler(
     mut request: Request<Body>,
     _cancel: CancellationToken,
@@ -823,7 +844,7 @@ async fn tenant_delete_handler(
     mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
         .instrument(info_span!("tenant_delete_handler",
             tenant_id = %tenant_shard_id.tenant_id,
-            shard = tenant_shard_id.shard_slug()
+            shard = %tenant_shard_id.shard_slug()
         ))
         .await?;

@@ -1172,7 +1193,7 @@ async fn put_tenant_location_config_handler(
     mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
         .instrument(info_span!("tenant_detach",
             tenant_id = %tenant_shard_id.tenant_id,
-            shard = tenant_shard_id.shard_slug()
+            shard = %tenant_shard_id.shard_slug()
         ))
         .await
     {
@@ -1827,6 +1848,9 @@ pub fn make_router(
         .post("/v1/tenant/:tenant_id/detach", |r| {
             api_handler(r, tenant_detach_handler)
         })
+        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
+            api_handler(r, tenant_reset_handler)
+        })
         .post("/v1/tenant/:tenant_id/load", |r| {
             api_handler(r, tenant_load_handler)
         })
@@ -2,9 +2,8 @@
 //! Import data and WAL from a PostgreSQL data directory and WAL segments into
 //! a neon Timeline.
 //!
+use std::io::SeekFrom;
 use std::path::{Path, PathBuf};
-use std::pin::Pin;
-use std::task::{self, Poll};

 use anyhow::{bail, ensure, Context, Result};
 use async_compression::tokio::bufread::ZstdDecoder;
@@ -13,7 +12,8 @@ use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
 use nix::NixPath;
-use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
+use tokio::fs::{File, OpenOptions};
+use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
 use tokio_tar::Archive;
 use tokio_tar::Builder;
 use tokio_tar::HeaderMode;
@@ -629,70 +629,16 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
     Ok(Bytes::from(buf))
 }

-/// An in-memory buffer implementing `AsyncWrite`, inserting yields every now and then
-///
-/// The number of yields is bounded by above by the number of times poll_write is called,
-/// so calling it with 8 KB chunks and 8 MB chunks gives the same number of yields in total.
-/// This is an explicit choice as the `YieldingVec` is meant to give the async executor
-/// breathing room between units of CPU intensive preparation of buffers to be written.
-/// Once a write call is issued, the whole buffer has been prepared already, so there is no
-/// gain in splitting up the memcopy further.
-struct YieldingVec {
-    yield_budget: usize,
-    // the buffer written into
-    buf: Vec<u8>,
-}
-
-impl YieldingVec {
-    fn new() -> Self {
-        Self {
-            yield_budget: 0,
-            buf: Vec::new(),
-        }
-    }
-    // Whether we should yield for a read operation of given size
-    fn should_yield(&mut self, add_buf_len: usize) -> bool {
-        // Set this limit to a small value so that we are a
-        // good async citizen and yield repeatedly (but not
-        // too often for many small writes to cause many yields)
-        const YIELD_DIST: usize = 1024;
-
-        let target_buf_len = self.buf.len() + add_buf_len;
-        let ret = self.yield_budget / YIELD_DIST < target_buf_len / YIELD_DIST;
-        if self.yield_budget < target_buf_len {
-            self.yield_budget += add_buf_len;
-        }
-        ret
-    }
-}
-
-impl AsyncWrite for YieldingVec {
-    fn poll_write(
-        mut self: Pin<&mut Self>,
-        cx: &mut task::Context<'_>,
-        buf: &[u8],
-    ) -> Poll<std::io::Result<usize>> {
-        if self.should_yield(buf.len()) {
-            cx.waker().wake_by_ref();
-            return Poll::Pending;
-        }
-        self.get_mut().buf.extend_from_slice(buf);
-        Poll::Ready(Ok(buf.len()))
-    }
-
-    fn poll_flush(self: Pin<&mut Self>, _cx: &mut task::Context<'_>) -> Poll<std::io::Result<()>> {
-        Poll::Ready(Ok(()))
-    }
-
-    fn poll_shutdown(
-        self: Pin<&mut Self>,
-        _cx: &mut task::Context<'_>,
-    ) -> Poll<std::io::Result<()>> {
-        Poll::Ready(Ok(()))
-    }
-}
-
-pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
+pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
+    let file = OpenOptions::new()
+        .create(true)
+        .truncate(true)
+        .read(true)
+        .write(true)
+        .open(&tmp_path)
+        .await
+        .with_context(|| format!("tempfile creation {tmp_path}"))?;
+
     let mut paths = Vec::new();
     for entry in WalkDir::new(pgdata_path) {
         let entry = entry?;
@@ -707,7 +653,7 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
     // Do a sort to get a more consistent listing
     paths.sort_unstable();
     let zstd = ZstdEncoder::with_quality_and_params(
-        YieldingVec::new(),
+        file,
         Level::Default,
         &[CParameter::enable_long_distance_matching(true)],
     );
@@ -725,13 +671,14 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
     }
     let mut zstd = builder.into_inner().await?;
     zstd.shutdown().await?;
-    let compressed = zstd.into_inner();
-    let compressed_len = compressed.buf.len();
-    const INITDB_TAR_ZST_WARN_LIMIT: usize = 2_000_000;
+    let mut compressed = zstd.into_inner();
+    let compressed_len = compressed.metadata().await?.len();
+    const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
     if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
         warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
     }
-    Ok(compressed.buf)
+    compressed.seek(SeekFrom::Start(0)).await?;
+    Ok((compressed, compressed_len))
 }

 pub async fn extract_tar_zst(
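With the new signature the archive lands in a tempfile (already seeked back to offset 0) instead of an in-memory `Vec`, so callers can stream it out. A hedged usage sketch; the paths and the upload call are illustrative:

    use camino::Utf8Path;
    use tokio_util::io::ReaderStream;

    async fn stream_initdb_archive() -> anyhow::Result<()> {
        let (file, len) = create_tar_zst(
            Utf8Path::new("/tmp/pgdata"),
            Utf8Path::new("/tmp/initdb.tar.zst"),
        )
        .await?;
        // The returned file is positioned at the start, ready for streaming,
        // e.g. storage.upload(ReaderStream::new(file), len as usize, &path, None).
        let _stream = ReaderStream::new(file);
        let _ = len;
        Ok(())
    }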
@@ -186,13 +186,6 @@ pub struct InitializationOrder {
     /// Each initial tenant load task carries this until completion.
     pub initial_tenant_load: Option<utils::completion::Completion>,

-    /// Barrier for when we can start initial logical size calculations.
-    pub initial_logical_size_can_start: utils::completion::Barrier,
-
-    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
-    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: Option<utils::completion::Completion>,
-
     /// Barrier for when we can start any background jobs.
     ///
     /// This can be broken up later on, but right now there is just one class of a background job.
@@ -212,7 +205,7 @@ async fn timed<Fut: std::future::Future>(
     match tokio::time::timeout(warn_at, &mut fut).await {
         Ok(ret) => {
             tracing::info!(
-                task = name,
+                stage = name,
                 elapsed_ms = started.elapsed().as_millis(),
                 "completed"
             );
@@ -220,7 +213,7 @@ async fn timed<Fut: std::future::Future>(
         }
         Err(_) => {
             tracing::info!(
-                task = name,
+                stage = name,
                 elapsed_ms = started.elapsed().as_millis(),
                 "still waiting, taking longer than expected..."
             );
@@ -229,7 +222,7 @@ async fn timed<Fut: std::future::Future>(

             // this has a global allowed_errors
             tracing::warn!(
-                task = name,
+                stage = name,
                 elapsed_ms = started.elapsed().as_millis(),
                 "completed, took longer than expected"
             );
@@ -285,6 +285,63 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
     },
 });

+pub(crate) mod page_cache_eviction_metrics {
+    use std::num::NonZeroUsize;
+
+    use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
+    use once_cell::sync::Lazy;
+
+    #[derive(Clone, Copy)]
+    pub(crate) enum Outcome {
+        FoundSlotUnused { iters: NonZeroUsize },
+        FoundSlotEvicted { iters: NonZeroUsize },
+        ItersExceeded { iters: NonZeroUsize },
+    }
+
+    static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_page_cache_find_victim_iters_total",
+            "Counter for the number of iterations in the find_victim loop",
+            &["outcome"],
+        )
+        .expect("failed to define a metric")
+    });
+
+    static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
+        register_int_counter_vec!(
+            "pageserver_page_cache_find_victim_calls",
+            "Incremented at the end of each find_victim() call.\
+             Filter by outcome to get e.g., eviction rate.",
+            &["outcome"]
+        )
+        .unwrap()
+    });
+
+    pub(crate) fn observe(outcome: Outcome) {
+        macro_rules! dry {
+            ($label:literal, $iters:expr) => {{
+                static LABEL: &'static str = $label;
+                static ITERS_TOTAL: Lazy<IntCounter> =
+                    Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
+                static CALLS: Lazy<IntCounter> =
+                    Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
+                ITERS_TOTAL.inc_by(($iters.get()) as u64);
+                CALLS.inc();
+            }};
+        }
+        match outcome {
+            Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
+            Outcome::FoundSlotEvicted { iters } => {
+                dry!("found_evicted", iters)
+            }
+            Outcome::ItersExceeded { iters } => {
+                dry!("err_iters_exceeded", iters);
+                super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
+            }
+        }
+    }
+}
+
 pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
         "pageserver_page_cache_acquire_pinned_slot_seconds",
@@ -294,14 +351,6 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
     .expect("failed to define a metric")
 });

-pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "pageserver_page_cache_find_victim_iters_total",
-        "Counter for the number of iterations in the find_victim loop",
-    )
-    .expect("failed to define a metric")
-});
-
 static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "page_cache_errors_total",
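Call-site shape for the new module (mirroring the page_cache.rs changes later in this diff); `iters` counts find_victim loop iterations and is at least one, hence `NonZeroUsize`:

    use std::num::NonZeroUsize;

    // Sketch: record the outcome of one find_victim() call.
    fn record_found_unused(iters: usize) {
        page_cache_eviction_metrics::observe(
            page_cache_eviction_metrics::Outcome::FoundSlotUnused {
                iters: NonZeroUsize::new(iters).expect("at least one iteration"),
            },
        );
    }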
@@ -407,16 +456,14 @@ pub(crate) mod initial_logical_size {
     use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
     use once_cell::sync::Lazy;

-    use crate::task_mgr::TaskKind;
-
     pub(crate) struct StartCalculation(IntCounterVec);
     pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
         StartCalculation(
             register_int_counter_vec!(
                 "pageserver_initial_logical_size_start_calculation",
                 "Incremented each time we start an initial logical size calculation attempt. \
-                 The `task_kind` label is for the task kind that caused this attempt.",
-                &["attempt", "task_kind"]
+                 The `circumstances` label provides some additional details.",
+                &["attempt", "circumstances"]
             )
             .unwrap(),
         )
@@ -464,19 +511,24 @@ pub(crate) mod initial_logical_size {
         inc_drop_calculation: Option<IntCounter>,
     }

+    #[derive(strum_macros::IntoStaticStr)]
+    pub(crate) enum StartCircumstances {
+        EmptyInitial,
+        SkippedConcurrencyLimiter,
+        AfterBackgroundTasksRateLimit,
+    }
+
     impl StartCalculation {
-        pub(crate) fn first(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
-            let task_kind_label: &'static str =
-                causing_task_kind.map(|k| k.into()).unwrap_or_default();
-            self.0.with_label_values(&["first", task_kind_label]);
+        pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
+            let circumstances_label: &'static str = circumstances.into();
+            self.0.with_label_values(&["first", circumstances_label]);
             OngoingCalculationGuard {
                 inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
             }
         }
-        pub(crate) fn retry(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
-            let task_kind_label: &'static str =
-                causing_task_kind.map(|k| k.into()).unwrap_or_default();
-            self.0.with_label_values(&["retry", task_kind_label]);
+        pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
+            let circumstances_label: &'static str = circumstances.into();
+            self.0.with_label_values(&["retry", circumstances_label]);
             OngoingCalculationGuard {
                 inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
             }
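Call sites now pass an explicit `StartCircumstances` variant, and the `IntoStaticStr` derive turns the variant name into the metric label. A hedged sketch of how an attempt would be recorded (the wrapper function is illustrative):

    use crate::metrics::initial_logical_size::{StartCircumstances, START_CALCULATION};

    // Sketch: the guard increments the drop-side counters when it goes out of scope.
    fn begin_attempt(is_retry: bool) {
        let circumstances = StartCircumstances::AfterBackgroundTasksRateLimit;
        let _guard = if is_retry {
            START_CALCULATION.retry(circumstances)
        } else {
            START_CALCULATION.first(circumstances)
        };
    }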
@@ -771,6 +823,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
 )]
 pub(crate) enum StorageIoOperation {
     Open,
+    OpenAfterReplace,
     Close,
     CloseByReplace,
     Read,
@@ -784,6 +837,7 @@ impl StorageIoOperation {
     pub fn as_str(&self) -> &'static str {
         match self {
             StorageIoOperation::Open => "open",
+            StorageIoOperation::OpenAfterReplace => "open-after-replace",
             StorageIoOperation::Close => "close",
             StorageIoOperation::CloseByReplace => "close-by-replace",
             StorageIoOperation::Read => "read",
@@ -838,6 +892,25 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });

+pub(crate) mod virtual_file_descriptor_cache {
+    use super::*;
+
+    pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
+        register_uint_gauge!(
+            "pageserver_virtual_file_descriptor_cache_size_max",
+            "Maximum number of open file descriptors in the cache."
+        )
+        .unwrap()
+    });
+
+    // SIZE_CURRENT: derive it like so:
+    // ```
+    // sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$")
+    // -ignoring(operation)
+    // sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"}
+    // ```
+}
+
 #[derive(Debug)]
 struct GlobalAndPerTimelineHistogram {
     global: Histogram,
@@ -1164,6 +1237,30 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
     }
 });

+pub(crate) struct WalIngestMetrics {
+    pub(crate) records_received: IntCounter,
+    pub(crate) records_committed: IntCounter,
+    pub(crate) records_filtered: IntCounter,
+}
+
+pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    records_received: register_int_counter!(
+        "pageserver_wal_ingest_records_received",
+        "Number of WAL records received from safekeepers"
+    )
+    .expect("failed to define a metric"),
+    records_committed: register_int_counter!(
+        "pageserver_wal_ingest_records_committed",
+        "Number of WAL records which resulted in writes to pageserver storage"
+    )
+    .expect("failed to define a metric"),
+    records_filtered: register_int_counter!(
+        "pageserver_wal_ingest_records_filtered",
+        "Number of WAL records filtered out due to sharding"
+    )
+    .expect("failed to define a metric"),
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
     Upload,
@@ -1385,6 +1482,8 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =
 pub(crate) struct WalRedoProcessCounters {
     pub(crate) started: IntCounter,
     pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
+    pub(crate) active_stderr_logger_tasks_started: IntCounter,
+    pub(crate) active_stderr_logger_tasks_finished: IntCounter,
 }

 #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
@@ -1408,6 +1507,19 @@ impl Default for WalRedoProcessCounters {
             &["cause"],
         )
         .unwrap();
+
+        let active_stderr_logger_tasks_started = register_int_counter!(
+            "pageserver_walredo_stderr_logger_tasks_started_total",
+            "Number of active walredo stderr logger tasks that have started",
+        )
+        .unwrap();
+
+        let active_stderr_logger_tasks_finished = register_int_counter!(
+            "pageserver_walredo_stderr_logger_tasks_finished_total",
+            "Number of active walredo stderr logger tasks that have finished",
+        )
+        .unwrap();
+
         Self {
             started,
             killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
@@ -1415,6 +1527,8 @@ impl Default for WalRedoProcessCounters {
                 let cause_str: &'static str = cause.into();
                 killed.with_label_values(&[cause_str])
             })),
+            active_stderr_logger_tasks_started,
+            active_stderr_logger_tasks_finished,
         }
     }
 }
@@ -2074,6 +2188,8 @@ pub fn preinitialize_metrics() {
     // Tenant manager stats
     Lazy::force(&TENANT_MANAGER);

+    Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
+
     // countervecs
     [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
         .into_iter()
@@ -88,7 +88,11 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
|
||||
repository::Key,
|
||||
};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 50;
|
||||
@@ -897,8 +901,10 @@ impl PageCache {
|
||||
// Note that just yielding to tokio during iteration without such
|
||||
// priority boosting is likely counter-productive. We'd just give more opportunities
|
||||
// for B to bump usage count, further starving A.
|
||||
crate::metrics::page_cache_errors_inc(
|
||||
crate::metrics::PageCacheErrorKind::EvictIterLimit,
|
||||
page_cache_eviction_metrics::observe(
|
||||
page_cache_eviction_metrics::Outcome::ItersExceeded {
|
||||
iters: iters.try_into().unwrap(),
|
||||
},
|
||||
);
|
||||
anyhow::bail!("exceeded evict iter limit");
|
||||
}
|
||||
@@ -909,8 +915,18 @@ impl PageCache {
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(old_key);
|
||||
inner.key = None;
|
||||
page_cache_eviction_metrics::observe(
|
||||
page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
|
||||
iters: iters.try_into().unwrap(),
|
||||
},
|
||||
);
|
||||
} else {
|
||||
page_cache_eviction_metrics::observe(
|
||||
page_cache_eviction_metrics::Outcome::FoundSlotUnused {
|
||||
iters: iters.try_into().unwrap(),
|
||||
},
|
||||
);
|
||||
}
|
||||
crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
|
||||
return Ok((slot_idx, inner));
|
||||
}
|
||||
}
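
Aside: the hunks above replace a generic error counter with outcome-specific eviction metrics. A hedged sketch of that shape, one variant per find-victim outcome folded to a metric label; all names here are illustrative, not the real module:

enum EvictionOutcome {
    FoundSlotUnused { iters: u32 },
    FoundSlotEvicted { iters: u32 },
    ItersExceeded { iters: u32 },
}

fn observe(outcome: EvictionOutcome) {
    // Map each outcome to a label for a labeled counter family,
    // e.g. COUNTERS.with_label_values(&[label]).inc();
    let label = match outcome {
        EvictionOutcome::FoundSlotUnused { .. } => "found_unused",
        EvictionOutcome::FoundSlotEvicted { .. } => "found_evicted",
        EvictionOutcome::ItersExceeded { .. } => "iters_exceeded",
    };
    let _ = label;
}
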

@@ -53,21 +53,23 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::rel_block_to_key;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::Timeline;
use crate::trace::Tracer;

use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;

// How long we may block waiting for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
// How long we may wait for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

/// Read the end of a tar archive.
///
@@ -399,16 +401,19 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

        // TODO(sharding): enumerate local tenant shards for this tenant, and select the one
        // that should serve this request.

        // Make request tracer if needed
        // Note that since one connection may contain getpage requests that target different
        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
        // that we look up here may not be the one that serves all the actual requests: we will double
        // check the mapping of key->shard later before calling into Timeline for getpage requests.
        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
            ShardSelector::First,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
        .await?;

        // Make request tracer if needed
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path =
@@ -566,6 +571,7 @@ impl PageServerHandler {
        info!("creating new timeline");
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
            ShardSelector::Zero,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -628,7 +634,7 @@ impl PageServerHandler {
        debug_assert_current_span_has_tenant_and_timeline_id();

        let timeline = self
            .get_active_tenant_timeline(tenant_id, timeline_id)
            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
@@ -807,9 +813,49 @@ impl PageServerHandler {
        }
        */

        let page = timeline
            .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
            .await?;
        let key = rel_block_to_key(req.rel, req.blkno);
        let page = if timeline.get_shard_identity().is_key_local(&key) {
            timeline
                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
                .await?
        } else {
            // The Tenant shard we looked up at connection start does not hold this particular
            // key: look for other shards in this tenant. This scenario occurs if a pageserver
            // has multiple shards for the same tenant.
            //
            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
            let timeline = match self
                .get_active_tenant_timeline(
                    timeline.tenant_shard_id.tenant_id,
                    timeline.timeline_id,
                    ShardSelector::Page(key),
                )
                .await
            {
                Ok(t) => t,
                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
                    // We already know this tenant exists in general, because we resolved it at
                    // start of connection. Getting a NotFound here indicates that the shard containing
                    // the requested page is not present on this node.

                    // TODO: this should be some kind of structured error that the client will understand,
                    // so that it can block until its config is updated: this error is expected in the case
                    // that the Tenant's shards' placements are being updated and the client hasn't been
                    // informed yet.
                    //
                    // https://github.com/neondatabase/neon/issues/6038
                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
                }
                Err(e) => return Err(e.into()),
            };

            // Take a GateGuard for the duration of this request. If we were using our main Timeline object,
            // the GateGuard was already held over the whole connection.
            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
            timeline
                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
                .await?
        };

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
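
Aside: the routing check in the hunk above hinges on `is_key_local`. A hedged sketch of that idea with assumed, simplified semantics (the real mapping hashes key components; stripe-by-modulo here is purely illustrative):

#[derive(Clone, Copy, PartialEq, Debug)]
struct ShardNumber(u8);

struct ShardIdentity {
    number: ShardNumber,
    count: u8,
}

impl ShardIdentity {
    fn get_shard_number(&self, key_hash: u64) -> ShardNumber {
        if self.count == 0 {
            // Unsharded tenant: everything maps to the only shard.
            return ShardNumber(0);
        }
        // Assignment by hash modulo shard count (an assumption for illustration).
        ShardNumber((key_hash % u64::from(self.count)) as u8)
    }

    fn is_key_local(&self, key_hash: u64) -> bool {
        // Unsharded tenants hold every key; otherwise compare shard numbers.
        self.count <= 1 || self.get_shard_number(key_hash) == self.number
    }
}
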
@@ -838,7 +884,7 @@ impl PageServerHandler {

        // check that the timeline exists
        let timeline = self
            .get_active_tenant_timeline(tenant_id, timeline_id)
            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
@@ -944,9 +990,11 @@ impl PageServerHandler {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        selector: ShardSelector,
    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
            selector,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -1120,7 +1168,7 @@ where

        self.check_permission(Some(tenant_id))?;
        let timeline = self
            .get_active_tenant_timeline(tenant_id, timeline_id)
            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;

        let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1307,6 +1355,7 @@ where

        let tenant = get_active_tenant_with_timeout(
            tenant_id,
            ShardSelector::Zero,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )

@@ -13,6 +13,7 @@ use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -282,6 +283,10 @@ impl Timeline {
    }

    /// Get a list of all existing relations in given tablespace and database.
    ///
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
    pub async fn list_rels(
        &self,
        spcnode: Oid,
@@ -630,6 +635,10 @@ impl Timeline {
    ///
    /// Only relation blocks are counted currently. That excludes metadata,
    /// SLRUs, twophase files etc.
    ///
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
@@ -1314,7 +1323,7 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::new();
        for (key, value) in self.pending_updates.drain() {
            if is_rel_block_key(key) || is_slru_block_key(key) {
            if is_rel_block_key(&key) || is_slru_block_key(key) {
                // This bails out on first error without modifying pending_updates.
                // That's Ok, cf this function's doc comment.
                writer.put(key, self.lsn, &value, ctx).await?;
@@ -1359,6 +1368,10 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

    pub(crate) fn is_empty(&self) -> bool {
        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
    }

    // Internal helper functions to batch the modifications

    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
@@ -1570,7 +1583,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
}

fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
@@ -1769,10 +1782,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    })
}

fn is_rel_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0
}

pub fn is_rel_fsm_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}
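
Aside: the key predicates above are easier to read against a sketch of the key layout. Field meanings beyond `field2: rel.spcnode` (visible in the hunk) are inferred and should be treated as assumptions:

// Illustrative reconstruction of the structured key used above.
struct Key {
    field1: u8,  // 0x00 tags relation-data keys
    field2: u32, // tablespace OID (from `rel.spcnode` in the hunk)
    field3: u32, // database OID (assumed)
    field4: u32, // relation OID (assumed; non-zero for real relations)
    field5: u8,  // fork number (assumed; e.g. FSM_FORKNUM)
    field6: u32, // block number (assumed)
}

fn is_rel_block_key(key: &Key) -> bool {
    // A relation block key is tagged 0x00 and names a non-zero relation.
    key.field1 == 0x00 && key.field4 != 0
}
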

@@ -12,13 +12,13 @@
//!

use anyhow::{bail, Context};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use enumset::EnumSet;
use futures::stream::FuturesUnordered;
use futures::FutureExt;
use futures::StreamExt;
use pageserver_api::models::TimelineState;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
use remote_storage::GenericRemoteStorage;
@@ -68,6 +68,7 @@ use crate::tenant::config::TenantConfOpt;
use crate::tenant::metadata::load_metadata;
pub use crate::tenant::remote_timeline_client::index::IndexPart;
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
use crate::tenant::remote_timeline_client::INITDB_PATH;
use crate::tenant::storage_layer::DeltaLayer;
use crate::tenant::storage_layer::ImageLayer;
use crate::InitializationOrder;
@@ -236,6 +237,9 @@ pub struct Tenant {

    tenant_shard_id: TenantShardId,

    // The detailed sharding information, beyond the number/count in tenant_shard_id
    shard_identity: ShardIdentity,

    /// The remote storage generation, used to protect S3 objects from split-brain.
    /// Does not change over the lifetime of the [`Tenant`] object.
    ///
@@ -312,6 +316,9 @@ impl WalRedoManager {
        }
    }

    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
    pub async fn request_redo(
        &self,
        key: crate::repository::Key,
@@ -469,7 +476,6 @@ impl Tenant {
        index_part: Option<IndexPart>,
        metadata: TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        init_order: Option<&InitializationOrder>,
        _ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let tenant_id = self.tenant_shard_id;
@@ -479,7 +485,6 @@ impl Tenant {
            &metadata,
            ancestor.clone(),
            resources,
            init_order,
            CreateTimelineCause::Load,
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
@@ -567,6 +572,7 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
        shard_identity: ShardIdentity,
        init_order: Option<InitializationOrder>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
@@ -588,6 +594,7 @@ impl Tenant {
            TenantState::Attaching,
            conf,
            attached_conf,
            shard_identity,
            wal_redo_manager,
            tenant_shard_id,
            remote_storage.clone(),
@@ -680,10 +687,6 @@ impl Tenant {
                        // as we are no longer loading, signal completion by dropping
                        // the completion while we resume deletion
                        drop(_completion);
                        // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
                        let _ = init_order
                            .as_mut()
                            .and_then(|x| x.initial_logical_size_attempt.take());
                        let background_jobs_can_start =
                            init_order.as_ref().map(|x| &x.background_jobs_can_start);
                        if let Some(background) = background_jobs_can_start {
@@ -697,7 +700,6 @@ impl Tenant {
                            &tenant_clone,
                            preload,
                            tenants,
                            init_order,
                            &ctx,
                        )
                        .await
@@ -710,7 +712,7 @@ impl Tenant {
                    }
                }

                match tenant_clone.attach(init_order, preload, &ctx).await {
                match tenant_clone.attach(preload, &ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
                        tenant_clone.activate(broker_client, None, &ctx);
@@ -773,7 +775,6 @@ impl Tenant {
    ///
    async fn attach(
        self: &Arc<Tenant>,
        init_order: Option<InitializationOrder>,
        preload: Option<TenantPreload>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -786,7 +787,7 @@ impl Tenant {
            None => {
                // Deprecated dev mode: load from local disk state instead of remote storage
                // https://github.com/neondatabase/neon/issues/5624
                return self.load_local(init_order, ctx).await;
                return self.load_local(ctx).await;
            }
        };

@@ -881,7 +882,6 @@ impl Tenant {
                    &index_part.metadata,
                    Some(remote_timeline_client),
                    self.deletion_queue_client.clone(),
                    None,
                )
                .await
                .context("resume_deletion")
@@ -1006,10 +1006,6 @@ impl Tenant {
            None
        };

        // we can load remote timelines during init, but they are assumed to be so rare that
        // initialization order is not passed to here.
        let init_order = None;

        // timeline loading after attach expects to find metadata file for each metadata
        save_metadata(
            self.conf,
@@ -1027,7 +1023,6 @@ impl Tenant {
            Some(index_part),
            remote_metadata,
            ancestor,
            init_order,
            ctx,
        )
        .await
@@ -1051,6 +1046,9 @@ impl Tenant {
            },
            conf,
            AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
            // Shard identity isn't meaningful for a broken tenant: it's just a placeholder
            // to occupy the slot for this TenantShardId.
            ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
            wal_redo_manager,
            tenant_shard_id,
            None,
@@ -1269,11 +1267,7 @@ impl Tenant {
    /// files on disk. Used at pageserver startup.
    ///
    /// No background tasks are started as part of this routine.
    async fn load_local(
        self: &Arc<Tenant>,
        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
    async fn load_local(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        debug!("loading tenant task");
@@ -1299,7 +1293,7 @@ impl Tenant {
        // Process loadable timelines first
        for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
            if let Err(e) = self
                .load_local_timeline(timeline_id, local_metadata, init_order.as_ref(), ctx, false)
                .load_local_timeline(timeline_id, local_metadata, ctx, false)
                .await
            {
                match e {
@@ -1333,13 +1327,7 @@ impl Tenant {
                }
                Some(local_metadata) => {
                    if let Err(e) = self
                        .load_local_timeline(
                            timeline_id,
                            local_metadata,
                            init_order.as_ref(),
                            ctx,
                            true,
                        )
                        .load_local_timeline(timeline_id, local_metadata, ctx, true)
                        .await
                    {
                        match e {
@@ -1367,12 +1355,11 @@ impl Tenant {
    /// Subroutine of `load_tenant`, to load an individual timeline
    ///
    /// NB: The parent is assumed to be already loaded!
    #[instrument(skip(self, local_metadata, init_order, ctx))]
    #[instrument(skip(self, local_metadata, ctx))]
    async fn load_local_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
        local_metadata: TimelineMetadata,
        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
        found_delete_mark: bool,
    ) -> Result<(), LoadLocalTimelineError> {
@@ -1389,7 +1376,6 @@ impl Tenant {
                &local_metadata,
                None,
                self.deletion_queue_client.clone(),
                init_order,
            )
            .await
            .context("resume deletion")
@@ -1406,17 +1392,9 @@ impl Tenant {
            None
        };

        self.timeline_init_and_sync(
            timeline_id,
            resources,
            None,
            local_metadata,
            ancestor,
            init_order,
            ctx,
        )
        .await
        .map_err(LoadLocalTimelineError::Load)
        self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx)
            .await
            .map_err(LoadLocalTimelineError::Load)
    }

    pub(crate) fn tenant_id(&self) -> TenantId {
@@ -2311,7 +2289,6 @@ impl Tenant {
        new_metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        resources: TimelineResources,
        init_order: Option<&InitializationOrder>,
        cause: CreateTimelineCause,
    ) -> anyhow::Result<Arc<Timeline>> {
        let state = match cause {
@@ -2326,9 +2303,6 @@ impl Tenant {
            CreateTimelineCause::Delete => TimelineState::Stopping,
        };

        let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
        let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);

        let pg_version = new_metadata.pg_version();

        let timeline = Timeline::new(
@@ -2339,11 +2313,10 @@ impl Tenant {
            new_timeline_id,
            self.tenant_shard_id,
            self.generation,
            self.shard_identity,
            Arc::clone(&self.walredo_mgr),
            resources,
            pg_version,
            initial_logical_size_can_start.cloned(),
            initial_logical_size_attempt.cloned().flatten(),
            state,
            self.cancel.child_token(),
        );
@@ -2358,6 +2331,7 @@ impl Tenant {
        state: TenantState,
        conf: &'static PageServerConf,
        attached_conf: AttachedTenantConf,
        shard_identity: ShardIdentity,
        walredo_mgr: Arc<WalRedoManager>,
        tenant_shard_id: TenantShardId,
        remote_storage: Option<GenericRemoteStorage>,
@@ -2419,6 +2393,7 @@ impl Tenant {

        Tenant {
            tenant_shard_id,
            shard_identity,
            generation: attached_conf.location.generation,
            conf,
            // using now here is good enough approximation to catch tenants with really long
@@ -2540,7 +2515,7 @@ impl Tenant {
            }
        }

        info!("persisting tenantconf to {config_path}");
        debug!("persisting tenantconf to {config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.
@@ -2575,7 +2550,7 @@ impl Tenant {
        target_config_path: &Utf8Path,
        tenant_conf: &TenantConfOpt,
    ) -> anyhow::Result<()> {
        info!("persisting tenantconf to {target_config_path}");
        debug!("persisting tenantconf to {target_config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.
@@ -2974,10 +2949,10 @@ impl Tenant {
        };
        // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
        // temporary directory for basebackup files for the given timeline.

        let timelines_path = self.conf.timelines_path(&self.tenant_shard_id);
        let pgdata_path = path_with_suffix_extension(
            self.conf
                .timelines_path(&self.tenant_shard_id)
                .join(format!("basebackup-{timeline_id}")),
            timelines_path.join(format!("basebackup-{timeline_id}")),
            TEMP_FILE_SUFFIX,
        );

@@ -3008,31 +2983,43 @@ impl Tenant {
            )
            .await
            .context("download initdb tar")?;
            let buf_read = Box::pin(BufReader::new(initdb_tar_zst));
            let buf_read =
                BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
                .await
                .context("extract initdb tar")?;

            if initdb_tar_zst_path.exists() {
                tokio::fs::remove_file(&initdb_tar_zst_path)
                    .await
                    .context("tempfile removal")?;
            }
            tokio::fs::remove_file(&initdb_tar_zst_path)
                .await
                .or_else(|e| {
                    if e.kind() == std::io::ErrorKind::NotFound {
                        // If something else already removed the file, ignore the error
                        Ok(())
                    } else {
                        Err(e)
                    }
                })
                .with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
        } else {
            // Init a temporary repo to get bootstrap data; this creates a directory in the `initdb_path` path
            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;

            // Upload the created data dir to S3
            if let Some(storage) = &self.remote_storage {
                let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
                let pgdata_zstd = Bytes::from(pgdata_zstd);
                let temp_path = timelines_path.join(format!(
                    "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
                ));

                let (pgdata_zstd, tar_zst_size) =
                    import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
                backoff::retry(
                    || async {
                        self::remote_timeline_client::upload_initdb_dir(
                            storage,
                            &self.tenant_shard_id.tenant_id,
                            &timeline_id,
                            pgdata_zstd.clone(),
                            pgdata_zstd.try_clone().await?,
                            tar_zst_size,
                        )
                        .await
                    },
@@ -3044,6 +3031,18 @@ impl Tenant {
                    backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
                )
                .await?;

                tokio::fs::remove_file(&temp_path)
                    .await
                    .or_else(|e| {
                        if e.kind() == std::io::ErrorKind::NotFound {
                            // If something else already removed the file, ignore the error
                            Ok(())
                        } else {
                            Err(e)
                        }
                    })
                    .with_context(|| format!("tempfile removal {temp_path}"))?;
            }
        }
        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
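
Aside: the tempfile cleanup above replaces a racy exists-then-remove check with a removal that tolerates NotFound. The same pattern as a small self-contained helper (assumes tokio; the helper name is illustrative):

use std::io::ErrorKind;
use std::path::Path;

async fn remove_file_if_exists(path: &Path) -> std::io::Result<()> {
    match tokio::fs::remove_file(path).await {
        Ok(()) => Ok(()),
        // Racing with another cleaner is fine: the file is gone either way,
        // and checking existence first would not close the race window.
        Err(e) if e.kind() == ErrorKind::NotFound => Ok(()),
        Err(e) => Err(e),
    }
}
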
@@ -3165,7 +3164,6 @@ impl Tenant {
            new_metadata,
            ancestor,
            resources,
            None,
            CreateTimelineCause::Load,
        )
        .context("Failed to create timeline data structure")?;
@@ -3831,6 +3829,8 @@ pub(crate) mod harness {
                self.generation,
            ))
            .unwrap(),
            // This is a legacy/test code path: sharding isn't supported here.
            ShardIdentity::unsharded(),
            walredo_mgr,
            self.tenant_shard_id,
            Some(self.remote_storage.clone()),
@@ -3840,7 +3840,7 @@ pub(crate) mod harness {
        match mode {
            LoadMode::Local => {
                tenant
                    .load_local(None, ctx)
                    .load_local(ctx)
                    .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                    .await?;
            }
@@ -3850,7 +3850,7 @@ pub(crate) mod harness {
                    .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                    .await?;
                tenant
                    .attach(None, Some(preload), ctx)
                    .attach(Some(preload), ctx)
                    .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                    .await?;
            }
@@ -3893,6 +3893,9 @@ pub(crate) mod harness {
    pub(crate) struct TestRedoManager;

    impl TestRedoManager {
        /// # Cancel-Safety
        ///
        /// This method is cancellation-safe.
        pub async fn request_redo(
            &self,
            key: Key,

@@ -15,7 +15,6 @@ use crate::{
    context::RequestContext,
    task_mgr::{self, TaskKind},
    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
    InitializationOrder,
};

use super::{
@@ -78,8 +77,10 @@ async fn create_remote_delete_mark(
    let data: &[u8] = &[];
    backoff::retry(
        || async {
            let data = bytes::Bytes::from_static(data);
            let stream = futures::stream::once(futures::future::ready(Ok(data)));
            remote_storage
                .upload(data, 0, &remote_mark_path, None)
                .upload(stream, 0, &remote_mark_path, None)
                .await
        },
        |_e| false,
@@ -390,7 +391,6 @@ impl DeleteTenantFlow {
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -400,10 +400,7 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

        tenant
            .attach(init_order, preload, ctx)
            .await
            .context("attach")?;
        tenant.attach(preload, ctx).await.context("attach")?;

        Self::background(
            guard,

@@ -2,7 +2,8 @@
//! page server.

use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use pageserver_api::shard::TenantShardId;
use pageserver_api::key::Key;
use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
@@ -130,6 +131,18 @@ pub(crate) enum TenantsMapRemoveResult {
    InProgress(utils::completion::Barrier),
}

/// When resolving a TenantId to a shard, we may be looking for the 0th
/// shard, or we might be looking for whichever shard holds a particular page.
pub(crate) enum ShardSelector {
    /// Only return the 0th shard, if it is present. If a non-0th shard is present,
    /// ignore it.
    Zero,
    /// Pick the first shard we find for the TenantId
    First,
    /// Pick the shard that holds this key
    Page(Key),
}
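
Aside: a hypothetical illustration of how callers might pick a selector per request type; the enum is redeclared locally (with a stand-in key type) to stay self-contained, and is not the real API:

type PageKey = u64; // stand-in for the real page key

enum Selector {
    Zero,
    First,
    Page(PageKey),
}

fn selector_for_getpage(key: PageKey) -> Selector {
    // getpage must reach the shard that owns the key ...
    Selector::Page(key)
}

fn selector_for_timeline_admin() -> Selector {
    // ... while timeline-scoped operations address shard zero, which the
    // code above treats as the canonical target for tenant-wide work.
    Selector::Zero
}
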

impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants. If the TenantId is in the map but in Secondary state,
@@ -144,6 +157,49 @@ impl TenantsMap {
        }
    }

    /// A page service client sends a TenantId, and to look up the correct Tenant we must
    /// resolve this to a fully qualified TenantShardId.
    fn resolve_shard(
        &self,
        tenant_id: &TenantId,
        selector: ShardSelector,
    ) -> Option<TenantShardId> {
        let mut want_shard = None;
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
                    match selector {
                        ShardSelector::First => return Some(*slot.0),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return Some(*slot.0)
                        }
                        ShardSelector::Page(key) => {
                            if let Some(tenant) = slot.1.get_attached() {
                                // First slot we see for this tenant, calculate the expected shard number
                                // for the key: we will use this for checking if this and subsequent
                                // slots contain the key, rather than recalculating the hash each time.
                                if want_shard.is_none() {
                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
                                }

                                if Some(tenant.shard_identity.number) == want_shard {
                                    return Some(*slot.0);
                                }
                            } else {
                                continue;
                            }
                        }
                        _ => continue,
                    }
                }

                // Fall through: we didn't find an acceptable shard
                None
            }
        }
    }

    /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
    ///
    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -214,49 +270,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

/// Create a directory, including parents. This does no fsyncs and makes
/// no guarantees about the persistence of the resulting metadata: for
/// use when creating dirs for use as cache.
async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
    let mut dirs_to_create = Vec::new();
    let mut path: &Utf8Path = path.as_ref();

    // Figure out which directories we need to create.
    loop {
        let meta = tokio::fs::metadata(path).await;
        match meta {
            Ok(metadata) if metadata.is_dir() => break,
            Ok(_) => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::AlreadyExists,
                    format!("non-directory found in path: {path}"),
                ));
            }
            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
            Err(e) => return Err(e),
        }

        dirs_to_create.push(path);

        match path.parent() {
            Some(parent) => path = parent,
            None => {
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    format!("can't find parent of path '{path}'"),
                ));
            }
        }
    }

    // Create directories from parent to child.
    for &path in dirs_to_create.iter().rev() {
        tokio::fs::create_dir(path).await?;
    }

    Ok(())
}

/// The TenantManager is responsible for storing and mutating the collection of all tenants
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
/// lives inside the TenantManager.
@@ -515,12 +528,14 @@ pub async fn init_tenant_mgr(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

        let shard_identity = location_conf.shard;
        match tenant_spawn(
            conf,
            tenant_shard_id,
            &tenant_dir_path,
            resources.clone(),
            AttachedTenantConf::try_from(location_conf)?,
            shard_identity,
            Some(init_order.clone()),
            &TENANTS,
            SpawnMode::Normal,
@@ -561,6 +576,7 @@ pub(crate) fn tenant_spawn(
    tenant_path: &Utf8Path,
    resources: TenantSharedResources,
    location_conf: AttachedTenantConf,
    shard_identity: ShardIdentity,
    init_order: Option<InitializationOrder>,
    tenants: &'static std::sync::RwLock<TenantsMap>,
    mode: SpawnMode,
@@ -587,12 +603,19 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

    info!("Attaching tenant {tenant_shard_id}");
    info!(
        tenant_id = %tenant_shard_id.tenant_id,
        shard_id = %tenant_shard_id.shard_slug(),
        generation = ?location_conf.location.generation,
        attach_mode = ?location_conf.location.attach_mode,
        "Attaching tenant"
    );
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
        resources,
        location_conf,
        shard_identity,
        init_order,
        tenants,
        mode,
@@ -762,12 +785,14 @@ pub(crate) async fn create_tenant(
    tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
    let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;

    let shard_identity = location_conf.shard;
    let created_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Create,
@@ -860,6 +885,7 @@ impl TenantManager {
        Ok(())
    }

    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
        tenant_shard_id: TenantShardId,
@@ -972,7 +998,7 @@ impl TenantManager {
            LocationMode::Secondary(_) => {
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
                unsafe_create_dir_all(&tenant_path)
                tokio::fs::create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {tenant_path}"))?;

@@ -988,7 +1014,7 @@ impl TenantManager {
                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
                // re-attached, see https://github.com/neondatabase/neon/issues/5550
                unsafe_create_dir_all(&timelines_path)
                tokio::fs::create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {timelines_path}"))?;

@@ -996,12 +1022,14 @@ impl TenantManager {
            .await
            .map_err(SetNewTenantConfigError::Persist)?;

            let shard_identity = new_location_config.shard;
            let tenant = tenant_spawn(
                self.conf,
                tenant_shard_id,
                &tenant_path,
                self.resources.clone(),
                AttachedTenantConf::try_from(new_location_config)?,
                shard_identity,
                None,
                self.tenants,
                SpawnMode::Normal,
@@ -1016,6 +1044,81 @@ impl TenantManager {

        Ok(())
    }

    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
    /// LocationConf that was last used to attach it. Optionally, the local file cache may be
    /// dropped before re-attaching.
    ///
    /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
    /// where an issue is identified that would go away with a restart of the tenant.
    ///
    /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
    /// to respect the cancellation tokens used in normal shutdown().
    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
    pub(crate) async fn reset_tenant(
        &self,
        tenant_shard_id: TenantShardId,
        drop_cache: bool,
        ctx: RequestContext,
    ) -> anyhow::Result<()> {
        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
        let Some(old_slot) = slot_guard.get_old_value() else {
            anyhow::bail!("Tenant not found when trying to reset");
        };

        let Some(tenant) = old_slot.get_attached() else {
            slot_guard.revert();
            anyhow::bail!("Tenant is not in attached state");
        };

        let (_guard, progress) = utils::completion::channel();
        match tenant.shutdown(progress, false).await {
            Ok(()) => {
                slot_guard.drop_old_value()?;
            }
            Err(_barrier) => {
                slot_guard.revert();
                anyhow::bail!("Cannot reset Tenant, already shutting down");
            }
        }

        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;

        if drop_cache {
            tracing::info!("Dropping local file cache");

            match tokio::fs::read_dir(&timelines_path).await {
                Err(e) => {
                    tracing::warn!("Failed to list timelines while dropping cache: {}", e);
                }
                Ok(mut entries) => {
                    while let Some(entry) = entries.next_entry().await? {
                        tokio::fs::remove_dir_all(entry.path()).await?;
                    }
                }
            }
        }

        let shard_identity = config.shard;
        let tenant = tenant_spawn(
            self.conf,
            tenant_shard_id,
            &tenant_path,
            self.resources.clone(),
            AttachedTenantConf::try_from(config)?,
            shard_identity,
            None,
            self.tenants,
            SpawnMode::Normal,
            &ctx,
        )?;

        slot_guard.upsert(TenantSlot::Attached(tenant))?;

        Ok(())
    }
}

#[derive(Debug, thiserror::Error)]
@@ -1100,6 +1203,7 @@ pub(crate) enum GetActiveTenantError {
/// then wait for up to `timeout` (minus however long we waited for the slot).
pub(crate) async fn get_active_tenant_with_timeout(
    tenant_id: TenantId,
    shard_selector: ShardSelector,
    timeout: Duration,
    cancel: &CancellationToken,
) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1108,15 +1212,17 @@ pub(crate) async fn get_active_tenant_with_timeout(
        Tenant(Arc<Tenant>),
    }

    // TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
    // to decide which shard services the request)
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

    let wait_start = Instant::now();
    let deadline = wait_start + timeout;

    let wait_for = {
    let (wait_for, tenant_shard_id) = {
        let locked = TENANTS.read().unwrap();

        // Resolve TenantId to TenantShardId
        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
        )?;

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
        match peek_slot {
@@ -1126,7 +1232,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    // Fast path: we don't need to do any async waiting.
                    return Ok(tenant.clone());
                }
                _ => WaitFor::Tenant(tenant.clone()),
                _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
            }
        }
        Some(TenantSlot::Secondary) => {
@@ -1134,7 +1240,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
                tenant_id,
            )))
        }
        Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
        Some(TenantSlot::InProgress(barrier)) => {
            (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
        }
        None => {
            return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
                tenant_id,
@@ -1219,8 +1327,7 @@ pub(crate) async fn delete_tenant(
    // See https://github.com/neondatabase/neon/issues/5080

    // TODO(sharding): make delete API sharding-aware
    let mut slot_guard =
        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // unwrap is safe because we used MustExist mode when acquiring
    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1377,12 +1484,14 @@ pub(crate) async fn load_tenant(

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

    let shard_identity = location_conf.shard;
    let new_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1472,12 +1581,14 @@ pub(crate) async fn attach_tenant(
    // TODO: tenant directory remains on disk if we bail out from here on.
    // See https://github.com/neondatabase/neon/issues/4233

    let shard_identity = location_conf.shard;
    let attached_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_dir,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1543,9 +1654,10 @@ pub enum TenantSlotUpsertError {
    MapState(#[from] TenantMapError),
}

#[derive(Debug)]
#[derive(Debug, thiserror::Error)]
enum TenantSlotDropError {
    /// It is only legal to drop a TenantSlot if its contents are fully shut down
    #[error("Tenant was not shut down")]
    NotShutdown,
}

@@ -1605,9 +1717,9 @@ impl SlotGuard {
        }
    }

    /// Take any value that was present in the slot before we acquired ownership
    /// Get any value that was present in the slot before we acquired ownership
    /// of it: in state transitions, this will be the old state.
    fn get_old_value(&mut self) -> &Option<TenantSlot> {
    fn get_old_value(&self) -> &Option<TenantSlot> {
        &self.old_value
    }

@@ -1825,7 +1937,7 @@ fn tenant_map_acquire_slot_impl(
    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
    let _guard = span.enter();

    let m = match &mut *locked {

@@ -254,6 +254,9 @@ pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";

/// Default buffer size when interfacing with [`tokio::fs::File`].
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;

pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),

@@ -75,12 +75,11 @@ pub async fn download_layer_file<'a>(

    let (mut destination_file, bytes_amount) = download_retry(
        || async {
            // TODO: this doesn't use the cached fd for some reason?
            let mut destination_file = fs::File::create(&temp_file_path)
            let destination_file = tokio::fs::File::create(&temp_file_path)
                .await
                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;
            let mut download = storage
            let download = storage
                .download(&remote_path)
                .await
                .with_context(|| {
@@ -90,9 +89,14 @@ pub async fn download_layer_file<'a>(
                })
                .map_err(DownloadError::Other)?;

            let mut destination_file =
                tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);

            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);

            let bytes_amount = tokio::time::timeout(
                MAX_DOWNLOAD_DURATION,
                tokio::io::copy(&mut download.download_stream, &mut destination_file),
                tokio::io::copy_buf(&mut reader, &mut destination_file),
            )
            .await
            .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
@@ -103,6 +107,8 @@ pub async fn download_layer_file<'a>(
            })
            .map_err(DownloadError::Other)?;

            let destination_file = destination_file.into_inner();

            Ok((destination_file, bytes_amount))
        },
        &format!("download {remote_path:?}"),
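
Aside: the hunk above switches from copying an AsyncRead to wrapping the byte stream in a StreamReader and writing through a sized BufWriter with copy_buf. A minimal self-contained sketch of that shape, assuming tokio and tokio-util; the stream type is illustrative:

use tokio::io::{AsyncWriteExt, BufWriter};

async fn download_to_file(
    stream: impl futures::Stream<Item = std::io::Result<bytes::Bytes>> + Unpin,
    file: tokio::fs::File,
    buffer_size: usize,
) -> std::io::Result<u64> {
    let mut reader = tokio_util::io::StreamReader::new(stream);
    let mut writer = BufWriter::with_capacity(buffer_size, file);
    // copy_buf reuses the reader's internal buffer instead of allocating
    // another intermediate one, which is the point of this refactor.
    let n = tokio::io::copy_buf(&mut reader, &mut writer).await?;
    // Flush before dropping the BufWriter, or buffered bytes are lost.
    writer.flush().await?;
    Ok(n)
}
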
@@ -220,20 +226,22 @@ async fn do_download_index_part(
    index_generation: Generation,
    cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> {
    use futures::stream::StreamExt;

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

    let index_part_bytes = download_retry_forever(
        || async {
            let mut index_part_download = storage.download(&remote_path).await?;
            let index_part_download = storage.download(&remote_path).await?;

            let mut index_part_bytes = Vec::new();
            tokio::io::copy(
                &mut index_part_download.download_stream,
                &mut index_part_bytes,
            )
            .await
            .with_context(|| format!("download index part at {remote_path:?}"))
            .map_err(DownloadError::Other)?;
            let mut stream = std::pin::pin!(index_part_download.download_stream);
            while let Some(chunk) = stream.next().await {
                let chunk = chunk
                    .with_context(|| format!("download index part at {remote_path:?}"))
                    .map_err(DownloadError::Other)?;
                index_part_bytes.extend_from_slice(&chunk[..]);
            }
            Ok(index_part_bytes)
        },
        &format!("download {remote_path:?}"),
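
Aside: for a small object like the index part, the hunk above drains the stream chunk by chunk into a Vec rather than going through AsyncRead. The same loop as a self-contained helper (stream type illustrative):

use futures::StreamExt;

async fn collect_stream(
    mut stream: impl futures::Stream<Item = std::io::Result<bytes::Bytes>> + Unpin,
) -> std::io::Result<Vec<u8>> {
    let mut buf = Vec::new();
    while let Some(chunk) = stream.next().await {
        // Each chunk is a fallible Bytes; append it to the accumulator.
        buf.extend_from_slice(&chunk?[..]);
    }
    Ok(buf)
}
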
@@ -363,7 +371,7 @@ pub(super) async fn download_index_part(
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did. Try to load from a no-generation path.
            tracing::info!("No index_part.json* found");
            tracing::debug!("No index_part.json* found");
            do_download_index_part(
                storage,
                tenant_shard_id,
@@ -394,11 +402,13 @@ pub(crate) async fn download_initdb_tar_zst(
            .with_context(|| format!("timeline dir creation {timeline_path}"))
            .map_err(DownloadError::Other)?;
    }
    let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
    let temp_path = timeline_path.join(format!(
        "{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
    ));

    let file = download_retry(
        || async {
            let mut file = OpenOptions::new()
            let file = OpenOptions::new()
                .create(true)
                .truncate(true)
                .read(true)
@@ -408,13 +418,17 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

            let mut download = storage.download(&remote_path).await?;
            let download = storage.download(&remote_path).await?;
            let mut download = tokio_util::io::StreamReader::new(download.download_stream);
            let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

            tokio::io::copy(&mut download.download_stream, &mut file)
            tokio::io::copy_buf(&mut download, &mut writer)
                .await
                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
                .map_err(DownloadError::Other)?;

            let mut file = writer.into_inner();

            file.seek(std::io::SeekFrom::Start(0))
                .await
                .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
@@ -426,10 +440,10 @@ pub(crate) async fn download_initdb_tar_zst(
    )
    .await
    .map_err(|e| {
        if temp_path.exists() {
            // Do a best-effort attempt at deleting the temporary file upon encountering an error.
            // We don't have async here nor do we want to pile on any extra errors.
            if let Err(e) = std::fs::remove_file(&temp_path) {
        // Do a best-effort attempt at deleting the temporary file upon encountering an error.
        // We don't have async here nor do we want to pile on any extra errors.
        if let Err(e) = std::fs::remove_file(&temp_path) {
            if e.kind() != std::io::ErrorKind::NotFound {
                warn!("error deleting temporary file {temp_path}: {e}");
            }
        }

@@ -1,12 +1,11 @@
//! Helper functions to upload files to remote storage with a RemoteStorage

use anyhow::{bail, Context};
use bytes::Bytes;
use camino::Utf8Path;
use fail::fail_point;
use pageserver_api::shard::TenantShardId;
use std::io::ErrorKind;
use tokio::fs;
use tokio::fs::{self, File};

use super::Generation;
use crate::{
@@ -41,11 +40,15 @@ pub(super) async fn upload_index_part<'a>(
        .to_s3_bytes()
        .context("serialize index part file into bytes")?;
    let index_part_size = index_part_bytes.len();
    let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
    let index_part_bytes = bytes::Bytes::from(index_part_bytes);

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
    storage
        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
        .upload_storage_object(
            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
            index_part_size,
            &remote_path,
        )
        .await
        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
}
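
Aside: the hunk above feeds an already serialized in-memory object to a streaming upload API by wrapping it in a one-chunk stream. That trick in isolation:

use bytes::Bytes;
use futures::stream::{self, Stream};

fn bytes_to_stream(payload: Bytes) -> impl Stream<Item = std::io::Result<Bytes>> {
    // A single ready item: the whole payload arrives as one chunk.
    stream::once(futures::future::ready(Ok(payload)))
}
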
@@ -101,8 +104,10 @@ pub(super) async fn upload_timeline_layer<'a>(
    let fs_size = usize::try_from(fs_size)
        .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;

    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);

    storage
        .upload(source_file, fs_size, &storage_path, None)
        .upload(reader, fs_size, &storage_path, None)
        .await
        .with_context(|| format!("upload layer from local path '{source_path}'"))?;

@@ -114,16 +119,16 @@ pub(crate) async fn upload_initdb_dir(
    storage: &GenericRemoteStorage,
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
    initdb_dir: Bytes,
    initdb_tar_zst: File,
    size: u64,
) -> anyhow::Result<()> {
    tracing::trace!("uploading initdb dir");

    let size = initdb_dir.len();
    let bytes = tokio::io::BufReader::new(std::io::Cursor::new(initdb_dir));
    let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);

    let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
    storage
        .upload_storage_object(bytes, size, &remote_path)
        .upload_storage_object(file, size as usize, &remote_path)
        .await
        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
}
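
Aside: the upload side is the mirror image of the download refactor: a file becomes a chunked byte stream via ReaderStream, so large objects are never fully buffered in memory. A small sketch under those assumptions (tokio and tokio-util):

use futures::Stream;
use tokio_util::io::ReaderStream;

fn file_to_stream(
    file: tokio::fs::File,
    buffer_size: usize,
) -> impl Stream<Item = std::io::Result<bytes::Bytes>> {
    // Each yielded chunk is at most `buffer_size` bytes read from the file.
    ReaderStream::with_capacity(file, buffer_size)
}
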

@@ -4,7 +4,7 @@ pub mod delta_layer;
mod filename;
pub mod image_layer;
mod inmemory_layer;
mod layer;
pub(crate) mod layer;
mod layer_desc;

use crate::context::{AccessStatsBehavior, RequestContext};

@@ -222,14 +222,18 @@ impl Layer {
    ///
    /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
    /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
    pub(crate) fn garbage_collect_on_drop(&self) {
        self.0.garbage_collect_on_drop();
    pub(crate) fn delete_on_drop(&self) {
        self.0.delete_on_drop();
    }

    /// Return data needed to reconstruct given page at LSN.
    ///
    /// It is up to the caller to collect more data from the previous layer and
    /// perform WAL redo, if necessary.
    ///
    /// # Cancellation-Safety
    ///
    /// This method is cancellation-safe.
    pub(crate) async fn get_value_reconstruct_data(
        &self,
        key: Key,
@@ -327,10 +331,10 @@ impl Layer {
        Ok(())
    }

    /// Waits until this layer has been dropped (and if needed, local garbage collection and remote
    /// Waits until this layer has been dropped (and if needed, local file deletion and remote
    /// deletion scheduling has completed).
    ///
    /// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
    /// Does not start local deletion, use [`Self::delete_on_drop`] for that
    /// separately.
    #[cfg(feature = "testing")]
    pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
@@ -419,8 +423,8 @@ struct LayerInner {
    /// Initialization and deinitialization are done while holding a permit.
    inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,

    /// Do we want to garbage collect this when `LayerInner` is dropped
    wanted_garbage_collected: AtomicBool,
    /// Do we want to delete locally and remotely this when `LayerInner` is dropped
    wanted_deleted: AtomicBool,

    /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
    /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
@@ -434,10 +438,6 @@ struct LayerInner {
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted.
    ///
    /// If in future we need to implement "wait until layer instances are gone and done", carrying
    /// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
    /// method for "wait_gc" which will wait to this being closed.
    status: tokio::sync::broadcast::Sender<Status>,

    /// Counter for exponential backoff with the download
@@ -479,14 +479,14 @@ enum Status {

impl Drop for LayerInner {
    fn drop(&mut self) {
        if !*self.wanted_garbage_collected.get_mut() {
        if !*self.wanted_deleted.get_mut() {
            // should we try to evict if the last wish was for eviction?
            // feels like there's some hazard of overcrowding near shutdown near by, but we don't
            // run drops during shutdown (yet)
            return;
        }

        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);
        let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);

        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().filename();
@@ -513,8 +513,8 @@ impl Drop for LayerInner {
                    false
                }
                Err(e) => {
                    tracing::error!("failed to remove garbage collected layer: {e}");
                    LAYER_IMPL_METRICS.inc_gc_removes_failed();
                    tracing::error!("failed to remove wanted deleted layer: {e}");
                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
                    false
                }
            };
@@ -536,15 +536,15 @@ impl Drop for LayerInner {
                    } else {
                        tracing::warn!("scheduling deletion on drop failed: {e:#}");
                    }
                    LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
                    LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
                } else {
                    LAYER_IMPL_METRICS.inc_completed_gcs();
                    LAYER_IMPL_METRICS.inc_completed_deletes();
                }
            }
        } else {
            // no need to nag that timeline is gone: under normal situation on
            // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
            LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
            LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
        }
        });
    }
@@ -579,7 +579,7 @@ impl LayerInner {
            timeline: Arc::downgrade(timeline),
            have_remote_client: timeline.remote_client.is_some(),
            access_stats,
            wanted_garbage_collected: AtomicBool::new(false),
            wanted_deleted: AtomicBool::new(false),
            wanted_evicted: AtomicBool::new(false),
            inner,
            version: AtomicUsize::new(version),
@@ -590,16 +590,13 @@ impl LayerInner {
        }
    }

    fn garbage_collect_on_drop(&self) {
        let res = self.wanted_garbage_collected.compare_exchange(
            false,
            true,
            Ordering::Release,
            Ordering::Relaxed,
        );
    fn delete_on_drop(&self) {
        let res =
            self.wanted_deleted
                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

        if res.is_ok() {
            LAYER_IMPL_METRICS.inc_started_gcs();
            LAYER_IMPL_METRICS.inc_started_deletes();
        }
    }
|
||||
|
||||
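The `delete_on_drop` rename above keeps the same once-flag pattern: `compare_exchange` guarantees the "started" counter is bumped exactly once no matter how many callers request deletion. A minimal standalone sketch of that pattern, with hypothetical names rather than the pageserver types:

    use std::sync::atomic::{AtomicBool, Ordering};

    /// Hypothetical stand-in for `LayerInner`.
    struct Flagged {
        wanted_deleted: AtomicBool,
    }

    impl Flagged {
        fn delete_on_drop(&self) {
            // Only the caller that performs the false -> true transition
            // gets Ok, so the "started" event is recorded exactly once.
            if self
                .wanted_deleted
                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed)
                .is_ok()
            {
                println!("deletion requested for the first time");
            }
        }
    }

    fn main() {
        let f = Flagged { wanted_deleted: AtomicBool::new(false) };
        f.delete_on_drop();
        f.delete_on_drop(); // second call is a no-op
    }
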
@@ -667,6 +664,10 @@ impl LayerInner {
// disable any scheduled but not yet running eviction deletions for this
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);

// count cancellations, which currently remain largely unexpected
let init_cancelled =
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());

// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));

@@ -675,6 +676,8 @@ impl LayerInner {
.upgrade()
.ok_or_else(|| DownloadError::TimelineShutdown)?;

// FIXME: grab a gate

let can_ever_evict = timeline.remote_client.as_ref().is_some();

// check if we really need to be downloaded; could have been already downloaded by a
@@ -735,6 +738,8 @@ impl LayerInner {
tracing::info!(waiters, "completing the on-demand download for other tasks");
}

scopeguard::ScopeGuard::into_inner(init_cancelled);

Ok((ResidentOrWantedEvicted::Resident(res), permit))
};

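The `init_cancelled` guard above counts cancellations by arming a `scopeguard` that fires unless the success path defuses it with `into_inner`. A small self-contained sketch of the same technique (the static counter is illustrative, not the real metrics registry):

    use std::sync::atomic::{AtomicU64, Ordering};

    static INIT_CANCELLED: AtomicU64 = AtomicU64::new(0);

    fn try_init(fail: bool) -> Result<(), ()> {
        // Armed guard: if we leave early (error, or in async code the
        // future being dropped), the closure runs and the event is counted.
        let guard = scopeguard::guard((), |_| {
            INIT_CANCELLED.fetch_add(1, Ordering::Relaxed);
        });

        if fail {
            return Err(()); // guard fires here
        }

        // Success path: defuse the guard so nothing is counted.
        scopeguard::ScopeGuard::into_inner(guard);
        Ok(())
    }

    fn main() {
        let _ = try_init(true);
        let _ = try_init(false);
        assert_eq!(INIT_CANCELLED.load(Ordering::Relaxed), 1);
    }
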
@@ -863,14 +868,13 @@ impl LayerInner {
match res {
(Ok(()), _) => {
// our caller is cancellation safe so this is fine; if someone
// else requests the layer, they'll find it already downloaded
// or redownload.
// else requests the layer, they'll find it already downloaded.
//
// however, could be that we should consider marking the layer
// for eviction? alas, cannot: because only DownloadedLayer
// will handle that.
tracing::info!("layer file download completed after requester had cancelled");
LAYER_IMPL_METRICS.inc_download_completed_without_requester();
// See counter [`LayerImplMetrics::inc_init_needed_no_download`]
//
// FIXME(#6028): however, could be that we should consider marking the
// layer for eviction? alas, cannot: because only DownloadedLayer will
// handle that.
},
(Err(e), _) => {
// our caller is cancellation safe, but we might be racing with
@@ -990,12 +994,15 @@ impl LayerInner {

/// `DownloadedLayer` is being dropped, so it calls this method.
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
let delete = self.wanted_deleted.load(Ordering::Acquire);
let evict = self.wanted_evicted.load(Ordering::Acquire);
let can_evict = self.have_remote_client;

if gc {
// do nothing now, only in LayerInner::drop
if delete {
// do nothing now, only in LayerInner::drop -- this was originally implemented because
// we could have already scheduled the deletion at the time.
//
// FIXME: this is not true anymore, we can safely evict wanted deleted files.
} else if can_evict && evict {
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);

@@ -1010,7 +1017,7 @@ impl LayerInner {
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
let _g = span.entered();

// if LayerInner is already dropped here, do nothing because the garbage collection
// if LayerInner is already dropped here, do nothing because the delete on drop
// has already run while we were in queue
let Some(this) = this.upgrade() else {
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
@@ -1401,35 +1408,37 @@ impl From<ResidentLayer> for Layer {
}
}

use metrics::{IntCounter, IntCounterVec};
use metrics::IntCounter;

struct LayerImplMetrics {
pub(crate) struct LayerImplMetrics {
started_evictions: IntCounter,
completed_evictions: IntCounter,
cancelled_evictions: IntCounterVec,
cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,

started_gcs: IntCounter,
completed_gcs: IntCounter,
failed_gcs: IntCounterVec,
started_deletes: IntCounter,
completed_deletes: IntCounter,
failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,

rare_counters: IntCounterVec,
rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
}

impl Default for LayerImplMetrics {
fn default() -> Self {
let evictions = metrics::register_int_counter_vec!(
"pageserver_layer_evictions_count",
"Evictions started and completed in the Layer implementation",
&["state"]
use enum_map::Enum;

// reminder: these will be pageserver_layer_* with "_total" suffix

let started_evictions = metrics::register_int_counter!(
"pageserver_layer_started_evictions",
"Evictions started in the Layer implementation"
)
.unwrap();
let completed_evictions = metrics::register_int_counter!(
"pageserver_layer_completed_evictions",
"Evictions completed in the Layer implementation"
)
.unwrap();

let started_evictions = evictions
.get_metric_with_label_values(&["started"])
.unwrap();
let completed_evictions = evictions
.get_metric_with_label_values(&["completed"])
.unwrap();

let cancelled_evictions = metrics::register_int_counter_vec!(
"pageserver_layer_cancelled_evictions_count",
@@ -1438,24 +1447,36 @@ impl Default for LayerImplMetrics {
)
.unwrap();

// reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
let gcs = metrics::register_int_counter_vec!(
"pageserver_layer_gcs_count",
"Garbage collections started and completed in the Layer implementation",
&["state"]
let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let reason = EvictionCancelled::from_usize(i);
let s = reason.as_str();
cancelled_evictions.with_label_values(&[s])
}));

let started_deletes = metrics::register_int_counter!(
"pageserver_layer_started_deletes",
"Deletions on drop pending in the Layer implementation"
)
.unwrap();
let completed_deletes = metrics::register_int_counter!(
"pageserver_layer_completed_deletes",
"Deletions on drop completed in the Layer implementation"
)
.unwrap();

let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();

let failed_gcs = metrics::register_int_counter_vec!(
"pageserver_layer_failed_gcs_count",
"Different reasons for garbage collections to have failed",
let failed_deletes = metrics::register_int_counter_vec!(
"pageserver_layer_failed_deletes_count",
"Different reasons for deletions on drop to have failed",
&["reason"]
)
.unwrap();

let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let reason = DeleteFailed::from_usize(i);
let s = reason.as_str();
failed_deletes.with_label_values(&[s])
}));

let rare_counters = metrics::register_int_counter_vec!(
"pageserver_layer_assumed_rare_count",
"Times unexpected or assumed rare event happened",
@@ -1463,16 +1484,29 @@ impl Default for LayerImplMetrics {
)
.unwrap();

let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let event = RareEvent::from_usize(i);
let s = event.as_str();
rare_counters.with_label_values(&[s])
}));

let inits_cancelled = metrics::register_int_counter!(
"pageserver_layer_inits_cancelled_count",
"Times Layer initialization was cancelled",
)
.unwrap();

Self {
started_evictions,
completed_evictions,
cancelled_evictions,

started_gcs,
completed_gcs,
failed_gcs,
started_deletes,
completed_deletes,
failed_deletes,

rare_counters,
inits_cancelled,
}
}
}
@@ -1485,57 +1519,33 @@ impl LayerImplMetrics {
self.completed_evictions.inc();
}
fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
self.cancelled_evictions
.get_metric_with_label_values(&[reason.as_str()])
.unwrap()
.inc()
self.cancelled_evictions[reason].inc()
}

fn inc_started_gcs(&self) {
self.started_gcs.inc();
fn inc_started_deletes(&self) {
self.started_deletes.inc();
}
fn inc_completed_gcs(&self) {
self.completed_gcs.inc();
fn inc_completed_deletes(&self) {
self.completed_deletes.inc();
}
fn inc_gcs_failed(&self, reason: GcFailed) {
self.failed_gcs
.get_metric_with_label_values(&[reason.as_str()])
.unwrap()
.inc();
fn inc_deletes_failed(&self, reason: DeleteFailed) {
self.failed_deletes[reason].inc();
}

/// Counted separately from failed gcs because we will complete the gc attempt regardless of
/// failure to delete local file.
fn inc_gc_removes_failed(&self) {
self.rare_counters
.get_metric_with_label_values(&["gc_remove_failed"])
.unwrap()
.inc();
/// Counted separately from failed layer deletes because we will complete the layer deletion
/// attempt regardless of failure to delete local file.
fn inc_delete_removes_failed(&self) {
self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
}

/// Expected rare because requires a race with `evict_blocking` and
/// `get_or_maybe_download`.
/// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
fn inc_retried_get_or_maybe_download(&self) {
self.rare_counters
.get_metric_with_label_values(&["retried_gomd"])
.unwrap()
.inc();
self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
}

/// Expected rare because cancellations are unexpected
fn inc_download_completed_without_requester(&self) {
self.rare_counters
.get_metric_with_label_values(&["download_completed_without"])
.unwrap()
.inc();
}

/// Expected rare because cancellations are unexpected
/// Expected rare because cancellations are unexpected, and failures are unexpected
fn inc_download_failed_without_requester(&self) {
self.rare_counters
.get_metric_with_label_values(&["download_failed_without"])
.unwrap()
.inc();
self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
}

/// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1543,37 +1553,30 @@ impl LayerImplMetrics {
/// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
/// Option.
fn inc_raced_wanted_evicted_accesses(&self) {
self.rare_counters
.get_metric_with_label_values(&["raced_wanted_evicted"])
.unwrap()
.inc();
self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
}

/// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
/// These are only expected for [`Self::inc_init_cancelled`] amount when
/// running with remote storage.
fn inc_init_needed_no_download(&self) {
self.rare_counters
.get_metric_with_label_values(&["init_needed_no_download"])
.unwrap()
.inc();
self.rare_counters[RareEvent::InitWithoutDownload].inc();
}

/// Expected rare because all layer files should be readable and good
fn inc_permanent_loading_failures(&self) {
self.rare_counters
.get_metric_with_label_values(&["permanent_loading_failure"])
.unwrap()
.inc();
self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
}

fn inc_broadcast_lagged(&self) {
self.rare_counters
.get_metric_with_label_values(&["broadcast_lagged"])
.unwrap()
.inc();
self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
}

fn inc_init_cancelled(&self) {
self.inits_cancelled.inc()
}
}

#[derive(enum_map::Enum)]
enum EvictionCancelled {
LayerGone,
TimelineGone,
@@ -1602,19 +1605,47 @@ impl EvictionCancelled {
}
}

enum GcFailed {
#[derive(enum_map::Enum)]
enum DeleteFailed {
TimelineGone,
DeleteSchedulingFailed,
}

impl GcFailed {
impl DeleteFailed {
fn as_str(&self) -> &'static str {
match self {
GcFailed::TimelineGone => "timeline_gone",
GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
DeleteFailed::TimelineGone => "timeline_gone",
DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
}
}
}

static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
#[derive(enum_map::Enum)]
enum RareEvent {
RemoveOnDropFailed,
RetriedGetOrMaybeDownload,
DownloadFailedWithoutRequester,
UpgradedWantedEvicted,
InitWithoutDownload,
PermanentLoadingFailure,
EvictAndWaitLagged,
}

impl RareEvent {
fn as_str(&self) -> &'static str {
use RareEvent::*;

match self {
RemoveOnDropFailed => "remove_on_drop_failed",
RetriedGetOrMaybeDownload => "retried_gomd",
DownloadFailedWithoutRequester => "download_failed_without",
UpgradedWantedEvicted => "raced_wanted_evicted",
InitWithoutDownload => "init_needed_no_download",
PermanentLoadingFailure => "permanent_loading_failure",
EvictAndWaitLagged => "broadcast_lagged",
}
}
}

pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
once_cell::sync::Lazy::new(LayerImplMetrics::default);

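The metrics rework above swaps per-call `IntCounterVec` label lookups for `enum_map::EnumMap`s whose entries are resolved once at registration, so the hot path is a plain array index. A minimal sketch of that pattern, using `AtomicU64` as a stand-in for `IntCounter`:

    use enum_map::{enum_map, Enum, EnumMap};
    use std::sync::atomic::{AtomicU64, Ordering};

    #[derive(Enum, Copy, Clone)]
    enum DeleteFailed {
        TimelineGone,
        DeleteSchedulingFailed,
    }

    struct Metrics {
        // One counter per enum variant, built once up front.
        failed_deletes: EnumMap<DeleteFailed, AtomicU64>,
    }

    impl Metrics {
        fn inc_deletes_failed(&self, reason: DeleteFailed) {
            // No string label lookup here, just an array index.
            self.failed_deletes[reason].fetch_add(1, Ordering::Relaxed);
        }
    }

    fn main() {
        let m = Metrics {
            failed_deletes: enum_map! { _ => AtomicU64::new(0) },
        };
        m.inc_deletes_failed(DeleteFailed::TimelineGone);
    }
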
@@ -44,6 +44,7 @@ pub(crate) enum BackgroundLoopKind {
Eviction,
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
}

impl BackgroundLoopKind {

@@ -18,25 +18,29 @@ use pageserver_api::{
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
TimelineState,
},
shard::TenantShardId,
shard::{ShardIdentity, TenantShardId},
};
use rand::Rng;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::{
runtime::Handle,
sync::{oneshot, watch, TryAcquireError},
sync::{oneshot, watch},
};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{id::TenantTimelineId, sync::gate::Gate};

use std::cmp::{max, min, Ordering};
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::ops::{Deref, Range};
use std::pin::pin;
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
use std::{
cmp::{max, min, Ordering},
ops::ControlFlow,
};

use crate::context::{
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
@@ -163,6 +167,10 @@ pub struct Timeline {
/// this copy enforces the invariant that generation doesn't change during a Tenant's lifetime.
pub(crate) generation: Generation,

/// The detailed sharding information from our parent Tenant. This enables us to map keys
/// to shards, and is constant through the lifetime of this Timeline.
shard_identity: ShardIdentity,

pub pg_version: u32,

/// The tuple has two elements.
@@ -298,13 +306,6 @@ pub struct Timeline {

eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,

/// Barrier to wait before doing initial logical size calculation. Used only during startup.
initial_logical_size_can_start: Option<completion::Barrier>,

/// Completion shared between all timelines loaded during startup; used to delay heavier
/// background tasks until some logical sizes have been calculated.
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,

/// Load or creation time information about the disk_consistent_lsn and when the loading
/// happened. Used for consumption metrics.
pub(crate) loaded_at: (Lsn, SystemTime),
@@ -453,6 +454,11 @@ pub enum LogicalSizeCalculationCause {
TenantSizeHandler,
}

pub enum GetLogicalSizePriority {
User,
Background,
}

#[derive(enumset::EnumSetType)]
pub(crate) enum CompactFlags {
ForceRepartition,
@@ -472,7 +478,7 @@ impl Timeline {
.map(|ancestor| ancestor.timeline_id)
}

/// Lock and get timeline's GC cuttof
/// Lock and get timeline's GC cutoff
pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
}
@@ -489,6 +495,9 @@ impl Timeline {
/// an ancestor branch, for example, or waste a lot of cycles chasing the
/// non-existing key.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn get(
&self,
key: Key,
@@ -801,7 +810,12 @@ impl Timeline {
.access_stats_behavior(AccessStatsBehavior::Skip)
.build();

// 2. Create new image layers for partitions that have been modified
// 2. Compact
let timer = self.metrics.compact_time_histo.start_timer();
self.compact_level0(target_file_size, ctx).await?;
timer.stop_and_record();

// 3. Create new image layers for partitions that have been modified
// "enough".
let layers = self
.create_image_layers(&partitioning, lsn, false, &image_ctx)
@@ -813,11 +827,6 @@ impl Timeline {
}
}

// 3. Compact
let timer = self.metrics.compact_time_histo.start_timer();
self.compact_level0(target_file_size, ctx).await?;
timer.stop_and_record();

if let Some(remote_client) = &self.remote_client {
// should any new image layer been created, not uploading index_part will
// result in a mismatch between remote_physical_size and layermap calculated
@@ -849,46 +858,6 @@ impl Timeline {
}
}

/// Retrieve current logical size of the timeline.
///
/// The size could be lagging behind the actual number, in case
/// the initial size calculation has not been run (gets triggered on the first size access).
///
/// return size and boolean flag that shows if the size is exact
pub(crate) fn get_current_logical_size(
self: &Arc<Self>,
ctx: &RequestContext,
) -> logical_size::CurrentLogicalSize {
let current_size = self.current_logical_size.current_size();
debug!("Current size: {current_size:?}");

if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
(current_size, self.current_logical_size.initial_part_end)
{
self.try_spawn_size_init_task(initial_part_end, ctx);
}

if let CurrentLogicalSize::Approximate(_) = &current_size {
if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
let first = self
.current_logical_size
.did_return_approximate_to_walreceiver
.compare_exchange(
false,
true,
AtomicOrdering::Relaxed,
AtomicOrdering::Relaxed,
)
.is_ok();
if first {
crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
}
}
}

current_size
}

/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
/// the in-memory layer, and initiate flushing it if so.
///
@@ -938,6 +907,7 @@ impl Timeline {
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
) {
self.spawn_initial_logical_size_computation_task(ctx);
self.launch_wal_receiver(ctx, broker_client);
self.set_state(TimelineState::Active);
self.launch_eviction_task(background_jobs_can_start);
@@ -1051,17 +1021,6 @@ impl Timeline {
error!("Not activating a Stopping timeline");
}
(_, new_state) => {
if matches!(
new_state,
TimelineState::Stopping | TimelineState::Broken { .. }
) {
// drop the completion guard, if any; it might be holding off the completion
// forever needlessly
self.initial_logical_size_attempt
.lock()
.unwrap_or_else(|e| e.into_inner())
.take();
}
self.state.send_replace(new_state);
}
}
@@ -1380,11 +1339,10 @@ impl Timeline {
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
generation: Generation,
shard_identity: ShardIdentity,
walredo_mgr: Arc<super::WalRedoManager>,
resources: TimelineResources,
pg_version: u32,
initial_logical_size_can_start: Option<completion::Barrier>,
initial_logical_size_attempt: Option<completion::Completion>,
state: TimelineState,
cancel: CancellationToken,
) -> Arc<Self> {
@@ -1411,6 +1369,7 @@ impl Timeline {
timeline_id,
tenant_shard_id,
generation,
shard_identity,
pg_version,
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
wanted_image_layers: Mutex::new(None),
@@ -1484,8 +1443,6 @@ impl Timeline {
),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),

initial_logical_size_can_start,
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
cancel,
gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),

@@ -1797,39 +1754,91 @@ impl Timeline {
Ok(())
}

fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
let state = self.current_state();
if matches!(
state,
TimelineState::Broken { .. } | TimelineState::Stopping
) {
// Can happen when timeline detail endpoint is used when deletion is ongoing (or it's broken).
return;
/// Retrieve current logical size of the timeline.
///
/// The size could be lagging behind the actual number, in case
/// the initial size calculation has not been run (gets triggered on the first size access).
///
/// return size and boolean flag that shows if the size is exact
pub(crate) fn get_current_logical_size(
self: &Arc<Self>,
priority: GetLogicalSizePriority,
ctx: &RequestContext,
) -> logical_size::CurrentLogicalSize {
let current_size = self.current_logical_size.current_size();
debug!("Current size: {current_size:?}");

match (current_size.accuracy(), priority) {
(logical_size::Accuracy::Exact, _) => (), // nothing to do
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
// background task will eventually deliver an exact value, we're in no rush
}
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
// background task is not ready, but user is asking for it now;
// => make the background task skip the line
// (The alternative would be to calculate the size here, but,
// it can actually take a long time if the user has a lot of rels.
// And we'll inevitably need it again; So, let the background task do the work.)
match self
.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore
.get()
{
Some(cancel) => cancel.cancel(),
None => {
let state = self.current_state();
if matches!(
state,
TimelineState::Broken { .. } | TimelineState::Stopping
) {

// Can happen when timeline detail endpoint is used when deletion is ongoing (or it's broken).
// Don't make noise.
} else {
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
}
}
};
}
}

let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
.try_acquire_owned()
{
Ok(permit) => permit,
Err(TryAcquireError::NoPermits) => {
// computation already ongoing or finished with success
return;
if let CurrentLogicalSize::Approximate(_) = &current_size {
if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
let first = self
.current_logical_size
.did_return_approximate_to_walreceiver
.compare_exchange(
false,
true,
AtomicOrdering::Relaxed,
AtomicOrdering::Relaxed,
)
.is_ok();
if first {
crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
}
}
Err(TryAcquireError::Closed) => unreachable!("we never call close"),
};
debug_assert!(self
.current_logical_size
.initial_logical_size
.get()
.is_none());
}

current_size
}

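The priority boost above works by cancelling the background task's wait on the concurrency limiter, rather than computing the size inline. A runnable sketch of the mechanism, with a long sleep standing in for the rate limiter (names are illustrative, not the pageserver API):

    use tokio_util::sync::CancellationToken;

    async fn background_worker(skip_limiter: CancellationToken) {
        tokio::select! {
            // normally: wait for a semaphore permit / rate limiter here
            _ = tokio::time::sleep(std::time::Duration::from_secs(3600)) => {}
            // a user request cancelled the wait: skip the line
            _ = skip_limiter.cancelled() => {}
        }
        // ... run the expensive calculation ...
    }

    #[tokio::main]
    async fn main() {
        let token = CancellationToken::new();
        let worker = tokio::spawn(background_worker(token.clone()));
        token.cancel(); // user asked for the value now
        worker.await.unwrap();
    }
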
fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
// nothing to do for freshly created timelines;
assert_eq!(
self.current_logical_size.current_size().accuracy(),
logical_size::Accuracy::Exact,
);
return;
};

let cancel_wait_for_background_loop_concurrency_limit_semaphore = CancellationToken::new();
let token = cancel_wait_for_background_loop_concurrency_limit_semaphore.clone();
self.current_logical_size
.cancel_wait_for_background_loop_concurrency_limit_semaphore.set(token)
.expect("initial logical size calculation task must be spawned exactly once per Timeline object");

info!(
"spawning logical size computation from context of task kind {:?}",
ctx.task_kind()
);
let causing_task_kind = ctx.task_kind();
// We need to start the computation task.
// It gets a separate context since it will outlive the request that called this function.
let self_clone = Arc::clone(self);
let background_ctx = ctx.detached_child(
TaskKind::InitialLogicalSizeCalculation,
@@ -1844,96 +1853,152 @@ impl Timeline {
false,
// NB: don't log errors here, task_mgr will do that.
async move {

let cancel = task_mgr::shutdown_token();
self_clone
.initial_logical_size_calculation_task(
initial_part_end,
cancel_wait_for_background_loop_concurrency_limit_semaphore,
cancel,
background_ctx,
)
.await;
Ok(())
}
.instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
);
}

// in case we were created during pageserver initialization, wait for
// initialization to complete before proceeding. startup time init runs on the same
// runtime.
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
async fn initial_logical_size_calculation_task(
self: Arc<Self>,
initial_part_end: Lsn,
skip_concurrency_limiter: CancellationToken,
cancel: CancellationToken,
background_ctx: RequestContext,
) {
enum BackgroundCalculationError {
Cancelled,
Other(anyhow::Error),
}

let try_once = |attempt: usize| {
let background_ctx = &background_ctx;
let self_ref = &self;
let skip_concurrency_limiter = &skip_concurrency_limiter;
async move {
let cancel = task_mgr::shutdown_token();
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
BackgroundLoopKind::InitialLogicalSizeCalculation,
background_ctx,
&cancel,
);

use crate::metrics::initial_logical_size::StartCircumstances;
let (_maybe_permit, circumstances) = tokio::select! {
res = wait_for_permit => {
match res {
Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
Err(RateLimitError::Cancelled) => {
return Err(BackgroundCalculationError::Cancelled);
}
}
}
() = skip_concurrency_limiter.cancelled() => {
// Some action that is part of an end user interaction requested logical size
// => break out of the rate limit
// TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
// but then again what happens if they cancel; also, we should just be using
// one runtime across the entire process, so, let's leave this for now.
(None, StartCircumstances::SkippedConcurrencyLimiter)
}
};


// hold off background tasks from starting until all timelines get to try at least
// once initial logical size calculation; though retry will rarely be useful.
// holding off is done because heavier tasks execute blockingly on the same
// runtime.
//
// dropping this at every outcome is probably better than trying to cling on to it,
// delay will be terminated by a timeout regardless.
let completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };

let metrics_guard = match &completion {
Some(_) => crate::metrics::initial_logical_size::START_CALCULATION.first(Some(causing_task_kind)),
None => crate::metrics::initial_logical_size::START_CALCULATION.retry(Some(causing_task_kind)),
let metrics_guard = if attempt == 1 {
crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
} else {
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
};

let calculated_size = match self_clone
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
match self_ref
.logical_size_calculation_task(
initial_part_end,
LogicalSizeCalculationCause::Initial,
background_ctx,
)
.await
{
Ok(s) => s,
Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
Err(CalculateLogicalSizeError::Cancelled) => {
// Don't make noise, this is a common task.
// In the unlikely case that there is another call to this function, we'll retry
// because initial_logical_size is still None.
info!("initial size calculation cancelled, likely timeline delete / tenant detach");
return Ok(());
Err(BackgroundCalculationError::Cancelled)
}
Err(CalculateLogicalSizeError::Other(err)) => {
if let Some(e @ PageReconstructError::AncestorStopping(_)) =
if let Some(PageReconstructError::AncestorStopping(_)) =
err.root_cause().downcast_ref()
{
// This can happen if the timeline's parent timeline switches to
// Stopping state while we're still calculating the initial
// timeline size for the child, for example if the tenant is
// being detached or the pageserver is shut down. Like with
// CalculateLogicalSizeError::Cancelled, don't make noise.
info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
return Ok(());
Err(BackgroundCalculationError::Cancelled)
} else {
Err(BackgroundCalculationError::Other(err))
}
return Err(err.context("Failed to calculate logical size"));
}
};

// we cannot query current_logical_size.current_size() to know the current
// *negative* value, only truncated to u64.
let added = self_clone
.current_logical_size
.size_added_after_initial
.load(AtomicOrdering::Relaxed);

let sum = calculated_size.saturating_add_signed(added);

// set the gauge value before it can be set in `update_current_logical_size`.
self_clone.metrics.current_logical_size_gauge.set(sum);

match self_clone
.current_logical_size
.initial_logical_size
.set((calculated_size, metrics_guard.calculation_result_saved()))
{
Ok(()) => (),
Err(_what_we_just_attempted_to_set) => {
let (existing_size, _) = self_clone
.current_logical_size
.initial_logical_size
.get()
.expect("once_cell set was lost, then get failed, impossible.");
// This shouldn't happen because the semaphore is initialized with 1.
// But if it happens, just complain & report success so there are no further retries.
error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
}
}
// now that `initial_logical_size.is_some()`, reduce permit count to 0
// so that we prevent future callers from spawning this task
permit.forget();
Ok(())
}.in_current_span(),
);
}
};

let retrying = async {
let mut attempt = 0;
loop {
attempt += 1;

match try_once(attempt).await {
Ok(res) => return ControlFlow::Continue(res),
Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
Err(BackgroundCalculationError::Other(e)) => {
warn!(attempt, "initial size calculation failed: {e:?}");
// exponential back-off doesn't make sense at these long intervals;
// use fixed retry interval with generous jitter instead
let sleep_duration = Duration::from_secs(
u64::try_from(
// 1hour base
(60_i64 * 60_i64)
// 10min jitter
+ rand::thread_rng().gen_range(-10 * 60..10 * 60),
)
.expect("10min < 1hour"),
);
tokio::time::sleep(sleep_duration).await;
}
}
}
};

let (calculated_size, metrics_guard) = tokio::select! {
res = retrying => {
match res {
ControlFlow::Continue(calculated_size) => calculated_size,
ControlFlow::Break(()) => return,
}
}
_ = cancel.cancelled() => {
return;
}
};

// we cannot query current_logical_size.current_size() to know the current
// *negative* value, only truncated to u64.
let added = self
.current_logical_size
.size_added_after_initial
.load(AtomicOrdering::Relaxed);

let sum = calculated_size.saturating_add_signed(added);

// set the gauge value before it can be set in `update_current_logical_size`.
self.metrics.current_logical_size_gauge.set(sum);

self.current_logical_size
.initial_logical_size
.set((calculated_size, metrics_guard.calculation_result_saved()))
.ok()
.expect("only this task sets it");
}

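The retry loop above deliberately avoids exponential backoff: at hour-long intervals, a fixed base plus generous jitter is simpler and spreads retries just as well. A sketch of that policy under the same `ControlFlow` shape (sleep shortened so the example terminates quickly; the real interval is computed but unused here):

    use rand::Rng;
    use std::ops::ControlFlow;
    use std::time::Duration;

    async fn try_once(attempt: usize) -> Result<u64, String> {
        if attempt < 3 { Err("transient".into()) } else { Ok(42) }
    }

    async fn retrying() -> ControlFlow<(), u64> {
        let mut attempt = 0;
        loop {
            attempt += 1;
            match try_once(attempt).await {
                Ok(v) => return ControlFlow::Continue(v),
                Err(e) => {
                    eprintln!("attempt {attempt} failed: {e}");
                    // 1 hour base, +/- 10 minutes of jitter.
                    let base = 60_i64 * 60;
                    let jitter = rand::thread_rng().gen_range(-600_i64..600);
                    let _full = Duration::from_secs(
                        u64::try_from(base + jitter).expect("jitter < base"),
                    );
                    // sleep a token amount so the example finishes
                    tokio::time::sleep(Duration::from_millis(1)).await;
                }
            }
        }
    }

    #[tokio::main]
    async fn main() {
        if let ControlFlow::Continue(v) = retrying().await {
            println!("got {v}");
        }
    }
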
pub fn spawn_ondemand_logical_size_calculation(
@@ -1971,6 +2036,9 @@ impl Timeline {
receiver
}

/// # Cancel-Safety
///
/// This method is cancellation-safe.
#[instrument(skip_all)]
async fn logical_size_calculation_task(
self: &Arc<Self>,
@@ -2008,6 +2076,10 @@ impl Timeline {
///
/// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
/// especially if we need to download remote layers.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn calculate_logical_size(
&self,
up_to_lsn: Lsn,
@@ -2123,6 +2195,10 @@ impl Timeline {
///
/// This function takes the current timeline's locked LayerMap as an argument,
/// so callers can avoid potential race conditions.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
async fn get_reconstruct_data(
&self,
key: Key,
@@ -2371,6 +2447,9 @@ impl Timeline {
}
}

/// # Cancel-safety
///
/// This method is cancellation-safe.
async fn lookup_cached_page(
&self,
key: &Key,
@@ -2405,6 +2484,10 @@ impl Timeline {
Ok(Arc::clone(ancestor))
}

pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
&self.shard_identity
}

///
/// Get a handle to the latest layer for appending.
///
@@ -2441,7 +2524,7 @@ impl Timeline {
Ok(())
}

fn finish_write(&self, new_lsn: Lsn) {
pub(crate) fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());

self.metrics.last_record_gauge.set(new_lsn.0 as i64);
@@ -3888,7 +3971,7 @@ impl Timeline {
// for details. This will block until the old value is no longer in use.
//
// The GC cutoff should only ever move forwards.
{
let waitlist = {
let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
ensure!(
*write_guard <= new_gc_cutoff,
@@ -3896,8 +3979,9 @@ impl Timeline {
*write_guard,
new_gc_cutoff
);
write_guard.store_and_unlock(new_gc_cutoff).wait();
}
write_guard.store_and_unlock(new_gc_cutoff)
};
waitlist.wait().await;

info!("GC starting");

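`store_and_unlock` returning a waitlist lets the GC cutoff hunk above release the lock before the `.await`, instead of waiting while the guard is held. A toy illustration of splitting "publish" from "drain" (types are illustrative, not the real RCU):

    use std::sync::{Arc, Mutex};

    struct Rcu {
        value: Mutex<u64>,
    }

    impl Rcu {
        fn store_and_unlock(&self, new: u64) -> Waitlist {
            let mut guard = self.value.lock().unwrap();
            *guard = new;
            // the guard drops here; waiting happens via the returned handle
            Waitlist
        }
    }

    struct Waitlist;

    impl Waitlist {
        async fn wait(self) {
            // in the real code: wait until all readers of the old value are gone
            tokio::task::yield_now().await;
        }
    }

    #[tokio::main]
    async fn main() {
        let rcu = Arc::new(Rcu { value: Mutex::new(1) });
        let waitlist = rcu.store_and_unlock(2); // lock released before awaiting
        waitlist.wait().await;
        println!("old readers drained");
    }
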
@@ -21,7 +21,6 @@ use crate::{
},
CreateTimelineCause, DeleteTimelineError, Tenant,
},
InitializationOrder,
};

use super::{Timeline, TimelineResources};
@@ -407,7 +406,6 @@ impl DeleteTimelineFlow {
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: DeletionQueueClient,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
@@ -420,7 +418,6 @@ impl DeleteTimelineFlow {
remote_client,
deletion_queue_client,
},
init_order,
// Important. We don't pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,

@@ -243,7 +243,7 @@ impl LayerManager {
// map index without actually rebuilding the index.
updates.remove_historic(desc);
mapping.remove(layer);
layer.garbage_collect_on_drop();
layer.delete_on_drop();
}

pub(crate) fn contains(&self, layer: &Layer) -> bool {

@@ -1,11 +1,10 @@
use anyhow::Context;
use once_cell::sync::OnceCell;

use tokio::sync::Semaphore;
use once_cell::sync::OnceCell;
use tokio_util::sync::CancellationToken;
use utils::lsn::Lsn;

use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
use std::sync::Arc;

/// Internal structure to hold all data needed for logical size calculation.
///
@@ -28,8 +27,12 @@ pub(super) struct LogicalSize {
crate::metrics::initial_logical_size::FinishedCalculationGuard,
)>,

/// Semaphore to track ongoing calculation of `initial_logical_size`.
pub initial_size_computation: Arc<tokio::sync::Semaphore>,
/// Cancellation for the best-effort logical size calculation.
///
/// The token is kept in a once-cell so that we can error out if a higher priority
/// request comes in *before* we have started the normal logical size calculation.
pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
OnceCell<CancellationToken>,

/// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
pub initial_part_end: Option<Lsn>,
@@ -72,7 +75,7 @@ pub(crate) enum CurrentLogicalSize {
Exact(Exact),
}

#[derive(Debug, Copy, Clone)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) enum Accuracy {
Approximate,
Exact,
@@ -115,11 +118,10 @@ impl LogicalSize {
Self {
initial_logical_size: OnceCell::with_value((0, {
crate::metrics::initial_logical_size::START_CALCULATION
.first(None)
.first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
.calculation_result_saved()
})),
// initial_logical_size already computed, so, don't admit any calculations
initial_size_computation: Arc::new(Semaphore::new(0)),
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
initial_part_end: None,
size_added_after_initial: AtomicI64::new(0),
did_return_approximate_to_walreceiver: AtomicBool::new(false),
@@ -129,7 +131,7 @@ impl LogicalSize {
pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
Self {
initial_logical_size: OnceCell::new(),
initial_size_computation: Arc::new(Semaphore::new(1)),
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
initial_part_end: Some(compute_to),
size_added_after_initial: AtomicI64::new(0),
did_return_approximate_to_walreceiver: AtomicBool::new(false),

@@ -397,7 +397,10 @@ pub(super) async fn handle_walreceiver_connection(
// Send the replication feedback message.
// Regular standby_status_update fields are put into this message.
let current_timeline_size = timeline
.get_current_logical_size(&ctx)
.get_current_logical_size(
crate::tenant::timeline::GetLogicalSizePriority::User,
&ctx,
)
// FIXME: https://github.com/neondatabase/neon/issues/5963
.size_dont_care_about_accuracy();
let status_update = PageserverFeedback {

@@ -288,6 +288,9 @@ impl VirtualFile {
}
let (handle, mut slot_guard) = get_open_files().find_victim_slot();

// NB: there is also StorageIoOperation::OpenAfterReplace which is for the case
// where our caller doesn't get to use the returned VirtualFile before its
// slot gets re-used by someone else.
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.observe_closure_duration(|| open_options.open(path))?;
@@ -311,6 +314,9 @@ impl VirtualFile {
timeline_id,
};

// TODO: Under pressure, it's likely the slot will get re-used and
// the underlying file closed before they get around to using it.
// => https://github.com/neondatabase/neon/issues/6065
slot_guard.file.replace(file);

Ok(vfile)
@@ -421,9 +427,12 @@ impl VirtualFile {
// now locked in write-mode. Find a free slot to put it in.
let (handle, mut slot_guard) = open_files.find_victim_slot();

// Open the physical file
// Re-open the physical file.
// NB: we use StorageIoOperation::OpenAfterReplace for this to distinguish this
// case from StorageIoOperation::Open. This helps with identifying thrashing
// of the virtual file descriptor cache.
let file = STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Open)
.get(StorageIoOperation::OpenAfterReplace)
.observe_closure_duration(|| self.open_options.open(&self.path))?;

// Perform the requested operation on it
@@ -610,9 +619,11 @@ impl Drop for VirtualFile {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(slot_guard.file.take()));
if let Some(fd) = slot_guard.file.take() {
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(fd));
}
}
}
}
@@ -643,6 +654,7 @@ pub fn init(num_slots: usize) {
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
panic!("virtual_file::init called twice");
}
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}

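Timing re-opens under a separate `OpenAfterReplace` label is what makes fd-cache thrashing visible: both paths call `open()`, and without distinct labels constant re-opens are indistinguishable from normal opens. A simplified sketch of the labelled-timing idea (assumes a Unix `/dev/null`; the helper is illustrative, not the real metrics API):

    use std::time::Instant;

    #[derive(Debug)]
    enum StorageIoOperation {
        Open,
        OpenAfterReplace,
    }

    // Time a closure under an operation label, like observe_closure_duration.
    fn observe<T>(op: StorageIoOperation, f: impl FnOnce() -> T) -> T {
        let started = Instant::now();
        let res = f();
        println!("{op:?} took {:?}", started.elapsed());
        res
    }

    fn main() {
        let _f = observe(StorageIoOperation::Open, || std::fs::File::open("/dev/null"));
        // later, when a victim slot was reclaimed and we must re-open:
        let _f = observe(StorageIoOperation::OpenAfterReplace, || {
            std::fs::File::open("/dev/null")
        });
    }
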
const TEST_MAX_FILE_DESCRIPTORS: usize = 10;

@@ -21,6 +21,7 @@
//! redo Postgres process, but some records it can handle directly with
//! bespoke Rust code.

use pageserver_api::shard::ShardIdentity;
use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -30,6 +31,7 @@ use bytes::{Buf, Bytes, BytesMut};
use tracing::*;

use crate::context::RequestContext;
use crate::metrics::WAL_INGEST;
use crate::pgdatadir_mapping::*;
use crate::tenant::PageReconstructError;
use crate::tenant::Timeline;
@@ -46,6 +48,7 @@ use postgres_ffi::BLCKSZ;
use utils::lsn::Lsn;

pub struct WalIngest<'a> {
shard: ShardIdentity,
timeline: &'a Timeline,

checkpoint: CheckPoint,
@@ -65,6 +68,7 @@ impl<'a> WalIngest<'a> {
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);

Ok(WalIngest {
shard: *timeline.get_shard_identity(),
timeline,
checkpoint,
checkpoint_modified: false,
@@ -87,6 +91,8 @@ impl<'a> WalIngest<'a> {
decoded: &mut DecodedWALRecord,
ctx: &RequestContext,
) -> anyhow::Result<()> {
WAL_INGEST.records_received.inc();

modification.lsn = lsn;
decode_wal_record(recdata, decoded, self.timeline.pg_version)?;

@@ -355,6 +361,33 @@ impl<'a> WalIngest<'a> {
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};

let key = rel_block_to_key(rel, blk.blkno);
let key_is_local = self.shard.is_key_local(&key);

tracing::debug!(
lsn=%lsn,
key=%key,
"ingest: shard decision {} (checkpoint={})",
if !key_is_local { "drop" } else { "keep" },
self.checkpoint_modified
);

if !key_is_local {
if self.shard.is_zero() {
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
// its blkno in case it implicitly extends a relation.
self.observe_decoded_block(modification, blk, ctx).await?;
}

continue;
}
self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
.await?;
}
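
The per-block shard decision above boils down to: map each decoded block to a key and keep only keys this shard owns. An illustrative standalone sketch, with a stand-in hash in place of the real key-to-shard mapping:

    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Illustrative shard identity; not the pageserver's ShardIdentity.
    struct ShardIdentity {
        number: u32,
        count: u32,
    }

    impl ShardIdentity {
        fn is_key_local(&self, key: &u64) -> bool {
            let mut h = DefaultHasher::new();
            key.hash(&mut h);
            (h.finish() % self.count as u64) as u32 == self.number
        }
    }

    fn main() {
        let shard = ShardIdentity { number: 0, count: 4 };
        for key in 0u64..8 {
            let action = if shard.is_key_local(&key) { "keep" } else { "drop" };
            println!("key {key}: {action}");
        }
    }
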
@@ -367,13 +400,38 @@ impl<'a> WalIngest<'a> {
self.checkpoint_modified = false;
}

if modification.is_empty() {
tracing::debug!("ingest: filtered out record @ LSN {lsn}");
WAL_INGEST.records_filtered.inc();
modification.tline.finish_write(lsn);
} else {
WAL_INGEST.records_committed.inc();
modification.commit(ctx).await?;
}

// Now that this record has been fully handled, including updating the
// checkpoint data, let the repository know that it is up-to-date to this LSN
modification.commit(ctx).await?;
// checkpoint data, let the repository know that it is up-to-date to this LSN.

Ok(())
}

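When every block of a record was filtered out for other shards, the code above advances the last-record LSN via `finish_write` instead of committing an empty modification, so the shard never appears stalled. A toy sketch of that fast path:

    struct Modification {
        pending: usize,
    }

    impl Modification {
        fn is_empty(&self) -> bool {
            self.pending == 0
        }
    }

    fn handle_record(m: &Modification, lsn: u64, filtered: &mut u64, committed: &mut u64) {
        if m.is_empty() {
            // nothing to store, but the LSN must still advance
            *filtered += 1;
            println!("advance last_record_lsn to {lsn} without a commit");
        } else {
            *committed += 1;
            println!("commit {} pending writes at {lsn}", m.pending);
        }
    }

    fn main() {
        let (mut filtered, mut committed) = (0, 0);
        handle_record(&Modification { pending: 0 }, 100, &mut filtered, &mut committed);
        handle_record(&Modification { pending: 3 }, 200, &mut filtered, &mut committed);
    }
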
/// Do not store this block, but observe it for the purposes of updating our relation size state.
async fn observe_decoded_block(
&mut self,
modification: &mut DatadirModification<'_>,
blk: &DecodedBkpBlock,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
let rel = RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum,
};
self.handle_rel_extend(modification, rel, blk.blkno, ctx)
.await
}

async fn ingest_decoded_block(
&mut self,
modification: &mut DatadirModification<'_>,
@@ -1465,8 +1523,15 @@ impl<'a> WalIngest<'a> {
//info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
modification.put_rel_extend(rel, new_nblocks, ctx).await?;

let mut key = rel_block_to_key(rel, blknum);
// fill the gap with zeros
for gap_blknum in old_nblocks..blknum {
key.field6 = gap_blknum;

if self.shard.get_shard_number(&key) != self.shard.number {
continue;
}

modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
}
}

@@ -34,7 +34,6 @@ use std::process::{Child, ChildStdin, ChildStdout, Command};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::Duration;
use std::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

@@ -124,7 +123,9 @@ impl PostgresRedoManager {
/// The WAL redo is handled by a separate thread, so this just sends a request
/// to the thread and waits for response.
///
/// CANCEL SAFETY: NOT CANCEL SAFE.
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn request_redo(
&self,
key: Key,
@@ -157,7 +158,6 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
)
.await
};
img = Some(result?);

@@ -178,7 +178,6 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
)
.await
}
}
}
@@ -216,7 +215,7 @@ impl PostgresRedoManager {
/// Process one request for WAL redo using wal-redo postgres
///
#[allow(clippy::too_many_arguments)]
async fn apply_batch_postgres(
fn apply_batch_postgres(
&self,
key: Key,
lsn: Lsn,
@@ -332,12 +331,7 @@ impl PostgresRedoManager {
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
// This probably needs revisiting at some later point.
let mut wait_done = proc.stderr_logger_task_done.clone();
drop(proc);
wait_done
.wait_for(|v| *v)
.await
.expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
} else if n_attempts != 0 {
info!(n_attempts, "retried walredo succeeded");
}
@@ -649,8 +643,6 @@ struct WalRedoProcess {
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
stdin: Mutex<ProcessInput>,
stderr_logger_cancel: CancellationToken,
stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
@@ -699,6 +691,8 @@ impl WalRedoProcess {
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
@@ -710,69 +704,45 @@ impl WalRedoProcess {
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
set_nonblock_or_log_err!(stderr)?;

let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;

// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);

let stderr_logger_cancel = CancellationToken::new();
let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
tokio::sync::watch::channel(false);
tokio::spawn({
let stderr_logger_cancel = stderr_logger_cancel.clone();
tokio::spawn(
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
let _ = stderr_logger_task_done_tx.send(true);
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
}
debug!("wal-redo-postgres stderr_logger_task started");
loop {
// NB: we purposefully don't do a select! for the cancellation here.
// The cancellation would likely cause us to miss stderr messages.
// We can rely on this to return from .await because when we SIGKILL
// the child, the writing end of the stderr pipe gets closed.
match stderr.readable_mut().await {
Ok(mut guard) => {
let mut errbuf = [0; 16384];
let res = guard.try_io(|fd| {
use std::io::Read;
fd.get_mut().read(&mut errbuf)
});
match res {
Ok(Ok(0)) => {
// it closed the stderr pipe
break;
}
Ok(Ok(n)) => {
// The message might not be split correctly into lines here. But this is
// good enough, the important thing is to get the message to the log.
let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
error!(output, "received output");
},
Ok(Err(e)) => {
error!(error = ?e, "read() error, waiting for cancellation");
stderr_logger_cancel.cancelled().await;
error!(error = ?e, "read() error, cancellation complete");
break;
}
Err(e) => {
let _e: tokio::io::unix::TryIoError = e;
// the read() returned WouldBlock, that's expected
}
}
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();

use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
}
Err(e) => {
error!(error = ?e, "read() error, waiting for cancellation");
stderr_logger_cancel.cancelled().await;
error!(error = ?e, "read() error, cancellation complete");
break;
break Err(e);
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
});
);

Ok(Self {
|
||||
conf,
|
||||
@@ -787,8 +757,6 @@ impl WalRedoProcess {
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
stderr_logger_cancel,
|
||||
stderr_logger_task_done: stderr_logger_task_done_rx,
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
@@ -1029,7 +997,6 @@ impl Drop for WalRedoProcess {
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
self.stderr_logger_cancel.cancel();
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
|
||||
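Both the old and the new stderr logger above lean on the same pipe invariant: once the walredo child is SIGKILLed, the write end of its stderr pipe closes and the pending read returns 0 (EOF), so the logger task can exit without racing a cancellation select!. A minimal, self-contained C sketch of that invariant (a hypothetical demo, not Neon code):

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/wait.h>

int
main(void)
{
    int pipefd[2];
    pid_t child;
    char buf[256];
    ssize_t n;

    if (pipe(pipefd) < 0)
        exit(1);
    child = fork();
    if (child == 0)
    {
        /* child: write one diagnostic line, then hang forever */
        close(pipefd[0]);
        dprintf(pipefd[1], "something went wrong\n");
        pause();
    }
    close(pipefd[1]);           /* parent keeps only the read end */
    n = read(pipefd[0], buf, sizeof(buf));  /* the buffered message */
    if (n > 0)
        fwrite(buf, 1, n, stderr);
    kill(child, SIGKILL);       /* closes the child's write end... */
    while ((n = read(pipefd[0], buf, sizeof(buf))) > 0)
        fwrite(buf, 1, n, stderr);
    /* ...so read() eventually returns 0: the EOF the logger task exits on */
    waitpid(child, NULL, 0);
    return 0;
}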
@@ -41,6 +41,17 @@ libwalproposer.a: $(WALPROP_OBJS)
	rm -f $@
	$(AR) $(AROPT) $@ $^

# needs vars:
# FIND_TYPEDEF pointing to find_typedef
# INDENT pointing to pg_bsd_indent
# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name:
# pgindent will pick it up as pg_bsd_indent path).
.PHONY: pgindent
pgindent:
	+@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir)
	$(FIND_TYPEDEF) . > neon.typedefs
	INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h

PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

@@ -41,7 +41,7 @@ static char *ConsoleURL = NULL;
static bool ForwardDDL = true;

/* Curl structures for sending the HTTP requests */
static CURL * CurlHandle;
static CURL *CurlHandle;
static struct curl_slist *ContentHeader = NULL;

/*
@@ -54,7 +54,7 @@ typedef enum
{
    Op_Set, /* An upsert: Either a creation or an alter */
    Op_Delete,
} OpType;
} OpType;

typedef struct
{
@@ -62,7 +62,7 @@ typedef struct
    Oid owner;
    char old_name[NAMEDATALEN];
    OpType type;
} DbEntry;
} DbEntry;

typedef struct
{
@@ -70,7 +70,7 @@ typedef struct
    char old_name[NAMEDATALEN];
    const char *password;
    OpType type;
} RoleEntry;
} RoleEntry;

/*
 * We keep one of these for each subtransaction in a stack. When a subtransaction
@@ -82,10 +82,10 @@ typedef struct DdlHashTable
    struct DdlHashTable *prev_table;
    HTAB *db_table;
    HTAB *role_table;
} DdlHashTable;
} DdlHashTable;

static DdlHashTable RootTable;
static DdlHashTable * CurrentDdlTable = &RootTable;
static DdlHashTable *CurrentDdlTable = &RootTable;

static void
PushKeyValue(JsonbParseState **state, char *key, char *value)
@@ -199,7 +199,7 @@ typedef struct
{
    char str[ERROR_SIZE];
    size_t size;
} ErrorString;
} ErrorString;

static size_t
ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
@@ -478,7 +478,7 @@ NeonXactCallback(XactEvent event, void *arg)
static bool
RoleIsNeonSuperuser(const char *role_name)
{
    return strcmp(role_name, "neon_superuser") == 0;
    return strcmp(role_name, "neon_superuser") == 0;
}

static void
@@ -509,6 +509,7 @@ HandleCreateDb(CreatedbStmt *stmt)
    if (downer && downer->arg)
    {
        const char *owner_name = defGetString(downer);

        if (RoleIsNeonSuperuser(owner_name))
            elog(ERROR, "can't create a database with owner neon_superuser");
        entry->owner = get_role_oid(owner_name, false);
@@ -536,6 +537,7 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
    if (!found)
        memset(entry->old_name, 0, sizeof(entry->old_name));
    const char *new_owner = get_rolespec_name(stmt->newowner);

    if (RoleIsNeonSuperuser(new_owner))
        elog(ERROR, "can't alter owner to neon_superuser");
    entry->owner = get_role_oid(new_owner, false);
@@ -633,6 +635,7 @@ HandleAlterRole(AlterRoleStmt *stmt)
    DefElem *dpass = NULL;
    ListCell *option;
    const char *role_name = stmt->role->rolename;

    if (RoleIsNeonSuperuser(role_name))
        elog(ERROR, "can't ALTER neon_superuser");


@@ -25,79 +25,81 @@

#include <curl/curl.h>

static int extension_server_port = 0;
static int extension_server_port = 0;

static download_extension_file_hook_type prev_download_extension_file_hook = NULL;

// to download all SQL (and data) files for an extension:
// curl -X POST http://localhost:8080/extension_server/postgis
// it covers two possible extension files layouts:
// 1. extension_name--version--platform.sql
// 2. extension_name/extension_name--version.sql
//    extension_name/extra_files.csv
//
// to download specific library file:
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
/*
 * to download all SQL (and data) files for an extension:
 * curl -X POST http://localhost:8080/extension_server/postgis
 * it covers two possible extension files layouts:
 * 1. extension_name--version--platform.sql
 * 2. extension_name/extension_name--version.sql
 * extension_name/extra_files.csv
 * to download specific library file:
 * curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
 */
static bool
neon_download_extension_file_http(const char *filename, bool is_library)
{
    CURL *curl;
    CURLcode res;
    char *compute_ctl_url;
    char *postdata;
    bool ret = false;
    CURL *curl;
    CURLcode res;
    char *compute_ctl_url;
    char *postdata;
    bool ret = false;

    if ((curl = curl_easy_init()) == NULL)
    {
        elog(ERROR, "Failed to initialize curl handle");
    }
    if ((curl = curl_easy_init()) == NULL)
    {
        elog(ERROR, "Failed to initialize curl handle");
    }

    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
        extension_server_port, filename, is_library ? "?is_library=true" : "");
    compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
        extension_server_port, filename, is_library ? "?is_library=true" : "");

    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
    elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);

    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
    curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
    curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );

    if (curl)
    {
        /* Perform the request, res will get the return code */
        res = curl_easy_perform(curl);
        /* Check for errors */
        if (res == CURLE_OK)
        {
            ret = true;
        }
        else
        {
            // Don't error here because postgres will try to find the file
            // and will fail with some proper error message if it's not found.
            elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
        }
    if (curl)
    {
        /* Perform the request, res will get the return code */
        res = curl_easy_perform(curl);
        /* Check for errors */
        if (res == CURLE_OK)
        {
            ret = true;
        }
        else
        {
            /* Don't error here because postgres will try to find the file */
            /* and will fail with some proper error message if it's not found. */
            elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
        }

        /* always cleanup */
        curl_easy_cleanup(curl);
    }
        /* always cleanup */
        curl_easy_cleanup(curl);
    }

    return ret;
    return ret;
}

void pg_init_extension_server()
void
pg_init_extension_server()
{
    // Port to connect to compute_ctl on localhost
    // to request extension files.
    DefineCustomIntVariable("neon.extension_server_port",
        "connection string to the compute_ctl",
        NULL,
        &extension_server_port,
        0, 0, INT_MAX,
        PGC_POSTMASTER,
        0, /* no flags required */
        NULL, NULL, NULL);
    /* Port to connect to compute_ctl on localhost */
    /* to request extension files. */
    DefineCustomIntVariable("neon.extension_server_port",
        "connection string to the compute_ctl",
        NULL,
        &extension_server_port,
        0, 0, INT_MAX,
        PGC_POSTMASTER,
        0, /* no flags required */
        NULL, NULL, NULL);

    // set download_extension_file_hook
    prev_download_extension_file_hook = download_extension_file_hook;
    download_extension_file_hook = neon_download_extension_file_http;
    /* set download_extension_file_hook */
    prev_download_extension_file_hook = download_extension_file_hook;
    download_extension_file_hook = neon_download_extension_file_http;
}

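Note that the hunk above saves the previous hook in prev_download_extension_file_hook but installs neon_download_extension_file_http directly. For comparison, the usual PostgreSQL hook-chaining pattern would fall back to the saved hook when the HTTP fetch fails; a hypothetical sketch (the wrapper itself is not in the diff, only the statics it reuses are):

/*
 * Hypothetical fallback wrapper: not what the diff installs, just the
 * conventional chaining shape for a PostgreSQL hook variable.
 */
static bool
neon_download_with_fallback(const char *filename, bool is_library)
{
    /* try the compute_ctl HTTP endpoint first */
    if (neon_download_extension_file_http(filename, is_library))
        return true;
    /* otherwise defer to whatever hook was installed before us */
    if (prev_download_extension_file_hook)
        return prev_download_extension_file_hook(filename, is_library);
    return false;
}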
@@ -67,32 +67,34 @@
typedef struct FileCacheEntry
{
    BufferTag key;
    uint32 hash;
    uint32 hash;
    uint32 offset;
    uint32 access_count;
    uint32 bitmap[BLOCKS_PER_CHUNK/32];
    dlist_node lru_node; /* LRU list node */
    uint32 bitmap[BLOCKS_PER_CHUNK / 32];
    dlist_node lru_node; /* LRU list node */
} FileCacheEntry;

typedef struct FileCacheControl
{
    uint64 generation; /* generation is needed to handle correct hash reenabling */
    uint32 size; /* size of cache file in chunks */
    uint32 used; /* number of used chunks */
    uint32 limit; /* shared copy of lfc_size_limit */
    uint64 hits;
    uint64 misses;
    uint64 writes;
    dlist_head lru; /* double linked list for LRU replacement algorithm */
    uint64 generation; /* generation is needed to handle correct hash
                        * reenabling */
    uint32 size; /* size of cache file in chunks */
    uint32 used; /* number of used chunks */
    uint32 limit; /* shared copy of lfc_size_limit */
    uint64 hits;
    uint64 misses;
    uint64 writes;
    dlist_head lru; /* double linked list for LRU replacement
                     * algorithm */
} FileCacheControl;

static HTAB* lfc_hash;
static int lfc_desc = 0;
static HTAB *lfc_hash;
static int lfc_desc = 0;
static LWLockId lfc_lock;
static int lfc_max_size;
static int lfc_size_limit;
static char* lfc_path;
static FileCacheControl* lfc_ctl;
static int lfc_max_size;
static int lfc_size_limit;
static char *lfc_path;
static FileCacheControl *lfc_ctl;
static shmem_startup_hook_type prev_shmem_startup_hook;
#if PG_VERSION_NUM>=150000
static shmem_request_hook_type prev_shmem_request_hook;
@@ -100,7 +102,7 @@ static shmem_request_hook_type prev_shmem_request_hook;

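For orientation: FileCacheEntry above describes one cache chunk of BLOCKS_PER_CHUNK blocks, with one presence bit per block. The addressing math the rest of this file repeats inline can be summarized in a short sketch; it assumes BLOCKS_PER_CHUNK is a power of two, which the bit masks in the diff already require:

/* Sketch of the chunk addressing used throughout this file (hypothetical
 * helpers; the real code inlines these expressions). */
static inline uint32
chunk_offset(BlockNumber blkno)
{
    return blkno & (BLOCKS_PER_CHUNK - 1);  /* block's slot inside its chunk */
}

static inline BlockNumber
chunk_base(BlockNumber blkno)
{
    return blkno & ~(BLOCKS_PER_CHUNK - 1); /* tag.blockNum for the chunk */
}

static inline bool
chunk_bit_is_set(FileCacheEntry *entry, uint32 chunk_offs)
{
    /* 32 bits per bitmap word, matching bitmap[BLOCKS_PER_CHUNK / 32] */
    return (entry->bitmap[chunk_offs >> 5] & (1u << (chunk_offs & 31))) != 0;
}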
#define LFC_ENABLED() (lfc_ctl->limit != 0)

void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);

/*
 * Local file cache is optional and Neon can work without it.
@@ -109,9 +111,10 @@ void PGDLLEXPORT FileCacheMonitorMain(Datum main_arg);
 * All cache content should be invalidated to avoid reading of stale or corrupted data
 */
static void
lfc_disable(char const* op)
lfc_disable(char const *op)
{
    int fd;
    int fd;

    elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);

    /* Invalidate hash */
@@ -120,7 +123,7 @@ lfc_disable(char const* op)
    if (LFC_ENABLED())
    {
        HASH_SEQ_STATUS status;
        FileCacheEntry* entry;
        FileCacheEntry *entry;

        hash_seq_init(&status, lfc_hash);
        while ((entry = hash_seq_search(&status)) != NULL)
@@ -135,16 +138,24 @@ lfc_disable(char const* op)

    if (lfc_desc > 0)
    {
        /* If the reason of error is ENOSPC, then truncation of file may help to reclaim some space */
        int rc = ftruncate(lfc_desc, 0);
        /*
         * If the reason of error is ENOSPC, then truncation of file may
         * help to reclaim some space
         */
        int rc = ftruncate(lfc_desc, 0);

        if (rc < 0)
            elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path);
    }
}
    /* We need to use unlink to to avoid races in LFC write, because it is not protectedby */

    /*
     * We need to use unlink to to avoid races in LFC write, because it is not
     * protectedby
     */
    unlink(lfc_path);

    fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
    fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
    if (fd < 0)
        elog(WARNING, "Failed to recreate local file cache %s: %m", lfc_path);
    else
@@ -170,13 +181,15 @@ lfc_maybe_disabled(void)
static bool
lfc_ensure_opened(void)
{
    bool enabled = !lfc_maybe_disabled();
    bool enabled = !lfc_maybe_disabled();

    /* Open cache file if not done yet */
    if (lfc_desc <= 0 && enabled)
    {
        lfc_desc = BasicOpenFile(lfc_path, O_RDWR);

        if (lfc_desc < 0) {
        if (lfc_desc < 0)
        {
            lfc_disable("open");
            return false;
        }
@@ -187,7 +200,7 @@ lfc_ensure_opened(void)
static void
lfc_shmem_startup(void)
{
    bool found;
    bool found;
    static HASHCTL info;

    if (prev_shmem_startup_hook)
@@ -197,17 +210,22 @@ lfc_shmem_startup(void)

    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

    lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
    lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
    if (!found)
    {
        int fd;
        uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
        lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
        int fd;
        uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);

        lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
        info.keysize = sizeof(BufferTag);
        info.entrysize = sizeof(FileCacheEntry);

        /*
         * lfc_size+1 because we add new element to hash table before eviction
         * of victim
         */
        lfc_hash = ShmemInitHash("lfc_hash",
            /* lfc_size+1 because we add new element to hash table before eviction of victim */
            lfc_size+1, lfc_size+1,
            lfc_size + 1, lfc_size + 1,
            &info,
            HASH_ELEM | HASH_BLOBS);
        lfc_ctl->generation = 0;
@@ -219,7 +237,7 @@ lfc_shmem_startup(void)
        dlist_init(&lfc_ctl->lru);

        /* Recreate file cache on restart */
        fd = BasicOpenFile(lfc_path, O_RDWR|O_CREAT|O_TRUNC);
        fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
        if (fd < 0)
        {
            elog(WARNING, "Failed to create local file cache %s: %m", lfc_path);
@@ -242,7 +260,7 @@ lfc_shmem_request(void)
        prev_shmem_request_hook();
#endif

    RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
    RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry)));
    RequestNamedLWLockTranche("lfc_lock", 1);
}

@@ -250,9 +268,11 @@ static bool
is_normal_backend(void)
{
    /*
     * Stats collector detach shared memory, so we should not try to access shared memory here.
     * Parallel workers first assign default value (0), so not perform truncation in parallel workers.
     * The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
     * Stats collector detach shared memory, so we should not try to access
     * shared memory here. Parallel workers first assign default value (0), so
     * not perform truncation in parallel workers. The Postmaster can handle
     * SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL),
     * but has no PGPROC.
     */
    return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
}
@@ -271,7 +291,7 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
static void
lfc_change_limit_hook(int newval, void *extra)
{
    uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
    uint32 new_size = SIZE_MB_TO_CHUNKS(newval);

    if (!is_normal_backend())
        return;
@@ -283,11 +303,15 @@ lfc_change_limit_hook(int newval, void *extra)

    while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
    {
        /* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
        FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
        /*
         * Shrink cache by throwing away least recently accessed chunks and
         * returning their space to file system
         */
        FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));

        Assert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
        if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
        if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
            elog(LOG, "Failed to punch hole in file: %m");
#endif
        hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
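The shrink loop above releases a victim chunk's disk space with fallocate() rather than truncating the file, so the offsets of every surviving chunk stay valid. A sketch of that one step as a hypothetical helper (the diff inlines it; Linux-only, guarded the same way):

#ifdef FALLOC_FL_PUNCH_HOLE
/* Return one chunk's disk space to the filesystem without moving any
 * other chunk: the file length is kept (FALLOC_FL_KEEP_SIZE), only the
 * blocks under the hole are deallocated. */
static void
punch_chunk(int fd, uint32 chunk_slot)
{
    off_t start = (off_t) chunk_slot * BLOCKS_PER_CHUNK * BLCKSZ;

    if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  start, (off_t) BLOCKS_PER_CHUNK * BLCKSZ) < 0)
        elog(LOG, "Failed to punch hole in file: %m");
}
#endif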
@@ -314,7 +338,7 @@ lfc_init(void)
        "Maximal size of Neon local file cache",
        NULL,
        &lfc_max_size,
        0, /* disabled by default */
        0, /* disabled by default */
        0,
        INT_MAX,
        PGC_POSTMASTER,
@@ -327,7 +351,7 @@ lfc_init(void)
        "Current limit for size of Neon local file cache",
        NULL,
        &lfc_size_limit,
        0, /* disabled by default */
        0, /* disabled by default */
        0,
        INT_MAX,
        PGC_SIGHUP,
@@ -367,18 +391,18 @@ lfc_init(void)
bool
lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
{
    BufferTag tag;
    FileCacheEntry* entry;
    int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
    bool found = false;
    uint32 hash;
    BufferTag tag;
    FileCacheEntry *entry;
    int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
    bool found = false;
    uint32 hash;

    if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
    if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
        return false;

    CopyNRelFileInfoToBufTag(tag, rinfo);
    tag.forkNum = forkNum;
    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
    hash = get_hash_value(lfc_hash, &tag);

    LWLockAcquire(lfc_lock, LW_SHARED);
@@ -397,13 +421,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
void
lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
{
    BufferTag tag;
    FileCacheEntry* entry;
    bool found;
    int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
    uint32 hash;
    BufferTag tag;
    FileCacheEntry *entry;
    bool found;
    int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
    uint32 hash;

    if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
    if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
        return;

    CopyNRelFileInfoToBufTag(tag, rinfo);
@@ -438,9 +462,10 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
     */
    if (entry->bitmap[chunk_offs >> 5] == 0)
    {
        bool has_remaining_pages;
        bool has_remaining_pages;

        for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
        for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
        {
            if (entry->bitmap[i] != 0)
            {
                has_remaining_pages = true;
@@ -449,8 +474,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
        }

        /*
         * Put the entry at the position that is first to be reclaimed when
         * we have no cached pages remaining in the chunk
         * Put the entry at the position that is first to be reclaimed when we
         * have no cached pages remaining in the chunk
         */
        if (!has_remaining_pages)
        {
@@ -476,16 +501,16 @@ bool
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
    char *buffer)
{
    BufferTag tag;
    FileCacheEntry* entry;
    ssize_t rc;
    int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
    bool result = true;
    uint32 hash;
    uint64 generation;
    uint32 entry_offset;
    BufferTag tag;
    FileCacheEntry *entry;
    ssize_t rc;
    int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
    bool result = true;
    uint32 hash;
    uint64 generation;
    uint32 entry_offset;

    if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
    if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
        return false;

    if (!lfc_ensure_opened())
@@ -493,7 +518,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

    CopyNRelFileInfoToBufTag(tag, rinfo);
    tag.forkNum = forkNum;
    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
    hash = get_hash_value(lfc_hash, &tag);

    LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -520,7 +545,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

    LWLockRelease(lfc_lock);

    rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
    rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
    if (rc != BLCKSZ)
    {
        lfc_disable("read");
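The pread() call above maps a (chunk slot, block-in-chunk) pair to a byte offset in the cache file. Pulled out as a hypothetical helper, with the reason for the early off_t cast spelled out:

/* Byte offset of one cached block. entry_offset is the chunk's slot in
 * the cache file; chunk_offs is the block's slot inside that chunk. The
 * cast to off_t must happen before the multiplication, otherwise the
 * product is computed in 32 bits and overflows on caches past a few GB. */
static inline off_t
cached_block_offset(uint32 entry_offset, int chunk_offs)
{
    return ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ;
}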
@@ -551,30 +576,29 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 * If cache is full then evict some other page.
 */
void
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
#if PG_MAJORVERSION_NUM < 16
    char *buffer)
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, char *buffer)
#else
    const void *buffer)
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *buffer)
#endif
{
    BufferTag tag;
    FileCacheEntry* entry;
    ssize_t rc;
    bool found;
    int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
    uint32 hash;
    uint64 generation;
    uint32 entry_offset;
    BufferTag tag;
    FileCacheEntry *entry;
    ssize_t rc;
    bool found;
    int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
    uint32 hash;
    uint64 generation;
    uint32 entry_offset;

    if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
    if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
        return;

    if (!lfc_ensure_opened())
        return;

    tag.forkNum = forkNum;
    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
    tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
    CopyNRelFileInfoToBufTag(tag, rinfo);
    hash = get_hash_value(lfc_hash, &tag);

@@ -590,24 +614,36 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

    if (found)
    {
        /* Unlink entry from LRU list to pin it for the duration of IO operation */
        /*
         * Unlink entry from LRU list to pin it for the duration of IO
         * operation
         */
        if (entry->access_count++ == 0)
            dlist_delete(&entry->lru_node);
    }
    else
    {
        /*
         * We have two choices if all cache pages are pinned (i.e. used in IO operations):
         * 1. Wait until some of this operation is completed and pages is unpinned
         * 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit.
         * As far as probability of such event (that all pages are pinned) is considered to be very very small:
         * there are should be very large number of concurrent IO operations and them are limited by max_connections,
         * we prefer not to complicate code and use second approach.
         * We have two choices if all cache pages are pinned (i.e. used in IO
         * operations):
         *
         * 1) Wait until some of this operation is completed and pages is
         * unpinned.
         *
         * 2) Allocate one more chunk, so that specified cache size is more
         * recommendation than hard limit.
         *
         * As far as probability of such event (that all pages are pinned) is
         * considered to be very very small: there are should be very large
         * number of concurrent IO operations and them are limited by
         * max_connections, we prefer not to complicate code and use second
         * approach.
         */
        if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru))
        {
            /* Cache overflow: evict least recently used chunk */
            FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
            FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));

            Assert(victim->access_count == 0);
            entry->offset = victim->offset; /* grab victim's chunk */
            hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
@@ -616,7 +652,8 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
        else
        {
            lfc_ctl->used += 1;
            entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
            entry->offset = lfc_ctl->size++; /* allocate new chunk at end
                                              * of file */
        }
        entry->access_count = 1;
        entry->hash = hash;
@@ -628,7 +665,7 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
    lfc_ctl->writes += 1;
    LWLockRelease(lfc_lock);

    rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
    rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
    if (rc != BLCKSZ)
    {
        lfc_disable("write");
@@ -665,13 +702,13 @@ Datum
neon_get_lfc_stats(PG_FUNCTION_ARGS)
{
    FuncCallContext *funcctx;
    NeonGetStatsCtx* fctx;
    NeonGetStatsCtx *fctx;
    MemoryContext oldcontext;
    TupleDesc tupledesc;
    Datum result;
    HeapTuple tuple;
    char const* key;
    uint64 value;
    char const *key;
    uint64 value;
    Datum values[NUM_NEON_GET_STATS_COLS];
    bool nulls[NUM_NEON_GET_STATS_COLS];

@@ -683,7 +720,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
    oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

    /* Create a user function context for cross-call persistence */
    fctx = (NeonGetStatsCtx*) palloc(sizeof(NeonGetStatsCtx));
    fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx));

    /* Construct a tuple descriptor for the result rows. */
    tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);
@@ -704,7 +741,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
    funcctx = SRF_PERCALL_SETUP();

    /* Get the saved state */
    fctx = (NeonGetStatsCtx*) funcctx->user_fctx;
    fctx = (NeonGetStatsCtx *) funcctx->user_fctx;

    switch (funcctx->call_cntr)
    {
@@ -792,9 +829,9 @@ local_cache_pages(PG_FUNCTION_ARGS)

    if (SRF_IS_FIRSTCALL())
    {
        HASH_SEQ_STATUS status;
        FileCacheEntry* entry;
        uint32 n_pages = 0;
        HASH_SEQ_STATUS status;
        FileCacheEntry *entry;
        uint32 n_pages = 0;

        funcctx = SRF_FIRSTCALL_INIT();

@@ -851,7 +888,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
        hash_seq_init(&status, lfc_hash);
        while ((entry = hash_seq_search(&status)) != NULL)
        {
            for (int i = 0; i < BLOCKS_PER_CHUNK/32; i++)
            for (int i = 0; i < BLOCKS_PER_CHUNK / 32; i++)
                n_pages += pg_popcount32(entry->bitmap[i]);
        }
    }
@@ -870,10 +907,11 @@ local_cache_pages(PG_FUNCTION_ARGS)
    if (n_pages != 0)
    {
        /*
         * Scan through all the cache entries, saving the relevant fields in the
         * fctx->record structure.
         * Scan through all the cache entries, saving the relevant fields
         * in the fctx->record structure.
         */
        uint32 n = 0;
        uint32 n = 0;

        hash_seq_init(&status, lfc_hash);
        while ((entry = hash_seq_search(&status)) != NULL)
        {
@@ -881,7 +919,7 @@ local_cache_pages(PG_FUNCTION_ARGS)
            {
                if (entry->bitmap[i >> 5] & (1 << (i & 31)))
                {
                    fctx->record[n].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
                    fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
                    fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
                    fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
                    fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));

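Throughout lfc_read and lfc_write, access_count doubles as a pin count: an entry sits on the LRU list only while unpinned, so eviction can never pick a chunk with IO in flight. Hypothetical helpers showing the discipline the diff inlines under lfc_lock:

/* Sketch of the pin/unpin pattern implied by the code above; the real
 * code performs these steps inline while holding lfc_lock. */
static void
pin_entry(FileCacheEntry *entry)
{
    if (entry->access_count++ == 0)
        dlist_delete(&entry->lru_node);     /* leave the LRU while in use */
}

static void
unpin_entry(FileCacheEntry *entry)
{
    Assert(entry->access_count > 0);
    if (--entry->access_count == 0)
        dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);   /* evictable again */
}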
@@ -69,9 +69,9 @@ int max_reconnect_attempts = 60;

typedef struct
{
    LWLockId lock;
    pg_atomic_uint64 update_counter;
    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
    LWLockId lock;
    pg_atomic_uint64 update_counter;
    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
} PagestoreShmemState;

#if PG_VERSION_NUM >= 150000
@@ -83,7 +83,7 @@ static PagestoreShmemState *pagestore_shared;
static uint64 pagestore_local_counter = 0;
static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];

bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

static bool pageserver_flush(void);
static void pageserver_disconnect(void);
@@ -91,43 +91,43 @@ static void pageserver_disconnect(void);
static bool
PagestoreShmemIsValid()
{
    return pagestore_shared && UsedShmemSegAddr;
    return pagestore_shared && UsedShmemSegAddr;
}

static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
}

static void
AssignPageserverConnstring(const char *newval, void *extra)
{
    if(!PagestoreShmemIsValid())
        return;
    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
    LWLockRelease(pagestore_shared->lock);
    if (!PagestoreShmemIsValid())
        return;
    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
    LWLockRelease(pagestore_shared->lock);
}

static bool
CheckConnstringUpdated()
{
    if(!PagestoreShmemIsValid())
        return false;
    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
    if (!PagestoreShmemIsValid())
        return false;
    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
}

static void
ReloadConnstring()
{
    if(!PagestoreShmemIsValid())
        return;
    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
    LWLockRelease(pagestore_shared->lock);
    if (!PagestoreShmemIsValid())
        return;
    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
    LWLockRelease(pagestore_shared->lock);
}

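The pattern above is deliberately asymmetric: AssignPageserverConnstring bumps a shared atomic counter under the lock, while readers first compare a local copy, so the common no-change path costs one atomic read and no lock. Combined into a single hypothetical helper (the diff keeps the two halves separate as CheckConnstringUpdated and ReloadConnstring):

/* One-shot check-and-reload, as callers like pageserver_send use it. */
static void
maybe_reload_connstring(void)
{
    /* cheap fast path: one atomic read, no lock */
    if (pagestore_local_counter >=
        pg_atomic_read_u64(&pagestore_shared->update_counter))
        return;

    /* slow path: copy the new value under the shared lock */
    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
    strlcpy(local_pageserver_connstring,
            pagestore_shared->pageserver_connstring,
            sizeof(local_pageserver_connstring));
    pagestore_local_counter =
        pg_atomic_read_u64(&pagestore_shared->update_counter);
    LWLockRelease(pagestore_shared->lock);
}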
static bool
@@ -141,21 +141,20 @@ pageserver_connect(int elevel)

    Assert(!connected);

    if(CheckConnstringUpdated())
    {
        ReloadConnstring();
    }
    if (CheckConnstringUpdated())
    {
        ReloadConnstring();
    }

    /*
     * Connect using the connection string we got from the
     * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
     * variable was set, use that as the password.
     *
     * The connection options are parsed in the order they're given, so
     * when we set the password before the connection string, the
     * connection string can override the password from the env variable.
     * Seems useful, although we don't currently use that capability
     * anywhere.
     * The connection options are parsed in the order they're given, so when
     * we set the password before the connection string, the connection string
     * can override the password from the env variable. Seems useful, although
     * we don't currently use that capability anywhere.
     */
    n = 0;
    if (neon_auth_token)
@@ -198,9 +197,9 @@ pageserver_connect(int elevel)

    pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
    AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
        MyLatch, NULL);
        MyLatch, NULL);
    AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
        NULL, NULL);
        NULL, NULL);
    AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);

    while (PQisBusy(pageserver_conn))
@@ -265,6 +264,7 @@ retry:
    if (!PQconsumeInput(pageserver_conn))
    {
        char *msg = pchomp(PQerrorMessage(pageserver_conn));

        neon_log(LOG, "could not get response from pageserver: %s", msg);
        pfree(msg);
        return -1;
@@ -305,15 +305,15 @@ pageserver_disconnect(void)
}

static bool
pageserver_send(NeonRequest * request)
pageserver_send(NeonRequest *request)
{
    StringInfoData req_buff;

    if(CheckConnstringUpdated())
    {
        pageserver_disconnect();
        ReloadConnstring();
    }
    if (CheckConnstringUpdated())
    {
        pageserver_disconnect();
        ReloadConnstring();
    }

    /* If the connection was lost for some reason, reconnect */
    if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
@@ -326,10 +326,12 @@ pageserver_send(NeonRequest * request)

    /*
     * If pageserver is stopped, the connections from compute node are broken.
     * The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
     * That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
     * See https://github.com/neondatabase/neon/issues/1138
     * So try to reestablish connection in case of failure.
     * The compute node doesn't notice that immediately, but it will cause the
     * next request to fail, usually on the next query. That causes
     * user-visible errors if pageserver is restarted, or the tenant is moved
     * from one pageserver to another. See
     * https://github.com/neondatabase/neon/issues/1138 So try to reestablish
     * connection in case of failure.
     */
    if (!connected)
    {
@@ -353,6 +355,7 @@ pageserver_send(NeonRequest * request)
    if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
    {
        char *msg = pchomp(PQerrorMessage(pageserver_conn));

        pageserver_disconnect();
        neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
        pfree(msg);
@@ -410,7 +413,8 @@ pageserver_receive(void)
    }
    else if (rc == -2)
    {
        char* msg = pchomp(PQerrorMessage(pageserver_conn));
        char *msg = pchomp(PQerrorMessage(pageserver_conn));

        pageserver_disconnect();
        neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
    }
@@ -444,6 +448,7 @@ pageserver_flush(void)
    if (PQflush(pageserver_conn))
    {
        char *msg = pchomp(PQerrorMessage(pageserver_conn));

        pageserver_disconnect();
        neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
        pfree(msg);
@@ -471,46 +476,47 @@ check_neon_id(char **newval, void **extra, GucSource source)
static Size
PagestoreShmemSize(void)
{
    return sizeof(PagestoreShmemState);
    return sizeof(PagestoreShmemState);
}

static bool
PagestoreShmemInit(void)
{
    bool found;
    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
    pagestore_shared = ShmemInitStruct("libpagestore shared state",
        PagestoreShmemSize(),
        &found);
    if(!found)
    {
        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
        AssignPageserverConnstring(page_server_connstring, NULL);
    }
    LWLockRelease(AddinShmemInitLock);
    return found;
    bool found;

    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
    pagestore_shared = ShmemInitStruct("libpagestore shared state",
        PagestoreShmemSize(),
        &found);
    if (!found)
    {
        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
        AssignPageserverConnstring(page_server_connstring, NULL);
    }
    LWLockRelease(AddinShmemInitLock);
    return found;
}

static void
pagestore_shmem_startup_hook(void)
{
    if(prev_shmem_startup_hook)
        prev_shmem_startup_hook();
    if (prev_shmem_startup_hook)
        prev_shmem_startup_hook();

    PagestoreShmemInit();
    PagestoreShmemInit();
}

static void
pagestore_shmem_request(void)
{
#if PG_VERSION_NUM >= 150000
    if(prev_shmem_request_hook)
        prev_shmem_request_hook();
    if (prev_shmem_request_hook)
        prev_shmem_request_hook();
#endif

    RequestAddinShmemSpace(PagestoreShmemSize());
    RequestNamedLWLockTranche("neon_libpagestore", 1);
    RequestAddinShmemSpace(PagestoreShmemSize());
    RequestNamedLWLockTranche("neon_libpagestore", 1);
}

static void
@@ -520,7 +526,7 @@ pagestore_prepare_shmem(void)
    prev_shmem_request_hook = shmem_request_hook;
    shmem_request_hook = pagestore_shmem_request;
#else
    pagestore_shmem_request();
    pagestore_shmem_request();
#endif
    prev_shmem_startup_hook = shmem_startup_hook;
    shmem_startup_hook = pagestore_shmem_startup_hook;
@@ -532,7 +538,7 @@ pagestore_prepare_shmem(void)
void
pg_init_libpagestore(void)
{
    pagestore_prepare_shmem();
    pagestore_prepare_shmem();

    DefineCustomStringVariable("neon.pageserver_connstring",
        "connection string to the page server",
@@ -607,7 +613,10 @@ pg_init_libpagestore(void)
    neon_log(PageStoreTrace, "libpagestore already loaded");
    page_server = &api;

    /* Retrieve the auth token to use when connecting to pageserver and safekeepers */
    /*
     * Retrieve the auth token to use when connecting to pageserver and
     * safekeepers
     */
    neon_auth_token = getenv("NEON_AUTH_TOKEN");
    if (neon_auth_token)
        neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");

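pageserver_connect above never blocks on the socket directly: it waits on a WaitEventSet that also watches the backend's latch and postmaster death, then feeds bytes to libpq. A simplified, hypothetical sketch of that receive loop (same APIs as the diff, reduced to the essentials):

/* Wait until libpq has a complete result buffered, staying responsive
 * to latch wakeups and postmaster exit. Hypothetical helper distilled
 * from pageserver_connect(); error handling is simplified. */
static void
wait_until_ready(PGconn *conn, WaitEventSet *wes)
{
    while (PQisBusy(conn))
    {
        WaitEvent event;

        (void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
        if (event.events & WL_LATCH_SET)
            ResetLatch(MyLatch);
        if (event.events & WL_SOCKET_READABLE)
        {
            if (!PQconsumeInput(conn))
                elog(ERROR, "lost connection to pageserver");
        }
    }
}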
@@ -48,9 +48,11 @@ _PG_init(void)

    pg_init_extension_server();

    // Important: This must happen after other parts of the extension
    // are loaded, otherwise any settings to GUCs that were set before
    // the extension was loaded will be removed.
    /*
     * Important: This must happen after other parts of the extension are
     * loaded, otherwise any settings to GUCs that were set before the
     * extension was loaded will be removed.
     */
    EmitWarningsOnPlaceholders("neon");
}


@@ -32,7 +32,7 @@ extern void pg_init_extension_server(void);
 * block_id; false otherwise.
 */
extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);

extern uint64 BackpressureThrottlingTime(void);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

@@ -59,7 +59,7 @@

#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers

#else /* major version >= 16 */
#else /* major version >= 16 */

#define USE_RELFILELOCATOR

@@ -109,4 +109,4 @@
#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
#endif

#endif //NEON_PGVERSIONCOMPAT_H
#endif /* NEON_PGVERSIONCOMPAT_H */

@@ -40,13 +40,13 @@ typedef enum
    T_NeonGetPageResponse,
    T_NeonErrorResponse,
    T_NeonDbSizeResponse,
} NeonMessageTag;
} NeonMessageTag;

/* base struct for c-style inheritance */
typedef struct
{
    NeonMessageTag tag;
} NeonMessage;
} NeonMessage;

#define messageTag(m) (((const NeonMessage *)(m))->tag)

@@ -67,27 +67,27 @@ typedef struct
    NeonMessageTag tag;
    bool latest; /* if true, request latest page version */
    XLogRecPtr lsn; /* request page version @ this LSN */
} NeonRequest;
} NeonRequest;

typedef struct
{
    NeonRequest req;
    NRelFileInfo rinfo;
    ForkNumber forknum;
} NeonExistsRequest;
} NeonExistsRequest;

typedef struct
{
    NeonRequest req;
    NRelFileInfo rinfo;
    ForkNumber forknum;
} NeonNblocksRequest;
} NeonNblocksRequest;

typedef struct
{
    NeonRequest req;
    Oid dbNode;
} NeonDbSizeRequest;
} NeonDbSizeRequest;

typedef struct
{
@@ -95,31 +95,31 @@ typedef struct
    NRelFileInfo rinfo;
    ForkNumber forknum;
    BlockNumber blkno;
} NeonGetPageRequest;
} NeonGetPageRequest;

/* supertype of all the Neon*Response structs below */
typedef struct
{
    NeonMessageTag tag;
} NeonResponse;
} NeonResponse;

typedef struct
{
    NeonMessageTag tag;
    bool exists;
} NeonExistsResponse;
} NeonExistsResponse;

typedef struct
{
    NeonMessageTag tag;
    uint32 n_blocks;
} NeonNblocksResponse;
} NeonNblocksResponse;

typedef struct
{
    NeonMessageTag tag;
    char page[FLEXIBLE_ARRAY_MEMBER];
} NeonGetPageResponse;
} NeonGetPageResponse;

#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))

@@ -127,18 +127,18 @@ typedef struct
{
    NeonMessageTag tag;
    int64 db_size;
} NeonDbSizeResponse;
} NeonDbSizeResponse;

typedef struct
{
    NeonMessageTag tag;
    char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
                                          * message */
} NeonErrorResponse;
} NeonErrorResponse;

extern StringInfoData nm_pack_request(NeonRequest * msg);
extern NeonResponse * nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage * msg);
extern StringInfoData nm_pack_request(NeonRequest *msg);
extern NeonResponse *nm_unpack_response(StringInfo s);
extern char *nm_to_string(NeonMessage *msg);

/*
 * API
@@ -146,20 +146,20 @@ extern char *nm_to_string(NeonMessage * msg);

typedef struct
{
    bool (*send) (NeonRequest * request);
    bool (*send) (NeonRequest *request);
    NeonResponse *(*receive) (void);
    bool (*flush) (void);
} page_server_api;
} page_server_api;

extern void prefetch_on_ps_disconnect(void);

extern page_server_api * page_server;
extern page_server_api *page_server;

extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern bool seqscan_prefetch_enabled;
extern int seqscan_prefetch_distance;
extern int seqscan_prefetch_distance;
extern char *neon_timeline;
extern char *neon_tenant;
extern bool wal_redo;
@@ -194,14 +194,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
    char *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
    XLogRecPtr request_lsn, bool request_latest, char *buffer);
    XLogRecPtr request_lsn, bool request_latest, char *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
    BlockNumber blocknum, char *buffer, bool skipFsync);
#else
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
    void *buffer);
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
    XLogRecPtr request_lsn, bool request_latest, void *buffer);
    XLogRecPtr request_lsn, bool request_latest, void *buffer);
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
    BlockNumber blocknum, const void *buffer, bool skipFsync);
#endif

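These headers set up C-style inheritance: every message begins with a NeonMessageTag, so a generic NeonResponse can be dispatched on messageTag() and downcast to the concrete struct. A hypothetical consumer sketch (not part of the diff):

/* Dispatch on the tag, then downcast; safe because every Neon*Response
 * starts with the same tag field as NeonMessage. */
static uint32
response_nblocks_or_error(NeonResponse *resp)
{
    switch (messageTag(resp))
    {
        case T_NeonNblocksResponse:
            return ((NeonNblocksResponse *) resp)->n_blocks;
        case T_NeonErrorResponse:
            elog(ERROR, "pageserver: %s",
                 ((NeonErrorResponse *) resp)->message);
            break;
        default:
            elog(ERROR, "unexpected response tag %d", (int) messageTag(resp));
            break;
    }
    return 0;                   /* unreachable */
}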
@@ -59,6 +59,7 @@
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/buf_internals.h"
#include "storage/fsm_internals.h"
#include "storage/smgr.h"
#include "storage/md.h"
#include "pgstat.h"
@@ -100,21 +101,21 @@ typedef enum
    UNLOGGED_BUILD_PHASE_1,
    UNLOGGED_BUILD_PHASE_2,
    UNLOGGED_BUILD_NOT_PERMANENT
} UnloggedBuildPhase;
} UnloggedBuildPhase;

static SMgrRelation unlogged_build_rel = NULL;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

/*
 * Prefetch implementation:
 *
 *
 * Prefetch is performed locally by each backend.
 *
 * There can be up to readahead_buffer_size active IO requests registered at
 * any time. Requests using smgr_prefetch are sent to the pageserver, but we
 * don't wait on the response. Requests using smgr_read are either read from
 * the buffer, or (if that's not possible) we wait on the response to arrive -
 * this also will allow us to receive other prefetched pages.
 * this also will allow us to receive other prefetched pages.
 * Each request is immediately written to the output buffer of the pageserver
 * connection, but may not be flushed if smgr_prefetch is used: pageserver
 * flushes sent requests on manual flush, or every neon.flush_output_after
@@ -138,7 +139,7 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;

/*
 * State machine:
 *
 *
 * not in hash : in hash
 * :
 * UNUSED ------> REQUESTED --> RECEIVED
@@ -149,30 +150,34 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 * +----------------+------------+
 * :
 */
typedef enum PrefetchStatus {
    PRFS_UNUSED = 0, /* unused slot */
    PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not
                     * necessarily flushed.
                     * all fields except response valid */
    PRFS_RECEIVED, /* all fields valid */
    PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */
typedef enum PrefetchStatus
{
    PRFS_UNUSED = 0, /* unused slot */
    PRFS_REQUESTED, /* request was written to the sendbuffer to
                     * PS, but not necessarily flushed. all fields
                     * except response valid */
    PRFS_RECEIVED, /* all fields valid */
    PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still
                       * valid */
} PrefetchStatus;

typedef struct PrefetchRequest {
    BufferTag buftag; /* must be first entry in the struct */
typedef struct PrefetchRequest
{
    BufferTag buftag; /* must be first entry in the struct */
    XLogRecPtr effective_request_lsn;
    XLogRecPtr actual_request_lsn;
    NeonResponse *response; /* may be null */
    NeonResponse *response; /* may be null */
    PrefetchStatus status;
    uint64 my_ring_index;
} PrefetchRequest;

/* prefetch buffer lookup hash table */

typedef struct PrfHashEntry {
typedef struct PrfHashEntry
{
    PrefetchRequest *slot;
    uint32 status;
    uint32 hash;
    uint32 status;
    uint32 hash;
} PrfHashEntry;

#define SH_PREFIX prfh
@@ -196,36 +201,42 @@ typedef struct PrfHashEntry {
/*
 * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
 * It maintains a (ring) buffer of in-flight requests and responses.
 *
 *
 * We maintain several indexes into the ring buffer:
 * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
 *
 *
 * ring_unused points to the first unused slot of the buffer
 * ring_receive is the next request that is to be received
 * ring_last is the oldest received entry in the buffer
 *
 *
 * Apart from being an entry in the ring buffer of prefetch requests, each
 * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
 */
typedef struct PrefetchState {
    MemoryContext bufctx; /* context for prf_buffer[].response allocations */
    MemoryContext errctx; /* context for prf_buffer[].response allocations */
    MemoryContext hashctx; /* context for prf_buffer */
typedef struct PrefetchState
{
    MemoryContext bufctx; /* context for prf_buffer[].response
                           * allocations */
    MemoryContext errctx; /* context for prf_buffer[].response
                           * allocations */
    MemoryContext hashctx; /* context for prf_buffer */

    /* buffer indexes */
    uint64 ring_unused; /* first unused slot */
    uint64 ring_flush; /* next request to flush */
    uint64 ring_receive; /* next slot that is to receive a response */
    uint64 ring_last; /* min slot with a response value */
    uint64 ring_unused; /* first unused slot */
    uint64 ring_flush; /* next request to flush */
    uint64 ring_receive; /* next slot that is to receive a response */
    uint64 ring_last; /* min slot with a response value */

    /* metrics / statistics */
    int n_responses_buffered; /* count of PS responses not yet in buffers */
    int n_requests_inflight; /* count of PS requests considered in flight */
    int n_unused; /* count of buffers < unused, > last, that are also unused */
    int n_responses_buffered; /* count of PS responses not yet in
                               * buffers */
    int n_requests_inflight; /* count of PS requests considered in
                              * flight */
    int n_unused; /* count of buffers < unused, > last, that are
                   * also unused */

    /* the buffers */
    prfh_hash *prf_hash;
    PrefetchRequest prf_buffer[]; /* prefetch buffers */
    prfh_hash *prf_hash;
    PrefetchRequest prf_buffer[]; /* prefetch buffers */
} PrefetchState;

PrefetchState *MyPState;
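The ring indexes above only ever grow as uint64 counters; the documented invariant ring_unused >= ring_flush >= ring_receive >= ring_last partitions the ring into unused, in-flight, and response-buffered regions, and a physical slot comes from reducing an index modulo the buffer size. A sketch of the slot lookup (the code below uses GetPrfSlot without showing its definition, so the modulo shape here is an assumption):

/* Map a monotonically increasing ring index to a physical buffer slot.
 * Assumed shape of GetPrfSlot; indexes in [ring_last, ring_unused) are
 * the only ones that may be dereferenced. */
static inline PrefetchRequest *
prf_slot(uint64 ring_index)
{
    Assert(ring_index < MyPState->ring_unused &&
           ring_index >= MyPState->ring_last);
    return &MyPState->prf_buffer[ring_index % readahead_buffer_size];
}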
@@ -263,10 +274,10 @@ static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
static bool
compact_prefetch_buffers(void)
{
	uint64 empty_ring_index = MyPState->ring_last;
	uint64 search_ring_index = MyPState->ring_receive;
	int n_moved = 0;

	if (MyPState->ring_receive == MyPState->ring_last)
		return false;

@@ -281,15 +292,14 @@ compact_prefetch_buffers(void)
	}

	/*
-	 * Here we have established:
-	 * slots < search_ring_index have an unknown state (not scanned)
-	 * slots >= search_ring_index and <= empty_ring_index are unused
-	 * slots > empty_ring_index are in use, or outside our buffer's range.
-	 * ... unless search_ring_index <= ring_last
+	 * Here we have established: slots < search_ring_index have an unknown
+	 * state (not scanned) slots >= search_ring_index and <= empty_ring_index
+	 * are unused slots > empty_ring_index are in use, or outside our buffer's
+	 * range. ... unless search_ring_index <= ring_last
	 *
-	 * Therefore, there is a gap of at least one unused items between
-	 * search_ring_index and empty_ring_index (both inclusive), which grows as we hit
-	 * more unused items while moving backwards through the array.
+	 * Therefore, there is a gap of at least one unused items between
+	 * search_ring_index and empty_ring_index (both inclusive), which grows as
+	 * we hit more unused items while moving backwards through the array.
	 */

	while (search_ring_index > MyPState->ring_last)

@@ -329,7 +339,10 @@ compact_prefetch_buffers(void)

		/* empty the moved slot */
		source_slot->status = PRFS_UNUSED;
-		source_slot->buftag = (BufferTag) {0};
+		source_slot->buftag = (BufferTag)
+		{
+			0
+		};
		source_slot->response = NULL;
		source_slot->my_ring_index = 0;
		source_slot->effective_request_lsn = 0;

@@ -339,8 +352,8 @@ compact_prefetch_buffers(void)
	}
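Between hunks, a toy version of what the compaction achieves. The real function scans backwards from ring_receive and also re-links each moved slot in the prf_hash index; this sketch scans forwards over a plain array, which is enough to show the effect (live entries become contiguous, gaps collect at one end). All names here are illustrative, not the extension's.

#include <stdbool.h>
#include <stdio.h>

#define N 8

/* Slide live entries down into unused slots so the window ends up
 * gap-free; returns how many entries moved (cf. n_moved above). */
static int
compact(bool used[N], int vals[N])
{
	int write = 0, moved = 0;

	for (int read = 0; read < N; read++)
	{
		if (!used[read])
			continue;
		if (read != write)
		{
			vals[write] = vals[read];
			used[write] = true;
			used[read] = false;
			moved++;
		}
		write++;
	}
	return moved;
}

int main(void)
{
	bool used[N] = {true, false, true, true, false, true, false, false};
	int vals[N] = {10, 0, 30, 40, 0, 60, 0, 0};

	printf("moved %d slots\n", compact(used, vals));
	return 0;
}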
	/*
-	 * Only when we've moved slots we can expect trailing unused slots,
-	 * so only then we clean up trailing unused slots.
+	 * Only when we've moved slots we can expect trailing unused slots, so
+	 * only then we clean up trailing unused slots.
	 */
	if (n_moved > 0)
	{

@@ -357,10 +370,9 @@ readahead_buffer_resize(int newsize, void *extra)
	uint64 end,
			nfree = newsize;
	PrefetchState *newPState;
-	Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
-		sizeof(PrefetchRequest) * newsize
-	);
+	Size newprfs_size = offsetof(PrefetchState, prf_buffer) +
+		(sizeof(PrefetchRequest) * newsize);

	/* don't try to re-initialize if we haven't initialized yet */
	if (MyPState == NULL)
		return;

@@ -387,12 +399,12 @@ readahead_buffer_resize(int newsize, void *extra)
	newPState->ring_receive = newsize;
	newPState->ring_flush = newsize;

	/*
	 * Copy over the prefetches.
	 *
	 * We populate the prefetch array from the end; to retain the most recent
-	 * prefetches, but this has the benefit of only needing to do one iteration
-	 * on the dataset, and trivial compaction.
+	 * prefetches, but this has the benefit of only needing to do one
+	 * iteration on the dataset, and trivial compaction.
	 */
	for (end = MyPState->ring_unused - 1;
		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;

@@ -400,7 +412,7 @@ readahead_buffer_resize(int newsize, void *extra)
	{
		PrefetchRequest *slot = GetPrfSlot(end);
		PrefetchRequest *newslot;
		bool found;

		if (slot->status == PRFS_UNUSED)
			continue;
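The loop condition `end != UINT64_MAX` deserves a note: `end` is unsigned, so when ring_last is 0 the decrement past 0 wraps to UINT64_MAX rather than going negative, and `end >= ring_last` alone would never become false. A minimal self-contained demonstration of why the guard is needed (names are illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ring_last = 0, ring_unused = 3;
	int copied = 0;

	/* newest entries first; the != UINT64_MAX test stops the loop after
	 * end-- wraps around at index 0 */
	for (uint64_t end = ring_unused - 1;
		 end >= ring_last && end != UINT64_MAX;
		 end--)
		copied++;

	printf("copied %d entries without wrapping\n", copied);
	return 0;
}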
@@ -463,10 +475,11 @@ consume_prefetch_responses(void)
static void
prefetch_cleanup_trailing_unused(void)
{
	uint64 ring_index;
	PrefetchRequest *slot;

-	while (MyPState->ring_last < MyPState->ring_receive) {
+	while (MyPState->ring_last < MyPState->ring_receive)
+	{
		ring_index = MyPState->ring_last;
		slot = GetPrfSlot(ring_index);

@@ -480,7 +493,7 @@ prefetch_cleanup_trailing_unused(void)
/*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
 */

@@ -512,7 +525,7 @@ prefetch_wait_for(uint64 ring_index)

/*
 * Read the response of a prefetch request into its slot.
 *
 * The caller is responsible for making sure that the request for this buffer
 * was flushed to the PageServer.
 *

@@ -552,7 +565,7 @@ prefetch_read(PrefetchRequest *slot)

/*
 * Disconnect hook - drop prefetches when the connection drops
 *
 * If we don't remove the failed prefetches, we'd be serving incorrect
 * data to the smgr.
 */

@@ -563,7 +576,7 @@ prefetch_on_ps_disconnect(void)
	while (MyPState->ring_receive < MyPState->ring_unused)
	{
		PrefetchRequest *slot;
		uint64 ring_index = MyPState->ring_receive;

		slot = GetPrfSlot(ring_index);

@@ -593,7 +606,7 @@ prefetch_set_unused(uint64 ring_index)
	PrefetchRequest *slot = GetPrfSlot(ring_index);

	if (ring_index < MyPState->ring_last)
		return;					/* Should already be unused */

	Assert(MyPState->ring_unused > ring_index);

@@ -624,7 +637,11 @@ prefetch_set_unused(uint64 ring_index)
	/* run cleanup if we're holding back ring_last */
	if (MyPState->ring_last == ring_index)
		prefetch_cleanup_trailing_unused();
-	/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
+
+	/*
+	 * ... and try to store the buffered responses more compactly if > 12.5%
+	 * of the buffer is gaps
+	 */
	else if (ReceiveBufferNeedsCompaction())
		compact_prefetch_buffers();
}
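The "> 12.5% of the buffer is gaps" trigger is a cheap power-of-two fraction: buffer_size / 8. A hedged sketch of the predicate ReceiveBufferNeedsCompaction presumably applies (its real definition lives elsewhere in the file; needs_compaction is a made-up name):

#include <stdbool.h>
#include <stdio.h>

/* compact only when gaps exceed one eighth of the ring capacity */
static bool
needs_compaction(int n_unused_gaps, int buffer_size)
{
	return n_unused_gaps > buffer_size / 8;
}

int main(void)
{
	printf("%d\n", needs_compaction(17, 128));	/* prints 1: 17 > 16 */
	printf("%d\n", needs_compaction(16, 128));	/* prints 0 */
	return 0;
}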
@@ -632,7 +649,7 @@ prefetch_set_unused(uint64 ring_index)
static void
prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
{
	bool found;
	NeonGetPageRequest request = {
		.req.tag = T_NeonGetPageRequest,
		.req.latest = false,

@@ -650,21 +667,22 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
	}
	else
	{
		XLogRecPtr lsn = neon_get_request_lsn(
			&request.req.latest,
			BufTagGetNRelFileInfo(slot->buftag),
			slot->buftag.forkNum,
			slot->buftag.blockNum
		);

		/*
-		 * Note: effective_request_lsn is potentially higher than the requested
-		 * LSN, but still correct:
+		 * Note: effective_request_lsn is potentially higher than the
+		 * requested LSN, but still correct:
		 *
		 * We know there are no changes between the actual requested LSN and
		 * the value of effective_request_lsn: If there were, the page would
-		 * have been in cache and evicted between those LSN values, which
-		 * then would have had to result in a larger request LSN for this page.
+		 * have been in cache and evicted between those LSN values, which then
+		 * would have had to result in a larger request LSN for this page.
		 *
		 * It is possible that a concurrent backend loads the page, modifies
		 * it and then evicts it again, but the LSN of that eviction cannot be
		 * smaller than the current WAL insert/redo pointer, which is already
@@ -701,7 +719,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 * prefetch_register_buffer() - register and prefetch buffer
 *
 * Register that we may want the contents of BufferTag in the near future.
 *
 * If force_latest and force_lsn are not NULL, those values are sent to the
 * pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
 * to fill in these values manually.

@@ -713,14 +731,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
static uint64
prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
{
	uint64 ring_index;
	PrefetchRequest req;
	PrefetchRequest *slot;
	PrfHashEntry *entry;

	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
	req.buftag = tag;
Retry:
	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);

	if (entry != NULL)

@@ -740,7 +758,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
		 */
		if (force_latest && force_lsn)
		{
-			/* if we want the latest version, any effective_request_lsn < request lsn is OK */
+			/*
+			 * if we want the latest version, any effective_request_lsn <
+			 * request lsn is OK
+			 */
			if (*force_latest)
			{
				if (*force_lsn > slot->effective_request_lsn)

@@ -751,7 +772,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
			}

			}
-			/* if we don't want the latest version, only accept requests with the exact same LSN */
+
+			/*
+			 * if we don't want the latest version, only accept requests with
+			 * the exact same LSN
+			 */
			else
			{
				if (*force_lsn != slot->effective_request_lsn)
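Distilled, the reuse rule in this branch is: an already registered prefetch may stand in for a new request only if the LSN semantics agree. The sketch below is an illustration of that rule only, under the assumption (suggested by the comparisons above) that a too-old slot is dropped and re-issued; can_reuse_slot is a hypothetical name, not the extension's API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;

static bool
can_reuse_slot(bool want_latest, XLogRecPtr want_lsn, XLogRecPtr slot_lsn)
{
	if (want_latest)
		return want_lsn <= slot_lsn;	/* any newer-or-equal request works */
	return want_lsn == slot_lsn;		/* a pinned LSN must match exactly */
}

int main(void)
{
	printf("%d %d\n",
		   can_reuse_slot(true, 100, 150),	/* 1 */
		   can_reuse_slot(false, 100, 150));	/* 0 */
	return 0;
}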
@@ -798,7 +823,8 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
	 */
	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
	{
		uint64 cleanup_index = MyPState->ring_last;

		slot = GetPrfSlot(cleanup_index);

		Assert(slot->status != PRFS_UNUSED);

@@ -813,7 +839,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
		}
		else
		{
-			/* We have the slot for ring_last, so that must still be in progress */
+			/*
+			 * We have the slot for ring_last, so that must still be in
+			 * progress
+			 */
			switch (slot->status)
			{
				case PRFS_REQUESTED:

@@ -832,8 +861,8 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
	}

	/*
-	 * The next buffer pointed to by `ring_unused` is now definitely empty,
-	 * so we can insert the new request to it.
+	 * The next buffer pointed to by `ring_unused` is now definitely empty, so
+	 * we can insert the new request to it.
	 */
	ring_index = MyPState->ring_unused;
	slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];

@@ -859,7 +888,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
	{
		if (!page_server->flush())
		{
-			/* Prefetch set is reset in case of error, so we should try to register our request once again */
+			/*
+			 * Prefetch set is reset in case of error, so we should try to
+			 * register our request once again
+			 */
			goto Retry;
		}
		MyPState->ring_flush = MyPState->ring_unused;
@@ -871,8 +903,10 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
static NeonResponse *
page_server_request(void const *req)
{
-	NeonResponse* resp;
-	do {
+	NeonResponse *resp;
+
+	do
+	{
		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
		MyPState->ring_flush = MyPState->ring_unused;
		consume_prefetch_responses();
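The synchronous request pattern above is: retry send+flush until the request is on the wire (the connection layer reconnects internally), then drain buffered prefetch responses before reading the reply. A self-contained sketch of that shape; stub_send, stub_flush and drain_prefetch_responses are stand-ins for page_server->send and friends, not real APIs.

#include <stdbool.h>
#include <stdio.h>

static int attempts = 0;
static bool stub_send(void)  { return ++attempts >= 2; }	/* fail once */
static bool stub_flush(void) { return true; }
static void drain_prefetch_responses(void) { /* no-op in this sketch */ }

int main(void)
{
	do
	{
		/* keep retrying until both send and flush succeed */
		while (!stub_send() || !stub_flush())
			;
		drain_prefetch_responses();
	} while (false);	/* the real loop repeats until a response arrives */

	printf("request flushed after %d attempts\n", attempts);
	return 0;
}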
@@ -884,7 +918,7 @@ page_server_request(void const *req)

StringInfoData
-nm_pack_request(NeonRequest * msg)
+nm_pack_request(NeonRequest *msg)
{
	StringInfoData s;

@@ -1000,7 +1034,7 @@ nm_unpack_response(StringInfo s)
			/* XXX: should be varlena */
			memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
			pq_getmsgend(s);

			Assert(msg_resp->tag == T_NeonGetPageResponse);

			resp = (NeonResponse *) msg_resp;

@@ -1056,7 +1090,7 @@ nm_unpack_response(StringInfo s)

/* dump to json for debugging / error reporting purposes */
char *
-nm_to_string(NeonMessage * msg)
+nm_to_string(NeonMessage *msg)
{
	StringInfoData s;

@@ -1185,7 +1219,7 @@ nm_to_string(NeonMessage * msg)
 * directly because it skips the logging if the LSN is new enough.
 */
static XLogRecPtr
-log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno,
+log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
				 Page page, bool page_std)
{
	PGAlignedBlock copied_buffer;
@@ -1208,11 +1242,10 @@ PageIsEmptyHeapPage(char *buffer)
}

static void
-neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
#if PG_MAJORVERSION_NUM < 16
-				 char *buffer, bool force)
+neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
#else
-				 const char *buffer, bool force)
+neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const char *buffer, bool force)
#endif
{
	XLogRecPtr lsn = PageGetLSN((Page) buffer);
@@ -1312,24 +1345,23 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void
neon_init(void)
{
	Size prfs_size;

	if (MyPState != NULL)
		return;

-	prfs_size = offsetof(PrefetchState, prf_buffer) + (
-		sizeof(PrefetchRequest) * readahead_buffer_size
-	);
+	prfs_size = offsetof(PrefetchState, prf_buffer) +
+		sizeof(PrefetchRequest) * readahead_buffer_size;

	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);

	MyPState->n_unused = readahead_buffer_size;

	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
										 "NeonSMGR/prefetch",
										 SLAB_DEFAULT_BLOCK_SIZE * 17,
										 PS_GETPAGERESPONSE_SIZE);
	MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
											 "NeonSMGR/errors",
											 ALLOCSET_DEFAULT_SIZES);
	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
@@ -1569,14 +1601,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
	/*
	 * Newly created relation is empty, remember that in the relsize cache.
	 *
-	 * Note that in REDO, this is called to make sure the relation fork exists,
-	 * but it does not truncate the relation. So, we can only update the
-	 * relsize if it didn't exist before.
+	 * Note that in REDO, this is called to make sure the relation fork
+	 * exists, but it does not truncate the relation. So, we can only update
+	 * the relsize if it didn't exist before.
	 *
	 * Also, in redo, we must make sure to update the cached size of the
-	 * relation, as that is the primary source of truth for REDO's
-	 * file length considerations, and as file extension isn't (perfectly)
-	 * logged, we need to take care of that before we hit file size checks.
+	 * relation, as that is the primary source of truth for REDO's file length
+	 * considerations, and as file extension isn't (perfectly) logged, we need
+	 * to take care of that before we hit file size checks.
	 *
	 * FIXME: This is currently not just an optimization, but required for
	 * correctness. Postgres can call smgrnblocks() on the newly-created
@@ -1652,7 +1684,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
#endif
{
	XLogRecPtr lsn;
	BlockNumber n_blocks = 0;

	switch (reln->smgr_relpersistence)
	{

@@ -1693,9 +1725,10 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
	}

	/*
-	 * Usually Postgres doesn't extend relation on more than one page
-	 * (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData
-	 * call smgrextend for destination relation n using size of source relation
+	 * Usually Postgres doesn't extend relation on more than one page (leaving
+	 * holes). But this rule is violated in PG-15 where
+	 * CreateAndCopyRelationData call smgrextend for destination relation n
+	 * using size of source relation
	 */
	n_blocks = neon_nblocks(reln, forkNum);
	while (n_blocks < blkno)

@@ -1716,11 +1749,13 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
	if (IS_LOCAL_REL(reln))
		mdextend(reln, forkNum, blkno, buffer, skipFsync);
#endif

	/*
-	 * smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
-	 * An smgr_write() call will come for the buffer later, after it has been initialized
-	 * with the real page contents, and it is eventually evicted from the buffer cache.
-	 * But we need a valid LSN to the relation metadata update now.
+	 * smgr_extend is often called with an all-zeroes page, so
+	 * lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
+	 * later, after it has been initialized with the real page contents, and
+	 * it is eventually evicted from the buffer cache. But we need a valid LSN
+	 * to the relation metadata update now.
	 */
	if (lsn == InvalidXLogRecPtr)
	{
@@ -1779,9 +1814,9 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend file \"%s\" beyond %u blocks",
						relpath(reln->smgr_rlocator, forkNum),
						InvalidBlockNumber)));

	/* Don't log any pages if we're not allowed to do so. */
	if (!XLogInsertAllowed())
@@ -1863,12 +1898,12 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
bool
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
-	BufferTag tag;
	uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
+	BufferTag tag;

	switch (reln->smgr_relpersistence)
	{
		case 0:				/* probably shouldn't happen, but ignore it */
		case RELPERSISTENCE_PERMANENT:
			break;

@@ -1883,10 +1918,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
	if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
		return false;

-	tag = (BufferTag) {
-		.forkNum = forknum,
-		.blockNum = blocknum
-	};
+	tag.forkNum = forknum;
+	tag.blockNum = blocknum;

	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));

	ring_index = prefetch_register_buffer(tag, NULL, NULL);
@@ -1939,23 +1973,21 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 * While function is defined in the neon extension it's used within neon_test_utils directly.
 * To avoid breaking tests in the runtime please keep function signature in sync.
 */
-void
#if PG_MAJORVERSION_NUM < 16
+void PGDLLEXPORT
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
				 XLogRecPtr request_lsn, bool request_latest, char *buffer)
#else
+void PGDLLEXPORT
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
				 XLogRecPtr request_lsn, bool request_latest, void *buffer)
#endif
{
	NeonResponse *resp;
-	BufferTag buftag;
	uint64 ring_index;
	PrfHashEntry *entry;
	PrefetchRequest *slot;

-	buftag = (BufferTag) {
+	BufferTag buftag =
+	{
		.forkNum = forkNum,
		.blockNum = blkno,
	};
@@ -1964,12 +1996,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

	/*
	 * The redo process does not lock pages that it needs to replay but are
-	 * not in the shared buffers, so a concurrent process may request the
-	 * page after redo has decided it won't redo that page and updated the
-	 * LwLSN for that page.
-	 * If we're in hot standby we need to take care that we don't return
-	 * until after REDO has finished replaying up to that LwLSN, as the page
-	 * should have been locked up to that point.
+	 * not in the shared buffers, so a concurrent process may request the page
+	 * after redo has decided it won't redo that page and updated the LwLSN
+	 * for that page. If we're in hot standby we need to take care that we
+	 * don't return until after REDO has finished replaying up to that LwLSN,
+	 * as the page should have been locked up to that point.
	 *
	 * See also the description on neon_redo_read_buffer_filter below.
	 *

@@ -1977,7 +2008,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	 * concurrent failed read IOs. Those IOs should never have a request_lsn
	 * that is as large as the WAL record we're currently replaying, if it
	 * weren't for the behaviour of the LwLsn cache that uses the highest
	 * value of the LwLsn cache when the entry is not found.
	 */
	if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
		XLogWaitForReplayOf(request_lsn);
@@ -1995,12 +2026,14 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
		ring_index = slot->my_ring_index;
		pgBufferUsage.prefetch.hits += 1;
	}
-	else /* the current prefetch LSN is not large enough, so drop the prefetch */
+	else						/* the current prefetch LSN is not large
+								 * enough, so drop the prefetch */
	{
		/*
		 * We can't drop cache for not-yet-received requested items. It is
-		 * unlikely this happens, but it can happen if prefetch distance is
-		 * large enough and a backend didn't consume all prefetch requests.
+		 * unlikely this happens, but it can happen if prefetch distance
+		 * is large enough and a backend didn't consume all prefetch
+		 * requests.
		 */
		if (slot->status == PRFS_REQUESTED)
		{

@@ -2027,11 +2060,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	else
	{
		/*
-		 * Empty our reference to the prefetch buffer's hash entry.
-		 * When we wait for prefetches, the entry reference is invalidated by
-		 * potential updates to the hash, and when we reconnect to the
-		 * pageserver the prefetch we're waiting for may be dropped,
-		 * in which case we need to retry and take the branch above.
+		 * Empty our reference to the prefetch buffer's hash entry. When
+		 * we wait for prefetches, the entry reference is invalidated by
+		 * potential updates to the hash, and when we reconnect to the
+		 * pageserver the prefetch we're waiting for may be dropped, in
+		 * which case we need to retry and take the branch above.
		 */
		entry = NULL;
	}
@@ -2079,11 +2112,10 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 * neon_read() -- Read the specified block from a relation.
 */
void
-neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
#if PG_MAJORVERSION_NUM < 16
-		  char *buffer)
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer)
#else
-		  void *buffer)
+neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
#endif
{
	bool latest;

@@ -2218,11 +2250,10 @@ hexdump_page(char *page)
 * use mdextend().
 */
void
-neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
#if PG_MAJORVERSION_NUM < 16
-		   char *buffer, bool skipFsync)
+neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
#else
-		   const void *buffer, bool skipFsync)
+neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
#endif
{
	XLogRecPtr lsn;
@@ -2722,9 +2753,90 @@ smgr_init_neon(void)
}

static void
neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
{
	BlockNumber relsize;

	/* Extend the relation if we know its size */
	if (get_cached_relsize(rinfo, forknum, &relsize))
	{
		if (relsize < blkno + 1)
		{
			update_cached_relsize(rinfo, forknum, blkno + 1);
			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
		}
	}
	else
	{
		/*
		 * Size was not cached. We populate the cache now, with the size of
		 * the relation measured after this WAL record is applied.
		 *
		 * This length is later reused when we open the smgr to read the
		 * block, which is fine and expected.
		 */
		NeonResponse *response;
		NeonNblocksResponse *nbresponse;
		NeonNblocksRequest request = {
			.req = (NeonRequest) {
				.lsn = end_recptr,
				.latest = false,
				.tag = T_NeonNblocksRequest,
			},
			.rinfo = rinfo,
			.forknum = forknum,
		};

		response = page_server_request(&request);

		Assert(response->tag == T_NeonNblocksResponse);
		nbresponse = (NeonNblocksResponse *) response;

		relsize = Max(nbresponse->n_blocks, blkno + 1);

		set_cached_relsize(rinfo, forknum, relsize);
		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);

		elog(SmgrTrace, "Set length to %d", relsize);
	}
}
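The discipline this new helper enforces is "grow-only": the cached relation size only ever increases to cover the block the current WAL record touched, and a cold cache costs one getNblocks round-trip. A toy, self-contained rendering of that behaviour (names and values are illustrative; the real code keys the cache by relation and fork):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t BlockNumber;

static BlockNumber cached_size = 0;
static bool cache_valid = false;

static void
extend_rel_size(BlockNumber blkno, BlockNumber size_from_pageserver)
{
	if (cache_valid)
	{
		if (cached_size < blkno + 1)
			cached_size = blkno + 1;			/* grow, never shrink */
	}
	else
	{
		BlockNumber n = size_from_pageserver;	/* one nblocks round-trip */
		cached_size = (n > blkno + 1) ? n : blkno + 1;	/* Max(...) */
		cache_valid = true;
	}
}

int main(void)
{
	extend_rel_size(41, 10);	/* cold cache: Max(10, 42) = 42 */
	extend_rel_size(99, 0);		/* warm cache: grows to 100 */
	printf("cached relsize: %u\n", cached_size);
	return 0;
}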
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)

/*
 * TODO: Maybe it is better to make the corresponding function from freespace.c public?
 */
static BlockNumber
get_fsm_physical_block(BlockNumber heapblk)
{
	BlockNumber pages;
	int leafno;
	int l;

	/*
	 * Calculate the logical page number of the first leaf page below the
	 * given page.
	 */
	leafno = heapblk / SlotsPerFSMPage;

	/* Count upper level nodes required to address the leaf page */
	pages = 0;
	for (l = 0; l < FSM_TREE_DEPTH; l++)
	{
		pages += leafno + 1;
		leafno /= SlotsPerFSMPage;
	}

	/* Turn the page count into 0-based block number */
	return pages - 1;
}
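A worked example of that address arithmetic. SLOTS below stands in for SlotsPerFSMPage (a few thousand for the common 8 kB block size; the exact value comes from freespace.c) and DEPTH for FSM_TREE_DEPTH, which is 3 in that configuration:

#include <stdint.h>
#include <stdio.h>

#define SLOTS 4000	/* assumed stand-in for SlotsPerFSMPage */
#define DEPTH 3		/* FSM_TREE_DEPTH when SLOTS >= 1626 */

static uint32_t
fsm_physical_block(uint32_t heapblk)
{
	uint32_t pages = 0;
	uint32_t leafno = heapblk / SLOTS;

	for (int l = 0; l < DEPTH; l++)
	{
		pages += leafno + 1;	/* pages at this level up to and incl. ours */
		leafno /= SLOTS;
	}
	return pages - 1;			/* 0-based block number */
}

int main(void)
{
	/* heap blocks 0..SLOTS-1 share leaf 0: one page per level -> block 2 */
	printf("heapblk 0 -> FSM block %u\n", fsm_physical_block(0));
	/* the next leaf adds one more leaf page -> block 3 */
	printf("heapblk %u -> FSM block %u\n", (unsigned) SLOTS,
		   fsm_physical_block(SLOTS));
	return 0;
}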
/*
 * Return whether we can skip the redo for this block.
 *
 * The conditions for skipping the IO are:
 *
 * - The block is not in the shared buffers, and

@@ -2763,13 +2875,12 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
	XLogRecPtr end_recptr = record->EndRecPtr;
	NRelFileInfo rinfo;
	ForkNumber forknum;
	BlockNumber blkno;
	BufferTag tag;
	uint32 hash;
	LWLock *partitionLock;
	Buffer buffer;
	bool no_redo_needed;
-	BlockNumber relsize;

	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
		return true;

@@ -2783,8 +2894,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

	/*
	 * Out of an abundance of caution, we always run redo on shared catalogs,
-	 * regardless of whether the block is stored in shared buffers.
-	 * See also this function's top comment.
+	 * regardless of whether the block is stored in shared buffers. See also
+	 * this function's top comment.
	 */
	if (!OidIsValid(NInfoGetDbOid(rinfo)))
		return false;

@@ -2810,8 +2921,9 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
	/* In both cases set lwlsn past this WAL record */
	SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);

-	/* we don't have the buffer in memory, update lwLsn past this record,
-	 * also evict page from file cache
+	/*
+	 * we don't have the buffer in memory, update lwLsn past this record, also
+	 * evict page from file cache
	 */
	if (no_redo_needed)
		lfc_evict(rinfo, forknum, blkno);
@@ -2819,49 +2931,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

	LWLockRelease(partitionLock);

-	/* Extend the relation if we know its size */
-	if (get_cached_relsize(rinfo, forknum, &relsize))
+	neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
+	if (forknum == MAIN_FORKNUM)
	{
-		if (relsize < blkno + 1)
-		{
-			update_cached_relsize(rinfo, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-		}
+		neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
	}
-	else
-	{
-		/*
-		 * Size was not cached. We populate the cache now, with the size of the
-		 * relation measured after this WAL record is applied.
-		 *
-		 * This length is later reused when we open the smgr to read the block,
-		 * which is fine and expected.
-		 */
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.req = (NeonRequest) {
-				.lsn = end_recptr,
-				.latest = false,
-				.tag = T_NeonNblocksRequest,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
-
-		response = page_server_request(&request);
-
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
-
-		Assert(nbresponse->n_blocks > blkno);
-
-		set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
-		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-
-		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
-	}

	return no_redo_needed;
}
@@ -178,7 +178,7 @@ WalProposerFree(WalProposer *wp)
	if (wp->propTermHistory.entries != NULL)
		pfree(wp->propTermHistory.entries);
	wp->propTermHistory.entries = NULL;

	pfree(wp);
}

@@ -275,7 +275,7 @@ WalProposerPoll(WalProposer *wp)
							 wp->config->safekeeper_connection_timeout))
		{
			walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
						sk->host, sk->port, FormatSafekeeperState(sk->state), wp->config->safekeeper_connection_timeout);
			ShutdownConnection(sk);
		}
	}

@@ -395,7 +395,7 @@ ResetConnection(Safekeeper *sk)
		 * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS
		 */
		walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s",
					sk->host, sk->port, wp->api.conn_error_message(sk));

		/*
		 * Even though the connection failed, we still need to clean up the

@@ -489,7 +489,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
			 */
		case SS_OFFLINE:
			walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline",
						sk->host, sk->port);
			break;				/* actually unreachable, but prevents
								 * -Wimplicit-fallthrough */

@@ -525,7 +525,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
			 */
		case SS_VOTING:
			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
						sk->port, FormatSafekeeperState(sk->state));
			ResetConnection(sk);
			return;

@@ -554,7 +554,7 @@ AdvancePollState(Safekeeper *sk, uint32 events)
			 */
		case SS_IDLE:
			walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host,
						sk->port, FormatSafekeeperState(sk->state));
			ResetConnection(sk);
			return;

@@ -580,7 +580,7 @@ HandleConnectionEvent(Safekeeper *sk)
	{
		case WP_CONN_POLLING_OK:
			walprop_log(LOG, "connected with node %s:%s", sk->host,
						sk->port);
			sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp);

			/*

@@ -604,7 +604,7 @@ HandleConnectionEvent(Safekeeper *sk)

		case WP_CONN_POLLING_FAILED:
			walprop_log(WARNING, "failed to connect to node '%s:%s': %s",
						sk->host, sk->port, wp->api.conn_error_message(sk));

			/*
			 * If connecting failed, we don't want to restart the connection

@@ -641,7 +641,7 @@ SendStartWALPush(Safekeeper *sk)
	if (!wp->api.conn_send_query(sk, "START_WAL_PUSH"))
	{
		walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s",
					sk->host, sk->port, wp->api.conn_error_message(sk));
		ShutdownConnection(sk);
		return;
	}

@@ -678,7 +678,7 @@ RecvStartWALPushResult(Safekeeper *sk)

		case WP_EXEC_FAILED:
			walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s",
						sk->host, sk->port, wp->api.conn_error_message(sk));
			ShutdownConnection(sk);
			return;

@@ -689,7 +689,7 @@ RecvStartWALPushResult(Safekeeper *sk)
			 */
		case WP_EXEC_UNEXPECTED_SUCCESS:
			walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution",
						sk->host, sk->port);
			ShutdownConnection(sk);
			return;
	}

@@ -758,8 +758,8 @@ RecvAcceptorGreeting(Safekeeper *sk)
	{
		/* Another compute with higher term is running. */
		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
					sk->host, sk->port,
					sk->greetResponse.term, wp->propTerm);
	}

	/*

@@ -817,11 +817,11 @@ RecvVoteResponse(Safekeeper *sk)
		return;

	walprop_log(LOG,
				"got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X",
				sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory),
				LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
				LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn),
				LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn));

	/*
	 * In case of acceptor rejecting our vote, bail out, but only if either it

@@ -832,8 +832,8 @@ RecvVoteResponse(Safekeeper *sk)
		(sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum))
	{
		walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "",
					sk->host, sk->port,
					sk->voteResponse.term, wp->propTerm);
	}
	Assert(sk->voteResponse.term == wp->propTerm);

@@ -877,10 +877,10 @@ HandleElectedProposer(WalProposer *wp)
	if (wp->truncateLsn < wp->propEpochStartLsn)
	{
		walprop_log(LOG,
					"start recovery because truncateLsn=%X/%X is not "
					"equal to epochStartLsn=%X/%X",
					LSN_FORMAT_ARGS(wp->truncateLsn),
					LSN_FORMAT_ARGS(wp->propEpochStartLsn));
		/* Perform recovery */
		if (!wp->api.recovery_download(&wp->safekeeper[wp->donor], wp->greetRequest.timeline, wp->truncateLsn, wp->propEpochStartLsn))
			walprop_log(FATAL, "Failed to recover state");

@@ -990,9 +990,9 @@ DetermineEpochStartLsn(WalProposer *wp)
				wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn)
			{
				walprop_log(WARNING,
							"inconsistent timelineStartLsn: current %X/%X, received %X/%X",
							LSN_FORMAT_ARGS(wp->timelineStartLsn),
							LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn));
			}
			wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn;
		}

@@ -1038,11 +1038,11 @@ DetermineEpochStartLsn(WalProposer *wp)
	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn;

	walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
				wp->quorum,
				wp->propTerm,
				LSN_FORMAT_ARGS(wp->propEpochStartLsn),
				wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
				LSN_FORMAT_ARGS(wp->truncateLsn));

	/*
	 * Ensure the basebackup we are running (at RedoStartLsn) matches LSN

@@ -1070,18 +1070,18 @@ DetermineEpochStartLsn(WalProposer *wp)
				walprop_shared->mineLastElectedTerm)))
		{
			walprop_log(PANIC,
						"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
						LSN_FORMAT_ARGS(wp->propEpochStartLsn),
						LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp)));
		}
	}
	walprop_shared->mineLastElectedTerm = wp->propTerm;
	}

	/*
-	 * WalProposer has just elected itself and initialized history, so
-	 * we can call election callback. Usually it updates truncateLsn to
-	 * fetch WAL for logical replication.
+	 * WalProposer has just elected itself and initialized history, so we can
+	 * call election callback. Usually it updates truncateLsn to fetch WAL for
+	 * logical replication.
	 */
	wp->api.after_election(wp);
}

@@ -1155,8 +1155,8 @@ SendProposerElected(Safekeeper *sk)
			sk->startStreamingAt = wp->truncateLsn;

			walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X",
						sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn),
						LSN_FORMAT_ARGS(sk->startStreamingAt));
		}
	}
	else

@@ -1190,8 +1190,8 @@ SendProposerElected(Safekeeper *sk)

	lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0;
	walprop_log(LOG,
				"sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X",
				sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn));

	resetStringInfo(&sk->outbuf);
	pq_sendint64_le(&sk->outbuf, msg.tag);

@@ -1355,11 +1355,11 @@ SendAppendRequests(Safekeeper *sk)
		PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn);

		walprop_log(DEBUG2, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s",
					req->endLsn - req->beginLsn,
					LSN_FORMAT_ARGS(req->beginLsn),
					LSN_FORMAT_ARGS(req->endLsn),
					LSN_FORMAT_ARGS(req->commitLsn),
					LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port);

		resetStringInfo(&sk->outbuf);

@@ -1398,8 +1398,8 @@ SendAppendRequests(Safekeeper *sk)

			case PG_ASYNC_WRITE_FAIL:
				walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
							sk->host, sk->port, FormatSafekeeperState(sk->state),
							wp->api.conn_error_message(sk));
				ShutdownConnection(sk);
				return false;
			default:

@@ -1438,17 +1438,17 @@ RecvAppendResponses(Safekeeper *sk)
			break;

		walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s",
					sk->appendResponse.term,
					LSN_FORMAT_ARGS(sk->appendResponse.flushLsn),
					LSN_FORMAT_ARGS(sk->appendResponse.commitLsn),
					sk->host, sk->port);

		if (sk->appendResponse.term > wp->propTerm)
		{
			/* Another compute with higher term is running. */
			walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
						sk->host, sk->port,
						sk->appendResponse.term, wp->propTerm);
		}

		readAnything = true;

@@ -1493,7 +1493,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
			/* read value length */
			rf->currentClusterSize = pq_getmsgint64(reply_message);
			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
						rf->currentClusterSize);
		}
		else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
		{

@@ -1501,7 +1501,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
			/* read value length */
			rf->last_received_lsn = pq_getmsgint64(reply_message);
			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
						LSN_FORMAT_ARGS(rf->last_received_lsn));
		}
		else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
		{

@@ -1509,7 +1509,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
			/* read value length */
			rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
						LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
		}
		else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
		{

@@ -1517,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
			/* read value length */
			rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
			walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
						LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
		}
		else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
		{

@@ -1530,7 +1530,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese
				/* Copy because timestamptz_to_str returns a static buffer */
				replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
				walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
							rf->replytime, replyTimeStr);

				pfree(replyTimeStr);
			}

@@ -1700,8 +1700,8 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size)

		case PG_ASYNC_READ_FAIL:
			walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host,
						sk->port, FormatSafekeeperState(sk->state),
						wp->api.conn_error_message(sk));
			ShutdownConnection(sk);
			return false;
	}

@@ -1740,7 +1740,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
	if (tag != anymsg->tag)
	{
		walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host,
					sk->port, FormatSafekeeperState(sk->state));
		ResetConnection(sk);
		return false;
	}

@@ -1816,8 +1816,8 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes
	if (!wp->api.conn_blocking_write(sk, msg, msg_size))
	{
		walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
					sk->host, sk->port, FormatSafekeeperState(sk->state),
					wp->api.conn_error_message(sk));
		ShutdownConnection(sk);
		return false;
	}

@@ -1863,8 +1863,8 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta
			return false;
		case PG_ASYNC_WRITE_FAIL:
			walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s",
						sk->host, sk->port, FormatSafekeeperState(sk->state),
						wp->api.conn_error_message(sk));
			ShutdownConnection(sk);
			return false;
		default:

@@ -1902,8 +1902,8 @@ AsyncFlush(Safekeeper *sk)
			return false;
		case -1:
			walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s",
						sk->host, sk->port, FormatSafekeeperState(sk->state),
						wp->api.conn_error_message(sk));
			ResetConnection(sk);
			return false;
		default:

@@ -2008,7 +2008,7 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk)
		 * and then an assertion that's guaranteed to fail.
		 */
		walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]",
					FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk->state));
		Assert(events_ok_for_state);
	}
}

@@ -2111,7 +2111,7 @@ FormatEvents(WalProposer *wp, uint32 events)
	if (events & (~all_flags))
	{
		walprop_log(WARNING, "Event formatting found unexpected component %d",
					events & (~all_flags));
		return_str[6] = '*';
		return_str[7] = '\0';
	}
@@ -356,7 +356,8 @@ typedef struct Safekeeper

	/* postgres-specific fields */
#ifndef WALPROPOSER_LIB

	/*
	 * postgres protocol connection to the WAL acceptor
	 *

@@ -374,17 +375,18 @@ typedef struct Safekeeper
	 * Position in wait event set. Equal to -1 if no event
	 */
	int eventPos;
#endif

	/* WalProposer library specifics */
#ifdef WALPROPOSER_LIB

	/*
	 * Buffer for incoming messages. Usually Rust vector is stored here.
	 * Caller is responsible for freeing the buffer.
	 */
	StringInfoData inbuf;
#endif
} Safekeeper;

/* Re-exported PostgresPollingStatusType */

@@ -472,7 +474,7 @@ typedef struct walproposer_api
	WalProposerConnStatusType (*conn_status) (Safekeeper *sk);

	/* Start the connection, aka PQconnectStart. */
	void (*conn_connect_start) (Safekeeper *sk);

	/* Poll an asynchronous connection, aka PQconnectPoll. */
	WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);

@@ -490,7 +492,7 @@ typedef struct walproposer_api
	void (*conn_finish) (Safekeeper *sk);

	/*
	 * Try to read CopyData message from the safekeeper, aka PQgetCopyData.
	 *
	 * On success, the data is placed in *buf. It is valid until the next call
	 * to this function.

@@ -510,7 +512,7 @@ typedef struct walproposer_api
	void (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);

	/* Allocate WAL reader. */
	void (*wal_reader_allocate) (Safekeeper *sk);

	/* Deallocate event set. */
	void (*free_event_set) (WalProposer *wp);

@@ -572,7 +574,7 @@ typedef struct walproposer_api
	/*
	 * Called right after the proposer was elected, but before it started
	 * recovery and sent ProposerElected message to the safekeepers.
	 *
	 * Used by logical replication to update truncateLsn.
	 */
	void (*after_election) (WalProposer *wp);

@@ -626,10 +628,10 @@ typedef struct WalProposerConfig
	uint64 systemId;

	/* Will be passed to safekeepers in greet request. */
	TimeLineID pgTimeline;

#ifdef WALPROPOSER_LIB
	void *callback_data;
#endif
} WalProposerConfig;

@@ -710,10 +712,11 @@ extern void WalProposerPoll(WalProposer *wp);
extern void WalProposerFree(WalProposer *wp);

-#define WPEVENT 1337 /* special log level for walproposer internal events */
+#define WPEVENT 1337			/* special log level for walproposer internal
+								 * events */

#ifdef WALPROPOSER_LIB
-void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...);
+extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
#else
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
@@ -9,8 +9,9 @@
#include "utils/datetime.h"
#include "miscadmin.h"

-void ExceptionalCondition(const char *conditionName,
-						  const char *fileName, int lineNumber)
+void
+ExceptionalCondition(const char *conditionName,
+					 const char *fileName, int lineNumber)
{
	fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
			fileName, lineNumber, conditionName);

@@ -169,17 +170,18 @@ timestamptz_to_str(TimestampTz t)

bool
TimestampDifferenceExceeds(TimestampTz start_time,
						   TimestampTz stop_time,
						   int msec)
{
	TimestampTz diff = stop_time - start_time;

	return (diff >= msec * INT64CONST(1000));
}
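A usage sketch for the helper above, in the style of the walproposer's safekeeper_connection_timeout check earlier in this diff. TimestampTz is microseconds since the Postgres epoch; the values here are made up for illustration.

#include <stdint.h>
#include <stdio.h>

typedef int64_t TimestampTz;	/* microseconds, as in the function above */

static int
timestamp_difference_exceeds(TimestampTz start, TimestampTz stop, int msec)
{
	return (stop - start) >= msec * (int64_t) 1000;
}

int main(void)
{
	TimestampTz last_msg = 1000000;		/* t = 1 s */
	TimestampTz now = 12500000;			/* t = 12.5 s */
	int timeout_ms = 10000;

	if (timestamp_difference_exceeds(last_msg, now, timeout_ms))
		printf("no messages for >%d ms: reset the connection\n", timeout_ms);
	return 0;
}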
void
-WalProposerLibLog(WalProposer *wp, int elevel, char *fmt, ...)
+WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...)
{
	char buf[1024];
	va_list args;

	fmt = _(fmt);
@@ -637,8 +637,8 @@ walprop_connect_start(Safekeeper *sk)
	 */
	sk->conn = palloc(sizeof(WalProposerConn));
	sk->conn->pg_conn = pg_conn;
-	sk->conn->is_nonblocking = false;	/* connections always start in blocking
-										 * mode */
+	sk->conn->is_nonblocking = false;	/* connections always start in
										 * blocking mode */
	sk->conn->recvbuf = NULL;
}

@@ -1291,10 +1291,11 @@ XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
	/*
	 * Apart from walproposer, basebackup LSN page is also written out by
	 * postgres itself which writes WAL only in pages, and in basebackup it is
-	 * inherently dummy (only safekeepers have historic WAL). Update WAL buffers
-	 * here to avoid dummy page overwriting correct one we download here. Ugly,
-	 * but alternatives are about the same ugly. We won't need that if we switch
-	 * to on-demand WAL download from safekeepers, without writing to disk.
+	 * inherently dummy (only safekeepers have historic WAL). Update WAL
+	 * buffers here to avoid dummy page overwriting correct one we download
+	 * here. Ugly, but alternatives are about the same ugly. We won't need
+	 * that if we switch to on-demand WAL download from safekeepers, without
+	 * writing to disk.
	 *
	 * https://github.com/neondatabase/neon/issues/5749
	 */

@@ -1681,17 +1682,17 @@ walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
static void
walprop_pg_after_election(WalProposer *wp)
{
-	FILE* f;
-	XLogRecPtr lrRestartLsn;
+	FILE	   *f;
+	XLogRecPtr	lrRestartLsn;

-	/* We don't need to do anything in syncSafekeepers mode.*/
+	/* We don't need to do anything in syncSafekeepers mode. */
	if (wp->config->syncSafekeepers)
		return;

	/*
-	 * If there are active logical replication subscriptions we need
-	 * to provide enough WAL for their WAL senders based on the position
-	 * of their replication slots.
+	 * If there are active logical replication subscriptions we need to
+	 * provide enough WAL for their WAL senders based on the position of
+	 * their replication slots.
	 */
	f = fopen("restart.lsn", "rb");
	if (f != NULL && !wp->config->syncSafekeepers)

@@ -1700,8 +1701,12 @@ walprop_pg_after_election(WalProposer *wp)
		fclose(f);
		if (lrRestartLsn != InvalidXLogRecPtr)
		{
-			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
-			/* start from the beginning of the segment to fetch page headers verified by XLogReader */
+			elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
+
+			/*
+			 * start from the beginning of the segment to fetch page headers
+			 * verified by XLogReader
+			 */
			lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
			wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
		}
|
||||
|
||||
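The hunk above rounds the logical replication restart LSN down to the start of
its WAL segment so that XLogReader sees valid page headers. A minimal sketch of
that arithmetic, assuming the default 16 MiB segment size; XLogSegmentOffset(lsn,
size) is effectively lsn % size since segment sizes are powers of two, and the
function name below is illustrative:

    // Round an LSN down to the start of its WAL segment.
    fn segment_start(lsn: u64, wal_segment_size: u64) -> u64 {
        lsn - (lsn % wal_segment_size)
    }

    fn main() {
        let wal_segment_size = 16 * 1024 * 1024; // 16 MiB, the default
        let restart_lsn = 0x16D6F30_u64;         // hypothetical slot position
        // 0x16D6F30 lies in the second segment, which starts at 0x1000000.
        assert_eq!(segment_start(restart_lsn, wal_segment_size), 0x1000000);
    }
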
121
poetry.lock
generated
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.

[[package]]
name = "aiohttp"
@@ -98,18 +98,18 @@ speedups = ["Brotli", "aiodns", "brotlicffi"]

[[package]]
name = "aiopg"
version = "1.3.4"
version = "1.4.0"
description = "Postgres integration with asyncio."
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7"
files = [
    {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"},
    {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"},
    {file = "aiopg-1.4.0-py3-none-any.whl", hash = "sha256:aea46e8aff30b039cfa818e6db4752c97656e893fc75e5a5dc57355a9e9dedbd"},
    {file = "aiopg-1.4.0.tar.gz", hash = "sha256:116253bef86b4d954116716d181e9a0294037f266718b2e1c9766af995639d71"},
]

[package.dependencies]
async-timeout = ">=3.0,<5.0"
psycopg2-binary = ">=2.8.4"
psycopg2-binary = ">=2.9.5"

[package.extras]
sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
@@ -160,64 +160,71 @@ pluggy = ">=0.4.0"

[[package]]
name = "async-timeout"
version = "4.0.2"
version = "4.0.3"
description = "Timeout context manager for asyncio programs"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7"
files = [
    {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
    {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
    {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
    {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"},
]

[[package]]
name = "asyncpg"
version = "0.27.0"
version = "0.29.0"
description = "An asyncio PostgreSQL driver"
optional = false
python-versions = ">=3.7.0"
python-versions = ">=3.8.0"
files = [
    {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"},
    {file = "asyncpg-0.27.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:20b596d8d074f6f695c13ffb8646d0b6bb1ab570ba7b0cfd349b921ff03cfc1e"},
    {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a6206210c869ebd3f4eb9e89bea132aefb56ff3d1b7dd7e26b102b17e27bbb1"},
    {file = "asyncpg-0.27.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7a94c03386bb95456b12c66026b3a87d1b965f0f1e5733c36e7229f8f137747"},
    {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bfc3980b4ba6f97138b04f0d32e8af21d6c9fa1f8e6e140c07d15690a0a99279"},
    {file = "asyncpg-0.27.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9654085f2b22f66952124de13a8071b54453ff972c25c59b5ce1173a4283ffd9"},
    {file = "asyncpg-0.27.0-cp310-cp310-win32.whl", hash = "sha256:879c29a75969eb2722f94443752f4720d560d1e748474de54ae8dd230bc4956b"},
    {file = "asyncpg-0.27.0-cp310-cp310-win_amd64.whl", hash = "sha256:ab0f21c4818d46a60ca789ebc92327d6d874d3b7ccff3963f7af0a21dc6cff52"},
    {file = "asyncpg-0.27.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:18f77e8e71e826ba2d0c3ba6764930776719ae2b225ca07e014590545928b576"},
    {file = "asyncpg-0.27.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c2232d4625c558f2aa001942cac1d7952aa9f0dbfc212f63bc754277769e1ef2"},
    {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a3a4ff43702d39e3c97a8786314123d314e0f0e4dabc8367db5b665c93914de"},
    {file = "asyncpg-0.27.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ccddb9419ab4e1c48742457d0c0362dbdaeb9b28e6875115abfe319b29ee225d"},
    {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:768e0e7c2898d40b16d4ef7a0b44e8150db3dd8995b4652aa1fe2902e92c7df8"},
    {file = "asyncpg-0.27.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:609054a1f47292a905582a1cfcca51a6f3f30ab9d822448693e66fdddde27920"},
    {file = "asyncpg-0.27.0-cp311-cp311-win32.whl", hash = "sha256:8113e17cfe236dc2277ec844ba9b3d5312f61bd2fdae6d3ed1c1cdd75f6cf2d8"},
    {file = "asyncpg-0.27.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb71211414dd1eeb8d31ec529fe77cff04bf53efc783a5f6f0a32d84923f45cf"},
    {file = "asyncpg-0.27.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4750f5cf49ed48a6e49c6e5aed390eee367694636c2dcfaf4a273ca832c5c43c"},
    {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:eca01eb112a39d31cc4abb93a5aef2a81514c23f70956729f42fb83b11b3483f"},
    {file = "asyncpg-0.27.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5710cb0937f696ce303f5eed6d272e3f057339bb4139378ccecafa9ee923a71c"},
    {file = "asyncpg-0.27.0-cp37-cp37m-win_amd64.whl", hash = "sha256:71cca80a056ebe19ec74b7117b09e650990c3ca535ac1c35234a96f65604192f"},
    {file = "asyncpg-0.27.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4bb366ae34af5b5cabc3ac6a5347dfb6013af38c68af8452f27968d49085ecc0"},
    {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16ba8ec2e85d586b4a12bcd03e8d29e3d99e832764d6a1d0b8c27dbbe4a2569d"},
    {file = "asyncpg-0.27.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d20dea7b83651d93b1eb2f353511fe7fd554752844523f17ad30115d8b9c8cd6"},
    {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e56ac8a8237ad4adec97c0cd4728596885f908053ab725e22900b5902e7f8e69"},
    {file = "asyncpg-0.27.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bf21ebf023ec67335258e0f3d3ad7b91bb9507985ba2b2206346de488267cad0"},
    {file = "asyncpg-0.27.0-cp38-cp38-win32.whl", hash = "sha256:69aa1b443a182b13a17ff926ed6627af2d98f62f2fe5890583270cc4073f63bf"},
    {file = "asyncpg-0.27.0-cp38-cp38-win_amd64.whl", hash = "sha256:62932f29cf2433988fcd799770ec64b374a3691e7902ecf85da14d5e0854d1ea"},
    {file = "asyncpg-0.27.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fddcacf695581a8d856654bc4c8cfb73d5c9df26d5f55201722d3e6a699e9629"},
    {file = "asyncpg-0.27.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7d8585707ecc6661d07367d444bbaa846b4e095d84451340da8df55a3757e152"},
    {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:975a320baf7020339a67315284a4d3bf7460e664e484672bd3e71dbd881bc692"},
    {file = "asyncpg-0.27.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2232ebae9796d4600a7819fc383da78ab51b32a092795f4555575fc934c1c89d"},
    {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:88b62164738239f62f4af92567b846a8ef7cf8abf53eddd83650603de4d52163"},
    {file = "asyncpg-0.27.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:eb4b2fdf88af4fb1cc569781a8f933d2a73ee82cd720e0cb4edabbaecf2a905b"},
    {file = "asyncpg-0.27.0-cp39-cp39-win32.whl", hash = "sha256:8934577e1ed13f7d2d9cea3cc016cc6f95c19faedea2c2b56a6f94f257cea672"},
    {file = "asyncpg-0.27.0-cp39-cp39-win_amd64.whl", hash = "sha256:1b6499de06fe035cf2fa932ec5617ed3f37d4ebbf663b655922e105a484a6af9"},
    {file = "asyncpg-0.27.0.tar.gz", hash = "sha256:720986d9a4705dd8a40fdf172036f5ae787225036a7eb46e704c45aa8f62c054"},
    {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"},
    {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"},
    {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"},
    {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"},
    {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"},
    {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"},
    {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"},
    {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"},
    {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"},
    {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"},
    {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"},
    {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"},
    {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"},
    {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"},
    {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"},
    {file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"},
    {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"},
    {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"},
    {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"},
    {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"},
    {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"},
    {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"},
    {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"},
    {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"},
    {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"},
    {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"},
    {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"},
    {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"},
    {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"},
    {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"},
    {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"},
    {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"},
    {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"},
    {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"},
    {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"},
    {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"},
    {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"},
    {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"},
    {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"},
    {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"},
    {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"},
]

[package.dependencies]
async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""}

[package.extras]
dev = ["Cython (>=0.29.24,<0.30.0)", "Sphinx (>=4.1.2,<4.2.0)", "flake8 (>=5.0.4,<5.1.0)", "pytest (>=6.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)", "uvloop (>=0.15.3)"]
docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
test = ["flake8 (>=5.0.4,<5.1.0)", "uvloop (>=0.15.3)"]
docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"]
test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"]

[[package]]
name = "attrs"
@@ -2476,6 +2483,16 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2697,4 +2714,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b"

@@ -4,6 +4,10 @@ version = "0.1.0"
edition.workspace = true
license.workspace = true

[features]
default = []
testing = []

[dependencies]
anyhow.workspace = true
async-trait.workspace = true
@@ -57,6 +61,7 @@ thiserror.workspace = true
tls-listener.workspace = true
tokio-postgres.workspace = true
tokio-rustls.workspace = true
tokio-util.workspace = true
tokio = { workspace = true, features = ["signal"] }
tracing-opentelemetry.workspace = true
tracing-subscriber.workspace = true
@@ -69,13 +74,12 @@ webpki-roots.workspace = true
x509-parser.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
postgres-protocol.workspace = true
smol_str.workspace = true

workspace_hack.workspace = true
tokio-util.workspace = true

[dev-dependencies]
rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true
postgres-protocol.workspace = true

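The hunk above adds a `testing` Cargo feature, which the Rust diffs below lean
on via `#[cfg(feature = "testing")]` to compile the mock console backend only
on demand. A minimal sketch of that gating pattern (function name is
illustrative; without the feature declared in Cargo.toml the cfg simply
evaluates to false):

    // Compiled only when built with `cargo build --features testing`.
    #[cfg(feature = "testing")]
    pub fn mock_backend_available() -> bool {
        true
    }

    // Fallback for regular builds, so callers compile either way.
    #[cfg(not(feature = "testing"))]
    pub fn mock_backend_available() -> bool {
        false
    }

    fn main() {
        println!("mock backend available: {}", mock_backend_available());
    }
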
@@ -3,9 +3,11 @@ mod hacks;
mod link;

pub use link::LinkAuthError;
use smol_str::SmolStr;
use tokio_postgres::config::AuthKeys;

use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::auth::validate_password_and_exchange;
use crate::console::errors::GetAuthInfoError;
use crate::console::provider::AuthInfo;
use crate::console::AuthSecret;
@@ -24,31 +26,12 @@ use crate::{
};
use futures::TryFutureExt;
use std::borrow::Cow;
use std::net::IpAddr;
use std::ops::ControlFlow;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{error, info, warn};

/// A product of successful authentication.
pub struct AuthSuccess<T> {
    /// Did we send [`pq_proto::BeMessage::AuthenticationOk`] to client?
    pub reported_auth_ok: bool,
    /// Something to be considered a positive result.
    pub value: T,
}

impl<T> AuthSuccess<T> {
    /// Very similar to [`std::option::Option::map`].
    /// Maps [`AuthSuccess<T>`] to [`AuthSuccess<R>`] by applying
    /// a function to a contained value.
    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
        AuthSuccess {
            reported_auth_ok: self.reported_auth_ok,
            value: f(self.value),
        }
    }
}

/// This type serves two purposes:
///
/// * When `T` is `()`, it's just a regular auth backend selector
@@ -61,9 +44,11 @@ pub enum BackendType<'a, T> {
    /// Current Cloud API (V2).
    Console(Cow<'a, console::provider::neon::Api>, T),
    /// Local mock of Cloud API (V2).
    #[cfg(feature = "testing")]
    Postgres(Cow<'a, console::provider::mock::Api>, T),
    /// Authentication via a web browser.
    Link(Cow<'a, url::ApiUrl>),
    #[cfg(test)]
    /// Test backend.
    Test(&'a dyn TestBackend),
}
@@ -78,8 +63,10 @@ impl std::fmt::Display for BackendType<'_, ()> {
        use BackendType::*;
        match self {
            Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
            #[cfg(feature = "testing")]
            Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
            Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
            #[cfg(test)]
            Test(_) => fmt.debug_tuple("Test").finish(),
        }
    }
@@ -92,8 +79,10 @@ impl<T> BackendType<'_, T> {
        use BackendType::*;
        match self {
            Console(c, x) => Console(Cow::Borrowed(c), x),
            #[cfg(feature = "testing")]
            Postgres(c, x) => Postgres(Cow::Borrowed(c), x),
            Link(c) => Link(Cow::Borrowed(c)),
            #[cfg(test)]
            Test(x) => Test(*x),
        }
    }
@@ -107,8 +96,10 @@ impl<'a, T> BackendType<'a, T> {
        use BackendType::*;
        match self {
            Console(c, x) => Console(c, f(x)),
            #[cfg(feature = "testing")]
            Postgres(c, x) => Postgres(c, f(x)),
            Link(c) => Link(c),
            #[cfg(test)]
            Test(x) => Test(x),
        }
    }
@@ -121,51 +112,87 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
        use BackendType::*;
        match self {
            Console(c, x) => x.map(|x| Console(c, x)),
            #[cfg(feature = "testing")]
            Postgres(c, x) => x.map(|x| Postgres(c, x)),
            Link(c) => Ok(Link(c)),
            #[cfg(test)]
            Test(x) => Ok(Test(x)),
        }
    }
}

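The last hunk above is a hand-rolled transpose: it hoists a Result out of the
enum, exactly as Option::transpose does for Option. A toy standalone version of
the same combinator (types here are illustrative, not the proxy's):

    enum Selector<T> {
        Console(T),
        Link,
    }

    impl<T, E> Selector<Result<T, E>> {
        // Turn Selector<Result<T, E>> into Result<Selector<T>, E>.
        fn transpose(self) -> Result<Selector<T>, E> {
            match self {
                Selector::Console(x) => x.map(Selector::Console),
                Selector::Link => Ok(Selector::Link),
            }
        }
    }

    fn main() {
        let ok: Selector<Result<u32, String>> = Selector::Console(Ok(42));
        assert!(ok.transpose().is_ok());
        let err: Selector<Result<u32, String>> = Selector::Console(Err("boom".into()));
        assert!(err.transpose().is_err());
    }
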
pub enum ComputeCredentials {
pub struct ComputeCredentials<T> {
    pub info: ComputeUserInfo,
    pub keys: T,
}

pub struct ComputeUserInfoNoEndpoint {
    pub user: SmolStr,
    pub peer_addr: IpAddr,
    pub cache_key: SmolStr,
}

pub struct ComputeUserInfo {
    pub endpoint: SmolStr,
    pub inner: ComputeUserInfoNoEndpoint,
}

pub enum ComputeCredentialKeys {
    #[cfg(feature = "testing")]
    Password(Vec<u8>),
    AuthKeys(AuthKeys),
}

impl TryFrom<ClientCredentials> for ComputeUserInfo {
    // user name
    type Error = ComputeUserInfoNoEndpoint;

    fn try_from(creds: ClientCredentials) -> Result<Self, Self::Error> {
        let inner = ComputeUserInfoNoEndpoint {
            user: creds.user,
            peer_addr: creds.peer_addr,
            cache_key: creds.cache_key,
        };
        match creds.project {
            None => Err(inner),
            Some(endpoint) => Ok(ComputeUserInfo { endpoint, inner }),
        }
    }
}

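A note on the TryFrom above: the "error" type is not a failure, it deliberately
carries the no-endpoint variant of the user info, so every caller is forced to
handle the password-hack path explicitly. A toy version of the same trick
(names are illustrative):

    struct Creds {
        user: String,
        project: Option<String>,
    }

    struct UserInfo {
        endpoint: String,
        user: String,
    }

    impl TryFrom<Creds> for UserInfo {
        // "Error" here just means "no endpoint yet", not a real failure.
        type Error = String;

        fn try_from(c: Creds) -> Result<Self, Self::Error> {
            match c.project {
                Some(endpoint) => Ok(UserInfo { endpoint, user: c.user }),
                None => Err(c.user),
            }
        }
    }

    fn main() {
        let creds = Creds { user: "john_doe".into(), project: None };
        match UserInfo::try_from(creds) {
            Ok(info) => println!("endpoint known: {}", info.endpoint),
            Err(user) => println!("no endpoint for {user}; fall back to the password hack"),
        }
    }
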
/// True to its name, this function encapsulates our current auth trade-offs.
/// Here, we choose the appropriate auth flow based on circumstances.
async fn auth_quirks_creds(
///
/// All authentication flows will emit an AuthenticationOk message if successful.
async fn auth_quirks(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
    creds: ClientCredentials,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
    let maybe_success = if creds.project.is_none() {
        // Password will be checked by the compute node later.
        Some(hacks::password_hack(creds, client, latency_timer).await?)
    } else {
        None
    let (info, unauthenticated_password) = match creds.try_into() {
        Err(info) => {
            let res = hacks::password_hack_no_authentication(info, client, latency_timer).await?;
            (res.info, Some(res.keys))
        }
        Ok(info) => (info, None),
    };

    // Password hack should set the project name.
    // TODO: make `creds.project` more type-safe.
    assert!(creds.project.is_some());
    info!("fetching user's authentication info");
    // TODO(anna): this will slow down both "hacks" below; we probably need a cache.
    let AuthInfo {
        secret,
        allowed_ips,
    } = api.get_auth_info(extra, creds).await?;
    } = api.get_auth_info(extra, &info).await?;

    // check allowed list
    if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) {
    if !check_peer_addr_is_in_list(&info.inner.peer_addr, &allowed_ips) {
        return Err(auth::AuthError::ip_address_not_allowed());
    }
    let secret = secret.unwrap_or_else(|| {
@@ -173,36 +200,49 @@ async fn auth_quirks_creds(
        // prevent malicious probing (possible due to missing protocol steps).
        // This mocked secret will never lead to successful authentication.
        info!("authentication info not found, mocking it");
        AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
        AuthSecret::Scram(scram::ServerSecret::mock(&info.inner.user, rand::random()))
    });

    if let Some(success) = maybe_success {
        return Ok(success);
    if let Some(password) = unauthenticated_password {
        let auth_outcome = validate_password_and_exchange(&password, secret)?;
        let keys = match auth_outcome {
            crate::sasl::Outcome::Success(key) => key,
            crate::sasl::Outcome::Failure(reason) => {
                info!("auth backend failed with an error: {reason}");
                return Err(auth::AuthError::auth_failed(&*info.inner.user));
            }
        };

        // we have authenticated the password
        client.write_message_noflush(&pq_proto::BeMessage::AuthenticationOk)?;

        return Ok(ComputeCredentials { info, keys });
    }

    // -- the remaining flows are self-authenticating --

    // Perform cleartext auth if we're allowed to do that.
    // Currently, we use it for websocket connections (latency).
    if allow_cleartext {
        // Password will be checked by the compute node later.
        return hacks::cleartext_hack(client, latency_timer).await;
        return hacks::authenticate_cleartext(info, client, latency_timer, secret).await;
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
    classic::authenticate(creds, client, config, latency_timer, secret).await
    classic::authenticate(info, client, config, latency_timer, secret).await
}

/// True to its name, this function encapsulates our current auth trade-offs.
/// Here, we choose the appropriate auth flow based on circumstances.
async fn auth_quirks(
/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache),
/// but only if authentication was successful.
async fn auth_and_wake_compute(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
    creds: ClientCredentials,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    let auth_stuff = auth_quirks_creds(
) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
    let compute_credentials = auth_quirks(
        api,
        extra,
        creds,
@@ -215,7 +255,7 @@ async fn auth_quirks(

    let mut num_retries = 0;
    let mut node = loop {
        let wake_res = api.wake_compute(extra, creds).await;
        let wake_res = api.wake_compute(extra, &compute_credentials.info).await;
        match handle_try_wake(wake_res, num_retries) {
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
@@ -232,27 +272,27 @@ async fn auth_quirks(
        tokio::time::sleep(wait_duration).await;
    };

    match auth_stuff.value {
        ComputeCredentials::Password(password) => node.config.password(password),
        ComputeCredentials::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
    match compute_credentials.keys {
        #[cfg(feature = "testing")]
        ComputeCredentialKeys::Password(password) => node.config.password(password),
        ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
    };

    Ok(AuthSuccess {
        reported_auth_ok: auth_stuff.reported_auth_ok,
        value: node,
    })
    Ok((node, compute_credentials.info))
}

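The wake loop above retries handle_try_wake with a bounded backoff before
giving up on the compute node. Stripped of the proxy's types, the control flow
is the classic retry-with-backoff sketch below; max_retries and the delays are
illustrative, not the proxy's actual policy:

    use std::time::Duration;

    fn retry_with_backoff<T, E>(
        mut op: impl FnMut() -> Result<T, E>,
        max_retries: u32,
    ) -> Result<T, E> {
        let mut num_retries = 0;
        loop {
            match op() {
                Ok(v) => return Ok(v),
                // Give up once the retry budget is spent.
                Err(e) if num_retries >= max_retries => return Err(e),
                Err(_) => {
                    num_retries += 1;
                    // Exponential backoff: 200ms, 400ms, 800ms, ...
                    std::thread::sleep(Duration::from_millis(100 << num_retries));
                }
            }
        }
    }

    fn main() {
        let mut attempts = 0;
        let res = retry_with_backoff(
            || {
                attempts += 1;
                if attempts < 3 { Err("not awake yet") } else { Ok("node") }
            },
            5,
        );
        assert_eq!(res, Ok("node"));
    }
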
impl BackendType<'_, ClientCredentials<'_>> {
impl<'a> BackendType<'a, ClientCredentials> {
    /// Get compute endpoint name from the credentials.
    pub fn get_endpoint(&self) -> Option<String> {
    pub fn get_endpoint(&self) -> Option<SmolStr> {
        use BackendType::*;

        match self {
            Console(_, creds) => creds.project.clone(),
            #[cfg(feature = "testing")]
            Postgres(_, creds) => creds.project.clone(),
            Link(_) => Some("link".to_owned()),
            Test(_) => Some("test".to_owned()),
            Link(_) => Some("link".into()),
            #[cfg(test)]
            Test(_) => Some("test".into()),
        }
    }

@@ -261,9 +301,11 @@ impl BackendType<'_, ClientCredentials<'_>> {
        use BackendType::*;

        match self {
            Console(_, creds) => creds.user,
            Postgres(_, creds) => creds.user,
            Console(_, creds) => &creds.user,
            #[cfg(feature = "testing")]
            Postgres(_, creds) => &creds.user,
            Link(_) => "link",
            #[cfg(test)]
            Test(_) => "test",
        }
    }
@@ -271,26 +313,25 @@ impl BackendType<'_, ClientCredentials<'_>> {
    /// Authenticate the client via the requested backend, possibly using credentials.
    #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
    pub async fn authenticate(
        &mut self,
        self,
        extra: &ConsoleReqExtra<'_>,
        client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
        latency_timer: &mut LatencyTimer,
    ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
    ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
        use BackendType::*;

        let res = match self {
            Console(api, creds) => {
                info!(
                    user = creds.user,
                    user = &*creds.user,
                    project = creds.project(),
                    "performing authentication using the console"
                );

                let api = api.as_ref();
                auth_quirks(
                    api,
                let (cache_info, user_info) = auth_and_wake_compute(
                    &*api,
                    extra,
                    creds,
                    client,
@@ -298,18 +339,19 @@ impl BackendType<'_, ClientCredentials<'_>> {
                    config,
                    latency_timer,
                )
                .await?
                .await?;
                (cache_info, BackendType::Console(api, user_info))
            }
            #[cfg(feature = "testing")]
            Postgres(api, creds) => {
                info!(
                    user = creds.user,
                    user = &*creds.user,
                    project = creds.project(),
                    "performing authentication using a local postgres instance"
                );

                let api = api.as_ref();
                auth_quirks(
                    api,
                let (cache_info, user_info) = auth_and_wake_compute(
                    &*api,
                    extra,
                    creds,
                    client,
@@ -317,16 +359,21 @@ impl BackendType<'_, ClientCredentials<'_>> {
                    config,
                    latency_timer,
                )
                .await?
                .await?;
                (cache_info, BackendType::Postgres(api, user_info))
            }
            // NOTE: this auth backend doesn't use client credentials.
            Link(url) => {
                info!("performing link authentication");

                link::authenticate(url, client)
                    .await?
                    .map(CachedNodeInfo::new_uncached)
                let node_info = link::authenticate(&url, client).await?;

                (
                    CachedNodeInfo::new_uncached(node_info),
                    BackendType::Link(url),
                )
            }
            #[cfg(test)]
            Test(_) => {
                unreachable!("this function should never be called in the test backend")
            }
@@ -335,7 +382,9 @@ impl BackendType<'_, ClientCredentials<'_>> {
        info!("user successfully authenticated");
        Ok(res)
    }
}

impl BackendType<'_, ComputeUserInfo> {
    pub async fn get_allowed_ips(
        &self,
        extra: &ConsoleReqExtra<'_>,
@@ -343,8 +392,10 @@ impl BackendType<'_, ClientCredentials<'_>> {
        use BackendType::*;
        match self {
            Console(api, creds) => api.get_allowed_ips(extra, creds).await,
            #[cfg(feature = "testing")]
            Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
            Link(_) => Ok(Arc::new(vec![])),
            #[cfg(test)]
            Test(x) => x.get_allowed_ips(),
        }
    }
@@ -359,8 +410,10 @@ impl BackendType<'_, ClientCredentials<'_>> {

        match self {
            Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            #[cfg(feature = "testing")]
            Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await,
            Link(_) => Ok(None),
            #[cfg(test)]
            Test(x) => x.wake_compute().map(Some),
        }
    }

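Note how authenticate now takes self by value and returns a BackendType with a
different type parameter: consuming the enum is the standard Rust way to move
its payload from one type to another. A toy version of that re-tagging pattern
(names are illustrative, not the proxy's):

    enum AuthBackend<T> {
        Console(T),
        Link,
    }

    impl AuthBackend<String /* raw credentials */> {
        // Consuming self lets each arm rebuild the enum with a new payload.
        fn authenticate(self) -> (u32 /* node id */, AuthBackend<usize /* user id */>) {
            match self {
                AuthBackend::Console(creds) => (1, AuthBackend::Console(creds.len())),
                AuthBackend::Link => (2, AuthBackend::Link),
            }
        }
    }

    fn main() {
        let (node, backend) = AuthBackend::Console("john_doe".to_string()).authenticate();
        assert_eq!(node, 1);
        assert!(matches!(backend, AuthBackend::Console(8)));
    }
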
@@ -1,6 +1,6 @@
use super::{AuthSuccess, ComputeCredentials};
use super::{ComputeCredentials, ComputeUserInfo};
use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    auth::{self, backend::ComputeCredentialKeys, AuthFlow},
    compute,
    config::AuthenticationConfig,
    console::AuthSecret,
@@ -12,14 +12,15 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};

pub(super) async fn authenticate(
    creds: &ClientCredentials<'_>,
    creds: ComputeUserInfo,
    client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
    secret: AuthSecret,
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    let flow = AuthFlow::new(client);
    let scram_keys = match secret {
        #[cfg(feature = "testing")]
        AuthSecret::Md5(_) => {
            info!("auth endpoint chooses MD5");
            return Err(auth::AuthError::bad_auth_method("MD5"));
@@ -53,7 +54,7 @@ pub(super) async fn authenticate(
            sasl::Outcome::Success(key) => key,
            sasl::Outcome::Failure(reason) => {
                info!("auth backend failed with an error: {reason}");
                return Err(auth::AuthError::auth_failed(creds.user));
                return Err(auth::AuthError::auth_failed(&*creds.inner.user));
            }
        };

@@ -64,9 +65,9 @@ pub(super) async fn authenticate(
        }
    };

    Ok(AuthSuccess {
        reported_auth_ok: false,
        value: ComputeCredentials::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
    Ok(ComputeCredentials {
        info: creds,
        keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256(
            scram_keys,
        )),
    })

@@ -1,7 +1,11 @@
use super::{AuthSuccess, ComputeCredentials};
use super::{
    ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint,
};
use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    auth::{self, AuthFlow},
    console::AuthSecret,
    proxy::LatencyTimer,
    sasl,
    stream::{self, Stream},
};
use tokio::io::{AsyncRead, AsyncWrite};
@@ -11,35 +15,42 @@ use tracing::{info, warn};
/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
/// These properties are beneficial for serverless JS workers, so we
/// use this mechanism for websocket connections.
pub async fn cleartext_hack(
pub async fn authenticate_cleartext(
    info: ComputeUserInfo,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    latency_timer: &mut LatencyTimer,
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    secret: AuthSecret,
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
    warn!("cleartext auth flow override is enabled, proceeding");

    // pause the timer while we communicate with the client
    let _paused = latency_timer.pause();

    let password = AuthFlow::new(client)
        .begin(auth::CleartextPassword)
    let auth_outcome = AuthFlow::new(client)
        .begin(auth::CleartextPassword(secret))
        .await?
        .authenticate()
        .await?;

    // Report tentative success; compute node will check the password anyway.
    Ok(AuthSuccess {
        reported_auth_ok: false,
        value: ComputeCredentials::Password(password),
    })
    let keys = match auth_outcome {
        sasl::Outcome::Success(key) => key,
        sasl::Outcome::Failure(reason) => {
            info!("auth backend failed with an error: {reason}");
            return Err(auth::AuthError::auth_failed(&*info.inner.user));
        }
    };

    Ok(ComputeCredentials { info, keys })
}

/// Workaround for clients which don't provide an endpoint (project) name.
/// Very similar to [`cleartext_hack`], but there's a specific password format.
pub async fn password_hack(
    creds: &mut ClientCredentials<'_>,
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
/// and passwords are not yet validated (we don't know how to validate them!)
pub async fn password_hack_no_authentication(
    info: ComputeUserInfoNoEndpoint,
    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    latency_timer: &mut LatencyTimer,
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
) -> auth::Result<ComputeCredentials<Vec<u8>>> {
    warn!("project not specified, resorting to the password hack auth flow");

    // pause the timer while we communicate with the client
@@ -48,15 +59,17 @@ pub async fn password_hack(
    let payload = AuthFlow::new(client)
        .begin(auth::PasswordHack)
        .await?
        .authenticate()
        .get_password()
        .await?;

    info!(project = &payload.endpoint, "received missing parameter");
    creds.project = Some(payload.endpoint);
    info!(project = &*payload.endpoint, "received missing parameter");

    // Report tentative success; compute node will check the password anyway.
    Ok(AuthSuccess {
        reported_auth_ok: false,
        value: ComputeCredentials::Password(payload.password),
    Ok(ComputeCredentials {
        info: ComputeUserInfo {
            inner: info,
            endpoint: payload.endpoint,
        },
        keys: payload.password,
    })
}

@@ -1,4 +1,3 @@
use super::AuthSuccess;
use crate::{
    auth, compute,
    console::{self, provider::NodeInfo},
@@ -57,7 +56,7 @@ pub fn new_psql_session_id() -> String {
pub(super) async fn authenticate(
    link_uri: &reqwest::Url,
    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<AuthSuccess<NodeInfo>> {
) -> auth::Result<NodeInfo> {
    let psql_session_id = new_psql_session_id();
    let span = info_span!("link", psql_session_id = &psql_session_id);
    let greeting = hello_message(link_uri, &psql_session_id);
@@ -102,12 +101,9 @@ pub(super) async fn authenticate(
        config.password(password.as_ref());
    }

    Ok(AuthSuccess {
        reported_auth_ok: true,
        value: NodeInfo {
            config,
            aux: db_info.aux,
            allow_self_signed_compute: false, // caller may override
        },
    Ok(NodeInfo {
        config,
        aux: db_info.aux,
        allow_self_signed_compute: false, // caller may override
    })
}

@@ -3,14 +3,12 @@
use crate::{
    auth::password_hack::parse_endpoint_param,
    error::UserFacingError,
    proxy::{neon_options, NUM_CONNECTION_ACCEPTED_BY_SNI},
    proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
use std::{
    collections::HashSet,
    net::{IpAddr, SocketAddr},
};
use smol_str::SmolStr;
use std::{collections::HashSet, net::IpAddr};
use thiserror::Error;
use tracing::{info, warn};

@@ -24,7 +22,7 @@ pub enum ClientCredsParseError {
        SNI ('{}') and project option ('{}').",
        .domain, .option,
    )]
    InconsistentProjectNames { domain: String, option: String },
    InconsistentProjectNames { domain: SmolStr, option: SmolStr },

    #[error(
        "Common name inferred from SNI ('{}') is not known",
@@ -33,7 +31,7 @@ pub enum ClientCredsParseError {
    UnknownCommonName { cn: String },

    #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")]
    MalformedProjectName(String),
    MalformedProjectName(SmolStr),
}

impl UserFacingError for ClientCredsParseError {}
@@ -41,34 +39,34 @@ impl UserFacingError for ClientCredsParseError {}
/// Various client credentials which we use for authentication.
/// Note that we don't store any kind of client key or password here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ClientCredentials<'a> {
    pub user: &'a str,
pub struct ClientCredentials {
    pub user: SmolStr,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
    pub project: Option<String>,
    pub project: Option<SmolStr>,

    pub cache_key: String,
    pub peer_addr: SocketAddr,
    pub cache_key: SmolStr,
    pub peer_addr: IpAddr,
}

impl ClientCredentials<'_> {
impl ClientCredentials {
    #[inline]
    pub fn project(&self) -> Option<&str> {
        self.project.as_deref()
    }
}

impl<'a> ClientCredentials<'a> {
impl ClientCredentials {
    pub fn parse(
        params: &'a StartupMessageParams,
        params: &StartupMessageParams,
        sni: Option<&str>,
        common_names: Option<HashSet<String>>,
        peer_addr: SocketAddr,
        peer_addr: IpAddr,
    ) -> Result<Self, ClientCredsParseError> {
        use ClientCredsParseError::*;

        // Some parameters are stored in the startup message.
        let get_param = |key| params.get(key).ok_or(MissingKey(key));
        let user = get_param("user")?;
        let user = get_param("user")?.into();

        // Project name might be passed via PG's command-line options.
        let project_option = params
@@ -82,7 +80,7 @@ impl<'a> ClientCredentials<'a> {
                    .at_most_one()
                    .ok()?
            })
            .map(|name| name.to_string());
            .map(|name| name.into());

        let project_from_domain = if let Some(sni_str) = sni {
            if let Some(cn) = common_names {
@@ -121,7 +119,7 @@ impl<'a> ClientCredentials<'a> {
        }
        .transpose()?;

        info!(user, project = project.as_deref(), "credentials");
        info!(%user, project = project.as_deref(), "credentials");
        if sni.is_some() {
            info!("Connection with sni");
            NUM_CONNECTION_ACCEPTED_BY_SNI
@@ -142,8 +140,9 @@ impl<'a> ClientCredentials<'a> {
        let cache_key = format!(
            "{}{}",
            project.as_deref().unwrap_or(""),
            neon_options(params).unwrap_or("".to_string())
        );
            neon_options_str(params)
        )
        .into();

        Ok(Self {
            user,
@@ -206,10 +205,10 @@ fn project_name_valid(name: &str) -> bool {
    name.chars().all(|c| c.is_alphanumeric() || c == '-')
}

fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<SmolStr> {
    sni.strip_suffix(common_name)?
        .strip_suffix('.')
        .map(str::to_owned)
        .map(SmolStr::from)
}

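subdomain_from_sni above is small enough to check by hand: it peels the common
name and its separating dot off the SNI host. A standalone copy with a few
sanity asserts (using String for simplicity rather than SmolStr):

    fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
        sni.strip_suffix(common_name)?
            .strip_suffix('.')
            .map(str::to_owned)
    }

    fn main() {
        // "foo.localhost" against common name "localhost" yields "foo".
        assert_eq!(
            subdomain_from_sni("foo.localhost", "localhost").as_deref(),
            Some("foo")
        );
        // A bare common name has no subdomain (no trailing dot to strip).
        assert_eq!(subdomain_from_sni("localhost", "localhost"), None);
        // A different domain doesn't match at all.
        assert_eq!(subdomain_from_sni("foo.example.com", "localhost"), None);
    }
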
#[cfg(test)]
|
||||
@@ -221,7 +220,7 @@ mod tests {
|
||||
fn parse_bare_minimum() -> anyhow::Result<()> {
|
||||
// According to postgresql, only `user` should be required.
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project, None);
|
||||
@@ -236,7 +235,7 @@ mod tests {
|
||||
("database", "world"), // should be ignored
|
||||
("foo", "bar"), // should be ignored
|
||||
]);
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project, None);
|
||||
@@ -251,7 +250,7 @@ mod tests {
|
||||
let sni = Some("foo.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||
@@ -267,7 +266,7 @@ mod tests {
|
||||
("options", "-ckey=1 project=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("bar"));
|
||||
@@ -282,7 +281,7 @@ mod tests {
|
||||
("options", "-ckey=1 endpoint=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("bar"));
|
||||
@@ -300,7 +299,7 @@ mod tests {
|
||||
),
|
||||
]);
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert!(creds.project.is_none());
|
||||
@@ -315,7 +314,7 @@ mod tests {
|
||||
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
|
||||
]);
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert!(creds.project.is_none());
|
||||
@@ -330,7 +329,7 @@ mod tests {
|
||||
let sni = Some("baz.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.project.as_deref(), Some("baz"));
|
||||
@@ -344,13 +343,13 @@ mod tests {
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.a.com");
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("p1"));
|
||||
|
||||
let common_names = Some(["a.com".into(), "b.com".into()].into());
|
||||
let sni = Some("p1.b.com");
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("p1"));
|
||||
|
||||
@@ -365,7 +364,7 @@ mod tests {
|
||||
let sni = Some("second.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
@@ -384,7 +383,7 @@ mod tests {
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["example.com".into()].into());
|
||||
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
|
||||
.expect_err("should fail");
|
||||
match err {
|
||||
@@ -404,13 +403,10 @@ mod tests {
|
||||
|
||||
let sni = Some("project.localhost");
|
||||
let common_names = Some(["localhost".into()].into());
|
||||
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
|
||||
let peer_addr = IpAddr::from([127, 0, 0, 1]);
|
||||
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
|
||||
assert_eq!(creds.project.as_deref(), Some("project"));
|
||||
assert_eq!(
|
||||
creds.cache_key,
|
||||
"projectneon_endpoint_type:read_write neon_lsn:0/2"
|
||||
);
|
||||
assert_eq!(creds.cache_key, "projectendpoint_type:read_write lsn:0/2");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
//! Main authentication flow.
|
||||
|
||||
use super::{AuthErrorImpl, PasswordHackPayload};
|
||||
use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload};
|
||||
use crate::{
|
||||
config::TlsServerEndPoint,
|
||||
console::AuthSecret,
|
||||
sasl, scram,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
@@ -50,7 +51,7 @@ impl AuthMethod for PasswordHack {
|
||||
|
||||
/// Use clear-text password auth called `password` in docs
|
||||
/// <https://www.postgresql.org/docs/current/auth-password.html>
|
||||
pub struct CleartextPassword;
|
||||
pub struct CleartextPassword(pub AuthSecret);
|
||||
|
||||
impl AuthMethod for CleartextPassword {
|
||||
#[inline(always)]
|
||||
@@ -98,7 +99,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<PasswordHackPayload> {
|
||||
pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
@@ -117,13 +118,19 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<Vec<u8>> {
|
||||
pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
|
||||
|
||||
Ok(password.to_vec())
|
||||
let outcome = validate_password_and_exchange(password, self.state.0)?;
|
||||
|
||||
if let sasl::Outcome::Success(_) = &outcome {
|
||||
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
|
||||
}
|
||||
|
||||
Ok(outcome)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,6 +159,49 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
))
|
||||
.await?;
|
||||
|
||||
if let sasl::Outcome::Success(_) = &outcome {
|
||||
self.stream.write_message_noflush(&Be::AuthenticationOk)?;
|
||||
}
|
||||
|
||||
Ok(outcome)
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn validate_password_and_exchange(
|
||||
password: &[u8],
|
||||
secret: AuthSecret,
|
||||
) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
match secret {
|
||||
#[cfg(feature = "testing")]
|
||||
AuthSecret::Md5(_) => {
|
||||
// test only
|
||||
Ok(sasl::Outcome::Success(ComputeCredentialKeys::Password(
|
||||
password.to_owned(),
|
||||
)))
|
||||
}
|
||||
// perform scram authentication as both client and server to validate the keys
|
||||
AuthSecret::Scram(scram_secret) => {
|
||||
use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256};
|
||||
let sasl_client = ScramSha256::new(password, ChannelBinding::unsupported());
|
||||
let outcome = crate::scram::exchange(
|
||||
&scram_secret,
|
||||
sasl_client,
|
||||
crate::config::TlsServerEndPoint::Undefined,
|
||||
)?;
|
||||
|
||||
let client_key = match outcome {
|
||||
sasl::Outcome::Success(client_key) => client_key,
|
||||
sasl::Outcome::Failure(reason) => return Ok(sasl::Outcome::Failure(reason)),
|
||||
};
|
||||
|
||||
let keys = crate::compute::ScramKeys {
|
||||
client_key: client_key.as_bytes(),
|
||||
server_key: scram_secret.server_key.as_bytes(),
|
||||
};
|
||||
|
||||
Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys(
|
||||
tokio_postgres::config::AuthKeys::ScramSha256(keys),
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,9 +4,10 @@
|
||||
//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
|
||||
|
||||
use bstr::ByteSlice;
|
||||
use smol_str::SmolStr;
|
||||
|
||||
pub struct PasswordHackPayload {
|
||||
pub endpoint: String,
|
||||
pub endpoint: SmolStr,
|
||||
pub password: Vec<u8>,
|
||||
}
|
||||
|
||||
@@ -18,7 +19,7 @@ impl PasswordHackPayload {
|
||||
if let Some((endpoint, password)) = bytes.split_once_str(sep) {
|
||||
let endpoint = endpoint.to_str().ok()?;
|
||||
return Some(Self {
|
||||
endpoint: parse_endpoint_param(endpoint)?.to_owned(),
|
||||
endpoint: parse_endpoint_param(endpoint)?.into(),
|
||||
password: password.to_owned(),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ use std::{net::SocketAddr, sync::Arc};
use futures::future::Either;
use itertools::Itertools;
use proxy::config::TlsServerEndPoint;
use proxy::proxy::run_until_cancelled;
use tokio::net::TcpListener;

use anyhow::{anyhow, bail, ensure, Context};

@@ -20,7 +21,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken;
use utils::{project_git_version, sentry_init::init_sentry};

use tracing::{error, info, warn, Instrument};
use tracing::{error, info, Instrument};

project_git_version!(GIT_VERSION);

@@ -151,63 +152,39 @@ async fn task_main(
    // will be inherited by all accepted client sockets.
    socket2::SockRef::from(&listener).set_keepalive(true)?;

    let mut connections = tokio::task::JoinSet::new();
    let connections = tokio_util::task::task_tracker::TaskTracker::new();

    loop {
        tokio::select! {
            accept_result = listener.accept() => {
                let (socket, peer_addr) = accept_result?;
    while let Some(accept_result) =
        run_until_cancelled(listener.accept(), &cancellation_token).await
    {
        let (socket, peer_addr) = accept_result?;

                let session_id = uuid::Uuid::new_v4();
                let tls_config = Arc::clone(&tls_config);
                let dest_suffix = Arc::clone(&dest_suffix);
        let session_id = uuid::Uuid::new_v4();
        let tls_config = Arc::clone(&tls_config);
        let dest_suffix = Arc::clone(&dest_suffix);

                connections.spawn(
                    async move {
                        socket
                            .set_nodelay(true)
                            .context("failed to set socket option")?;
        connections.spawn(
            async move {
                socket
                    .set_nodelay(true)
                    .context("failed to set socket option")?;

                        info!(%peer_addr, "serving");
                        handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
                    }
                    .unwrap_or_else(|e| {
                        // Acknowledge that the task has finished with an error.
                        error!("per-client task finished with an error: {e:#}");
                    })
                    .instrument(tracing::info_span!("handle_client", ?session_id))
                );
                info!(%peer_addr, "serving");
                handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
            }
            // Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
            // If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
            // This only counts for this loop and it will be enabled again on next `select!`.
            //
            // Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
            // When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
            // not get called again, even if there are more connections to remove.
            Some(res) = connections.join_next() => {
                if let Err(e) = res {
                    if !e.is_panic() && !e.is_cancelled() {
                        warn!("unexpected error from joined connection task: {e:?}");
                    }
                }
            }
            _ = cancellation_token.cancelled() => {
                drop(listener);
                break;
            }
        }
            .unwrap_or_else(|e| {
                // Acknowledge that the task has finished with an error.
                error!("per-client task finished with an error: {e:#}");
            })
            .instrument(tracing::info_span!("handle_client", ?session_id)),
        );
    }

    // Drain connections
    info!("waiting for all client connections to finish");
    while let Some(res) = connections.join_next().await {
        if let Err(e) = res {
            if !e.is_panic() && !e.is_cancelled() {
                warn!("unexpected error from joined connection task: {e:?}");
            }
        }
    }
    connections.close();
    drop(listener);

    connections.wait().await;

    info!("all client connections have finished");
    Ok(())
}

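Both this router and the main proxy loop (later in the diff) move from a `tokio::select!` over a `JoinSet` to the simpler accept-until-cancelled shape: stop accepting when the token fires, then `close()` the tracker and `wait()` for in-flight tasks. A minimal runnable sketch of that shutdown pattern; the sleep durations and task bodies are made up for the demo, and `run_until_cancelled` is written with `tokio::select!` rather than the `futures::future::select` used in the diff:

    use tokio_util::sync::CancellationToken;
    use tokio_util::task::TaskTracker;

    /// Run `f` until it completes or the token fires, whichever is first.
    async fn run_until_cancelled<F: std::future::Future>(
        f: F,
        cancel: &CancellationToken,
    ) -> Option<F::Output> {
        tokio::select! {
            out = f => Some(out),
            _ = cancel.cancelled() => None,
        }
    }

    #[tokio::main]
    async fn main() {
        let cancel = CancellationToken::new();
        let tracker = TaskTracker::new();

        // Pretend "accept loop": spawn per-client tasks until cancelled.
        let accept = {
            let cancel = cancel.clone();
            let tracker = tracker.clone();
            async move {
                let mut n = 0u32;
                while let Some(()) = run_until_cancelled(
                    tokio::time::sleep(std::time::Duration::from_millis(10)),
                    &cancel,
                )
                .await
                {
                    n += 1;
                    tracker.spawn(async move {
                        // A per-client task that outlives the accept loop.
                        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
                        println!("client {n} done");
                    });
                }
            }
        };

        tokio::spawn({
            let cancel = cancel.clone();
            async move {
                tokio::time::sleep(std::time::Duration::from_millis(35)).await;
                cancel.cancel(); // request shutdown
            }
        });

        accept.await;
        tracker.close(); // no new tasks; `wait` can now complete
        tracker.wait().await; // drain in-flight clients
        println!("all client tasks finished");
    }

Unlike `JoinSet::join_next()`, a `TaskTracker` needs no reaping arm in the accept loop, which is exactly the subtle `select!` branch-disabling hazard the long comment above warns about.
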
@@ -30,6 +30,7 @@ use clap::{Parser, ValueEnum};
#[derive(Clone, Debug, ValueEnum)]
enum AuthBackend {
    Console,
    #[cfg(feature = "testing")]
    Postgres,
    Link,
}

@@ -103,7 +104,7 @@ struct ProxyCliArgs {
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
    /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_dynamic_rate_limiter: bool,
    /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
    #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]

@@ -289,6 +290,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            let api = console::provider::neon::Api::new(endpoint, caches, locks);
            auth::BackendType::Console(Cow::Owned(api), ())
        }
        #[cfg(feature = "testing")]
        AuthBackend::Postgres => {
            let url = args.auth_endpoint.parse()?;
            let api = console::provider::mock::Api::new(url);

@@ -1,6 +1,6 @@
use crate::{
    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
    error::UserFacingError, proxy::is_neon_param,
    error::UserFacingError, proxy::neon_option,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;

@@ -275,7 +275,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
        .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
        .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none())
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

@@ -1,9 +1,10 @@
#[cfg(feature = "testing")]
pub mod mock;
pub mod neon;

use super::messages::MetricsAuxInfo;
use crate::{
    auth::ClientCredentials,
    auth::backend::ComputeUserInfo,
    cache::{timed_lru, TimedLru},
    compute, scram,
};

@@ -200,11 +201,23 @@ pub struct ConsoleReqExtra<'a> {
    pub session_id: uuid::Uuid,
    /// Name of client application, if set.
    pub application_name: Option<&'a str>,
    pub options: Option<&'a str>,
    pub options: Vec<(String, String)>,
}

impl<'a> ConsoleReqExtra<'a> {
    // https://swagger.io/docs/specification/serialization/ DeepObject format
    // paramName[prop1]=value1&paramName[prop2]=value2&....
    pub fn options_as_deep_object(&self) -> Vec<(String, String)> {
        self.options
            .iter()
            .map(|(k, v)| (format!("options[{}]", k), v.to_string()))
            .collect()
    }
}
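`options_as_deep_object` flattens the option pairs into the OpenAPI DeepObject query style before they are appended to the wake-compute request. A runnable sketch of the transformation in isolation; the option names and values here are invented for illustration:

    /// Flatten (key, value) options into DeepObject-style query parameters:
    /// options[key]=value, one pair per option.
    fn options_as_deep_object(options: &[(String, String)]) -> Vec<(String, String)> {
        options
            .iter()
            .map(|(k, v)| (format!("options[{}]", k), v.clone()))
            .collect()
    }

    fn main() {
        // Hypothetical neon_* options extracted from the startup packet.
        let options = vec![
            ("lsn".to_owned(), "0/15E2F68".to_owned()),
            ("endpoint_type".to_owned(), "read_only".to_owned()),
        ];
        // Serialized into the query string as
        // options[lsn]=0/15E2F68&options[endpoint_type]=read_only
        for (k, v) in options_as_deep_object(&options) {
            println!("{k}={v}");
        }
    }
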
/// Auth secret which is managed by the cloud.
pub enum AuthSecret {
    #[cfg(feature = "testing")]
    /// Md5 hash of user's password.
    Md5([u8; 16]),

@@ -247,20 +260,20 @@ pub trait Api {
    async fn get_auth_info(
        &self,
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, errors::GetAuthInfoError>;

    async fn get_allowed_ips(
        &self,
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
        creds: &ComputeUserInfo,
    ) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;

    /// Wake up the compute node and return the corresponding connection info.
    async fn wake_compute(
        &self,
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
        creds: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}

@@ -6,7 +6,7 @@ use super::{
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
    AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUrl};
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use async_trait::async_trait;
use futures::TryFutureExt;
use thiserror::Error;

@@ -47,7 +47,7 @@ impl Api {

    async fn do_get_auth_info(
        &self,
        creds: &ClientCredentials<'_>,
        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        let (secret, allowed_ips) = async {
            // Perhaps we could persist this connection, but then we'd have to

@@ -60,7 +60,7 @@ impl Api {
            let secret = match get_execute_postgres_query(
                &client,
                "select rolpassword from pg_catalog.pg_authid where rolname = $1",
                &[&creds.user],
                &[&&*creds.inner.user],
                "rolpassword",
            )
            .await?

@@ -71,14 +71,14 @@ impl Api {
                    secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
                }
                None => {
                    warn!("user '{}' does not exist", creds.user);
                    warn!("user '{}' does not exist", creds.inner.user);
                    None
                }
            };
            let allowed_ips = match get_execute_postgres_query(
                &client,
                "select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
                &[&creds.project.clone().unwrap_or_default().as_str()],
                &[&creds.endpoint.as_str()],
                "allowed_ips",
            )
            .await?

@@ -145,7 +145,7 @@ impl super::Api for Api {
    async fn get_auth_info(
        &self,
        _extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        self.do_get_auth_info(creds).await
    }

@@ -153,7 +153,7 @@ impl super::Api for Api {
    async fn get_allowed_ips(
        &self,
        _extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
        creds: &ComputeUserInfo,
    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
        Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
    }

@@ -162,7 +162,7 @@ impl super::Api for Api {
    async fn wake_compute(
        &self,
        _extra: &ConsoleReqExtra<'_>,
        _creds: &ClientCredentials,
        _creds: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
        self.do_wake_compute()
            .map_ok(CachedNodeInfo::new_uncached)

@@ -5,12 +5,8 @@ use super::{
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
    ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::{
    auth::ClientCredentials,
    compute, http,
    proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
    scram,
};
use crate::proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
use async_trait::async_trait;
use futures::TryFutureExt;
use itertools::Itertools;

@@ -53,7 +49,7 @@ impl Api {
    async fn do_get_auth_info(
        &self,
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials<'_>,
        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        let request_id = uuid::Uuid::new_v4().to_string();
        async {

@@ -65,8 +61,8 @@ impl Api {
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
                    ("project", Some(creds.project().expect("impossible"))),
                    ("role", Some(creds.user)),
                    ("project", Some(&creds.endpoint)),
                    ("role", Some(&creds.inner.user)),
                ])
                .build()?;

@@ -106,12 +102,11 @@ impl Api {
    async fn do_wake_compute(
        &self,
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials<'_>,
        creds: &ComputeUserInfo,
    ) -> Result<NodeInfo, WakeComputeError> {
        let project = creds.project().expect("impossible");
        let request_id = uuid::Uuid::new_v4().to_string();
        async {
            let request = self
            let mut request_builder = self
                .endpoint
                .get("proxy_wake_compute")
                .header("X-Request-ID", &request_id)

@@ -119,10 +114,15 @@ impl Api {
                .query(&[("session_id", extra.session_id)])
                .query(&[
                    ("application_name", extra.application_name),
                    ("project", Some(project)),
                    ("options", extra.options),
                ])
                .build()?;
                    ("project", Some(&creds.endpoint)),
                ]);

            request_builder = if extra.options.is_empty() {
                request_builder
            } else {
                request_builder.query(&extra.options_as_deep_object())
            };
            let request = request_builder.build()?;

            info!(url = request.url().as_str(), "sending http request");
            let start = Instant::now();

@@ -162,7 +162,7 @@ impl super::Api for Api {
    async fn get_auth_info(
        &self,
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
        creds: &ComputeUserInfo,
    ) -> Result<AuthInfo, GetAuthInfoError> {
        self.do_get_auth_info(extra, creds).await
    }

@@ -170,9 +170,9 @@ impl super::Api for Api {
    async fn get_allowed_ips(
        &self,
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
        creds: &ComputeUserInfo,
    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
        let key: &str = creds.project().expect("impossible");
        let key: &str = &creds.endpoint;
        if let Some(allowed_ips) = self.caches.allowed_ips.get(key) {
            ALLOWED_IPS_BY_CACHE_OUTCOME
                .with_label_values(&["hit"])

@@ -193,9 +193,9 @@ impl super::Api for Api {
    async fn wake_compute(
        &self,
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
        creds: &ComputeUserInfo,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
        let key: &str = &creds.cache_key;
        let key: &str = &creds.inner.cache_key;

        // Every time we do a wakeup http request, the compute node will stay up
        // for some time (highly depends on the console's scale-to-zero policy);

@@ -2,7 +2,7 @@
mod tests;

use crate::{
    auth::{self, backend::AuthSuccess},
    auth,
    cancellation::{self, CancelMap},
    compute::{self, PostgresConnection},
    config::{AuthenticationConfig, ProxyConfig, TlsConfig},

@@ -24,7 +24,7 @@ use prometheus::{
    IntGaugeVec,
};
use regex::Regex;
use std::{error::Error, io, net::SocketAddr, ops::ControlFlow, sync::Arc, time::Instant};
use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
    time,

@@ -134,9 +134,9 @@ pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {

pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "semaphore_control_plane_token_acquire_seconds",
        "proxy_control_plane_token_acquire_seconds",
        "Time it took for proxy to establish a connection to the compute endpoint",
        // largest bucket = 3^16 * 0.00005ms = 2.15s
        // largest bucket = 3^16 * 0.05ms = 2.15s
        exponential_buckets(0.00005, 3.0, 16).unwrap(),
    )
    .unwrap()

@@ -277,6 +277,21 @@ static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    .unwrap()
});

pub async fn run_until_cancelled<F: std::future::Future>(
    f: F,
    cancellation_token: &CancellationToken,
) -> Option<F::Output> {
    match futures::future::select(
        std::pin::pin!(f),
        std::pin::pin!(cancellation_token.cancelled()),
    )
    .await
    {
        futures::future::Either::Left((f, _)) => Some(f),
        futures::future::Either::Right(((), _)) => None,
    }
}

pub async fn task_main(
    config: &'static ProxyConfig,
    listener: tokio::net::TcpListener,

@@ -290,71 +305,62 @@ pub async fn task_main(
    // will be inherited by all accepted client sockets.
    socket2::SockRef::from(&listener).set_keepalive(true)?;

    let mut connections = tokio::task::JoinSet::new();
    let connections = tokio_util::task::task_tracker::TaskTracker::new();
    let cancel_map = Arc::new(CancelMap::default());

    loop {
        tokio::select! {
            accept_result = listener.accept() => {
                let (socket, peer_addr) = accept_result?;
    while let Some(accept_result) =
        run_until_cancelled(listener.accept(), &cancellation_token).await
    {
        let (socket, peer_addr) = accept_result?;

                let session_id = uuid::Uuid::new_v4();
                let cancel_map = Arc::clone(&cancel_map);
                connections.spawn(
                    async move {
                        info!("accepted postgres client connection");
        let session_id = uuid::Uuid::new_v4();
        let cancel_map = Arc::clone(&cancel_map);
        connections.spawn(
            async move {
                info!("accepted postgres client connection");

                        let mut socket = WithClientIp::new(socket);
                        let mut peer_addr = peer_addr;
                        if let Some(ip) = socket.wait_for_addr().await? {
                            peer_addr = ip;
                            tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
                        } else if config.require_client_ip {
                            bail!("missing required client IP");
                        }

                        socket
                            .inner
                            .set_nodelay(true)
                            .context("failed to set socket option")?;

                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr).await
                    }
                    .instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty))
                    .unwrap_or_else(move |e| {
                        // Acknowledge that the task has finished with an error.
                        error!(?session_id, "per-client task finished with an error: {e:#}");
                    }),
                );
            }
            // Don't modify this unless you read https://docs.rs/tokio/latest/tokio/macro.select.html carefully.
            // If this future completes and the pattern doesn't match, this branch is disabled for this call to `select!`.
            // This only counts for this loop and it will be enabled again on next `select!`.
            //
            // Prior code had this as `Some(Err(e))` which _looks_ equivalent to the current setup, but it's not.
            // When `connections.join_next()` returned `Some(Ok(()))` (which we expect), it would disable the join_next and it would
            // not get called again, even if there are more connections to remove.
            Some(res) = connections.join_next() => {
                if let Err(e) = res {
                    if !e.is_panic() && !e.is_cancelled() {
                        warn!("unexpected error from joined connection task: {e:?}");
                    }
                }
                let mut socket = WithClientIp::new(socket);
                let mut peer_addr = peer_addr;
                if let Some(ip) = socket.wait_for_addr().await? {
                    peer_addr = ip;
                    tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
                } else if config.require_client_ip {
                    bail!("missing required client IP");
                }

                socket
                    .inner
                    .set_nodelay(true)
                    .context("failed to set socket option")?;

                handle_client(
                    config,
                    &cancel_map,
                    session_id,
                    socket,
                    ClientMode::Tcp,
                    peer_addr.ip(),
                )
                .await
            }
            _ = cancellation_token.cancelled() => {
                drop(listener);
                break;
            }
        }
            .instrument(info_span!(
                "handle_client",
                ?session_id,
                peer_addr = tracing::field::Empty
            ))
            .unwrap_or_else(move |e| {
                // Acknowledge that the task has finished with an error.
                error!(?session_id, "per-client task finished with an error: {e:#}");
            }),
        );
    }

    connections.close();
    drop(listener);

    // Drain connections
    while let Some(res) = connections.join_next().await {
        if let Err(e) = res {
            if !e.is_panic() && !e.is_cancelled() {
                warn!("unexpected error from joined connection task: {e:?}");
            }
        }
    }
    connections.wait().await;

    Ok(())
}

@@ -408,7 +414,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    session_id: uuid::Uuid,
    stream: S,
    mode: ClientMode,
    peer_addr: SocketAddr,
    peer_addr: IpAddr,
) -> anyhow::Result<()> {
    info!(
        protocol = mode.protocol_label(),

@@ -666,7 +672,7 @@ pub async fn connect_to_compute<M: ConnectMechanism>(
    mechanism: &M,
    mut node_info: console::CachedNodeInfo,
    extra: &console::ConsoleReqExtra<'_>,
    creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
    mut latency_timer: LatencyTimer,
) -> Result<M::Connection, M::Error>
where

@@ -696,10 +702,12 @@ where
    let node_info = loop {
        let wake_res = match creds {
            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
            #[cfg(feature = "testing")]
            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
            // nothing to do?
            auth::BackendType::Link(_) => return Err(err.into()),
            // test backend
            #[cfg(test)]
            auth::BackendType::Test(x) => x.wake_compute(),
        };

@@ -838,7 +846,6 @@ pub fn retry_after(num_retries: u32) -> time::Duration {
#[tracing::instrument(skip_all)]
async fn prepare_client_connection(
    node: &compute::PostgresConnection,
    reported_auth_ok: bool,
    session: cancellation::Session<'_>,
    stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> anyhow::Result<()> {

@@ -846,13 +853,6 @@ async fn prepare_client_connection(
    // The new token (cancel_key_data) will be sent to the client.
    let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone());

    // Report authentication success if we haven't done this already.
    // Note that we do this only (for the most part) after we've connected
    // to a compute (see above) which performs its own authentication.
    if !reported_auth_ok {
        stream.write_message_noflush(&Be::AuthenticationOk)?;
    }

    // Forward all postgres connection params to the client.
    // Right now the implementation is very hacky and inefficent (ideally,
    // we don't need an intermediate hashmap), but at least it should be correct.

@@ -921,7 +921,7 @@ struct Client<'a, S> {
    /// The underlying libpq protocol stream.
    stream: PqStream<Stream<S>>,
    /// Client credentials that we care about.
    creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
    creds: auth::BackendType<'a, auth::ClientCredentials>,
    /// KV-dictionary with PostgreSQL connection params.
    params: &'a StartupMessageParams,
    /// Unique connection ID.

@@ -934,7 +934,7 @@ impl<'a, S> Client<'a, S> {
    /// Construct a new connection context.
    fn new(
        stream: PqStream<Stream<S>>,
        creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
        creds: auth::BackendType<'a, auth::ClientCredentials>,
        params: &'a StartupMessageParams,
        session_id: uuid::Uuid,
        allow_self_signed_compute: bool,

@@ -953,7 +953,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    /// Let the client authenticate and connect to the designated compute node.
    // Instrumentation logs endpoint name everywhere. Doesn't work for link
    // auth; strictly speaking we don't know endpoint name in its case.
    #[tracing::instrument(name = "", fields(ep = self.creds.get_endpoint().unwrap_or("".to_owned())), skip_all)]
    #[tracing::instrument(name = "", fields(ep = %self.creds.get_endpoint().unwrap_or_default()), skip_all)]
    async fn connect_to_db(
        self,
        session: cancellation::Session<'_>,

@@ -962,22 +962,21 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    ) -> anyhow::Result<()> {
        let Self {
            mut stream,
            mut creds,
            creds,
            params,
            session_id,
            allow_self_signed_compute,
        } = self;

        let console_options = neon_options(params);

        let extra = console::ConsoleReqExtra {
            session_id, // aka this connection's id
            application_name: params.get("application_name"),
            options: console_options.as_deref(),
            options: neon_options(params),
        };

        let mut latency_timer = LatencyTimer::new(mode.protocol_label());

        let user = creds.get_user().to_owned();
        let auth_result = match creds
            .authenticate(
                &extra,

@@ -990,7 +989,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        {
            Ok(auth_result) => auth_result,
            Err(e) => {
                let user = creds.get_user();
                let db = params.get("database");
                let app = params.get("application_name");
                let params_span = tracing::info_span!("", ?user, ?db, ?app);

@@ -999,10 +997,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            }
        };

        let AuthSuccess {
            reported_auth_ok,
            value: mut node_info,
        } = auth_result;
        let (mut node_info, creds) = auth_result;

        node_info.allow_self_signed_compute = allow_self_signed_compute;

@@ -1025,7 +1020,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            NUM_DB_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[proto]).inc();
        }

        prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?;
        prepare_client_connection(&node, session, &mut stream).await?;
        // Before proxy passing, forward to compute whatever data is left in the
        // PqStream input buffer. Normally there is none, but our serverless npm
        // driver in pipeline mode sends startup, password and first query

@@ -1036,26 +1031,29 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
    }
}

pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
pub fn neon_options(params: &StartupMessageParams) -> Vec<(String, String)> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
        .filter(|opt| is_neon_param(opt))
        .sorted() // we sort it to use as cache key
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

    // Don't even bother with empty options.
    if options.is_empty() {
        return None;
    match params.options_raw() {
        Some(options) => options.filter_map(neon_option).collect(),
        None => vec![],
    }

    Some(options)
}

pub fn is_neon_param(bytes: &str) -> bool {
pub fn neon_options_str(params: &StartupMessageParams) -> String {
    #[allow(unstable_name_collisions)]
    neon_options(params)
        .iter()
        .map(|(k, v)| format!("{}:{}", k, v))
        .sorted() // we sort it to use as cache key
        .intersperse(" ".to_owned())
        .collect()
}

pub fn neon_option(bytes: &str) -> Option<(String, String)> {
    static RE: OnceCell<Regex> = OnceCell::new();
    RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
    let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap());

    RE.get().unwrap().is_match(bytes)
    let cap = re.captures(bytes)?;
    let (_, [k, v]) = cap.extract();
    Some((k.to_owned(), v.to_owned()))
}

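`neon_option` upgrades the old boolean `is_neon_param` check into an actual parser: the regex now captures both the key and the value of a `neon_<key>:<value>` startup option. A runnable sketch of the same function in isolation (needs regex 1.9+ for `Captures::extract`); the option strings in `main` are hypothetical:

    use once_cell::sync::OnceCell;
    use regex::Regex;

    /// Parse a `neon_<key>:<value>` startup option into a (key, value) pair.
    fn neon_option(bytes: &str) -> Option<(String, String)> {
        static RE: OnceCell<Regex> = OnceCell::new();
        let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap());

        let cap = re.captures(bytes)?;
        let (_, [k, v]) = cap.extract();
        Some((k.to_owned(), v.to_owned()))
    }

    fn main() {
        // Hypothetical option values, for illustration only.
        assert_eq!(
            neon_option("neon_endpoint_type:read_only"),
            Some(("endpoint_type".into(), "read_only".into()))
        );
        // Ordinary startup options are ignored.
        assert_eq!(neon_option("statement_timeout=0"), None);
    }
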
@@ -3,8 +3,7 @@
mod mitm;

use super::*;
use crate::auth::backend::TestBackend;
use crate::auth::ClientCredentials;
use crate::auth::backend::{ComputeUserInfo, TestBackend};
use crate::config::CertResolver;
use crate::console::{CachedNodeInfo, NodeInfo};
use crate::{auth, http, sasl, scram};

@@ -109,8 +108,9 @@ fn generate_tls_config<'a>(
trait TestAuth: Sized {
    async fn authenticate<S: AsyncRead + AsyncWrite + Unpin + Send>(
        self,
        _stream: &mut PqStream<Stream<S>>,
        stream: &mut PqStream<Stream<S>>,
    ) -> anyhow::Result<()> {
        stream.write_message_noflush(&Be::AuthenticationOk)?;
        Ok(())
    }
}

@@ -168,7 +168,6 @@ async fn dummy_proxy(
    auth.authenticate(&mut stream).await?;

    stream
        .write_message_noflush(&Be::AuthenticationOk)?
        .write_message_noflush(&Be::CLIENT_ENCODING)?
        .write_message(&Be::ReadyForQuery)
        .await?;

@@ -486,13 +485,13 @@ fn helper_create_connect_info(
) -> (
    CachedNodeInfo,
    console::ConsoleReqExtra<'static>,
    auth::BackendType<'_, ClientCredentials<'static>>,
    auth::BackendType<'_, ComputeUserInfo>,
) {
    let cache = helper_create_cached_node_info();
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some("TEST"),
        options: None,
        options: vec![],
    };
    let creds = auth::BackendType::Test(mechanism);
    (cache, extra, creds)

@@ -30,6 +30,9 @@ pub enum Error {
    #[error("Bad client message: {0}")]
    BadClientMessage(&'static str),

    #[error("Internal error: missing digest")]
    MissingBinding,

    #[error(transparent)]
    Io(#[from] io::Error),
}

@@ -38,8 +41,7 @@ impl UserFacingError for Error {
    fn to_string_client(&self) -> String {
        use Error::*;
        match self {
            // TODO: add support for channel binding
            ChannelBindingFailed(_) => "channel binding is not supported yet".to_string(),
            ChannelBindingFailed(m) => m.to_string(),
            ChannelBindingBadMethod(m) => format!("unsupported channel binding method {m}"),
            _ => "authentication protocol violation".to_string(),
        }

@@ -15,7 +15,7 @@ mod signature;
#[cfg(any(test, doc))]
mod password;

pub use exchange::Exchange;
pub use exchange::{exchange, Exchange};
pub use key::ScramKey;
pub use secret::ServerSecret;

@@ -1,5 +1,9 @@
//! Implementation of the SCRAM authentication algorithm.

use std::convert::Infallible;

use postgres_protocol::authentication::sasl::ScramSha256;

use super::messages::{
    ClientFinalMessage, ClientFirstMessage, OwnedServerFirstMessage, SCRAM_RAW_NONCE_LEN,
};

@@ -29,22 +33,27 @@ impl std::str::FromStr for TlsServerEndPoint {
    }
}

struct SaslSentInner {
    cbind_flag: ChannelBinding<TlsServerEndPoint>,
    client_first_message_bare: String,
    server_first_message: OwnedServerFirstMessage,
}

struct SaslInitial {
    nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
}

enum ExchangeState {
    /// Waiting for [`ClientFirstMessage`].
    Initial,
    Initial(SaslInitial),
    /// Waiting for [`ClientFinalMessage`].
    SaltSent {
        cbind_flag: ChannelBinding<TlsServerEndPoint>,
        client_first_message_bare: String,
        server_first_message: OwnedServerFirstMessage,
    },
    SaltSent(SaslSentInner),
}

/// Server's side of SCRAM auth algorithm.
pub struct Exchange<'a> {
    state: ExchangeState,
    secret: &'a ServerSecret,
    nonce: fn() -> [u8; SCRAM_RAW_NONCE_LEN],
    tls_server_end_point: config::TlsServerEndPoint,
}

@@ -55,90 +64,160 @@ impl<'a> Exchange<'a> {
        tls_server_end_point: config::TlsServerEndPoint,
    ) -> Self {
        Self {
            state: ExchangeState::Initial,
            state: ExchangeState::Initial(SaslInitial { nonce }),
            secret,
            nonce,
            tls_server_end_point,
        }
    }
}

pub fn exchange(
    secret: &ServerSecret,
    mut client: ScramSha256,
    tls_server_end_point: config::TlsServerEndPoint,
) -> sasl::Result<sasl::Outcome<super::ScramKey>> {
    use sasl::Step::*;

    let init = SaslInitial {
        nonce: rand::random,
    };

    let client_first = std::str::from_utf8(client.message())
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
    let sent = match init.transition(secret, &tls_server_end_point, client_first)? {
        Continue(sent, server_first) => {
            client.update(server_first.as_bytes())?;
            sent
        }
        Success(x, _) => match x {},
        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
    };

    let client_final = std::str::from_utf8(client.message())
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
    let keys = match sent.transition(secret, &tls_server_end_point, client_final)? {
        Success(keys, server_final) => {
            client.finish(server_final.as_bytes())?;
            keys
        }
        Continue(x, _) => match x {},
        Failure(msg) => return Ok(sasl::Outcome::Failure(msg)),
    };

    Ok(sasl::Outcome::Success(keys))
}

impl SaslInitial {
    fn transition(
        &self,
        secret: &ServerSecret,
        tls_server_end_point: &config::TlsServerEndPoint,
        input: &str,
    ) -> sasl::Result<sasl::Step<SaslSentInner, Infallible>> {
        let client_first_message = ClientFirstMessage::parse(input)
            .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;

        // If the flag is set to "y" and the server supports channel
        // binding, the server MUST fail authentication
        if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
            && tls_server_end_point.supported()
        {
            return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
        }

        let server_first_message = client_first_message.build_server_first_message(
            &(self.nonce)(),
            &secret.salt_base64,
            secret.iterations,
        );
        let msg = server_first_message.as_str().to_owned();

        let next = SaslSentInner {
            cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
            client_first_message_bare: client_first_message.bare.to_owned(),
            server_first_message,
        };

        Ok(sasl::Step::Continue(next, msg))
    }
}

impl SaslSentInner {
    fn transition(
        &self,
        secret: &ServerSecret,
        tls_server_end_point: &config::TlsServerEndPoint,
        input: &str,
    ) -> sasl::Result<sasl::Step<Infallible, super::ScramKey>> {
        let Self {
            cbind_flag,
            client_first_message_bare,
            server_first_message,
        } = self;

        let client_final_message = ClientFinalMessage::parse(input)
            .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;

        let channel_binding = cbind_flag.encode(|_| match tls_server_end_point {
            config::TlsServerEndPoint::Sha256(x) => Ok(x),
            config::TlsServerEndPoint::Undefined => Err(SaslError::MissingBinding),
        })?;

        // This might've been caused by a MITM attack
        if client_final_message.channel_binding != channel_binding {
            return Err(SaslError::ChannelBindingFailed(
                "insecure connection: secure channel data mismatch",
            ));
        }

        if client_final_message.nonce != server_first_message.nonce() {
            return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
        }

        let signature_builder = SignatureBuilder {
            client_first_message_bare,
            server_first_message: server_first_message.as_str(),
            client_final_message_without_proof: client_final_message.without_proof,
        };

        let client_key = signature_builder
            .build(&secret.stored_key)
            .derive_client_key(&client_final_message.proof);

        // Auth fails either if keys don't match or it's pre-determined to fail.
        if client_key.sha256() != secret.stored_key || secret.doomed {
            return Ok(sasl::Step::Failure("password doesn't match"));
        }

        let msg =
            client_final_message.build_server_final_message(signature_builder, &secret.server_key);

        Ok(sasl::Step::Success(client_key, msg))
    }
}

impl sasl::Mechanism for Exchange<'_> {
    type Output = super::ScramKey;

    fn exchange(mut self, input: &str) -> sasl::Result<sasl::Step<Self, Self::Output>> {
        use {sasl::Step::*, ExchangeState::*};
        match &self.state {
            Initial => {
                let client_first_message = ClientFirstMessage::parse(input)
                    .ok_or(SaslError::BadClientMessage("invalid client-first-message"))?;

                // If the flag is set to "y" and the server supports channel
                // binding, the server MUST fail authentication
                if client_first_message.cbind_flag == ChannelBinding::NotSupportedServer
                    && self.tls_server_end_point.supported()
                {
                    return Err(SaslError::ChannelBindingFailed("SCRAM-PLUS not used"));
                }

                let server_first_message = client_first_message.build_server_first_message(
                    &(self.nonce)(),
                    &self.secret.salt_base64,
                    self.secret.iterations,
                );
                let msg = server_first_message.as_str().to_owned();

                self.state = SaltSent {
                    cbind_flag: client_first_message.cbind_flag.and_then(str::parse)?,
                    client_first_message_bare: client_first_message.bare.to_owned(),
                    server_first_message,
                };

                Ok(Continue(self, msg))
            }
            SaltSent {
                cbind_flag,
                client_first_message_bare,
                server_first_message,
            } => {
                let client_final_message = ClientFinalMessage::parse(input)
                    .ok_or(SaslError::BadClientMessage("invalid client-final-message"))?;

                let channel_binding = cbind_flag.encode(|_| match &self.tls_server_end_point {
                    config::TlsServerEndPoint::Sha256(x) => Ok(x),
                    config::TlsServerEndPoint::Undefined => {
                        Err(SaslError::ChannelBindingFailed("no cert digest provided"))
            Initial(init) => {
                match init.transition(self.secret, &self.tls_server_end_point, input)? {
                    Continue(sent, msg) => {
                        self.state = SaltSent(sent);
                        Ok(Continue(self, msg))
                    }
                })?;

                // This might've been caused by a MITM attack
                if client_final_message.channel_binding != channel_binding {
                    return Err(SaslError::ChannelBindingFailed("data mismatch"));
                    Success(x, _) => match x {},
                    Failure(msg) => Ok(Failure(msg)),
                }

                if client_final_message.nonce != server_first_message.nonce() {
                    return Err(SaslError::BadClientMessage("combined nonce doesn't match"));
                }
            SaltSent(sent) => {
                match sent.transition(self.secret, &self.tls_server_end_point, input)? {
                    Success(keys, msg) => Ok(Success(keys, msg)),
                    Continue(x, _) => match x {},
                    Failure(msg) => Ok(Failure(msg)),
                }

                let signature_builder = SignatureBuilder {
                    client_first_message_bare,
                    server_first_message: server_first_message.as_str(),
                    client_final_message_without_proof: client_final_message.without_proof,
                };

                let client_key = signature_builder
                    .build(&self.secret.stored_key)
                    .derive_client_key(&client_final_message.proof);

                // Auth fails either if keys don't match or it's pre-determined to fail.
                if client_key.sha256() != self.secret.stored_key || self.secret.doomed {
                    return Ok(Failure("password doesn't match"));
                }

                let msg = client_final_message
                    .build_server_final_message(signature_builder, &self.secret.server_key);

                Ok(Success(client_key, msg))
            }
        }
    }
}

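One detail worth calling out in the refactored state machine: each `transition` pins down the impossible outcome in its return type. `SaslInitial::transition` returns `Step<SaslSentInner, Infallible>`, so its `Success` arm is dismissed with `match x {}`, which the compiler accepts as statically unreachable. A self-contained sketch of that pattern, with names invented for illustration:

    use std::convert::Infallible;

    /// A two-outcome step where the type system can rule one side out.
    enum Step<C, S> {
        Continue(C),
        Success(S),
    }

    struct Sent;

    /// The first step can only continue: `Success` is uninhabited.
    fn first_step() -> Step<Sent, Infallible> {
        Step::Continue(Sent)
    }

    fn main() {
        let _sent = match first_step() {
            Step::Continue(s) => s,
            // `Infallible` has no values, so an empty match proves this
            // arm can never run; no panic or unreachable!() is needed.
            Step::Success(x) => match x {},
        };
        println!("advanced to the next state");
    }
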
@@ -10,6 +10,7 @@ use anyhow::bail;
use hyper::StatusCode;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio_util::task::TaskTracker;

use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER};

@@ -23,7 +24,7 @@ use hyper::{
    Body, Method, Request, Response,
};

use std::net::SocketAddr;
use std::net::IpAddr;
use std::task::Poll;
use std::{future::ready, sync::Arc};
use tls_listener::TlsListener;

@@ -70,6 +71,9 @@ pub async fn task_main(
        incoming: addr_incoming,
    };

    let ws_connections = tokio_util::task::task_tracker::TaskTracker::new();
    ws_connections.close(); // allows `ws_connections.wait` to complete

    let tls_listener = TlsListener::new(tls_acceptor, addr_incoming).filter(|conn| {
        if let Err(err) = conn {
            error!("failed to accept TLS connection for websockets: {err:?}");

@@ -86,6 +90,7 @@ pub async fn task_main(
        let remote_addr = io.inner.remote_addr();
        let sni_name = tls.server_name().map(|s| s.to_string());
        let conn_pool = conn_pool.clone();
        let ws_connections = ws_connections.clone();

        async move {
            let peer_addr = match client_addr {

@@ -97,13 +102,21 @@ pub async fn task_main(
            move |req: Request<Body>| {
                let sni_name = sni_name.clone();
                let conn_pool = conn_pool.clone();
                let ws_connections = ws_connections.clone();

                async move {
                    let cancel_map = Arc::new(CancelMap::default());
                    let session_id = uuid::Uuid::new_v4();

                    request_handler(
                        req, config, conn_pool, cancel_map, session_id, sni_name, peer_addr,
                        req,
                        config,
                        conn_pool,
                        ws_connections,
                        cancel_map,
                        session_id,
                        sni_name,
                        peer_addr.ip(),
                    )
                    .instrument(info_span!(
                        "serverless",

@@ -123,6 +136,9 @@ pub async fn task_main(
        .with_graceful_shutdown(cancellation_token.cancelled())
        .await?;

    // await websocket connections
    ws_connections.wait().await;

    Ok(())
}

@@ -164,14 +180,16 @@ where
    }
}

#[allow(clippy::too_many_arguments)]
async fn request_handler(
    mut request: Request<Body>,
    config: &'static ProxyConfig,
    conn_pool: Arc<conn_pool::GlobalConnPool>,
    ws_connections: TaskTracker,
    cancel_map: Arc<CancelMap>,
    session_id: uuid::Uuid,
    sni_hostname: Option<String>,
    peer_addr: SocketAddr,
    peer_addr: IpAddr,
) -> Result<Response<Body>, ApiError> {
    let host = request
        .headers()

@@ -187,7 +205,7 @@ async fn request_handler(
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

        tokio::spawn(
        ws_connections.spawn(
            async move {
                if let Err(e) = websocket::serve_websocket(
                    websocket,

@@ -1,4 +1,4 @@
use anyhow::Context;
use anyhow::{anyhow, Context};
use async_trait::async_trait;
use dashmap::DashMap;
use futures::future::poll_fn;

@@ -9,7 +9,7 @@ use pbkdf2::{
};
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::{collections::HashMap, net::SocketAddr, sync::Arc};
use std::{collections::HashMap, net::IpAddr, sync::Arc};
use std::{
    fmt,
    task::{ready, Poll},

@@ -22,7 +22,7 @@ use tokio::time;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

use crate::{
    auth::{self, check_peer_addr_is_in_list},
    auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
    console,
    proxy::{
        neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,

@@ -146,7 +146,7 @@ impl GlobalConnPool {
        conn_info: &ConnInfo,
        force_new: bool,
        session_id: uuid::Uuid,
        peer_addr: SocketAddr,
        peer_addr: IpAddr,
    ) -> anyhow::Result<Client> {
        let mut client: Option<ClientInner> = None;
        let mut latency_timer = LatencyTimer::new("http");

@@ -406,7 +406,7 @@ async fn connect_to_compute(
    conn_id: uuid::Uuid,
    session_id: uuid::Uuid,
    latency_timer: LatencyTimer,
    peer_addr: SocketAddr,
    peer_addr: IpAddr,
) -> anyhow::Result<ClientInner> {
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

@@ -423,6 +423,9 @@ async fn connect_to_compute(
        common_names,
        peer_addr,
    )?;

    let creds =
        ComputeUserInfo::try_from(creds).map_err(|_| anyhow!("missing endpoint identifier"))?;
    let backend = config.auth_backend.as_ref().map(|_| creds);

    let console_options = neon_options(&params);

@@ -430,12 +433,12 @@ async fn connect_to_compute(
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some(APP_NAME),
        options: console_options.as_deref(),
        options: console_options,
    };
    // TODO(anna): this is a bit hacky way, consider using console notification listener.
    if !config.disable_ip_check_for_http {
        let allowed_ips = backend.get_allowed_ips(&extra).await?;
        if !check_peer_addr_is_in_list(&peer_addr.ip(), &allowed_ips) {
        if !check_peer_addr_is_in_list(&peer_addr, &allowed_ips) {
            return Err(auth::AuthError::ip_address_not_allowed().into());
        }
    }

@@ -1,4 +1,4 @@
use std::net::SocketAddr;
use std::net::IpAddr;
use std::sync::Arc;

use anyhow::bail;

@@ -202,7 +202,7 @@ pub async fn handle(
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
    session_id: uuid::Uuid,
    peer_addr: SocketAddr,
    peer_addr: IpAddr,
    config: &'static HttpConfig,
) -> Result<Response<Body>, ApiError> {
    let result = tokio::time::timeout(

@@ -301,7 +301,7 @@ async fn handle_inner(
    sni_hostname: Option<String>,
    conn_pool: Arc<GlobalConnPool>,
    session_id: uuid::Uuid,
    peer_addr: SocketAddr,
    peer_addr: IpAddr,
) -> anyhow::Result<Response<Body>> {
    NUM_CONNECTIONS_ACCEPTED_COUNTER
        .with_label_values(&["http"])

@@ -11,7 +11,7 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;

use std::{
    net::SocketAddr,
    net::IpAddr,
    pin::Pin,
    task::{ready, Context, Poll},
};

@@ -133,7 +133,7 @@ pub async fn serve_websocket(
    cancel_map: &CancelMap,
    session_id: uuid::Uuid,
    hostname: Option<String>,
    peer_addr: SocketAddr,
    peer_addr: IpAddr,
) -> anyhow::Result<()> {
    let websocket = websocket.await?;
    handle_client(

@@ -12,8 +12,8 @@ typing-extensions = "^4.6.1"
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
requests = "^2.31.0"
pytest-xdist = "^3.3.1"
asyncpg = "^0.27.0"
aiopg = "^1.3.1"
asyncpg = "^0.29.0"
aiopg = "^1.4.0"
Jinja2 = "^3.0.2"
types-requests = "^2.31.0.0"
types-psycopg2 = "^2.9.21.10"

@@ -35,6 +35,7 @@ serde_with.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["fs"] }
tokio-util = { workspace = true }
tokio-io-timeout.workspace = true
tokio-postgres.workspace = true
toml_edit.workspace = true

@@ -914,9 +914,14 @@ where
        Ok(())
    }

    /// Persist control file to disk, called only after timeline creation (bootstrap).
    pub async fn persist(&mut self) -> Result<()> {
        self.persist_control_file(self.state.clone()).await
    /// Persist in-memory state of control file to disk.
    //
    // TODO: passing inmem_remote_consistent_lsn everywhere is ugly, better
    // separate state completely and give Arc to all those who need it.
    pub async fn persist_inmem(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
        let mut state = self.state.clone();
        state.remote_consistent_lsn = inmem_remote_consistent_lsn;
        self.persist_control_file(state).await
    }

    /// Persist in-memory state to the disk, taking other data from state.

@@ -930,7 +935,7 @@ where

    /// Persist control file if there is something to save and enough time
    /// passed after the last save.
    pub async fn maybe_persist_control_file(
    pub async fn maybe_persist_inmem_control_file(
        &mut self,
        inmem_remote_consistent_lsn: Lsn,
    ) -> Result<()> {

@@ -943,9 +948,7 @@ where
            || self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
            || inmem_remote_consistent_lsn > self.state.remote_consistent_lsn;
        if need_persist {
            let mut state = self.state.clone();
            state.remote_consistent_lsn = inmem_remote_consistent_lsn;
            self.persist_control_file(state).await?;
            self.persist_inmem(inmem_remote_consistent_lsn).await?;
            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
        }
        Ok(())

@@ -1064,8 +1067,6 @@ where

        if sync_control_file {
            let mut state = self.state.clone();
            // Note: we could make remote_consistent_lsn update in cf common by
            // storing Arc to walsenders in Safekeeper.
            state.remote_consistent_lsn = new_remote_consistent_lsn;
            self.persist_control_file(state).await?;
        }

@@ -182,8 +182,9 @@ impl SharedState {
    }

    /// Mark timeline active/inactive and return whether s3 offloading requires
    /// start/stop action.
    fn update_status(
    /// start/stop action. If timeline is deactivated, control file is persisted
    /// as maintenance task does that only for active timelines.
    async fn update_status(
        &mut self,
        num_computes: usize,
        remote_consistent_lsn: Lsn,

@@ -191,7 +192,15 @@ impl SharedState {
    ) -> bool {
        let is_active = self.is_active(num_computes, remote_consistent_lsn);
        if self.active != is_active {
            info!("timeline {} active={} now", ttid, is_active);
            info!(
                "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
                ttid, is_active, remote_consistent_lsn, self.sk.inmem.commit_lsn
            );
            if !is_active {
                if let Err(e) = self.sk.persist_inmem(remote_consistent_lsn).await {
                    warn!("control file save in update_status failed: {:?}", e);
                }
            }
        }
        self.active = is_active;
        self.is_wal_backup_action_pending(num_computes)

@@ -438,7 +447,7 @@ impl Timeline {
        fs::create_dir_all(&self.timeline_dir).await?;

        // Write timeline to disk and start background tasks.
        if let Err(e) = shared_state.sk.persist().await {
        if let Err(e) = shared_state.sk.persist_inmem(Lsn::INVALID).await {
            // Bootstrap failed, cancel timeline and remove timeline directory.
            self.cancel(shared_state);

@@ -511,12 +520,14 @@ impl Timeline {
        self.mutex.lock().await
    }

    fn update_status(&self, shared_state: &mut SharedState) -> bool {
        shared_state.update_status(
            self.walreceivers.get_num(),
            self.get_walsenders().get_remote_consistent_lsn(),
            self.ttid,
        )
    async fn update_status(&self, shared_state: &mut SharedState) -> bool {
        shared_state
            .update_status(
                self.walreceivers.get_num(),
                self.get_walsenders().get_remote_consistent_lsn(),
                self.ttid,
            )
            .await
    }

    /// Update timeline status and kick wal backup launcher to stop/start offloading if needed.

@@ -526,7 +537,7 @@ impl Timeline {
        }
        let is_wal_backup_action_pending: bool = {
            let mut shared_state = self.write_shared_state().await;
            self.update_status(&mut shared_state)
            self.update_status(&mut shared_state).await
        };
        if is_wal_backup_action_pending {
            // Can fail only if channel to a static thread got closed, which is not normal at all.

@@ -683,7 +694,7 @@ impl Timeline {
            shared_state.sk.record_safekeeper_info(&sk_info).await?;
            let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
            shared_state.peers_info.upsert(&peer_info);
            is_wal_backup_action_pending = self.update_status(&mut shared_state);
            is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
            commit_lsn = shared_state.sk.inmem.commit_lsn;
        }
        self.commit_lsn_watch_tx.send(commit_lsn)?;

@@ -828,7 +839,7 @@ impl Timeline {
        self.write_shared_state()
            .await
            .sk
            .maybe_persist_control_file(remote_consistent_lsn)
            .maybe_persist_inmem_control_file(remote_consistent_lsn)
            .await
    }

@@ -35,6 +35,9 @@ use once_cell::sync::OnceCell;
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;

/// Default buffer size when interfacing with [`tokio::fs::File`].
const BUFFER_SIZE: usize = 32 * 1024;

/// Check whether wal backup is required for timeline. If yes, mark that launcher is
/// aware of current status and return the timeline.
async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {

@@ -494,15 +497,13 @@ async fn backup_object(
        .as_ref()
        .unwrap();

    let file = tokio::io::BufReader::new(
        File::open(&source_file)
            .await
            .with_context(|| format!("Failed to open file {} for wal backup", source_file))?,
    );

    storage
        .upload_storage_object(Box::new(file), size, target_file)
    let file = File::open(&source_file)
        .await
        .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?;

    let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE);

    storage.upload_storage_object(file, size, target_file).await
}

pub async fn read_object(

@@ -524,5 +525,9 @@ pub async fn read_object(
        format!("Failed to open WAL segment download stream for remote path {file_path:?}")
    })?;

    Ok(download.download_stream)
    let reader = tokio_util::io::StreamReader::new(download.download_stream);

    let reader = tokio::io::BufReader::with_capacity(BUFFER_SIZE, reader);

    Ok(Box::pin(reader))
}
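The WAL backup change pairs two adapters from tokio-util: `ReaderStream` turns an `AsyncRead` (the segment file) into a byte stream for upload, and `StreamReader` turns a download stream back into an `AsyncRead`, each buffered at the new 32 KiB `BUFFER_SIZE`. A runnable round-trip sketch of the same adapters; the file path is arbitrary and the example stays in-process rather than talking to object storage:

    use tokio::io::AsyncReadExt;
    use tokio_util::io::{ReaderStream, StreamReader};

    const BUFFER_SIZE: usize = 32 * 1024;

    #[tokio::main]
    async fn main() -> std::io::Result<()> {
        let file = tokio::fs::File::open("Cargo.toml").await?;

        // Upload direction: AsyncRead -> Stream<Item = io::Result<Bytes>>,
        // pulling 32 KiB per read instead of the stream's small default.
        let stream = ReaderStream::with_capacity(file, BUFFER_SIZE);

        // Download direction: Stream -> AsyncRead, re-buffered for
        // consumers that issue many small reads.
        let reader = StreamReader::new(stream);
        let mut reader = tokio::io::BufReader::with_capacity(BUFFER_SIZE, reader);

        let mut buf = Vec::new();
        reader.read_to_end(&mut buf).await?;
        println!("round-tripped {} bytes", buf.len());
        Ok(())
    }
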
@@ -266,9 +266,7 @@ class NeonPageserverHttpClient(requests.Session):
    def tenant_create(self, new_tenant_id: uuid.UUID, ok_if_exists):
        res = self.post(
            f"http://{self.host}:{self.port}/v1/tenant",
            json={
                "new_tenant_id": new_tenant_id.hex,
            },
            json={"new_tenant_id": new_tenant_id.hex, "generation": 1},
        )

        if res.status_code == 409: