mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-13 03:00:37 +00:00
Compare commits
189 Commits
wp-neon-wa
...
release-40
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
49377abd98 | ||
|
|
a6b2f4e54e | ||
|
|
face60d50b | ||
|
|
9768aa27f2 | ||
|
|
96b2e575e1 | ||
|
|
7222777784 | ||
|
|
5469fdede0 | ||
|
|
72aa6b9fdd | ||
|
|
ae0634b7be | ||
|
|
70711f32fa | ||
|
|
52a88af0aa | ||
|
|
b7a43bf817 | ||
|
|
dce91b33a4 | ||
|
|
23ee4f3050 | ||
|
|
46857e8282 | ||
|
|
368ab0ce54 | ||
|
|
a5987eebfd | ||
|
|
6686ede30f | ||
|
|
373c7057cc | ||
|
|
7d6ec16166 | ||
|
|
0e6fdc8a58 | ||
|
|
521438a5c6 | ||
|
|
07d7874bc8 | ||
|
|
1804111a02 | ||
|
|
cd0178efed | ||
|
|
333574be57 | ||
|
|
79a799a143 | ||
|
|
9da06af6c9 | ||
|
|
ce1753d036 | ||
|
|
67db8432b4 | ||
|
|
4e2e44e524 | ||
|
|
ed786104f3 | ||
|
|
84b74f2bd1 | ||
|
|
fec2ad6283 | ||
|
|
98eebd4682 | ||
|
|
2f74287c9b | ||
|
|
aee1bf95e3 | ||
|
|
b9de9d75ff | ||
|
|
7943b709e6 | ||
|
|
d7d066d493 | ||
|
|
e78ac22107 | ||
|
|
76a8f2bb44 | ||
|
|
8d59a8581f | ||
|
|
b1ddd01289 | ||
|
|
6eae4fc9aa | ||
|
|
765455bca2 | ||
|
|
4204960942 | ||
|
|
67345d66ea | ||
|
|
2266ee5971 | ||
|
|
b58445d855 | ||
|
|
36050e7f3d | ||
|
|
33360ed96d | ||
|
|
39a28d1108 | ||
|
|
efa6aa134f | ||
|
|
2c724e56e2 | ||
|
|
feff887c6f | ||
|
|
353d915fcf | ||
|
|
2e38098cbc | ||
|
|
a6fe5ea1ac | ||
|
|
05b0aed0c1 | ||
|
|
cd1705357d | ||
|
|
6bc7561290 | ||
|
|
fbd3ac14b5 | ||
|
|
e437787c8f | ||
|
|
3460dbf90b | ||
|
|
6b89d99677 | ||
|
|
6cc8ea86e4 | ||
|
|
e62a492d6f | ||
|
|
a475cdf642 | ||
|
|
7002c79a47 | ||
|
|
ee6cf357b4 | ||
|
|
e5c2086b5f | ||
|
|
5f1208296a | ||
|
|
88e8e473cd | ||
|
|
b0a77844f6 | ||
|
|
1baf464307 | ||
|
|
e9b8e81cea | ||
|
|
85d6194aa4 | ||
|
|
333a7a68ef | ||
|
|
6aa4e41bee | ||
|
|
840183e51f | ||
|
|
cbccc94b03 | ||
|
|
fce227df22 | ||
|
|
bd787e800f | ||
|
|
4a7704b4a3 | ||
|
|
ff1119da66 | ||
|
|
4c3ba1627b | ||
|
|
1407174fb2 | ||
|
|
ec9dcb1889 | ||
|
|
d11d781afc | ||
|
|
4e44565b71 | ||
|
|
4ed51ad33b | ||
|
|
1c1ebe5537 | ||
|
|
c19cb7f386 | ||
|
|
4b97d31b16 | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
5
.github/workflows/build_and_test.yml
vendored
5
.github/workflows/build_and_test.yml
vendored
@@ -320,9 +320,6 @@ jobs:
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
@@ -837,7 +834,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.18.1
|
||||
VM_BUILDER_VERSION: v0.17.12
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
|
||||
18
.github/workflows/neon_extra_builds.yml
vendored
18
.github/workflows/neon_extra_builds.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -90,21 +90,18 @@ jobs:
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: make postgres-v14 -j$(sysctl -n hw.ncpu)
|
||||
run: make postgres-v14 -j$(nproc)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: make postgres-v15 -j$(sysctl -n hw.ncpu)
|
||||
run: make postgres-v15 -j$(nproc)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
|
||||
run: make postgres-v16 -j$(nproc)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
|
||||
run: make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: cargo build --all --release
|
||||
@@ -129,7 +126,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -138,9 +135,6 @@ jobs:
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Produce the build stats
|
||||
run: cargo build --all --release --timings
|
||||
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -18,6 +18,3 @@ test_output/
|
||||
*.o
|
||||
*.so
|
||||
*.Po
|
||||
|
||||
# pgindent typedef lists
|
||||
*.list
|
||||
|
||||
103
Cargo.lock
generated
103
Cargo.lock
generated
@@ -285,7 +285,7 @@ dependencies = [
|
||||
"log",
|
||||
"parking",
|
||||
"polling",
|
||||
"rustix 0.37.25",
|
||||
"rustix 0.37.19",
|
||||
"slab",
|
||||
"socket2 0.4.9",
|
||||
"waker-fn",
|
||||
@@ -853,27 +853,6 @@ dependencies = [
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "azure_identity"
|
||||
version = "0.16.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9"
|
||||
dependencies = [
|
||||
"async-lock",
|
||||
"async-trait",
|
||||
"azure_core",
|
||||
"futures",
|
||||
"log",
|
||||
"oauth2",
|
||||
"pin-project",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"time 0.3.21",
|
||||
"tz-rs",
|
||||
"url",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "azure_storage"
|
||||
version = "0.16.0"
|
||||
@@ -1152,11 +1131,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
|
||||
dependencies = [
|
||||
"iana-time-zone",
|
||||
"js-sys",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"serde",
|
||||
"wasm-bindgen",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
@@ -2582,7 +2559,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"io-lifetimes",
|
||||
"rustix 0.37.25",
|
||||
"rustix 0.37.19",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
@@ -2972,34 +2949,6 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_threads"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "oauth2"
|
||||
version = "4.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f"
|
||||
dependencies = [
|
||||
"base64 0.13.1",
|
||||
"chrono",
|
||||
"getrandom 0.2.9",
|
||||
"http",
|
||||
"rand 0.8.5",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_path_to_error",
|
||||
"sha2 0.10.6",
|
||||
"thiserror",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.30.3"
|
||||
@@ -3561,7 +3510,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3574,7 +3523,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -3585,7 +3534,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -3603,7 +3552,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3734,7 +3683,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"rustix 0.36.16",
|
||||
"rustix 0.36.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4072,7 +4021,6 @@ dependencies = [
|
||||
"aws-smithy-http",
|
||||
"aws-types",
|
||||
"azure_core",
|
||||
"azure_identity",
|
||||
"azure_storage",
|
||||
"azure_storage_blobs",
|
||||
"bytes",
|
||||
@@ -4317,9 +4265,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.36.16"
|
||||
version = "0.36.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6da3636faa25820d8648e0e31c5d519bbb01f72fdf57131f0f5f7da5fed36eab"
|
||||
checksum = "14e4d67015953998ad0eb82887a0eb0129e18a7e2f3b7b0f6c422fddcd503d62"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
@@ -4331,9 +4279,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.37.25"
|
||||
version = "0.37.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d4eb579851244c2c03e7c24f501c3432bed80b8f720af1d6e5b0e0f01555a035"
|
||||
checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
@@ -5174,7 +5122,7 @@ dependencies = [
|
||||
"cfg-if",
|
||||
"fastrand 1.9.0",
|
||||
"redox_syscall 0.3.5",
|
||||
"rustix 0.37.25",
|
||||
"rustix 0.37.19",
|
||||
"windows-sys 0.45.0",
|
||||
]
|
||||
|
||||
@@ -5271,8 +5219,6 @@ checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"num_threads",
|
||||
"serde",
|
||||
"time-core",
|
||||
"time-macros 0.2.9",
|
||||
@@ -5407,7 +5353,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -5422,7 +5368,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"socket2 0.5.3",
|
||||
"socket2 0.4.9",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
@@ -5824,15 +5770,6 @@ version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
|
||||
|
||||
[[package]]
|
||||
name = "tz-rs"
|
||||
version = "0.6.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4"
|
||||
dependencies = [
|
||||
"const_fn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "uname"
|
||||
version = "0.1.1"
|
||||
@@ -6092,17 +6029,6 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "walproposer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bindgen",
|
||||
"postgres_ffi",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "want"
|
||||
version = "0.3.0"
|
||||
@@ -6508,6 +6434,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"smallvec",
|
||||
"socket2 0.4.9",
|
||||
"standback",
|
||||
"syn 1.0.109",
|
||||
"syn 2.0.28",
|
||||
|
||||
19
Cargo.toml
19
Cargo.toml
@@ -26,7 +26,6 @@ members = [
|
||||
"libs/tracing-utils",
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
"libs/vm_monitor",
|
||||
"libs/walproposer",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -37,10 +36,9 @@ license = "Apache-2.0"
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||
azure_core = "0.16"
|
||||
azure_identity = "0.16"
|
||||
azure_core = "0.16.0"
|
||||
azure_storage = "0.16"
|
||||
azure_storage_blobs = "0.16"
|
||||
azure_storage_blobs = "0.16.0"
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
@@ -161,11 +159,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -186,7 +184,6 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
|
||||
walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
@@ -202,7 +199,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
76
Makefile
76
Makefile
@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: neon
|
||||
neon: postgres-headers walproposer-lib
|
||||
neon: postgres-headers
|
||||
+@echo "Compiling Neon"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||
|
||||
@@ -168,42 +168,6 @@ neon-pg-ext-clean-%:
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
|
||||
|
||||
# Build walproposer as a static library. walproposer source code is located
|
||||
# in the pgxn/neon directory.
|
||||
#
|
||||
# We also need to include libpgport.a and libpgcommon.a, because walproposer
|
||||
# uses some functions from those libraries.
|
||||
#
|
||||
# Some object files are removed from libpgport.a and libpgcommon.a because
|
||||
# they depend on openssl and other libraries that are not included in our
|
||||
# Rust build.
|
||||
.PHONY: walproposer-lib
|
||||
walproposer-lib: neon-pg-ext-v16
|
||||
+@echo "Compiling walproposer-lib"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
|
||||
pg_strong_random.o
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
|
||||
pg_crc32c.o \
|
||||
hmac_openssl.o \
|
||||
cryptohash_openssl.o \
|
||||
scram-common.o \
|
||||
md5_common.o \
|
||||
checksum_helper.o
|
||||
endif
|
||||
|
||||
.PHONY: walproposer-lib-clean
|
||||
walproposer-lib-clean:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
neon-pg-ext-v14 \
|
||||
@@ -256,44 +220,6 @@ distclean:
|
||||
fmt:
|
||||
./pre-commit.py --fix-inplace
|
||||
|
||||
postgres-%-pg-bsd-indent: postgres-%
|
||||
+@echo "Compiling pg_bsd_indent"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
|
||||
|
||||
# Create typedef list for the core. Note that generally it should be combined with
|
||||
# buildfarm one to cover platform specific stuff.
|
||||
# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
|
||||
postgres-%-typedefs.list: postgres-%
|
||||
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
|
||||
|
||||
# Indent postgres. See src/tools/pgindent/README for details.
|
||||
.PHONY: postgres-%-pgindent
|
||||
postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
|
||||
+@echo merge with buildfarm typedef to cover all platforms
|
||||
+@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
|
||||
REL_16_STABLE list misses PGSemaphoreData
|
||||
# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
|
||||
# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
|
||||
cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
|
||||
cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
|
||||
+@echo note: you might want to run it on selected files/dirs instead.
|
||||
INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
|
||||
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
|
||||
$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
|
||||
--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
|
||||
rm -f pg*.BAK
|
||||
|
||||
# Indent pxgn/neon.
|
||||
.PHONY: pgindent
|
||||
neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
|
||||
INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
|
||||
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
|
||||
|
||||
|
||||
.PHONY: setup-pre-commit-hook
|
||||
setup-pre-commit-hook:
|
||||
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
|
||||
|
||||
4
NOTICE
4
NOTICE
@@ -1,5 +1,5 @@
|
||||
Neon
|
||||
Copyright 2022 Neon Inc.
|
||||
|
||||
The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
|
||||
See vendor/postgres-vX/COPYRIGHT for details.
|
||||
The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the
|
||||
PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT.
|
||||
|
||||
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
IF NOT EXISTS (
|
||||
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
IF array_length(roles, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT neon_superuser TO %s',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
|
||||
|
||||
@@ -302,7 +302,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
}
|
||||
RoleAction::Create => {
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("role create query: '{}'", &query);
|
||||
|
||||
@@ -28,7 +28,7 @@ mod pg_helpers_tests {
|
||||
assert_eq!(
|
||||
spec.cluster.settings.as_pg_settings(),
|
||||
r#"fsync = off
|
||||
wal_level = logical
|
||||
wal_level = replica
|
||||
hot_standby = on
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
wal_log_hints = on
|
||||
|
||||
@@ -253,7 +253,7 @@ impl Endpoint {
|
||||
conf.append("shared_buffers", "1MB");
|
||||
conf.append("fsync", "off");
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_level", "logical");
|
||||
conf.append("wal_level", "replica");
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreciever will send a feedback message to the wal sender.
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "logical",
|
||||
"value": "replica",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -188,60 +188,11 @@ that.
|
||||
|
||||
## Error message style
|
||||
|
||||
### PostgreSQL extensions
|
||||
|
||||
PostgreSQL has a style guide for writing error messages:
|
||||
|
||||
https://www.postgresql.org/docs/current/error-style-guide.html
|
||||
|
||||
Follow that guide when writing error messages in the PostgreSQL
|
||||
extensions.
|
||||
|
||||
### Neon Rust code
|
||||
|
||||
#### Anyhow Context
|
||||
|
||||
When adding anyhow `context()`, use form `present-tense-verb+action`.
|
||||
|
||||
Example:
|
||||
- Bad: `file.metadata().context("could not get file metadata")?;`
|
||||
- Good: `file.metadata().context("get file metadata")?;`
|
||||
|
||||
#### Logging Errors
|
||||
|
||||
When logging any error `e`, use `could not {e:#}` or `failed to {e:#}`.
|
||||
|
||||
If `e` is an `anyhow` error and you want to log the backtrace that it contains,
|
||||
use `{e:?}` instead of `{e:#}`.
|
||||
|
||||
#### Rationale
|
||||
|
||||
The `{:#}` ("alternate Display") of an `anyhow` error chain is concatenation fo the contexts, using `: `.
|
||||
|
||||
For example, the following Rust code will result in output
|
||||
```
|
||||
ERROR failed to list users: load users from server: parse response: invalid json
|
||||
```
|
||||
|
||||
This is more concise / less noisy than what happens if you do `.context("could not ...")?` at each level, i.e.:
|
||||
|
||||
```
|
||||
ERROR could not list users: could not load users from server: could not parse response: invalid json
|
||||
```
|
||||
|
||||
|
||||
```rust
|
||||
fn main() {
|
||||
match list_users().context("list users") else {
|
||||
Ok(_) => ...,
|
||||
Err(e) => tracing::error!("failed to {e:#}"),
|
||||
}
|
||||
}
|
||||
fn list_users() {
|
||||
http_get_users().context("load users from server")?;
|
||||
}
|
||||
fn http_get_users() {
|
||||
let response = client....?;
|
||||
response.parse().context("parse response")?; // fails with serde error "invalid json"
|
||||
}
|
||||
```
|
||||
extension. We don't follow it strictly in the pageserver and
|
||||
safekeeper, but the advice in the PostgreSQL style guide is generally
|
||||
good, and you can't go wrong by following it.
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "logical",
|
||||
"value": "replica",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::io::{Read, Result, Write};
|
||||
|
||||
/// A wrapper for an object implementing [Read]
|
||||
/// A wrapper for an object implementing [Read](std::io::Read)
|
||||
/// which allows a closure to observe the amount of bytes read.
|
||||
/// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
|
||||
///
|
||||
@@ -51,17 +51,17 @@ impl<'a, T> CountedReader<'a, T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get an immutable reference to the underlying [Read] implementor
|
||||
/// Get an immutable reference to the underlying [Read](std::io::Read) implementor
|
||||
pub fn inner(&self) -> &T {
|
||||
&self.reader
|
||||
}
|
||||
|
||||
/// Get a mutable reference to the underlying [Read] implementor
|
||||
/// Get a mutable reference to the underlying [Read](std::io::Read) implementor
|
||||
pub fn inner_mut(&mut self) -> &mut T {
|
||||
&mut self.reader
|
||||
}
|
||||
|
||||
/// Consume the wrapper and return the underlying [Read] implementor
|
||||
/// Consume the wrapper and return the underlying [Read](std::io::Read) implementor
|
||||
pub fn into_inner(self) -> T {
|
||||
self.reader
|
||||
}
|
||||
@@ -75,7 +75,7 @@ impl<T: Read> Read for CountedReader<'_, T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper for an object implementing [Write]
|
||||
/// A wrapper for an object implementing [Write](std::io::Write)
|
||||
/// which allows a closure to observe the amount of bytes written.
|
||||
/// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
|
||||
///
|
||||
@@ -122,17 +122,17 @@ impl<'a, T> CountedWriter<'a, T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get an immutable reference to the underlying [Write] implementor
|
||||
/// Get an immutable reference to the underlying [Write](std::io::Write) implementor
|
||||
pub fn inner(&self) -> &T {
|
||||
&self.writer
|
||||
}
|
||||
|
||||
/// Get a mutable reference to the underlying [Write] implementor
|
||||
/// Get a mutable reference to the underlying [Write](std::io::Write) implementor
|
||||
pub fn inner_mut(&mut self) -> &mut T {
|
||||
&mut self.writer
|
||||
}
|
||||
|
||||
/// Consume the wrapper and return the underlying [Write] implementor
|
||||
/// Consume the wrapper and return the underlying [Write](std::io::Write) implementor
|
||||
pub fn into_inner(self) -> T {
|
||||
self.writer
|
||||
}
|
||||
|
||||
@@ -19,8 +19,8 @@ use tracing::{debug, error, info, trace};
|
||||
|
||||
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
|
||||
use pq_proto::{
|
||||
BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN,
|
||||
SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION,
|
||||
BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
|
||||
SQLSTATE_SUCCESSFUL_COMPLETION,
|
||||
};
|
||||
|
||||
/// An error, occurred during query processing:
|
||||
@@ -30,9 +30,6 @@ pub enum QueryError {
|
||||
/// The connection was lost while processing the query.
|
||||
#[error(transparent)]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
/// We were instructed to shutdown while processing the query
|
||||
#[error("Shutting down")]
|
||||
Shutdown,
|
||||
/// Some other error
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
@@ -47,8 +44,7 @@ impl From<io::Error> for QueryError {
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) => b"08006", // connection failure
|
||||
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
|
||||
Self::Disconnected(_) => b"08006", // connection failure
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
@@ -400,20 +396,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
// socket might be already closed, e.g. if previously received error,
|
||||
// so ignore result.
|
||||
self.framed.shutdown().await.ok();
|
||||
match ret {
|
||||
Ok(()) => Ok(()),
|
||||
Err(QueryError::Shutdown) => {
|
||||
info!("Stopped due to shutdown");
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Disconnected(e)) => {
|
||||
info!("Disconnected ({e:#})");
|
||||
// Disconnection is not an error: we just use it that way internally to drop
|
||||
// out of loops.
|
||||
Ok(())
|
||||
}
|
||||
e => e,
|
||||
}
|
||||
ret
|
||||
}
|
||||
|
||||
async fn run_message_loop<F, S>(
|
||||
@@ -433,11 +416,15 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during handshake");
|
||||
return Err(QueryError::Shutdown)
|
||||
return Ok(())
|
||||
},
|
||||
|
||||
handshake_r = self.handshake(handler) => {
|
||||
handshake_r?;
|
||||
result = self.handshake(handler) => {
|
||||
// Handshake complete.
|
||||
result?;
|
||||
if self.state == ProtoState::Closed {
|
||||
return Ok(()); // EOF during handshake
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
@@ -448,7 +435,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received in run_message_loop");
|
||||
return Err(QueryError::Shutdown)
|
||||
Ok(None)
|
||||
},
|
||||
msg = self.read_message() => { msg },
|
||||
)? {
|
||||
@@ -460,14 +447,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during response flush");
|
||||
|
||||
// If we exited process_message with a shutdown error, there may be
|
||||
// some valid response content on in our transmit buffer: permit sending
|
||||
// this within a short timeout. This is a best effort thing so we don't
|
||||
// care about the result.
|
||||
tokio::time::timeout(std::time::Duration::from_millis(500), self.flush()).await.ok();
|
||||
|
||||
return Err(QueryError::Shutdown)
|
||||
return Ok(())
|
||||
},
|
||||
flush_r = self.flush() => {
|
||||
flush_r?;
|
||||
@@ -580,9 +560,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
self.peer_addr
|
||||
);
|
||||
self.state = ProtoState::Closed;
|
||||
return Err(QueryError::Disconnected(ConnectionError::Protocol(
|
||||
ProtocolError::Protocol("EOF during handshake".to_string()),
|
||||
)));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -621,9 +599,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
self.peer_addr
|
||||
);
|
||||
self.state = ProtoState::Closed;
|
||||
return Err(QueryError::Disconnected(ConnectionError::Protocol(
|
||||
ProtocolError::Protocol("EOF during auth".to_string()),
|
||||
)));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -947,7 +923,6 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
|
||||
pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Shutdown => "shutdown".to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
}
|
||||
}
|
||||
@@ -964,9 +939,6 @@ fn log_query_error(query: &str, e: &QueryError) {
|
||||
QueryError::Disconnected(other_connection_error) => {
|
||||
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
|
||||
}
|
||||
QueryError::Shutdown => {
|
||||
info!("query handler for '{query}' cancelled during tenant shutdown")
|
||||
}
|
||||
QueryError::Other(e) => {
|
||||
error!("query handler for '{query}' failed: {e:?}");
|
||||
}
|
||||
|
||||
@@ -131,7 +131,6 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
// Export some version independent functions that are used outside of this mod
|
||||
pub use v14::xlog_utils::encode_logical_message;
|
||||
pub use v14::xlog_utils::from_pg_timestamp;
|
||||
pub use v14::xlog_utils::get_current_timestamp;
|
||||
pub use v14::xlog_utils::to_pg_timestamp;
|
||||
pub use v14::xlog_utils::XLogFileName;
|
||||
|
||||
@@ -220,10 +220,6 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLP_LONG_HEADER: u16 = 0x0002;
|
||||
|
||||
/* From replication/slot.h */
|
||||
pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */
|
||||
+ 64 /* NameData */ + 4*4;
|
||||
|
||||
/* From fsm_internals.h */
|
||||
const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
|
||||
const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
|
||||
|
||||
@@ -136,42 +136,21 @@ pub fn get_current_timestamp() -> TimestampTz {
|
||||
to_pg_timestamp(SystemTime::now())
|
||||
}
|
||||
|
||||
// Module to reduce the scope of the constants
|
||||
mod timestamp_conversions {
|
||||
use std::time::Duration;
|
||||
|
||||
use super::*;
|
||||
|
||||
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
|
||||
const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1)
|
||||
pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
|
||||
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
|
||||
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
|
||||
const SECS_PER_DAY: u64 = 86400;
|
||||
const USECS_PER_SEC: u64 = 1000000;
|
||||
const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 =
|
||||
(POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY;
|
||||
|
||||
pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
|
||||
match time.duration_since(SystemTime::UNIX_EPOCH) {
|
||||
Ok(n) => {
|
||||
((n.as_secs() - SECS_DIFF_UNIX_TO_POSTGRES_EPOCH) * USECS_PER_SEC
|
||||
+ n.subsec_micros() as u64) as i64
|
||||
}
|
||||
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
|
||||
match time.duration_since(SystemTime::UNIX_EPOCH) {
|
||||
Ok(n) => {
|
||||
((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
|
||||
* USECS_PER_SEC
|
||||
+ n.subsec_micros() as u64) as i64
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
|
||||
let time: u64 = time
|
||||
.try_into()
|
||||
.expect("timestamp before millenium (postgres epoch)");
|
||||
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_micros(since_unix_epoch))
|
||||
.expect("SystemTime overflow")
|
||||
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
|
||||
}
|
||||
}
|
||||
|
||||
pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
|
||||
|
||||
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
|
||||
// start_lsn must point to some previously known record boundary (beginning of
|
||||
// the next record). If no valid record after is found, start_lsn is returned
|
||||
@@ -502,24 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
|
||||
wal
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_ts_conversion() {
|
||||
let now = SystemTime::now();
|
||||
let round_trip = from_pg_timestamp(to_pg_timestamp(now));
|
||||
|
||||
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
|
||||
|
||||
let now_pg = get_current_timestamp();
|
||||
let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
|
||||
|
||||
assert_eq!(now_pg, round_trip_pg);
|
||||
}
|
||||
|
||||
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
|
||||
}
|
||||
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
|
||||
|
||||
@@ -670,7 +670,6 @@ pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
|
||||
}
|
||||
|
||||
pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
|
||||
pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
|
||||
pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";
|
||||
|
||||
impl<'a> BeMessage<'a> {
|
||||
|
||||
@@ -28,7 +28,6 @@ utils.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
azure_core.workspace = true
|
||||
azure_identity.workspace = true
|
||||
azure_storage.workspace = true
|
||||
azure_storage_blobs.workspace = true
|
||||
futures-util.workspace = true
|
||||
|
||||
@@ -1,15 +1,12 @@
|
||||
//! Azure Blob Storage wrapper
|
||||
|
||||
use std::env;
|
||||
use std::num::NonZeroU32;
|
||||
use std::sync::Arc;
|
||||
use std::{borrow::Cow, collections::HashMap, io::Cursor};
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
use azure_core::request_options::{MaxResults, Metadata, Range};
|
||||
use azure_core::Header;
|
||||
use azure_identity::DefaultAzureCredential;
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{
|
||||
@@ -41,16 +38,12 @@ impl AzureBlobStorage {
|
||||
azure_config.container_name
|
||||
);
|
||||
|
||||
let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
|
||||
let account =
|
||||
std::env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
|
||||
let access_key =
|
||||
std::env::var("AZURE_STORAGE_ACCESS_KEY").expect("missing AZURE_STORAGE_ACCESS_KEY");
|
||||
|
||||
// If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that,
|
||||
// otherwise try the token based credentials.
|
||||
let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
|
||||
StorageCredentials::access_key(account.clone(), access_key)
|
||||
} else {
|
||||
let token_credential = DefaultAzureCredential::default();
|
||||
StorageCredentials::token_credential(Arc::new(token_credential))
|
||||
};
|
||||
let credentials = StorageCredentials::access_key(account.clone(), access_key);
|
||||
|
||||
let builder = ClientBuilder::new(account, credentials);
|
||||
|
||||
@@ -121,7 +114,22 @@ impl AzureBlobStorage {
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
let mut buf = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
let part = match part {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
return Err(if let Some(http_err) = e.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => {
|
||||
DownloadError::BadInput(anyhow::Error::new(e))
|
||||
}
|
||||
_ => DownloadError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(e.into())
|
||||
});
|
||||
}
|
||||
};
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
@@ -142,16 +150,30 @@ impl AzureBlobStorage {
|
||||
) -> Result<StorageMetadata, DownloadError> {
|
||||
let builder = blob_client.get_metadata();
|
||||
|
||||
let response = builder.into_future().await.map_err(to_download_error)?;
|
||||
let mut map = HashMap::new();
|
||||
match builder.into_future().await {
|
||||
Ok(r) => {
|
||||
let mut map = HashMap::new();
|
||||
|
||||
for md in response.metadata.iter() {
|
||||
map.insert(
|
||||
md.name().as_str().to_string(),
|
||||
md.value().as_str().to_string(),
|
||||
);
|
||||
for md in r.metadata.iter() {
|
||||
map.insert(
|
||||
md.name().as_str().to_string(),
|
||||
md.value().as_str().to_string(),
|
||||
);
|
||||
}
|
||||
Ok(StorageMetadata(map))
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(if let Some(http_err) = e.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(e)),
|
||||
_ => DownloadError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(e.into())
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(StorageMetadata(map))
|
||||
}
|
||||
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
@@ -170,18 +192,6 @@ fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
|
||||
res
|
||||
}
|
||||
|
||||
fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
if let Some(http_err) = error.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
|
||||
_ => DownloadError::Other(anyhow::Error::new(error)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(error.into())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
async fn list_prefixes(
|
||||
@@ -216,8 +226,23 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
|
||||
let mut response = builder.into_stream();
|
||||
let mut res = Vec::new();
|
||||
while let Some(entry) = response.next().await {
|
||||
let entry = entry.map_err(to_download_error)?;
|
||||
while let Some(l) = response.next().await {
|
||||
let entry = match l {
|
||||
Ok(l) => l,
|
||||
Err(e) => {
|
||||
return Err(if let Some(http_err) = e.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => {
|
||||
DownloadError::BadInput(anyhow::Error::new(e))
|
||||
}
|
||||
_ => DownloadError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(e.into())
|
||||
});
|
||||
}
|
||||
};
|
||||
let name_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
|
||||
@@ -27,8 +27,8 @@ and old one if it exists.
|
||||
* the filecache: a struct that allows communication with the Postgres file cache.
|
||||
On startup, we connect to the filecache and hold on to the connection for the
|
||||
entire monitor lifetime.
|
||||
* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory
|
||||
usage and sends rolling aggregates to the runner.
|
||||
* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
|
||||
listening for `memory.high` events and setting its `memory.{high,max}` values.
|
||||
* the runner: the runner marries the filecache and cgroup watcher together,
|
||||
communicating with the agent throught the `Dispatcher`, and then calling filecache
|
||||
and cgroup watcher functions as needed to upscale and downscale
|
||||
|
||||
@@ -1,38 +1,161 @@
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use cgroups_rs::{
|
||||
hierarchies::{self, is_cgroup2_unified_mode},
|
||||
memory::MemController,
|
||||
Subsystem,
|
||||
use std::{
|
||||
fmt::{Debug, Display},
|
||||
fs,
|
||||
pin::pin,
|
||||
sync::atomic::{AtomicU64, Ordering},
|
||||
};
|
||||
use tokio::sync::watch;
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use cgroups_rs::{
|
||||
freezer::FreezerController,
|
||||
hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
|
||||
memory::MemController,
|
||||
MaxValue,
|
||||
Subsystem::{Freezer, Mem},
|
||||
};
|
||||
use inotify::{EventStream, Inotify, WatchMask};
|
||||
use tokio::sync::mpsc::{self, error::TryRecvError};
|
||||
use tokio::time::{Duration, Instant};
|
||||
use tokio_stream::{Stream, StreamExt};
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::protocol::Resources;
|
||||
use crate::MiB;
|
||||
|
||||
/// Monotonically increasing counter of the number of memory.high events
|
||||
/// the cgroup has experienced.
|
||||
///
|
||||
/// We use this to determine if a modification to the `memory.events` file actually
|
||||
/// changed the `high` field. If not, we don't care about the change. When we
|
||||
/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
|
||||
/// to see if it changed since last time.
|
||||
pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
/// Monotonically increasing counter that gives each cgroup event a unique id.
|
||||
///
|
||||
/// This allows us to answer questions like "did this upscale arrive before this
|
||||
/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
|
||||
/// with a sequence number. As such, prefer to used the `Sequenced` type rather
|
||||
/// than this static directly.
|
||||
static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
/// A memory event type reported in memory.events.
|
||||
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
|
||||
pub enum MemoryEvent {
|
||||
Low,
|
||||
High,
|
||||
Max,
|
||||
Oom,
|
||||
OomKill,
|
||||
OomGroupKill,
|
||||
}
|
||||
|
||||
impl MemoryEvent {
|
||||
fn as_str(&self) -> &str {
|
||||
match self {
|
||||
MemoryEvent::Low => "low",
|
||||
MemoryEvent::High => "high",
|
||||
MemoryEvent::Max => "max",
|
||||
MemoryEvent::Oom => "oom",
|
||||
MemoryEvent::OomKill => "oom_kill",
|
||||
MemoryEvent::OomGroupKill => "oom_group_kill",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for MemoryEvent {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for a `CgroupWatcher`
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Config {
|
||||
/// Interval at which we should be fetching memory statistics
|
||||
memory_poll_interval: Duration,
|
||||
// The target difference between the total memory reserved for the cgroup
|
||||
// and the value of the cgroup's memory.high.
|
||||
//
|
||||
// In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
|
||||
// use (equal to system memory, minus whatever's taken out for the file cache).
|
||||
oom_buffer_bytes: u64,
|
||||
|
||||
/// The number of samples used in constructing aggregated memory statistics
|
||||
memory_history_len: usize,
|
||||
/// The number of most recent samples that will be periodically logged.
|
||||
///
|
||||
/// Each sample is logged exactly once. Increasing this value means that recent samples will be
|
||||
/// logged less frequently, and vice versa.
|
||||
///
|
||||
/// For simplicity, this value must be greater than or equal to `memory_history_len`.
|
||||
memory_history_log_interval: usize,
|
||||
// The amount of memory, in bytes, below a proposed new value for
|
||||
// memory.high that the cgroup's memory usage must be for us to downscale
|
||||
//
|
||||
// In other words, we can downscale only when:
|
||||
//
|
||||
// memory.current + memory_high_buffer_bytes < (proposed) memory.high
|
||||
//
|
||||
// TODO: there's some minor issues with this approach -- in particular, that we might have
|
||||
// memory in use by the kernel's page cache that we're actually ok with getting rid of.
|
||||
pub(crate) memory_high_buffer_bytes: u64,
|
||||
|
||||
// The maximum duration, in milliseconds, that we're allowed to pause
|
||||
// the cgroup for while waiting for the autoscaler-agent to upscale us
|
||||
max_upscale_wait: Duration,
|
||||
|
||||
// The required minimum time, in milliseconds, that we must wait before re-freezing
|
||||
// the cgroup while waiting for the autoscaler-agent to upscale us.
|
||||
do_not_freeze_more_often_than: Duration,
|
||||
|
||||
// The amount of memory, in bytes, that we should periodically increase memory.high
|
||||
// by while waiting for the autoscaler-agent to upscale us.
|
||||
//
|
||||
// This exists to avoid the excessive throttling that happens when a cgroup is above its
|
||||
// memory.high for too long. See more here:
|
||||
// https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
|
||||
memory_high_increase_by_bytes: u64,
|
||||
|
||||
// The period, in milliseconds, at which we should repeatedly increase the value
|
||||
// of the cgroup's memory.high while we're waiting on upscaling and memory.high
|
||||
// is still being hit.
|
||||
//
|
||||
// Technically speaking, this actually serves as a rate limit to moderate responding to
|
||||
// memory.high events, but these are roughly equivalent if the process is still allocating
|
||||
// memory.
|
||||
memory_high_increase_every: Duration,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Calculate the new value for the cgroups memory.high based on system memory
|
||||
pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
|
||||
total_system_mem.saturating_sub(self.oom_buffer_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
memory_poll_interval: Duration::from_millis(100),
|
||||
memory_history_len: 5, // use 500ms of history for decision-making
|
||||
memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
|
||||
oom_buffer_bytes: 100 * MiB,
|
||||
memory_high_buffer_bytes: 100 * MiB,
|
||||
// while waiting for upscale, don't freeze for more than 20ms every 1s
|
||||
max_upscale_wait: Duration::from_millis(20),
|
||||
do_not_freeze_more_often_than: Duration::from_millis(1000),
|
||||
// while waiting for upscale, increase memory.high by 10MiB every 25ms
|
||||
memory_high_increase_by_bytes: 10 * MiB,
|
||||
memory_high_increase_every: Duration::from_millis(25),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Used to represent data that is associated with a certain point in time, such
|
||||
/// as an upscale request or memory.high event.
|
||||
///
|
||||
/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
|
||||
/// a unique sequence number. Sequence numbers are monotonically increasing,
|
||||
/// allowing us to answer questions like "did this upscale happen after this
|
||||
/// memory.high event?" by comparing the sequence numbers of the two events.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Sequenced<T> {
|
||||
seqnum: u64,
|
||||
data: T,
|
||||
}
|
||||
|
||||
impl<T> Sequenced<T> {
|
||||
pub fn new(data: T) -> Self {
|
||||
Self {
|
||||
seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
|
||||
data,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -47,14 +170,74 @@ impl Default for Config {
|
||||
pub struct CgroupWatcher {
|
||||
pub config: Config,
|
||||
|
||||
/// The sequence number of the last upscale.
|
||||
///
|
||||
/// If we receive a memory.high event that has a _lower_ sequence number than
|
||||
/// `last_upscale_seqnum`, then we know it occured before the upscale, and we
|
||||
/// can safely ignore it.
|
||||
///
|
||||
/// Note: Like the `events` field, this doesn't _need_ interior mutability but we
|
||||
/// use it anyways so that methods take `&self`, not `&mut self`.
|
||||
last_upscale_seqnum: AtomicU64,
|
||||
|
||||
/// A channel on which we send messages to request upscale from the dispatcher.
|
||||
upscale_requester: mpsc::Sender<()>,
|
||||
|
||||
/// The actual cgroup we are watching and managing.
|
||||
cgroup: cgroups_rs::Cgroup,
|
||||
}
|
||||
|
||||
/// Read memory.events for the desired event type.
|
||||
///
|
||||
/// `path` specifies the path to the desired `memory.events` file.
|
||||
/// For more info, see the `memory.events` section of the [kernel docs]
|
||||
/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
|
||||
fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
|
||||
let contents = fs::read_to_string(path)
|
||||
.with_context(|| format!("failed to read memory.events from {path}"))?;
|
||||
|
||||
// Then contents of the file look like:
|
||||
// low 42
|
||||
// high 101
|
||||
// ...
|
||||
contents
|
||||
.lines()
|
||||
.filter_map(|s| s.split_once(' '))
|
||||
.find(|(e, _)| *e == event.as_str())
|
||||
.ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
|
||||
.and_then(|(_, count)| {
|
||||
count
|
||||
.parse::<u64>()
|
||||
.with_context(|| format!("failed to parse memory.{event} as u64"))
|
||||
})
|
||||
}
|
||||
|
||||
/// Create an event stream that produces events whenever the file at the provided
|
||||
/// path is modified.
|
||||
fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
|
||||
info!("creating file watcher for {path}");
|
||||
let inotify = Inotify::init().context("failed to initialize file watcher")?;
|
||||
inotify
|
||||
.watches()
|
||||
.add(path, WatchMask::MODIFY)
|
||||
.with_context(|| format!("failed to start watching {path}"))?;
|
||||
inotify
|
||||
// The inotify docs use [0u8; 1024] so we'll just copy them. We only need
|
||||
// to store one event at a time - if the event gets written over, that's
|
||||
// ok. We still see that there is an event. For more information, see:
|
||||
// https://man7.org/linux/man-pages/man7/inotify.7.html
|
||||
.into_event_stream([0u8; 1024])
|
||||
.context("failed to start inotify event stream")
|
||||
}
|
||||
|
||||
impl CgroupWatcher {
|
||||
/// Create a new `CgroupWatcher`.
|
||||
#[tracing::instrument(skip_all, fields(%name))]
|
||||
pub fn new(name: String) -> anyhow::Result<Self> {
|
||||
pub fn new(
|
||||
name: String,
|
||||
// A channel on which to send upscale requests
|
||||
upscale_requester: mpsc::Sender<()>,
|
||||
) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
|
||||
// TODO: clarify exactly why we need v2
|
||||
// Make sure cgroups v2 (aka unified) are supported
|
||||
if !is_cgroup2_unified_mode() {
|
||||
@@ -62,203 +245,410 @@ impl CgroupWatcher {
|
||||
}
|
||||
let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
|
||||
|
||||
Ok(Self {
|
||||
cgroup,
|
||||
config: Default::default(),
|
||||
})
|
||||
// Start monitoring the cgroup for memory events. In general, for
|
||||
// cgroups v2 (aka unified), metrics are reported in files like
|
||||
// > `/sys/fs/cgroup/{name}/{metric}`
|
||||
// We are looking for `memory.high` events, which are stored in the
|
||||
// file `memory.events`. For more info, see the `memory.events` section
|
||||
// of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
|
||||
let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
|
||||
let memory_events = create_file_watcher(&path)
|
||||
.with_context(|| format!("failed to create event watcher for {path}"))?
|
||||
// This would be nice with with .inspect_err followed by .ok
|
||||
.filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
|
||||
Ok(high) => Some(high),
|
||||
Err(error) => {
|
||||
// TODO: Might want to just panic here
|
||||
warn!(?error, "failed to read high events count from {}", &path);
|
||||
None
|
||||
}
|
||||
})
|
||||
// Only report the event if the memory.high count increased
|
||||
.filter_map(|high| {
|
||||
if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
|
||||
Some(high)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(Sequenced::new);
|
||||
|
||||
let initial_count = get_event_count(
|
||||
&format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
|
||||
MemoryEvent::High,
|
||||
)?;
|
||||
|
||||
info!(initial_count, "initial memory.high event count");
|
||||
|
||||
// Hard update `MEMORY_EVENT_COUNT` since there could have been processes
|
||||
// running in the cgroup before that caused it to be non-zero.
|
||||
MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
|
||||
|
||||
Ok((
|
||||
Self {
|
||||
cgroup,
|
||||
upscale_requester,
|
||||
last_upscale_seqnum: AtomicU64::new(0),
|
||||
config: Default::default(),
|
||||
},
|
||||
memory_events,
|
||||
))
|
||||
}
|
||||
|
||||
/// The entrypoint for the `CgroupWatcher`.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn watch(
|
||||
pub async fn watch<E>(
|
||||
&self,
|
||||
updates: watch::Sender<(Instant, MemoryHistory)>,
|
||||
) -> anyhow::Result<()> {
|
||||
// this requirement makes the code a bit easier to work with; see the config for more.
|
||||
assert!(self.config.memory_history_len <= self.config.memory_history_log_interval);
|
||||
// These are ~dependency injected~ (fancy, I know) because this function
|
||||
// should never return.
|
||||
// -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
|
||||
// -> therefore: if we want to stick it in an Arc so many threads can access
|
||||
// it, methods can never take mutable access.
|
||||
// - note: we use the Arc strategy so that a) we can call this function
|
||||
// right here and b) the runner can call the set/get_memory methods
|
||||
// -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
|
||||
// we just pass them in here instead of holding them in fields, as that
|
||||
// would require this method to take &mut self.
|
||||
mut upscales: mpsc::Receiver<Sequenced<Resources>>,
|
||||
events: E,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
E: Stream<Item = Sequenced<u64>>,
|
||||
{
|
||||
let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
|
||||
let mut last_memory_high_increase_at: Option<Instant> = None;
|
||||
let mut events = pin!(events);
|
||||
|
||||
let mut ticker = tokio::time::interval(self.config.memory_poll_interval);
|
||||
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||||
// ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0
|
||||
// Are we waiting to be upscaled? Could be true if we request upscale due
|
||||
// to a memory.high event and it does not arrive in time.
|
||||
let mut waiting_on_upscale = false;
|
||||
|
||||
let mem_controller = self.memory()?;
|
||||
loop {
|
||||
tokio::select! {
|
||||
upscale = upscales.recv() => {
|
||||
let Sequenced { seqnum, data } = upscale
|
||||
.context("failed to listen on upscale notification channel")?;
|
||||
waiting_on_upscale = false;
|
||||
last_memory_high_increase_at = None;
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
|
||||
}
|
||||
event = events.next() => {
|
||||
let Some(Sequenced { seqnum, .. }) = event else {
|
||||
bail!("failed to listen for memory.high events")
|
||||
};
|
||||
// The memory.high came before our last upscale, so we consider
|
||||
// it resolved
|
||||
if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
|
||||
info!(
|
||||
"received memory.high event, but it came before our last upscale -> ignoring it"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// buffer for samples that will be logged. once full, it remains so.
|
||||
let history_log_len = self.config.memory_history_log_interval;
|
||||
let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];
|
||||
// The memory.high came after our latest upscale. We don't
|
||||
// want to do anything yet, so peek the next event in hopes
|
||||
// that it's an upscale.
|
||||
if let Some(upscale_num) = self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
{
|
||||
if upscale_num > seqnum {
|
||||
info!(
|
||||
"received memory.high event, but it came before our last upscale -> ignoring it"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
for t in 0_u64.. {
|
||||
ticker.tick().await;
|
||||
// If it's been long enough since we last froze, freeze the
|
||||
// cgroup and request upscale
|
||||
if wait_to_freeze.is_elapsed() {
|
||||
info!("received memory.high event -> requesting upscale");
|
||||
waiting_on_upscale = self
|
||||
.handle_memory_high_event(&mut upscales)
|
||||
.await
|
||||
.context("failed to handle upscale")?;
|
||||
wait_to_freeze
|
||||
.as_mut()
|
||||
.reset(Instant::now() + self.config.do_not_freeze_more_often_than);
|
||||
continue;
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
let mem = Self::memory_usage(mem_controller);
|
||||
// Ok, we can't freeze, just request upscale
|
||||
if !waiting_on_upscale {
|
||||
info!("received memory.high event, but too soon to refreeze -> requesting upscale");
|
||||
|
||||
let i = t as usize % history_log_len;
|
||||
history_log_buf[i] = mem;
|
||||
// Make check to make sure we haven't been upscaled in the
|
||||
// meantine (can happen if the agent independently decides
|
||||
// to upscale us again)
|
||||
if self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
.is_some()
|
||||
{
|
||||
info!("no need to request upscaling because we got upscaled");
|
||||
continue;
|
||||
}
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
waiting_on_upscale = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
// We're taking *at most* memory_history_len values; we may be bounded by the total
|
||||
// number of samples that have come in so far.
|
||||
let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize;
|
||||
// NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact
|
||||
// that we just inserted a value there, so the end of the iterator will *include* the
|
||||
// value at i, rather than stopping just short of it.
|
||||
let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count);
|
||||
// Shoot, we can't freeze or and we're still waiting on upscale,
|
||||
// increase memory.high to reduce throttling
|
||||
let can_increase_memory_high = match last_memory_high_increase_at {
|
||||
None => true,
|
||||
Some(t) => t.elapsed() > self.config.memory_high_increase_every,
|
||||
};
|
||||
if can_increase_memory_high {
|
||||
info!(
|
||||
"received memory.high event, \
|
||||
but too soon to refreeze and already requested upscale \
|
||||
-> increasing memory.high"
|
||||
);
|
||||
|
||||
let summary = MemoryHistory {
|
||||
avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::<u64>()
|
||||
/ samples_count as u64,
|
||||
samples_count,
|
||||
samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32,
|
||||
// Make check to make sure we haven't been upscaled in the
|
||||
// meantine (can happen if the agent independently decides
|
||||
// to upscale us again)
|
||||
if self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
.is_some()
|
||||
{
|
||||
info!("no need to increase memory.high because got upscaled");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Request upscale anyways (the agent will handle deduplicating
|
||||
// requests)
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
|
||||
let memory_high =
|
||||
self.get_memory_high_bytes().context("failed to get memory.high")?;
|
||||
let new_high = memory_high + self.config.memory_high_increase_by_bytes;
|
||||
info!(
|
||||
current_high_bytes = memory_high,
|
||||
new_high_bytes = new_high,
|
||||
"updating memory.high"
|
||||
);
|
||||
self.set_memory_high_bytes(new_high)
|
||||
.context("failed to set memory.high")?;
|
||||
last_memory_high_increase_at = Some(Instant::now());
|
||||
continue;
|
||||
}
|
||||
|
||||
info!("received memory.high event, but can't do anything");
|
||||
}
|
||||
};
|
||||
|
||||
// Log the current history if it's time to do so. Because `history_log_buf` has length
|
||||
// equal to the logging interval, we can just log the entire buffer every time we set
|
||||
// the last entry, which also means that for this log line, we can ignore that it's a
|
||||
// ring buffer (because all the entries are in order of increasing time).
|
||||
if i == history_log_len - 1 {
|
||||
info!(
|
||||
history = ?MemoryStatus::debug_slice(&history_log_buf),
|
||||
summary = ?summary,
|
||||
"Recent cgroup memory statistics history"
|
||||
);
|
||||
}
|
||||
|
||||
updates
|
||||
.send((now, summary))
|
||||
.context("failed to send MemoryHistory")?;
|
||||
}
|
||||
}
|
||||
|
||||
unreachable!()
|
||||
/// Handle a `memory.high`, returning whether we are still waiting on upscale
|
||||
/// by the time the function returns.
|
||||
///
|
||||
/// The general plan for handling a `memory.high` event is as follows:
|
||||
/// 1. Freeze the cgroup
|
||||
/// 2. Start a timer for `self.config.max_upscale_wait`
|
||||
/// 3. Request upscale
|
||||
/// 4. After the timer elapses or we receive upscale, thaw the cgroup.
|
||||
/// 5. Return whether or not we are still waiting for upscale. If we are,
|
||||
/// we'll increase the cgroups memory.high to avoid getting oom killed
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn handle_memory_high_event(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<bool> {
|
||||
// Immediately freeze the cgroup before doing anything else.
|
||||
info!("received memory.high event -> freezing cgroup");
|
||||
self.freeze().context("failed to freeze cgroup")?;
|
||||
|
||||
// We'll use this for logging durations
|
||||
let start_time = Instant::now();
|
||||
|
||||
// Await the upscale until we have to unfreeze
|
||||
let timed =
|
||||
tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
|
||||
|
||||
// Request the upscale
|
||||
info!(
|
||||
wait = ?self.config.max_upscale_wait,
|
||||
"sending request for immediate upscaling",
|
||||
);
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
|
||||
let waiting_on_upscale = match timed.await {
|
||||
Ok(Ok(())) => {
|
||||
info!(elapsed = ?start_time.elapsed(), "received upscale in time");
|
||||
false
|
||||
}
|
||||
// **important**: unfreeze the cgroup before ?-reporting the error
|
||||
Ok(Err(e)) => {
|
||||
info!("error waiting for upscale -> thawing cgroup");
|
||||
self.thaw()
|
||||
.context("failed to thaw cgroup after errored waiting for upscale")?;
|
||||
Err(e.context("failed to await upscale"))?
|
||||
}
|
||||
Err(_) => {
|
||||
info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
info!("thawing cgroup");
|
||||
self.thaw().context("failed to thaw cgroup")?;
|
||||
|
||||
Ok(waiting_on_upscale)
|
||||
}
|
||||
|
||||
/// Checks whether we were just upscaled, returning the upscale's sequence
|
||||
/// number if so.
|
||||
#[tracing::instrument(skip_all)]
|
||||
fn upscaled(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<Option<u64>> {
|
||||
let Sequenced { seqnum, data } = match upscales.try_recv() {
|
||||
Ok(upscale) => upscale,
|
||||
Err(TryRecvError::Empty) => return Ok(None),
|
||||
Err(TryRecvError::Disconnected) => {
|
||||
bail!("upscale notification channel was disconnected")
|
||||
}
|
||||
};
|
||||
|
||||
// Make sure to update the last upscale sequence number
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
|
||||
Ok(Some(seqnum))
|
||||
}
|
||||
|
||||
/// Await an upscale event, discarding any `memory.high` events received in
|
||||
/// the process.
|
||||
///
|
||||
/// This is used in `handle_memory_high_event`, where we need to listen
|
||||
/// for upscales in particular so we know if we can thaw the cgroup early.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn await_upscale(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Sequenced { seqnum, .. } = upscales
|
||||
.recv()
|
||||
.await
|
||||
.context("error listening for upscales")?;
|
||||
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the cgroup's name.
|
||||
pub fn path(&self) -> &str {
|
||||
self.cgroup.path()
|
||||
}
|
||||
}
|
||||
|
||||
// Methods for manipulating the actual cgroup
|
||||
impl CgroupWatcher {
|
||||
/// Get a handle on the freezer subsystem.
|
||||
fn freezer(&self) -> anyhow::Result<&FreezerController> {
|
||||
if let Some(Freezer(freezer)) = self
|
||||
.cgroup
|
||||
.subsystems()
|
||||
.iter()
|
||||
.find(|sub| matches!(sub, Freezer(_)))
|
||||
{
|
||||
Ok(freezer)
|
||||
} else {
|
||||
anyhow::bail!("could not find freezer subsystem")
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to freeze the cgroup.
|
||||
pub fn freeze(&self) -> anyhow::Result<()> {
|
||||
self.freezer()
|
||||
.context("failed to get freezer subsystem")?
|
||||
.freeze()
|
||||
.context("failed to freeze")
|
||||
}
|
||||
|
||||
/// Attempt to thaw the cgroup.
|
||||
pub fn thaw(&self) -> anyhow::Result<()> {
|
||||
self.freezer()
|
||||
.context("failed to get freezer subsystem")?
|
||||
.thaw()
|
||||
.context("failed to thaw")
|
||||
}
|
||||
|
||||
/// Get a handle on the memory subsystem.
|
||||
///
|
||||
/// Note: this method does not require `self.memory_update_lock` because
|
||||
/// getting a handle to the subsystem does not access any of the files we
|
||||
/// care about, such as memory.high and memory.events
|
||||
fn memory(&self) -> anyhow::Result<&MemController> {
|
||||
self.cgroup
|
||||
if let Some(Mem(memory)) = self
|
||||
.cgroup
|
||||
.subsystems()
|
||||
.iter()
|
||||
.find_map(|sub| match sub {
|
||||
Subsystem::Mem(c) => Some(c),
|
||||
_ => None,
|
||||
.find(|sub| matches!(sub, Mem(_)))
|
||||
{
|
||||
Ok(memory)
|
||||
} else {
|
||||
anyhow::bail!("could not find memory subsystem")
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cgroup current memory usage.
|
||||
pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
|
||||
Ok(self
|
||||
.memory()
|
||||
.context("failed to get memory subsystem")?
|
||||
.memory_stat()
|
||||
.usage_in_bytes)
|
||||
}
|
||||
|
||||
/// Set cgroup memory.high threshold.
|
||||
pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
|
||||
self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
|
||||
}
|
||||
|
||||
/// Set the cgroup's memory.high to 'max', disabling it.
|
||||
pub fn unset_memory_high(&self) -> anyhow::Result<()> {
|
||||
self.set_memory_high_internal(MaxValue::Max)
|
||||
}
|
||||
|
||||
fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
|
||||
self.memory()
|
||||
.context("failed to get memory subsystem")?
|
||||
.set_mem(cgroups_rs::memory::SetMemory {
|
||||
low: None,
|
||||
high: Some(value),
|
||||
min: None,
|
||||
max: None,
|
||||
})
|
||||
.ok_or_else(|| anyhow!("could not find memory subsystem"))
|
||||
.map_err(anyhow::Error::from)
|
||||
}
|
||||
|
||||
/// Given a handle on the memory subsystem, returns the current memory information
|
||||
fn memory_usage(mem_controller: &MemController) -> MemoryStatus {
|
||||
let stat = mem_controller.memory_stat().stat;
|
||||
MemoryStatus {
|
||||
non_reclaimable: stat.active_anon + stat.inactive_anon,
|
||||
/// Get memory.high threshold.
|
||||
pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
|
||||
let high = self
|
||||
.memory()
|
||||
.context("failed to get memory subsystem while getting memory statistics")?
|
||||
.get_mem()
|
||||
.map(|mem| mem.high)
|
||||
.context("failed to get memory statistics from subsystem")?;
|
||||
match high {
|
||||
Some(MaxValue::Max) => Ok(i64::MAX as u64),
|
||||
Some(MaxValue::Value(high)) => Ok(high as u64),
|
||||
None => anyhow::bail!("failed to read memory.high from memory subsystem"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function for `CgroupWatcher::watch`
|
||||
fn ring_buf_recent_values_iter<T>(
|
||||
buf: &[T],
|
||||
last_value_idx: usize,
|
||||
count: usize,
|
||||
) -> impl '_ + Iterator<Item = &T> {
|
||||
// Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function
|
||||
// easier (we only have to add `buf.len()` once, rather than a dynamic number of times).
|
||||
assert!(count <= buf.len());
|
||||
|
||||
buf.iter()
|
||||
// 'cycle' because the values could wrap around
|
||||
.cycle()
|
||||
// with 'cycle', this skip is more like 'offset', and functionally this is
|
||||
// offsettting by 'last_value_idx - count (mod buf.len())', but we have to be
|
||||
// careful to avoid underflow, so we pre-add buf.len().
|
||||
// The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive.
|
||||
.skip((buf.len() + last_value_idx + 1 - count) % buf.len())
|
||||
.take(count)
|
||||
}
|
||||
|
||||
/// Summary of recent memory usage
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct MemoryHistory {
|
||||
/// Rolling average of non-reclaimable memory usage samples over the last `history_period`
|
||||
pub avg_non_reclaimable: u64,
|
||||
|
||||
/// The number of samples used to construct this summary
|
||||
pub samples_count: usize,
|
||||
/// Total timespan between the first and last sample used for this summary
|
||||
pub samples_span: Duration,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct MemoryStatus {
|
||||
non_reclaimable: u64,
|
||||
}
|
||||
|
||||
impl MemoryStatus {
|
||||
fn zeroed() -> Self {
|
||||
MemoryStatus { non_reclaimable: 0 }
|
||||
}
|
||||
|
||||
fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
|
||||
struct DS<'a>(&'a [MemoryStatus]);
|
||||
|
||||
impl<'a> Debug for DS<'a> {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
f.debug_struct("[MemoryStatus]")
|
||||
.field(
|
||||
"non_reclaimable[..]",
|
||||
&Fields(self.0, |stat: &MemoryStatus| {
|
||||
BytesToGB(stat.non_reclaimable)
|
||||
}),
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
struct Fields<'a, F>(&'a [MemoryStatus], F);
|
||||
|
||||
impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
f.debug_list().entries(self.0.iter().map(&self.1)).finish()
|
||||
}
|
||||
}
|
||||
|
||||
struct BytesToGB(u64);
|
||||
|
||||
impl Debug for BytesToGB {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
f.write_fmt(format_args!(
|
||||
"{:.3}Gi",
|
||||
self.0 as f64 / (1_u64 << 30) as f64
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
DS(slice)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[test]
|
||||
fn ring_buf_iter() {
|
||||
let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9];
|
||||
|
||||
let values = |offset, count| {
|
||||
super::ring_buf_recent_values_iter(&buf, offset, count)
|
||||
.copied()
|
||||
.collect::<Vec<i32>>()
|
||||
};
|
||||
|
||||
// Boundary conditions: start, end, and entire thing:
|
||||
assert_eq!(values(0, 1), [0]);
|
||||
assert_eq!(values(3, 4), [0, 1, 2, 3]);
|
||||
assert_eq!(values(9, 4), [6, 7, 8, 9]);
|
||||
assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
|
||||
|
||||
// "normal" operation: no wraparound
|
||||
assert_eq!(values(7, 4), [4, 5, 6, 7]);
|
||||
|
||||
// wraparound:
|
||||
assert_eq!(values(0, 4), [7, 8, 9, 0]);
|
||||
assert_eq!(values(1, 4), [8, 9, 0, 1]);
|
||||
assert_eq!(values(2, 4), [9, 0, 1, 2]);
|
||||
assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,10 +12,12 @@ use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cgroup::Sequenced;
|
||||
use crate::protocol::{
|
||||
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
|
||||
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
|
||||
PROTOCOL_MIN_VERSION,
|
||||
};
|
||||
|
||||
@@ -34,6 +36,13 @@ pub struct Dispatcher {
|
||||
/// We send messages to the agent through `sink`
|
||||
sink: SplitSink<WebSocket, Message>,
|
||||
|
||||
/// Used to notify the cgroup when we are upscaled.
|
||||
pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
|
||||
|
||||
/// When the cgroup requests upscale it will send on this channel. In response
|
||||
/// we send an `UpscaleRequst` to the agent.
|
||||
pub(crate) request_upscale_events: mpsc::Receiver<()>,
|
||||
|
||||
/// The protocol version we have agreed to use with the agent. This is negotiated
|
||||
/// during the creation of the dispatcher, and should be the highest shared protocol
|
||||
/// version.
|
||||
@@ -52,7 +61,11 @@ impl Dispatcher {
|
||||
/// 1. Wait for the agent to sent the range of protocols it supports.
|
||||
/// 2. Send a protocol version that works for us as well, or an error if there
|
||||
/// is no compatible version.
|
||||
pub async fn new(stream: WebSocket) -> anyhow::Result<Self> {
|
||||
pub async fn new(
|
||||
stream: WebSocket,
|
||||
notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
|
||||
request_upscale_events: mpsc::Receiver<()>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let (mut sink, mut source) = stream.split();
|
||||
|
||||
// Figure out the highest protocol version we both support
|
||||
@@ -106,10 +119,22 @@ impl Dispatcher {
|
||||
Ok(Self {
|
||||
sink,
|
||||
source,
|
||||
notify_upscale_events,
|
||||
request_upscale_events,
|
||||
proto_version: highest_shared_version,
|
||||
})
|
||||
}
|
||||
|
||||
/// Notify the cgroup manager that we have received upscale and wait for
|
||||
/// the acknowledgement.
|
||||
#[tracing::instrument(skip_all, fields(?resources))]
|
||||
pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
|
||||
self.notify_upscale_events
|
||||
.send(resources)
|
||||
.await
|
||||
.context("failed to send resources and oneshot sender across channel")
|
||||
}
|
||||
|
||||
/// Send a message to the agent.
|
||||
///
|
||||
/// Although this function is small, it has one major benefit: it is the only
|
||||
|
||||
@@ -5,16 +5,18 @@
|
||||
//! all functionality.
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use axum::extract::ws::{Message, WebSocket};
|
||||
use futures::StreamExt;
|
||||
use tokio::sync::{broadcast, watch};
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use crate::cgroup::{self, CgroupWatcher};
|
||||
use crate::cgroup::{CgroupWatcher, Sequenced};
|
||||
use crate::dispatcher::Dispatcher;
|
||||
use crate::filecache::{FileCacheConfig, FileCacheState};
|
||||
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
|
||||
@@ -26,7 +28,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args
|
||||
pub struct Runner {
|
||||
config: Config,
|
||||
filecache: Option<FileCacheState>,
|
||||
cgroup: Option<CgroupState>,
|
||||
cgroup: Option<Arc<CgroupWatcher>>,
|
||||
dispatcher: Dispatcher,
|
||||
|
||||
/// We "mint" new message ids by incrementing this counter and taking the value.
|
||||
@@ -43,14 +45,6 @@ pub struct Runner {
|
||||
kill: broadcast::Receiver<()>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct CgroupState {
|
||||
watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>,
|
||||
/// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale
|
||||
/// requests.
|
||||
threshold: u64,
|
||||
}
|
||||
|
||||
/// Configuration for a `Runner`
|
||||
#[derive(Debug)]
|
||||
pub struct Config {
|
||||
@@ -68,56 +62,16 @@ pub struct Config {
|
||||
/// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
|
||||
/// should be removed once we have a better solution there.
|
||||
sys_buffer_bytes: u64,
|
||||
|
||||
/// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
|
||||
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
|
||||
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
|
||||
/// threshold.
|
||||
///
|
||||
/// For example, a value of `0.1` means that 10% of total memory must remain after exceeding
|
||||
/// the threshold, so the value of the cgroup threshold would always be capped at 90% of total
|
||||
/// memory.
|
||||
///
|
||||
/// The default value of `0.15` means that we *guarantee* sending upscale requests if the
|
||||
/// cgroup is using more than 85% of total memory (even if we're *not* separately reserving
|
||||
/// memory for the file cache).
|
||||
cgroup_min_overhead_fraction: f64,
|
||||
|
||||
cgroup_downscale_threshold_buffer_bytes: u64,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sys_buffer_bytes: 100 * MiB,
|
||||
cgroup_min_overhead_fraction: 0.15,
|
||||
cgroup_downscale_threshold_buffer_bytes: 100 * MiB,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Config {
|
||||
fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 {
|
||||
// If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup,
|
||||
// and thus be non-reclaimable, so we should allow for additional memory usage.
|
||||
//
|
||||
// If the file cache sits on disk, our desired stable system state is for it to be fully
|
||||
// page cached (its contents should only be paged to/from disk in situations where we can't
|
||||
// upscale fast enough). Page-cached memory is reclaimable, so we need to lower the
|
||||
// threshold for non-reclaimable memory so we scale up *before* the kernel starts paging
|
||||
// out the file cache.
|
||||
let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size);
|
||||
|
||||
// Even if we're not separately making room for the file cache (if it's in tmpfs), we still
|
||||
// want our threshold to be met gracefully instead of letting postgres get OOM-killed.
|
||||
// So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory
|
||||
// remaining above the threshold.
|
||||
let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64;
|
||||
|
||||
memory_remaining_for_cgroup.min(max_threshold)
|
||||
}
|
||||
}
|
||||
|
||||
impl Runner {
|
||||
/// Create a new monitor.
|
||||
#[tracing::instrument(skip_all, fields(?config, ?args))]
|
||||
@@ -133,7 +87,12 @@ impl Runner {
|
||||
"invalid monitor Config: sys_buffer_bytes cannot be 0"
|
||||
);
|
||||
|
||||
let dispatcher = Dispatcher::new(ws)
|
||||
// *NOTE*: the dispatcher and cgroup manager talk through these channels
|
||||
// so make sure they each get the correct half, nothing is droppped, etc.
|
||||
let (notified_send, notified_recv) = mpsc::channel(1);
|
||||
let (requesting_send, requesting_recv) = mpsc::channel(1);
|
||||
|
||||
let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
|
||||
.await
|
||||
.context("error creating new dispatcher")?;
|
||||
|
||||
@@ -147,9 +106,45 @@ impl Runner {
|
||||
kill,
|
||||
};
|
||||
|
||||
let mem = get_total_system_memory();
|
||||
// If we have both the cgroup and file cache integrations enabled, it's possible for
|
||||
// temporary failures to result in cgroup throttling (from memory.high), that in turn makes
|
||||
// it near-impossible to connect to the file cache (because it times out). Unfortunately,
|
||||
// we *do* still want to determine the file cache size before setting the cgroup's
|
||||
// memory.high, so it's not as simple as just swapping the order.
|
||||
//
|
||||
// Instead, the resolution here is that on vm-monitor startup (note: happens on each
|
||||
// connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
|
||||
// temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
|
||||
// of a hacky solution, but helps with reliability.
|
||||
if let Some(name) = &args.cgroup {
|
||||
// Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
|
||||
// now, and then set limits later.
|
||||
info!("initializing cgroup");
|
||||
|
||||
let mut file_cache_disk_size = 0;
|
||||
let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
|
||||
.context("failed to create cgroup manager")?;
|
||||
|
||||
info!("temporarily unsetting memory.high");
|
||||
|
||||
// Temporarily un-set cgroup memory.high; see above.
|
||||
cgroup
|
||||
.unset_memory_high()
|
||||
.context("failed to unset memory.high")?;
|
||||
|
||||
let cgroup = Arc::new(cgroup);
|
||||
|
||||
let cgroup_clone = Arc::clone(&cgroup);
|
||||
spawn_with_cancel(
|
||||
token.clone(),
|
||||
|_| error!("cgroup watcher terminated"),
|
||||
async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
|
||||
);
|
||||
|
||||
state.cgroup = Some(cgroup);
|
||||
}
|
||||
|
||||
let mut file_cache_reserved_bytes = 0;
|
||||
let mem = get_total_system_memory();
|
||||
|
||||
// We need to process file cache initialization before cgroup initialization, so that the memory
|
||||
// allocated to the file cache is appropriately taken into account when we decide the cgroup's
|
||||
@@ -161,7 +156,7 @@ impl Runner {
|
||||
false => FileCacheConfig::default_in_memory(),
|
||||
};
|
||||
|
||||
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
|
||||
let mut file_cache = FileCacheState::new(connstr, config, token)
|
||||
.await
|
||||
.context("failed to create file cache")?;
|
||||
|
||||
@@ -186,40 +181,23 @@ impl Runner {
|
||||
if actual_size != new_size {
|
||||
info!("file cache size actually got set to {actual_size}")
|
||||
}
|
||||
|
||||
if args.file_cache_on_disk {
|
||||
file_cache_disk_size = actual_size;
|
||||
// Mark the resources given to the file cache as reserved, but only if it's in memory.
|
||||
if !args.file_cache_on_disk {
|
||||
file_cache_reserved_bytes = actual_size;
|
||||
}
|
||||
|
||||
state.filecache = Some(file_cache);
|
||||
}
|
||||
|
||||
if let Some(name) = &args.cgroup {
|
||||
// Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
|
||||
// now, and then set limits later.
|
||||
info!("initializing cgroup");
|
||||
if let Some(cgroup) = &state.cgroup {
|
||||
let available = mem - file_cache_reserved_bytes;
|
||||
let value = cgroup.config.calculate_memory_high_value(available);
|
||||
|
||||
let cgroup =
|
||||
CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?;
|
||||
info!(value, "setting memory.high");
|
||||
|
||||
let init_value = cgroup::MemoryHistory {
|
||||
avg_non_reclaimable: 0,
|
||||
samples_count: 0,
|
||||
samples_span: Duration::ZERO,
|
||||
};
|
||||
let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value));
|
||||
|
||||
spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
|
||||
cgroup.watch(hist_tx).await
|
||||
});
|
||||
|
||||
let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size);
|
||||
info!(threshold, "set initial cgroup threshold",);
|
||||
|
||||
state.cgroup = Some(CgroupState {
|
||||
watcher: hist_rx,
|
||||
threshold,
|
||||
});
|
||||
cgroup
|
||||
.set_memory_high_bytes(value)
|
||||
.context("failed to set cgroup memory.high")?;
|
||||
}
|
||||
|
||||
Ok(state)
|
||||
@@ -239,51 +217,28 @@ impl Runner {
|
||||
|
||||
let requested_mem = target.mem;
|
||||
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
let (expected_file_cache_size, expected_file_cache_disk_size) = self
|
||||
let expected_file_cache_mem_usage = self
|
||||
.filecache
|
||||
.as_ref()
|
||||
.map(|file_cache| {
|
||||
let size = file_cache.config.calculate_cache_size(usable_system_memory);
|
||||
match file_cache.config.in_memory {
|
||||
true => (size, 0),
|
||||
false => (size, size),
|
||||
}
|
||||
})
|
||||
.unwrap_or((0, 0));
|
||||
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
|
||||
.unwrap_or(0);
|
||||
let mut new_cgroup_mem_high = 0;
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let (last_time, last_history) = *cgroup.watcher.borrow();
|
||||
|
||||
// NB: The ordering of these conditions is intentional. During startup, we should deny
|
||||
// downscaling until we have enough information to determine that it's safe to do so
|
||||
// (i.e. enough samples have come in). But if it's been a while and we *still* haven't
|
||||
// received any information, we should *fail* instead of just denying downscaling.
|
||||
//
|
||||
// `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
|
||||
// serves double-duty: it trips if we haven't received *any* metrics for long enough,
|
||||
// OR if we haven't received metrics *recently enough*.
|
||||
//
|
||||
// TODO: make the duration here configurable.
|
||||
if last_time.elapsed() > Duration::from_secs(5) {
|
||||
bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
|
||||
} else if last_history.samples_count <= 1 {
|
||||
let status = "haven't received enough cgroup memory stats yet";
|
||||
info!(status, "discontinuing downscale");
|
||||
return Ok((false, status.to_owned()));
|
||||
}
|
||||
|
||||
let new_threshold = self
|
||||
new_cgroup_mem_high = cgroup
|
||||
.config
|
||||
.cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
|
||||
.calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
|
||||
|
||||
let current = last_history.avg_non_reclaimable;
|
||||
let current = cgroup
|
||||
.current_memory_usage()
|
||||
.context("failed to fetch cgroup memory")?;
|
||||
|
||||
if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes {
|
||||
if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
|
||||
let status = format!(
|
||||
"{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)",
|
||||
"calculated memory threshold too low",
|
||||
bytes_to_mebibytes(new_threshold),
|
||||
"{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
|
||||
"calculated memory.high too low",
|
||||
bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
bytes_to_mebibytes(current),
|
||||
bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes)
|
||||
bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
|
||||
);
|
||||
|
||||
info!(status, "discontinuing downscale");
|
||||
@@ -294,14 +249,14 @@ impl Runner {
|
||||
|
||||
// The downscaling has been approved. Downscale the file cache, then the cgroup.
|
||||
let mut status = vec![];
|
||||
let mut file_cache_disk_size = 0;
|
||||
let mut file_cache_mem_usage = 0;
|
||||
if let Some(file_cache) = &mut self.filecache {
|
||||
let actual_usage = file_cache
|
||||
.set_file_cache_size(expected_file_cache_size)
|
||||
.set_file_cache_size(expected_file_cache_mem_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if !file_cache.config.in_memory {
|
||||
file_cache_disk_size = actual_usage;
|
||||
if file_cache.config.in_memory {
|
||||
file_cache_mem_usage = actual_usage;
|
||||
}
|
||||
let message = format!(
|
||||
"set file cache size to {} MiB (in memory = {})",
|
||||
@@ -312,18 +267,24 @@ impl Runner {
|
||||
status.push(message);
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &mut self.cgroup {
|
||||
let new_threshold = self
|
||||
.config
|
||||
.cgroup_threshold(usable_system_memory, file_cache_disk_size);
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let available_memory = usable_system_memory - file_cache_mem_usage;
|
||||
|
||||
if file_cache_mem_usage != expected_file_cache_mem_usage {
|
||||
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
|
||||
}
|
||||
|
||||
// new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
|
||||
// since it is properly initialized in the previous cgroup if let block
|
||||
cgroup
|
||||
.set_memory_high_bytes(new_cgroup_mem_high)
|
||||
.context("failed to set cgroup memory.high")?;
|
||||
|
||||
let message = format!(
|
||||
"set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
|
||||
bytes_to_mebibytes(cgroup.threshold),
|
||||
bytes_to_mebibytes(new_threshold),
|
||||
bytes_to_mebibytes(usable_system_memory)
|
||||
"set cgroup memory.high to {} MiB, of new max {} MiB",
|
||||
bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
bytes_to_mebibytes(available_memory)
|
||||
);
|
||||
cgroup.threshold = new_threshold;
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
}
|
||||
@@ -344,7 +305,8 @@ impl Runner {
|
||||
let new_mem = resources.mem;
|
||||
let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
|
||||
let mut file_cache_disk_size = 0;
|
||||
// Get the file cache's expected contribution to the memory usage
|
||||
let mut file_cache_mem_usage = 0;
|
||||
if let Some(file_cache) = &mut self.filecache {
|
||||
let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
|
||||
info!(
|
||||
@@ -357,8 +319,8 @@ impl Runner {
|
||||
.set_file_cache_size(expected_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if !file_cache.config.in_memory {
|
||||
file_cache_disk_size = actual_usage;
|
||||
if file_cache.config.in_memory {
|
||||
file_cache_mem_usage = actual_usage;
|
||||
}
|
||||
|
||||
if actual_usage != expected_usage {
|
||||
@@ -370,18 +332,18 @@ impl Runner {
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &mut self.cgroup {
|
||||
let new_threshold = self
|
||||
.config
|
||||
.cgroup_threshold(usable_system_memory, file_cache_disk_size);
|
||||
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let available_memory = usable_system_memory - file_cache_mem_usage;
|
||||
let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
|
||||
info!(
|
||||
"set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB",
|
||||
bytes_to_mebibytes(cgroup.threshold),
|
||||
bytes_to_mebibytes(new_threshold),
|
||||
bytes_to_mebibytes(usable_system_memory)
|
||||
target = bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
total = bytes_to_mebibytes(new_mem),
|
||||
name = cgroup.path(),
|
||||
"updating cgroup memory.high",
|
||||
);
|
||||
cgroup.threshold = new_threshold;
|
||||
cgroup
|
||||
.set_memory_high_bytes(new_cgroup_mem_high)
|
||||
.context("failed to set cgroup memory.high")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -399,6 +361,10 @@ impl Runner {
|
||||
self.handle_upscale(granted)
|
||||
.await
|
||||
.context("failed to handle upscale")?;
|
||||
self.dispatcher
|
||||
.notify_upscale(Sequenced::new(granted))
|
||||
.await
|
||||
.context("failed to notify notify cgroup of upscale")?;
|
||||
Ok(Some(OutboundMsg::new(
|
||||
OutboundMsgKind::UpscaleConfirmation {},
|
||||
id,
|
||||
@@ -442,53 +408,33 @@ impl Runner {
|
||||
Err(e) => bail!("failed to receive kill signal: {e}")
|
||||
}
|
||||
}
|
||||
|
||||
// New memory stats from the cgroup, *may* need to request upscaling, if we've
|
||||
// exceeded the threshold
|
||||
result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => {
|
||||
result.context("failed to receive from cgroup memory stats watcher")?;
|
||||
|
||||
let cgroup = self.cgroup.as_ref().unwrap();
|
||||
|
||||
let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow();
|
||||
|
||||
// If we haven't exceeded the threshold, then we're all ok
|
||||
if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold {
|
||||
continue;
|
||||
// we need to propagate an upscale request
|
||||
request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => {
|
||||
if request.is_none() {
|
||||
bail!("failed to listen for upscale event from cgroup")
|
||||
}
|
||||
|
||||
// Otherwise, we generally want upscaling. But, if it's been less than 1 second
|
||||
// since the last time we requested upscaling, ignore the event, to avoid
|
||||
// spamming the agent.
|
||||
// If it's been less than 1 second since the last time we requested upscaling,
|
||||
// ignore the event, to avoid spamming the agent (otherwise, this can happen
|
||||
// ~1k times per second).
|
||||
if let Some(t) = self.last_upscale_request_at {
|
||||
let elapsed = t.elapsed();
|
||||
if elapsed < Duration::from_secs(1) {
|
||||
info!(
|
||||
elapsed_millis = elapsed.as_millis(),
|
||||
avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
|
||||
threshold = bytes_to_mebibytes(cgroup.threshold),
|
||||
"cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
|
||||
);
|
||||
info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
self.last_upscale_request_at = Some(Instant::now());
|
||||
|
||||
info!(
|
||||
avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
|
||||
threshold = bytes_to_mebibytes(cgroup.threshold),
|
||||
"cgroup memory stats are high enough to upscale, requesting upscale",
|
||||
);
|
||||
|
||||
info!("cgroup asking for upscale; forwarding request");
|
||||
self.counter += 2; // Increment, preserving parity (i.e. keep the
|
||||
// counter odd). See the field comment for more.
|
||||
self.dispatcher
|
||||
.send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
|
||||
.await
|
||||
.context("failed to send message")?;
|
||||
},
|
||||
|
||||
}
|
||||
// there is a message from the agent
|
||||
msg = self.dispatcher.source.next() => {
|
||||
if let Some(msg) = msg {
|
||||
@@ -516,14 +462,11 @@ impl Runner {
|
||||
Ok(Some(out)) => out,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
// use {:#} for our logging because the display impl only
|
||||
// gives the outermost cause, and the debug impl
|
||||
// pretty-prints the error, whereas {:#} contains all the
|
||||
// causes, but is compact (no newlines).
|
||||
warn!(error = format!("{e:#}"), "error handling message");
|
||||
let error = e.to_string();
|
||||
warn!(?error, "error handling message");
|
||||
OutboundMsg::new(
|
||||
OutboundMsgKind::InternalError {
|
||||
error: e.to_string(),
|
||||
error
|
||||
},
|
||||
message.id
|
||||
)
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
[package]
|
||||
name = "walproposer"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
anyhow.workspace = true
|
||||
bindgen.workspace = true
|
||||
@@ -1 +0,0 @@
|
||||
#include "walproposer.h"
|
||||
@@ -1,113 +0,0 @@
|
||||
use std::{env, path::PathBuf, process::Command};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use bindgen::CargoCallbacks;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=bindgen_deps.h");
|
||||
|
||||
// Finding the location of built libraries and Postgres C headers:
|
||||
// - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
|
||||
// - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
|
||||
let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
|
||||
postgres_install_dir.into()
|
||||
} else {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
|
||||
};
|
||||
|
||||
let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
|
||||
let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
|
||||
let walproposer_lib_search_str = walproposer_lib_dir
|
||||
.to_str()
|
||||
.ok_or(anyhow!("Bad non-UTF path"))?;
|
||||
|
||||
let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
|
||||
let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
|
||||
let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
|
||||
|
||||
println!("cargo:rustc-link-lib=static=pgport");
|
||||
println!("cargo:rustc-link-lib=static=pgcommon");
|
||||
println!("cargo:rustc-link-lib=static=walproposer");
|
||||
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
|
||||
|
||||
let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
|
||||
let inc_server_path: String = if pg_config_bin.exists() {
|
||||
let output = Command::new(pg_config_bin)
|
||||
.arg("--includedir-server")
|
||||
.output()
|
||||
.context("failed to execute `pg_config --includedir-server`")?;
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("`pg_config --includedir-server` failed")
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout)
|
||||
.context("pg_config output is not UTF-8")?
|
||||
.trim_end()
|
||||
.into()
|
||||
} else {
|
||||
let server_path = pg_install_abs
|
||||
.join("v16")
|
||||
.join("include")
|
||||
.join("postgresql")
|
||||
.join("server")
|
||||
.into_os_string();
|
||||
server_path
|
||||
.into_string()
|
||||
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
|
||||
};
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
// The input header we would like to generate
|
||||
// bindings for.
|
||||
.header("bindgen_deps.h")
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
.parse_callbacks(Box::new(CargoCallbacks))
|
||||
.allowlist_type("WalProposer")
|
||||
.allowlist_type("WalProposerConfig")
|
||||
.allowlist_type("walproposer_api")
|
||||
.allowlist_function("WalProposerCreate")
|
||||
.allowlist_function("WalProposerStart")
|
||||
.allowlist_function("WalProposerBroadcast")
|
||||
.allowlist_function("WalProposerPoll")
|
||||
.allowlist_function("WalProposerFree")
|
||||
.allowlist_var("DEBUG5")
|
||||
.allowlist_var("DEBUG4")
|
||||
.allowlist_var("DEBUG3")
|
||||
.allowlist_var("DEBUG2")
|
||||
.allowlist_var("DEBUG1")
|
||||
.allowlist_var("LOG")
|
||||
.allowlist_var("INFO")
|
||||
.allowlist_var("NOTICE")
|
||||
.allowlist_var("WARNING")
|
||||
.allowlist_var("ERROR")
|
||||
.allowlist_var("FATAL")
|
||||
.allowlist_var("PANIC")
|
||||
.allowlist_var("WPEVENT")
|
||||
.allowlist_var("WL_LATCH_SET")
|
||||
.allowlist_var("WL_SOCKET_READABLE")
|
||||
.allowlist_var("WL_SOCKET_WRITEABLE")
|
||||
.allowlist_var("WL_TIMEOUT")
|
||||
.allowlist_var("WL_SOCKET_CLOSED")
|
||||
.allowlist_var("WL_SOCKET_MASK")
|
||||
.clang_arg("-DWALPROPOSER_LIB")
|
||||
.clang_arg(format!("-I{pgxn_neon}"))
|
||||
.clang_arg(format!("-I{inc_server_path}"))
|
||||
// Finish the builder and generate the bindings.
|
||||
.generate()
|
||||
// Unwrap the Result and panic on failure.
|
||||
.expect("Unable to generate bindings");
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
|
||||
bindings
|
||||
.write_to_file(out_path)
|
||||
.expect("Couldn't write bindings!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,455 +0,0 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::ffi::CStr;
|
||||
use std::ffi::CString;
|
||||
|
||||
use crate::bindings::uint32;
|
||||
use crate::bindings::walproposer_api;
|
||||
use crate::bindings::PGAsyncReadResult;
|
||||
use crate::bindings::PGAsyncWriteResult;
|
||||
use crate::bindings::Safekeeper;
|
||||
use crate::bindings::Size;
|
||||
use crate::bindings::StringInfoData;
|
||||
use crate::bindings::TimeLineID;
|
||||
use crate::bindings::TimestampTz;
|
||||
use crate::bindings::WalProposer;
|
||||
use crate::bindings::WalProposerConnStatusType;
|
||||
use crate::bindings::WalProposerConnectPollStatusType;
|
||||
use crate::bindings::WalProposerExecStatusType;
|
||||
use crate::bindings::WalproposerShmemState;
|
||||
use crate::bindings::XLogRecPtr;
|
||||
use crate::walproposer::ApiImpl;
|
||||
use crate::walproposer::WaitResult;
|
||||
|
||||
extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_shmem_state()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).start_streaming(startpos)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_flush_rec_ptr()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_current_timestamp()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let msg = (*api).conn_error_message(&mut (*sk));
|
||||
let msg = CString::new(msg).unwrap();
|
||||
// TODO: fix leaking error message
|
||||
msg.into_raw()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_status(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_connect_start(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_connect_poll(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
|
||||
let query = unsafe { CStr::from_ptr(query) };
|
||||
let query = query.to_str().unwrap();
|
||||
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_send_query(&mut (*sk), query)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_get_query_result(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_flush(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_finish(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_finish(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_async_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut *mut ::std::os::raw::c_char,
|
||||
amount: *mut ::std::os::raw::c_int,
|
||||
) -> PGAsyncReadResult {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let (res, result) = (*api).conn_async_read(&mut (*sk));
|
||||
|
||||
// This function has guarantee that returned buf will be valid until
|
||||
// the next call. So we can store a Vec in each Safekeeper and reuse
|
||||
// it on the next call.
|
||||
let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
|
||||
|
||||
inbuf.clear();
|
||||
inbuf.extend_from_slice(res);
|
||||
|
||||
// Put a Vec back to sk->inbuf and return data ptr.
|
||||
*buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
|
||||
*amount = res.len() as i32;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_async_write(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *const ::std::os::raw::c_void,
|
||||
size: usize,
|
||||
) -> PGAsyncWriteResult {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts(buf as *const u8, size);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_async_write(&mut (*sk), buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_blocking_write(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *const ::std::os::raw::c_void,
|
||||
size: usize,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts(buf as *const u8, size);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_blocking_write(&mut (*sk), buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn recovery_download(
|
||||
sk: *mut Safekeeper,
|
||||
_timeline: TimeLineID,
|
||||
startpos: XLogRecPtr,
|
||||
endpos: XLogRecPtr,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).recovery_download(&mut (*sk), startpos, endpos)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
) {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_reader_allocate(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn free_event_set(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).free_event_set(&mut (*wp));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn init_event_set(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).init_event_set(&mut (*wp));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).update_event_set(&mut (*sk), events);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).add_safekeeper_event_set(&mut (*sk), events);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wait_event_set(
|
||||
wp: *mut WalProposer,
|
||||
timeout: ::std::os::raw::c_long,
|
||||
event_sk: *mut *mut Safekeeper,
|
||||
events: *mut uint32,
|
||||
) -> ::std::os::raw::c_int {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let result = (*api).wait_event_set(&mut (*wp), timeout);
|
||||
match result {
|
||||
WaitResult::Latch => {
|
||||
*event_sk = std::ptr::null_mut();
|
||||
*events = crate::bindings::WL_LATCH_SET;
|
||||
1
|
||||
}
|
||||
WaitResult::Timeout => {
|
||||
*event_sk = std::ptr::null_mut();
|
||||
*events = crate::bindings::WL_TIMEOUT;
|
||||
0
|
||||
}
|
||||
WaitResult::Network(sk, event_mask) => {
|
||||
*event_sk = sk;
|
||||
*events = event_mask;
|
||||
1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn strong_random(
|
||||
wp: *mut WalProposer,
|
||||
buf: *mut ::std::os::raw::c_void,
|
||||
len: usize,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len);
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).strong_random(buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_redo_start_lsn()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).finish_sync_safekeepers(lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).confirm_wal_streamed(&mut (*wp), lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn log_internal(
|
||||
wp: *mut WalProposer,
|
||||
level: ::std::os::raw::c_int,
|
||||
line: *const ::std::os::raw::c_char,
|
||||
) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let line = CStr::from_ptr(line);
|
||||
let line = line.to_str().unwrap();
|
||||
(*api).log_internal(&mut (*wp), Level::from(level as u32), line)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn after_election(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).after_election(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Level {
|
||||
Debug5,
|
||||
Debug4,
|
||||
Debug3,
|
||||
Debug2,
|
||||
Debug1,
|
||||
Log,
|
||||
Info,
|
||||
Notice,
|
||||
Warning,
|
||||
Error,
|
||||
Fatal,
|
||||
Panic,
|
||||
WPEvent,
|
||||
}
|
||||
|
||||
impl Level {
|
||||
pub fn from(elevel: u32) -> Level {
|
||||
use crate::bindings::*;
|
||||
|
||||
match elevel {
|
||||
DEBUG5 => Level::Debug5,
|
||||
DEBUG4 => Level::Debug4,
|
||||
DEBUG3 => Level::Debug3,
|
||||
DEBUG2 => Level::Debug2,
|
||||
DEBUG1 => Level::Debug1,
|
||||
LOG => Level::Log,
|
||||
INFO => Level::Info,
|
||||
NOTICE => Level::Notice,
|
||||
WARNING => Level::Warning,
|
||||
ERROR => Level::Error,
|
||||
FATAL => Level::Fatal,
|
||||
PANIC => Level::Panic,
|
||||
WPEVENT => Level::WPEvent,
|
||||
_ => panic!("unknown log level {}", elevel),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn create_api() -> walproposer_api {
|
||||
walproposer_api {
|
||||
get_shmem_state: Some(get_shmem_state),
|
||||
start_streaming: Some(start_streaming),
|
||||
get_flush_rec_ptr: Some(get_flush_rec_ptr),
|
||||
get_current_timestamp: Some(get_current_timestamp),
|
||||
conn_error_message: Some(conn_error_message),
|
||||
conn_status: Some(conn_status),
|
||||
conn_connect_start: Some(conn_connect_start),
|
||||
conn_connect_poll: Some(conn_connect_poll),
|
||||
conn_send_query: Some(conn_send_query),
|
||||
conn_get_query_result: Some(conn_get_query_result),
|
||||
conn_flush: Some(conn_flush),
|
||||
conn_finish: Some(conn_finish),
|
||||
conn_async_read: Some(conn_async_read),
|
||||
conn_async_write: Some(conn_async_write),
|
||||
conn_blocking_write: Some(conn_blocking_write),
|
||||
recovery_download: Some(recovery_download),
|
||||
wal_read: Some(wal_read),
|
||||
wal_reader_allocate: Some(wal_reader_allocate),
|
||||
free_event_set: Some(free_event_set),
|
||||
init_event_set: Some(init_event_set),
|
||||
update_event_set: Some(update_event_set),
|
||||
add_safekeeper_event_set: Some(add_safekeeper_event_set),
|
||||
wait_event_set: Some(wait_event_set),
|
||||
strong_random: Some(strong_random),
|
||||
get_redo_start_lsn: Some(get_redo_start_lsn),
|
||||
finish_sync_safekeepers: Some(finish_sync_safekeepers),
|
||||
process_safekeeper_feedback: Some(process_safekeeper_feedback),
|
||||
confirm_wal_streamed: Some(confirm_wal_streamed),
|
||||
log_internal: Some(log_internal),
|
||||
after_election: Some(after_election),
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Level {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Take ownership of `Vec<u8>` from StringInfoData.
|
||||
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
|
||||
if pg.data.is_null() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ptr = pg.data as *mut u8;
|
||||
let length = pg.len as usize;
|
||||
let capacity = pg.maxlen as usize;
|
||||
|
||||
pg.data = std::ptr::null_mut();
|
||||
pg.len = 0;
|
||||
pg.maxlen = 0;
|
||||
|
||||
unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
|
||||
}
|
||||
|
||||
/// Store `Vec<u8>` in StringInfoData.
|
||||
fn store_vec_u8(pg: &mut StringInfoData, vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
|
||||
let ptr = vec.as_ptr() as *mut ::std::os::raw::c_char;
|
||||
let length = vec.len();
|
||||
let capacity = vec.capacity();
|
||||
|
||||
assert!(pg.data.is_null());
|
||||
|
||||
pg.data = ptr;
|
||||
pg.len = length as i32;
|
||||
pg.maxlen = capacity as i32;
|
||||
|
||||
std::mem::forget(vec);
|
||||
|
||||
ptr
|
||||
}
|
||||
@@ -1,14 +0,0 @@
|
||||
pub mod bindings {
|
||||
#![allow(non_upper_case_globals)]
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
// bindgen creates some unsafe code with no doc comments.
|
||||
#![allow(clippy::missing_safety_doc)]
|
||||
// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
|
||||
#![allow(clippy::useless_transmute)]
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
}
|
||||
|
||||
pub mod api_bindings;
|
||||
pub mod walproposer;
|
||||
@@ -1,485 +0,0 @@
|
||||
use std::ffi::CString;
|
||||
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{
|
||||
api_bindings::{create_api, take_vec_u8, Level},
|
||||
bindings::{
|
||||
Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
|
||||
WalProposerStart,
|
||||
},
|
||||
};
|
||||
|
||||
/// Rust high-level wrapper for C walproposer API. Many methods are not required
|
||||
/// for simple cases, hence todo!() in default implementations.
|
||||
///
|
||||
/// Refer to `pgxn/neon/walproposer.h` for documentation.
|
||||
pub trait ApiImpl {
|
||||
fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn start_streaming(&self, _startpos: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_flush_rec_ptr(&self) -> u64 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_current_timestamp(&self) -> i64 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_connect_start(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_connect_poll(
|
||||
&self,
|
||||
_sk: &mut Safekeeper,
|
||||
) -> crate::bindings::WalProposerConnectPollStatusType {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_get_query_result(
|
||||
&self,
|
||||
_sk: &mut Safekeeper,
|
||||
) -> crate::bindings::WalProposerExecStatusType {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_finish(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_async_write(
|
||||
&self,
|
||||
_sk: &mut Safekeeper,
|
||||
_buf: &[u8],
|
||||
) -> crate::bindings::PGAsyncWriteResult {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn free_event_set(&self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn init_event_set(&self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn strong_random(&self, _buf: &mut [u8]) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_redo_start_lsn(&self) -> u64 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn finish_sync_safekeepers(&self, _lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn after_election(&self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
pub enum WaitResult {
|
||||
Latch,
|
||||
Timeout,
|
||||
Network(*mut Safekeeper, u32),
|
||||
}
|
||||
|
||||
pub struct Config {
|
||||
/// Tenant and timeline id
|
||||
pub ttid: TenantTimelineId,
|
||||
/// List of safekeepers in format `host:port`
|
||||
pub safekeepers_list: Vec<String>,
|
||||
/// Safekeeper reconnect timeout in milliseconds
|
||||
pub safekeeper_reconnect_timeout: i32,
|
||||
/// Safekeeper connection timeout in milliseconds
|
||||
pub safekeeper_connection_timeout: i32,
|
||||
/// walproposer mode, finish when all safekeepers are synced or subscribe
|
||||
/// to WAL streaming
|
||||
pub sync_safekeepers: bool,
|
||||
}
|
||||
|
||||
/// WalProposer main struct. C methods are reexported as Rust functions.
|
||||
pub struct Wrapper {
|
||||
wp: *mut WalProposer,
|
||||
_safekeepers_list_vec: Vec<u8>,
|
||||
}
|
||||
|
||||
impl Wrapper {
|
||||
pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
|
||||
let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
|
||||
let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
|
||||
.unwrap()
|
||||
.into_bytes_with_nul();
|
||||
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
|
||||
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
|
||||
|
||||
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
|
||||
|
||||
let c_config = WalProposerConfig {
|
||||
neon_tenant,
|
||||
neon_timeline,
|
||||
safekeepers_list,
|
||||
safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
|
||||
safekeeper_connection_timeout: config.safekeeper_connection_timeout,
|
||||
wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
|
||||
syncSafekeepers: config.sync_safekeepers,
|
||||
systemId: 0,
|
||||
pgTimeline: 1,
|
||||
callback_data,
|
||||
};
|
||||
let c_config = Box::into_raw(Box::new(c_config));
|
||||
|
||||
let api = create_api();
|
||||
let wp = unsafe { WalProposerCreate(c_config, api) };
|
||||
Wrapper {
|
||||
wp,
|
||||
_safekeepers_list_vec: safekeepers_list_vec,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
unsafe { WalProposerStart(self.wp) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Wrapper {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
let config = (*self.wp).config;
|
||||
drop(Box::from_raw(
|
||||
(*config).callback_data as *mut Box<dyn ApiImpl>,
|
||||
));
|
||||
drop(CString::from_raw((*config).neon_tenant));
|
||||
drop(CString::from_raw((*config).neon_timeline));
|
||||
drop(Box::from_raw(config));
|
||||
|
||||
for i in 0..(*self.wp).n_safekeepers {
|
||||
let sk = &mut (*self.wp).safekeeper[i as usize];
|
||||
take_vec_u8(&mut sk.inbuf);
|
||||
}
|
||||
|
||||
WalProposerFree(self.wp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
cell::Cell,
|
||||
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
||||
};
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, walproposer::Wrapper};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
struct WaitEventsData {
|
||||
sk: *mut crate::bindings::Safekeeper,
|
||||
event_mask: u32,
|
||||
}
|
||||
|
||||
struct MockImpl {
|
||||
// data to return from wait_event_set
|
||||
wait_events: Cell<WaitEventsData>,
|
||||
// walproposer->safekeeper messages
|
||||
expected_messages: Vec<Vec<u8>>,
|
||||
expected_ptr: AtomicUsize,
|
||||
// safekeeper->walproposer messages
|
||||
safekeeper_replies: Vec<Vec<u8>>,
|
||||
replies_ptr: AtomicUsize,
|
||||
// channel to send LSN to the main thread
|
||||
sync_channel: std::sync::mpsc::SyncSender<u64>,
|
||||
}
|
||||
|
||||
impl MockImpl {
|
||||
fn check_walproposer_msg(&self, msg: &[u8]) {
|
||||
let ptr = self
|
||||
.expected_ptr
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
|
||||
if ptr >= self.expected_messages.len() {
|
||||
panic!("unexpected message from walproposer");
|
||||
}
|
||||
|
||||
let expected_msg = &self.expected_messages[ptr];
|
||||
assert_eq!(msg, expected_msg.as_slice());
|
||||
}
|
||||
|
||||
fn next_safekeeper_reply(&self) -> &[u8] {
|
||||
let ptr = self
|
||||
.replies_ptr
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
|
||||
if ptr >= self.safekeeper_replies.len() {
|
||||
panic!("no more safekeeper replies");
|
||||
}
|
||||
|
||||
&self.safekeeper_replies[ptr]
|
||||
}
|
||||
}
|
||||
|
||||
impl ApiImpl for MockImpl {
|
||||
fn get_current_timestamp(&self) -> i64 {
|
||||
println!("get_current_timestamp");
|
||||
0
|
||||
}
|
||||
|
||||
fn conn_status(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerConnStatusType {
|
||||
println!("conn_status");
|
||||
crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
|
||||
}
|
||||
|
||||
fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("conn_connect_start");
|
||||
}
|
||||
|
||||
fn conn_connect_poll(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerConnectPollStatusType {
|
||||
println!("conn_connect_poll");
|
||||
crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
|
||||
}
|
||||
|
||||
fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
|
||||
println!("conn_send_query: {}", query);
|
||||
true
|
||||
}
|
||||
|
||||
fn conn_get_query_result(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerExecStatusType {
|
||||
println!("conn_get_query_result");
|
||||
crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
|
||||
}
|
||||
|
||||
fn conn_async_read(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> (&[u8], crate::bindings::PGAsyncReadResult) {
|
||||
println!("conn_async_read");
|
||||
let reply = self.next_safekeeper_reply();
|
||||
println!("conn_async_read result: {:?}", reply);
|
||||
(
|
||||
reply,
|
||||
crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
|
||||
)
|
||||
}
|
||||
|
||||
fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
|
||||
println!("conn_blocking_write: {:?}", buf);
|
||||
self.check_walproposer_msg(buf);
|
||||
true
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("wal_reader_allocate")
|
||||
}
|
||||
|
||||
fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("free_event_set")
|
||||
}
|
||||
|
||||
fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("init_event_set")
|
||||
}
|
||||
|
||||
fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
|
||||
println!(
|
||||
"update_event_set, sk={:?}, events_mask={:#b}",
|
||||
sk as *mut crate::bindings::Safekeeper, event_mask
|
||||
);
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
|
||||
println!(
|
||||
"add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
|
||||
sk as *mut crate::bindings::Safekeeper, event_mask
|
||||
);
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn wait_event_set(
|
||||
&self,
|
||||
_: &mut crate::bindings::WalProposer,
|
||||
timeout_millis: i64,
|
||||
) -> super::WaitResult {
|
||||
let data = self.wait_events.get();
|
||||
println!(
|
||||
"wait_event_set, timeout_millis={}, res={:?}",
|
||||
timeout_millis, data
|
||||
);
|
||||
super::WaitResult::Network(data.sk, data.event_mask)
|
||||
}
|
||||
|
||||
fn strong_random(&self, buf: &mut [u8]) -> bool {
|
||||
println!("strong_random");
|
||||
buf.fill(0);
|
||||
true
|
||||
}
|
||||
|
||||
fn finish_sync_safekeepers(&self, lsn: u64) {
|
||||
self.sync_channel.send(lsn).unwrap();
|
||||
panic!("sync safekeepers finished at lsn={}", lsn);
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
|
||||
println!("walprop_log[{}] {}", level, msg);
|
||||
}
|
||||
|
||||
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
|
||||
println!("after_election");
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that walproposer can successfully connect to safekeeper and finish
|
||||
/// sync_safekeepers. API is mocked in MockImpl.
|
||||
///
|
||||
/// Run this test with valgrind to detect leaks:
|
||||
/// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
|
||||
#[test]
|
||||
fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
"9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
|
||||
"9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
|
||||
);
|
||||
|
||||
let (sender, receiver) = sync_channel(1);
|
||||
|
||||
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
||||
wait_events: Cell::new(WaitEventsData {
|
||||
sk: std::ptr::null_mut(),
|
||||
event_mask: 0,
|
||||
}),
|
||||
expected_messages: vec![
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||
],
|
||||
// VoteRequest(VoteRequest { term: 3 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
expected_ptr: AtomicUsize::new(0),
|
||||
safekeeper_replies: vec![
|
||||
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
|
||||
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
|
||||
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
replies_ptr: AtomicUsize::new(0),
|
||||
sync_channel: sender,
|
||||
});
|
||||
let config = crate::walproposer::Config {
|
||||
ttid,
|
||||
safekeepers_list: vec!["localhost:5000".to_string()],
|
||||
safekeeper_reconnect_timeout: 1000,
|
||||
safekeeper_connection_timeout: 10000,
|
||||
sync_safekeepers: true,
|
||||
};
|
||||
|
||||
let wp = Wrapper::new(my_impl, config);
|
||||
|
||||
// walproposer will panic when it finishes sync_safekeepers
|
||||
std::panic::catch_unwind(|| wp.start()).unwrap_err();
|
||||
// validate the resulting LSN
|
||||
assert_eq!(receiver.recv()?, 1337);
|
||||
Ok(())
|
||||
// drop() will free up resources here
|
||||
}
|
||||
}
|
||||
@@ -32,15 +32,9 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
{
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
tracing::info!("executing first");
|
||||
short().execute(rt.handle(), &manager).unwrap();
|
||||
tracing::info!("first executed");
|
||||
}
|
||||
tracing::info!("executing first");
|
||||
short().execute(&manager).unwrap();
|
||||
tracing::info!("first executed");
|
||||
|
||||
let thread_counts = [1, 2, 4, 8, 16];
|
||||
|
||||
@@ -83,14 +77,9 @@ fn add_multithreaded_walredo_requesters(
|
||||
assert_ne!(threads, 0);
|
||||
|
||||
if threads == 1 {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
b.iter_batched_ref(
|
||||
|| Some(input_factory()),
|
||||
|input| execute_all(input.take(), handle, manager),
|
||||
|input| execute_all(input.take(), manager),
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
} else {
|
||||
@@ -106,26 +95,19 @@ fn add_multithreaded_walredo_requesters(
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
let work_rx = work_rx.clone();
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
loop {
|
||||
// queue up and wait if we want to go another round
|
||||
if work_rx.lock().unwrap().recv().is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let input = Some(input_factory());
|
||||
|
||||
barrier.wait();
|
||||
|
||||
execute_all(input, handle, &manager).unwrap();
|
||||
|
||||
barrier.wait();
|
||||
move || loop {
|
||||
// queue up and wait if we want to go another round
|
||||
if work_rx.lock().unwrap().recv().is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let input = Some(input_factory());
|
||||
|
||||
barrier.wait();
|
||||
|
||||
execute_all(input, &manager).unwrap();
|
||||
|
||||
barrier.wait();
|
||||
}
|
||||
})
|
||||
})
|
||||
@@ -167,17 +149,13 @@ impl Drop for JoinOnDrop {
|
||||
}
|
||||
}
|
||||
|
||||
fn execute_all<I>(
|
||||
input: I,
|
||||
handle: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<()>
|
||||
fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> anyhow::Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = Request>,
|
||||
{
|
||||
// just fire all requests as fast as possible
|
||||
input.into_iter().try_for_each(|req| {
|
||||
let page = req.execute(handle, manager)?;
|
||||
let page = req.execute(manager)?;
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
anyhow::Ok(())
|
||||
})
|
||||
@@ -492,11 +470,9 @@ struct Request {
|
||||
}
|
||||
|
||||
impl Request {
|
||||
fn execute(
|
||||
self,
|
||||
rt: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
|
||||
use pageserver::walredo::WalRedoManager;
|
||||
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
@@ -505,6 +481,6 @@ impl Request {
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
|
||||
manager.request_redo(key, lsn, base_img, records, pg_version)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use fail::fail_point;
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
use tokio::io;
|
||||
@@ -181,7 +180,6 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
|
||||
@@ -215,34 +213,6 @@ where
|
||||
self.add_rel(rel, rel).await?;
|
||||
}
|
||||
}
|
||||
|
||||
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
|
||||
if path.starts_with("pg_replslot") {
|
||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||
content[offs..offs + 8].try_into().unwrap(),
|
||||
));
|
||||
info!("Replication slot {} restart LSN={}", path, restart_lsn);
|
||||
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
|
||||
}
|
||||
let header = new_tar_header(&path, content.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &*content)
|
||||
.await
|
||||
.context("could not add aux file to basebackup tarball")?;
|
||||
}
|
||||
}
|
||||
if min_restart_lsn != Lsn::MAX {
|
||||
info!(
|
||||
"Min restart LSN for logical replication is {}",
|
||||
min_restart_lsn
|
||||
);
|
||||
let data = min_restart_lsn.0.to_le_bytes();
|
||||
let header = new_tar_header("restart.lsn", data.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &data[..])
|
||||
.await
|
||||
.context("could not add restart.lsn file to basebackup tarball")?;
|
||||
}
|
||||
for xid in self
|
||||
.timeline
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
|
||||
use std::env::{var, VarError};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{env, ops::ControlFlow, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
@@ -201,51 +200,6 @@ fn initialize_config(
|
||||
})
|
||||
}
|
||||
|
||||
struct WaitForPhaseResult<F: std::future::Future + Unpin> {
|
||||
timeout_remaining: Duration,
|
||||
skipped: Option<F>,
|
||||
}
|
||||
|
||||
/// During startup, we apply a timeout to our waits for readiness, to avoid
|
||||
/// stalling the whole service if one Tenant experiences some problem. Each
|
||||
/// phase may consume some of the timeout: this function returns the updated
|
||||
/// timeout for use in the next call.
|
||||
async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
|
||||
where
|
||||
F: std::future::Future + Unpin,
|
||||
{
|
||||
let initial_t = Instant::now();
|
||||
let skipped = match tokio::time::timeout(timeout, &mut fut).await {
|
||||
Ok(_) => None,
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
%phase,
|
||||
"Startup phase timed out, proceeding anyway"
|
||||
);
|
||||
Some(fut)
|
||||
}
|
||||
};
|
||||
|
||||
WaitForPhaseResult {
|
||||
timeout_remaining: timeout
|
||||
.checked_sub(Instant::now().duration_since(initial_t))
|
||||
.unwrap_or(Duration::ZERO),
|
||||
skipped,
|
||||
}
|
||||
}
|
||||
|
||||
fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
|
||||
let elapsed = started_at.elapsed();
|
||||
let secs = elapsed.as_secs_f64();
|
||||
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
|
||||
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"{human_phase} ({secs:.3}s since start)"
|
||||
)
|
||||
}
|
||||
|
||||
fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
@@ -253,6 +207,16 @@ fn start_pageserver(
|
||||
// Monotonic time for later calculating startup duration
|
||||
let started_startup_at = Instant::now();
|
||||
|
||||
let startup_checkpoint = move |phase: &str, human_phase: &str| {
|
||||
let elapsed = started_startup_at.elapsed();
|
||||
let secs = elapsed.as_secs_f64();
|
||||
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"{human_phase} ({secs:.3}s since start)"
|
||||
)
|
||||
};
|
||||
|
||||
// Print version and launch timestamp to the log,
|
||||
// and expose them as prometheus metrics.
|
||||
// A changed version string indicates changed software.
|
||||
@@ -377,7 +341,7 @@ fn start_pageserver(
|
||||
|
||||
// Up to this point no significant I/O has been done: this should have been fast. Record
|
||||
// duration prior to starting I/O intensive phase of startup.
|
||||
startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
|
||||
startup_checkpoint("initial", "Starting loading tenants");
|
||||
STARTUP_IS_LOADING.set(1);
|
||||
|
||||
// Startup staging or optimizing:
|
||||
@@ -424,93 +388,58 @@ fn start_pageserver(
|
||||
let shutdown_pageserver = shutdown_pageserver.clone();
|
||||
let drive_init = async move {
|
||||
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before initial load completed")
|
||||
});
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
|
||||
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
init_remote_done_rx.wait().await;
|
||||
startup_checkpoint("initial_tenant_load_remote", "Remote part of initial load completed");
|
||||
|
||||
let init_remote_done = std::pin::pin!(async {
|
||||
init_remote_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_tenant_load_remote",
|
||||
"Remote part of initial load completed",
|
||||
);
|
||||
});
|
||||
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: timeout,
|
||||
skipped: init_remote_skipped,
|
||||
} = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
|
||||
|
||||
let init_load_done = std::pin::pin!(async {
|
||||
init_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_tenant_load",
|
||||
"Initial load completed",
|
||||
);
|
||||
STARTUP_IS_LOADING.set(0);
|
||||
});
|
||||
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: timeout,
|
||||
skipped: init_load_skipped,
|
||||
} = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
|
||||
init_done_rx.wait().await;
|
||||
startup_checkpoint("initial_tenant_load", "Initial load completed");
|
||||
STARTUP_IS_LOADING.set(0);
|
||||
|
||||
// initial logical sizes can now start, as they were waiting on init_done_rx.
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before initial logical sizes completed")
|
||||
});
|
||||
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
|
||||
|
||||
let logical_sizes_done = std::pin::pin!(async {
|
||||
init_logical_size_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_logical_sizes",
|
||||
"Initial logical sizes completed",
|
||||
);
|
||||
});
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: _,
|
||||
skipped: logical_sizes_skipped,
|
||||
} = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
|
||||
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
|
||||
Ok(_) => {
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
|
||||
None
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
"Initial logical size timeout elapsed; starting background jobs"
|
||||
);
|
||||
Some(init_sizes_done)
|
||||
}
|
||||
};
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
// allow background jobs to start: we either completed prior stages, or they reached timeout
|
||||
// and were skipped. It is important that we do not let them block background jobs indefinitely,
|
||||
// because things like consumption metrics for billing are blocked by this barrier.
|
||||
// allow background jobs to start
|
||||
drop(background_jobs_can_start);
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"background_jobs_can_start",
|
||||
"Starting background jobs",
|
||||
);
|
||||
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
|
||||
|
||||
// We are done. If we skipped any phases due to timeout, run them to completion here so that
|
||||
// they will eventually update their startup_checkpoint, and so that we do not declare the
|
||||
// 'complete' stage until all the other stages are really done.
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before waiting for skipped phases done")
|
||||
});
|
||||
if let Some(f) = init_remote_skipped {
|
||||
f.await;
|
||||
}
|
||||
if let Some(f) = init_load_skipped {
|
||||
f.await;
|
||||
}
|
||||
if let Some(f) = logical_sizes_skipped {
|
||||
f.await;
|
||||
}
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
if let Some(init_sizes_done) = init_sizes_done {
|
||||
// ending up here is not a bug; at the latest logical sizes will be queried by
|
||||
// consumption metrics.
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
init_sizes_done.await;
|
||||
|
||||
startup_checkpoint(started_startup_at, "complete", "Startup complete");
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
|
||||
|
||||
}
|
||||
|
||||
startup_checkpoint("complete", "Startup complete");
|
||||
};
|
||||
|
||||
async move {
|
||||
@@ -650,7 +579,6 @@ fn start_pageserver(
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
},
|
||||
|
||||
@@ -1298,6 +1298,10 @@ pub(crate) mod mock {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_executed(&self) -> usize {
|
||||
self.executed.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
#[allow(clippy::await_holding_lock)]
|
||||
pub async fn pump(&self) {
|
||||
if let Some(remote_storage) = &self.remote_storage {
|
||||
|
||||
@@ -306,67 +306,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: Get timestamp for a given LSN
|
||||
parameters:
|
||||
- name: lsn
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: integer
|
||||
description: A LSN to get the timestamp
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
format: date-time
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found, or there is no timestamp information for the given lsn
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
|
||||
parameters:
|
||||
|
||||
@@ -2,12 +2,10 @@
|
||||
//! Management HTTP API
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use futures::TryFutureExt;
|
||||
use humantime::format_rfc3339;
|
||||
use hyper::header::CONTENT_TYPE;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
@@ -504,33 +502,6 @@ async fn get_lsn_by_timestamp_handler(
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
|
||||
async fn get_timestamp_of_lsn_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let lsn_str = must_get_query_param(&request, "lsn")?;
|
||||
let lsn = Lsn::from_str(&lsn_str)
|
||||
.with_context(|| format!("Invalid LSN: {lsn_str:?}"))
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
||||
|
||||
match result {
|
||||
Some(time) => {
|
||||
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
|
||||
json_response(StatusCode::OK, time)
|
||||
}
|
||||
None => json_response(StatusCode::NOT_FOUND, ()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn tenant_attach_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1063,17 +1034,9 @@ async fn put_tenant_location_config_handler(
|
||||
// The `Detached` state is special, it doesn't upsert a tenant, it removes
|
||||
// its local disk content and drops it from memory.
|
||||
if let LocationConfigMode::Detached = request_data.config.mode {
|
||||
if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
|
||||
mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
|
||||
.instrument(info_span!("tenant_detach", %tenant_id))
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
TenantStateError::NotFound(_) => {
|
||||
// This API is idempotent: a NotFound on a detach is fine.
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
}
|
||||
}
|
||||
.await?;
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
|
||||
@@ -1709,10 +1672,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
||||
|r| api_handler(r, get_lsn_by_timestamp_handler),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||
)
|
||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
|
||||
api_handler(r, timeline_gc_handler)
|
||||
})
|
||||
|
||||
@@ -318,6 +318,15 @@ impl std::ops::Deref for PageWriteGuard<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
|
||||
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> PageWriteGuard<'a> {
|
||||
/// Mark that the buffer contents are now valid.
|
||||
#[must_use]
|
||||
|
||||
@@ -122,7 +122,6 @@ pub async fn libpq_listener_main(
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
listener_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
listener.set_nonblocking(true)?;
|
||||
let tokio_listener = tokio::net::TcpListener::from_std(listener)?;
|
||||
@@ -131,7 +130,7 @@ pub async fn libpq_listener_main(
|
||||
while let Some(res) = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = cancel.cancelled() => {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
None
|
||||
}
|
||||
@@ -300,7 +299,7 @@ impl PageServerHandler {
|
||||
Ok(flush_r?)
|
||||
},
|
||||
_ = self.cancel.cancelled() => {
|
||||
Err(QueryError::Shutdown)
|
||||
Err(QueryError::Other(anyhow::anyhow!("Shutting down")))
|
||||
}
|
||||
)
|
||||
}
|
||||
@@ -317,11 +316,11 @@ impl PageServerHandler {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = self.cancel.cancelled() => {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
let msg = "pageserver is shutting down";
|
||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
||||
Err(QueryError::Shutdown)
|
||||
Err(QueryError::Other(anyhow::anyhow!(msg)))
|
||||
}
|
||||
|
||||
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
|
||||
@@ -415,10 +414,10 @@ impl PageServerHandler {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = self.cancel.cancelled() => {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
break;
|
||||
}
|
||||
|
||||
msg = pgb.read_message() => { msg }
|
||||
|
||||
@@ -19,7 +19,6 @@ use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
@@ -371,6 +370,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
|
||||
/// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
|
||||
///
|
||||
@@ -385,50 +385,6 @@ impl Timeline {
|
||||
found_larger: &mut bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return ControlFlow::Break(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Obtain the possible timestamp range for the given lsn.
|
||||
///
|
||||
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
|
||||
pub async fn get_timestamp_for_lsn(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<TimestampTz>, PageReconstructError> {
|
||||
let mut max: Option<TimestampTz> = None;
|
||||
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
|
||||
if let Some(max_prev) = max {
|
||||
max = Some(max_prev.max(timestamp));
|
||||
} else {
|
||||
max = Some(timestamp);
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(max)
|
||||
}
|
||||
|
||||
/// Runs the given function on all the timestamps for a given lsn
|
||||
///
|
||||
/// The return value is either given by the closure, or set to the `Default`
|
||||
/// impl's output.
|
||||
async fn map_all_timestamps<T: Default>(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
|
||||
) -> Result<T, PageReconstructError> {
|
||||
for segno in self
|
||||
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
|
||||
.await?
|
||||
@@ -446,14 +402,16 @@ impl Timeline {
|
||||
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
|
||||
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
|
||||
|
||||
match f(timestamp) {
|
||||
ControlFlow::Break(b) => return Ok(b),
|
||||
ControlFlow::Continue(()) => (),
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return Ok(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Default::default())
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
@@ -541,23 +499,6 @@ impl Timeline {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
pub async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get(AUX_FILES_KEY, lsn, ctx).await {
|
||||
Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.files),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("Failed to get info about AUX files: {}", e);
|
||||
Ok(HashMap::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used to initialize the logical size tracking on startup.
|
||||
///
|
||||
@@ -675,7 +616,6 @@ impl Timeline {
|
||||
|
||||
result.add_key(CONTROLFILE_KEY);
|
||||
result.add_key(CHECKPOINT_KEY);
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
|
||||
Ok(result.to_keyspace())
|
||||
}
|
||||
@@ -752,12 +692,6 @@ impl<'a> DatadirModification<'a> {
|
||||
})?;
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// Create AuxFilesDirectory
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
|
||||
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
|
||||
xids: HashSet::new(),
|
||||
})?;
|
||||
@@ -862,12 +796,6 @@ impl<'a> DatadirModification<'a> {
|
||||
// 'true', now write the updated 'dbdirs' map back.
|
||||
let buf = DbDirectory::ser(&dbdir)?;
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// Create AuxFilesDirectory as well
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
}
|
||||
if r.is_none() {
|
||||
// Create RelDirectory
|
||||
@@ -1192,36 +1120,6 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn put_file(
|
||||
&mut self,
|
||||
path: &str,
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(buf) => AuxFilesDirectory::des(&buf)?,
|
||||
Err(e) => {
|
||||
warn!("Failed to get info about AUX files: {}", e);
|
||||
AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
}
|
||||
}
|
||||
};
|
||||
let path = path.to_string();
|
||||
if content.is_empty() {
|
||||
dir.files.remove(&path);
|
||||
} else {
|
||||
dir.files.insert(path, Bytes::copy_from_slice(content));
|
||||
}
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Flush changes accumulated so far to the underlying repository.
|
||||
///
|
||||
@@ -1357,11 +1255,6 @@ struct RelDirectory {
|
||||
rels: HashSet<(Oid, u8)>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct AuxFilesDirectory {
|
||||
files: HashMap<String, Bytes>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct RelSizeEntry {
|
||||
nblocks: u32,
|
||||
@@ -1410,12 +1303,10 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
// 02 pg_twophase
|
||||
//
|
||||
// 03 misc
|
||||
// Controlfile
|
||||
// controlfile
|
||||
// checkpoint
|
||||
// pg_version
|
||||
//
|
||||
// 04 aux files
|
||||
//
|
||||
// Below is a full list of the keyspace allocation:
|
||||
//
|
||||
// DbDir:
|
||||
@@ -1453,11 +1344,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
//
|
||||
// Checkpoint:
|
||||
// 03 00000000 00000000 00000000 00 00000001
|
||||
//
|
||||
// AuxFiles:
|
||||
// 03 00000000 00000000 00000000 00 00000002
|
||||
//
|
||||
|
||||
//-- Section 01: relation data and metadata
|
||||
|
||||
const DBDIR_KEY: Key = Key {
|
||||
@@ -1681,15 +1567,6 @@ const CHECKPOINT_KEY: Key = Key {
|
||||
field6: 1,
|
||||
};
|
||||
|
||||
const AUX_FILES_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 2,
|
||||
};
|
||||
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
|
||||
@@ -77,12 +77,12 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
use crate::tenant::storage_layer::DeltaLayer;
|
||||
use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::InitializationOrder;
|
||||
use crate::METADATA_FILE_NAME;
|
||||
|
||||
use crate::tenant::timeline::delete::DeleteTimelineFlow;
|
||||
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
pub use pageserver_api::models::TenantState;
|
||||
|
||||
@@ -229,7 +229,7 @@ pub struct Tenant {
|
||||
// with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
|
||||
// timeout...
|
||||
gc_cs: tokio::sync::Mutex<()>,
|
||||
walredo_mgr: Arc<WalRedoManager>,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
|
||||
// provides access to timeline data sitting in the remote storage
|
||||
pub(crate) remote_storage: Option<GenericRemoteStorage>,
|
||||
@@ -246,43 +246,67 @@ pub struct Tenant {
|
||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||
}
|
||||
|
||||
pub(crate) enum WalRedoManager {
|
||||
Prod(PostgresRedoManager),
|
||||
#[cfg(test)]
|
||||
Test(harness::TestRedoManager),
|
||||
}
|
||||
// We should not blindly overwrite local metadata with remote one.
|
||||
// For example, consider the following case:
|
||||
// Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
|
||||
// pageserver crashes. During startup we'll load new metadata, and then reset it
|
||||
// to the state of remote one. But current layermap will have layers from the old
|
||||
// metadata which is inconsistent.
|
||||
// And with current logic it wont disgard them during load because during layermap
|
||||
// load it sees local disk consistent lsn which is ahead of layer lsns.
|
||||
// If we treat remote as source of truth we need to completely sync with it,
|
||||
// i e delete local files which are missing on the remote. This will add extra work,
|
||||
// wal for these layers needs to be reingested for example
|
||||
//
|
||||
// So the solution is to take remote metadata only when we're attaching.
|
||||
pub fn merge_local_remote_metadata<'a>(
|
||||
local: Option<&'a TimelineMetadata>,
|
||||
remote: Option<&'a TimelineMetadata>,
|
||||
) -> anyhow::Result<(&'a TimelineMetadata, bool)> {
|
||||
match (local, remote) {
|
||||
(None, None) => anyhow::bail!("we should have either local metadata or remote"),
|
||||
(Some(local), None) => Ok((local, true)),
|
||||
// happens if we crash during attach, before writing out the metadata file
|
||||
(None, Some(remote)) => Ok((remote, false)),
|
||||
// This is the regular case where we crash/exit before finishing queued uploads.
|
||||
// Also, it happens if we crash during attach after writing the metadata file
|
||||
// but before removing the attaching marker file.
|
||||
(Some(local), Some(remote)) => {
|
||||
let consistent_lsn_cmp = local
|
||||
.disk_consistent_lsn()
|
||||
.cmp(&remote.disk_consistent_lsn());
|
||||
let gc_cutoff_lsn_cmp = local
|
||||
.latest_gc_cutoff_lsn()
|
||||
.cmp(&remote.latest_gc_cutoff_lsn());
|
||||
use std::cmp::Ordering::*;
|
||||
match (consistent_lsn_cmp, gc_cutoff_lsn_cmp) {
|
||||
// It wouldn't matter, but pick the local one so that we don't rewrite the metadata file.
|
||||
(Equal, Equal) => Ok((local, true)),
|
||||
// Local state is clearly ahead of the remote.
|
||||
(Greater, Greater) => Ok((local, true)),
|
||||
// We have local layer files that aren't on the remote, but GC horizon is on par.
|
||||
(Greater, Equal) => Ok((local, true)),
|
||||
// Local GC started running but we couldn't sync it to the remote.
|
||||
(Equal, Greater) => Ok((local, true)),
|
||||
|
||||
impl From<PostgresRedoManager> for WalRedoManager {
|
||||
fn from(mgr: PostgresRedoManager) -> Self {
|
||||
Self::Prod(mgr)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl From<harness::TestRedoManager> for WalRedoManager {
|
||||
fn from(mgr: harness::TestRedoManager) -> Self {
|
||||
Self::Test(mgr)
|
||||
}
|
||||
}
|
||||
|
||||
impl WalRedoManager {
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: crate::repository::Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<bytes::Bytes> {
|
||||
match self {
|
||||
Self::Prod(mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
}
|
||||
#[cfg(test)]
|
||||
Self::Test(mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
.await
|
||||
// We always update the local value first, so something else must have
|
||||
// updated the remote value, probably a different pageserver.
|
||||
// The control plane is supposed to prevent this from happening.
|
||||
// Bail out.
|
||||
(Less, Less)
|
||||
| (Less, Equal)
|
||||
| (Equal, Less)
|
||||
| (Less, Greater)
|
||||
| (Greater, Less) => {
|
||||
anyhow::bail!(
|
||||
r#"remote metadata appears to be ahead of local metadata:
|
||||
local:
|
||||
{local:#?}
|
||||
remote:
|
||||
{remote:#?}
|
||||
"#
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -351,6 +375,11 @@ impl Debug for SetStoppingError {
|
||||
}
|
||||
}
|
||||
|
||||
struct RemoteStartupData {
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum WaitToBecomeActiveError {
|
||||
WillNotBecomeActive {
|
||||
@@ -391,12 +420,6 @@ pub enum CreateTimelineError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
/// spawn_attach argument for whether the caller is using attachment markers
|
||||
pub(super) enum AttachMarkerMode {
|
||||
Expect,
|
||||
Ignore,
|
||||
}
|
||||
|
||||
struct TenantDirectoryScan {
|
||||
sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>,
|
||||
timelines_to_resume_deletion: Vec<(TimelineId, Option<TimelineMetadata>)>,
|
||||
@@ -423,17 +446,24 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
resources: TimelineResources,
|
||||
index_part: Option<IndexPart>,
|
||||
metadata: TimelineMetadata,
|
||||
remote_startup_data: Option<RemoteStartupData>,
|
||||
local_metadata: Option<TimelineMetadata>,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant_id = self.tenant_id;
|
||||
|
||||
let (up_to_date_metadata, picked_local) = merge_local_remote_metadata(
|
||||
local_metadata.as_ref(),
|
||||
remote_startup_data.as_ref().map(|r| &r.remote_metadata),
|
||||
)
|
||||
.context("merge_local_remote_metadata")?
|
||||
.to_owned();
|
||||
|
||||
let timeline = self.create_timeline_struct(
|
||||
timeline_id,
|
||||
&metadata,
|
||||
up_to_date_metadata,
|
||||
ancestor.clone(),
|
||||
resources,
|
||||
init_order,
|
||||
@@ -446,11 +476,20 @@ impl Tenant {
|
||||
);
|
||||
assert_eq!(
|
||||
disk_consistent_lsn,
|
||||
metadata.disk_consistent_lsn(),
|
||||
up_to_date_metadata.disk_consistent_lsn(),
|
||||
"these are used interchangeably"
|
||||
);
|
||||
|
||||
if let Some(index_part) = index_part.as_ref() {
|
||||
// Save the metadata file to local disk.
|
||||
if !picked_local {
|
||||
save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
|
||||
.await
|
||||
.context("save_metadata")?;
|
||||
}
|
||||
|
||||
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
|
||||
|
||||
if let Some(index_part) = index_part {
|
||||
timeline
|
||||
.remote_client
|
||||
.as_ref()
|
||||
@@ -463,12 +502,15 @@ impl Tenant {
|
||||
// If control plane retries timeline creation in the meantime, the mgmt API handler
|
||||
// for timeline creation will coalesce on the upload we queue here.
|
||||
let rtc = timeline.remote_client.as_ref().unwrap();
|
||||
rtc.init_upload_queue_for_empty_remote(&metadata)?;
|
||||
rtc.schedule_index_upload_for_metadata_update(&metadata)?;
|
||||
rtc.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
|
||||
rtc.schedule_index_upload_for_metadata_update(up_to_date_metadata)?;
|
||||
}
|
||||
|
||||
timeline
|
||||
.load_layer_map(disk_consistent_lsn, index_part)
|
||||
.load_layer_map(
|
||||
disk_consistent_lsn,
|
||||
remote_startup_data.map(|x| x.index_part),
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to load layermap for timeline {tenant_id}/{timeline_id}")
|
||||
@@ -525,13 +567,10 @@ impl Tenant {
|
||||
resources: TenantSharedResources,
|
||||
attached_conf: AttachedTenantConf,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
expect_marker: AttachMarkerMode,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
// TODO dedup with spawn_load
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf, tenant_id,
|
||||
)));
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
|
||||
let TenantSharedResources {
|
||||
broker_client,
|
||||
@@ -609,7 +648,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
match tenant_clone.attach(&ctx, expect_marker).await {
|
||||
match tenant_clone.attach(&ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
@@ -634,23 +673,17 @@ impl Tenant {
|
||||
///
|
||||
/// No background tasks are started as part of this routine.
|
||||
///
|
||||
async fn attach(
|
||||
self: &Arc<Tenant>,
|
||||
ctx: &RequestContext,
|
||||
expect_marker: AttachMarkerMode,
|
||||
) -> anyhow::Result<()> {
|
||||
async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
|
||||
if let AttachMarkerMode::Expect = expect_marker {
|
||||
if !tokio::fs::try_exists(&marker_file)
|
||||
.await
|
||||
.context("check for existence of marker file")?
|
||||
{
|
||||
anyhow::bail!(
|
||||
"implementation error: marker file should exist at beginning of this function"
|
||||
);
|
||||
}
|
||||
if !tokio::fs::try_exists(&marker_file)
|
||||
.await
|
||||
.context("check for existence of marker file")?
|
||||
{
|
||||
anyhow::bail!(
|
||||
"implementation error: marker file should exist at beginning of this function"
|
||||
);
|
||||
}
|
||||
|
||||
// Get list of remote timelines
|
||||
@@ -772,12 +805,10 @@ impl Tenant {
|
||||
.map_err(LoadLocalTimelineError::ResumeDeletion)?;
|
||||
}
|
||||
|
||||
if let AttachMarkerMode::Expect = expect_marker {
|
||||
std::fs::remove_file(&marker_file)
|
||||
.with_context(|| format!("unlink attach marker file {marker_file}"))?;
|
||||
crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
|
||||
.context("fsync tenant directory after unlinking attach marker file")?;
|
||||
}
|
||||
std::fs::remove_file(&marker_file)
|
||||
.with_context(|| format!("unlink attach marker file {marker_file}"))?;
|
||||
crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
|
||||
.context("fsync tenant directory after unlinking attach marker file")?;
|
||||
|
||||
crate::failpoint_support::sleep_millis_async!("attach-before-activate");
|
||||
|
||||
@@ -830,23 +861,21 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
// we can load remote timelines during init, but they are assumed to be so rare that
|
||||
// initialization order is not passed to here.
|
||||
let init_order = None;
|
||||
|
||||
// timeline loading after attach expects to find metadata file for each metadata
|
||||
save_metadata(self.conf, &self.tenant_id, &timeline_id, &remote_metadata)
|
||||
.await
|
||||
.context("save_metadata")
|
||||
.map_err(LoadLocalTimelineError::Load)?;
|
||||
// Even if there is local metadata it cannot be ahead of the remote one
|
||||
// since we're attaching. Even if we resume interrupted attach remote one
|
||||
// cannot be older than the local one
|
||||
let local_metadata = None;
|
||||
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
resources,
|
||||
Some(index_part),
|
||||
remote_metadata,
|
||||
Some(RemoteStartupData {
|
||||
index_part,
|
||||
remote_metadata,
|
||||
}),
|
||||
local_metadata,
|
||||
ancestor,
|
||||
init_order,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -858,9 +887,7 @@ impl Tenant {
|
||||
tenant_id: TenantId,
|
||||
reason: String,
|
||||
) -> Arc<Tenant> {
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf, tenant_id,
|
||||
)));
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
Arc::new(Tenant::new(
|
||||
TenantState::Broken {
|
||||
reason,
|
||||
@@ -899,9 +926,7 @@ impl Tenant {
|
||||
let broker_client = resources.broker_client;
|
||||
let remote_storage = resources.remote_storage;
|
||||
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
conf, tenant_id,
|
||||
)));
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
let tenant = Tenant::new(
|
||||
TenantState::Loading,
|
||||
conf,
|
||||
@@ -1324,8 +1349,8 @@ impl Tenant {
|
||||
LoadLocalTimelineError::Load(source) => {
|
||||
// We tried to load deleted timeline, this is a bug.
|
||||
return Err(anyhow::anyhow!(source).context(
|
||||
format!("This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}")
|
||||
));
|
||||
"This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}"
|
||||
));
|
||||
}
|
||||
LoadLocalTimelineError::ResumeDeletion(source) => {
|
||||
// Make sure resumed deletion wont fail loading for entire tenant.
|
||||
@@ -1359,11 +1384,6 @@ impl Tenant {
|
||||
|
||||
let mut resources = self.build_timeline_resources(timeline_id);
|
||||
|
||||
struct RemoteStartupData {
|
||||
index_part: IndexPart,
|
||||
remote_metadata: TimelineMetadata,
|
||||
}
|
||||
|
||||
let (remote_startup_data, remote_client) = match preload {
|
||||
Some(preload) => {
|
||||
let TimelinePreload {
|
||||
@@ -1419,7 +1439,7 @@ impl Tenant {
|
||||
)
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
info!(found_delete_mark, "no index file was found on the remote, resuming deletion or cleaning unuploaded up");
|
||||
info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}");
|
||||
|
||||
if found_delete_mark {
|
||||
// We could've resumed at a point where remote index was deleted, but metadata file wasnt.
|
||||
@@ -1433,73 +1453,14 @@ impl Tenant {
|
||||
.map_err(LoadLocalTimelineError::ResumeDeletion);
|
||||
}
|
||||
|
||||
// as the remote index_part.json did not exist, this timeline is a
|
||||
// not-yet-uploaded one. it should be deleted now, because the branching might
|
||||
// not have been valid as it's ancestor may have been restored to earlier state
|
||||
// as well. in practice, control plane will keep retrying.
|
||||
//
|
||||
// first ensure that the un-uploaded timeline looks like it should, as in we
|
||||
// are not accidentially deleting a timeline which was ever active:
|
||||
// - root timelines have metadata and one possibly partial layer
|
||||
// - branched timelines have metadata
|
||||
//
|
||||
// if the timeline does not look like expected, fail loading of the tenant.
|
||||
// cleaning the timeline up manually and reloading the tenant is possible via
|
||||
// the above log message.
|
||||
let path = self.conf.timeline_path(&self.tenant_id, &timeline_id);
|
||||
|
||||
let span = tracing::Span::current();
|
||||
|
||||
return tokio::task::spawn_blocking({
|
||||
move || {
|
||||
use std::str::FromStr;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
|
||||
let _e = span.entered();
|
||||
let mut metadata = false;
|
||||
let mut layers = 0;
|
||||
let mut others = 0;
|
||||
for dentry in path.read_dir_utf8()? {
|
||||
let dentry = dentry?;
|
||||
let file_name = dentry.file_name();
|
||||
|
||||
if file_name == METADATA_FILE_NAME {
|
||||
metadata = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if LayerFileName::from_str(file_name).is_ok()
|
||||
{
|
||||
layers += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
others += 1;
|
||||
}
|
||||
|
||||
// bootstrapped have the one image layer file, or one partial temp
|
||||
// file, branched have just the metadata
|
||||
if !(metadata && layers + others <= 1) {
|
||||
anyhow::bail!("unexpected assumed unuploaded, never been active timeline: found metadata={}, layers={}, others={}", metadata, layers, others);
|
||||
}
|
||||
|
||||
let tmp_path =
|
||||
path.with_file_name(format!("{timeline_id}{}", TEMP_FILE_SUFFIX));
|
||||
std::fs::rename(path, &tmp_path)?;
|
||||
std::fs::remove_dir_all(&tmp_path)?;
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
.and_then(|x| x)
|
||||
.context("delete assumed unuploaded fresh timeline")
|
||||
.map_err(LoadLocalTimelineError::Load);
|
||||
// We're loading fresh timeline that didnt yet make it into remote.
|
||||
(None, Some(remote_client))
|
||||
}
|
||||
Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))),
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// No remote client
|
||||
if found_delete_mark {
|
||||
// There is no remote client, we found local metadata.
|
||||
// Continue cleaning up local disk.
|
||||
@@ -1531,27 +1492,11 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
let (index_part, metadata) = match remote_startup_data {
|
||||
Some(RemoteStartupData {
|
||||
index_part,
|
||||
remote_metadata,
|
||||
}) => {
|
||||
// always choose the remote metadata to be crash consistent (see RFC 27)
|
||||
save_metadata(self.conf, &self.tenant_id, &timeline_id, &remote_metadata)
|
||||
.await
|
||||
.context("save_metadata")
|
||||
.map_err(LoadLocalTimelineError::Load)?;
|
||||
|
||||
(Some(index_part), remote_metadata)
|
||||
}
|
||||
None => (None, local_metadata),
|
||||
};
|
||||
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
resources,
|
||||
index_part,
|
||||
metadata,
|
||||
remote_startup_data,
|
||||
Some(local_metadata),
|
||||
ancestor,
|
||||
init_order,
|
||||
ctx,
|
||||
@@ -2466,7 +2411,7 @@ impl Tenant {
|
||||
state: TenantState,
|
||||
conf: &'static PageServerConf,
|
||||
attached_conf: AttachedTenantConf,
|
||||
walredo_mgr: Arc<WalRedoManager>,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
tenant_id: TenantId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
@@ -3597,7 +3542,7 @@ pub async fn dump_layerfile_from_path(
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod harness {
|
||||
pub mod harness {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::fs;
|
||||
@@ -3608,6 +3553,7 @@ pub(crate) mod harness {
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::{
|
||||
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
|
||||
walredo::WalRedoManager,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
@@ -3736,7 +3682,7 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
||||
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
let tenant = Arc::new(Tenant::new(
|
||||
TenantState::Loading,
|
||||
@@ -3770,10 +3716,10 @@ pub(crate) mod harness {
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
pub(crate) struct TestRedoManager;
|
||||
pub struct TestRedoManager;
|
||||
|
||||
impl TestRedoManager {
|
||||
pub async fn request_redo(
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
|
||||
@@ -458,10 +458,7 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant
|
||||
.attach(ctx, super::AttachMarkerMode::Expect)
|
||||
.await
|
||||
.context("attach")?;
|
||||
tenant.attach(ctx).await.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
|
||||
@@ -354,7 +354,8 @@ mod tests {
|
||||
}
|
||||
|
||||
// Test a large blob that spans multiple pages
|
||||
let mut large_data = vec![0; 20000];
|
||||
let mut large_data = Vec::new();
|
||||
large_data.resize(20000, 0);
|
||||
thread_rng().fill_bytes(&mut large_data);
|
||||
let pos_large = file.write_blob(&large_data, &ctx).await?;
|
||||
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
|
||||
|
||||
@@ -27,8 +27,7 @@ use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::{
|
||||
create_tenant_files, AttachMarkerMode, AttachedTenantConf, CreateTenantFilesMode, Tenant,
|
||||
TenantState,
|
||||
create_tenant_files, AttachedTenantConf, CreateTenantFilesMode, Tenant, TenantState,
|
||||
};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
@@ -152,49 +151,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
||||
|
||||
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
|
||||
|
||||
/// Create a directory, including parents. This does no fsyncs and makes
|
||||
/// no guarantees about the persistence of the resulting metadata: for
|
||||
/// use when creating dirs for use as cache.
|
||||
async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
|
||||
let mut dirs_to_create = Vec::new();
|
||||
let mut path: &Utf8Path = path.as_ref();
|
||||
|
||||
// Figure out which directories we need to create.
|
||||
loop {
|
||||
let meta = tokio::fs::metadata(path).await;
|
||||
match meta {
|
||||
Ok(metadata) if metadata.is_dir() => break,
|
||||
Ok(_) => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::AlreadyExists,
|
||||
format!("non-directory found in path: {path}"),
|
||||
));
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
|
||||
dirs_to_create.push(path);
|
||||
|
||||
match path.parent() {
|
||||
Some(parent) => path = parent,
|
||||
None => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
format!("can't find parent of path '{path}'"),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create directories from parent to child.
|
||||
for &path in dirs_to_create.iter().rev() {
|
||||
tokio::fs::create_dir(path).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn emergency_generations(
|
||||
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
|
||||
) -> HashMap<TenantId, Generation> {
|
||||
@@ -490,15 +446,7 @@ pub(crate) fn schedule_local_tenant_processing(
|
||||
"attaching mark file present but no remote storage configured".to_string(),
|
||||
)
|
||||
} else {
|
||||
match Tenant::spawn_attach(
|
||||
conf,
|
||||
tenant_id,
|
||||
resources,
|
||||
location_conf,
|
||||
tenants,
|
||||
AttachMarkerMode::Expect,
|
||||
ctx,
|
||||
) {
|
||||
match Tenant::spawn_attach(conf, tenant_id, resources, location_conf, tenants, ctx) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
|
||||
@@ -707,7 +655,7 @@ pub(crate) async fn set_new_tenant_config(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(%tenant_id))]
|
||||
#[instrument(skip_all, fields(tenant_id, new_location_config))]
|
||||
pub(crate) async fn upsert_location(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
@@ -786,61 +734,36 @@ pub(crate) async fn upsert_location(
|
||||
}
|
||||
|
||||
let new_slot = match &new_location_config.mode {
|
||||
LocationMode::Secondary(_) => {
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
// Directory doesn't need to be fsync'd because if we crash it can
|
||||
// safely be recreated next time this tenant location is configured.
|
||||
unsafe_create_dir_all(&tenant_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {tenant_path}"))?;
|
||||
|
||||
Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
TenantSlot::Secondary
|
||||
}
|
||||
LocationMode::Secondary(_) => TenantSlot::Secondary,
|
||||
LocationMode::Attached(_attach_config) => {
|
||||
// Do a schedule_local_tenant_processing
|
||||
// FIXME: should avoid doing this disk I/O inside the TenantsMap lock,
|
||||
// we have the same problem in load_tenant/attach_tenant. Probably
|
||||
// need a lock in TenantSlot to fix this.
|
||||
let timelines_path = conf.timelines_path(&tenant_id);
|
||||
|
||||
// Directory doesn't need to be fsync'd because we do not depend on
|
||||
// it to exist after crashes: it may be recreated when tenant is
|
||||
// re-attached, see https://github.com/neondatabase/neon/issues/5550
|
||||
unsafe_create_dir_all(&timelines_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {timelines_path}"))?;
|
||||
|
||||
Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
let tenant = match Tenant::spawn_attach(
|
||||
let tenant_path = conf.tenant_path(&tenant_id);
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
};
|
||||
let new_tenant = schedule_local_tenant_processing(
|
||||
conf,
|
||||
tenant_id,
|
||||
TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
},
|
||||
&tenant_path,
|
||||
AttachedTenantConf::try_from(new_location_config)?,
|
||||
resources,
|
||||
None,
|
||||
&TENANTS,
|
||||
// The LocationConf API does not use marker files, because we have Secondary
|
||||
// locations where the directory's existence is not a signal that it contains
|
||||
// all timelines. See https://github.com/neondatabase/neon/issues/5550
|
||||
AttachMarkerMode::Ignore,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
|
||||
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
|
||||
}
|
||||
};
|
||||
)
|
||||
.with_context(|| {
|
||||
format!("Failed to schedule tenant processing in path {tenant_path:?}")
|
||||
})?;
|
||||
|
||||
TenantSlot::Attached(tenant)
|
||||
TenantSlot::Attached(new_tenant)
|
||||
}
|
||||
};
|
||||
|
||||
@@ -848,6 +771,7 @@ pub(crate) async fn upsert_location(
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -226,14 +226,6 @@ impl LayerFileName {
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn kind(&self) -> &'static str {
|
||||
use LayerFileName::*;
|
||||
match self {
|
||||
Delta(_) => "delta",
|
||||
Image(_) => "image",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for LayerFileName {
|
||||
|
||||
@@ -25,7 +25,7 @@ use super::{
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
/// [`DeltaLayer`].
|
||||
/// [`DeltaLayer`](super::DeltaLayer).
|
||||
///
|
||||
/// RemoteLayer might be downloaded on-demand during operations which are
|
||||
/// allowed download remote layers and during which, it gets replaced with a
|
||||
|
||||
@@ -81,6 +81,7 @@ use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::ZERO_PAGE;
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
@@ -200,7 +201,7 @@ pub struct Timeline {
|
||||
last_freeze_ts: RwLock<Instant>,
|
||||
|
||||
// WAL redo manager
|
||||
walredo_mgr: Arc<super::WalRedoManager>,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Sync + Send>,
|
||||
|
||||
/// Remote storage client.
|
||||
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
|
||||
@@ -1470,7 +1471,7 @@ impl Timeline {
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
walredo_mgr: Arc<super::WalRedoManager>,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
resources: TimelineResources,
|
||||
pg_version: u32,
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
@@ -1699,7 +1700,7 @@ impl Timeline {
|
||||
disk_consistent_lsn: Lsn,
|
||||
index_part: Option<IndexPart>,
|
||||
) -> anyhow::Result<()> {
|
||||
use init::{Decision::*, Discovered, DismissedLayer};
|
||||
use init::{Decision::*, Discovered, FutureLayer};
|
||||
use LayerFileName::*;
|
||||
|
||||
let mut guard = self.layers.write().await;
|
||||
@@ -1715,7 +1716,7 @@ impl Timeline {
|
||||
// Copy to move into the task we're about to spawn
|
||||
let generation = self.generation;
|
||||
|
||||
let (loaded_layers, needs_cleanup, total_physical_size) = tokio::task::spawn_blocking({
|
||||
let (loaded_layers, to_sync, total_physical_size) = tokio::task::spawn_blocking({
|
||||
move || {
|
||||
let _g = span.entered();
|
||||
let discovered = init::scan_timeline_dir(&timeline_path)?;
|
||||
@@ -1764,6 +1765,7 @@ impl Timeline {
|
||||
);
|
||||
|
||||
let mut loaded_layers = Vec::new();
|
||||
let mut needs_upload = Vec::new();
|
||||
let mut needs_cleanup = Vec::new();
|
||||
let mut total_physical_size = 0;
|
||||
|
||||
@@ -1784,7 +1786,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
Ok(decision) => decision,
|
||||
Err(DismissedLayer::Future { local }) => {
|
||||
Err(FutureLayer { local }) => {
|
||||
if local.is_some() {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_future_layer(&path, &name, disk_consistent_lsn)?;
|
||||
@@ -1793,13 +1795,6 @@ impl Timeline {
|
||||
needs_cleanup.push(name);
|
||||
continue;
|
||||
}
|
||||
Err(DismissedLayer::LocalOnly(local)) => {
|
||||
path.push(name.file_name());
|
||||
init::cleanup_local_only_file(&path, &name, &local)?;
|
||||
path.pop();
|
||||
// this file never existed remotely, we will have to do rework
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match &name {
|
||||
@@ -1808,16 +1803,14 @@ impl Timeline {
|
||||
}
|
||||
|
||||
let status = match &decision {
|
||||
UseLocal(_) => LayerResidenceStatus::Resident,
|
||||
UseLocal(_) | NeedsUpload(_) => LayerResidenceStatus::Resident,
|
||||
Evicted(_) | UseRemote { .. } => LayerResidenceStatus::Evicted,
|
||||
};
|
||||
|
||||
tracing::debug!(layer=%name, ?decision, ?status, "applied");
|
||||
|
||||
let stats = LayerAccessStats::for_loading_layer(status);
|
||||
|
||||
let layer: Arc<dyn PersistentLayer> = match (name, &decision) {
|
||||
(Delta(d), UseLocal(m)) => {
|
||||
(Delta(d), UseLocal(m) | NeedsUpload(m)) => {
|
||||
total_physical_size += m.file_size();
|
||||
Arc::new(DeltaLayer::new(
|
||||
conf,
|
||||
@@ -1828,7 +1821,7 @@ impl Timeline {
|
||||
stats,
|
||||
))
|
||||
}
|
||||
(Image(i), UseLocal(m)) => {
|
||||
(Image(i), UseLocal(m) | NeedsUpload(m)) => {
|
||||
total_physical_size += m.file_size();
|
||||
Arc::new(ImageLayer::new(
|
||||
conf,
|
||||
@@ -1847,9 +1840,17 @@ impl Timeline {
|
||||
),
|
||||
};
|
||||
|
||||
if let NeedsUpload(m) = decision {
|
||||
needs_upload.push((layer.clone(), m));
|
||||
}
|
||||
|
||||
loaded_layers.push(layer);
|
||||
}
|
||||
Ok((loaded_layers, needs_cleanup, total_physical_size))
|
||||
Ok((
|
||||
loaded_layers,
|
||||
(needs_upload, needs_cleanup),
|
||||
total_physical_size,
|
||||
))
|
||||
}
|
||||
})
|
||||
.await
|
||||
@@ -1861,6 +1862,10 @@ impl Timeline {
|
||||
guard.initialize_local_layers(loaded_layers, disk_consistent_lsn + 1);
|
||||
|
||||
if let Some(rtc) = self.remote_client.as_ref() {
|
||||
let (needs_upload, needs_cleanup) = to_sync;
|
||||
for (layer, m) in needs_upload {
|
||||
rtc.schedule_layer_file_upload(&layer.layer_desc().filename(), &m)?;
|
||||
}
|
||||
rtc.schedule_layer_file_deletion(needs_cleanup)?;
|
||||
rtc.schedule_index_upload_for_file_changes()?;
|
||||
// Tenant::create_timeline will wait for these uploads to happen before returning, or
|
||||
@@ -4323,7 +4328,6 @@ impl Timeline {
|
||||
let img = match self
|
||||
.walredo_mgr
|
||||
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
|
||||
.await
|
||||
.context("Failed to reconstruct a page image:")
|
||||
{
|
||||
Ok(img) => img,
|
||||
|
||||
@@ -72,7 +72,7 @@ pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovere
|
||||
}
|
||||
|
||||
/// Decision on what to do with a layer file after considering its local and remote metadata.
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone)]
|
||||
pub(super) enum Decision {
|
||||
/// The layer is not present locally.
|
||||
Evicted(LayerFileMetadata),
|
||||
@@ -84,30 +84,27 @@ pub(super) enum Decision {
|
||||
},
|
||||
/// The layer is present locally, and metadata matches.
|
||||
UseLocal(LayerFileMetadata),
|
||||
/// The layer is only known locally, it needs to be uploaded.
|
||||
NeedsUpload(LayerFileMetadata),
|
||||
}
|
||||
|
||||
/// A layer needs to be left out of the layer map.
|
||||
/// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
|
||||
#[derive(Debug)]
|
||||
pub(super) enum DismissedLayer {
|
||||
/// The related layer is is in future compared to disk_consistent_lsn, it must not be loaded.
|
||||
Future {
|
||||
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
|
||||
local: Option<LayerFileMetadata>,
|
||||
},
|
||||
/// The layer only exists locally.
|
||||
///
|
||||
/// In order to make crash safe updates to layer map, we must dismiss layers which are only
|
||||
/// found locally or not yet included in the remote `index_part.json`.
|
||||
LocalOnly(LayerFileMetadata),
|
||||
pub(super) struct FutureLayer {
|
||||
/// The local metadata. `None` if the layer is only known through [`IndexPart`].
|
||||
pub(super) local: Option<LayerFileMetadata>,
|
||||
}
|
||||
|
||||
/// Merges local discoveries and remote [`IndexPart`] to a collection of decisions.
|
||||
///
|
||||
/// This function should not gain additional reasons to fail than [`FutureLayer`], consider adding
|
||||
/// the checks earlier to [`scan_timeline_dir`].
|
||||
pub(super) fn reconcile(
|
||||
discovered: Vec<(LayerFileName, u64)>,
|
||||
index_part: Option<&IndexPart>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
generation: Generation,
|
||||
) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
|
||||
) -> Vec<(LayerFileName, Result<Decision, FutureLayer>)> {
|
||||
use Decision::*;
|
||||
|
||||
// name => (local, remote)
|
||||
@@ -145,19 +142,17 @@ pub(super) fn reconcile(
|
||||
.into_iter()
|
||||
.map(|(name, (local, remote))| {
|
||||
let decision = if name.is_in_future(disk_consistent_lsn) {
|
||||
Err(DismissedLayer::Future { local })
|
||||
Err(FutureLayer { local })
|
||||
} else {
|
||||
match (local, remote) {
|
||||
(Some(local), Some(remote)) if local != remote => {
|
||||
Ok(UseRemote { local, remote })
|
||||
}
|
||||
(Some(x), Some(_)) => Ok(UseLocal(x)),
|
||||
(None, Some(x)) => Ok(Evicted(x)),
|
||||
(Some(x), None) => Err(DismissedLayer::LocalOnly(x)),
|
||||
Ok(match (local, remote) {
|
||||
(Some(local), Some(remote)) if local != remote => UseRemote { local, remote },
|
||||
(Some(x), Some(_)) => UseLocal(x),
|
||||
(None, Some(x)) => Evicted(x),
|
||||
(Some(x), None) => NeedsUpload(x),
|
||||
(None, None) => {
|
||||
unreachable!("there must not be any non-local non-remote files")
|
||||
}
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
(name, decision)
|
||||
@@ -197,21 +192,14 @@ pub(super) fn cleanup_future_layer(
|
||||
name: &LayerFileName,
|
||||
disk_consistent_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
use LayerFileName::*;
|
||||
let kind = match name {
|
||||
Delta(_) => "delta",
|
||||
Image(_) => "image",
|
||||
};
|
||||
// future image layers are allowed to be produced always for not yet flushed to disk
|
||||
// lsns stored in InMemoryLayer.
|
||||
let kind = name.kind();
|
||||
tracing::info!("found future {kind} layer {name} disk_consistent_lsn is {disk_consistent_lsn}");
|
||||
std::fs::remove_file(path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(super) fn cleanup_local_only_file(
|
||||
path: &Utf8Path,
|
||||
name: &LayerFileName,
|
||||
local: &LayerFileMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
let kind = name.kind();
|
||||
tracing::info!("found local-only {kind} layer {name}, metadata {local:?}");
|
||||
std::fs::remove_file(path)?;
|
||||
crate::tenant::timeline::rename_to_backup(path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -338,20 +338,11 @@ impl<'a> WalIngest<'a> {
|
||||
} else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
||||
let xlrec = XlLogicalMessage::decode(&mut buf);
|
||||
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
||||
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
|
||||
if prefix == "neon-test" {
|
||||
// This is a convenient way to make the WAL ingestion pause at
|
||||
// particular point in the WAL. For more fine-grained control,
|
||||
// we could peek into the message and only pause if it contains
|
||||
// a particular string, for example, but this is enough for now.
|
||||
crate::failpoint_support::sleep_millis_async!(
|
||||
"wal-ingest-logical-message-sleep"
|
||||
);
|
||||
} else if let Some(path) = prefix.strip_prefix("neon-file:") {
|
||||
modification.put_file(path, message, ctx).await?;
|
||||
}
|
||||
// This is a convenient way to make the WAL ingestion pause at
|
||||
// particular point in the WAL. For more fine-grained control,
|
||||
// we could peek into the message and only pause if it contains
|
||||
// a particular string, for example, but this is enough for now.
|
||||
crate::failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -468,6 +459,7 @@ impl<'a> WalIngest<'a> {
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v14::XlHeapDelete::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
@@ -535,6 +527,7 @@ impl<'a> WalIngest<'a> {
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v15::XlHeapDelete::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
@@ -602,6 +595,7 @@ impl<'a> WalIngest<'a> {
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v16::XlHeapDelete::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
@@ -777,6 +771,7 @@ impl<'a> WalIngest<'a> {
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_DELETE => {
|
||||
let xlrec = v16::rm_neon::XlNeonHeapDelete::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
|
||||
@@ -748,26 +748,6 @@ impl XlMultiXactTruncate {
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlLogicalMessage {
|
||||
pub db_id: Oid,
|
||||
pub transactional: bool,
|
||||
pub prefix_size: usize,
|
||||
pub message_size: usize,
|
||||
}
|
||||
|
||||
impl XlLogicalMessage {
|
||||
pub fn decode(buf: &mut Bytes) -> XlLogicalMessage {
|
||||
XlLogicalMessage {
|
||||
db_id: buf.get_u32_le(),
|
||||
transactional: buf.get_u32_le() != 0, // 4-bytes alignment
|
||||
prefix_size: buf.get_u64_le() as usize,
|
||||
message_size: buf.get_u64_le() as usize,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Main routine to decode a WAL record and figure out which blocks are modified
|
||||
//
|
||||
// See xlogrecord.h for details
|
||||
|
||||
@@ -26,12 +26,13 @@ use serde::Serialize;
|
||||
use std::collections::VecDeque;
|
||||
use std::io;
|
||||
use std::io::prelude::*;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use std::os::unix::prelude::CommandExt;
|
||||
use std::process::Stdio;
|
||||
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -70,7 +71,29 @@ pub(crate) struct BufferTag {
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
///
|
||||
/// WAL Redo Manager is responsible for replaying WAL records.
|
||||
///
|
||||
/// Callers use the WAL redo manager through this abstract interface,
|
||||
/// which makes it easy to mock it in tests.
|
||||
pub trait WalRedoManager: Send + Sync {
|
||||
/// Apply some WAL records.
|
||||
///
|
||||
/// The caller passes an old page image, and WAL records that should be
|
||||
/// applied over it. The return value is a new page image, after applying
|
||||
/// the reords.
|
||||
fn request_redo(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Bytes>;
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
child: NoLeakChild,
|
||||
stdin: ChildStdin,
|
||||
stderr_fd: RawFd,
|
||||
stdout_fd: RawFd,
|
||||
@@ -93,7 +116,13 @@ struct ProcessOutput {
|
||||
pub struct PostgresRedoManager {
|
||||
tenant_id: TenantId,
|
||||
conf: &'static PageServerConf,
|
||||
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
|
||||
stdout: Mutex<Option<ProcessOutput>>,
|
||||
stdin: Mutex<Option<ProcessInput>>,
|
||||
stderr: Mutex<Option<ChildStderr>>,
|
||||
}
|
||||
|
||||
/// Can this request be served by neon redo functions
|
||||
@@ -114,14 +143,14 @@ fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
|
||||
///
|
||||
/// Public interface of WAL redo manager
|
||||
///
|
||||
impl PostgresRedoManager {
|
||||
impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
/// Request the WAL redo manager to apply some WAL records
|
||||
///
|
||||
/// The WAL redo is handled by a separate thread, so this just sends a request
|
||||
/// to the thread and waits for response.
|
||||
///
|
||||
pub async fn request_redo(
|
||||
fn request_redo(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -186,7 +215,11 @@ impl PostgresRedoManager {
|
||||
PostgresRedoManager {
|
||||
tenant_id,
|
||||
conf,
|
||||
redo_process: RwLock::new(None),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
stdin: Mutex::new(None),
|
||||
stdout: Mutex::new(None),
|
||||
stderr: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,38 +242,20 @@ impl PostgresRedoManager {
|
||||
let start_time = Instant::now();
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let mut proc = self.stdin.lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
let proc: Arc<WalRedoProcess> = {
|
||||
let proc_guard = self.redo_process.read().unwrap();
|
||||
match &*proc_guard {
|
||||
None => {
|
||||
// "upgrade" to write lock to launch the process
|
||||
drop(proc_guard);
|
||||
let mut proc_guard = self.redo_process.write().unwrap();
|
||||
match &*proc_guard {
|
||||
None => {
|
||||
let proc = Arc::new(
|
||||
WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
*proc_guard = Some(Arc::clone(&proc));
|
||||
proc
|
||||
}
|
||||
Some(proc) => Arc::clone(proc),
|
||||
}
|
||||
}
|
||||
Some(proc) => Arc::clone(proc),
|
||||
}
|
||||
};
|
||||
|
||||
if proc.is_none() {
|
||||
self.launch(&mut proc, pg_version)
|
||||
.context("launch process")?;
|
||||
}
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
let result = proc
|
||||
.apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
|
||||
let result = self
|
||||
.apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout)
|
||||
.context("apply_wal_records");
|
||||
|
||||
let end_time = Instant::now();
|
||||
@@ -281,34 +296,22 @@ impl PostgresRedoManager {
|
||||
n_attempts,
|
||||
e,
|
||||
);
|
||||
// Avoid concurrent callers hitting the same issue.
|
||||
// We can't prevent it from happening because we want to enable parallelism.
|
||||
let mut guard = self.redo_process.write().unwrap();
|
||||
match &*guard {
|
||||
Some(current_field_value) => {
|
||||
if Arc::ptr_eq(current_field_value, &proc) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
*guard = None;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Another thread was faster to observe the error, and already took the process out of rotation.
|
||||
}
|
||||
// self.stdin only holds stdin & stderr as_raw_fd().
|
||||
// Dropping it as part of take() doesn't close them.
|
||||
// The owning objects (ChildStdout and ChildStderr) are stored in
|
||||
// self.stdout and self.stderr, respsectively.
|
||||
// We intentionally keep them open here to avoid a race between
|
||||
// currently running `apply_wal_records()` and a `launch()` call
|
||||
// after we return here.
|
||||
// The currently running `apply_wal_records()` must not read from
|
||||
// the newly launched process.
|
||||
// By keeping self.stdout and self.stderr open here, `launch()` will
|
||||
// get other file descriptors for the new child's stdout and stderr,
|
||||
// and hence the current `apply_wal_records()` calls will observe
|
||||
// `output.stdout.as_raw_fd() != stdout_fd` .
|
||||
if let Some(proc) = self.stdin.lock().unwrap().take() {
|
||||
proc.child.kill_and_wait();
|
||||
}
|
||||
drop(guard);
|
||||
// NB: there may still be other concurrent threads using `proc`.
|
||||
// The last one will send SIGKILL when the underlying Arc reaches refcount 0.
|
||||
// NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep
|
||||
// holding the lock while waiting for the process to exit.
|
||||
// NB: the drop impl blocks the current threads with a wait() system call for
|
||||
// the child process. We dropped the `guard` above so that other threads aren't
|
||||
// affected. But, it's good that the current thread _does_ block to wait.
|
||||
// If we instead deferred the waiting into the background / to tokio, it could
|
||||
// happen that if walredo always fails immediately, we spawn processes faster
|
||||
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
||||
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
||||
// This probably needs revisiting at some later point.
|
||||
drop(proc);
|
||||
} else if n_attempts != 0 {
|
||||
info!(n_attempts, "retried walredo succeeded");
|
||||
}
|
||||
@@ -611,32 +614,24 @@ impl<C: CommandExt> CloseFileDescriptors for C {
|
||||
}
|
||||
}
|
||||
|
||||
struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
stderr: Mutex<ChildStderr>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
impl PostgresRedoManager {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
|
||||
#[instrument(skip_all,fields(tenant_id=%self.tenant_id, pg_version=pg_version))]
|
||||
fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
&self,
|
||||
input: &mut MutexGuard<Option<ProcessInput>>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
) -> Result<(), Error> {
|
||||
let pg_bin_dir_path = self
|
||||
.conf
|
||||
.pg_bin_dir(pg_version)
|
||||
.map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?;
|
||||
let pg_lib_dir_path = self
|
||||
.conf
|
||||
.pg_lib_dir(pg_version)
|
||||
.map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?;
|
||||
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
@@ -657,8 +652,13 @@ impl WalRedoProcess {
|
||||
// as close-on-exec by default, but that's not enough, since we use
|
||||
// libraries that directly call libc open without setting that flag.
|
||||
.close_fds()
|
||||
.spawn_no_leak_child(tenant_id)
|
||||
.context("spawn process")?;
|
||||
.spawn_no_leak_child(self.tenant_id)
|
||||
.map_err(|e| {
|
||||
Error::new(
|
||||
e.kind(),
|
||||
format!("postgres --wal-redo command failed to start: {}", e),
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
@@ -685,47 +685,36 @@ impl WalRedoProcess {
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdout_fd: stdout.as_raw_fd(),
|
||||
stderr_fd: stderr.as_raw_fd(),
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
}),
|
||||
stdout: Mutex::new(ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
stderr: Mutex::new(stderr),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
**input = Some(ProcessInput {
|
||||
child,
|
||||
stdout_fd: stdout.as_raw_fd(),
|
||||
stderr_fd: stderr.as_raw_fd(),
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
});
|
||||
|
||||
fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
*self.stdout.lock().unwrap() = Some(ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
});
|
||||
*self.stderr.lock().unwrap() = Some(stderr);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))]
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))]
|
||||
fn apply_wal_records(
|
||||
&self,
|
||||
input: MutexGuard<Option<ProcessInput>>,
|
||||
tag: BufferTag,
|
||||
base_img: &Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let input = self.stdin.lock().unwrap();
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
@@ -768,17 +757,18 @@ impl WalRedoProcess {
|
||||
fn apply_wal_records0(
|
||||
&self,
|
||||
writebuf: &[u8],
|
||||
input: MutexGuard<ProcessInput>,
|
||||
mut input: MutexGuard<Option<ProcessInput>>,
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
|
||||
let proc = input.as_mut().unwrap();
|
||||
let mut nwrite = 0usize;
|
||||
let stdout_fd = proc.stdout_fd;
|
||||
|
||||
// Prepare for calling poll()
|
||||
let mut pollfds = [
|
||||
PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT),
|
||||
PollFd::new(proc.stderr_fd, PollFlags::POLLIN),
|
||||
PollFd::new(proc.stdout_fd, PollFlags::POLLIN),
|
||||
PollFd::new(stdout_fd, PollFlags::POLLIN),
|
||||
];
|
||||
|
||||
// We do two things simultaneously: send the old base image and WAL records to
|
||||
@@ -800,7 +790,8 @@ impl WalRedoProcess {
|
||||
let err_revents = pollfds[1].revents().unwrap();
|
||||
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
let mut errbuf: [u8; 16384] = [0; 16384];
|
||||
let mut stderr = self.stderr.lock().unwrap();
|
||||
let mut stderr_guard = self.stderr.lock().unwrap();
|
||||
let stderr = stderr_guard.as_mut().unwrap();
|
||||
let len = stderr.read(&mut errbuf)?;
|
||||
|
||||
// The message might not be split correctly into lines here. But this is
|
||||
@@ -830,7 +821,7 @@ impl WalRedoProcess {
|
||||
}
|
||||
let request_no = proc.n_requests;
|
||||
proc.n_requests += 1;
|
||||
drop(proc);
|
||||
drop(input);
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
@@ -844,7 +835,20 @@ impl WalRedoProcess {
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let mut output_guard = self.stdout.lock().unwrap();
|
||||
let output = output_guard.as_mut().unwrap();
|
||||
if output.stdout.as_raw_fd() != stdout_fd {
|
||||
// If stdout file descriptor is changed then it means that walredo process is crashed and restarted.
|
||||
// As far as ProcessInput and ProcessOutout are protected by different mutexes,
|
||||
// it can happen that we send request to one process and waiting response from another.
|
||||
// To prevent such situation we compare stdout file descriptors.
|
||||
// As far as old stdout pipe is destroyed only after new one is created,
|
||||
// it can not reuse the same file descriptor, so this check is safe.
|
||||
//
|
||||
// Cross-read this with the comment in apply_batch_postgres if result.is_err().
|
||||
// That's where we kill the child process.
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
@@ -869,7 +873,8 @@ impl WalRedoProcess {
|
||||
let err_revents = pollfds[1].revents().unwrap();
|
||||
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
let mut errbuf: [u8; 16384] = [0; 16384];
|
||||
let mut stderr = self.stderr.lock().unwrap();
|
||||
let mut stderr_guard = self.stderr.lock().unwrap();
|
||||
let stderr = stderr_guard.as_mut().unwrap();
|
||||
let len = stderr.read(&mut errbuf)?;
|
||||
|
||||
// The message might not be split correctly into lines here. But this is
|
||||
@@ -979,15 +984,6 @@ impl WalRedoProcess {
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait();
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper type around `std::process::Child` which guarantees that the child
|
||||
/// will be killed and waited-for by this process before being dropped.
|
||||
struct NoLeakChild {
|
||||
@@ -1135,15 +1131,15 @@ fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::PostgresRedoManager;
|
||||
use super::{PostgresRedoManager, WalRedoManager};
|
||||
use crate::repository::Key;
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
use bytes::Bytes;
|
||||
use std::str::FromStr;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
#[tokio::test]
|
||||
async fn short_v14_redo() {
|
||||
#[test]
|
||||
fn short_v14_redo() {
|
||||
let expected = std::fs::read("fixtures/short_v14_redo.page").unwrap();
|
||||
|
||||
let h = RedoHarness::new().unwrap();
|
||||
@@ -1164,14 +1160,13 @@ mod tests {
|
||||
short_records(),
|
||||
14,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(&expected, &*page);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn short_v14_fails_for_wrong_key_but_returns_zero_page() {
|
||||
#[test]
|
||||
fn short_v14_fails_for_wrong_key_but_returns_zero_page() {
|
||||
let h = RedoHarness::new().unwrap();
|
||||
|
||||
let page = h
|
||||
@@ -1191,7 +1186,6 @@ mod tests {
|
||||
short_records(),
|
||||
14,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// TODO: there will be some stderr printout, which is forwarded to tracing that could
|
||||
|
||||
@@ -9,7 +9,6 @@ OBJS = \
|
||||
libpagestore.o \
|
||||
neon.o \
|
||||
neon_utils.o \
|
||||
neon_walreader.o \
|
||||
pagestore_smgr.o \
|
||||
relsize_cache.o \
|
||||
walproposer.o \
|
||||
@@ -24,34 +23,6 @@ EXTENSION = neon
|
||||
DATA = neon--1.0.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
libwalproposer.a
|
||||
|
||||
WALPROP_OBJS = \
|
||||
$(WIN32RES) \
|
||||
walproposer.o \
|
||||
neon_utils.o \
|
||||
walproposer_compat.o
|
||||
|
||||
.PHONY: walproposer-lib
|
||||
walproposer-lib: CPPFLAGS += -DWALPROPOSER_LIB
|
||||
walproposer-lib: libwalproposer.a;
|
||||
|
||||
.PHONY: libwalproposer.a
|
||||
libwalproposer.a: $(WALPROP_OBJS)
|
||||
rm -f $@
|
||||
$(AR) $(AROPT) $@ $^
|
||||
|
||||
# needs vars:
|
||||
# FIND_TYPEDEF pointing to find_typedef
|
||||
# INDENT pointing to pg_bsd_indent
|
||||
# PGINDENT_SCRIPT pointing to pgindent (be careful with PGINDENT var name:
|
||||
# pgindent will pick it up as pg_bsd_indent path).
|
||||
.PHONY: pgindent
|
||||
pgindent:
|
||||
+@ echo top_srcdir=$(top_srcdir) top_builddir=$(top_builddir) srcdir=$(srcdir)
|
||||
$(FIND_TYPEDEF) . > neon.typedefs
|
||||
INDENT=$(INDENT) $(PGINDENT_SCRIPT) --typedefs neon.typedefs $(srcdir)/*.c $(srcdir)/*.h
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
|
||||
@@ -41,7 +41,7 @@ static char *ConsoleURL = NULL;
|
||||
static bool ForwardDDL = true;
|
||||
|
||||
/* Curl structures for sending the HTTP requests */
|
||||
static CURL *CurlHandle;
|
||||
static CURL * CurlHandle;
|
||||
static struct curl_slist *ContentHeader = NULL;
|
||||
|
||||
/*
|
||||
@@ -54,7 +54,7 @@ typedef enum
|
||||
{
|
||||
Op_Set, /* An upsert: Either a creation or an alter */
|
||||
Op_Delete,
|
||||
} OpType;
|
||||
} OpType;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@@ -62,7 +62,7 @@ typedef struct
|
||||
Oid owner;
|
||||
char old_name[NAMEDATALEN];
|
||||
OpType type;
|
||||
} DbEntry;
|
||||
} DbEntry;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@@ -70,7 +70,7 @@ typedef struct
|
||||
char old_name[NAMEDATALEN];
|
||||
const char *password;
|
||||
OpType type;
|
||||
} RoleEntry;
|
||||
} RoleEntry;
|
||||
|
||||
/*
|
||||
* We keep one of these for each subtransaction in a stack. When a subtransaction
|
||||
@@ -82,10 +82,10 @@ typedef struct DdlHashTable
|
||||
struct DdlHashTable *prev_table;
|
||||
HTAB *db_table;
|
||||
HTAB *role_table;
|
||||
} DdlHashTable;
|
||||
} DdlHashTable;
|
||||
|
||||
static DdlHashTable RootTable;
|
||||
static DdlHashTable *CurrentDdlTable = &RootTable;
|
||||
static DdlHashTable * CurrentDdlTable = &RootTable;
|
||||
|
||||
static void
|
||||
PushKeyValue(JsonbParseState **state, char *key, char *value)
|
||||
@@ -199,7 +199,7 @@ typedef struct
|
||||
{
|
||||
char str[ERROR_SIZE];
|
||||
size_t size;
|
||||
} ErrorString;
|
||||
} ErrorString;
|
||||
|
||||
static size_t
|
||||
ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata)
|
||||
|
||||
@@ -25,80 +25,79 @@
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
static int extension_server_port = 0;
|
||||
static int extension_server_port = 0;
|
||||
|
||||
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
|
||||
|
||||
/* to download all SQL (and data) files for an extension: */
|
||||
/* curl -X POST http://localhost:8080/extension_server/postgis */
|
||||
/* it covers two possible extension files layouts: */
|
||||
/* 1. extension_name--version--platform.sql */
|
||||
/* 2. extension_name/extension_name--version.sql */
|
||||
/* extension_name/extra_files.csv */
|
||||
/* */
|
||||
/* to download specific library file: */
|
||||
/* curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true */
|
||||
// to download all SQL (and data) files for an extension:
|
||||
// curl -X POST http://localhost:8080/extension_server/postgis
|
||||
// it covers two possible extension files layouts:
|
||||
// 1. extension_name--version--platform.sql
|
||||
// 2. extension_name/extension_name--version.sql
|
||||
// extension_name/extra_files.csv
|
||||
//
|
||||
// to download specific library file:
|
||||
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true
|
||||
static bool
|
||||
neon_download_extension_file_http(const char *filename, bool is_library)
|
||||
{
|
||||
CURL *curl;
|
||||
CURLcode res;
|
||||
char *compute_ctl_url;
|
||||
char *postdata;
|
||||
bool ret = false;
|
||||
CURL *curl;
|
||||
CURLcode res;
|
||||
char *compute_ctl_url;
|
||||
char *postdata;
|
||||
bool ret = false;
|
||||
|
||||
if ((curl = curl_easy_init()) == NULL)
|
||||
{
|
||||
elog(ERROR, "Failed to initialize curl handle");
|
||||
}
|
||||
if ((curl = curl_easy_init()) == NULL)
|
||||
{
|
||||
elog(ERROR, "Failed to initialize curl handle");
|
||||
}
|
||||
|
||||
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
|
||||
extension_server_port, filename, is_library ? "?is_library=true" : "");
|
||||
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
|
||||
extension_server_port, filename, is_library ? "?is_library=true" : "");
|
||||
|
||||
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
|
||||
elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url);
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
|
||||
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */ );
|
||||
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
|
||||
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
|
||||
|
||||
if (curl)
|
||||
{
|
||||
/* Perform the request, res will get the return code */
|
||||
res = curl_easy_perform(curl);
|
||||
/* Check for errors */
|
||||
if (res == CURLE_OK)
|
||||
{
|
||||
ret = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Don't error here because postgres will try to find the file */
|
||||
/* and will fail with some proper error message if it's not found. */
|
||||
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
|
||||
}
|
||||
if (curl)
|
||||
{
|
||||
/* Perform the request, res will get the return code */
|
||||
res = curl_easy_perform(curl);
|
||||
/* Check for errors */
|
||||
if (res == CURLE_OK)
|
||||
{
|
||||
ret = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Don't error here because postgres will try to find the file
|
||||
// and will fail with some proper error message if it's not found.
|
||||
elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res));
|
||||
}
|
||||
|
||||
/* always cleanup */
|
||||
curl_easy_cleanup(curl);
|
||||
}
|
||||
/* always cleanup */
|
||||
curl_easy_cleanup(curl);
|
||||
}
|
||||
|
||||
return ret;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
pg_init_extension_server()
|
||||
void pg_init_extension_server()
|
||||
{
|
||||
/* Port to connect to compute_ctl on localhost */
|
||||
/* to request extension files. */
|
||||
DefineCustomIntVariable("neon.extension_server_port",
|
||||
"connection string to the compute_ctl",
|
||||
NULL,
|
||||
&extension_server_port,
|
||||
0, 0, INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
// Port to connect to compute_ctl on localhost
|
||||
// to request extension files.
|
||||
DefineCustomIntVariable("neon.extension_server_port",
|
||||
"connection string to the compute_ctl",
|
||||
NULL,
|
||||
&extension_server_port,
|
||||
0, 0, INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
/* set download_extension_file_hook */
|
||||
prev_download_extension_file_hook = download_extension_file_hook;
|
||||
download_extension_file_hook = neon_download_extension_file_http;
|
||||
// set download_extension_file_hook
|
||||
prev_download_extension_file_hook = download_extension_file_hook;
|
||||
download_extension_file_hook = neon_download_extension_file_http;
|
||||
}
|
||||
|
||||
@@ -67,33 +67,31 @@ typedef struct FileCacheEntry
|
||||
BufferTag key;
|
||||
uint32 offset;
|
||||
uint32 access_count;
|
||||
uint32 bitmap[BLOCKS_PER_CHUNK / 32];
|
||||
dlist_node lru_node; /* LRU list node */
|
||||
uint32 bitmap[BLOCKS_PER_CHUNK/32];
|
||||
dlist_node lru_node; /* LRU list node */
|
||||
} FileCacheEntry;
|
||||
|
||||
typedef struct FileCacheControl
|
||||
{
|
||||
uint64 generation; /* generation is needed to handle correct hash
|
||||
* reenabling */
|
||||
uint32 size; /* size of cache file in chunks */
|
||||
uint32 used; /* number of used chunks */
|
||||
dlist_head lru; /* double linked list for LRU replacement
|
||||
* algorithm */
|
||||
uint64 generation; /* generation is needed to handle correct hash reenabling */
|
||||
uint32 size; /* size of cache file in chunks */
|
||||
uint32 used; /* number of used chunks */
|
||||
dlist_head lru; /* double linked list for LRU replacement algorithm */
|
||||
} FileCacheControl;
|
||||
|
||||
static HTAB *lfc_hash;
|
||||
static int lfc_desc = 0;
|
||||
static HTAB* lfc_hash;
|
||||
static int lfc_desc = 0;
|
||||
static LWLockId lfc_lock;
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
static char *lfc_path;
|
||||
static FileCacheControl *lfc_ctl;
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
static char* lfc_path;
|
||||
static FileCacheControl* lfc_ctl;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
static shmem_request_hook_type prev_shmem_request_hook;
|
||||
#endif
|
||||
|
||||
void FileCacheMonitorMain(Datum main_arg);
|
||||
void FileCacheMonitorMain(Datum main_arg);
|
||||
|
||||
/*
|
||||
* Local file cache is mandatory and Neon can work without it.
|
||||
@@ -102,10 +100,10 @@ void FileCacheMonitorMain(Datum main_arg);
|
||||
* All cache content should be invalidated to avoid reading of stale or corrupted data
|
||||
*/
|
||||
static void
|
||||
lfc_disable(char const *op)
|
||||
lfc_disable(char const* op)
|
||||
{
|
||||
HASH_SEQ_STATUS status;
|
||||
FileCacheEntry *entry;
|
||||
FileCacheEntry* entry;
|
||||
|
||||
elog(WARNING, "Failed to %s local file cache at %s: %m, disabling local file cache", op, lfc_path);
|
||||
|
||||
@@ -139,10 +137,9 @@ lfc_ensure_opened(void)
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc <= 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR | O_CREAT);
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
|
||||
if (lfc_desc < 0)
|
||||
{
|
||||
if (lfc_desc < 0) {
|
||||
lfc_disable("open");
|
||||
return false;
|
||||
}
|
||||
@@ -153,7 +150,7 @@ lfc_ensure_opened(void)
|
||||
static void
|
||||
lfc_shmem_startup(void)
|
||||
{
|
||||
bool found;
|
||||
bool found;
|
||||
static HASHCTL info;
|
||||
|
||||
if (prev_shmem_startup_hook)
|
||||
@@ -163,21 +160,16 @@ lfc_shmem_startup(void)
|
||||
|
||||
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
|
||||
|
||||
lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
|
||||
lfc_ctl = (FileCacheControl*)ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
|
||||
if (!found)
|
||||
{
|
||||
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
|
||||
|
||||
lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
|
||||
uint32 lfc_size = SIZE_MB_TO_CHUNKS(lfc_max_size);
|
||||
lfc_lock = (LWLockId)GetNamedLWLockTranche("lfc_lock");
|
||||
info.keysize = sizeof(BufferTag);
|
||||
info.entrysize = sizeof(FileCacheEntry);
|
||||
lfc_hash = ShmemInitHash("lfc_hash",
|
||||
|
||||
/*
|
||||
* lfc_size+1 because we add new element to hash table before eviction
|
||||
* of victim
|
||||
*/
|
||||
lfc_size + 1, lfc_size + 1,
|
||||
/* lfc_size+1 because we add new element to hash table before eviction of victim */
|
||||
lfc_size+1, lfc_size+1,
|
||||
&info,
|
||||
HASH_ELEM | HASH_BLOBS);
|
||||
lfc_ctl->generation = 0;
|
||||
@@ -186,7 +178,7 @@ lfc_shmem_startup(void)
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
|
||||
/* Remove file cache on restart */
|
||||
(void) unlink(lfc_path);
|
||||
(void)unlink(lfc_path);
|
||||
}
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
}
|
||||
@@ -199,7 +191,7 @@ lfc_shmem_request(void)
|
||||
prev_shmem_request_hook();
|
||||
#endif
|
||||
|
||||
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, sizeof(FileCacheEntry)));
|
||||
RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size)+1, sizeof(FileCacheEntry)));
|
||||
RequestNamedLWLockTranche("lfc_lock", 1);
|
||||
}
|
||||
|
||||
@@ -217,14 +209,11 @@ lfc_check_limit_hook(int *newval, void **extra, GucSource source)
|
||||
static void
|
||||
lfc_change_limit_hook(int newval, void *extra)
|
||||
{
|
||||
uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
|
||||
|
||||
uint32 new_size = SIZE_MB_TO_CHUNKS(newval);
|
||||
/*
|
||||
* Stats collector detach shared memory, so we should not try to access
|
||||
* shared memory here. Parallel workers first assign default value (0), so
|
||||
* not perform truncation in parallel workers. The Postmaster can handle
|
||||
* SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL),
|
||||
* but has no PGPROC.
|
||||
* Stats collector detach shared memory, so we should not try to access shared memory here.
|
||||
* Parallel workers first assign default value (0), so not perform truncation in parallel workers.
|
||||
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
|
||||
*/
|
||||
if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
|
||||
return;
|
||||
@@ -232,9 +221,8 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
/* Open cache file if not done yet */
|
||||
if (lfc_desc <= 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR | O_CREAT);
|
||||
if (lfc_desc < 0)
|
||||
{
|
||||
lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT);
|
||||
if (lfc_desc < 0) {
|
||||
elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path);
|
||||
lfc_size_limit = 0; /* disable file cache */
|
||||
return;
|
||||
@@ -243,15 +231,11 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/*
|
||||
* Shrink cache by throwing away least recently accessed chunks and
|
||||
* returning their space to file system
|
||||
*/
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
|
||||
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
Assert(victim->access_count == 0);
|
||||
#ifdef FALLOC_FL_PUNCH_HOLE
|
||||
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
|
||||
if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE, (off_t)victim->offset*BLOCKS_PER_CHUNK*BLCKSZ, BLOCKS_PER_CHUNK*BLCKSZ) < 0)
|
||||
elog(LOG, "Failed to punch hole in file: %m");
|
||||
#endif
|
||||
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
|
||||
@@ -275,7 +259,7 @@ lfc_init(void)
|
||||
"Maximal size of Neon local file cache",
|
||||
NULL,
|
||||
&lfc_max_size,
|
||||
0, /* disabled by default */
|
||||
0, /* disabled by default */
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
@@ -288,7 +272,7 @@ lfc_init(void)
|
||||
"Current limit for size of Neon local file cache",
|
||||
NULL,
|
||||
&lfc_size_limit,
|
||||
0, /* disabled by default */
|
||||
0, /* disabled by default */
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
@@ -328,18 +312,18 @@ lfc_init(void)
|
||||
bool
|
||||
lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
{
|
||||
BufferTag tag;
|
||||
FileCacheEntry *entry;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
|
||||
bool found;
|
||||
uint32 hash;
|
||||
BufferTag tag;
|
||||
FileCacheEntry* entry;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
bool found;
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return false;
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
@@ -355,13 +339,13 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
void
|
||||
lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
{
|
||||
BufferTag tag;
|
||||
FileCacheEntry *entry;
|
||||
bool found;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
|
||||
uint32 hash;
|
||||
BufferTag tag;
|
||||
FileCacheEntry* entry;
|
||||
bool found;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
@@ -389,10 +373,9 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
*/
|
||||
if (entry->bitmap[chunk_offs >> 5] == 0)
|
||||
{
|
||||
bool has_remaining_pages;
|
||||
bool has_remaining_pages;
|
||||
|
||||
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++)
|
||||
{
|
||||
for (int i = 0; i < (BLOCKS_PER_CHUNK / 32); i++) {
|
||||
if (entry->bitmap[i] != 0)
|
||||
{
|
||||
has_remaining_pages = true;
|
||||
@@ -401,8 +384,8 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
|
||||
}
|
||||
|
||||
/*
|
||||
* Put the entry at the position that is first to be reclaimed when we
|
||||
* have no cached pages remaining in the chunk
|
||||
* Put the entry at the position that is first to be reclaimed when
|
||||
* we have no cached pages remaining in the chunk
|
||||
*/
|
||||
if (!has_remaining_pages)
|
||||
{
|
||||
@@ -428,16 +411,16 @@ bool
|
||||
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
char *buffer)
|
||||
{
|
||||
BufferTag tag;
|
||||
FileCacheEntry *entry;
|
||||
ssize_t rc;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
|
||||
bool result = true;
|
||||
uint32 hash;
|
||||
uint64 generation;
|
||||
uint32 entry_offset;
|
||||
BufferTag tag;
|
||||
FileCacheEntry* entry;
|
||||
ssize_t rc;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
bool result = true;
|
||||
uint32 hash;
|
||||
uint64 generation;
|
||||
uint32 entry_offset;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return false;
|
||||
|
||||
if (!lfc_ensure_opened())
|
||||
@@ -445,7 +428,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -464,7 +447,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
LWLockRelease(lfc_lock);
|
||||
|
||||
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
|
||||
rc = pread(lfc_desc, buffer, BLCKSZ, ((off_t)entry_offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
|
||||
if (rc != BLCKSZ)
|
||||
{
|
||||
lfc_disable("read");
|
||||
@@ -492,31 +475,31 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
* If cache is full then evict some other page.
|
||||
*/
|
||||
void
|
||||
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
char *buffer)
|
||||
char *buffer)
|
||||
#else
|
||||
const void *buffer)
|
||||
const void *buffer)
|
||||
#endif
|
||||
{
|
||||
BufferTag tag;
|
||||
FileCacheEntry *entry;
|
||||
ssize_t rc;
|
||||
bool found;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
|
||||
uint32 hash;
|
||||
BufferTag tag;
|
||||
FileCacheEntry* entry;
|
||||
ssize_t rc;
|
||||
bool found;
|
||||
int chunk_offs = blkno & (BLOCKS_PER_CHUNK-1);
|
||||
uint32 hash;
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
if (!lfc_ensure_opened())
|
||||
return;
|
||||
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK - 1);
|
||||
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
||||
|
||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||
|
||||
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -524,30 +507,24 @@ void
|
||||
|
||||
if (found)
|
||||
{
|
||||
/*
|
||||
* Unlink entry from LRU list to pin it for the duration of IO
|
||||
* operation
|
||||
*/
|
||||
/* Unlink entry from LRU list to pin it for the duration of IO operation */
|
||||
if (entry->access_count++ == 0)
|
||||
dlist_delete(&entry->lru_node);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* We have two choices if all cache pages are pinned (i.e. used in IO
|
||||
* operations): 1. Wait until some of this operation is completed and
|
||||
* pages is unpinned 2. Allocate one more chunk, so that specified
|
||||
* cache size is more recommendation than hard limit. As far as
|
||||
* probability of such event (that all pages are pinned) is considered
|
||||
* to be very very small: there are should be very large number of
|
||||
* concurrent IO operations and them are limited by max_connections,
|
||||
* We have two choices if all cache pages are pinned (i.e. used in IO operations):
|
||||
* 1. Wait until some of this operation is completed and pages is unpinned
|
||||
* 2. Allocate one more chunk, so that specified cache size is more recommendation than hard limit.
|
||||
* As far as probability of such event (that all pages are pinned) is considered to be very very small:
|
||||
* there are should be very large number of concurrent IO operations and them are limited by max_connections,
|
||||
* we prefer not to complicate code and use second approach.
|
||||
*/
|
||||
if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry *victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
|
||||
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
Assert(victim->access_count == 0);
|
||||
entry->offset = victim->offset; /* grab victim's chunk */
|
||||
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
|
||||
@@ -556,14 +533,13 @@ void
|
||||
else
|
||||
{
|
||||
lfc_ctl->used += 1;
|
||||
entry->offset = lfc_ctl->size++; /* allocate new chunk at end
|
||||
* of file */
|
||||
entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
|
||||
}
|
||||
entry->access_count = 1;
|
||||
memset(entry->bitmap, 0, sizeof entry->bitmap);
|
||||
}
|
||||
|
||||
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t) entry->offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
|
||||
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
|
||||
if (rc != BLCKSZ)
|
||||
{
|
||||
LWLockRelease(lfc_lock);
|
||||
@@ -625,9 +601,9 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
HASH_SEQ_STATUS status;
|
||||
FileCacheEntry *entry;
|
||||
uint32 n_pages = 0;
|
||||
HASH_SEQ_STATUS status;
|
||||
FileCacheEntry* entry;
|
||||
uint32 n_pages = 0;
|
||||
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
|
||||
@@ -677,8 +653,8 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
{
|
||||
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
|
||||
n_pages += (entry->bitmap[i >> 5] & (1 << (i & 31))) != 0;
|
||||
@@ -704,14 +680,14 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
* locks, so the information of each buffer is self-consistent.
|
||||
*/
|
||||
n_pages = 0;
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
hash_seq_init(&status, lfc_hash);
|
||||
while ((entry = hash_seq_search(&status)) != NULL)
|
||||
{
|
||||
for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
|
||||
{
|
||||
if (entry->bitmap[i >> 5] & (1 << (i & 31)))
|
||||
{
|
||||
fctx->record[n_pages].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
|
||||
fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
|
||||
fctx->record[n_pages].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
|
||||
fctx->record[n_pages].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
|
||||
fctx->record[n_pages].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
|
||||
|
||||
@@ -60,7 +60,7 @@ int flush_every_n_requests = 8;
|
||||
int n_reconnect_attempts = 0;
|
||||
int max_reconnect_attempts = 60;
|
||||
|
||||
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
||||
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
|
||||
|
||||
static bool pageserver_flush(void);
|
||||
|
||||
@@ -80,10 +80,11 @@ pageserver_connect(int elevel)
|
||||
* neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
|
||||
* variable was set, use that as the password.
|
||||
*
|
||||
* The connection options are parsed in the order they're given, so when
|
||||
* we set the password before the connection string, the connection string
|
||||
* can override the password from the env variable. Seems useful, although
|
||||
* we don't currently use that capability anywhere.
|
||||
* The connection options are parsed in the order they're given, so
|
||||
* when we set the password before the connection string, the
|
||||
* connection string can override the password from the env variable.
|
||||
* Seems useful, although we don't currently use that capability
|
||||
* anywhere.
|
||||
*/
|
||||
n = 0;
|
||||
if (neon_auth_token)
|
||||
@@ -126,9 +127,9 @@ pageserver_connect(int elevel)
|
||||
|
||||
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
NULL, NULL);
|
||||
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
|
||||
|
||||
while (PQisBusy(pageserver_conn))
|
||||
@@ -193,7 +194,6 @@ retry:
|
||||
if (!PQconsumeInput(pageserver_conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
neon_log(LOG, "could not get response from pageserver: %s", msg);
|
||||
pfree(msg);
|
||||
return -1;
|
||||
@@ -234,7 +234,7 @@ pageserver_disconnect(void)
|
||||
}
|
||||
|
||||
static bool
|
||||
pageserver_send(NeonRequest *request)
|
||||
pageserver_send(NeonRequest * request)
|
||||
{
|
||||
StringInfoData req_buff;
|
||||
|
||||
@@ -249,12 +249,10 @@ pageserver_send(NeonRequest *request)
|
||||
|
||||
/*
|
||||
* If pageserver is stopped, the connections from compute node are broken.
|
||||
* The compute node doesn't notice that immediately, but it will cause the
|
||||
* next request to fail, usually on the next query. That causes
|
||||
* user-visible errors if pageserver is restarted, or the tenant is moved
|
||||
* from one pageserver to another. See
|
||||
* https://github.com/neondatabase/neon/issues/1138 So try to reestablish
|
||||
* connection in case of failure.
|
||||
* The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
|
||||
* That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
|
||||
* See https://github.com/neondatabase/neon/issues/1138
|
||||
* So try to reestablish connection in case of failure.
|
||||
*/
|
||||
if (!connected)
|
||||
{
|
||||
@@ -277,7 +275,6 @@ pageserver_send(NeonRequest *request)
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
|
||||
pfree(msg);
|
||||
@@ -335,8 +332,7 @@ pageserver_receive(void)
|
||||
}
|
||||
else if (rc == -2)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
char* msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
|
||||
}
|
||||
@@ -370,7 +366,6 @@ pageserver_flush(void)
|
||||
if (PQflush(pageserver_conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
pageserver_disconnect();
|
||||
neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
|
||||
pfree(msg);
|
||||
@@ -473,10 +468,7 @@ pg_init_libpagestore(void)
|
||||
neon_log(PageStoreTrace, "libpagestore already loaded");
|
||||
page_server = &api;
|
||||
|
||||
/*
|
||||
* Retrieve the auth token to use when connecting to pageserver and
|
||||
* safekeepers
|
||||
*/
|
||||
/* Retrieve the auth token to use when connecting to pageserver and safekeepers */
|
||||
neon_auth_token = getenv("NEON_AUTH_TOKEN");
|
||||
if (neon_auth_token)
|
||||
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
|
||||
|
||||
@@ -1,96 +0,0 @@
|
||||
/*
|
||||
* Interface to set of libpq wrappers walproposer and neon_walreader need.
|
||||
* Similar to libpqwalreceiver, but it has blocking connection establishment and
|
||||
* pqexec which don't fit us. Implementation is at walproposer_pg.c.
|
||||
*/
|
||||
#ifndef ___LIBPQWALPROPOSER_H__
|
||||
#define ___LIBPQWALPROPOSER_H__
|
||||
|
||||
/* Re-exported and modified ExecStatusType */
|
||||
typedef enum
|
||||
{
|
||||
/* We received a single CopyBoth result */
|
||||
WP_EXEC_SUCCESS_COPYBOTH,
|
||||
|
||||
/*
|
||||
* Any success result other than a single CopyBoth was received. The
|
||||
* specifics of the result were already logged, but it may be useful to
|
||||
* provide an error message indicating which safekeeper messed up.
|
||||
*
|
||||
* Do not expect PQerrorMessage to be appropriately set.
|
||||
*/
|
||||
WP_EXEC_UNEXPECTED_SUCCESS,
|
||||
|
||||
/*
|
||||
* No result available at this time. Wait until read-ready, then call
|
||||
* again. Internally, this is returned when PQisBusy indicates that
|
||||
* PQgetResult would block.
|
||||
*/
|
||||
WP_EXEC_NEEDS_INPUT,
|
||||
/* Catch-all failure. Check PQerrorMessage. */
|
||||
WP_EXEC_FAILED,
|
||||
} WalProposerExecStatusType;
|
||||
|
||||
/* Possible return values from walprop_async_read */
|
||||
typedef enum
|
||||
{
|
||||
/* The full read was successful. buf now points to the data */
|
||||
PG_ASYNC_READ_SUCCESS,
|
||||
|
||||
/*
|
||||
* The read is ongoing. Wait until the connection is read-ready, then try
|
||||
* again.
|
||||
*/
|
||||
PG_ASYNC_READ_TRY_AGAIN,
|
||||
/* Reading failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_READ_FAIL,
|
||||
} PGAsyncReadResult;
|
||||
|
||||
/* Possible return values from walprop_async_write */
|
||||
typedef enum
|
||||
{
|
||||
/* The write fully completed */
|
||||
PG_ASYNC_WRITE_SUCCESS,
|
||||
|
||||
/*
|
||||
* The write started, but you'll need to call PQflush some more times to
|
||||
* finish it off. We just tried, so it's best to wait until the connection
|
||||
* is read- or write-ready to try again.
|
||||
*
|
||||
* If it becomes read-ready, call PQconsumeInput and flush again. If it
|
||||
* becomes write-ready, just call PQflush.
|
||||
*/
|
||||
PG_ASYNC_WRITE_TRY_FLUSH,
|
||||
/* Writing failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_WRITE_FAIL,
|
||||
} PGAsyncWriteResult;
|
||||
|
||||
/*
|
||||
* This header is included by walproposer.h to define walproposer_api; if we're
|
||||
* building walproposer without pg, ignore libpq part, leaving only interface
|
||||
* types.
|
||||
*/
|
||||
#ifndef WALPROPOSER_LIB
|
||||
|
||||
#include "libpq-fe.h"
|
||||
|
||||
/*
|
||||
* Sometimes working directly with underlying PGconn is simpler, export the
|
||||
* whole thing for simplicity.
|
||||
*/
|
||||
typedef struct WalProposerConn
|
||||
{
|
||||
PGconn *pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received CopyData message from
|
||||
* walprop_async_read */
|
||||
} WalProposerConn;
|
||||
|
||||
extern WalProposerConn *libpqwp_connect_start(char *conninfo);
|
||||
extern bool libpqwp_send_query(WalProposerConn *conn, char *query);
|
||||
extern WalProposerExecStatusType libpqwp_get_query_result(WalProposerConn *conn);
|
||||
extern PGAsyncReadResult libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount);
|
||||
extern void libpqwp_disconnect(WalProposerConn *conn);
|
||||
|
||||
#endif /* WALPROPOSER_LIB */
|
||||
#endif /* ___LIBPQWALPROPOSER_H__ */
|
||||
@@ -48,9 +48,9 @@ _PG_init(void)
|
||||
|
||||
pg_init_extension_server();
|
||||
|
||||
/* Important: This must happen after other parts of the extension */
|
||||
/* are loaded, otherwise any settings to GUCs that were set before */
|
||||
/* the extension was loaded will be removed. */
|
||||
// Important: This must happen after other parts of the extension
|
||||
// are loaded, otherwise any settings to GUCs that were set before
|
||||
// the extension was loaded will be removed.
|
||||
EmitWarningsOnPlaceholders("neon");
|
||||
}
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ extern void pg_init_extension_server(void);
|
||||
* block_id; false otherwise.
|
||||
*/
|
||||
extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
|
||||
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
|
||||
extern bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);
|
||||
|
||||
extern uint64 BackpressureThrottlingTime(void);
|
||||
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
|
||||
#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
|
||||
|
||||
#else /* major version >= 16 */
|
||||
#else /* major version >= 16 */
|
||||
|
||||
#define USE_RELFILELOCATOR
|
||||
|
||||
@@ -109,4 +109,4 @@
|
||||
#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
|
||||
#endif
|
||||
|
||||
#endif /* //NEON_PGVERSIONCOMPAT_H */
|
||||
#endif //NEON_PGVERSIONCOMPAT_H
|
||||
|
||||
@@ -1,731 +0,0 @@
|
||||
/*
|
||||
* Like WALRead, but when WAL segment doesn't exist locally instead of throwing
|
||||
* ERROR asynchronously tries to fetch it from the most advanced safekeeper.
|
||||
*
|
||||
* We can't use libpqwalreceiver as it blocks during connection establishment
|
||||
* (and waiting for PQExec result), so use libpqwalproposer instead.
|
||||
*
|
||||
* TODO: keepalives are currently never sent, so the other side can close the
|
||||
* connection prematurely.
|
||||
*
|
||||
* TODO: close conn if reading takes too long to prevent stuck connections.
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "access/xlog_internal.h"
|
||||
#include "access/xlogdefs.h"
|
||||
#include "access/xlogreader.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "storage/fd.h"
|
||||
#include "utils/wait_event.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
#define NEON_WALREADER_ERR_MSG_LEN 512
|
||||
|
||||
/*
|
||||
* Can be called where NeonWALReader *state is available in the context, adds log_prefix.
|
||||
*/
|
||||
#define nwr_log(elevel, fmt, ...) elog(elevel, "%s" fmt, state->log_prefix, ## __VA_ARGS__)
|
||||
|
||||
static NeonWALReadResult NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
static NeonWALReadResult NeonWALReaderReadMsg(NeonWALReader *state);
|
||||
static void NeonWALReaderResetRemote(NeonWALReader *state);
|
||||
static bool NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
static bool neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo, TimeLineID *tli_p);
|
||||
static void neon_wal_segment_close(NeonWALReader *state);
|
||||
static bool is_wal_segment_exists(XLogSegNo segno, int segsize,
|
||||
TimeLineID tli);
|
||||
|
||||
/*
|
||||
* State of connection to donor safekeeper.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
/* no remote connection */
|
||||
RS_NONE,
|
||||
/* doing PQconnectPoll, need readable socket */
|
||||
RS_CONNECTING_READ,
|
||||
/* doing PQconnectPoll, need writable socket */
|
||||
RS_CONNECTING_WRITE,
|
||||
/* Waiting for START_REPLICATION result */
|
||||
RS_WAIT_EXEC_RESULT,
|
||||
/* replication stream established */
|
||||
RS_ESTABLISHED,
|
||||
} NeonWALReaderRemoteState;
|
||||
|
||||
struct NeonWALReader
|
||||
{
|
||||
/*
|
||||
* LSN before which we assume WAL is not available locally. Exists because
|
||||
* though first segment after startup always exists, part before
|
||||
* basebackup LSN is filled with zeros.
|
||||
*/
|
||||
XLogRecPtr available_lsn;
|
||||
WALSegmentContext segcxt;
|
||||
WALOpenSegment seg;
|
||||
int wre_errno;
|
||||
/* Explains failure to read, static for simplicity. */
|
||||
char err_msg[NEON_WALREADER_ERR_MSG_LEN];
|
||||
|
||||
/*
|
||||
* Saved info about request in progress, used to check validity of
|
||||
* arguments after resume and remember how far we accomplished it. req_lsn
|
||||
* is 0 if there is no request in progress.
|
||||
*/
|
||||
XLogRecPtr req_lsn;
|
||||
Size req_len;
|
||||
Size req_progress;
|
||||
WalProposer *wp; /* we learn donor through walproposer */
|
||||
char donor_name[64]; /* saved donor safekeeper name for logging */
|
||||
/* state of connection to safekeeper */
|
||||
NeonWALReaderRemoteState rem_state;
|
||||
WalProposerConn *wp_conn;
|
||||
|
||||
/*
|
||||
* position in wp_conn recvbuf from which we'll copy WAL next time, or
|
||||
* NULL if there is no unprocessed message
|
||||
*/
|
||||
char *wal_ptr;
|
||||
Size wal_rem_len; /* how many unprocessed bytes left in recvbuf */
|
||||
|
||||
/*
|
||||
* LSN of wal_ptr position according to walsender to cross check against
|
||||
* read request
|
||||
*/
|
||||
XLogRecPtr rem_lsn;
|
||||
|
||||
/* prepended to lines logged by neon_walreader, if provided */
|
||||
char log_prefix[64];
|
||||
};
|
||||
|
||||
/* palloc and initialize NeonWALReader */
|
||||
NeonWALReader *
|
||||
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix)
|
||||
{
|
||||
NeonWALReader *reader;
|
||||
|
||||
reader = (NeonWALReader *)
|
||||
palloc_extended(sizeof(NeonWALReader),
|
||||
MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO);
|
||||
if (!reader)
|
||||
return NULL;
|
||||
|
||||
reader->available_lsn = available_lsn;
|
||||
reader->seg.ws_file = -1;
|
||||
reader->seg.ws_segno = 0;
|
||||
reader->seg.ws_tli = 0;
|
||||
reader->segcxt.ws_segsize = wal_segment_size;
|
||||
|
||||
reader->wp = wp;
|
||||
|
||||
reader->rem_state = RS_NONE;
|
||||
|
||||
if (log_prefix)
|
||||
strncpy(reader->log_prefix, log_prefix, sizeof(reader->log_prefix));
|
||||
|
||||
return reader;
|
||||
}
|
||||
|
||||
void
|
||||
NeonWALReaderFree(NeonWALReader *state)
|
||||
{
|
||||
if (state->seg.ws_file != -1)
|
||||
neon_wal_segment_close(state);
|
||||
if (state->wp_conn)
|
||||
libpqwp_disconnect(state->wp_conn);
|
||||
pfree(state);
|
||||
}
|
||||
|
||||
/*
|
||||
* Like vanilla WALRead, but if requested position is before available_lsn or
|
||||
* WAL segment doesn't exist on disk, it tries to fetch needed segment from the
|
||||
* advanced safekeeper.
|
||||
*
|
||||
* Read 'count' bytes into 'buf', starting at location 'startptr', from WAL
|
||||
* fetched from timeline 'tli'.
|
||||
*
|
||||
* Returns NEON_WALREAD_SUCCESS if succeeded, NEON_WALREAD_ERROR if an error
|
||||
* occurs, in which case 'err' has the desciption. Error always closes remote
|
||||
* connection, if there was any, so socket subscription should be removed.
|
||||
*
|
||||
* NEON_WALREAD_WOULDBLOCK means caller should obtain socket to wait for with
|
||||
* NeonWALReaderSocket and call NeonWALRead again with exactly the same
|
||||
* arguments when NeonWALReaderEvents happen on the socket. Note that per libpq
|
||||
* docs during connection establishment (before first successful read) socket
|
||||
* underneath might change.
|
||||
*
|
||||
* Also, eventually walreader should switch from remote to local read; caller
|
||||
* should remove subscription to socket then by checking NeonWALReaderEvents
|
||||
* after successful read (otherwise next read might reopen the connection with
|
||||
* different socket).
|
||||
*
|
||||
* Reading not monotonically is not supported and will result in error.
|
||||
*
|
||||
* Caller should be sure that WAL up to requested LSN exists, otherwise
|
||||
* NEON_WALREAD_WOULDBLOCK might be always returned.
|
||||
*/
|
||||
NeonWALReadResult
|
||||
NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
/*
|
||||
* If requested data is before known available basebackup lsn or there is
|
||||
* already active remote state, do remote read.
|
||||
*/
|
||||
if (startptr < state->available_lsn || state->rem_state != RS_NONE)
|
||||
{
|
||||
return NeonWALReadRemote(state, buf, startptr, count, tli);
|
||||
}
|
||||
if (NeonWALReadLocal(state, buf, startptr, count, tli))
|
||||
{
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
else if (state->wre_errno == ENOENT)
|
||||
{
|
||||
nwr_log(LOG, "local read failed as segment at %X/%X doesn't exist, attempting remote",
|
||||
LSN_FORMAT_ARGS(startptr));
|
||||
return NeonWALReadRemote(state, buf, startptr, count, tli);
|
||||
}
|
||||
else
|
||||
{
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/* Do the read from remote safekeeper. */
|
||||
static NeonWALReadResult
|
||||
NeonWALReadRemote(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
if (state->rem_state == RS_NONE)
|
||||
{
|
||||
XLogRecPtr donor_lsn;
|
||||
|
||||
/* no connection yet; start one */
|
||||
Safekeeper *donor = GetDonor(state->wp, &donor_lsn);
|
||||
|
||||
if (donor == NULL)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to establish remote connection to fetch WAL: no donor available");
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
snprintf(state->donor_name, sizeof(state->donor_name), "%s:%s", donor->host, donor->port);
|
||||
nwr_log(LOG, "establishing connection to %s, flush_lsn %X/%X to fetch WAL",
|
||||
state->donor_name, LSN_FORMAT_ARGS(donor_lsn));
|
||||
state->wp_conn = libpqwp_connect_start(donor->conninfo);
|
||||
if (PQstatus(state->wp_conn->pg_conn) == CONNECTION_BAD)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to connect to %s to fetch WAL: immediately failed with %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
/* we'll poll immediately */
|
||||
state->rem_state = RS_CONNECTING_READ;
|
||||
}
|
||||
|
||||
if (state->rem_state == RS_CONNECTING_READ || state->rem_state == RS_CONNECTING_WRITE)
|
||||
{
|
||||
switch (PQconnectPoll(state->wp_conn->pg_conn))
|
||||
{
|
||||
case PGRES_POLLING_FAILED:
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to connect to %s to fetch WAL: poll error: %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
case PGRES_POLLING_READING:
|
||||
state->rem_state = RS_CONNECTING_READ;
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PGRES_POLLING_WRITING:
|
||||
state->rem_state = RS_CONNECTING_WRITE;
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PGRES_POLLING_OK:
|
||||
{
|
||||
/* connection successfully established */
|
||||
char start_repl_query[128];
|
||||
|
||||
snprintf(start_repl_query, sizeof(start_repl_query),
|
||||
"START_REPLICATION PHYSICAL %X/%X (term='" UINT64_FORMAT "')",
|
||||
LSN_FORMAT_ARGS(startptr), state->wp->propTerm);
|
||||
nwr_log(LOG, "connection to %s to fetch WAL succeeded, running %s",
|
||||
state->donor_name, start_repl_query);
|
||||
if (!libpqwp_send_query(state->wp_conn, start_repl_query))
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"failed to send %s query to %s: %s",
|
||||
start_repl_query, state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
state->rem_state = RS_WAIT_EXEC_RESULT;
|
||||
break;
|
||||
}
|
||||
|
||||
default: /* there is unused PGRES_POLLING_ACTIVE */
|
||||
Assert(false);
|
||||
return NEON_WALREAD_ERROR; /* keep the compiler quiet */
|
||||
}
|
||||
}
|
||||
|
||||
if (state->rem_state == RS_WAIT_EXEC_RESULT)
|
||||
{
|
||||
switch (libpqwp_get_query_result(state->wp_conn))
|
||||
{
|
||||
case WP_EXEC_SUCCESS_COPYBOTH:
|
||||
state->rem_state = RS_ESTABLISHED;
|
||||
break;
|
||||
case WP_EXEC_NEEDS_INPUT:
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case WP_EXEC_FAILED:
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"get START_REPLICATION result from %s failed: %s",
|
||||
state->donor_name, PQerrorMessage(state->wp_conn->pg_conn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
default: /* can't happen */
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"get START_REPLICATION result from %s: unexpected result",
|
||||
state->donor_name);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
Assert(state->rem_state == RS_ESTABLISHED);
|
||||
|
||||
/*
|
||||
* If we had the request before, verify args are the same and advance the
|
||||
* result ptr according to the progress; otherwise register the request.
|
||||
*/
|
||||
if (state->req_lsn != InvalidXLogRecPtr)
|
||||
{
|
||||
if (state->req_lsn != startptr || state->req_len != count)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"args changed during request, was %X/%X %zu, now %X/%X %zu",
|
||||
LSN_FORMAT_ARGS(state->req_lsn), state->req_len, LSN_FORMAT_ARGS(startptr), count);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
nwr_log(DEBUG5, "continuing remote read at req_lsn=%X/%X len=%zu, req_progress=%zu",
|
||||
LSN_FORMAT_ARGS(startptr),
|
||||
count,
|
||||
state->req_progress);
|
||||
buf += state->req_progress;
|
||||
}
|
||||
else
|
||||
{
|
||||
state->req_lsn = startptr;
|
||||
state->req_len = count;
|
||||
state->req_progress = 0;
|
||||
nwr_log(DEBUG5, "starting remote read req_lsn=%X/%X len=%zu",
|
||||
LSN_FORMAT_ARGS(startptr),
|
||||
count);
|
||||
}
|
||||
|
||||
while (true)
|
||||
{
|
||||
Size to_copy;
|
||||
|
||||
/*
|
||||
* If we have no ready data, receive new message.
|
||||
*/
|
||||
if (state->wal_rem_len == 0 &&
|
||||
|
||||
/*
|
||||
* check for the sake of 0 length reads; walproposer does these for
|
||||
* heartbeats, though generally they shouldn't hit remote source.
|
||||
*/
|
||||
state->req_len - state->req_progress > 0)
|
||||
{
|
||||
NeonWALReadResult read_msg_res = NeonWALReaderReadMsg(state);
|
||||
|
||||
if (read_msg_res != NEON_WALREAD_SUCCESS)
|
||||
return read_msg_res;
|
||||
}
|
||||
|
||||
if (state->req_lsn + state->req_progress != state->rem_lsn)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"expected remote WAL at %X/%X but got %X/%X. Non monotonic read requests could have caused this. req_lsn=%X/%X len=%zu",
|
||||
LSN_FORMAT_ARGS(state->req_lsn + state->req_progress),
|
||||
LSN_FORMAT_ARGS(state->rem_lsn),
|
||||
LSN_FORMAT_ARGS(state->req_lsn),
|
||||
state->req_len);
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
|
||||
/* We can copy min of (available, requested) bytes. */
|
||||
to_copy =
|
||||
Min(state->req_len - state->req_progress, state->wal_rem_len);
|
||||
memcpy(buf, state->wal_ptr, to_copy);
|
||||
state->wal_ptr += to_copy;
|
||||
state->wal_rem_len -= to_copy;
|
||||
state->rem_lsn += to_copy;
|
||||
if (state->wal_rem_len == 0)
|
||||
state->wal_ptr = NULL; /* freed by libpqwalproposer */
|
||||
buf += to_copy;
|
||||
state->req_progress += to_copy;
|
||||
if (state->req_progress == state->req_len)
|
||||
{
|
||||
XLogSegNo next_segno;
|
||||
XLogSegNo req_segno;
|
||||
|
||||
XLByteToSeg(state->req_lsn, req_segno, state->segcxt.ws_segsize);
|
||||
XLByteToSeg(state->rem_lsn, next_segno, state->segcxt.ws_segsize);
|
||||
|
||||
/*
|
||||
* Request completed. If there is a chance of serving next one
|
||||
* locally, close the connection.
|
||||
*/
|
||||
if (state->req_lsn < state->available_lsn &&
|
||||
state->rem_lsn >= state->available_lsn)
|
||||
{
|
||||
nwr_log(LOG, "closing remote connection as available_lsn %X/%X crossed and next read at %X/%X is likely to be served locally",
|
||||
LSN_FORMAT_ARGS(state->available_lsn), LSN_FORMAT_ARGS(state->rem_lsn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
}
|
||||
else if (state->rem_lsn >= state->available_lsn && next_segno > req_segno &&
|
||||
is_wal_segment_exists(next_segno, state->segcxt.ws_segsize, tli))
|
||||
{
|
||||
nwr_log(LOG, "closing remote connection as WAL file at next lsn %X/%X exists",
|
||||
LSN_FORMAT_ARGS(state->rem_lsn));
|
||||
NeonWALReaderResetRemote(state);
|
||||
}
|
||||
state->req_lsn = InvalidXLogRecPtr;
|
||||
state->req_len = 0;
|
||||
state->req_progress = 0;
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read one WAL message from the stream, sets state->wal_ptr in case of success.
|
||||
* Resets remote state in case of failure.
|
||||
*/
|
||||
static NeonWALReadResult
|
||||
NeonWALReaderReadMsg(NeonWALReader *state)
|
||||
{
|
||||
while (true) /* loop until we get 'w' */
|
||||
{
|
||||
char *copydata_ptr;
|
||||
int copydata_size;
|
||||
StringInfoData s;
|
||||
char msg_type;
|
||||
int hdrlen;
|
||||
|
||||
Assert(state->rem_state == RS_ESTABLISHED);
|
||||
Assert(state->wal_ptr == NULL && state->wal_rem_len == 0);
|
||||
|
||||
switch (libpqwp_async_read(state->wp_conn,
|
||||
©data_ptr,
|
||||
©data_size))
|
||||
{
|
||||
case PG_ASYNC_READ_SUCCESS:
|
||||
break;
|
||||
case PG_ASYNC_READ_TRY_AGAIN:
|
||||
return NEON_WALREAD_WOULDBLOCK;
|
||||
case PG_ASYNC_READ_FAIL:
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"req_lsn=%X/%X, req_len=%zu, req_progress=%zu, get copydata failed: %s",
|
||||
LSN_FORMAT_ARGS(state->req_lsn),
|
||||
state->req_len,
|
||||
state->req_progress,
|
||||
PQerrorMessage(state->wp_conn->pg_conn));
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* put data on StringInfo to parse */
|
||||
s.data = copydata_ptr;
|
||||
s.len = copydata_size;
|
||||
s.cursor = 0;
|
||||
s.maxlen = -1;
|
||||
|
||||
if (copydata_size == 0)
|
||||
{
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"zero length copydata received");
|
||||
goto err;
|
||||
}
|
||||
msg_type = pq_getmsgbyte(&s);
|
||||
switch (msg_type)
|
||||
{
|
||||
case 'w':
|
||||
{
|
||||
XLogRecPtr start_lsn;
|
||||
|
||||
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
|
||||
if (s.len - s.cursor < hdrlen)
|
||||
{
|
||||
snprintf(state->err_msg,
|
||||
sizeof(state->err_msg),
|
||||
"invalid WAL message received from primary");
|
||||
goto err;
|
||||
}
|
||||
|
||||
start_lsn = pq_getmsgint64(&s);
|
||||
pq_getmsgint64(&s); /* XLogRecPtr end_lsn; */
|
||||
pq_getmsgint64(&s); /* TimestampTz send_time */
|
||||
|
||||
state->rem_lsn = start_lsn;
|
||||
state->wal_rem_len = (Size) (s.len - s.cursor);
|
||||
state->wal_ptr = (char *) pq_getmsgbytes(&s, s.len - s.cursor);
|
||||
nwr_log(DEBUG5, "received WAL msg at %X/%X len %zu",
|
||||
LSN_FORMAT_ARGS(state->rem_lsn), state->wal_rem_len);
|
||||
|
||||
return NEON_WALREAD_SUCCESS;
|
||||
}
|
||||
case 'k':
|
||||
{
|
||||
XLogRecPtr end_lsn;
|
||||
bool reply_requested;
|
||||
|
||||
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(char);
|
||||
if (s.len - s.cursor < hdrlen)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"invalid keepalive message received from primary");
|
||||
goto err;
|
||||
}
|
||||
|
||||
end_lsn = pq_getmsgint64(&s);
|
||||
pq_getmsgint64(&s); /* TimestampTz timestamp; */
|
||||
reply_requested = pq_getmsgbyte(&s);
|
||||
nwr_log(DEBUG5, "received keepalive end_lsn=%X/%X reply_requested=%d",
|
||||
LSN_FORMAT_ARGS(end_lsn),
|
||||
reply_requested);
|
||||
if (end_lsn < state->req_lsn + state->req_len)
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg),
|
||||
"closing remote connection: requested WAL up to %X/%X, but current donor %s has only up to %X/%X",
|
||||
LSN_FORMAT_ARGS(state->req_lsn + state->req_len), state->donor_name, LSN_FORMAT_ARGS(end_lsn));
|
||||
goto err;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
default:
|
||||
nwr_log(WARNING, "invalid replication message type %d", msg_type);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
err:
|
||||
NeonWALReaderResetRemote(state);
|
||||
return NEON_WALREAD_ERROR;
|
||||
}
|
||||
|
||||
/* reset remote connection and request in progress */
|
||||
static void
|
||||
NeonWALReaderResetRemote(NeonWALReader *state)
|
||||
{
|
||||
state->req_lsn = InvalidXLogRecPtr;
|
||||
state->req_len = 0;
|
||||
state->req_progress = 0;
|
||||
state->rem_state = RS_NONE;
|
||||
if (state->wp_conn)
|
||||
{
|
||||
libpqwp_disconnect(state->wp_conn);
|
||||
state->wp_conn = NULL;
|
||||
}
|
||||
state->donor_name[0] = '\0';
|
||||
state->wal_ptr = NULL;
|
||||
state->wal_rem_len = 0;
|
||||
state->rem_lsn = InvalidXLogRecPtr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return socket of connection to remote source. Must be called only when
|
||||
* connection exists (NeonWALReaderEvents returns non zero).
|
||||
*/
|
||||
pgsocket
|
||||
NeonWALReaderSocket(NeonWALReader *state)
|
||||
{
|
||||
if (!state->wp_conn)
|
||||
nwr_log(FATAL, "NeonWALReaderSocket is called without active remote connection");
|
||||
return PQsocket(state->wp_conn->pg_conn);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns events user should wait on connection socket or 0 if remote
|
||||
* connection is not active.
|
||||
*/
|
||||
extern uint32
|
||||
NeonWALReaderEvents(NeonWALReader *state)
|
||||
{
|
||||
switch (state->rem_state)
|
||||
{
|
||||
case RS_NONE:
|
||||
return 0;
|
||||
case RS_CONNECTING_READ:
|
||||
return WL_SOCKET_READABLE;
|
||||
case RS_CONNECTING_WRITE:
|
||||
return WL_SOCKET_WRITEABLE;
|
||||
case RS_WAIT_EXEC_RESULT:
|
||||
case RS_ESTABLISHED:
|
||||
return WL_SOCKET_READABLE;
|
||||
default:
|
||||
Assert(false);
|
||||
return 0; /* make compiler happy */
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli)
|
||||
{
|
||||
char *p;
|
||||
XLogRecPtr recptr;
|
||||
Size nbytes;
|
||||
|
||||
p = buf;
|
||||
recptr = startptr;
|
||||
nbytes = count;
|
||||
|
||||
while (nbytes > 0)
|
||||
{
|
||||
uint32 startoff;
|
||||
int segbytes;
|
||||
int readbytes;
|
||||
|
||||
startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize);
|
||||
|
||||
/*
|
||||
* If the data we want is not in a segment we have open, close what we
|
||||
* have (if anything) and open the next one, using the caller's
|
||||
* provided openSegment callback.
|
||||
*/
|
||||
if (state->seg.ws_file < 0 ||
|
||||
!XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) ||
|
||||
tli != state->seg.ws_tli)
|
||||
{
|
||||
XLogSegNo nextSegNo;
|
||||
|
||||
neon_wal_segment_close(state);
|
||||
|
||||
XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize);
|
||||
if (!neon_wal_segment_open(state, nextSegNo, &tli))
|
||||
{
|
||||
char fname[MAXFNAMELEN];
|
||||
|
||||
state->wre_errno = errno;
|
||||
|
||||
XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize);
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "failed to open WAL segment %s while reading at %X/%X: %s",
|
||||
fname, LSN_FORMAT_ARGS(recptr), strerror(state->wre_errno));
|
||||
return false;
|
||||
}
|
||||
|
||||
/* This shouldn't happen -- indicates a bug in segment_open */
|
||||
Assert(state->seg.ws_file >= 0);
|
||||
|
||||
/* Update the current segment info. */
|
||||
state->seg.ws_tli = tli;
|
||||
state->seg.ws_segno = nextSegNo;
|
||||
}
|
||||
|
||||
/* How many bytes are within this segment? */
|
||||
if (nbytes > (state->segcxt.ws_segsize - startoff))
|
||||
segbytes = state->segcxt.ws_segsize - startoff;
|
||||
else
|
||||
segbytes = nbytes;
|
||||
|
||||
#ifndef FRONTEND
|
||||
pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
|
||||
#endif
|
||||
|
||||
/* Reset errno first; eases reporting non-errno-affecting errors */
|
||||
errno = 0;
|
||||
readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff);
|
||||
|
||||
#ifndef FRONTEND
|
||||
pgstat_report_wait_end();
|
||||
#endif
|
||||
|
||||
if (readbytes <= 0)
|
||||
{
|
||||
char fname[MAXFNAMELEN];
|
||||
|
||||
XLogFileName(fname, state->seg.ws_tli, state->seg.ws_segno, state->segcxt.ws_segsize);
|
||||
|
||||
if (readbytes < 0)
|
||||
{
|
||||
state->wre_errno = errno;
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: %s",
|
||||
fname, startoff, strerror(state->wre_errno));
|
||||
}
|
||||
else
|
||||
{
|
||||
snprintf(state->err_msg, sizeof(state->err_msg), "could not read from log segment %s, offset %d: %m: unexpected EOF",
|
||||
fname, startoff);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Update state for read */
|
||||
recptr += readbytes;
|
||||
nbytes -= readbytes;
|
||||
p += readbytes;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy of vanilla wal_segment_open, but returns false in case of error instead
|
||||
* of ERROR, with errno set.
|
||||
*
|
||||
* XLogReaderRoutine->segment_open callback for local pg_wal files
|
||||
*/
|
||||
static bool
|
||||
neon_wal_segment_open(NeonWALReader *state, XLogSegNo nextSegNo,
|
||||
TimeLineID *tli_p)
|
||||
{
|
||||
TimeLineID tli = *tli_p;
|
||||
char path[MAXPGPATH];
|
||||
|
||||
XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
|
||||
nwr_log(LOG, "opening %s", path);
|
||||
state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
|
||||
if (state->seg.ws_file >= 0)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool
|
||||
is_wal_segment_exists(XLogSegNo segno, int segsize, TimeLineID tli)
|
||||
{
|
||||
struct stat stat_buffer;
|
||||
char path[MAXPGPATH];
|
||||
|
||||
XLogFilePath(path, tli, segno, segsize);
|
||||
return stat(path, &stat_buffer) == 0;
|
||||
}
|
||||
|
||||
/* copy of vanilla wal_segment_close with NeonWALReader */
|
||||
static void
|
||||
neon_wal_segment_close(NeonWALReader *state)
|
||||
{
|
||||
if (state->seg.ws_file >= 0)
|
||||
{
|
||||
close(state->seg.ws_file);
|
||||
/* need to check errno? */
|
||||
state->seg.ws_file = -1;
|
||||
}
|
||||
}
|
||||
|
||||
char *
|
||||
NeonWALReaderErrMsg(NeonWALReader *state)
|
||||
{
|
||||
return state->err_msg;
|
||||
}
|
||||
@@ -1,29 +0,0 @@
|
||||
#ifndef __NEON_WALREADER_H__
|
||||
#define __NEON_WALREADER_H__
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
|
||||
/* forward declare so we don't have to expose the struct to the public */
|
||||
struct NeonWALReader;
|
||||
typedef struct NeonWALReader NeonWALReader;
|
||||
|
||||
/* avoid including walproposer.h as it includes us */
|
||||
struct WalProposer;
|
||||
typedef struct WalProposer WalProposer;
|
||||
|
||||
/* NeonWALRead return value */
|
||||
typedef enum
|
||||
{
|
||||
NEON_WALREAD_SUCCESS,
|
||||
NEON_WALREAD_WOULDBLOCK,
|
||||
NEON_WALREAD_ERROR,
|
||||
} NeonWALReadResult;
|
||||
|
||||
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, WalProposer *wp, char *log_prefix);
|
||||
extern void NeonWALReaderFree(NeonWALReader *state);
|
||||
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
|
||||
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
|
||||
extern uint32 NeonWALReaderEvents(NeonWALReader *state);
|
||||
extern char *NeonWALReaderErrMsg(NeonWALReader *state);
|
||||
|
||||
#endif /* __NEON_WALREADER_H__ */
|
||||
@@ -40,13 +40,13 @@ typedef enum
|
||||
T_NeonGetPageResponse,
|
||||
T_NeonErrorResponse,
|
||||
T_NeonDbSizeResponse,
|
||||
} NeonMessageTag;
|
||||
} NeonMessageTag;
|
||||
|
||||
/* base struct for c-style inheritance */
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
} NeonMessage;
|
||||
} NeonMessage;
|
||||
|
||||
#define messageTag(m) (((const NeonMessage *)(m))->tag)
|
||||
|
||||
@@ -67,27 +67,27 @@ typedef struct
|
||||
NeonMessageTag tag;
|
||||
bool latest; /* if true, request latest page version */
|
||||
XLogRecPtr lsn; /* request page version @ this LSN */
|
||||
} NeonRequest;
|
||||
} NeonRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest req;
|
||||
NRelFileInfo rinfo;
|
||||
ForkNumber forknum;
|
||||
} NeonExistsRequest;
|
||||
} NeonExistsRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest req;
|
||||
NRelFileInfo rinfo;
|
||||
ForkNumber forknum;
|
||||
} NeonNblocksRequest;
|
||||
} NeonNblocksRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest req;
|
||||
Oid dbNode;
|
||||
} NeonDbSizeRequest;
|
||||
} NeonDbSizeRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@@ -95,31 +95,31 @@ typedef struct
|
||||
NRelFileInfo rinfo;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blkno;
|
||||
} NeonGetPageRequest;
|
||||
} NeonGetPageRequest;
|
||||
|
||||
/* supertype of all the Neon*Response structs below */
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
} NeonResponse;
|
||||
} NeonResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
bool exists;
|
||||
} NeonExistsResponse;
|
||||
} NeonExistsResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
uint32 n_blocks;
|
||||
} NeonNblocksResponse;
|
||||
} NeonNblocksResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
char page[FLEXIBLE_ARRAY_MEMBER];
|
||||
} NeonGetPageResponse;
|
||||
} NeonGetPageResponse;
|
||||
|
||||
#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))
|
||||
|
||||
@@ -127,18 +127,18 @@ typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
int64 db_size;
|
||||
} NeonDbSizeResponse;
|
||||
} NeonDbSizeResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error
|
||||
* message */
|
||||
} NeonErrorResponse;
|
||||
} NeonErrorResponse;
|
||||
|
||||
extern StringInfoData nm_pack_request(NeonRequest *msg);
|
||||
extern NeonResponse *nm_unpack_response(StringInfo s);
|
||||
extern char *nm_to_string(NeonMessage *msg);
|
||||
extern StringInfoData nm_pack_request(NeonRequest * msg);
|
||||
extern NeonResponse * nm_unpack_response(StringInfo s);
|
||||
extern char *nm_to_string(NeonMessage * msg);
|
||||
|
||||
/*
|
||||
* API
|
||||
@@ -146,20 +146,20 @@ extern char *nm_to_string(NeonMessage *msg);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
bool (*send) (NeonRequest *request);
|
||||
bool (*send) (NeonRequest * request);
|
||||
NeonResponse *(*receive) (void);
|
||||
bool (*flush) (void);
|
||||
} page_server_api;
|
||||
} page_server_api;
|
||||
|
||||
extern void prefetch_on_ps_disconnect(void);
|
||||
|
||||
extern page_server_api *page_server;
|
||||
extern page_server_api * page_server;
|
||||
|
||||
extern char *page_server_connstring;
|
||||
extern int flush_every_n_requests;
|
||||
extern int readahead_buffer_size;
|
||||
extern int flush_every_n_requests;
|
||||
extern int readahead_buffer_size;
|
||||
extern bool seqscan_prefetch_enabled;
|
||||
extern int seqscan_prefetch_distance;
|
||||
extern int seqscan_prefetch_distance;
|
||||
extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
extern bool wal_redo;
|
||||
@@ -194,14 +194,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
|
||||
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer);
|
||||
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer);
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer);
|
||||
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
#else
|
||||
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
void *buffer);
|
||||
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, bool request_latest, void *buffer);
|
||||
XLogRecPtr request_lsn, bool request_latest, void *buffer);
|
||||
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, const void *buffer, bool skipFsync);
|
||||
#endif
|
||||
|
||||
@@ -63,6 +63,7 @@
|
||||
#include "storage/md.h"
|
||||
#include "pgstat.h"
|
||||
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
#include "access/xlogutils.h"
|
||||
#include "access/xlogrecovery.h"
|
||||
@@ -100,21 +101,21 @@ typedef enum
|
||||
UNLOGGED_BUILD_PHASE_1,
|
||||
UNLOGGED_BUILD_PHASE_2,
|
||||
UNLOGGED_BUILD_NOT_PERMANENT
|
||||
} UnloggedBuildPhase;
|
||||
} UnloggedBuildPhase;
|
||||
|
||||
static SMgrRelation unlogged_build_rel = NULL;
|
||||
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
|
||||
/*
|
||||
* Prefetch implementation:
|
||||
*
|
||||
*
|
||||
* Prefetch is performed locally by each backend.
|
||||
*
|
||||
* There can be up to readahead_buffer_size active IO requests registered at
|
||||
* any time. Requests using smgr_prefetch are sent to the pageserver, but we
|
||||
* don't wait on the response. Requests using smgr_read are either read from
|
||||
* the buffer, or (if that's not possible) we wait on the response to arrive -
|
||||
* this also will allow us to receive other prefetched pages.
|
||||
* this also will allow us to receive other prefetched pages.
|
||||
* Each request is immediately written to the output buffer of the pageserver
|
||||
* connection, but may not be flushed if smgr_prefetch is used: pageserver
|
||||
* flushes sent requests on manual flush, or every neon.flush_output_after
|
||||
@@ -138,7 +139,7 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
|
||||
/*
|
||||
* State machine:
|
||||
*
|
||||
*
|
||||
* not in hash : in hash
|
||||
* :
|
||||
* UNUSED ------> REQUESTED --> RECEIVED
|
||||
@@ -149,34 +150,30 @@ static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
|
||||
* +----------------+------------+
|
||||
* :
|
||||
*/
|
||||
typedef enum PrefetchStatus
|
||||
{
|
||||
PRFS_UNUSED = 0, /* unused slot */
|
||||
PRFS_REQUESTED, /* request was written to the sendbuffer to
|
||||
* PS, but not necessarily flushed. all fields
|
||||
* except response valid */
|
||||
PRFS_RECEIVED, /* all fields valid */
|
||||
PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still
|
||||
* valid */
|
||||
typedef enum PrefetchStatus {
|
||||
PRFS_UNUSED = 0, /* unused slot */
|
||||
PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not
|
||||
* necessarily flushed.
|
||||
* all fields except response valid */
|
||||
PRFS_RECEIVED, /* all fields valid */
|
||||
PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */
|
||||
} PrefetchStatus;
|
||||
|
||||
typedef struct PrefetchRequest
|
||||
{
|
||||
BufferTag buftag; /* must be first entry in the struct */
|
||||
typedef struct PrefetchRequest {
|
||||
BufferTag buftag; /* must be first entry in the struct */
|
||||
XLogRecPtr effective_request_lsn;
|
||||
XLogRecPtr actual_request_lsn;
|
||||
NeonResponse *response; /* may be null */
|
||||
NeonResponse *response; /* may be null */
|
||||
PrefetchStatus status;
|
||||
uint64 my_ring_index;
|
||||
} PrefetchRequest;
|
||||
|
||||
/* prefetch buffer lookup hash table */
|
||||
|
||||
typedef struct PrfHashEntry
|
||||
{
|
||||
typedef struct PrfHashEntry {
|
||||
PrefetchRequest *slot;
|
||||
uint32 status;
|
||||
uint32 hash;
|
||||
uint32 status;
|
||||
uint32 hash;
|
||||
} PrfHashEntry;
|
||||
|
||||
#define SH_PREFIX prfh
|
||||
@@ -200,42 +197,36 @@ typedef struct PrfHashEntry
|
||||
/*
|
||||
* PrefetchState maintains the state of (prefetch) getPage@LSN requests.
|
||||
* It maintains a (ring) buffer of in-flight requests and responses.
|
||||
*
|
||||
*
|
||||
* We maintain several indexes into the ring buffer:
|
||||
* ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
|
||||
*
|
||||
*
|
||||
* ring_unused points to the first unused slot of the buffer
|
||||
* ring_receive is the next request that is to be received
|
||||
* ring_last is the oldest received entry in the buffer
|
||||
*
|
||||
*
|
||||
* Apart from being an entry in the ring buffer of prefetch requests, each
|
||||
* PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
|
||||
*/
|
||||
typedef struct PrefetchState
|
||||
{
|
||||
MemoryContext bufctx; /* context for prf_buffer[].response
|
||||
* allocations */
|
||||
MemoryContext errctx; /* context for prf_buffer[].response
|
||||
* allocations */
|
||||
MemoryContext hashctx; /* context for prf_buffer */
|
||||
typedef struct PrefetchState {
|
||||
MemoryContext bufctx; /* context for prf_buffer[].response allocations */
|
||||
MemoryContext errctx; /* context for prf_buffer[].response allocations */
|
||||
MemoryContext hashctx; /* context for prf_buffer */
|
||||
|
||||
/* buffer indexes */
|
||||
uint64 ring_unused; /* first unused slot */
|
||||
uint64 ring_flush; /* next request to flush */
|
||||
uint64 ring_receive; /* next slot that is to receive a response */
|
||||
uint64 ring_last; /* min slot with a response value */
|
||||
uint64 ring_unused; /* first unused slot */
|
||||
uint64 ring_flush; /* next request to flush */
|
||||
uint64 ring_receive; /* next slot that is to receive a response */
|
||||
uint64 ring_last; /* min slot with a response value */
|
||||
|
||||
/* metrics / statistics */
|
||||
int n_responses_buffered; /* count of PS responses not yet in
|
||||
* buffers */
|
||||
int n_requests_inflight; /* count of PS requests considered in
|
||||
* flight */
|
||||
int n_unused; /* count of buffers < unused, > last, that are
|
||||
* also unused */
|
||||
int n_responses_buffered; /* count of PS responses not yet in buffers */
|
||||
int n_requests_inflight; /* count of PS requests considered in flight */
|
||||
int n_unused; /* count of buffers < unused, > last, that are also unused */
|
||||
|
||||
/* the buffers */
|
||||
prfh_hash *prf_hash;
|
||||
PrefetchRequest prf_buffer[]; /* prefetch buffers */
|
||||
prfh_hash *prf_hash;
|
||||
PrefetchRequest prf_buffer[]; /* prefetch buffers */
|
||||
} PrefetchState;
|
||||
|
||||
PrefetchState *MyPState;
|
||||
@@ -273,10 +264,10 @@ static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
|
||||
static bool
|
||||
compact_prefetch_buffers(void)
|
||||
{
|
||||
uint64 empty_ring_index = MyPState->ring_last;
|
||||
uint64 search_ring_index = MyPState->ring_receive;
|
||||
int n_moved = 0;
|
||||
|
||||
uint64 empty_ring_index = MyPState->ring_last;
|
||||
uint64 search_ring_index = MyPState->ring_receive;
|
||||
int n_moved = 0;
|
||||
|
||||
if (MyPState->ring_receive == MyPState->ring_last)
|
||||
return false;
|
||||
|
||||
@@ -291,14 +282,15 @@ compact_prefetch_buffers(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* Here we have established: slots < search_ring_index have an unknown
|
||||
* state (not scanned) slots >= search_ring_index and <= empty_ring_index
|
||||
* are unused slots > empty_ring_index are in use, or outside our buffer's
|
||||
* range. ... unless search_ring_index <= ring_last
|
||||
*
|
||||
* Here we have established:
|
||||
* slots < search_ring_index have an unknown state (not scanned)
|
||||
* slots >= search_ring_index and <= empty_ring_index are unused
|
||||
* slots > empty_ring_index are in use, or outside our buffer's range.
|
||||
* ... unless search_ring_index <= ring_last
|
||||
*
|
||||
* Therefore, there is a gap of at least one unused items between
|
||||
* search_ring_index and empty_ring_index (both inclusive), which grows as
|
||||
* we hit more unused items while moving backwards through the array.
|
||||
* search_ring_index and empty_ring_index (both inclusive), which grows as we hit
|
||||
* more unused items while moving backwards through the array.
|
||||
*/
|
||||
|
||||
while (search_ring_index > MyPState->ring_last)
|
||||
@@ -338,10 +330,7 @@ compact_prefetch_buffers(void)
|
||||
|
||||
/* empty the moved slot */
|
||||
source_slot->status = PRFS_UNUSED;
|
||||
source_slot->buftag = (BufferTag)
|
||||
{
|
||||
0
|
||||
};
|
||||
source_slot->buftag = (BufferTag) {0};
|
||||
source_slot->response = NULL;
|
||||
source_slot->my_ring_index = 0;
|
||||
source_slot->effective_request_lsn = 0;
|
||||
@@ -351,8 +340,8 @@ compact_prefetch_buffers(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* Only when we've moved slots we can expect trailing unused slots, so
|
||||
* only then we clean up trailing unused slots.
|
||||
* Only when we've moved slots we can expect trailing unused slots,
|
||||
* so only then we clean up trailing unused slots.
|
||||
*/
|
||||
if (n_moved > 0)
|
||||
{
|
||||
@@ -369,10 +358,10 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
uint64 end,
|
||||
nfree = newsize;
|
||||
PrefetchState *newPState;
|
||||
Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
|
||||
sizeof(PrefetchRequest) * newsize
|
||||
);
|
||||
|
||||
Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
|
||||
sizeof(PrefetchRequest) * newsize
|
||||
);
|
||||
|
||||
/* don't try to re-initialize if we haven't initialized yet */
|
||||
if (MyPState == NULL)
|
||||
return;
|
||||
@@ -399,12 +388,12 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
newPState->ring_receive = newsize;
|
||||
newPState->ring_flush = newsize;
|
||||
|
||||
/*
|
||||
/*
|
||||
* Copy over the prefetches.
|
||||
*
|
||||
*
|
||||
* We populate the prefetch array from the end; to retain the most recent
|
||||
* prefetches, but this has the benefit of only needing to do one
|
||||
* iteration on the dataset, and trivial compaction.
|
||||
* prefetches, but this has the benefit of only needing to do one iteration
|
||||
* on the dataset, and trivial compaction.
|
||||
*/
|
||||
for (end = MyPState->ring_unused - 1;
|
||||
end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
|
||||
@@ -412,7 +401,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
{
|
||||
PrefetchRequest *slot = GetPrfSlot(end);
|
||||
PrefetchRequest *newslot;
|
||||
bool found;
|
||||
bool found;
|
||||
|
||||
if (slot->status == PRFS_UNUSED)
|
||||
continue;
|
||||
@@ -475,11 +464,10 @@ consume_prefetch_responses(void)
|
||||
static void
|
||||
prefetch_cleanup_trailing_unused(void)
|
||||
{
|
||||
uint64 ring_index;
|
||||
uint64 ring_index;
|
||||
PrefetchRequest *slot;
|
||||
|
||||
while (MyPState->ring_last < MyPState->ring_receive)
|
||||
{
|
||||
while (MyPState->ring_last < MyPState->ring_receive) {
|
||||
ring_index = MyPState->ring_last;
|
||||
slot = GetPrfSlot(ring_index);
|
||||
|
||||
@@ -493,7 +481,7 @@ prefetch_cleanup_trailing_unused(void)
|
||||
/*
|
||||
* Wait for slot of ring_index to have received its response.
|
||||
* The caller is responsible for making sure the request buffer is flushed.
|
||||
*
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
*/
|
||||
@@ -525,7 +513,7 @@ prefetch_wait_for(uint64 ring_index)
|
||||
|
||||
/*
|
||||
* Read the response of a prefetch request into its slot.
|
||||
*
|
||||
*
|
||||
* The caller is responsible for making sure that the request for this buffer
|
||||
* was flushed to the PageServer.
|
||||
*
|
||||
@@ -565,7 +553,7 @@ prefetch_read(PrefetchRequest *slot)
|
||||
|
||||
/*
|
||||
* Disconnect hook - drop prefetches when the connection drops
|
||||
*
|
||||
*
|
||||
* If we don't remove the failed prefetches, we'd be serving incorrect
|
||||
* data to the smgr.
|
||||
*/
|
||||
@@ -576,7 +564,7 @@ prefetch_on_ps_disconnect(void)
|
||||
while (MyPState->ring_receive < MyPState->ring_unused)
|
||||
{
|
||||
PrefetchRequest *slot;
|
||||
uint64 ring_index = MyPState->ring_receive;
|
||||
uint64 ring_index = MyPState->ring_receive;
|
||||
|
||||
slot = GetPrfSlot(ring_index);
|
||||
|
||||
@@ -606,7 +594,7 @@ prefetch_set_unused(uint64 ring_index)
|
||||
PrefetchRequest *slot = GetPrfSlot(ring_index);
|
||||
|
||||
if (ring_index < MyPState->ring_last)
|
||||
return; /* Should already be unused */
|
||||
return; /* Should already be unused */
|
||||
|
||||
Assert(MyPState->ring_unused > ring_index);
|
||||
|
||||
@@ -637,11 +625,7 @@ prefetch_set_unused(uint64 ring_index)
|
||||
/* run cleanup if we're holding back ring_last */
|
||||
if (MyPState->ring_last == ring_index)
|
||||
prefetch_cleanup_trailing_unused();
|
||||
|
||||
/*
|
||||
* ... and try to store the buffered responses more compactly if > 12.5%
|
||||
* of the buffer is gaps
|
||||
*/
|
||||
/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
|
||||
else if (ReceiveBufferNeedsCompaction())
|
||||
compact_prefetch_buffers();
|
||||
}
|
||||
@@ -649,7 +633,7 @@ prefetch_set_unused(uint64 ring_index)
|
||||
static void
|
||||
prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
|
||||
{
|
||||
bool found;
|
||||
bool found;
|
||||
NeonGetPageRequest request = {
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
.req.latest = false,
|
||||
@@ -667,22 +651,21 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
}
|
||||
else
|
||||
{
|
||||
XLogRecPtr lsn = neon_get_request_lsn(
|
||||
&request.req.latest,
|
||||
BufTagGetNRelFileInfo(slot->buftag),
|
||||
slot->buftag.forkNum,
|
||||
slot->buftag.blockNum
|
||||
);
|
||||
|
||||
XLogRecPtr lsn = neon_get_request_lsn(
|
||||
&request.req.latest,
|
||||
BufTagGetNRelFileInfo(slot->buftag),
|
||||
slot->buftag.forkNum,
|
||||
slot->buftag.blockNum
|
||||
);
|
||||
/*
|
||||
* Note: effective_request_lsn is potentially higher than the
|
||||
* requested LSN, but still correct:
|
||||
*
|
||||
* Note: effective_request_lsn is potentially higher than the requested
|
||||
* LSN, but still correct:
|
||||
*
|
||||
* We know there are no changes between the actual requested LSN and
|
||||
* the value of effective_request_lsn: If there were, the page would
|
||||
* have been in cache and evicted between those LSN values, which then
|
||||
* would have had to result in a larger request LSN for this page.
|
||||
*
|
||||
* have been in cache and evicted between those LSN values, which
|
||||
* then would have had to result in a larger request LSN for this page.
|
||||
*
|
||||
* It is possible that a concurrent backend loads the page, modifies
|
||||
* it and then evicts it again, but the LSN of that eviction cannot be
|
||||
* smaller than the current WAL insert/redo pointer, which is already
|
||||
@@ -719,7 +702,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
* prefetch_register_buffer() - register and prefetch buffer
|
||||
*
|
||||
* Register that we may want the contents of BufferTag in the near future.
|
||||
*
|
||||
*
|
||||
* If force_latest and force_lsn are not NULL, those values are sent to the
|
||||
* pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
|
||||
* to fill in these values manually.
|
||||
@@ -731,14 +714,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
static uint64
|
||||
prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
|
||||
{
|
||||
uint64 ring_index;
|
||||
uint64 ring_index;
|
||||
PrefetchRequest req;
|
||||
PrefetchRequest *slot;
|
||||
PrfHashEntry *entry;
|
||||
|
||||
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
|
||||
req.buftag = tag;
|
||||
Retry:
|
||||
Retry:
|
||||
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
|
||||
|
||||
if (entry != NULL)
|
||||
@@ -758,10 +741,7 @@ Retry:
|
||||
*/
|
||||
if (force_latest && force_lsn)
|
||||
{
|
||||
/*
|
||||
* if we want the latest version, any effective_request_lsn <
|
||||
* request lsn is OK
|
||||
*/
|
||||
/* if we want the latest version, any effective_request_lsn < request lsn is OK */
|
||||
if (*force_latest)
|
||||
{
|
||||
if (*force_lsn > slot->effective_request_lsn)
|
||||
@@ -772,11 +752,7 @@ Retry:
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* if we don't want the latest version, only accept requests with
|
||||
* the exact same LSN
|
||||
*/
|
||||
/* if we don't want the latest version, only accept requests with the exact same LSN */
|
||||
else
|
||||
{
|
||||
if (*force_lsn != slot->effective_request_lsn)
|
||||
@@ -823,8 +799,7 @@ Retry:
|
||||
*/
|
||||
if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
|
||||
{
|
||||
uint64 cleanup_index = MyPState->ring_last;
|
||||
|
||||
uint64 cleanup_index = MyPState->ring_last;
|
||||
slot = GetPrfSlot(cleanup_index);
|
||||
|
||||
Assert(slot->status != PRFS_UNUSED);
|
||||
@@ -839,10 +814,7 @@ Retry:
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* We have the slot for ring_last, so that must still be in
|
||||
* progress
|
||||
*/
|
||||
/* We have the slot for ring_last, so that must still be in progress */
|
||||
switch (slot->status)
|
||||
{
|
||||
case PRFS_REQUESTED:
|
||||
@@ -861,8 +833,8 @@ Retry:
|
||||
}
|
||||
|
||||
/*
|
||||
* The next buffer pointed to by `ring_unused` is now definitely empty, so
|
||||
* we can insert the new request to it.
|
||||
* The next buffer pointed to by `ring_unused` is now definitely empty,
|
||||
* so we can insert the new request to it.
|
||||
*/
|
||||
ring_index = MyPState->ring_unused;
|
||||
slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
|
||||
@@ -888,10 +860,7 @@ Retry:
|
||||
{
|
||||
if (!page_server->flush())
|
||||
{
|
||||
/*
|
||||
* Prefetch set is reset in case of error, so we should try to
|
||||
* register our request once again
|
||||
*/
|
||||
/* Prefetch set is reset in case of error, so we should try to register our request once again */
|
||||
goto Retry;
|
||||
}
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
@@ -903,10 +872,8 @@ Retry:
|
||||
static NeonResponse *
|
||||
page_server_request(void const *req)
|
||||
{
|
||||
NeonResponse *resp;
|
||||
|
||||
do
|
||||
{
|
||||
NeonResponse* resp;
|
||||
do {
|
||||
while (!page_server->send((NeonRequest *) req) || !page_server->flush());
|
||||
MyPState->ring_flush = MyPState->ring_unused;
|
||||
consume_prefetch_responses();
|
||||
@@ -918,7 +885,7 @@ page_server_request(void const *req)
|
||||
|
||||
|
||||
StringInfoData
|
||||
nm_pack_request(NeonRequest *msg)
|
||||
nm_pack_request(NeonRequest * msg)
|
||||
{
|
||||
StringInfoData s;
|
||||
|
||||
@@ -1034,7 +1001,7 @@ nm_unpack_response(StringInfo s)
|
||||
/* XXX: should be varlena */
|
||||
memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
|
||||
pq_getmsgend(s);
|
||||
|
||||
|
||||
Assert(msg_resp->tag == T_NeonGetPageResponse);
|
||||
|
||||
resp = (NeonResponse *) msg_resp;
|
||||
@@ -1090,7 +1057,7 @@ nm_unpack_response(StringInfo s)
|
||||
|
||||
/* dump to json for debugging / error reporting purposes */
|
||||
char *
|
||||
nm_to_string(NeonMessage *msg)
|
||||
nm_to_string(NeonMessage * msg)
|
||||
{
|
||||
StringInfoData s;
|
||||
|
||||
@@ -1219,7 +1186,7 @@ nm_to_string(NeonMessage *msg)
|
||||
* directly because it skips the logging if the LSN is new enough.
|
||||
*/
|
||||
static XLogRecPtr
|
||||
log_newpage_copy(NRelFileInfo * rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
log_newpage_copy(NRelFileInfo *rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
Page page, bool page_std)
|
||||
{
|
||||
PGAlignedBlock copied_buffer;
|
||||
@@ -1242,11 +1209,11 @@ PageIsEmptyHeapPage(char *buffer)
|
||||
}
|
||||
|
||||
static void
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
char *buffer, bool force)
|
||||
char *buffer, bool force)
|
||||
#else
|
||||
const char *buffer, bool force)
|
||||
const char *buffer, bool force)
|
||||
#endif
|
||||
{
|
||||
XLogRecPtr lsn = PageGetLSN((Page) buffer);
|
||||
@@ -1346,24 +1313,24 @@ static void
|
||||
void
|
||||
neon_init(void)
|
||||
{
|
||||
Size prfs_size;
|
||||
Size prfs_size;
|
||||
|
||||
if (MyPState != NULL)
|
||||
return;
|
||||
|
||||
prfs_size = offsetof(PrefetchState, prf_buffer) + (
|
||||
sizeof(PrefetchRequest) * readahead_buffer_size
|
||||
);
|
||||
sizeof(PrefetchRequest) * readahead_buffer_size
|
||||
);
|
||||
|
||||
MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
|
||||
|
||||
|
||||
MyPState->n_unused = readahead_buffer_size;
|
||||
|
||||
MyPState->bufctx = SlabContextCreate(TopMemoryContext,
|
||||
"NeonSMGR/prefetch",
|
||||
SLAB_DEFAULT_BLOCK_SIZE * 17,
|
||||
PS_GETPAGERESPONSE_SIZE);
|
||||
MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
|
||||
MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
|
||||
"NeonSMGR/errors",
|
||||
ALLOCSET_DEFAULT_SIZES);
|
||||
MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
|
||||
@@ -1428,6 +1395,12 @@ neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, Block
|
||||
elog(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
}
|
||||
else if (am_walsender)
|
||||
{
|
||||
*latest = true;
|
||||
lsn = InvalidXLogRecPtr;
|
||||
elog(DEBUG1, "am walsender neon_get_request_lsn lsn 0 ");
|
||||
}
|
||||
else
|
||||
{
|
||||
XLogRecPtr flushlsn;
|
||||
@@ -1603,14 +1576,14 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
/*
|
||||
* Newly created relation is empty, remember that in the relsize cache.
|
||||
*
|
||||
* Note that in REDO, this is called to make sure the relation fork
|
||||
* exists, but it does not truncate the relation. So, we can only update
|
||||
* the relsize if it didn't exist before.
|
||||
*
|
||||
* Note that in REDO, this is called to make sure the relation fork exists,
|
||||
* but it does not truncate the relation. So, we can only update the
|
||||
* relsize if it didn't exist before.
|
||||
*
|
||||
* Also, in redo, we must make sure to update the cached size of the
|
||||
* relation, as that is the primary source of truth for REDO's file length
|
||||
* considerations, and as file extension isn't (perfectly) logged, we need
|
||||
* to take care of that before we hit file size checks.
|
||||
* relation, as that is the primary source of truth for REDO's
|
||||
* file length considerations, and as file extension isn't (perfectly)
|
||||
* logged, we need to take care of that before we hit file size checks.
|
||||
*
|
||||
* FIXME: This is currently not just an optimization, but required for
|
||||
* correctness. Postgres can call smgrnblocks() on the newly-created
|
||||
@@ -1686,7 +1659,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
#endif
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
BlockNumber n_blocks = 0;
|
||||
BlockNumber n_blocks = 0;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -1727,10 +1700,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
|
||||
/*
|
||||
* Usually Postgres doesn't extend relation on more than one page (leaving
|
||||
* holes). But this rule is violated in PG-15 where
|
||||
* CreateAndCopyRelationData call smgrextend for destination relation n
|
||||
* using size of source relation
|
||||
* Usually Postgres doesn't extend relation on more than one page
|
||||
* (leaving holes). But this rule is violated in PG-15 where CreateAndCopyRelationData
|
||||
* call smgrextend for destination relation n using size of source relation
|
||||
*/
|
||||
n_blocks = neon_nblocks(reln, forkNum);
|
||||
while (n_blocks < blkno)
|
||||
@@ -1751,13 +1723,11 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdextend(reln, forkNum, blkno, buffer, skipFsync);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* smgr_extend is often called with an all-zeroes page, so
|
||||
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
|
||||
* later, after it has been initialized with the real page contents, and
|
||||
* it is eventually evicted from the buffer cache. But we need a valid LSN
|
||||
* to the relation metadata update now.
|
||||
* smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
|
||||
* An smgr_write() call will come for the buffer later, after it has been initialized
|
||||
* with the real page contents, and it is eventually evicted from the buffer cache.
|
||||
* But we need a valid LSN to the relation metadata update now.
|
||||
*/
|
||||
if (lsn == InvalidXLogRecPtr)
|
||||
{
|
||||
@@ -1816,9 +1786,9 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
|
||||
if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
||||
errmsg("cannot extend file \"%s\" beyond %u blocks",
|
||||
relpath(reln->smgr_rlocator, forkNum),
|
||||
InvalidBlockNumber)));
|
||||
errmsg("cannot extend file \"%s\" beyond %u blocks",
|
||||
relpath(reln->smgr_rlocator, forkNum),
|
||||
InvalidBlockNumber)));
|
||||
|
||||
/* Don't log any pages if we're not allowed to do so. */
|
||||
if (!XLogInsertAllowed())
|
||||
@@ -1905,7 +1875,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0: /* probably shouldn't happen, but ignore it */
|
||||
case 0: /* probably shouldn't happen, but ignore it */
|
||||
case RELPERSISTENCE_PERMANENT:
|
||||
break;
|
||||
|
||||
@@ -1920,10 +1890,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
|
||||
return false;
|
||||
|
||||
tag = (BufferTag)
|
||||
{
|
||||
tag = (BufferTag) {
|
||||
.forkNum = forknum,
|
||||
.blockNum = blocknum
|
||||
.blockNum = blocknum
|
||||
};
|
||||
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
|
||||
|
||||
@@ -1978,11 +1947,11 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
* To avoid breaking tests in the runtime please keep function signature in sync.
|
||||
*/
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
void PGDLLEXPORT
|
||||
void PGDLLEXPORT
|
||||
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer)
|
||||
#else
|
||||
void PGDLLEXPORT
|
||||
void PGDLLEXPORT
|
||||
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, bool request_latest, void *buffer)
|
||||
#endif
|
||||
@@ -1993,21 +1962,21 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
PrfHashEntry *entry;
|
||||
PrefetchRequest *slot;
|
||||
|
||||
buftag = (BufferTag)
|
||||
{
|
||||
buftag = (BufferTag) {
|
||||
.forkNum = forkNum,
|
||||
.blockNum = blkno,
|
||||
.blockNum = blkno,
|
||||
};
|
||||
|
||||
CopyNRelFileInfoToBufTag(buftag, rinfo);
|
||||
|
||||
/*
|
||||
* The redo process does not lock pages that it needs to replay but are
|
||||
* not in the shared buffers, so a concurrent process may request the page
|
||||
* after redo has decided it won't redo that page and updated the LwLSN
|
||||
* for that page. If we're in hot standby we need to take care that we
|
||||
* don't return until after REDO has finished replaying up to that LwLSN,
|
||||
* as the page should have been locked up to that point.
|
||||
* not in the shared buffers, so a concurrent process may request the
|
||||
* page after redo has decided it won't redo that page and updated the
|
||||
* LwLSN for that page.
|
||||
* If we're in hot standby we need to take care that we don't return
|
||||
* until after REDO has finished replaying up to that LwLSN, as the page
|
||||
* should have been locked up to that point.
|
||||
*
|
||||
* See also the description on neon_redo_read_buffer_filter below.
|
||||
*
|
||||
@@ -2015,7 +1984,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
* concurrent failed read IOs. Those IOs should never have a request_lsn
|
||||
* that is as large as the WAL record we're currently replaying, if it
|
||||
* weren't for the behaviour of the LwLsn cache that uses the highest
|
||||
* value of the LwLsn cache when the entry is not found.
|
||||
* value of the LwLsn cache when the entry is not found.
|
||||
*/
|
||||
if (RecoveryInProgress() && !(MyBackendType == B_STARTUP))
|
||||
XLogWaitForReplayOf(request_lsn);
|
||||
@@ -2033,14 +2002,12 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
ring_index = slot->my_ring_index;
|
||||
pgBufferUsage.prefetch.hits += 1;
|
||||
}
|
||||
else /* the current prefetch LSN is not large
|
||||
* enough, so drop the prefetch */
|
||||
else /* the current prefetch LSN is not large enough, so drop the prefetch */
|
||||
{
|
||||
/*
|
||||
* We can't drop cache for not-yet-received requested items. It is
|
||||
* unlikely this happens, but it can happen if prefetch distance
|
||||
* is large enough and a backend didn't consume all prefetch
|
||||
* requests.
|
||||
* unlikely this happens, but it can happen if prefetch distance is
|
||||
* large enough and a backend didn't consume all prefetch requests.
|
||||
*/
|
||||
if (slot->status == PRFS_REQUESTED)
|
||||
{
|
||||
@@ -2067,11 +2034,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Empty our reference to the prefetch buffer's hash entry. When
|
||||
* we wait for prefetches, the entry reference is invalidated by
|
||||
* potential updates to the hash, and when we reconnect to the
|
||||
* pageserver the prefetch we're waiting for may be dropped, in
|
||||
* which case we need to retry and take the branch above.
|
||||
* Empty our reference to the prefetch buffer's hash entry.
|
||||
* When we wait for prefetches, the entry reference is invalidated by
|
||||
* potential updates to the hash, and when we reconnect to the
|
||||
* pageserver the prefetch we're waiting for may be dropped,
|
||||
* in which case we need to retry and take the branch above.
|
||||
*/
|
||||
entry = NULL;
|
||||
}
|
||||
@@ -2119,11 +2086,11 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
* neon_read() -- Read the specified block from a relation.
|
||||
*/
|
||||
void
|
||||
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
char *buffer)
|
||||
char *buffer)
|
||||
#else
|
||||
void *buffer)
|
||||
void *buffer)
|
||||
#endif
|
||||
{
|
||||
bool latest;
|
||||
@@ -2258,11 +2225,11 @@ hexdump_page(char *page)
|
||||
* use mdextend().
|
||||
*/
|
||||
void
|
||||
neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
char *buffer, bool skipFsync)
|
||||
char *buffer, bool skipFsync)
|
||||
#else
|
||||
const void *buffer, bool skipFsync)
|
||||
const void *buffer, bool skipFsync)
|
||||
#endif
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
@@ -2764,7 +2731,7 @@ smgr_init_neon(void)
|
||||
|
||||
/*
|
||||
* Return whether we can skip the redo for this block.
|
||||
*
|
||||
*
|
||||
* The conditions for skipping the IO are:
|
||||
*
|
||||
* - The block is not in the shared buffers, and
|
||||
@@ -2803,7 +2770,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
XLogRecPtr end_recptr = record->EndRecPtr;
|
||||
NRelFileInfo rinfo;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blkno;
|
||||
BlockNumber blkno;
|
||||
BufferTag tag;
|
||||
uint32 hash;
|
||||
LWLock *partitionLock;
|
||||
@@ -2823,8 +2790,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
|
||||
/*
|
||||
* Out of an abundance of caution, we always run redo on shared catalogs,
|
||||
* regardless of whether the block is stored in shared buffers. See also
|
||||
* this function's top comment.
|
||||
* regardless of whether the block is stored in shared buffers.
|
||||
* See also this function's top comment.
|
||||
*/
|
||||
if (!OidIsValid(NInfoGetDbOid(rinfo)))
|
||||
return false;
|
||||
@@ -2850,9 +2817,8 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
/* In both cases st lwlsn past this WAL record */
|
||||
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
|
||||
|
||||
/*
|
||||
* we don't have the buffer in memory, update lwLsn past this record, also
|
||||
* evict page fro file cache
|
||||
/* we don't have the buffer in memory, update lwLsn past this record,
|
||||
* also evict page fro file cache
|
||||
*/
|
||||
if (no_redo_needed)
|
||||
lfc_evict(rinfo, forknum, blkno);
|
||||
@@ -2872,11 +2838,11 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Size was not cached. We populate the cache now, with the size of
|
||||
* the relation measured after this WAL record is applied.
|
||||
* Size was not cached. We populate the cache now, with the size of the
|
||||
* relation measured after this WAL record is applied.
|
||||
*
|
||||
* This length is later reused when we open the smgr to read the
|
||||
* block, which is fine and expected.
|
||||
* This length is later reused when we open the smgr to read the block,
|
||||
* which is fine and expected.
|
||||
*/
|
||||
|
||||
NeonResponse *response;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -10,9 +10,6 @@
|
||||
#include "utils/uuid.h"
|
||||
#include "replication/walreceiver.h"
|
||||
|
||||
#include "libpqwalproposer.h"
|
||||
#include "neon_walreader.h"
|
||||
|
||||
#define SK_MAGIC 0xCafeCeefu
|
||||
#define SK_PROTOCOL_VERSION 2
|
||||
|
||||
@@ -25,9 +22,43 @@
|
||||
*/
|
||||
#define WL_NO_EVENTS 0
|
||||
|
||||
struct WalProposerConn; /* Defined in libpqwalproposer.h */
|
||||
struct WalProposerConn; /* Defined in implementation (walprop_pg.c) */
|
||||
typedef struct WalProposerConn WalProposerConn;
|
||||
|
||||
/* Possible return values from ReadPGAsync */
|
||||
typedef enum
|
||||
{
|
||||
/* The full read was successful. buf now points to the data */
|
||||
PG_ASYNC_READ_SUCCESS,
|
||||
|
||||
/*
|
||||
* The read is ongoing. Wait until the connection is read-ready, then try
|
||||
* again.
|
||||
*/
|
||||
PG_ASYNC_READ_TRY_AGAIN,
|
||||
/* Reading failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_READ_FAIL,
|
||||
} PGAsyncReadResult;
|
||||
|
||||
/* Possible return values from WritePGAsync */
|
||||
typedef enum
|
||||
{
|
||||
/* The write fully completed */
|
||||
PG_ASYNC_WRITE_SUCCESS,
|
||||
|
||||
/*
|
||||
* The write started, but you'll need to call PQflush some more times to
|
||||
* finish it off. We just tried, so it's best to wait until the connection
|
||||
* is read- or write-ready to try again.
|
||||
*
|
||||
* If it becomes read-ready, call PQconsumeInput and flush again. If it
|
||||
* becomes write-ready, just call PQflush.
|
||||
*/
|
||||
PG_ASYNC_WRITE_TRY_FLUSH,
|
||||
/* Writing failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_WRITE_FAIL,
|
||||
} PGAsyncWriteResult;
|
||||
|
||||
/*
|
||||
* WAL safekeeper state, which is used to wait for some event.
|
||||
*
|
||||
@@ -104,40 +135,6 @@ typedef enum
|
||||
SS_ACTIVE,
|
||||
} SafekeeperState;
|
||||
|
||||
/*
|
||||
* Sending WAL substates of SS_ACTIVE.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
/*
|
||||
* We are ready to send more WAL, waiting for latch set to learn about
|
||||
* more WAL becoming available (or just a timeout to send heartbeat).
|
||||
*/
|
||||
SS_ACTIVE_SEND,
|
||||
|
||||
/*
|
||||
* Polling neon_walreader to receive chunk of WAL (probably remotely) to
|
||||
* send to this safekeeper.
|
||||
*
|
||||
* Note: socket management is done completely inside walproposer_pg for
|
||||
* simplicity, and thus simulation doesn't test it. Which is fine as
|
||||
* simulation is mainly aimed at consensus checks, not waiteventset
|
||||
* management.
|
||||
*
|
||||
* Also, while in this state we don't touch safekeeper socket, so in
|
||||
* theory it might close connection as inactive. This can be addressed if
|
||||
* needed; however, while fetching WAL we should regularly send it, so the
|
||||
* problem is unlikely. Vice versa is also true (SS_ACTIVE doesn't handle
|
||||
* walreader socket), but similarly shouldn't be a problem.
|
||||
*/
|
||||
SS_ACTIVE_READ_WAL,
|
||||
|
||||
/*
|
||||
* Waiting for write readiness to flush the socket.
|
||||
*/
|
||||
SS_ACTIVE_FLUSH,
|
||||
} SafekeeperActiveState;
|
||||
|
||||
/* Consensus logical timestamp. */
|
||||
typedef uint64 term_t;
|
||||
|
||||
@@ -336,30 +333,6 @@ typedef struct Safekeeper
|
||||
*/
|
||||
char conninfo[MAXCONNINFO];
|
||||
|
||||
/*
|
||||
* Temporary buffer for the message being sent to the safekeeper.
|
||||
*/
|
||||
StringInfoData outbuf;
|
||||
|
||||
/*
|
||||
* Streaming will start here; must be record boundary.
|
||||
*/
|
||||
XLogRecPtr startStreamingAt;
|
||||
|
||||
XLogRecPtr streamingAt; /* current streaming position */
|
||||
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
|
||||
|
||||
SafekeeperState state; /* safekeeper state machine state */
|
||||
SafekeeperActiveState active_state;
|
||||
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
|
||||
AcceptorGreeting greetResponse; /* acceptor greeting */
|
||||
VoteResponse voteResponse; /* the vote */
|
||||
AppendResponse appendResponse; /* feedback for master */
|
||||
|
||||
|
||||
/* postgres-specific fields */
|
||||
#ifndef WALPROPOSER_LIB
|
||||
|
||||
/*
|
||||
* postgres protocol connection to the WAL acceptor
|
||||
*
|
||||
@@ -368,32 +341,33 @@ typedef struct Safekeeper
|
||||
*/
|
||||
WalProposerConn *conn;
|
||||
|
||||
/*
|
||||
* Temporary buffer for the message being sent to the safekeeper.
|
||||
*/
|
||||
StringInfoData outbuf;
|
||||
|
||||
/*
|
||||
* WAL reader, allocated for each safekeeper.
|
||||
*/
|
||||
NeonWALReader *xlogreader;
|
||||
XLogReaderState *xlogreader;
|
||||
|
||||
/*
|
||||
* Position in wait event set. Equal to -1 if no event
|
||||
* Streaming will start here; must be record boundary.
|
||||
*/
|
||||
int eventPos;
|
||||
XLogRecPtr startStreamingAt;
|
||||
|
||||
/*
|
||||
* Neon WAL reader position in wait event set, or -1 if no socket.
|
||||
*/
|
||||
int nwrEventPos;
|
||||
#endif
|
||||
bool flushWrite; /* set to true if we need to call AsyncFlush,*
|
||||
* to flush pending messages */
|
||||
XLogRecPtr streamingAt; /* current streaming position */
|
||||
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
|
||||
|
||||
|
||||
/* WalProposer library specifics */
|
||||
#ifdef WALPROPOSER_LIB
|
||||
|
||||
/*
|
||||
* Buffer for incoming messages. Usually Rust vector is stored here.
|
||||
* Caller is responsible for freeing the buffer.
|
||||
*/
|
||||
StringInfoData inbuf;
|
||||
#endif
|
||||
int eventPos; /* position in wait event set. Equal to -1 if*
|
||||
* no event */
|
||||
SafekeeperState state; /* safekeeper state machine state */
|
||||
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
|
||||
AcceptorGreeting greetResponse; /* acceptor greeting */
|
||||
VoteResponse voteResponse; /* the vote */
|
||||
AppendResponse appendResponse; /* feedback for master */
|
||||
} Safekeeper;
|
||||
|
||||
/* Re-exported PostgresPollingStatusType */
|
||||
@@ -410,6 +384,31 @@ typedef enum
|
||||
*/
|
||||
} WalProposerConnectPollStatusType;
|
||||
|
||||
/* Re-exported and modified ExecStatusType */
|
||||
typedef enum
|
||||
{
|
||||
/* We received a single CopyBoth result */
|
||||
WP_EXEC_SUCCESS_COPYBOTH,
|
||||
|
||||
/*
|
||||
* Any success result other than a single CopyBoth was received. The
|
||||
* specifics of the result were already logged, but it may be useful to
|
||||
* provide an error message indicating which safekeeper messed up.
|
||||
*
|
||||
* Do not expect PQerrorMessage to be appropriately set.
|
||||
*/
|
||||
WP_EXEC_UNEXPECTED_SUCCESS,
|
||||
|
||||
/*
|
||||
* No result available at this time. Wait until read-ready, then call
|
||||
* again. Internally, this is returned when PQisBusy indicates that
|
||||
* PQgetResult would block.
|
||||
*/
|
||||
WP_EXEC_NEEDS_INPUT,
|
||||
/* Catch-all failure. Check PQerrorMessage. */
|
||||
WP_EXEC_FAILED,
|
||||
} WalProposerExecStatusType;
|
||||
|
||||
/* Re-exported ConnStatusType */
|
||||
typedef enum
|
||||
{
|
||||
@@ -434,7 +433,7 @@ typedef struct walproposer_api
|
||||
* Get WalproposerShmemState. This is used to store information about last
|
||||
* elected term.
|
||||
*/
|
||||
WalproposerShmemState *(*get_shmem_state) (WalProposer *wp);
|
||||
WalproposerShmemState *(*get_shmem_state) (void);
|
||||
|
||||
/*
|
||||
* Start receiving notifications about new WAL. This is an infinite loop
|
||||
@@ -444,73 +443,68 @@ typedef struct walproposer_api
|
||||
void (*start_streaming) (WalProposer *wp, XLogRecPtr startpos);
|
||||
|
||||
/* Get pointer to the latest available WAL. */
|
||||
XLogRecPtr (*get_flush_rec_ptr) (WalProposer *wp);
|
||||
XLogRecPtr (*get_flush_rec_ptr) (void);
|
||||
|
||||
/* Get current time. */
|
||||
TimestampTz (*get_current_timestamp) (WalProposer *wp);
|
||||
TimestampTz (*get_current_timestamp) (void);
|
||||
|
||||
/* Get postgres timeline. */
|
||||
TimeLineID (*get_timeline_id) (void);
|
||||
|
||||
/* Current error message, aka PQerrorMessage. */
|
||||
char *(*conn_error_message) (Safekeeper *sk);
|
||||
char *(*conn_error_message) (WalProposerConn *conn);
|
||||
|
||||
/* Connection status, aka PQstatus. */
|
||||
WalProposerConnStatusType (*conn_status) (Safekeeper *sk);
|
||||
WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
|
||||
|
||||
/* Start the connection, aka PQconnectStart. */
|
||||
void (*conn_connect_start) (Safekeeper *sk);
|
||||
WalProposerConn *(*conn_connect_start) (char *conninfo);
|
||||
|
||||
/* Poll an asynchronous connection, aka PQconnectPoll. */
|
||||
WalProposerConnectPollStatusType (*conn_connect_poll) (Safekeeper *sk);
|
||||
WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
|
||||
|
||||
/* Send a blocking SQL query, aka PQsendQuery. */
|
||||
bool (*conn_send_query) (Safekeeper *sk, char *query);
|
||||
bool (*conn_send_query) (WalProposerConn *conn, char *query);
|
||||
|
||||
/* Read the query result, aka PQgetResult. */
|
||||
WalProposerExecStatusType (*conn_get_query_result) (Safekeeper *sk);
|
||||
WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
|
||||
|
||||
/* Flush buffer to the network, aka PQflush. */
|
||||
int (*conn_flush) (Safekeeper *sk);
|
||||
int (*conn_flush) (WalProposerConn *conn);
|
||||
|
||||
/* Close the connection, aka PQfinish. */
|
||||
void (*conn_finish) (Safekeeper *sk);
|
||||
void (*conn_finish) (WalProposerConn *conn);
|
||||
|
||||
/*
|
||||
* Try to read CopyData message from the safekeeper, aka PQgetCopyData.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
|
||||
/* Try to read CopyData message, aka PQgetCopyData. */
|
||||
PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
|
||||
|
||||
/* Try to write CopyData message, aka PQputCopyData. */
|
||||
PGAsyncWriteResult (*conn_async_write) (Safekeeper *sk, void const *buf, size_t size);
|
||||
PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
|
||||
|
||||
/* Blocking CopyData write, aka PQputCopyData + PQflush. */
|
||||
bool (*conn_blocking_write) (Safekeeper *sk, void const *buf, size_t size);
|
||||
bool (*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
|
||||
|
||||
/* Download WAL from startpos to endpos and make it available locally. */
|
||||
bool (*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
|
||||
/* Read WAL from disk to buf. */
|
||||
NeonWALReadResult (*wal_read) (Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count);
|
||||
void (*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
|
||||
|
||||
/* Allocate WAL reader. */
|
||||
void (*wal_reader_allocate) (Safekeeper *sk);
|
||||
XLogReaderState *(*wal_reader_allocate) (void);
|
||||
|
||||
/* Deallocate event set. */
|
||||
void (*free_event_set) (void);
|
||||
|
||||
/* Initialize event set. */
|
||||
void (*init_event_set) (WalProposer *wp);
|
||||
void (*init_event_set) (int n_safekeepers);
|
||||
|
||||
/* Update events for an existing safekeeper connection. */
|
||||
void (*update_event_set) (Safekeeper *sk, uint32 events);
|
||||
|
||||
/* Configure wait event set for yield in SS_ACTIVE. */
|
||||
void (*active_state_update_event_set) (Safekeeper *sk);
|
||||
|
||||
/* Add a new safekeeper connection to the event set. */
|
||||
void (*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
|
||||
|
||||
/* Remove safekeeper connection from event set */
|
||||
void (*rm_safekeeper_event_set) (Safekeeper *sk);
|
||||
|
||||
/*
|
||||
* Wait until some event happens: - timeout is reached - socket event for
|
||||
* safekeeper connection - new WAL is available
|
||||
@@ -519,22 +513,22 @@ typedef struct walproposer_api
|
||||
* events mask to indicate events and sets sk to the safekeeper which has
|
||||
* an event.
|
||||
*/
|
||||
int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
|
||||
int (*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
|
||||
|
||||
/* Read random bytes. */
|
||||
bool (*strong_random) (WalProposer *wp, void *buf, size_t len);
|
||||
bool (*strong_random) (void *buf, size_t len);
|
||||
|
||||
/*
|
||||
* Get a basebackup LSN. Used to cross-validate with the latest available
|
||||
* LSN on the safekeepers.
|
||||
*/
|
||||
XLogRecPtr (*get_redo_start_lsn) (WalProposer *wp);
|
||||
XLogRecPtr (*get_redo_start_lsn) (void);
|
||||
|
||||
/*
|
||||
* Finish sync safekeepers with the given LSN. This function should not
|
||||
* return and should exit the program.
|
||||
*/
|
||||
void (*finish_sync_safekeepers) (WalProposer *wp, XLogRecPtr lsn);
|
||||
void (*finish_sync_safekeepers) (XLogRecPtr lsn);
|
||||
|
||||
/*
|
||||
* Called after every new message from the safekeeper. Used to propagate
|
||||
@@ -547,22 +541,7 @@ typedef struct walproposer_api
|
||||
* Called on peer_horizon_lsn updates. Used to advance replication slot
|
||||
* and to free up disk space by deleting unnecessary WAL.
|
||||
*/
|
||||
void (*confirm_wal_streamed) (WalProposer *wp, XLogRecPtr lsn);
|
||||
|
||||
/*
|
||||
* Write a log message to the internal log processor. This is used only
|
||||
* when walproposer is compiled as a library. Otherwise, all logging is
|
||||
* handled by elog().
|
||||
*/
|
||||
void (*log_internal) (WalProposer *wp, int level, const char *line);
|
||||
|
||||
/*
|
||||
* Called right after the proposer was elected, but before it started
|
||||
* recovery and sent ProposerElected message to the safekeepers.
|
||||
*
|
||||
* Used by logical replication to update truncateLsn.
|
||||
*/
|
||||
void (*after_election) (WalProposer *wp);
|
||||
void (*confirm_wal_streamed) (XLogRecPtr lsn);
|
||||
} walproposer_api;
|
||||
|
||||
/*
|
||||
@@ -611,13 +590,6 @@ typedef struct WalProposerConfig
|
||||
|
||||
/* Will be passed to safekeepers in greet request. */
|
||||
uint64 systemId;
|
||||
|
||||
/* Will be passed to safekeepers in greet request. */
|
||||
TimeLineID pgTimeline;
|
||||
|
||||
#ifdef WALPROPOSER_LIB
|
||||
void *callback_data;
|
||||
#endif
|
||||
} WalProposerConfig;
|
||||
|
||||
|
||||
@@ -694,24 +666,7 @@ extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api
|
||||
extern void WalProposerStart(WalProposer *wp);
|
||||
extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
extern void WalProposerPoll(WalProposer *wp);
|
||||
extern void WalProposerFree(WalProposer *wp);
|
||||
|
||||
/*
|
||||
* WaitEventSet API doesn't allow to remove socket, so walproposer_pg uses it to
|
||||
* recreate set from scratch, hence the export.
|
||||
*/
|
||||
extern void SafekeeperStateDesiredEvents(Safekeeper *sk, uint32 *sk_events, uint32 *nwr_events);
|
||||
extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn);
|
||||
|
||||
|
||||
#define WPEVENT 1337 /* special log level for walproposer internal
|
||||
* events */
|
||||
|
||||
#ifdef WALPROPOSER_LIB
|
||||
void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...);
|
||||
#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__)
|
||||
#else
|
||||
#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__)
|
||||
#endif
|
||||
extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
|
||||
PageserverFeedback *rf);
|
||||
|
||||
#endif /* __NEON_WALPROPOSER_H__ */
|
||||
|
||||
@@ -1,194 +0,0 @@
|
||||
/*
|
||||
* Contains copied/adapted functions from libpq and some internal postgres functions.
|
||||
* This is needed to avoid linking to full postgres server installation. This file
|
||||
* is compiled as a part of libwalproposer static library.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "walproposer.h"
|
||||
#include "utils/datetime.h"
|
||||
#include "miscadmin.h"
|
||||
|
||||
void
|
||||
ExceptionalCondition(const char *conditionName,
|
||||
const char *fileName, int lineNumber)
|
||||
{
|
||||
fprintf(stderr, "ExceptionalCondition: %s:%d: %s\n",
|
||||
fileName, lineNumber, conditionName);
|
||||
fprintf(stderr, "aborting...\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
void
|
||||
pq_copymsgbytes(StringInfo msg, char *buf, int datalen)
|
||||
{
|
||||
if (datalen < 0 || datalen > (msg->len - msg->cursor))
|
||||
ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__);
|
||||
memcpy(buf, &msg->data[msg->cursor], datalen);
|
||||
msg->cursor += datalen;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint - get a binary integer from a message buffer
|
||||
*
|
||||
* Values are treated as unsigned.
|
||||
* --------------------------------
|
||||
*/
|
||||
unsigned int
|
||||
pq_getmsgint(StringInfo msg, int b)
|
||||
{
|
||||
unsigned int result;
|
||||
unsigned char n8;
|
||||
uint16 n16;
|
||||
uint32 n32;
|
||||
|
||||
switch (b)
|
||||
{
|
||||
case 1:
|
||||
pq_copymsgbytes(msg, (char *) &n8, 1);
|
||||
result = n8;
|
||||
break;
|
||||
case 2:
|
||||
pq_copymsgbytes(msg, (char *) &n16, 2);
|
||||
result = pg_ntoh16(n16);
|
||||
break;
|
||||
case 4:
|
||||
pq_copymsgbytes(msg, (char *) &n32, 4);
|
||||
result = pg_ntoh32(n32);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "unsupported integer size %d\n", b);
|
||||
ExceptionalCondition("unsupported integer size", __FILE__, __LINE__);
|
||||
result = 0; /* keep compiler quiet */
|
||||
break;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgint64 - get a binary 8-byte int from a message buffer
|
||||
*
|
||||
* It is tempting to merge this with pq_getmsgint, but we'd have to make the
|
||||
* result int64 for all data widths --- that could be a big performance
|
||||
* hit on machines where int64 isn't efficient.
|
||||
* --------------------------------
|
||||
*/
|
||||
int64
|
||||
pq_getmsgint64(StringInfo msg)
|
||||
{
|
||||
uint64 n64;
|
||||
|
||||
pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
|
||||
|
||||
return pg_ntoh64(n64);
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgbyte - get a raw byte from a message buffer
|
||||
* --------------------------------
|
||||
*/
|
||||
int
|
||||
pq_getmsgbyte(StringInfo msg)
|
||||
{
|
||||
if (msg->cursor >= msg->len)
|
||||
ExceptionalCondition("no data left in message", __FILE__, __LINE__);
|
||||
return (unsigned char) msg->data[msg->cursor++];
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgbytes - get raw data from a message buffer
|
||||
*
|
||||
* Returns a pointer directly into the message buffer; note this
|
||||
* may not have any particular alignment.
|
||||
* --------------------------------
|
||||
*/
|
||||
const char *
|
||||
pq_getmsgbytes(StringInfo msg, int datalen)
|
||||
{
|
||||
const char *result;
|
||||
|
||||
if (datalen < 0 || datalen > (msg->len - msg->cursor))
|
||||
ExceptionalCondition("insufficient data left in message", __FILE__, __LINE__);
|
||||
result = &msg->data[msg->cursor];
|
||||
msg->cursor += datalen;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgstring - get a null-terminated text string (with conversion)
|
||||
*
|
||||
* May return a pointer directly into the message buffer, or a pointer
|
||||
* to a palloc'd conversion result.
|
||||
* --------------------------------
|
||||
*/
|
||||
const char *
|
||||
pq_getmsgstring(StringInfo msg)
|
||||
{
|
||||
char *str;
|
||||
int slen;
|
||||
|
||||
str = &msg->data[msg->cursor];
|
||||
|
||||
/*
|
||||
* It's safe to use strlen() here because a StringInfo is guaranteed to
|
||||
* have a trailing null byte. But check we found a null inside the
|
||||
* message.
|
||||
*/
|
||||
slen = strlen(str);
|
||||
if (msg->cursor + slen >= msg->len)
|
||||
ExceptionalCondition("invalid string in message", __FILE__, __LINE__);
|
||||
msg->cursor += slen + 1;
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
/* --------------------------------
|
||||
* pq_getmsgend - verify message fully consumed
|
||||
* --------------------------------
|
||||
*/
|
||||
void
|
||||
pq_getmsgend(StringInfo msg)
|
||||
{
|
||||
if (msg->cursor != msg->len)
|
||||
ExceptionalCondition("invalid msg format", __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Produce a C-string representation of a TimestampTz.
|
||||
*
|
||||
* This is mostly for use in emitting messages.
|
||||
*/
|
||||
const char *
|
||||
timestamptz_to_str(TimestampTz t)
|
||||
{
|
||||
static char buf[MAXDATELEN + 1];
|
||||
|
||||
snprintf(buf, sizeof(buf), "TimestampTz(%ld)", t);
|
||||
return buf;
|
||||
}
|
||||
|
||||
bool
|
||||
TimestampDifferenceExceeds(TimestampTz start_time,
|
||||
TimestampTz stop_time,
|
||||
int msec)
|
||||
{
|
||||
TimestampTz diff = stop_time - start_time;
|
||||
|
||||
return (diff >= msec * INT64CONST(1000));
|
||||
}
|
||||
|
||||
void
|
||||
WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...)
|
||||
{
|
||||
char buf[1024];
|
||||
va_list args;
|
||||
|
||||
fmt = _(fmt);
|
||||
|
||||
va_start(args, fmt);
|
||||
vsnprintf(buf, sizeof(buf), fmt, args);
|
||||
va_end(args);
|
||||
|
||||
wp->api.log_internal(wp, elevel, buf);
|
||||
}
|
||||
@@ -43,12 +43,9 @@
|
||||
#include "utils/ps_status.h"
|
||||
#include "utils/timestamp.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
|
||||
#include "libpqwalproposer.h"
|
||||
#include "neon.h"
|
||||
#include "neon_walreader.h"
|
||||
#include "walproposer.h"
|
||||
#include "libpq-fe.h"
|
||||
|
||||
#define XLOG_HDR_SIZE (1 + 8 * 3) /* 'w' + startPos + walEnd + timestamp */
|
||||
#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender*
|
||||
@@ -76,8 +73,7 @@ static void walprop_register_bgworker(void);
|
||||
static void walprop_pg_init_standalone_sync_safekeepers(void);
|
||||
static void walprop_pg_init_walsender(void);
|
||||
static void walprop_pg_init_bgworker(void);
|
||||
static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
|
||||
static TimeLineID walprop_pg_get_timeline_id(void);
|
||||
static TimestampTz walprop_pg_get_current_timestamp(void);
|
||||
static void walprop_pg_load_libpqwalreceiver(void);
|
||||
|
||||
static process_interrupts_callback_t PrevProcessInterruptsCallback;
|
||||
@@ -94,10 +90,6 @@ static void XLogBroadcastWalProposer(WalProposer *wp);
|
||||
static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
|
||||
static void XLogWalPropClose(XLogRecPtr recptr);
|
||||
|
||||
static void add_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void update_nwr_event_set(Safekeeper *sk, uint32 events);
|
||||
static void rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk);
|
||||
|
||||
static void
|
||||
init_walprop_config(bool syncSafekeepers)
|
||||
{
|
||||
@@ -112,7 +104,6 @@ init_walprop_config(bool syncSafekeepers)
|
||||
walprop_config.systemId = GetSystemIdentifier();
|
||||
else
|
||||
walprop_config.systemId = 0;
|
||||
walprop_config.pgTimeline = walprop_pg_get_timeline_id();
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -145,7 +136,7 @@ WalProposerMain(Datum main_arg)
|
||||
walprop_pg_load_libpqwalreceiver();
|
||||
|
||||
wp = WalProposerCreate(&walprop_config, walprop_pg);
|
||||
wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp);
|
||||
wp->last_reconnect_attempt = walprop_pg_get_current_timestamp();
|
||||
|
||||
walprop_pg_init_walsender();
|
||||
WalProposerStart(wp);
|
||||
@@ -388,7 +379,7 @@ nwp_shmem_startup_hook(void)
|
||||
}
|
||||
|
||||
static WalproposerShmemState *
|
||||
walprop_pg_get_shmem_state(WalProposer *wp)
|
||||
walprop_pg_get_shmem_state(void)
|
||||
{
|
||||
Assert(walprop_shared != NULL);
|
||||
return walprop_shared;
|
||||
@@ -514,7 +505,7 @@ walprop_pg_init_bgworker(void)
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
walprop_pg_get_flush_rec_ptr(WalProposer *wp)
|
||||
walprop_pg_get_flush_rec_ptr(void)
|
||||
{
|
||||
#if PG_MAJORVERSION_NUM < 15
|
||||
return GetFlushRecPtr();
|
||||
@@ -524,7 +515,7 @@ walprop_pg_get_flush_rec_ptr(WalProposer *wp)
|
||||
}
|
||||
|
||||
static TimestampTz
|
||||
walprop_pg_get_current_timestamp(WalProposer *wp)
|
||||
walprop_pg_get_current_timestamp(void)
|
||||
{
|
||||
return GetCurrentTimestamp();
|
||||
}
|
||||
@@ -548,6 +539,14 @@ walprop_pg_load_libpqwalreceiver(void)
|
||||
elog(ERROR, "libpqwalreceiver didn't initialize correctly");
|
||||
}
|
||||
|
||||
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
|
||||
struct WalProposerConn
|
||||
{
|
||||
PGconn *pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received data from walprop_async_read */
|
||||
};
|
||||
|
||||
/* Helper function */
|
||||
static bool
|
||||
ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
@@ -566,15 +565,15 @@ ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
|
||||
|
||||
/* Exported function definitions */
|
||||
static char *
|
||||
walprop_error_message(Safekeeper *sk)
|
||||
walprop_error_message(WalProposerConn *conn)
|
||||
{
|
||||
return PQerrorMessage(sk->conn->pg_conn);
|
||||
return PQerrorMessage(conn->pg_conn);
|
||||
}
|
||||
|
||||
static WalProposerConnStatusType
|
||||
walprop_status(Safekeeper *sk)
|
||||
walprop_status(WalProposerConn *conn)
|
||||
{
|
||||
switch (PQstatus(sk->conn->pg_conn))
|
||||
switch (PQstatus(conn->pg_conn))
|
||||
{
|
||||
case CONNECTION_OK:
|
||||
return WP_CONNECTION_OK;
|
||||
@@ -585,18 +584,16 @@ walprop_status(Safekeeper *sk)
|
||||
}
|
||||
}
|
||||
|
||||
WalProposerConn *
|
||||
libpqwp_connect_start(char *conninfo)
|
||||
static WalProposerConn *
|
||||
walprop_connect_start(char *conninfo)
|
||||
{
|
||||
|
||||
PGconn *pg_conn;
|
||||
WalProposerConn *conn;
|
||||
PGconn *pg_conn;
|
||||
const char *keywords[3];
|
||||
const char *values[3];
|
||||
int n;
|
||||
char *password = neon_auth_token;
|
||||
|
||||
|
||||
/*
|
||||
* Connect using the given connection string. If the NEON_AUTH_TOKEN
|
||||
* environment variable was set, use that as the password.
|
||||
@@ -622,11 +619,11 @@ libpqwp_connect_start(char *conninfo)
|
||||
pg_conn = PQconnectStartParams(keywords, values, 1);
|
||||
|
||||
/*
|
||||
* "If the result is null, then libpq has been unable to allocate a new
|
||||
* PGconn structure"
|
||||
* Allocation of a PQconn can fail, and will return NULL. We want to fully
|
||||
* replicate the behavior of PQconnectStart here.
|
||||
*/
|
||||
if (!pg_conn)
|
||||
elog(FATAL, "failed to allocate new PGconn object");
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* And in theory this allocation can fail as well, but it's incredibly
|
||||
@@ -643,20 +640,12 @@ libpqwp_connect_start(char *conninfo)
|
||||
return conn;
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_connect_start(Safekeeper *sk)
|
||||
{
|
||||
Assert(sk->conn == NULL);
|
||||
sk->conn = libpqwp_connect_start(sk->conninfo);
|
||||
|
||||
}
|
||||
|
||||
static WalProposerConnectPollStatusType
|
||||
walprop_connect_poll(Safekeeper *sk)
|
||||
walprop_connect_poll(WalProposerConn *conn)
|
||||
{
|
||||
WalProposerConnectPollStatusType return_val;
|
||||
|
||||
switch (PQconnectPoll(sk->conn->pg_conn))
|
||||
switch (PQconnectPoll(conn->pg_conn))
|
||||
{
|
||||
case PGRES_POLLING_FAILED:
|
||||
return_val = WP_CONN_POLLING_FAILED;
|
||||
@@ -692,8 +681,8 @@ walprop_connect_poll(Safekeeper *sk)
|
||||
return return_val;
|
||||
}
|
||||
|
||||
extern bool
|
||||
libpqwp_send_query(WalProposerConn *conn, char *query)
|
||||
static bool
|
||||
walprop_send_query(WalProposerConn *conn, char *query)
|
||||
{
|
||||
/*
|
||||
* We need to be in blocking mode for sending the query to run without
|
||||
@@ -709,16 +698,9 @@ libpqwp_send_query(WalProposerConn *conn, char *query)
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
walprop_send_query(Safekeeper *sk, char *query)
|
||||
static WalProposerExecStatusType
|
||||
walprop_get_query_result(WalProposerConn *conn)
|
||||
{
|
||||
return libpqwp_send_query(sk->conn, query);
|
||||
}
|
||||
|
||||
WalProposerExecStatusType
|
||||
libpqwp_get_query_result(WalProposerConn *conn)
|
||||
{
|
||||
|
||||
PGresult *result;
|
||||
WalProposerExecStatusType return_val;
|
||||
|
||||
@@ -794,29 +776,36 @@ libpqwp_get_query_result(WalProposerConn *conn)
|
||||
return return_val;
|
||||
}
|
||||
|
||||
static WalProposerExecStatusType
|
||||
walprop_get_query_result(Safekeeper *sk)
|
||||
{
|
||||
return libpqwp_get_query_result(sk->conn);
|
||||
}
|
||||
|
||||
static pgsocket
|
||||
walprop_socket(Safekeeper *sk)
|
||||
walprop_socket(WalProposerConn *conn)
|
||||
{
|
||||
return PQsocket(sk->conn->pg_conn);
|
||||
return PQsocket(conn->pg_conn);
|
||||
}
|
||||
|
||||
static int
|
||||
walprop_flush(Safekeeper *sk)
|
||||
walprop_flush(WalProposerConn *conn)
|
||||
{
|
||||
return (PQflush(sk->conn->pg_conn));
|
||||
return (PQflush(conn->pg_conn));
|
||||
}
|
||||
|
||||
/* Like libpqrcv_receive. *buf is valid until the next call. */
|
||||
PGAsyncReadResult
|
||||
libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
static void
|
||||
walprop_finish(WalProposerConn *conn)
|
||||
{
|
||||
if (conn->recvbuf != NULL)
|
||||
PQfreemem(conn->recvbuf);
|
||||
PQfinish(conn->pg_conn);
|
||||
pfree(conn);
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
{
|
||||
int result;
|
||||
|
||||
if (conn->recvbuf != NULL)
|
||||
@@ -885,25 +874,13 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
walprop_async_read(Safekeeper *sk, char **buf, int *amount)
|
||||
{
|
||||
return libpqwp_async_read(sk->conn, buf, amount);
|
||||
}
|
||||
|
||||
static PGAsyncWriteResult
|
||||
walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* If we aren't in non-blocking mode, switch to it. */
|
||||
if (!ensure_nonblocking_status(sk->conn, true))
|
||||
if (!ensure_nonblocking_status(conn, true))
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
|
||||
/*
|
||||
@@ -911,7 +888,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
* queued, 0 if it was not queued because of full buffers, or -1 if an
|
||||
* error occurred
|
||||
*/
|
||||
result = PQputCopyData(sk->conn->pg_conn, buf, size);
|
||||
result = PQputCopyData(conn->pg_conn, buf, size);
|
||||
|
||||
/*
|
||||
* We won't get a result of zero because walproposer always empties the
|
||||
@@ -939,7 +916,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
* sucessful, 1 if it was unable to send all the data in the send queue
|
||||
* yet -1 if it failed for some reason
|
||||
*/
|
||||
switch (result = PQflush(sk->conn->pg_conn))
|
||||
switch (result = PQflush(conn->pg_conn))
|
||||
{
|
||||
case 0:
|
||||
return PG_ASYNC_WRITE_SUCCESS;
|
||||
@@ -957,54 +934,28 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
* information, refer to the comments there.
|
||||
*/
|
||||
static bool
|
||||
walprop_blocking_write(Safekeeper *sk, void const *buf, size_t size)
|
||||
walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* If we are in non-blocking mode, switch out of it. */
|
||||
if (!ensure_nonblocking_status(sk->conn, false))
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
return false;
|
||||
|
||||
if ((result = PQputCopyData(sk->conn->pg_conn, buf, size)) == -1)
|
||||
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
|
||||
return false;
|
||||
|
||||
Assert(result == 1);
|
||||
|
||||
/* Because the connection is non-blocking, flushing returns 0 or -1 */
|
||||
|
||||
if ((result = PQflush(sk->conn->pg_conn)) == -1)
|
||||
if ((result = PQflush(conn->pg_conn)) == -1)
|
||||
return false;
|
||||
|
||||
Assert(result == 0);
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
libpqwp_disconnect(WalProposerConn *conn)
|
||||
{
|
||||
if (conn->recvbuf != NULL)
|
||||
PQfreemem(conn->recvbuf);
|
||||
PQfinish(conn->pg_conn);
|
||||
pfree(conn);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_finish(Safekeeper *sk)
|
||||
{
|
||||
if (sk->conn)
|
||||
{
|
||||
libpqwp_disconnect(sk->conn);
|
||||
sk->conn = NULL;
|
||||
}
|
||||
|
||||
/* free xlogreader */
|
||||
if (sk->xlogreader)
|
||||
{
|
||||
NeonWALReaderFree(sk->xlogreader);
|
||||
sk->xlogreader = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Subscribe for new WAL and stream it in the loop to safekeepers.
|
||||
*
|
||||
@@ -1429,98 +1380,51 @@ XLogWalPropClose(XLogRecPtr recptr)
|
||||
walpropFile = -1;
|
||||
}
|
||||
|
||||
static NeonWALReadResult
|
||||
walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count)
|
||||
static void
|
||||
walprop_pg_wal_read(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count)
|
||||
{
|
||||
NeonWALReadResult res;
|
||||
WALReadError errinfo;
|
||||
|
||||
res = NeonWALRead(sk->xlogreader,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
|
||||
if (res == NEON_WALREAD_SUCCESS)
|
||||
if (!WALRead(state,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id(),
|
||||
&errinfo))
|
||||
{
|
||||
/*
|
||||
* If we have the socket subscribed, but walreader doesn't need any
|
||||
* events, it must mean that remote connection just closed hoping to
|
||||
* do next read locally. Remove the socket then. It is important to do
|
||||
* as otherwise next read might open another connection and we won't
|
||||
* be able to distinguish whether we have correct socket added in wait
|
||||
* event set.
|
||||
*/
|
||||
if (NeonWALReaderEvents(sk->xlogreader) == 0)
|
||||
rm_safekeeper_event_set(sk, false);
|
||||
WALReadRaiseError(&errinfo);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_wal_reader_allocate(Safekeeper *sk)
|
||||
static XLogReaderState *
|
||||
walprop_pg_wal_reader_allocate(void)
|
||||
{
|
||||
char log_prefix[64];
|
||||
|
||||
snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port);
|
||||
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix);
|
||||
if (sk->xlogreader == NULL)
|
||||
elog(FATAL, "Failed to allocate xlog reader");
|
||||
return XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.segment_open = wal_segment_open,.segment_close = wal_segment_close), NULL);
|
||||
}
|
||||
|
||||
static WaitEventSet *waitEvents;
|
||||
|
||||
static void
|
||||
walprop_pg_free_event_set(WalProposer *wp)
|
||||
walprop_pg_free_event_set(void)
|
||||
{
|
||||
if (waitEvents)
|
||||
{
|
||||
FreeWaitEventSet(waitEvents);
|
||||
waitEvents = NULL;
|
||||
}
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
wp->safekeeper[i].eventPos = -1;
|
||||
wp->safekeeper[i].nwrEventPos = -1;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_init_event_set(WalProposer *wp)
|
||||
walprop_pg_init_event_set(int n_safekeepers)
|
||||
{
|
||||
if (waitEvents)
|
||||
elog(FATAL, "double-initialization of event set");
|
||||
|
||||
/* for each sk, we have socket plus potentially socket for neon walreader */
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers);
|
||||
waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + n_safekeepers);
|
||||
AddWaitEventToSet(waitEvents, WL_LATCH_SET, PGINVALID_SOCKET,
|
||||
MyLatch, NULL);
|
||||
AddWaitEventToSet(waitEvents, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
|
||||
NULL, NULL);
|
||||
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
wp->safekeeper[i].eventPos = -1;
|
||||
wp->safekeeper[i].nwrEventPos = -1;
|
||||
}
|
||||
}
|
||||
|
||||
/* add safekeeper socket to wait event set */
|
||||
static void
|
||||
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
Assert(sk->eventPos == -1);
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk), NULL, sk);
|
||||
}
|
||||
|
||||
/* add neon wal reader socket to wait event set */
|
||||
static void
|
||||
add_nwr_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
Assert(sk->nwrEventPos == -1);
|
||||
sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk);
|
||||
elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -1532,143 +1436,14 @@ walprop_pg_update_event_set(Safekeeper *sk, uint32 events)
|
||||
ModifyWaitEvent(waitEvents, sk->eventPos, events, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Update neon_walreader event.
|
||||
* Can be called when nwr socket doesn't exist, does nothing in this case.
|
||||
*/
|
||||
static void
|
||||
update_nwr_event_set(Safekeeper *sk, uint32 events)
|
||||
walprop_pg_add_safekeeper_event_set(Safekeeper *sk, uint32 events)
|
||||
{
|
||||
/* eventPos = -1 when we don't have an event */
|
||||
if (sk->nwrEventPos != -1)
|
||||
ModifyWaitEvent(waitEvents, sk->nwrEventPos, events, NULL);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
walprop_pg_active_state_update_event_set(Safekeeper *sk)
|
||||
{
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
|
||||
Assert(sk->state == SS_ACTIVE);
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
/*
|
||||
* If we need to wait for neon_walreader, ensure we have up to date socket
|
||||
* in the wait event set.
|
||||
*/
|
||||
if (sk->active_state == SS_ACTIVE_READ_WAL)
|
||||
{
|
||||
/*
|
||||
* TODO: instead of reattaching socket (and thus recreating WES) each
|
||||
* time we should keep it if possible, i.e. if connection is already
|
||||
* established. Note that single neon_walreader object can switch
|
||||
* between local and remote reads multiple times during its lifetime,
|
||||
* so careful bookkeeping is needed here.
|
||||
*/
|
||||
rm_safekeeper_event_set(sk, false);
|
||||
add_nwr_event_set(sk, nwr_events);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Hack: we should always set 0 here, but for random reasons
|
||||
* WaitEventSet (WaitEventAdjustEpoll) asserts that there is at least
|
||||
* some event. Since there is also no way to remove socket except
|
||||
* reconstructing the whole set, SafekeeperStateDesiredEvents instead
|
||||
* gives WL_SOCKET_CLOSED if socket exists.
|
||||
*/
|
||||
Assert(nwr_events == WL_SOCKET_CLOSED || nwr_events == 0);
|
||||
update_nwr_event_set(sk, WL_SOCKET_CLOSED);
|
||||
}
|
||||
walprop_pg_update_event_set(sk, sk_events);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_rm_safekeeper_event_set(Safekeeper *to_remove)
|
||||
{
|
||||
rm_safekeeper_event_set(to_remove, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* A hacky way to remove single event from the event set. Can be called if event
|
||||
* doesn't exist, does nothing in this case.
|
||||
*
|
||||
* Note: Internally, this completely reconstructs the event set. It should be
|
||||
* avoided if possible.
|
||||
*
|
||||
* If is_sk is true, socket of connection to safekeeper is removed; otherwise
|
||||
* socket of neon_walreader.
|
||||
*/
|
||||
static void
|
||||
rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk)
|
||||
{
|
||||
WalProposer *wp = to_remove->wp;
|
||||
|
||||
elog(DEBUG5, "sk %s:%s: removing event, is_sk %d",
|
||||
to_remove->host, to_remove->port, is_sk);
|
||||
|
||||
/*
|
||||
* Shortpath for exiting if have nothing to do. We never call this
|
||||
* function with safekeeper socket not existing, but do that with neon
|
||||
* walreader socket.
|
||||
*/
|
||||
if ((is_sk && to_remove->eventPos == -1) ||
|
||||
(!is_sk && to_remove->nwrEventPos == -1))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
/* Remove the existing event set, assign sk->eventPos = -1 */
|
||||
walprop_pg_free_event_set(wp);
|
||||
|
||||
/* Re-initialize it without adding any safekeeper events */
|
||||
wp->api.init_event_set(wp);
|
||||
|
||||
/*
|
||||
* loop through the existing safekeepers. If they aren't the one we're
|
||||
* removing, and if they have a socket we can use, re-add the applicable
|
||||
* events.
|
||||
*/
|
||||
for (int i = 0; i < wp->n_safekeepers; i++)
|
||||
{
|
||||
Safekeeper *sk = &wp->safekeeper[i];
|
||||
|
||||
if (sk == to_remove)
|
||||
{
|
||||
if (is_sk)
|
||||
sk->eventPos = -1;
|
||||
else
|
||||
sk->nwrEventPos = -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this safekeeper isn't offline, add events for it, except for the
|
||||
* event requested to remove.
|
||||
*/
|
||||
if (sk->state != SS_OFFLINE)
|
||||
{
|
||||
uint32 sk_events;
|
||||
uint32 nwr_events;
|
||||
|
||||
SafekeeperStateDesiredEvents(sk, &sk_events, &nwr_events);
|
||||
|
||||
if (sk != to_remove || !is_sk)
|
||||
{
|
||||
/* will set sk->eventPos */
|
||||
wp->api.add_safekeeper_event_set(sk, sk_events);
|
||||
}
|
||||
else if ((sk != to_remove || is_sk) && nwr_events)
|
||||
{
|
||||
add_nwr_event_set(sk, nwr_events);
|
||||
}
|
||||
}
|
||||
}
|
||||
sk->eventPos = AddWaitEventToSet(waitEvents, events, walprop_socket(sk->conn), NULL, sk);
|
||||
}
|
||||
|
||||
static int
|
||||
walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events)
|
||||
walprop_pg_wait_event_set(long timeout, Safekeeper **sk, uint32 *events)
|
||||
{
|
||||
WaitEvent event = {0};
|
||||
int rc = 0;
|
||||
@@ -1724,7 +1499,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_finish_sync_safekeepers(WalProposer *wp, XLogRecPtr lsn)
|
||||
walprop_pg_finish_sync_safekeepers(XLogRecPtr lsn)
|
||||
{
|
||||
fprintf(stdout, "%X/%X\n", LSN_FORMAT_ARGS(lsn));
|
||||
exit(0);
|
||||
@@ -1836,7 +1611,7 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
* pageserver.
|
||||
*/
|
||||
quorumFeedback.rf.disk_consistent_lsn,
|
||||
walprop_pg_get_current_timestamp(wp), false);
|
||||
walprop_pg_get_current_timestamp(), false);
|
||||
}
|
||||
|
||||
CombineHotStanbyFeedbacks(&hsFeedback, wp);
|
||||
@@ -1853,69 +1628,18 @@ walprop_pg_process_safekeeper_feedback(WalProposer *wp, XLogRecPtr commitLsn)
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_confirm_wal_streamed(WalProposer *wp, XLogRecPtr lsn)
|
||||
walprop_pg_confirm_wal_streamed(XLogRecPtr lsn)
|
||||
{
|
||||
if (MyReplicationSlot)
|
||||
PhysicalConfirmReceivedLocation(lsn);
|
||||
}
|
||||
|
||||
static XLogRecPtr
|
||||
walprop_pg_get_redo_start_lsn(WalProposer *wp)
|
||||
{
|
||||
return GetRedoStartLsn();
|
||||
}
|
||||
|
||||
static bool
|
||||
walprop_pg_strong_random(WalProposer *wp, void *buf, size_t len)
|
||||
{
|
||||
return pg_strong_random(buf, len);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_log_internal(WalProposer *wp, int level, const char *line)
|
||||
{
|
||||
elog(FATAL, "unexpected log_internal message at level %d: %s", level, line);
|
||||
}
|
||||
|
||||
static void
|
||||
walprop_pg_after_election(WalProposer *wp)
|
||||
{
|
||||
FILE *f;
|
||||
XLogRecPtr lrRestartLsn;
|
||||
|
||||
/* We don't need to do anything in syncSafekeepers mode. */
|
||||
if (wp->config->syncSafekeepers)
|
||||
return;
|
||||
|
||||
/*
|
||||
* If there are active logical replication subscription we need to provide
|
||||
* enough WAL for their WAL senders based on th position of their
|
||||
* replication slots.
|
||||
*/
|
||||
f = fopen("restart.lsn", "rb");
|
||||
if (f != NULL && !wp->config->syncSafekeepers)
|
||||
{
|
||||
fread(&lrRestartLsn, sizeof(lrRestartLsn), 1, f);
|
||||
fclose(f);
|
||||
if (lrRestartLsn != InvalidXLogRecPtr)
|
||||
{
|
||||
elog(LOG, "Logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn));
|
||||
|
||||
/*
|
||||
* start from the beginning of the segment to fetch page headers
|
||||
* verifed by XLogReader
|
||||
*/
|
||||
lrRestartLsn = lrRestartLsn - XLogSegmentOffset(lrRestartLsn, wal_segment_size);
|
||||
wp->truncateLsn = Min(wp->truncateLsn, lrRestartLsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const walproposer_api walprop_pg = {
|
||||
.get_shmem_state = walprop_pg_get_shmem_state,
|
||||
.start_streaming = walprop_pg_start_streaming,
|
||||
.get_flush_rec_ptr = walprop_pg_get_flush_rec_ptr,
|
||||
.get_current_timestamp = walprop_pg_get_current_timestamp,
|
||||
.get_timeline_id = walprop_pg_get_timeline_id,
|
||||
.conn_error_message = walprop_error_message,
|
||||
.conn_status = walprop_status,
|
||||
.conn_connect_start = walprop_connect_start,
|
||||
@@ -1930,17 +1654,14 @@ static const walproposer_api walprop_pg = {
|
||||
.recovery_download = WalProposerRecovery,
|
||||
.wal_read = walprop_pg_wal_read,
|
||||
.wal_reader_allocate = walprop_pg_wal_reader_allocate,
|
||||
.free_event_set = walprop_pg_free_event_set,
|
||||
.init_event_set = walprop_pg_init_event_set,
|
||||
.update_event_set = walprop_pg_update_event_set,
|
||||
.active_state_update_event_set = walprop_pg_active_state_update_event_set,
|
||||
.add_safekeeper_event_set = walprop_pg_add_safekeeper_event_set,
|
||||
.rm_safekeeper_event_set = walprop_pg_rm_safekeeper_event_set,
|
||||
.wait_event_set = walprop_pg_wait_event_set,
|
||||
.strong_random = walprop_pg_strong_random,
|
||||
.get_redo_start_lsn = walprop_pg_get_redo_start_lsn,
|
||||
.strong_random = pg_strong_random,
|
||||
.get_redo_start_lsn = GetRedoStartLsn,
|
||||
.finish_sync_safekeepers = walprop_pg_finish_sync_safekeepers,
|
||||
.process_safekeeper_feedback = walprop_pg_process_safekeeper_feedback,
|
||||
.confirm_wal_streamed = walprop_pg_confirm_wal_streamed,
|
||||
.log_internal = walprop_pg_log_internal,
|
||||
.after_election = walprop_pg_after_election,
|
||||
};
|
||||
|
||||
16
poetry.lock
generated
16
poetry.lock
generated
@@ -2415,13 +2415,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "urllib3"
|
||||
version = "1.26.18"
|
||||
version = "1.26.17"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
||||
files = [
|
||||
{file = "urllib3-1.26.18-py2.py3-none-any.whl", hash = "sha256:34b97092d7e0a3a8cf7cd10e386f401b3737364026c45e622aa02903dffe0f07"},
|
||||
{file = "urllib3-1.26.18.tar.gz", hash = "sha256:f8ecc1bba5667413457c529ab955bf8c67b45db799d159066261719e328580a0"},
|
||||
{file = "urllib3-1.26.17-py2.py3-none-any.whl", hash = "sha256:94a757d178c9be92ef5539b8840d48dc9cf1b2709c9d6b588232a055c524458b"},
|
||||
{file = "urllib3-1.26.17.tar.gz", hash = "sha256:24d6a242c28d29af46c3fae832c36db3bbebcc533dd1bb549172cd739c82df21"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
@@ -2488,16 +2488,6 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
|
||||
@@ -83,10 +83,6 @@ struct ProxyCliArgs {
|
||||
/// timeout for http connections
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
sql_over_http_timeout: tokio::time::Duration,
|
||||
|
||||
/// Require that all incoming requests have a Proxy Protocol V2 packet **and** have an IP address associated.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
require_client_ip: bool,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -237,7 +233,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
metric_collection,
|
||||
allow_self_signed_compute: args.allow_self_signed_compute,
|
||||
http_config,
|
||||
require_client_ip: args.require_client_ip,
|
||||
}));
|
||||
|
||||
Ok(config)
|
||||
|
||||
@@ -14,7 +14,6 @@ pub struct ProxyConfig {
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
pub allow_self_signed_compute: bool,
|
||||
pub http_config: HttpConfig,
|
||||
pub require_client_ip: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -8,17 +8,14 @@ use pbkdf2::{
|
||||
Params, Pbkdf2,
|
||||
};
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::sync::atomic::{self, AtomicUsize};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use std::{
|
||||
fmt,
|
||||
task::{ready, Poll},
|
||||
};
|
||||
use std::{
|
||||
ops::Deref,
|
||||
sync::atomic::{self, AtomicUsize},
|
||||
};
|
||||
use tokio::time;
|
||||
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
|
||||
use tokio_postgres::AsyncMessage;
|
||||
|
||||
use crate::{
|
||||
auth, console,
|
||||
@@ -29,13 +26,13 @@ use crate::{compute, config};
|
||||
|
||||
use crate::proxy::ConnectMechanism;
|
||||
|
||||
use tracing::{error, warn, Span};
|
||||
use tracing::{error, warn};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
|
||||
pub const APP_NAME: &str = "sql_over_http";
|
||||
const MAX_CONNS_PER_ENDPOINT: usize = 20;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug)]
|
||||
pub struct ConnInfo {
|
||||
pub username: String,
|
||||
pub dbname: String,
|
||||
@@ -58,7 +55,7 @@ impl fmt::Display for ConnInfo {
|
||||
}
|
||||
|
||||
struct ConnPoolEntry {
|
||||
conn: ClientInner,
|
||||
conn: Client,
|
||||
_last_access: std::time::Instant,
|
||||
}
|
||||
|
||||
@@ -136,20 +133,14 @@ impl GlobalConnPool {
|
||||
}
|
||||
|
||||
pub async fn get(
|
||||
self: &Arc<Self>,
|
||||
&self,
|
||||
conn_info: &ConnInfo,
|
||||
force_new: bool,
|
||||
session_id: uuid::Uuid,
|
||||
) -> anyhow::Result<Client> {
|
||||
let mut client: Option<ClientInner> = None;
|
||||
let mut client: Option<Client> = None;
|
||||
let mut latency_timer = LatencyTimer::new("http");
|
||||
|
||||
let pool = if force_new {
|
||||
None
|
||||
} else {
|
||||
Some((conn_info.clone(), self.clone()))
|
||||
};
|
||||
|
||||
let mut hash_valid = false;
|
||||
if !force_new {
|
||||
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
|
||||
@@ -197,11 +188,7 @@ impl GlobalConnPool {
|
||||
latency_timer.pool_hit();
|
||||
info!("pool: reusing connection '{conn_info}'");
|
||||
client.session.send(session_id)?;
|
||||
return Ok(Client {
|
||||
inner: Some(client),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
});
|
||||
return Ok(client);
|
||||
}
|
||||
} else {
|
||||
info!("pool: opening a new connection '{conn_info}'");
|
||||
@@ -241,14 +228,10 @@ impl GlobalConnPool {
|
||||
_ => {}
|
||||
}
|
||||
|
||||
new_client.map(|inner| Client {
|
||||
inner: Some(inner),
|
||||
span: Span::current(),
|
||||
pool,
|
||||
})
|
||||
new_client
|
||||
}
|
||||
|
||||
fn put(&self, conn_info: &ConnInfo, client: ClientInner) -> anyhow::Result<()> {
|
||||
pub fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
|
||||
// We want to hold this open while we return. This ensures that the pool can't close
|
||||
// while we are in the middle of returning the connection.
|
||||
let closed = self.closed.read();
|
||||
@@ -343,7 +326,7 @@ struct TokioMechanism<'a> {
|
||||
|
||||
#[async_trait]
|
||||
impl ConnectMechanism for TokioMechanism<'_> {
|
||||
type Connection = ClientInner;
|
||||
type Connection = Client;
|
||||
type ConnectError = tokio_postgres::Error;
|
||||
type Error = anyhow::Error;
|
||||
|
||||
@@ -367,7 +350,7 @@ async fn connect_to_compute(
|
||||
conn_info: &ConnInfo,
|
||||
session_id: uuid::Uuid,
|
||||
latency_timer: LatencyTimer,
|
||||
) -> anyhow::Result<ClientInner> {
|
||||
) -> anyhow::Result<Client> {
|
||||
let tls = config.tls_config.as_ref();
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
|
||||
@@ -416,7 +399,7 @@ async fn connect_to_compute_once(
|
||||
conn_info: &ConnInfo,
|
||||
timeout: time::Duration,
|
||||
mut session: uuid::Uuid,
|
||||
) -> Result<ClientInner, tokio_postgres::Error> {
|
||||
) -> Result<Client, tokio_postgres::Error> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
|
||||
let (client, mut connection) = config
|
||||
@@ -479,99 +462,21 @@ async fn connect_to_compute_once(
|
||||
.instrument(span)
|
||||
);
|
||||
|
||||
Ok(ClientInner {
|
||||
Ok(Client {
|
||||
inner: client,
|
||||
session: tx,
|
||||
ids,
|
||||
})
|
||||
}
|
||||
|
||||
struct ClientInner {
|
||||
inner: tokio_postgres::Client,
|
||||
pub struct Client {
|
||||
pub inner: tokio_postgres::Client,
|
||||
session: tokio::sync::watch::Sender<uuid::Uuid>,
|
||||
ids: Ids,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn metrics(&self) -> Arc<MetricCounter> {
|
||||
USAGE_METRICS.register(self.inner.as_ref().unwrap().ids.clone())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Client {
|
||||
span: Span,
|
||||
inner: Option<ClientInner>,
|
||||
pool: Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
}
|
||||
|
||||
pub struct Discard<'a> {
|
||||
pool: &'a mut Option<(ConnInfo, Arc<GlobalConnPool>)>,
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) {
|
||||
let Self {
|
||||
inner,
|
||||
pool,
|
||||
span: _,
|
||||
} = self;
|
||||
(
|
||||
&mut inner
|
||||
.as_mut()
|
||||
.expect("client inner should not be removed")
|
||||
.inner,
|
||||
Discard { pool },
|
||||
)
|
||||
}
|
||||
|
||||
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
|
||||
self.inner().1.check_idle(status)
|
||||
}
|
||||
pub fn discard(&mut self) {
|
||||
self.inner().1.discard()
|
||||
}
|
||||
}
|
||||
|
||||
impl Discard<'_> {
|
||||
pub fn check_idle(&mut self, status: ReadyForQueryStatus) {
|
||||
if status != ReadyForQueryStatus::Idle {
|
||||
if let Some((conn_info, _)) = self.pool.take() {
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is not idle")
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn discard(&mut self) {
|
||||
if let Some((conn_info, _)) = self.pool.take() {
|
||||
info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Client {
|
||||
type Target = tokio_postgres::Client;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self
|
||||
.inner
|
||||
.as_ref()
|
||||
.expect("client inner should not be removed")
|
||||
.inner
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Client {
|
||||
fn drop(&mut self) {
|
||||
let client = self
|
||||
.inner
|
||||
.take()
|
||||
.expect("client inner should not be removed");
|
||||
if let Some((conn_info, conn_pool)) = self.pool.take() {
|
||||
let current_span = self.span.clone();
|
||||
// return connection to the pool
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _span = current_span.enter();
|
||||
let _ = conn_pool.put(&conn_info, client);
|
||||
});
|
||||
}
|
||||
USAGE_METRICS.register(self.ids.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,9 +17,7 @@ use tokio_postgres::types::Kind;
|
||||
use tokio_postgres::types::Type;
|
||||
use tokio_postgres::GenericClient;
|
||||
use tokio_postgres::IsolationLevel;
|
||||
use tokio_postgres::ReadyForQueryStatus;
|
||||
use tokio_postgres::Row;
|
||||
use tokio_postgres::Transaction;
|
||||
use tracing::error;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
@@ -66,18 +64,20 @@ static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
|
||||
// Convert json non-string types to strings, so that they can be passed to Postgres
|
||||
// as parameters.
|
||||
//
|
||||
fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
|
||||
fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<Option<String>>, serde_json::Error> {
|
||||
json.iter()
|
||||
.map(|value| {
|
||||
match value {
|
||||
// special care for nulls
|
||||
Value::Null => None,
|
||||
Value::Null => Ok(None),
|
||||
|
||||
// convert to text with escaping
|
||||
v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),
|
||||
Value::Bool(_) => serde_json::to_string(value).map(Some),
|
||||
Value::Number(_) => serde_json::to_string(value).map(Some),
|
||||
Value::Object(_) => serde_json::to_string(value).map(Some),
|
||||
|
||||
// avoid escaping here, as we pass this as a parameter
|
||||
Value::String(s) => Some(s.to_string()),
|
||||
Value::String(s) => Ok(Some(s.to_string())),
|
||||
|
||||
// special care for arrays
|
||||
Value::Array(_) => json_array_to_pg_array(value),
|
||||
@@ -94,26 +94,29 @@ fn json_to_pg_text(json: Vec<Value>) -> Vec<Option<String>> {
|
||||
//
|
||||
// Example of the same escaping in node-postgres: packages/pg/lib/utils.js
|
||||
//
|
||||
fn json_array_to_pg_array(value: &Value) -> Option<String> {
|
||||
fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::Error> {
|
||||
match value {
|
||||
// special care for nulls
|
||||
Value::Null => None,
|
||||
Value::Null => Ok(None),
|
||||
|
||||
// convert to text with escaping
|
||||
Value::Bool(_) => serde_json::to_string(value).map(Some),
|
||||
Value::Number(_) => serde_json::to_string(value).map(Some),
|
||||
|
||||
// here string needs to be escaped, as it is part of the array
|
||||
v @ (Value::Bool(_) | Value::Number(_) | Value::String(_)) => Some(v.to_string()),
|
||||
v @ Value::Object(_) => json_array_to_pg_array(&Value::String(v.to_string())),
|
||||
Value::Object(_) => json_array_to_pg_array(&Value::String(serde_json::to_string(value)?)),
|
||||
Value::String(_) => serde_json::to_string(value).map(Some),
|
||||
|
||||
// recurse into array
|
||||
Value::Array(arr) => {
|
||||
let vals = arr
|
||||
.iter()
|
||||
.map(json_array_to_pg_array)
|
||||
.map(|v| v.unwrap_or_else(|| "NULL".to_string()))
|
||||
.collect::<Vec<_>>()
|
||||
.map(|r| r.map(|v| v.unwrap_or_else(|| "NULL".to_string())))
|
||||
.collect::<Result<Vec<_>, _>>()?
|
||||
.join(",");
|
||||
|
||||
Some(format!("{{{}}}", vals))
|
||||
Ok(Some(format!("{{{}}}", vals)))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -312,119 +315,83 @@ async fn handle_inner(
|
||||
// Now execute the query and return the result
|
||||
//
|
||||
let mut size = 0;
|
||||
let result =
|
||||
match payload {
|
||||
Payload::Single(stmt) => {
|
||||
let (status, results) =
|
||||
query_to_json(&*client, stmt, &mut 0, raw_output, array_mode)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
client.discard();
|
||||
e
|
||||
})?;
|
||||
client.check_idle(status);
|
||||
results
|
||||
let result = match payload {
|
||||
Payload::Single(query) => {
|
||||
query_to_json(&client.inner, query, &mut size, raw_output, array_mode).await
|
||||
}
|
||||
Payload::Batch(batch_query) => {
|
||||
let mut results = Vec::new();
|
||||
let mut builder = client.inner.build_transaction();
|
||||
if let Some(isolation_level) = txn_isolation_level {
|
||||
builder = builder.isolation_level(isolation_level);
|
||||
}
|
||||
Payload::Batch(statements) => {
|
||||
let (inner, mut discard) = client.inner();
|
||||
let mut builder = inner.build_transaction();
|
||||
if let Some(isolation_level) = txn_isolation_level {
|
||||
builder = builder.isolation_level(isolation_level);
|
||||
}
|
||||
if txn_read_only {
|
||||
builder = builder.read_only(true);
|
||||
}
|
||||
if txn_deferrable {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
|
||||
let transaction = builder.start().await.map_err(|e| {
|
||||
// if we cannot start a transaction, we should return immediately
|
||||
// and not return to the pool. connection is clearly broken
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
|
||||
let results =
|
||||
match query_batch(&transaction, statements, &mut size, raw_output, array_mode)
|
||||
.await
|
||||
{
|
||||
Ok(results) => {
|
||||
let status = transaction.commit().await.map_err(|e| {
|
||||
// if we cannot commit - for now don't return connection to pool
|
||||
// TODO: get a query status from the error
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
discard.check_idle(status);
|
||||
results
|
||||
}
|
||||
Err(err) => {
|
||||
let status = transaction.rollback().await.map_err(|e| {
|
||||
// if we cannot rollback - for now don't return connection to pool
|
||||
// TODO: get a query status from the error
|
||||
discard.discard();
|
||||
e
|
||||
})?;
|
||||
discard.check_idle(status);
|
||||
return Err(err);
|
||||
}
|
||||
};
|
||||
|
||||
if txn_read_only {
|
||||
response = response.header(
|
||||
TXN_READ_ONLY.clone(),
|
||||
HeaderValue::try_from(txn_read_only.to_string())?,
|
||||
);
|
||||
}
|
||||
if txn_deferrable {
|
||||
response = response.header(
|
||||
TXN_DEFERRABLE.clone(),
|
||||
HeaderValue::try_from(txn_deferrable.to_string())?,
|
||||
);
|
||||
}
|
||||
if let Some(txn_isolation_level) = txn_isolation_level_raw {
|
||||
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
||||
}
|
||||
json!({ "results": results })
|
||||
if txn_read_only {
|
||||
builder = builder.read_only(true);
|
||||
}
|
||||
};
|
||||
if txn_deferrable {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
let transaction = builder.start().await?;
|
||||
for query in batch_query.queries {
|
||||
let result =
|
||||
query_to_json(&transaction, query, &mut size, raw_output, array_mode).await;
|
||||
match result {
|
||||
Ok(r) => results.push(r),
|
||||
Err(e) => {
|
||||
transaction.rollback().await?;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
transaction.commit().await?;
|
||||
if txn_read_only {
|
||||
response = response.header(
|
||||
TXN_READ_ONLY.clone(),
|
||||
HeaderValue::try_from(txn_read_only.to_string())?,
|
||||
);
|
||||
}
|
||||
if txn_deferrable {
|
||||
response = response.header(
|
||||
TXN_DEFERRABLE.clone(),
|
||||
HeaderValue::try_from(txn_deferrable.to_string())?,
|
||||
);
|
||||
}
|
||||
if let Some(txn_isolation_level) = txn_isolation_level_raw {
|
||||
response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
||||
}
|
||||
Ok(json!({ "results": results }))
|
||||
}
|
||||
};
|
||||
|
||||
let metrics = client.metrics();
|
||||
|
||||
// how could this possibly fail
|
||||
let body = serde_json::to_string(&result).expect("json serialization should not fail");
|
||||
let len = body.len();
|
||||
let response = response
|
||||
.body(Body::from(body))
|
||||
// only fails if invalid status code or invalid header/values are given.
|
||||
// these are not user configurable so it cannot fail dynamically
|
||||
.expect("building response payload should not fail");
|
||||
|
||||
// count the egress bytes - we miss the TLS and header overhead but oh well...
|
||||
// moving this later in the stack is going to be a lot of effort and ehhhh
|
||||
metrics.record_egress(len as u64);
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
async fn query_batch(
|
||||
transaction: &Transaction<'_>,
|
||||
queries: BatchQueryData,
|
||||
total_size: &mut usize,
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> anyhow::Result<Vec<Value>> {
|
||||
let mut results = Vec::with_capacity(queries.queries.len());
|
||||
let mut current_size = 0;
|
||||
for stmt in queries.queries {
|
||||
// TODO: maybe we should check that the transaction bit is set here
|
||||
let (_, values) =
|
||||
query_to_json(transaction, stmt, &mut current_size, raw_output, array_mode).await?;
|
||||
results.push(values);
|
||||
if allow_pool {
|
||||
let current_span = tracing::Span::current();
|
||||
// return connection to the pool
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _span = current_span.enter();
|
||||
let _ = conn_pool.put(&conn_info, client);
|
||||
});
|
||||
}
|
||||
|
||||
match result {
|
||||
Ok(value) => {
|
||||
// how could this possibly fail
|
||||
let body = serde_json::to_string(&value).expect("json serialization should not fail");
|
||||
let len = body.len();
|
||||
let response = response
|
||||
.body(Body::from(body))
|
||||
// only fails if invalid status code or invalid header/values are given.
|
||||
// these are not user configurable so it cannot fail dynamically
|
||||
.expect("building response payload should not fail");
|
||||
|
||||
// count the egress bytes - we miss the TLS and header overhead but oh well...
|
||||
// moving this later in the stack is going to be a lot of effort and ehhhh
|
||||
metrics.record_egress(len as u64);
|
||||
Ok(response)
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
*total_size += current_size;
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
async fn query_to_json<T: GenericClient>(
|
||||
@@ -433,9 +400,11 @@ async fn query_to_json<T: GenericClient>(
|
||||
current_size: &mut usize,
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> anyhow::Result<(ReadyForQueryStatus, Value)> {
|
||||
let query_params = json_to_pg_text(data.params);
|
||||
let row_stream = client.query_raw_txt(&data.query, query_params).await?;
|
||||
) -> anyhow::Result<Value> {
|
||||
let query_params = json_to_pg_text(data.params)?;
|
||||
let row_stream = client
|
||||
.query_raw_txt::<String, _>(data.query, query_params)
|
||||
.await?;
|
||||
|
||||
// Manually drain the stream into a vector to leave row_stream hanging
|
||||
// around to get a command tag. Also check that the response is not too
|
||||
@@ -455,8 +424,6 @@ async fn query_to_json<T: GenericClient>(
|
||||
}
|
||||
}
|
||||
|
||||
let ready = row_stream.ready_status();
|
||||
|
||||
// grab the command tag and number of rows affected
|
||||
let command_tag = row_stream.command_tag().unwrap_or_default();
|
||||
let mut command_tag_split = command_tag.split(' ');
|
||||
@@ -497,16 +464,13 @@ async fn query_to_json<T: GenericClient>(
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
// resulting JSON format is based on the format of node-postgres result
|
||||
Ok((
|
||||
ready,
|
||||
json!({
|
||||
"command": command_tag_name,
|
||||
"rowCount": command_tag_count,
|
||||
"rows": rows,
|
||||
"fields": fields,
|
||||
"rowAsArray": array_mode,
|
||||
}),
|
||||
))
|
||||
Ok(json!({
|
||||
"command": command_tag_name,
|
||||
"rowCount": command_tag_count,
|
||||
"rows": rows,
|
||||
"fields": fields,
|
||||
"rowAsArray": array_mode,
|
||||
}))
|
||||
}
|
||||
|
||||
//
|
||||
@@ -691,22 +655,22 @@ mod tests {
|
||||
#[test]
|
||||
fn test_atomic_types_to_pg_params() {
|
||||
let json = vec![Value::Bool(true), Value::Bool(false)];
|
||||
let pg_params = json_to_pg_text(json);
|
||||
let pg_params = json_to_pg_text(json).unwrap();
|
||||
assert_eq!(
|
||||
pg_params,
|
||||
vec![Some("true".to_owned()), Some("false".to_owned())]
|
||||
);
|
||||
|
||||
let json = vec![Value::Number(serde_json::Number::from(42))];
|
||||
let pg_params = json_to_pg_text(json);
|
||||
let pg_params = json_to_pg_text(json).unwrap();
|
||||
assert_eq!(pg_params, vec![Some("42".to_owned())]);
|
||||
|
||||
let json = vec![Value::String("foo\"".to_string())];
|
||||
let pg_params = json_to_pg_text(json);
|
||||
let pg_params = json_to_pg_text(json).unwrap();
|
||||
assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
|
||||
|
||||
let json = vec![Value::Null];
|
||||
let pg_params = json_to_pg_text(json);
|
||||
let pg_params = json_to_pg_text(json).unwrap();
|
||||
assert_eq!(pg_params, vec![None]);
|
||||
}
|
||||
|
||||
@@ -715,7 +679,7 @@ mod tests {
|
||||
// atoms and escaping
|
||||
let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
|
||||
let json: Value = serde_json::from_str(json).unwrap();
|
||||
let pg_params = json_to_pg_text(vec![json]);
|
||||
let pg_params = json_to_pg_text(vec![json]).unwrap();
|
||||
assert_eq!(
|
||||
pg_params,
|
||||
vec![Some(
|
||||
@@ -726,7 +690,7 @@ mod tests {
|
||||
// nested arrays
|
||||
let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]";
|
||||
let json: Value = serde_json::from_str(json).unwrap();
|
||||
let pg_params = json_to_pg_text(vec![json]);
|
||||
let pg_params = json_to_pg_text(vec![json]).unwrap();
|
||||
assert_eq!(
|
||||
pg_params,
|
||||
vec![Some(
|
||||
@@ -736,7 +700,7 @@ mod tests {
|
||||
// array of objects
|
||||
let json = r#"[{"foo": 1},{"bar": 2}]"#;
|
||||
let json: Value = serde_json::from_str(json).unwrap();
|
||||
let pg_params = json_to_pg_text(vec![json]);
|
||||
let pg_params = json_to_pg_text(vec![json]).unwrap();
|
||||
assert_eq!(
|
||||
pg_params,
|
||||
vec![Some(r#"{"{\"foo\":1}","{\"bar\":2}"}"#.to_owned())]
|
||||
|
||||
@@ -8,7 +8,6 @@ use crate::{
|
||||
NUM_CLIENT_CONNECTION_OPENED_COUNTER,
|
||||
},
|
||||
};
|
||||
use anyhow::bail;
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{Sink, Stream, StreamExt};
|
||||
use hyper::{
|
||||
@@ -23,6 +22,7 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
|
||||
use pin_project_lite::pin_project;
|
||||
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
future::ready,
|
||||
pin::Pin,
|
||||
sync::Arc,
|
||||
@@ -280,18 +280,12 @@ pub async fn task_main(
|
||||
let make_svc = hyper::service::make_service_fn(
|
||||
|stream: &tokio_rustls::server::TlsStream<WithClientIp<AddrStream>>| {
|
||||
let (io, tls) = stream.get_ref();
|
||||
let client_addr = io.client_addr();
|
||||
let remote_addr = io.inner.remote_addr();
|
||||
let peer_addr = io.client_addr().unwrap_or(io.inner.remote_addr());
|
||||
let sni_name = tls.server_name().map(|s| s.to_string());
|
||||
let conn_pool = conn_pool.clone();
|
||||
|
||||
async move {
|
||||
let peer_addr = match client_addr {
|
||||
Some(addr) => addr,
|
||||
None if config.require_client_ip => bail!("missing required client ip"),
|
||||
None => remote_addr,
|
||||
};
|
||||
Ok(MetricService::new(hyper::service::service_fn(
|
||||
Ok::<_, Infallible>(MetricService::new(hyper::service::service_fn(
|
||||
move |req: Request<Body>| {
|
||||
let sni_name = sni_name.clone();
|
||||
let conn_pool = conn_pool.clone();
|
||||
|
||||
@@ -200,8 +200,6 @@ pub async fn task_main(
|
||||
let mut socket = WithClientIp::new(socket);
|
||||
if let Some(ip) = socket.wait_for_addr().await? {
|
||||
tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
|
||||
} else if config.require_client_ip {
|
||||
bail!("missing required client IP");
|
||||
}
|
||||
|
||||
socket
|
||||
|
||||
@@ -31,7 +31,7 @@ impl<'a> FirstMessage<'a> {
|
||||
|
||||
/// A single SASL message.
|
||||
/// This struct is deliberately decoupled from lower-level
|
||||
/// [`BeAuthenticationSaslMessage`].
|
||||
/// [`BeAuthenticationSaslMessage`](pq_proto::BeAuthenticationSaslMessage).
|
||||
#[derive(Debug)]
|
||||
pub(super) enum ServerMessage<T> {
|
||||
/// We expect to see more steps.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.73.0"
|
||||
channel = "1.72.1"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
//
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::{ArgAction, Parser};
|
||||
use clap::Parser;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
@@ -105,9 +105,6 @@ struct Args {
|
||||
/// it during this period passed as a human readable duration.
|
||||
#[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)]
|
||||
heartbeat_timeout: Duration,
|
||||
/// Enable/disable peer recovery.
|
||||
#[arg(long, default_value = "false", action=ArgAction::Set)]
|
||||
peer_recovery: bool,
|
||||
/// Remote storage configuration for WAL backup (offloading to s3) as TOML
|
||||
/// inline table, e.g.
|
||||
/// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "<BUCKETNAME>", "bucket_region":"<REGION>", "concurrency_limit": 119}
|
||||
@@ -268,7 +265,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
broker_endpoint: args.broker_endpoint,
|
||||
broker_keepalive_interval: args.broker_keepalive_interval,
|
||||
heartbeat_timeout: args.heartbeat_timeout,
|
||||
peer_recovery_enabled: args.peer_recovery,
|
||||
remote_storage: args.remote_storage,
|
||||
max_offloader_lag_bytes: args.max_offloader_lag,
|
||||
wal_backup_enabled: !args.disable_wal_backup,
|
||||
|
||||
@@ -372,13 +372,6 @@ impl SafekeeperPostgresHandler {
|
||||
/// from a walproposer recovery function. This connection gets a special handling:
|
||||
/// safekeeper must stream all local WAL till the flush_lsn, whether committed or not.
|
||||
pub fn is_walproposer_recovery(&self) -> bool {
|
||||
match &self.appname {
|
||||
None => false,
|
||||
Some(appname) => {
|
||||
appname == "wal_proposer_recovery" ||
|
||||
// set by safekeeper peer recovery
|
||||
appname.starts_with("safekeeper")
|
||||
}
|
||||
}
|
||||
self.appname == Some("wal_proposer_recovery".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,8 +16,8 @@ use tokio::io::AsyncReadExt;
|
||||
use utils::http::endpoint::request_span;
|
||||
|
||||
use crate::receive_wal::WalReceiverState;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::safekeeper::Term;
|
||||
use crate::safekeeper::{ServerInfo, TermLsn};
|
||||
use crate::send_wal::WalSenderState;
|
||||
use crate::timeline::PeerInfo;
|
||||
use crate::{debug_dump, pull_timeline};
|
||||
@@ -60,25 +60,16 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Same as TermLsn, but serializes LSN using display serializer
|
||||
/// Same as TermSwitchEntry, but serializes LSN using display serializer
|
||||
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TermSwitchApiEntry {
|
||||
pub term: Term,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl From<TermSwitchApiEntry> for TermLsn {
|
||||
fn from(api_val: TermSwitchApiEntry) -> Self {
|
||||
TermLsn {
|
||||
term: api_val.term,
|
||||
lsn: api_val.lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Augment AcceptorState with epoch for convenience
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct AcceptorStateStatus {
|
||||
|
||||
@@ -62,7 +62,6 @@ pub struct SafeKeeperConf {
|
||||
pub broker_endpoint: Uri,
|
||||
pub broker_keepalive_interval: Duration,
|
||||
pub heartbeat_timeout: Duration,
|
||||
pub peer_recovery_enabled: bool,
|
||||
pub remote_storage: Option<RemoteStorageConfig>,
|
||||
pub max_offloader_lag_bytes: u64,
|
||||
pub backup_parallel_jobs: usize,
|
||||
@@ -101,7 +100,6 @@ impl SafeKeeperConf {
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint"),
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
peer_recovery_enabled: true,
|
||||
wal_backup_enabled: true,
|
||||
backup_parallel_jobs: 1,
|
||||
pg_auth: None,
|
||||
|
||||
@@ -55,12 +55,9 @@ impl WalReceivers {
|
||||
|
||||
/// Register new walreceiver. Returned guard provides access to the slot and
|
||||
/// automatically deregisters in Drop.
|
||||
pub fn register(self: &Arc<WalReceivers>, conn_id: Option<ConnectionId>) -> WalReceiverGuard {
|
||||
pub fn register(self: &Arc<WalReceivers>) -> WalReceiverGuard {
|
||||
let slots = &mut self.mutex.lock().slots;
|
||||
let walreceiver = WalReceiverState {
|
||||
conn_id,
|
||||
status: WalReceiverStatus::Voting,
|
||||
};
|
||||
let walreceiver = WalReceiverState::Voting;
|
||||
// find empty slot or create new one
|
||||
let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) {
|
||||
slots[pos] = Some(walreceiver);
|
||||
@@ -99,18 +96,6 @@ impl WalReceivers {
|
||||
self.mutex.lock().slots.iter().flatten().cloned().collect()
|
||||
}
|
||||
|
||||
/// Get number of streaming walreceivers (normally 0 or 1) from compute.
|
||||
pub fn get_num_streaming(self: &Arc<WalReceivers>) -> usize {
|
||||
self.mutex
|
||||
.lock()
|
||||
.slots
|
||||
.iter()
|
||||
.flatten()
|
||||
// conn_id.is_none skips recovery which also registers here
|
||||
.filter(|s| s.conn_id.is_some() && matches!(s.status, WalReceiverStatus::Streaming))
|
||||
.count()
|
||||
}
|
||||
|
||||
/// Unregister walsender.
|
||||
fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
|
||||
let mut shared = self.mutex.lock();
|
||||
@@ -123,17 +108,10 @@ struct WalReceiversShared {
|
||||
slots: Vec<Option<WalReceiverState>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalReceiverState {
|
||||
/// None means it is recovery initiated by us (this safekeeper).
|
||||
pub conn_id: Option<ConnectionId>,
|
||||
pub status: WalReceiverStatus,
|
||||
}
|
||||
|
||||
/// Walreceiver status. Currently only whether it passed voting stage and
|
||||
/// started receiving the stream, but it is easy to add more if needed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum WalReceiverStatus {
|
||||
pub enum WalReceiverState {
|
||||
Voting,
|
||||
Streaming,
|
||||
}
|
||||
@@ -158,8 +136,8 @@ impl Drop for WalReceiverGuard {
|
||||
}
|
||||
}
|
||||
|
||||
pub const MSG_QUEUE_SIZE: usize = 256;
|
||||
pub const REPLY_QUEUE_SIZE: usize = 16;
|
||||
const MSG_QUEUE_SIZE: usize = 256;
|
||||
const REPLY_QUEUE_SIZE: usize = 16;
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
/// Wrapper around handle_start_wal_push_guts handling result. Error is
|
||||
@@ -283,7 +261,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
tli.clone(),
|
||||
msg_rx,
|
||||
reply_tx,
|
||||
Some(self.conn_id),
|
||||
self.conn_id,
|
||||
));
|
||||
|
||||
// Forward all messages to WalAcceptor
|
||||
@@ -339,41 +317,31 @@ async fn network_write<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
// even when it writes a steady stream of messages.
|
||||
const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1);
|
||||
|
||||
/// Encapsulates a task which takes messages from msg_rx, processes and pushes
|
||||
/// replies to reply_tx; reading from socket and writing to disk in parallel is
|
||||
/// beneficial for performance, this struct provides writing to disk part.
|
||||
pub struct WalAcceptor {
|
||||
/// Takes messages from msg_rx, processes and pushes replies to reply_tx.
|
||||
struct WalAcceptor {
|
||||
tli: Arc<Timeline>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: Option<ConnectionId>,
|
||||
}
|
||||
|
||||
impl WalAcceptor {
|
||||
/// Spawn task with WalAcceptor running, return handle to it. Task returns
|
||||
/// Ok(()) if either of channels has closed, and Err if any error during
|
||||
/// message processing is encountered.
|
||||
///
|
||||
/// conn_id None means WalAcceptor is used by recovery initiated at this safekeeper.
|
||||
pub fn spawn(
|
||||
/// Spawn thread with WalAcceptor running, return handle to it.
|
||||
fn spawn(
|
||||
tli: Arc<Timeline>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: Option<ConnectionId>,
|
||||
conn_id: ConnectionId,
|
||||
) -> JoinHandle<anyhow::Result<()>> {
|
||||
task::spawn(async move {
|
||||
let mut wa = WalAcceptor {
|
||||
tli,
|
||||
msg_rx,
|
||||
reply_tx,
|
||||
conn_id,
|
||||
};
|
||||
|
||||
let span_ttid = wa.tli.ttid; // satisfy borrow checker
|
||||
wa.run()
|
||||
.instrument(
|
||||
info_span!("WAL acceptor", cid = %conn_id.unwrap_or(0), ttid = %span_ttid),
|
||||
)
|
||||
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
|
||||
.await
|
||||
})
|
||||
}
|
||||
@@ -387,7 +355,7 @@ impl WalAcceptor {
|
||||
let _compute_conn_guard = ComputeConnectionGuard {
|
||||
timeline: Arc::clone(&self.tli),
|
||||
};
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id);
|
||||
let walreceiver_guard = self.tli.get_walreceivers().register();
|
||||
self.tli.update_status_notify().await?;
|
||||
|
||||
// After this timestamp we will stop processing AppendRequests and send a response
|
||||
@@ -404,7 +372,7 @@ impl WalAcceptor {
|
||||
|
||||
// Update walreceiver state in shmem for reporting.
|
||||
if let ProposerAcceptorMessage::Elected(_) = &next_msg {
|
||||
walreceiver_guard.get().status = WalReceiverStatus::Streaming;
|
||||
*walreceiver_guard.get() = WalReceiverState::Streaming;
|
||||
}
|
||||
|
||||
let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) {
|
||||
|
||||
@@ -1,41 +1,17 @@
|
||||
//! This module implements pulling WAL from peer safekeepers if compute can't
|
||||
//! provide it, i.e. safekeeper lags too much.
|
||||
|
||||
use std::time::SystemTime;
|
||||
use std::{fmt, pin::pin, sync::Arc};
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use futures::StreamExt;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use tokio::sync::mpsc::{channel, Receiver, Sender};
|
||||
use tokio::time::timeout;
|
||||
use tokio::{
|
||||
select,
|
||||
time::sleep,
|
||||
time::{self, Duration},
|
||||
};
|
||||
use tokio_postgres::replication::ReplicationStream;
|
||||
use tokio_postgres::types::PgLsn;
|
||||
use tracing::*;
|
||||
use utils::{id::NodeId, lsn::Lsn, postgres_client::wal_stream_connection_config};
|
||||
use tokio::{select, time::sleep, time::Duration};
|
||||
use tracing::{info, instrument};
|
||||
|
||||
use crate::receive_wal::{WalAcceptor, REPLY_QUEUE_SIZE};
|
||||
use crate::safekeeper::{AppendRequest, AppendRequestHeader};
|
||||
use crate::{
|
||||
http::routes::TimelineStatus,
|
||||
receive_wal::MSG_QUEUE_SIZE,
|
||||
safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, ProposerElected, Term, TermHistory,
|
||||
TermLsn, VoteRequest,
|
||||
},
|
||||
timeline::{PeerInfo, Timeline},
|
||||
SafeKeeperConf,
|
||||
};
|
||||
use crate::{timeline::Timeline, SafeKeeperConf};
|
||||
|
||||
/// Entrypoint for per timeline task which always runs, checking whether
|
||||
/// recovery for this safekeeper is needed and starting it if so.
|
||||
#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))]
|
||||
pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
pub async fn recovery_main(tli: Arc<Timeline>, _conf: SafeKeeperConf) {
|
||||
info!("started");
|
||||
let mut cancellation_rx = match tli.get_cancellation_rx() {
|
||||
Ok(rx) => rx,
|
||||
@@ -46,387 +22,19 @@ pub async fn recovery_main(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
};
|
||||
|
||||
select! {
|
||||
_ = recovery_main_loop(tli, conf) => { unreachable!() }
|
||||
_ = recovery_main_loop(tli) => { unreachable!() }
|
||||
_ = cancellation_rx.changed() => {
|
||||
info!("stopped");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of Timeline::recovery_needed, contains donor(s) if recovery needed and
|
||||
/// fields to explain the choice.
|
||||
#[derive(Debug)]
|
||||
pub struct RecoveryNeededInfo {
|
||||
/// my term
|
||||
pub term: Term,
|
||||
/// my last_log_term
|
||||
pub last_log_term: Term,
|
||||
/// my flush_lsn
|
||||
pub flush_lsn: Lsn,
|
||||
/// peers from which we can fetch WAL, for observability.
|
||||
pub peers: Vec<PeerInfo>,
|
||||
/// for observability
|
||||
pub num_streaming_computes: usize,
|
||||
pub donors: Vec<Donor>,
|
||||
}
|
||||
|
||||
// Custom to omit not important fields from PeerInfo.
|
||||
impl fmt::Display for RecoveryNeededInfo {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "{{")?;
|
||||
write!(
|
||||
f,
|
||||
"term: {}, last_log_term: {}, flush_lsn: {}, peers: {{",
|
||||
self.term, self.last_log_term, self.flush_lsn
|
||||
)?;
|
||||
for p in self.peers.iter() {
|
||||
write!(
|
||||
f,
|
||||
"PeerInfo {{ sk_id: {}, term: {}, last_log_term: {}, flush_lsn: {} }}, ",
|
||||
p.sk_id, p.term, p.last_log_term, p.flush_lsn
|
||||
)?;
|
||||
}
|
||||
write!(
|
||||
f,
|
||||
"}} num_streaming_computes: {}, donors: {:?}",
|
||||
self.num_streaming_computes, self.donors
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Donor {
|
||||
pub sk_id: NodeId,
|
||||
/// equals to last_log_term
|
||||
pub term: Term,
|
||||
pub flush_lsn: Lsn,
|
||||
pub pg_connstr: String,
|
||||
pub http_connstr: String,
|
||||
}
|
||||
|
||||
impl From<&PeerInfo> for Donor {
|
||||
fn from(p: &PeerInfo) -> Self {
|
||||
Donor {
|
||||
sk_id: p.sk_id,
|
||||
term: p.term,
|
||||
flush_lsn: p.flush_lsn,
|
||||
pg_connstr: p.pg_connstr.clone(),
|
||||
http_connstr: p.http_connstr.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const CHECK_INTERVAL_MS: u64 = 2000;
|
||||
|
||||
/// Check regularly whether we need to start recovery.
|
||||
async fn recovery_main_loop(tli: Arc<Timeline>, conf: SafeKeeperConf) {
|
||||
async fn recovery_main_loop(_tli: Arc<Timeline>) {
|
||||
let check_duration = Duration::from_millis(CHECK_INTERVAL_MS);
|
||||
loop {
|
||||
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
|
||||
match recovery_needed_info.donors.first() {
|
||||
Some(donor) => {
|
||||
info!(
|
||||
"starting recovery from donor {}: {}",
|
||||
donor.sk_id, recovery_needed_info
|
||||
);
|
||||
match recover(tli.clone(), donor, &conf).await {
|
||||
// Note: 'write_wal rewrites WAL written before' error is
|
||||
// expected here and might happen if compute and recovery
|
||||
// concurrently write the same data. Eventually compute
|
||||
// should win.
|
||||
Err(e) => warn!("recovery failed: {:#}", e),
|
||||
Ok(msg) => info!("recovery finished: {}", msg),
|
||||
}
|
||||
}
|
||||
None => {
|
||||
trace!(
|
||||
"recovery not needed or not possible: {}",
|
||||
recovery_needed_info
|
||||
);
|
||||
}
|
||||
}
|
||||
sleep(check_duration).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Recover from the specified donor. Returns message explaining normal finish
|
||||
/// reason or error.
|
||||
async fn recover(
|
||||
tli: Arc<Timeline>,
|
||||
donor: &Donor,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> anyhow::Result<String> {
|
||||
// Learn donor term switch history to figure out starting point.
|
||||
let client = reqwest::Client::new();
|
||||
let timeline_info: TimelineStatus = client
|
||||
.get(format!(
|
||||
"http://{}/v1/tenant/{}/timeline/{}",
|
||||
donor.http_connstr, tli.ttid.tenant_id, tli.ttid.timeline_id
|
||||
))
|
||||
.send()
|
||||
.await?
|
||||
.json()
|
||||
.await?;
|
||||
if timeline_info.acceptor_state.term != donor.term {
|
||||
bail!(
|
||||
"donor term changed from {} to {}",
|
||||
donor.term,
|
||||
timeline_info.acceptor_state.term
|
||||
);
|
||||
}
|
||||
// convert from API TermSwitchApiEntry into TermLsn.
|
||||
let donor_th = TermHistory(
|
||||
timeline_info
|
||||
.acceptor_state
|
||||
.term_history
|
||||
.iter()
|
||||
.map(|tl| Into::<TermLsn>::into(*tl))
|
||||
.collect(),
|
||||
);
|
||||
|
||||
// Now understand our term history.
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: donor.term });
|
||||
let vote_response = match tli
|
||||
.process_msg(&vote_request)
|
||||
.await
|
||||
.context("VoteRequest handling")?
|
||||
{
|
||||
Some(AcceptorProposerMessage::VoteResponse(vr)) => vr,
|
||||
_ => {
|
||||
bail!("unexpected VoteRequest response"); // unreachable
|
||||
}
|
||||
};
|
||||
if vote_response.term != donor.term {
|
||||
bail!(
|
||||
"our term changed from {} to {}",
|
||||
donor.term,
|
||||
vote_response.term
|
||||
);
|
||||
}
|
||||
|
||||
let last_common_point = match TermHistory::find_highest_common_point(
|
||||
&donor_th,
|
||||
&vote_response.term_history,
|
||||
vote_response.flush_lsn,
|
||||
) {
|
||||
None => bail!(
|
||||
"couldn't find common point in histories, donor {:?}, sk {:?}",
|
||||
donor_th,
|
||||
vote_response.term_history,
|
||||
),
|
||||
Some(lcp) => lcp,
|
||||
};
|
||||
info!("found last common point at {:?}", last_common_point);
|
||||
|
||||
// truncate WAL locally
|
||||
let pe = ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
term: donor.term,
|
||||
start_streaming_at: last_common_point.lsn,
|
||||
term_history: donor_th,
|
||||
timeline_start_lsn: Lsn::INVALID,
|
||||
});
|
||||
// Successful ProposerElected handling always returns None. If term changed,
|
||||
// we'll find out that during the streaming. Note: it is expected to get
|
||||
// 'refusing to overwrite correct WAL' here if walproposer reconnected
|
||||
// concurrently, restart helps here.
|
||||
tli.process_msg(&pe)
|
||||
.await
|
||||
.context("ProposerElected handling")?;
|
||||
|
||||
recovery_stream(tli, donor, last_common_point.lsn, conf).await
|
||||
}
|
||||
|
||||
// Pull WAL from donor, assuming handshake is already done.
|
||||
async fn recovery_stream(
|
||||
tli: Arc<Timeline>,
|
||||
donor: &Donor,
|
||||
start_streaming_at: Lsn,
|
||||
conf: &SafeKeeperConf,
|
||||
) -> anyhow::Result<String> {
|
||||
// TODO: pass auth token
|
||||
let cfg = wal_stream_connection_config(tli.ttid, &donor.pg_connstr, None, None)?;
|
||||
let mut cfg = cfg.to_tokio_postgres_config();
|
||||
// It will make safekeeper give out not committed WAL (up to flush_lsn).
|
||||
cfg.application_name(&format!("safekeeper_{}", conf.my_id));
|
||||
cfg.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
|
||||
|
||||
let connect_timeout = Duration::from_millis(10000);
|
||||
let (client, connection) = match time::timeout(connect_timeout, cfg.connect(postgres::NoTls))
|
||||
.await
|
||||
{
|
||||
Ok(client_and_conn) => client_and_conn?,
|
||||
Err(_elapsed) => {
|
||||
bail!("timed out while waiting {connect_timeout:?} for connection to peer safekeeper to open");
|
||||
}
|
||||
};
|
||||
trace!("connected to {:?}", donor);
|
||||
|
||||
// The connection object performs the actual communication with the
|
||||
// server, spawn it off to run on its own.
|
||||
let ttid = tli.ttid;
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection
|
||||
.instrument(info_span!("recovery task connection poll", ttid = %ttid))
|
||||
.await
|
||||
{
|
||||
// This logging isn't very useful as error is anyway forwarded to client.
|
||||
trace!(
|
||||
"tokio_postgres connection object finished with error: {}",
|
||||
e
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
let query = format!(
|
||||
"START_REPLICATION PHYSICAL {} (term='{}')",
|
||||
start_streaming_at, donor.term
|
||||
);
|
||||
|
||||
let copy_stream = client.copy_both_simple(&query).await?;
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
|
||||
// As in normal walreceiver, do networking and writing to disk in parallel.
|
||||
let (msg_tx, msg_rx) = channel(MSG_QUEUE_SIZE);
|
||||
let (reply_tx, reply_rx) = channel(REPLY_QUEUE_SIZE);
|
||||
let wa = WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, None);
|
||||
|
||||
let res = tokio::select! {
|
||||
r = network_io(physical_stream, msg_tx, donor.clone(), tli.clone(), conf.clone()) => r,
|
||||
r = read_replies(reply_rx, donor.term) => r.map(|()| None),
|
||||
};
|
||||
|
||||
// Join the spawned WalAcceptor. At this point chans to/from it passed to
|
||||
// network routines are dropped, so it will exit as soon as it touches them.
|
||||
match wa.await {
|
||||
Ok(Ok(())) => {
|
||||
// WalAcceptor finished normally, termination reason is different
|
||||
match res {
|
||||
Ok(Some(success_desc)) => Ok(success_desc),
|
||||
Ok(None) => bail!("unexpected recovery end without error/success"), // can't happen
|
||||
Err(e) => Err(e), // network error or term change
|
||||
}
|
||||
}
|
||||
Ok(Err(e)) => Err(e), // error while processing message
|
||||
Err(e) => bail!("WalAcceptor panicked: {}", e),
|
||||
}
|
||||
}
|
||||
|
||||
// Perform network part of streaming: read data and push it to msg_tx, send KA
|
||||
// to make sender hear from us. If there is nothing coming for a while, check
|
||||
// for termination.
|
||||
// Returns
|
||||
// - Ok(None) if channel to WalAcceptor closed -- its task should return error.
|
||||
// - Ok(Some(String)) if recovery successfully completed.
|
||||
// - Err if error happened while reading/writing to socket.
|
||||
async fn network_io(
|
||||
physical_stream: ReplicationStream,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
donor: Donor,
|
||||
tli: Arc<Timeline>,
|
||||
conf: SafeKeeperConf,
|
||||
) -> anyhow::Result<Option<String>> {
|
||||
let mut physical_stream = pin!(physical_stream);
|
||||
let mut last_received_lsn = Lsn::INVALID;
|
||||
// tear down connection if no data arrives withing this period
|
||||
let no_data_timeout = Duration::from_millis(30000);
|
||||
|
||||
loop {
|
||||
let msg = match timeout(no_data_timeout, physical_stream.next()).await {
|
||||
Ok(next) => match next {
|
||||
None => bail!("unexpected end of replication stream"),
|
||||
Some(msg) => msg.context("get replication message")?,
|
||||
},
|
||||
Err(_) => bail!("no message received within {:?}", no_data_timeout),
|
||||
};
|
||||
|
||||
match msg {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
let ar_hdr = AppendRequestHeader {
|
||||
term: donor.term,
|
||||
epoch_start_lsn: Lsn::INVALID, // unused
|
||||
begin_lsn: Lsn(xlog_data.wal_start()),
|
||||
end_lsn: Lsn(xlog_data.wal_start()) + xlog_data.data().len() as u64,
|
||||
commit_lsn: Lsn::INVALID, // do not attempt to advance, peer communication anyway does it
|
||||
truncate_lsn: Lsn::INVALID, // do not attempt to advance
|
||||
proposer_uuid: [0; 16],
|
||||
};
|
||||
let ar = AppendRequest {
|
||||
h: ar_hdr,
|
||||
wal_data: xlog_data.into_data(),
|
||||
};
|
||||
trace!(
|
||||
"processing AppendRequest {}-{}, len {}",
|
||||
ar.h.begin_lsn,
|
||||
ar.h.end_lsn,
|
||||
ar.wal_data.len()
|
||||
);
|
||||
last_received_lsn = ar.h.end_lsn;
|
||||
if msg_tx
|
||||
.send(ProposerAcceptorMessage::AppendRequest(ar))
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
return Ok(None); // chan closed, WalAcceptor terminated
|
||||
}
|
||||
}
|
||||
ReplicationMessage::PrimaryKeepAlive(_) => {
|
||||
// keepalive means nothing is being streamed for a while. Check whether we need to stop.
|
||||
let recovery_needed_info = tli.recovery_needed(conf.heartbeat_timeout).await;
|
||||
// do current donors still contain one we currently connected to?
|
||||
if !recovery_needed_info
|
||||
.donors
|
||||
.iter()
|
||||
.any(|d| d.sk_id == donor.sk_id)
|
||||
{
|
||||
// Most likely it means we are caughtup.
|
||||
// note: just exiting makes tokio_postgres send CopyFail to the far end.
|
||||
return Ok(Some(format!(
|
||||
"terminating at {} as connected safekeeper {} with term {} is not a donor anymore: {}",
|
||||
last_received_lsn, donor.sk_id, donor.term, recovery_needed_info
|
||||
)));
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
// Send reply to each message to keep connection alive. Ideally we
|
||||
// should do that once in a while instead, but this again requires
|
||||
// stream split or similar workaround, and recovery is anyway not that
|
||||
// performance critical.
|
||||
//
|
||||
// We do not know here real write/flush LSNs (need to take mutex again
|
||||
// or check replies which are read in different future), but neither
|
||||
// sender much cares about them, so just send last received.
|
||||
physical_stream
|
||||
.as_mut()
|
||||
.standby_status_update(
|
||||
PgLsn::from(last_received_lsn.0),
|
||||
PgLsn::from(last_received_lsn.0),
|
||||
PgLsn::from(last_received_lsn.0),
|
||||
SystemTime::now(),
|
||||
0,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Read replies from WalAcceptor. We are not interested much in sending them to
|
||||
// donor safekeeper, so don't route them anywhere. However, we should check if
|
||||
// term changes and exit if it does.
|
||||
// Returns Ok(()) if channel closed, Err in case of term change.
|
||||
async fn read_replies(
|
||||
mut reply_rx: Receiver<AcceptorProposerMessage>,
|
||||
donor_term: Term,
|
||||
) -> anyhow::Result<()> {
|
||||
loop {
|
||||
match reply_rx.recv().await {
|
||||
Some(msg) => {
|
||||
if let AcceptorProposerMessage::AppendResponse(ar) = msg {
|
||||
if ar.term != donor_term {
|
||||
bail!("donor term changed from {} to {}", donor_term, ar.term);
|
||||
}
|
||||
}
|
||||
}
|
||||
None => return Ok(()), // chan closed, WalAcceptor terminated
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,69 +91,6 @@ impl TermHistory {
|
||||
}
|
||||
TermHistory(res)
|
||||
}
|
||||
|
||||
/// Find point of divergence between leader (walproposer) term history and
|
||||
/// safekeeper. Arguments are not symmetrics as proposer history ends at
|
||||
/// +infinity while safekeeper at flush_lsn.
|
||||
/// C version is at walproposer SendProposerElected.
|
||||
pub fn find_highest_common_point(
|
||||
prop_th: &TermHistory,
|
||||
sk_th: &TermHistory,
|
||||
sk_wal_end: Lsn,
|
||||
) -> Option<TermLsn> {
|
||||
let (prop_th, sk_th) = (&prop_th.0, &sk_th.0); // avoid .0 below
|
||||
|
||||
if let Some(sk_th_last) = sk_th.last() {
|
||||
assert!(
|
||||
sk_th_last.lsn <= sk_wal_end,
|
||||
"safekeeper term history end {:?} LSN is higher than WAL end {:?}",
|
||||
sk_th_last,
|
||||
sk_wal_end
|
||||
);
|
||||
}
|
||||
|
||||
// find last common term, if any...
|
||||
let mut last_common_idx = None;
|
||||
for i in 0..min(sk_th.len(), prop_th.len()) {
|
||||
if prop_th[i].term != sk_th[i].term {
|
||||
break;
|
||||
}
|
||||
// If term is the same, LSN must be equal as well.
|
||||
assert!(
|
||||
prop_th[i].lsn == sk_th[i].lsn,
|
||||
"same term {} has different start LSNs: prop {}, sk {}",
|
||||
prop_th[i].term,
|
||||
prop_th[i].lsn,
|
||||
sk_th[i].lsn
|
||||
);
|
||||
last_common_idx = Some(i);
|
||||
}
|
||||
let last_common_idx = match last_common_idx {
|
||||
None => return None, // no common point
|
||||
Some(lci) => lci,
|
||||
};
|
||||
// Now find where it ends at both prop and sk and take min. End of
|
||||
// (common) term is the start of the next except it is the last one;
|
||||
// there it is flush_lsn in case of safekeeper or, in case of proposer
|
||||
// +infinity, so we just take flush_lsn then.
|
||||
if last_common_idx == prop_th.len() - 1 {
|
||||
Some(TermLsn {
|
||||
term: prop_th[last_common_idx].term,
|
||||
lsn: sk_wal_end,
|
||||
})
|
||||
} else {
|
||||
let prop_common_term_end = prop_th[last_common_idx + 1].lsn;
|
||||
let sk_common_term_end = if last_common_idx + 1 < sk_th.len() {
|
||||
sk_th[last_common_idx + 1].lsn
|
||||
} else {
|
||||
sk_wal_end
|
||||
};
|
||||
Some(TermLsn {
|
||||
term: prop_th[last_common_idx].term,
|
||||
lsn: min(prop_common_term_end, sk_common_term_end),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Display only latest entries for Debug.
|
||||
@@ -368,19 +305,19 @@ pub struct AcceptorGreeting {
|
||||
/// Vote request sent from proposer to safekeepers
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct VoteRequest {
|
||||
pub term: Term,
|
||||
term: Term,
|
||||
}
|
||||
|
||||
/// Vote itself, sent from safekeeper to proposer
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct VoteResponse {
|
||||
pub term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
vote_given: u64, // fixme u64 due to padding
|
||||
// Safekeeper flush_lsn (end of WAL) + history of term switches allow
|
||||
// proposer to choose the most advanced one.
|
||||
pub flush_lsn: Lsn,
|
||||
flush_lsn: Lsn,
|
||||
truncate_lsn: Lsn,
|
||||
pub term_history: TermHistory,
|
||||
term_history: TermHistory,
|
||||
timeline_start_lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -407,8 +344,7 @@ pub struct AppendRequest {
|
||||
pub struct AppendRequestHeader {
|
||||
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
pub term: Term,
|
||||
// TODO: remove this field, it in unused -- LSN of term switch can be taken
|
||||
// from ProposerElected (as well as from term history).
|
||||
// LSN since the proposer appends WAL; determines epoch switch point.
|
||||
pub epoch_start_lsn: Lsn,
|
||||
/// start position of message in WAL
|
||||
pub begin_lsn: Lsn,
|
||||
@@ -823,7 +759,7 @@ where
|
||||
bail!("refusing ProposerElected which is going to overwrite correct WAL: term={}, flush_lsn={}, start_streaming_at={}; restarting the handshake should help",
|
||||
msg.term, self.flush_lsn(), msg.start_streaming_at)
|
||||
}
|
||||
// Otherwise we must never attempt to truncate committed data.
|
||||
// Otherwise this shouldn't happen.
|
||||
assert!(
|
||||
msg.start_streaming_at >= self.inmem.commit_lsn,
|
||||
"attempt to truncate committed data: start_streaming_at={}, commit_lsn={}",
|
||||
@@ -874,14 +810,6 @@ where
|
||||
|
||||
info!("start receiving WAL since {:?}", msg.start_streaming_at);
|
||||
|
||||
// Cache LSN where term starts to immediately fsync control file with
|
||||
// commit_lsn once we reach it -- sync-safekeepers finishes when
|
||||
// persisted commit_lsn on majority of safekeepers aligns.
|
||||
self.epoch_start_lsn = match msg.term_history.0.last() {
|
||||
None => bail!("proposer elected with empty term history"),
|
||||
Some(term_lsn_start) => term_lsn_start.lsn,
|
||||
};
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
@@ -907,7 +835,10 @@ where
|
||||
// file: walproposer in sync mode is very interested when this
|
||||
// happens. Note: this is for sync-safekeepers mode only, as
|
||||
// otherwise commit_lsn might jump over epoch_start_lsn.
|
||||
if commit_lsn >= self.epoch_start_lsn && self.state.commit_lsn < self.epoch_start_lsn {
|
||||
// Also note that commit_lsn can reach epoch_start_lsn earlier
|
||||
// that we receive new epoch_start_lsn, and we still need to sync
|
||||
// control file in this case.
|
||||
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
|
||||
self.persist_control_file(self.state.clone()).await?;
|
||||
}
|
||||
|
||||
@@ -971,6 +902,7 @@ where
|
||||
// Now we know that we are in the same term as the proposer,
|
||||
// processing the message.
|
||||
|
||||
self.epoch_start_lsn = msg.h.epoch_start_lsn;
|
||||
self.inmem.proposer_uuid = msg.h.proposer_uuid;
|
||||
|
||||
// do the job
|
||||
@@ -1253,65 +1185,4 @@ mod tests {
|
||||
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
|
||||
assert_eq!(sk.get_epoch(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_highest_common_point_none() {
|
||||
let prop_th = TermHistory(vec![(0, Lsn(1)).into()]);
|
||||
let sk_th = TermHistory(vec![(1, Lsn(1)).into(), (2, Lsn(2)).into()]);
|
||||
assert_eq!(
|
||||
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(3),),
|
||||
None
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_highest_common_point_middle() {
|
||||
let prop_th = TermHistory(vec![
|
||||
(1, Lsn(10)).into(),
|
||||
(2, Lsn(20)).into(),
|
||||
(4, Lsn(40)).into(),
|
||||
]);
|
||||
let sk_th = TermHistory(vec![
|
||||
(1, Lsn(10)).into(),
|
||||
(2, Lsn(20)).into(),
|
||||
(3, Lsn(30)).into(), // sk ends last common term 2 at 30
|
||||
]);
|
||||
assert_eq!(
|
||||
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(40),),
|
||||
Some(TermLsn {
|
||||
term: 2,
|
||||
lsn: Lsn(30),
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_highest_common_point_sk_end() {
|
||||
let prop_th = TermHistory(vec![
|
||||
(1, Lsn(10)).into(),
|
||||
(2, Lsn(20)).into(), // last common term 2, sk will end it at 32 sk_end_lsn
|
||||
(4, Lsn(40)).into(),
|
||||
]);
|
||||
let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
|
||||
assert_eq!(
|
||||
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32),),
|
||||
Some(TermLsn {
|
||||
term: 2,
|
||||
lsn: Lsn(32),
|
||||
})
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_highest_common_point_walprop() {
|
||||
let prop_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
|
||||
let sk_th = TermHistory(vec![(1, Lsn(10)).into(), (2, Lsn(20)).into()]);
|
||||
assert_eq!(
|
||||
TermHistory::find_highest_common_point(&prop_th, &sk_th, Lsn(32),),
|
||||
Some(TermLsn {
|
||||
term: 2,
|
||||
lsn: Lsn(32),
|
||||
})
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -418,11 +418,10 @@ impl SafekeeperPostgresHandler {
|
||||
}
|
||||
|
||||
info!(
|
||||
"starting streaming from {:?}, available WAL ends at {}, recovery={}, appname={:?}",
|
||||
"starting streaming from {:?}, available WAL ends at {}, recovery={}",
|
||||
start_pos,
|
||||
end_pos,
|
||||
matches!(end_watch, EndWatch::Flush(_)),
|
||||
appname
|
||||
matches!(end_watch, EndWatch::Flush(_))
|
||||
);
|
||||
|
||||
// switch to copy
|
||||
|
||||
@@ -11,7 +11,6 @@ use tokio::fs;
|
||||
use serde_with::DisplayFromStr;
|
||||
use std::cmp::max;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::{Mutex, MutexGuard};
|
||||
use tokio::{
|
||||
sync::{mpsc::Sender, watch},
|
||||
@@ -28,7 +27,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
|
||||
use crate::receive_wal::WalReceivers;
|
||||
use crate::recovery::{recovery_main, Donor, RecoveryNeededInfo};
|
||||
use crate::recovery::recovery_main;
|
||||
use crate::safekeeper::{
|
||||
AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState,
|
||||
SafekeeperMemState, ServerInfo, Term, TermLsn, INVALID_TERM,
|
||||
@@ -46,12 +45,11 @@ use crate::{debug_dump, wal_storage};
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PeerInfo {
|
||||
pub sk_id: NodeId,
|
||||
pub term: Term,
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Term,
|
||||
_last_log_term: Term,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub flush_lsn: Lsn,
|
||||
_flush_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub commit_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
|
||||
@@ -63,21 +61,16 @@ pub struct PeerInfo {
|
||||
#[serde(skip)]
|
||||
#[serde(default = "Instant::now")]
|
||||
ts: Instant,
|
||||
pub pg_connstr: String,
|
||||
pub http_connstr: String,
|
||||
}
|
||||
|
||||
impl PeerInfo {
|
||||
fn from_sk_info(sk_info: &SafekeeperTimelineInfo, ts: Instant) -> PeerInfo {
|
||||
PeerInfo {
|
||||
sk_id: NodeId(sk_info.safekeeper_id),
|
||||
term: sk_info.term,
|
||||
last_log_term: sk_info.last_log_term,
|
||||
flush_lsn: Lsn(sk_info.flush_lsn),
|
||||
_last_log_term: sk_info.last_log_term,
|
||||
_flush_lsn: Lsn(sk_info.flush_lsn),
|
||||
commit_lsn: Lsn(sk_info.commit_lsn),
|
||||
local_start_lsn: Lsn(sk_info.local_start_lsn),
|
||||
pg_connstr: sk_info.safekeeper_connstr.clone(),
|
||||
http_connstr: sk_info.http_connstr.clone(),
|
||||
ts,
|
||||
}
|
||||
}
|
||||
@@ -119,6 +112,7 @@ pub struct SharedState {
|
||||
/// TODO: it might be better to remove tli completely from GlobalTimelines
|
||||
/// when tli is inactive instead of having this flag.
|
||||
active: bool,
|
||||
num_computes: u32,
|
||||
last_removed_segno: XLogSegNo,
|
||||
}
|
||||
|
||||
@@ -157,6 +151,7 @@ impl SharedState {
|
||||
peers_info: PeersInfo(vec![]),
|
||||
wal_backup_active: false,
|
||||
active: false,
|
||||
num_computes: 0,
|
||||
last_removed_segno: 0,
|
||||
})
|
||||
}
|
||||
@@ -176,6 +171,7 @@ impl SharedState {
|
||||
peers_info: PeersInfo(vec![]),
|
||||
wal_backup_active: false,
|
||||
active: false,
|
||||
num_computes: 0,
|
||||
last_removed_segno: 0,
|
||||
})
|
||||
}
|
||||
@@ -223,7 +219,7 @@ impl SharedState {
|
||||
};
|
||||
trace!(
|
||||
"timeline {} s3 offloading action {} pending: num_computes={}, commit_lsn={}, backup_lsn={}",
|
||||
self.sk.state.timeline_id, action_pending, num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn
|
||||
self.sk.state.timeline_id, action_pending, self.num_computes, self.sk.inmem.commit_lsn, self.sk.inmem.backup_lsn
|
||||
);
|
||||
}
|
||||
res
|
||||
@@ -269,20 +265,6 @@ impl SharedState {
|
||||
availability_zone: conf.availability_zone.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get our latest view of alive peers status on the timeline.
|
||||
/// We pass our own info through the broker as well, so when we don't have connection
|
||||
/// to the broker returned vec is empty.
|
||||
fn get_peers(&self, heartbeat_timeout: Duration) -> Vec<PeerInfo> {
|
||||
let now = Instant::now();
|
||||
self.peers_info
|
||||
.0
|
||||
.iter()
|
||||
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
|
||||
.filter(|p| now.duration_since(p.ts) <= heartbeat_timeout)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -464,9 +446,7 @@ impl Timeline {
|
||||
/// Bootstrap new or existing timeline starting background stasks.
|
||||
pub fn bootstrap(self: &Arc<Timeline>, conf: &SafeKeeperConf) {
|
||||
// Start recovery task which always runs on the timeline.
|
||||
if conf.peer_recovery_enabled {
|
||||
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
||||
}
|
||||
tokio::spawn(recovery_main(self.clone(), conf.clone()));
|
||||
}
|
||||
|
||||
/// Delete timeline from disk completely, by removing timeline directory. Background
|
||||
@@ -551,7 +531,7 @@ impl Timeline {
|
||||
return true;
|
||||
}
|
||||
let shared_state = self.write_shared_state().await;
|
||||
if self.walreceivers.get_num() == 0 {
|
||||
if shared_state.num_computes == 0 {
|
||||
return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
|
||||
reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn;
|
||||
}
|
||||
@@ -700,88 +680,20 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get our latest view of alive peers status on the timeline.
|
||||
/// We pass our own info through the broker as well, so when we don't have connection
|
||||
/// to the broker returned vec is empty.
|
||||
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
|
||||
let shared_state = self.write_shared_state().await;
|
||||
shared_state.get_peers(conf.heartbeat_timeout)
|
||||
}
|
||||
|
||||
/// Should we start fetching WAL from a peer safekeeper, and if yes, from
|
||||
/// which? Answer is yes, i.e. .donors is not empty if 1) there is something
|
||||
/// to fetch, and we can do that without running elections; 2) there is no
|
||||
/// actively streaming compute, as we don't want to compete with it.
|
||||
///
|
||||
/// If donor(s) are choosen, theirs last_log_term is guaranteed to be equal
|
||||
/// to its last_log_term so we are sure such a leader ever had been elected.
|
||||
///
|
||||
/// All possible donors are returned so that we could keep connection to the
|
||||
/// current one if it is good even if it slightly lags behind.
|
||||
///
|
||||
/// Note that term conditions above might be not met, but safekeepers are
|
||||
/// still not aligned on last flush_lsn. Generally in this case until
|
||||
/// elections are run it is not possible to say which safekeeper should
|
||||
/// recover from which one -- history which would be committed is different
|
||||
/// depending on assembled quorum (e.g. classic picture 8 from Raft paper).
|
||||
/// Thus we don't try to predict it here.
|
||||
pub async fn recovery_needed(&self, heartbeat_timeout: Duration) -> RecoveryNeededInfo {
|
||||
let ss = self.write_shared_state().await;
|
||||
let term = ss.sk.state.acceptor_state.term;
|
||||
let last_log_term = ss.sk.get_epoch();
|
||||
let flush_lsn = ss.sk.flush_lsn();
|
||||
// note that peers contain myself, but that's ok -- we are interested only in peers which are strictly ahead of us.
|
||||
let mut peers = ss.get_peers(heartbeat_timeout);
|
||||
// Sort by <last log term, lsn> pairs.
|
||||
peers.sort_by(|p1, p2| {
|
||||
let tl1 = TermLsn {
|
||||
term: p1.last_log_term,
|
||||
lsn: p1.flush_lsn,
|
||||
};
|
||||
let tl2 = TermLsn {
|
||||
term: p2.last_log_term,
|
||||
lsn: p2.flush_lsn,
|
||||
};
|
||||
tl2.cmp(&tl1) // desc
|
||||
});
|
||||
let num_streaming_computes = self.walreceivers.get_num_streaming();
|
||||
let donors = if num_streaming_computes > 0 {
|
||||
vec![] // If there is a streaming compute, don't try to recover to not intervene.
|
||||
} else {
|
||||
peers
|
||||
.iter()
|
||||
.filter_map(|candidate| {
|
||||
// Are we interested in this candidate?
|
||||
let candidate_tl = TermLsn {
|
||||
term: candidate.last_log_term,
|
||||
lsn: candidate.flush_lsn,
|
||||
};
|
||||
let my_tl = TermLsn {
|
||||
term: last_log_term,
|
||||
lsn: flush_lsn,
|
||||
};
|
||||
if my_tl < candidate_tl {
|
||||
// Yes, we are interested. Can we pull from it without
|
||||
// (re)running elections? It is possible if 1) his term
|
||||
// is equal to his last_log_term so we could act on
|
||||
// behalf of leader of this term (we must be sure he was
|
||||
// ever elected) and 2) our term is not higher, or we'll refuse data.
|
||||
if candidate.term == candidate.last_log_term && candidate.term >= term {
|
||||
Some(Donor::from(candidate))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
RecoveryNeededInfo {
|
||||
term,
|
||||
last_log_term,
|
||||
flush_lsn,
|
||||
peers,
|
||||
num_streaming_computes,
|
||||
donors,
|
||||
}
|
||||
let now = Instant::now();
|
||||
shared_state
|
||||
.peers_info
|
||||
.0
|
||||
.iter()
|
||||
// Regard peer as absent if we haven't heard from it within heartbeat_timeout.
|
||||
.filter(|p| now.duration_since(p.ts) <= conf.heartbeat_timeout)
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn get_walsenders(&self) -> &Arc<WalSenders> {
|
||||
@@ -853,7 +765,7 @@ impl Timeline {
|
||||
ps_feedback,
|
||||
wal_backup_active: state.wal_backup_active,
|
||||
timeline_is_active: state.active,
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
num_computes: state.num_computes,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
mem_state: state.sk.inmem.clone(),
|
||||
@@ -880,7 +792,7 @@ impl Timeline {
|
||||
walsenders: self.walsenders.get_all(),
|
||||
wal_backup_active: state.wal_backup_active,
|
||||
active: state.active,
|
||||
num_computes: self.walreceivers.get_num() as u32,
|
||||
num_computes: state.num_computes,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
mem_state: state.sk.inmem.clone(),
|
||||
|
||||
@@ -357,12 +357,6 @@ class PgProtocol:
|
||||
result.append(cur.fetchall())
|
||||
return result
|
||||
|
||||
def safe_psql_scalar(self, query) -> Any:
|
||||
"""
|
||||
Execute query returning single row with single column.
|
||||
"""
|
||||
return self.safe_psql(query)[0][0]
|
||||
|
||||
|
||||
@dataclass
|
||||
class AuthKeys:
|
||||
@@ -1638,9 +1632,6 @@ class NeonPageserver(PgProtocol):
|
||||
# these can happen during shutdown, but it should not be a reason to fail a test
|
||||
".*completed, took longer than expected.*",
|
||||
'.*registered custom resource manager "neon".*',
|
||||
# AWS S3 may emit 500 errors for keys in a DeleteObjects response: we retry these
|
||||
# and it is not a failure of our code when it happens.
|
||||
".*DeleteObjects.*We encountered an internal error. Please try again.*",
|
||||
]
|
||||
|
||||
def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
|
||||
@@ -2583,13 +2574,6 @@ class Endpoint(PgProtocol):
|
||||
):
|
||||
self.stop()
|
||||
|
||||
# Checkpoints running endpoint and returns pg_wal size in MB.
|
||||
def get_pg_wal_size(self):
|
||||
log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}')
|
||||
self.safe_psql("checkpoint")
|
||||
assert self.pgdata_dir is not None # please mypy
|
||||
return get_dir_size(os.path.join(self.pgdata_dir, "pg_wal")) / 1024 / 1024
|
||||
|
||||
|
||||
class EndpointFactory:
|
||||
"""An object representing multiple compute endpoints."""
|
||||
@@ -2771,27 +2755,6 @@ class Safekeeper:
|
||||
def data_dir(self) -> str:
|
||||
return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}")
|
||||
|
||||
def timeline_dir(self, tenant_id, timeline_id) -> str:
|
||||
return os.path.join(self.data_dir(), str(tenant_id), str(timeline_id))
|
||||
|
||||
def list_segments(self, tenant_id, timeline_id) -> List[str]:
|
||||
"""
|
||||
Get list of segment names of the given timeline.
|
||||
"""
|
||||
tli_dir = self.timeline_dir(tenant_id, timeline_id)
|
||||
segments = []
|
||||
for _, _, filenames in os.walk(tli_dir):
|
||||
segments.extend([f for f in filenames if f != "safekeeper.control"])
|
||||
segments.sort()
|
||||
return segments
|
||||
|
||||
|
||||
# Walreceiver as returned by sk's timeline status endpoint.
|
||||
@dataclass
|
||||
class Walreceiver:
|
||||
conn_id: int
|
||||
state: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperTimelineStatus:
|
||||
@@ -2803,7 +2766,6 @@ class SafekeeperTimelineStatus:
|
||||
backup_lsn: Lsn
|
||||
peer_horizon_lsn: Lsn
|
||||
remote_consistent_lsn: Lsn
|
||||
walreceivers: List[Walreceiver]
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -2865,7 +2827,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}")
|
||||
res.raise_for_status()
|
||||
resj = res.json()
|
||||
walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
|
||||
return SafekeeperTimelineStatus(
|
||||
acceptor_epoch=resj["acceptor_state"]["epoch"],
|
||||
pg_version=resj["pg_info"]["pg_version"],
|
||||
@@ -2875,7 +2836,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
backup_lsn=Lsn(resj["backup_lsn"]),
|
||||
peer_horizon_lsn=Lsn(resj["peer_horizon_lsn"]),
|
||||
remote_consistent_lsn=Lsn(resj["remote_consistent_lsn"]),
|
||||
walreceivers=walreceivers,
|
||||
)
|
||||
|
||||
def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body):
|
||||
@@ -3159,22 +3119,6 @@ def check_restored_datadir_content(
|
||||
assert (mismatch, error) == ([], [])
|
||||
|
||||
|
||||
def logical_replication_sync(subscriber: VanillaPostgres, publisher: Endpoint) -> Lsn:
|
||||
"""Wait logical replication subscriber to sync with publisher."""
|
||||
publisher_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
|
||||
while True:
|
||||
res = subscriber.safe_psql("select latest_end_lsn from pg_catalog.pg_stat_subscription")[0][
|
||||
0
|
||||
]
|
||||
if res:
|
||||
log.info(f"subscriber_lsn={res}")
|
||||
subscriber_lsn = Lsn(res)
|
||||
log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}")
|
||||
if subscriber_lsn >= publisher_lsn:
|
||||
return subscriber_lsn
|
||||
time.sleep(0.5)
|
||||
|
||||
|
||||
def wait_for_last_flush_lsn(
|
||||
env: NeonEnv,
|
||||
endpoint: Endpoint,
|
||||
|
||||
@@ -453,15 +453,6 @@ class PageserverHttpClient(requests.Session):
|
||||
res_json = res.json()
|
||||
return res_json
|
||||
|
||||
def timeline_get_timestamp_of_lsn(self, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
|
||||
log.info(f"Requesting time range of lsn {lsn}, tenant {tenant_id}, timeline {timeline_id}")
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn?lsn={lsn}",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
return res_json
|
||||
|
||||
def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
self.is_testing_enabled_or_skip()
|
||||
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, PgBin, logical_replication_sync
|
||||
|
||||
|
||||
@pytest.mark.timeout(1000)
|
||||
def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
env.neon_cli.create_branch("test_logical_replication", "empty")
|
||||
endpoint = env.endpoints.create_start("test_logical_replication")
|
||||
|
||||
log.info("postgres is running on 'test_logical_replication' branch")
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s10", endpoint.connstr()])
|
||||
|
||||
endpoint.safe_psql("create publication pub1 for table pgbench_accounts, pgbench_history")
|
||||
|
||||
# now start subscriber
|
||||
vanilla_pg.start()
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s10", vanilla_pg.connstr()])
|
||||
|
||||
vanilla_pg.safe_psql("truncate table pgbench_accounts")
|
||||
vanilla_pg.safe_psql("truncate table pgbench_history")
|
||||
|
||||
connstr = endpoint.connstr().replace("'", "''")
|
||||
print(f"connstr='{connstr}'")
|
||||
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
|
||||
|
||||
# Wait logical replication channel to be established
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
|
||||
pg_bin.run_capture(["pgbench", "-c10", "-T100", "-Mprepared", endpoint.connstr()])
|
||||
|
||||
# Wait logical replication to sync
|
||||
start = time.time()
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
log.info(f"Sync with master took {time.time() - start} seconds")
|
||||
|
||||
sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
|
||||
sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]
|
||||
assert sum_master == sum_replica
|
||||
574
test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
generated
574
test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -9,8 +9,8 @@ publish = false
|
||||
[dependencies]
|
||||
native-tls = "0.2.11"
|
||||
postgres-native-tls = "0.5.0"
|
||||
tokio = { version = "1.33", features=["rt", "macros"] }
|
||||
tokio-postgres = "0.7.10"
|
||||
tokio = { version = "1.28", features=["rt", "macros"] }
|
||||
tokio-postgres = "0.7.8"
|
||||
|
||||
|
||||
# This is not part of the main 'neon' workspace
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM rust:1.73
|
||||
FROM rust:1.70
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
|
||||
@@ -311,8 +311,8 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N
|
||||
assert isinstance(failed, Exception)
|
||||
assert isinstance(succeeded, Dict)
|
||||
|
||||
# there's multiple valid status codes:
|
||||
# - Timeline x/y already exists
|
||||
# FIXME: there's probably multiple valid status codes:
|
||||
# - Timeline 62505b9a9f6b1d29117b1b74eaf07b12/56cd19d3b2dbcc65e9d53ec6ca304f24 already exists
|
||||
# - whatever 409 response says, but that is a subclass of PageserverApiException
|
||||
assert isinstance(failed, PageserverApiException)
|
||||
assert succeeded["state"] == "Active"
|
||||
@@ -320,14 +320,17 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N
|
||||
# we might still have the failpoint active
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
# pytest should nag if we leave threads unjoined
|
||||
for t in threads:
|
||||
t.join()
|
||||
create_root.join()
|
||||
|
||||
|
||||
def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
|
||||
def test_non_uploaded_branch_availability_after_restart(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Check that a timeline is deleted locally on subsequent restart if it never successfully uploaded during creation.
|
||||
Currently before RFC#27 we keep and continue uploading branches which were not successfully uploaded before shutdown.
|
||||
|
||||
This test likely duplicates some other test, but it's easier to write one than to make sure there will be a failing test when the rfc is implemented.
|
||||
"""
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
@@ -363,59 +366,9 @@ def test_non_uploaded_root_timeline_is_deleted_after_restart(neon_env_builder: N
|
||||
|
||||
wait_until_tenant_active(ps_http, env.initial_tenant)
|
||||
|
||||
with pytest.raises(PageserverApiException, match="not found"):
|
||||
ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
|
||||
|
||||
|
||||
def test_non_uploaded_branch_is_deleted_after_restart(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Check that a timeline is deleted locally on subsequent restart if it never successfully uploaded during creation.
|
||||
"""
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*"
|
||||
)
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
ps_http.tenant_create(env.initial_tenant)
|
||||
ps_http.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
|
||||
|
||||
# pause all uploads
|
||||
ps_http.configure_failpoints(("before-upload-index-pausable", "pause"))
|
||||
branch_id = TimelineId.generate()
|
||||
|
||||
def start_creating_timeline():
|
||||
with pytest.raises(RequestException):
|
||||
ps_http.timeline_create(
|
||||
env.pg_version,
|
||||
env.initial_tenant,
|
||||
branch_id,
|
||||
ancestor_timeline_id=env.initial_timeline,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
t = threading.Thread(target=start_creating_timeline)
|
||||
try:
|
||||
t.start()
|
||||
|
||||
wait_until_paused(env, "before-upload-index-pausable")
|
||||
finally:
|
||||
# FIXME: paused uploads bother shutdown
|
||||
env.pageserver.stop(immediate=True)
|
||||
t.join()
|
||||
|
||||
# now without a failpoint
|
||||
env.pageserver.start()
|
||||
|
||||
wait_until_tenant_active(ps_http, env.initial_tenant)
|
||||
|
||||
ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
|
||||
|
||||
with pytest.raises(PageserverApiException, match="not found"):
|
||||
ps_http.timeline_detail(env.initial_tenant, branch_id)
|
||||
# currently it lives on and will get eventually uploaded, but this will change
|
||||
detail = ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
|
||||
assert detail["state"] == "Active"
|
||||
|
||||
|
||||
def wait_until_paused(env: NeonEnv, failpoint: str):
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload_queue_empty,
|
||||
wait_until_tenant_active,
|
||||
)
|
||||
@@ -41,12 +41,9 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
|
||||
def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
"""
|
||||
Test sets fail point at the end of first compaction phase: after
|
||||
flushing new L1 layer but before deletion of L0 layers.
|
||||
|
||||
The L1 used to be overwritten, but with crash-consistency via remote
|
||||
index_part.json, we end up deleting the not yet uploaded L1 layer on
|
||||
startup.
|
||||
This test sets fail point at the end of first compaction phase:
|
||||
after flushing new L1 layers but before deletion of L0 layers
|
||||
it should cause generation of duplicate L1 layer by compaction after restart.
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
|
||||
@@ -68,8 +65,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
connstr = endpoint.connstr(options="-csynchronous_commit=off")
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
|
||||
|
||||
lsn = wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
endpoint.stop()
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
# make sure we receive no new wal after this, so that we'll write over the same L1 file.
|
||||
endpoint.stop()
|
||||
@@ -78,7 +74,7 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
|
||||
# hit the exit failpoint
|
||||
with pytest.raises(ConnectionError, match="Remote end closed connection without response"):
|
||||
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
env.pageserver.stop()
|
||||
|
||||
# now the duplicate L1 has been created, but is not yet uploaded
|
||||
@@ -111,32 +107,33 @@ def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin)
|
||||
l1_found = path
|
||||
|
||||
assert l1_found is not None, "failed to find L1 locally"
|
||||
original_created_at = l1_found.stat()[8]
|
||||
|
||||
uploaded = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / l1_found.name
|
||||
assert not uploaded.exists(), "to-be-overwritten should not yet be uploaded"
|
||||
|
||||
# give room for fs timestamps
|
||||
time.sleep(1)
|
||||
|
||||
env.pageserver.start()
|
||||
wait_until_tenant_active(pageserver_http, tenant_id)
|
||||
|
||||
assert not l1_found.exists(), "partial compaction result should had been removed during startup"
|
||||
|
||||
# wait for us to catch up again
|
||||
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn)
|
||||
message = f".*duplicated L1 layer layer={l1_found.name}"
|
||||
env.pageserver.allowed_errors.append(message)
|
||||
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
|
||||
# give time for log flush
|
||||
time.sleep(1)
|
||||
|
||||
message = f".*duplicated L1 layer layer={l1_found.name}"
|
||||
found_msg = env.pageserver.log_contains(message)
|
||||
# resident or evicted, it should not be overwritten, however it should had been non-existing at startup
|
||||
assert (
|
||||
found_msg is None
|
||||
), "layer should had been removed during startup, did it live on as evicted?"
|
||||
assert found_msg is not None, "no layer was duplicated, has this been fixed already?"
|
||||
|
||||
assert l1_found.exists(), "the L1 reappears"
|
||||
log.info(f"found log line: {found_msg}")
|
||||
|
||||
overwritten_at = l1_found.stat()[8]
|
||||
assert original_created_at < overwritten_at, "expected the L1 to be overwritten"
|
||||
|
||||
wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
|
||||
|
||||
assert uploaded.exists(), "the L1 is uploaded"
|
||||
uploaded_at = uploaded.stat()[8]
|
||||
assert overwritten_at <= uploaded_at, "expected the L1 to finally be uploaded"
|
||||
|
||||
@@ -2,6 +2,7 @@ import asyncio
|
||||
import concurrent.futures
|
||||
import random
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
Endpoint,
|
||||
@@ -94,12 +95,13 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
|
||||
#
|
||||
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
|
||||
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
|
||||
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
|
||||
num_index_uploads = 0
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
@@ -13,10 +14,12 @@ from fixtures.utils import query_scalar
|
||||
|
||||
# Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
|
||||
# and then download them back.
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_basic_eviction(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
import time
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
logical_replication_sync,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
|
||||
|
||||
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
|
||||
env = neon_simple_env
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.neon_cli.create_branch("test_logical_replication", "empty")
|
||||
endpoint = env.endpoints.create_start(
|
||||
"test_logical_replication", config_lines=["log_statement=all"]
|
||||
)
|
||||
|
||||
log.info("postgres is running on 'test_logical_replication' branch")
|
||||
pg_conn = endpoint.connect()
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
cur.execute("create table t(pk integer primary key, payload integer)")
|
||||
cur.execute(
|
||||
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));"
|
||||
)
|
||||
cur.execute("create publication pub1 for table t, replication_example")
|
||||
|
||||
# now start subscriber
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)")
|
||||
vanilla_pg.safe_psql(
|
||||
"CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);"
|
||||
)
|
||||
connstr = endpoint.connstr().replace("'", "''")
|
||||
log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
|
||||
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
|
||||
|
||||
# Wait logical replication channel to be established
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
|
||||
# insert some data
|
||||
cur.execute("insert into t values (generate_series(1,1000), 0)")
|
||||
|
||||
# Wait logical replication to sync
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 1000
|
||||
|
||||
# now stop subscriber...
|
||||
vanilla_pg.stop()
|
||||
|
||||
# ... and insert some more data which should be delivered to subscriber after restart
|
||||
cur.execute("insert into t values (generate_series(1001,2000), 0)")
|
||||
|
||||
# Restart compute
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
|
||||
# start subscriber
|
||||
vanilla_pg.start()
|
||||
|
||||
# Wait logical replication to sync
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
|
||||
# Check that subscribers receives all data
|
||||
assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == 2000
|
||||
|
||||
# Test that save/restore of RewriteMappingFile works. Partial copy of
|
||||
# rewrite.sql test.
|
||||
log.info("checking rewriteheap")
|
||||
vanilla_pg.stop()
|
||||
cmds = """
|
||||
INSERT INTO replication_example(somedata) VALUES (1);
|
||||
|
||||
BEGIN;
|
||||
INSERT INTO replication_example(somedata) VALUES (2);
|
||||
ALTER TABLE replication_example ADD COLUMN testcolumn1 int;
|
||||
INSERT INTO replication_example(somedata, testcolumn1) VALUES (3, 1);
|
||||
COMMIT;
|
||||
|
||||
BEGIN;
|
||||
INSERT INTO replication_example(somedata) VALUES (3);
|
||||
ALTER TABLE replication_example ADD COLUMN testcolumn2 int;
|
||||
INSERT INTO replication_example(somedata, testcolumn1, testcolumn2) VALUES (4, 2, 1);
|
||||
COMMIT;
|
||||
|
||||
VACUUM FULL pg_am;
|
||||
VACUUM FULL pg_amop;
|
||||
VACUUM FULL pg_proc;
|
||||
VACUUM FULL pg_opclass;
|
||||
VACUUM FULL pg_type;
|
||||
VACUUM FULL pg_index;
|
||||
VACUUM FULL pg_database;
|
||||
|
||||
|
||||
-- repeated rewrites that fail
|
||||
BEGIN;
|
||||
CLUSTER pg_class USING pg_class_oid_index;
|
||||
CLUSTER pg_class USING pg_class_oid_index;
|
||||
ROLLBACK;
|
||||
|
||||
-- repeated rewrites that succeed
|
||||
BEGIN;
|
||||
CLUSTER pg_class USING pg_class_oid_index;
|
||||
CLUSTER pg_class USING pg_class_oid_index;
|
||||
CLUSTER pg_class USING pg_class_oid_index;
|
||||
COMMIT;
|
||||
|
||||
-- repeated rewrites in different transactions
|
||||
VACUUM FULL pg_class;
|
||||
VACUUM FULL pg_class;
|
||||
|
||||
-- reindexing of important relations / indexes
|
||||
REINDEX TABLE pg_class;
|
||||
REINDEX INDEX pg_class_oid_index;
|
||||
REINDEX INDEX pg_class_tblspc_relfilenode_index;
|
||||
|
||||
INSERT INTO replication_example(somedata, testcolumn1) VALUES (5, 3);
|
||||
|
||||
BEGIN;
|
||||
INSERT INTO replication_example(somedata, testcolumn1) VALUES (6, 4);
|
||||
ALTER TABLE replication_example ADD COLUMN testcolumn3 int;
|
||||
INSERT INTO replication_example(somedata, testcolumn1, testcolumn3) VALUES (7, 5, 1);
|
||||
COMMIT;
|
||||
"""
|
||||
endpoint.safe_psql_many([q for q in cmds.splitlines() if q != "" and not q.startswith("-")])
|
||||
|
||||
# refetch rewrite files from pageserver
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
|
||||
vanilla_pg.start()
|
||||
logical_replication_sync(vanilla_pg, endpoint)
|
||||
eq_q = "select testcolumn1, testcolumn2, testcolumn3 from replication_example order by 1, 2, 3"
|
||||
assert vanilla_pg.safe_psql(eq_q) == endpoint.safe_psql(eq_q)
|
||||
log.info("rewriteheap synced")
|
||||
|
||||
# test that removal of repl slots works across restart
|
||||
vanilla_pg.stop()
|
||||
time.sleep(1) # wait for conn termination; active slots can't be dropped
|
||||
endpoint.safe_psql("select pg_drop_replication_slot('sub1');")
|
||||
endpoint.safe_psql("insert into t values (2001, 1);") # forces WAL flush
|
||||
# wait for drop message to reach safekeepers (it is not transactional)
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
# it must be gone (but walproposer slot still exists, hence 1)
|
||||
assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1
|
||||
@@ -1,10 +1,7 @@
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from datetime import timedelta
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.types import Lsn
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
@@ -28,14 +25,13 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
|
||||
cur.execute("CREATE TABLE foo (x integer)")
|
||||
tbl = []
|
||||
for i in range(1000):
|
||||
cur.execute("INSERT INTO foo VALUES(%s)", (i,))
|
||||
cur.execute(f"INSERT INTO foo VALUES({i})")
|
||||
# Get the timestamp at UTC
|
||||
after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=None)
|
||||
tbl.append([i, after_timestamp])
|
||||
|
||||
# Execute one more transaction with synchronous_commit enabled, to flush
|
||||
# all the previous transactions
|
||||
cur.execute("SET synchronous_commit=on")
|
||||
cur.execute("INSERT INTO foo VALUES (-1)")
|
||||
|
||||
# Wait until WAL is received by pageserver
|
||||
@@ -71,100 +67,3 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
|
||||
assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i
|
||||
|
||||
endpoint_here.stop_and_destroy()
|
||||
|
||||
|
||||
# Test pageserver get_timestamp_of_lsn API
|
||||
def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api")
|
||||
endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api")
|
||||
log.info("postgres is running on 'test_ts_of_lsn_api' branch")
|
||||
|
||||
cur = endpoint_main.connect().cursor()
|
||||
# Create table, and insert rows, each in a separate transaction
|
||||
# Disable synchronous_commit to make this initialization go faster.
|
||||
#
|
||||
# Each row contains current insert LSN and the current timestamp, when
|
||||
# the row was inserted.
|
||||
cur.execute("SET synchronous_commit=off")
|
||||
cur.execute("CREATE TABLE foo (x integer)")
|
||||
tbl = []
|
||||
for i in range(1000):
|
||||
cur.execute("INSERT INTO foo VALUES(%s)", (i,))
|
||||
# Get the timestamp at UTC
|
||||
after_timestamp = query_scalar(cur, "SELECT clock_timestamp()").replace(tzinfo=timezone.utc)
|
||||
after_lsn = query_scalar(cur, "SELECT pg_current_wal_lsn()")
|
||||
tbl.append([i, after_timestamp, after_lsn])
|
||||
time.sleep(0.005)
|
||||
|
||||
# Execute one more transaction with synchronous_commit enabled, to flush
|
||||
# all the previous transactions
|
||||
cur.execute("SET synchronous_commit=on")
|
||||
cur.execute("INSERT INTO foo VALUES (-1)")
|
||||
|
||||
# Wait until WAL is received by pageserver
|
||||
last_flush_lsn = wait_for_last_flush_lsn(
|
||||
env, endpoint_main, env.initial_tenant, new_timeline_id
|
||||
)
|
||||
|
||||
with env.pageserver.http_client() as client:
|
||||
# Check edge cases: lsn larger than the last flush lsn
|
||||
probe_lsn = Lsn(int(last_flush_lsn) * 20 + 80_000)
|
||||
result = client.timeline_get_timestamp_of_lsn(
|
||||
env.initial_tenant,
|
||||
new_timeline_id,
|
||||
probe_lsn,
|
||||
)
|
||||
|
||||
# lsn of zero
|
||||
try:
|
||||
probe_lsn = Lsn(0)
|
||||
result = client.timeline_get_timestamp_of_lsn(
|
||||
env.initial_tenant,
|
||||
new_timeline_id,
|
||||
probe_lsn,
|
||||
)
|
||||
# There should always be an error here.
|
||||
raise RuntimeError("there should have been an 'Invalid LSN' error")
|
||||
except PageserverApiException as error:
|
||||
assert error.status_code == 500
|
||||
assert str(error) == "Invalid LSN"
|
||||
env.pageserver.allowed_errors.append(".*Invalid LSN.*")
|
||||
|
||||
# small lsn before initdb_lsn
|
||||
try:
|
||||
probe_lsn = Lsn(64)
|
||||
result = client.timeline_get_timestamp_of_lsn(
|
||||
env.initial_tenant,
|
||||
new_timeline_id,
|
||||
probe_lsn,
|
||||
)
|
||||
# There should always be an error here.
|
||||
raise RuntimeError("there should have been an 'could not find data for key' error")
|
||||
except PageserverApiException as error:
|
||||
assert error.status_code == 500
|
||||
assert str(error).startswith("could not find data for key")
|
||||
env.pageserver.allowed_errors.append(".*could not find data for key.*")
|
||||
|
||||
# Probe a bunch of timestamps in the valid range
|
||||
step_size = 100
|
||||
for i in range(step_size, len(tbl), step_size):
|
||||
after_timestamp = tbl[i][1]
|
||||
after_lsn = tbl[i][2]
|
||||
result = client.timeline_get_timestamp_of_lsn(
|
||||
env.initial_tenant,
|
||||
new_timeline_id,
|
||||
after_lsn,
|
||||
)
|
||||
log.info("result: %s, after_ts: %s", result, after_timestamp)
|
||||
|
||||
# TODO use fromisoformat once we have Python 3.11+
|
||||
# which has https://github.com/python/cpython/pull/92177
|
||||
timestamp = datetime.strptime(result, "%Y-%m-%dT%H:%M:%S.%f000Z").replace(
|
||||
tzinfo=timezone.utc
|
||||
)
|
||||
assert timestamp < after_timestamp, "after_timestamp after timestamp"
|
||||
if i > 1:
|
||||
before_timestamp = tbl[i - step_size][1]
|
||||
assert timestamp >= before_timestamp, "before_timestamp before timestamp"
|
||||
|
||||
@@ -306,10 +306,12 @@ def test_ondemand_download_timetravel(
|
||||
#
|
||||
# Ensure that the `download_remote_layers` API works
|
||||
#
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_download_remote_layers_api(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
##### First start, insert data and upload it to the remote storage
|
||||
env = neon_env_builder.init_start(
|
||||
@@ -463,11 +465,14 @@ def test_download_remote_layers_api(
|
||||
assert query_scalar(cur, "select count(*) from testtab") == table_len
|
||||
|
||||
|
||||
def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
|
||||
def test_compaction_downloads_on_demand_without_image_creation(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
"""
|
||||
Create a few layers, then evict, then make sure compaction runs successfully.
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
conf = {
|
||||
# Disable background GC & compaction
|
||||
@@ -542,14 +547,17 @@ def test_compaction_downloads_on_demand_without_image_creation(neon_env_builder:
|
||||
assert post_compact[1] >= 3, "should had downloaded the three layers"
|
||||
|
||||
|
||||
def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
|
||||
def test_compaction_downloads_on_demand_with_image_creation(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
"""
|
||||
Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted.
|
||||
|
||||
Due to current implementation, this will make image creation on-demand download layers, but we cannot really
|
||||
directly test for it.
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
conf = {
|
||||
# Disable background GC & compaction
|
||||
@@ -637,14 +645,17 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne
|
||||
assert dict(kinds_after) == {"Delta": 4, "Image": 1}
|
||||
|
||||
|
||||
def test_ondemand_download_failure_to_replace(neon_env_builder: NeonEnvBuilder):
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_ondemand_download_failure_to_replace(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
):
|
||||
"""
|
||||
Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
|
||||
|
||||
See: https://github.com/neondatabase/neon/issues/3533
|
||||
"""
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
# disable gc and compaction via default tenant config because config is lost while detaching
|
||||
# so that compaction will not be the one to download the layer but the http handler is
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user