mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-11 04:10:36 +00:00
Compare commits
52 Commits
RemoteExte
...
proxy-test
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0d74bc0492 | ||
|
|
c994c80962 | ||
|
|
ee7bbdda0e | ||
|
|
b6e070bf85 | ||
|
|
7fa732c96c | ||
|
|
331935df91 | ||
|
|
a8eb4042ba | ||
|
|
4be2223a4c | ||
|
|
fac50a6264 | ||
|
|
a1f37cba1c | ||
|
|
8b8ff88e4b | ||
|
|
7ea593db22 | ||
|
|
789a71c4ee | ||
|
|
242dd8398c | ||
|
|
98ec5c5c46 | ||
|
|
020e607637 | ||
|
|
c77411e903 | ||
|
|
aeda82a010 | ||
|
|
e5daf366ac | ||
|
|
d77583c86a | ||
|
|
241dcbf70c | ||
|
|
da626fb1fa | ||
|
|
12b39c9db9 | ||
|
|
df5e2729a9 | ||
|
|
0fd3cd27cb | ||
|
|
5779c7908a | ||
|
|
1a4dd58b70 | ||
|
|
cbd3a32d4d | ||
|
|
ca818c8bd7 | ||
|
|
1bb9abebf2 | ||
|
|
96d89cde51 | ||
|
|
89a5c654bf | ||
|
|
5239cdc29f | ||
|
|
84a0e7b022 | ||
|
|
8d98981fe5 | ||
|
|
eb919cab88 | ||
|
|
eec1e1a192 | ||
|
|
ea089dc977 | ||
|
|
951c9bf4ca | ||
|
|
568f91420a | ||
|
|
a18aa14754 | ||
|
|
529a79d263 | ||
|
|
c09993396e | ||
|
|
9a31311990 | ||
|
|
c0e0fc8151 | ||
|
|
e8d2843df6 | ||
|
|
af91a28936 | ||
|
|
43eae17f0d | ||
|
|
6c34d4cd14 | ||
|
|
c63e3e7e84 | ||
|
|
c52495774d | ||
|
|
9a017778a9 |
1
.github/workflows/actionlint.yml
vendored
1
.github/workflows/actionlint.yml
vendored
@@ -17,6 +17,7 @@ concurrency:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
actionlint:
|
actionlint:
|
||||||
|
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
|
|||||||
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -26,8 +26,8 @@ env:
|
|||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
check-permissions:
|
check-permissions:
|
||||||
|
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Disallow PRs from forks
|
- name: Disallow PRs from forks
|
||||||
if: |
|
if: |
|
||||||
|
|||||||
2
.github/workflows/neon_extra_builds.yml
vendored
2
.github/workflows/neon_extra_builds.yml
vendored
@@ -117,6 +117,7 @@ jobs:
|
|||||||
|
|
||||||
check-linux-arm-build:
|
check-linux-arm-build:
|
||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
|
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||||
runs-on: [ self-hosted, dev, arm64 ]
|
runs-on: [ self-hosted, dev, arm64 ]
|
||||||
|
|
||||||
env:
|
env:
|
||||||
@@ -237,6 +238,7 @@ jobs:
|
|||||||
|
|
||||||
check-codestyle-rust-arm:
|
check-codestyle-rust-arm:
|
||||||
timeout-minutes: 90
|
timeout-minutes: 90
|
||||||
|
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||||
runs-on: [ self-hosted, dev, arm64 ]
|
runs-on: [ self-hosted, dev, arm64 ]
|
||||||
|
|
||||||
container:
|
container:
|
||||||
|
|||||||
39
Cargo.lock
generated
39
Cargo.lock
generated
@@ -1329,8 +1329,6 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"comfy-table",
|
"comfy-table",
|
||||||
"compute_api",
|
"compute_api",
|
||||||
"diesel",
|
|
||||||
"diesel_migrations",
|
|
||||||
"futures",
|
"futures",
|
||||||
"git-version",
|
"git-version",
|
||||||
"hex",
|
"hex",
|
||||||
@@ -1641,6 +1639,22 @@ dependencies = [
|
|||||||
"rusticata-macros",
|
"rusticata-macros",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "desim"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"bytes",
|
||||||
|
"hex",
|
||||||
|
"parking_lot 0.12.1",
|
||||||
|
"rand 0.8.5",
|
||||||
|
"scopeguard",
|
||||||
|
"smallvec",
|
||||||
|
"tracing",
|
||||||
|
"utils",
|
||||||
|
"workspace_hack",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "diesel"
|
name = "diesel"
|
||||||
version = "2.1.4"
|
version = "2.1.4"
|
||||||
@@ -2249,11 +2263,11 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hashlink"
|
name = "hashlink"
|
||||||
version = "0.8.2"
|
version = "0.8.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa"
|
checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"hashbrown 0.13.2",
|
"hashbrown 0.14.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -3938,6 +3952,7 @@ dependencies = [
|
|||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
"postgres-protocol",
|
"postgres-protocol",
|
||||||
"rand 0.8.5",
|
"rand 0.8.5",
|
||||||
|
"serde",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
@@ -4079,6 +4094,7 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"consumption_metrics",
|
"consumption_metrics",
|
||||||
"dashmap",
|
"dashmap",
|
||||||
|
"env_logger",
|
||||||
"futures",
|
"futures",
|
||||||
"git-version",
|
"git-version",
|
||||||
"hashbrown 0.13.2",
|
"hashbrown 0.13.2",
|
||||||
@@ -4126,6 +4142,7 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
"sha2",
|
"sha2",
|
||||||
|
"smallvec",
|
||||||
"smol_str",
|
"smol_str",
|
||||||
"socket2 0.5.5",
|
"socket2 0.5.5",
|
||||||
"sync_wrapper",
|
"sync_wrapper",
|
||||||
@@ -4144,6 +4161,7 @@ dependencies = [
|
|||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
"tracing-utils",
|
"tracing-utils",
|
||||||
"url",
|
"url",
|
||||||
|
"urlencoding",
|
||||||
"utils",
|
"utils",
|
||||||
"uuid",
|
"uuid",
|
||||||
"walkdir",
|
"walkdir",
|
||||||
@@ -4826,6 +4844,7 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"const_format",
|
"const_format",
|
||||||
"crc32c",
|
"crc32c",
|
||||||
|
"desim",
|
||||||
"fail",
|
"fail",
|
||||||
"fs2",
|
"fs2",
|
||||||
"futures",
|
"futures",
|
||||||
@@ -4841,6 +4860,7 @@ dependencies = [
|
|||||||
"postgres_backend",
|
"postgres_backend",
|
||||||
"postgres_ffi",
|
"postgres_ffi",
|
||||||
"pq_proto",
|
"pq_proto",
|
||||||
|
"rand 0.8.5",
|
||||||
"regex",
|
"regex",
|
||||||
"remote_storage",
|
"remote_storage",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
@@ -4861,8 +4881,10 @@ dependencies = [
|
|||||||
"tokio-util",
|
"tokio-util",
|
||||||
"toml_edit",
|
"toml_edit",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
"tracing-subscriber",
|
||||||
"url",
|
"url",
|
||||||
"utils",
|
"utils",
|
||||||
|
"walproposer",
|
||||||
"workspace_hack",
|
"workspace_hack",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -5739,7 +5761,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio-epoll-uring"
|
name = "tokio-epoll-uring"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
|
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"nix 0.26.4",
|
"nix 0.26.4",
|
||||||
@@ -6264,8 +6286,9 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "uring-common"
|
name = "uring-common"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#0e1af4ccddf2f01805cfc9eaefa97ee13c04b52d"
|
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#868d2c42b5d54ca82fead6e8f2f233b69a540d3e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"bytes",
|
||||||
"io-uring",
|
"io-uring",
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
@@ -6831,8 +6854,6 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"clap_builder",
|
"clap_builder",
|
||||||
"crossbeam-utils",
|
"crossbeam-utils",
|
||||||
"diesel",
|
|
||||||
"diesel_derives",
|
|
||||||
"either",
|
"either",
|
||||||
"fail",
|
"fail",
|
||||||
"futures-channel",
|
"futures-channel",
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ members = [
|
|||||||
"libs/pageserver_api",
|
"libs/pageserver_api",
|
||||||
"libs/postgres_ffi",
|
"libs/postgres_ffi",
|
||||||
"libs/safekeeper_api",
|
"libs/safekeeper_api",
|
||||||
|
"libs/desim",
|
||||||
"libs/utils",
|
"libs/utils",
|
||||||
"libs/consumption_metrics",
|
"libs/consumption_metrics",
|
||||||
"libs/postgres_backend",
|
"libs/postgres_backend",
|
||||||
@@ -80,7 +81,7 @@ futures-core = "0.3"
|
|||||||
futures-util = "0.3"
|
futures-util = "0.3"
|
||||||
git-version = "0.3"
|
git-version = "0.3"
|
||||||
hashbrown = "0.13"
|
hashbrown = "0.13"
|
||||||
hashlink = "0.8.1"
|
hashlink = "0.8.4"
|
||||||
hdrhistogram = "7.5.2"
|
hdrhistogram = "7.5.2"
|
||||||
hex = "0.4"
|
hex = "0.4"
|
||||||
hex-literal = "0.4"
|
hex-literal = "0.4"
|
||||||
@@ -171,6 +172,7 @@ tracing-opentelemetry = "0.20.0"
|
|||||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||||
twox-hash = { version = "1.6.3", default-features = false }
|
twox-hash = { version = "1.6.3", default-features = false }
|
||||||
url = "2.2"
|
url = "2.2"
|
||||||
|
urlencoding = "2.1"
|
||||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||||
walkdir = "2.3.2"
|
walkdir = "2.3.2"
|
||||||
webpki-roots = "0.25"
|
webpki-roots = "0.25"
|
||||||
@@ -202,6 +204,7 @@ postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
|||||||
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||||
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||||
|
desim = { version = "0.1", path = "./libs/desim" }
|
||||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||||
|
|||||||
@@ -100,6 +100,11 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
|
|||||||
-c "listen_pg_addr='0.0.0.0:6400'" \
|
-c "listen_pg_addr='0.0.0.0:6400'" \
|
||||||
-c "listen_http_addr='0.0.0.0:9898'"
|
-c "listen_http_addr='0.0.0.0:9898'"
|
||||||
|
|
||||||
|
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
|
||||||
|
# that want a particular postgres version will select it explicitly: this is just a default.
|
||||||
|
ENV LD_LIBRARY_PATH /usr/local/v16/lib
|
||||||
|
|
||||||
|
|
||||||
VOLUME ["/data"]
|
VOLUME ["/data"]
|
||||||
USER neon
|
USER neon
|
||||||
EXPOSE 6400
|
EXPOSE 6400
|
||||||
|
|||||||
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot
|
|||||||
|
|
||||||
# Rust
|
# Rust
|
||||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||||
ENV RUSTC_VERSION=1.75.0
|
ENV RUSTC_VERSION=1.76.0
|
||||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||||
|
|||||||
@@ -639,8 +639,8 @@ FROM build-deps AS pg-anon-pg-build
|
|||||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
|
|
||||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||||
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
|
RUN wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||||
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
|
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
|
||||||
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||||
@@ -809,6 +809,7 @@ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
|||||||
COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
|
||||||
|
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||||
COPY pgxn/ pgxn/
|
COPY pgxn/ pgxn/
|
||||||
|
|
||||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||||
|
|||||||
2
NOTICE
2
NOTICE
@@ -1,5 +1,5 @@
|
|||||||
Neon
|
Neon
|
||||||
Copyright 2022 Neon Inc.
|
Copyright 2022 - 2024 Neon Inc.
|
||||||
|
|
||||||
The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
|
The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
|
||||||
See vendor/postgres-vX/COPYRIGHT for details.
|
See vendor/postgres-vX/COPYRIGHT for details.
|
||||||
|
|||||||
@@ -765,7 +765,12 @@ impl ComputeNode {
|
|||||||
handle_roles(spec, &mut client)?;
|
handle_roles(spec, &mut client)?;
|
||||||
handle_databases(spec, &mut client)?;
|
handle_databases(spec, &mut client)?;
|
||||||
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
|
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
|
||||||
handle_grants(spec, &mut client, connstr.as_str())?;
|
handle_grants(
|
||||||
|
spec,
|
||||||
|
&mut client,
|
||||||
|
connstr.as_str(),
|
||||||
|
self.has_feature(ComputeFeature::AnonExtension),
|
||||||
|
)?;
|
||||||
handle_extensions(spec, &mut client)?;
|
handle_extensions(spec, &mut client)?;
|
||||||
handle_extension_neon(&mut client)?;
|
handle_extension_neon(&mut client)?;
|
||||||
create_availability_check_data(&mut client)?;
|
create_availability_check_data(&mut client)?;
|
||||||
@@ -839,7 +844,12 @@ impl ComputeNode {
|
|||||||
handle_roles(&spec, &mut client)?;
|
handle_roles(&spec, &mut client)?;
|
||||||
handle_databases(&spec, &mut client)?;
|
handle_databases(&spec, &mut client)?;
|
||||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||||
handle_grants(&spec, &mut client, self.connstr.as_str())?;
|
handle_grants(
|
||||||
|
&spec,
|
||||||
|
&mut client,
|
||||||
|
self.connstr.as_str(),
|
||||||
|
self.has_feature(ComputeFeature::AnonExtension),
|
||||||
|
)?;
|
||||||
handle_extensions(&spec, &mut client)?;
|
handle_extensions(&spec, &mut client)?;
|
||||||
handle_extension_neon(&mut client)?;
|
handle_extension_neon(&mut client)?;
|
||||||
// We can skip handle_migrations here because a new migration can only appear
|
// We can skip handle_migrations here because a new migration can only appear
|
||||||
@@ -1235,19 +1245,10 @@ LIMIT 100",
|
|||||||
|
|
||||||
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
|
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
|
||||||
|
|
||||||
let build_tag_str = if spec
|
|
||||||
.features
|
|
||||||
.contains(&ComputeFeature::RemoteExtensionsUseLatest)
|
|
||||||
{
|
|
||||||
"latest"
|
|
||||||
} else {
|
|
||||||
&self.build_tag
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut download_tasks = Vec::new();
|
let mut download_tasks = Vec::new();
|
||||||
for library in &libs_vec {
|
for library in &libs_vec {
|
||||||
let (ext_name, ext_path) =
|
let (ext_name, ext_path) =
|
||||||
remote_extensions.get_ext(library, true, build_tag_str, &self.pgversion)?;
|
remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
|
||||||
download_tasks.push(self.download_extension(ext_name, ext_path));
|
download_tasks.push(self.download_extension(ext_name, ext_path));
|
||||||
}
|
}
|
||||||
let results = join_all(download_tasks).await;
|
let results = join_all(download_tasks).await;
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ use std::thread;
|
|||||||
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||||
use compute_api::requests::ConfigurationRequest;
|
use compute_api::requests::ConfigurationRequest;
|
||||||
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
|
||||||
use compute_api::spec::ComputeFeature;
|
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
use hyper::service::{make_service_fn, service_fn};
|
use hyper::service::{make_service_fn, service_fn};
|
||||||
@@ -172,16 +171,12 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let build_tag_str = if spec
|
remote_extensions.get_ext(
|
||||||
.features
|
&filename,
|
||||||
.contains(&ComputeFeature::RemoteExtensionsUseLatest)
|
is_library,
|
||||||
{
|
&compute.build_tag,
|
||||||
"latest"
|
&compute.pgversion,
|
||||||
} else {
|
)
|
||||||
&compute.build_tag
|
|
||||||
};
|
|
||||||
|
|
||||||
remote_extensions.get_ext(&filename, is_library, build_tag_str, &compute.pgversion)
|
|
||||||
};
|
};
|
||||||
|
|
||||||
match ext {
|
match ext {
|
||||||
|
|||||||
@@ -264,9 +264,10 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
|
|||||||
// case we miss some events for some reason. Not strictly necessary, but
|
// case we miss some events for some reason. Not strictly necessary, but
|
||||||
// better safe than sorry.
|
// better safe than sorry.
|
||||||
let (tx, rx) = std::sync::mpsc::channel();
|
let (tx, rx) = std::sync::mpsc::channel();
|
||||||
let (mut watcher, rx): (Box<dyn Watcher>, _) = match notify::recommended_watcher(move |res| {
|
let watcher_res = notify::recommended_watcher(move |res| {
|
||||||
let _ = tx.send(res);
|
let _ = tx.send(res);
|
||||||
}) {
|
});
|
||||||
|
let (mut watcher, rx): (Box<dyn Watcher>, _) = match watcher_res {
|
||||||
Ok(watcher) => (Box::new(watcher), rx),
|
Ok(watcher) => (Box::new(watcher), rx),
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
match e.kind {
|
match e.kind {
|
||||||
|
|||||||
@@ -581,7 +581,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
|||||||
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
||||||
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
||||||
#[instrument(skip_all)]
|
#[instrument(skip_all)]
|
||||||
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
|
pub fn handle_grants(
|
||||||
|
spec: &ComputeSpec,
|
||||||
|
client: &mut Client,
|
||||||
|
connstr: &str,
|
||||||
|
enable_anon_extension: bool,
|
||||||
|
) -> Result<()> {
|
||||||
info!("modifying database permissions");
|
info!("modifying database permissions");
|
||||||
let existing_dbs = get_existing_dbs(client)?;
|
let existing_dbs = get_existing_dbs(client)?;
|
||||||
|
|
||||||
@@ -678,6 +683,11 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
|
|||||||
inlinify(&grant_query)
|
inlinify(&grant_query)
|
||||||
);
|
);
|
||||||
db_client.simple_query(&grant_query)?;
|
db_client.simple_query(&grant_query)?;
|
||||||
|
|
||||||
|
// it is important to run this after all grants
|
||||||
|
if enable_anon_extension {
|
||||||
|
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -766,6 +776,7 @@ BEGIN
|
|||||||
END IF;
|
END IF;
|
||||||
END
|
END
|
||||||
$$;"#,
|
$$;"#,
|
||||||
|
"GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION",
|
||||||
];
|
];
|
||||||
|
|
||||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||||
@@ -809,5 +820,125 @@ $$;"#,
|
|||||||
"Ran {} migrations",
|
"Ran {} migrations",
|
||||||
(migrations.len() - starting_migration_id)
|
(migrations.len() - starting_migration_id)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Connect to the database as superuser and pre-create anon extension
|
||||||
|
/// if it is present in shared_preload_libraries
|
||||||
|
#[instrument(skip_all)]
|
||||||
|
pub fn handle_extension_anon(
|
||||||
|
spec: &ComputeSpec,
|
||||||
|
db_owner: &str,
|
||||||
|
db_client: &mut Client,
|
||||||
|
grants_only: bool,
|
||||||
|
) -> Result<()> {
|
||||||
|
info!("handle extension anon");
|
||||||
|
|
||||||
|
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||||
|
if libs.contains("anon") {
|
||||||
|
if !grants_only {
|
||||||
|
// check if extension is already initialized using anon.is_initialized()
|
||||||
|
let query = "SELECT anon.is_initialized()";
|
||||||
|
match db_client.query(query, &[]) {
|
||||||
|
Ok(rows) => {
|
||||||
|
if !rows.is_empty() {
|
||||||
|
let is_initialized: bool = rows[0].get(0);
|
||||||
|
if is_initialized {
|
||||||
|
info!("anon extension is already initialized");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!(
|
||||||
|
"anon extension is_installed check failed with expected error: {}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create anon extension if this compute needs it
|
||||||
|
// Users cannot create it themselves, because superuser is required.
|
||||||
|
let mut query = "CREATE EXTENSION IF NOT EXISTS anon CASCADE";
|
||||||
|
info!("creating anon extension with query: {}", query);
|
||||||
|
match db_client.query(query, &[]) {
|
||||||
|
Ok(_) => {}
|
||||||
|
Err(e) => {
|
||||||
|
error!("anon extension creation failed with error: {}", e);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check that extension is installed
|
||||||
|
query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
|
||||||
|
let rows = db_client.query(query, &[])?;
|
||||||
|
if rows.is_empty() {
|
||||||
|
error!("anon extension is not installed");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize anon extension
|
||||||
|
// This also requires superuser privileges, so users cannot do it themselves.
|
||||||
|
query = "SELECT anon.init()";
|
||||||
|
match db_client.query(query, &[]) {
|
||||||
|
Ok(_) => {}
|
||||||
|
Err(e) => {
|
||||||
|
error!("anon.init() failed with error: {}", e);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check that extension is installed, if not bail early
|
||||||
|
let query = "SELECT extname FROM pg_extension WHERE extname = 'anon'";
|
||||||
|
match db_client.query(query, &[]) {
|
||||||
|
Ok(rows) => {
|
||||||
|
if rows.is_empty() {
|
||||||
|
error!("anon extension is not installed");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("anon extension check failed with error: {}", e);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let query = format!("GRANT ALL ON SCHEMA anon TO {}", db_owner);
|
||||||
|
info!("granting anon extension permissions with query: {}", query);
|
||||||
|
db_client.simple_query(&query)?;
|
||||||
|
|
||||||
|
// Grant permissions to db_owner to use anon extension functions
|
||||||
|
let query = format!("GRANT ALL ON ALL FUNCTIONS IN SCHEMA anon TO {}", db_owner);
|
||||||
|
info!("granting anon extension permissions with query: {}", query);
|
||||||
|
db_client.simple_query(&query)?;
|
||||||
|
|
||||||
|
// This is needed, because some functions are defined as SECURITY DEFINER.
|
||||||
|
// In Postgres SECURITY DEFINER functions are executed with the privileges
|
||||||
|
// of the owner.
|
||||||
|
// In anon extension this it is needed to access some GUCs, which are only accessible to
|
||||||
|
// superuser. But we've patched postgres to allow db_owner to access them as well.
|
||||||
|
// So we need to change owner of these functions to db_owner.
|
||||||
|
let query = format!("
|
||||||
|
SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {};'
|
||||||
|
from pg_proc p
|
||||||
|
join pg_namespace nsp ON p.pronamespace = nsp.oid
|
||||||
|
where nsp.nspname = 'anon';", db_owner);
|
||||||
|
|
||||||
|
info!("change anon extension functions owner to db owner");
|
||||||
|
db_client.simple_query(&query)?;
|
||||||
|
|
||||||
|
// affects views as well
|
||||||
|
let query = format!("GRANT ALL ON ALL TABLES IN SCHEMA anon TO {}", db_owner);
|
||||||
|
info!("granting anon extension permissions with query: {}", query);
|
||||||
|
db_client.simple_query(&query)?;
|
||||||
|
|
||||||
|
let query = format!("GRANT ALL ON ALL SEQUENCES IN SCHEMA anon TO {}", db_owner);
|
||||||
|
info!("granting anon extension permissions with query: {}", query);
|
||||||
|
db_client.simple_query(&query)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ async-trait.workspace = true
|
|||||||
camino.workspace = true
|
camino.workspace = true
|
||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
comfy-table.workspace = true
|
comfy-table.workspace = true
|
||||||
diesel = { version = "2.1.4", features = ["postgres"]}
|
|
||||||
diesel_migrations = { version = "2.1.0", features = ["postgres"]}
|
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
git-version.workspace = true
|
git-version.workspace = true
|
||||||
nix.workspace = true
|
nix.workspace = true
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ CREATE TABLE tenant_shards (
|
|||||||
generation INTEGER NOT NULL,
|
generation INTEGER NOT NULL,
|
||||||
generation_pageserver BIGINT NOT NULL,
|
generation_pageserver BIGINT NOT NULL,
|
||||||
placement_policy VARCHAR NOT NULL,
|
placement_policy VARCHAR NOT NULL,
|
||||||
|
splitting SMALLINT NOT NULL,
|
||||||
-- config is JSON encoded, opaque to the database.
|
-- config is JSON encoded, opaque to the database.
|
||||||
config TEXT NOT NULL
|
config TEXT NOT NULL
|
||||||
);
|
);
|
||||||
@@ -3,7 +3,8 @@ use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
|||||||
use hyper::{Body, Request, Response};
|
use hyper::{Body, Request, Response};
|
||||||
use hyper::{StatusCode, Uri};
|
use hyper::{StatusCode, Uri};
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
|
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
|
||||||
|
TimelineCreateRequest,
|
||||||
};
|
};
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use pageserver_client::mgmt_api;
|
use pageserver_client::mgmt_api;
|
||||||
@@ -41,7 +42,7 @@ pub struct HttpState {
|
|||||||
|
|
||||||
impl HttpState {
|
impl HttpState {
|
||||||
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
|
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
|
||||||
let allowlist_routes = ["/status"]
|
let allowlist_routes = ["/status", "/ready", "/metrics"]
|
||||||
.iter()
|
.iter()
|
||||||
.map(|v| v.parse().unwrap())
|
.map(|v| v.parse().unwrap())
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
@@ -279,6 +280,12 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
|
|||||||
json_response(StatusCode::OK, state.service.node_list().await?)
|
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
|
let state = get_state(&req);
|
||||||
|
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||||
|
json_response(StatusCode::OK, state.service.node_drop(node_id).await?)
|
||||||
|
}
|
||||||
|
|
||||||
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||||
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
|
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
|
||||||
@@ -292,6 +299,19 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
|
|||||||
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
|
json_response(StatusCode::OK, state.service.node_configure(config_req)?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn handle_tenant_shard_split(
|
||||||
|
service: Arc<Service>,
|
||||||
|
mut req: Request<Body>,
|
||||||
|
) -> Result<Response<Body>, ApiError> {
|
||||||
|
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||||
|
let split_req = json_request::<TenantShardSplitRequest>(&mut req).await?;
|
||||||
|
|
||||||
|
json_response(
|
||||||
|
StatusCode::OK,
|
||||||
|
service.tenant_shard_split(tenant_id, split_req).await?,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
async fn handle_tenant_shard_migrate(
|
async fn handle_tenant_shard_migrate(
|
||||||
service: Arc<Service>,
|
service: Arc<Service>,
|
||||||
mut req: Request<Body>,
|
mut req: Request<Body>,
|
||||||
@@ -306,11 +326,29 @@ async fn handle_tenant_shard_migrate(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
|
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||||
|
let state = get_state(&req);
|
||||||
|
|
||||||
|
json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
|
||||||
|
}
|
||||||
|
|
||||||
/// Status endpoint is just used for checking that our HTTP listener is up
|
/// Status endpoint is just used for checking that our HTTP listener is up
|
||||||
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Readiness endpoint indicates when we're done doing startup I/O (e.g. reconciling
|
||||||
|
/// with remote pageserver nodes). This is intended for use as a kubernetes readiness probe.
|
||||||
|
async fn handle_ready(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
|
let state = get_state(&req);
|
||||||
|
if state.service.startup_complete.is_ready() {
|
||||||
|
json_response(StatusCode::OK, ())
|
||||||
|
} else {
|
||||||
|
json_response(StatusCode::SERVICE_UNAVAILABLE, ())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl From<ReconcileError> for ApiError {
|
impl From<ReconcileError> for ApiError {
|
||||||
fn from(value: ReconcileError) -> Self {
|
fn from(value: ReconcileError) -> Self {
|
||||||
ApiError::Conflict(format!("Reconciliation error: {}", value))
|
ApiError::Conflict(format!("Reconciliation error: {}", value))
|
||||||
@@ -366,6 +404,7 @@ pub fn make_router(
|
|||||||
.data(Arc::new(HttpState::new(service, auth)))
|
.data(Arc::new(HttpState::new(service, auth)))
|
||||||
// Non-prefixed generic endpoints (status, metrics)
|
// Non-prefixed generic endpoints (status, metrics)
|
||||||
.get("/status", |r| request_span(r, handle_status))
|
.get("/status", |r| request_span(r, handle_status))
|
||||||
|
.get("/ready", |r| request_span(r, handle_ready))
|
||||||
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
||||||
.post("/upcall/v1/re-attach", |r| {
|
.post("/upcall/v1/re-attach", |r| {
|
||||||
request_span(r, handle_re_attach)
|
request_span(r, handle_re_attach)
|
||||||
@@ -376,6 +415,12 @@ pub fn make_router(
|
|||||||
request_span(r, handle_attach_hook)
|
request_span(r, handle_attach_hook)
|
||||||
})
|
})
|
||||||
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
||||||
|
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
|
||||||
|
request_span(r, handle_tenant_drop)
|
||||||
|
})
|
||||||
|
.post("/debug/v1/node/:node_id/drop", |r| {
|
||||||
|
request_span(r, handle_node_drop)
|
||||||
|
})
|
||||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_locate)
|
tenant_service_handler(r, handle_tenant_locate)
|
||||||
})
|
})
|
||||||
@@ -391,6 +436,9 @@ pub fn make_router(
|
|||||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||||
})
|
})
|
||||||
|
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
|
||||||
|
tenant_service_handler(r, handle_tenant_shard_split)
|
||||||
|
})
|
||||||
// Tenant operations
|
// Tenant operations
|
||||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||||
|
|||||||
@@ -1,7 +1,9 @@
|
|||||||
|
pub(crate) mod split_state;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use self::split_state::SplitState;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use camino::Utf8PathBuf;
|
use camino::Utf8PathBuf;
|
||||||
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
|
||||||
@@ -258,7 +260,6 @@ impl Persistence {
|
|||||||
|
|
||||||
/// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
|
/// Ordering: call this _after_ deleting the tenant on pageservers, but _before_ dropping state for
|
||||||
/// the tenant from memory on this server.
|
/// the tenant from memory on this server.
|
||||||
#[allow(unused)]
|
|
||||||
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
|
||||||
use crate::schema::tenant_shards::dsl::*;
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
self.with_conn(move |conn| -> DatabaseResult<()> {
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
@@ -271,6 +272,18 @@ impl Persistence {
|
|||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
|
||||||
|
use crate::schema::nodes::dsl::*;
|
||||||
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
|
diesel::delete(nodes)
|
||||||
|
.filter(node_id.eq(del_node_id.0 as i64))
|
||||||
|
.execute(conn)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
/// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
|
/// When a tenant invokes the /re-attach API, this function is responsible for doing an efficient
|
||||||
/// batched increment of the generations of all tenants whose generation_pageserver is equal to
|
/// batched increment of the generations of all tenants whose generation_pageserver is equal to
|
||||||
/// the node that called /re-attach.
|
/// the node that called /re-attach.
|
||||||
@@ -363,19 +376,107 @@ impl Persistence {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: when we start shard splitting, we must durably mark the tenant so that
|
// When we start shard splitting, we must durably mark the tenant so that
|
||||||
// on restart, we know that we must go through recovery (list shards that exist
|
// on restart, we know that we must go through recovery.
|
||||||
// and pick up where we left off and/or revert to parent shards).
|
//
|
||||||
|
// We create the child shards here, so that they will be available for increment_generation calls
|
||||||
|
// if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
|
pub(crate) async fn begin_shard_split(
|
||||||
todo!();
|
&self,
|
||||||
|
old_shard_count: ShardCount,
|
||||||
|
split_tenant_id: TenantId,
|
||||||
|
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
|
||||||
|
) -> DatabaseResult<()> {
|
||||||
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
|
conn.transaction(|conn| -> DatabaseResult<()> {
|
||||||
|
// Mark parent shards as splitting
|
||||||
|
|
||||||
|
let expect_parent_records = std::cmp::max(1, old_shard_count.0);
|
||||||
|
|
||||||
|
let updated = diesel::update(tenant_shards)
|
||||||
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
|
.filter(shard_count.eq(old_shard_count.0 as i32))
|
||||||
|
.set((splitting.eq(1),))
|
||||||
|
.execute(conn)?;
|
||||||
|
if u8::try_from(updated)
|
||||||
|
.map_err(|_| DatabaseError::Logical(
|
||||||
|
format!("Overflow existing shard count {} while splitting", updated))
|
||||||
|
)? != expect_parent_records {
|
||||||
|
// Perhaps a deletion or another split raced with this attempt to split, mutating
|
||||||
|
// the parent shards that we intend to split. In this case the split request should fail.
|
||||||
|
return Err(DatabaseError::Logical(
|
||||||
|
format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: spurious clone to sidestep closure move rules
|
||||||
|
let parent_to_children = parent_to_children.clone();
|
||||||
|
|
||||||
|
// Insert child shards
|
||||||
|
for (parent_shard_id, children) in parent_to_children {
|
||||||
|
let mut parent = crate::schema::tenant_shards::table
|
||||||
|
.filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
|
||||||
|
.filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
|
||||||
|
.filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
|
||||||
|
.load::<TenantShardPersistence>(conn)?;
|
||||||
|
let parent = if parent.len() != 1 {
|
||||||
|
return Err(DatabaseError::Logical(format!(
|
||||||
|
"Parent shard {parent_shard_id} not found"
|
||||||
|
)));
|
||||||
|
} else {
|
||||||
|
parent.pop().unwrap()
|
||||||
|
};
|
||||||
|
for mut shard in children {
|
||||||
|
// Carry the parent's generation into the child
|
||||||
|
shard.generation = parent.generation;
|
||||||
|
|
||||||
|
debug_assert!(shard.splitting == SplitState::Splitting);
|
||||||
|
diesel::insert_into(tenant_shards)
|
||||||
|
.values(shard)
|
||||||
|
.execute(conn)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: when we finish shard splitting, we must atomically clean up the old shards
|
// When we finish shard splitting, we must atomically clean up the old shards
|
||||||
// and insert the new shards, and clear the splitting marker.
|
// and insert the new shards, and clear the splitting marker.
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
|
pub(crate) async fn complete_shard_split(
|
||||||
todo!();
|
&self,
|
||||||
|
split_tenant_id: TenantId,
|
||||||
|
old_shard_count: ShardCount,
|
||||||
|
) -> DatabaseResult<()> {
|
||||||
|
use crate::schema::tenant_shards::dsl::*;
|
||||||
|
self.with_conn(move |conn| -> DatabaseResult<()> {
|
||||||
|
conn.transaction(|conn| -> QueryResult<()> {
|
||||||
|
// Drop parent shards
|
||||||
|
diesel::delete(tenant_shards)
|
||||||
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
|
.filter(shard_count.eq(old_shard_count.0 as i32))
|
||||||
|
.execute(conn)?;
|
||||||
|
|
||||||
|
// Clear sharding flag
|
||||||
|
let updated = diesel::update(tenant_shards)
|
||||||
|
.filter(tenant_id.eq(split_tenant_id.to_string()))
|
||||||
|
.set((splitting.eq(0),))
|
||||||
|
.execute(conn)?;
|
||||||
|
debug_assert!(updated > 0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
})
|
||||||
|
.await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -403,6 +504,8 @@ pub(crate) struct TenantShardPersistence {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub(crate) placement_policy: String,
|
pub(crate) placement_policy: String,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
|
pub(crate) splitting: SplitState,
|
||||||
|
#[serde(default)]
|
||||||
pub(crate) config: String,
|
pub(crate) config: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,46 @@
|
|||||||
|
use diesel::pg::{Pg, PgValue};
|
||||||
|
use diesel::{
|
||||||
|
deserialize::FromSql, deserialize::FromSqlRow, expression::AsExpression, serialize::ToSql,
|
||||||
|
sql_types::Int2,
|
||||||
|
};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, FromSqlRow, AsExpression)]
|
||||||
|
#[diesel(sql_type = SplitStateSQLRepr)]
|
||||||
|
#[derive(Deserialize, Serialize)]
|
||||||
|
pub enum SplitState {
|
||||||
|
Idle = 0,
|
||||||
|
Splitting = 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for SplitState {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::Idle
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type SplitStateSQLRepr = Int2;
|
||||||
|
|
||||||
|
impl ToSql<SplitStateSQLRepr, Pg> for SplitState {
|
||||||
|
fn to_sql<'a>(
|
||||||
|
&'a self,
|
||||||
|
out: &'a mut diesel::serialize::Output<Pg>,
|
||||||
|
) -> diesel::serialize::Result {
|
||||||
|
let raw_value: i16 = *self as i16;
|
||||||
|
let mut new_out = out.reborrow();
|
||||||
|
ToSql::<SplitStateSQLRepr, Pg>::to_sql(&raw_value, &mut new_out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromSql<SplitStateSQLRepr, Pg> for SplitState {
|
||||||
|
fn from_sql(pg_value: PgValue) -> diesel::deserialize::Result<Self> {
|
||||||
|
match FromSql::<SplitStateSQLRepr, Pg>::from_sql(pg_value).map(|v| match v {
|
||||||
|
0 => Some(Self::Idle),
|
||||||
|
1 => Some(Self::Splitting),
|
||||||
|
_ => None,
|
||||||
|
})? {
|
||||||
|
Some(v) => Ok(v),
|
||||||
|
None => Err(format!("Invalid SplitState value, was: {:?}", pg_value.as_bytes()).into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -20,6 +20,7 @@ diesel::table! {
|
|||||||
generation -> Int4,
|
generation -> Int4,
|
||||||
generation_pageserver -> Int8,
|
generation_pageserver -> Int8,
|
||||||
placement_policy -> Varchar,
|
placement_policy -> Varchar,
|
||||||
|
splitting -> Int2,
|
||||||
config -> Text,
|
config -> Text,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
use std::{
|
use std::{
|
||||||
collections::{BTreeMap, HashMap},
|
cmp::Ordering,
|
||||||
|
collections::{BTreeMap, HashMap, HashSet},
|
||||||
str::FromStr,
|
str::FromStr,
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
time::{Duration, Instant},
|
time::{Duration, Instant},
|
||||||
@@ -23,13 +24,14 @@ use pageserver_api::{
|
|||||||
models::{
|
models::{
|
||||||
LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
|
LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
|
||||||
TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
|
TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
|
||||||
TimelineCreateRequest, TimelineInfo,
|
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||||
},
|
},
|
||||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
|
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
|
||||||
};
|
};
|
||||||
use pageserver_client::mgmt_api;
|
use pageserver_client::mgmt_api;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use utils::{
|
use utils::{
|
||||||
|
backoff,
|
||||||
completion::Barrier,
|
completion::Barrier,
|
||||||
generation::Generation,
|
generation::Generation,
|
||||||
http::error::ApiError,
|
http::error::ApiError,
|
||||||
@@ -40,7 +42,11 @@ use utils::{
|
|||||||
use crate::{
|
use crate::{
|
||||||
compute_hook::{self, ComputeHook},
|
compute_hook::{self, ComputeHook},
|
||||||
node::Node,
|
node::Node,
|
||||||
persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
|
persistence::{
|
||||||
|
split_state::SplitState, DatabaseError, NodePersistence, Persistence,
|
||||||
|
TenantShardPersistence,
|
||||||
|
},
|
||||||
|
reconciler::attached_location_conf,
|
||||||
scheduler::Scheduler,
|
scheduler::Scheduler,
|
||||||
tenant_state::{
|
tenant_state::{
|
||||||
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
|
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
|
||||||
@@ -145,31 +151,71 @@ impl Service {
|
|||||||
// indeterminate, same as in [`ObservedStateLocation`])
|
// indeterminate, same as in [`ObservedStateLocation`])
|
||||||
let mut observed = HashMap::new();
|
let mut observed = HashMap::new();
|
||||||
|
|
||||||
let nodes = {
|
let mut nodes_online = HashSet::new();
|
||||||
let locked = self.inner.read().unwrap();
|
|
||||||
locked.nodes.clone()
|
// TODO: give Service a cancellation token for clean shutdown
|
||||||
};
|
let cancel = CancellationToken::new();
|
||||||
|
|
||||||
// TODO: issue these requests concurrently
|
// TODO: issue these requests concurrently
|
||||||
for node in nodes.values() {
|
{
|
||||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
let nodes = {
|
||||||
|
let locked = self.inner.read().unwrap();
|
||||||
|
locked.nodes.clone()
|
||||||
|
};
|
||||||
|
for node in nodes.values() {
|
||||||
|
let http_client = reqwest::ClientBuilder::new()
|
||||||
|
.timeout(Duration::from_secs(5))
|
||||||
|
.build()
|
||||||
|
.expect("Failed to construct HTTP client");
|
||||||
|
let client = mgmt_api::Client::from_client(
|
||||||
|
http_client,
|
||||||
|
node.base_url(),
|
||||||
|
self.config.jwt_token.as_deref(),
|
||||||
|
);
|
||||||
|
|
||||||
tracing::info!("Scanning shards on node {}...", node.id);
|
fn is_fatal(e: &mgmt_api::Error) -> bool {
|
||||||
match client.list_location_config().await {
|
use mgmt_api::Error::*;
|
||||||
Err(e) => {
|
match e {
|
||||||
tracing::warn!("Could not contact pageserver {} ({e})", node.id);
|
ReceiveBody(_) | ReceiveErrorBody(_) => false,
|
||||||
// TODO: be more tolerant, apply a generous 5-10 second timeout with retries, in case
|
ApiError(StatusCode::SERVICE_UNAVAILABLE, _)
|
||||||
// pageserver is being restarted at the same time as we are
|
| ApiError(StatusCode::GATEWAY_TIMEOUT, _)
|
||||||
|
| ApiError(StatusCode::REQUEST_TIMEOUT, _) => false,
|
||||||
|
ApiError(_, _) => true,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(listing) => {
|
|
||||||
tracing::info!(
|
|
||||||
"Received {} shard statuses from pageserver {}, setting it to Active",
|
|
||||||
listing.tenant_shards.len(),
|
|
||||||
node.id
|
|
||||||
);
|
|
||||||
|
|
||||||
for (tenant_shard_id, conf_opt) in listing.tenant_shards {
|
let list_response = backoff::retry(
|
||||||
observed.insert(tenant_shard_id, (node.id, conf_opt));
|
|| client.list_location_config(),
|
||||||
|
is_fatal,
|
||||||
|
1,
|
||||||
|
5,
|
||||||
|
"Location config listing",
|
||||||
|
&cancel,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
let Some(list_response) = list_response else {
|
||||||
|
tracing::info!("Shutdown during startup_reconcile");
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
|
tracing::info!("Scanning shards on node {}...", node.id);
|
||||||
|
match list_response {
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!("Could not contact pageserver {} ({e})", node.id);
|
||||||
|
// TODO: be more tolerant, do some retries, in case
|
||||||
|
// pageserver is being restarted at the same time as we are
|
||||||
|
}
|
||||||
|
Ok(listing) => {
|
||||||
|
tracing::info!(
|
||||||
|
"Received {} shard statuses from pageserver {}, setting it to Active",
|
||||||
|
listing.tenant_shards.len(),
|
||||||
|
node.id
|
||||||
|
);
|
||||||
|
nodes_online.insert(node.id);
|
||||||
|
|
||||||
|
for (tenant_shard_id, conf_opt) in listing.tenant_shards {
|
||||||
|
observed.insert(tenant_shard_id, (node.id, conf_opt));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -180,8 +226,19 @@ impl Service {
|
|||||||
let mut compute_notifications = Vec::new();
|
let mut compute_notifications = Vec::new();
|
||||||
|
|
||||||
// Populate intent and observed states for all tenants, based on reported state on pageservers
|
// Populate intent and observed states for all tenants, based on reported state on pageservers
|
||||||
let shard_count = {
|
let (shard_count, nodes) = {
|
||||||
let mut locked = self.inner.write().unwrap();
|
let mut locked = self.inner.write().unwrap();
|
||||||
|
|
||||||
|
// Mark nodes online if they responded to us: nodes are offline by default after a restart.
|
||||||
|
let mut nodes = (*locked.nodes).clone();
|
||||||
|
for (node_id, node) in nodes.iter_mut() {
|
||||||
|
if nodes_online.contains(node_id) {
|
||||||
|
node.availability = NodeAvailability::Active;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
locked.nodes = Arc::new(nodes);
|
||||||
|
let nodes = locked.nodes.clone();
|
||||||
|
|
||||||
for (tenant_shard_id, (node_id, observed_loc)) in observed {
|
for (tenant_shard_id, (node_id, observed_loc)) in observed {
|
||||||
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
|
let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else {
|
||||||
cleanup.push((tenant_shard_id, node_id));
|
cleanup.push((tenant_shard_id, node_id));
|
||||||
@@ -213,7 +270,7 @@ impl Service {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
locked.tenants.len()
|
(locked.tenants.len(), nodes)
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
|
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
|
||||||
@@ -274,9 +331,8 @@ impl Service {
|
|||||||
let stream = futures::stream::iter(compute_notifications.into_iter())
|
let stream = futures::stream::iter(compute_notifications.into_iter())
|
||||||
.map(|(tenant_shard_id, node_id)| {
|
.map(|(tenant_shard_id, node_id)| {
|
||||||
let compute_hook = compute_hook.clone();
|
let compute_hook = compute_hook.clone();
|
||||||
|
let cancel = cancel.clone();
|
||||||
async move {
|
async move {
|
||||||
// TODO: give Service a cancellation token for clean shutdown
|
|
||||||
let cancel = CancellationToken::new();
|
|
||||||
if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
|
if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
|
||||||
tracing::error!(
|
tracing::error!(
|
||||||
tenant_shard_id=%tenant_shard_id,
|
tenant_shard_id=%tenant_shard_id,
|
||||||
@@ -382,7 +438,7 @@ impl Service {
|
|||||||
))),
|
))),
|
||||||
config,
|
config,
|
||||||
persistence,
|
persistence,
|
||||||
startup_complete,
|
startup_complete: startup_complete.clone(),
|
||||||
});
|
});
|
||||||
|
|
||||||
let result_task_this = this.clone();
|
let result_task_this = this.clone();
|
||||||
@@ -476,6 +532,7 @@ impl Service {
|
|||||||
generation_pageserver: i64::MAX,
|
generation_pageserver: i64::MAX,
|
||||||
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
|
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
|
||||||
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||||
|
splitting: SplitState::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
match self.persistence.insert_tenant_shards(vec![tsp]).await {
|
match self.persistence.insert_tenant_shards(vec![tsp]).await {
|
||||||
@@ -718,6 +775,7 @@ impl Service {
|
|||||||
generation_pageserver: i64::MAX,
|
generation_pageserver: i64::MAX,
|
||||||
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
|
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
|
||||||
config: serde_json::to_string(&create_req.config).unwrap(),
|
config: serde_json::to_string(&create_req.config).unwrap(),
|
||||||
|
splitting: SplitState::default(),
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
self.persistence
|
self.persistence
|
||||||
@@ -977,6 +1035,10 @@ impl Service {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// TODO: if we timeout/fail on reconcile, we should still succeed this request,
|
||||||
|
// because otherwise a broken compute hook causes a feedback loop where
|
||||||
|
// location_config returns 500 and gets retried forever.
|
||||||
|
|
||||||
if let Some(create_req) = maybe_create {
|
if let Some(create_req) = maybe_create {
|
||||||
let create_resp = self.tenant_create(create_req).await?;
|
let create_resp = self.tenant_create(create_req).await?;
|
||||||
result.shards = create_resp
|
result.shards = create_resp
|
||||||
@@ -1100,6 +1162,7 @@ impl Service {
|
|||||||
self.ensure_attached_wait(tenant_id).await?;
|
self.ensure_attached_wait(tenant_id).await?;
|
||||||
|
|
||||||
// TODO: refuse to do this if shard splitting is in progress
|
// TODO: refuse to do this if shard splitting is in progress
|
||||||
|
// (https://github.com/neondatabase/neon/issues/6676)
|
||||||
let targets = {
|
let targets = {
|
||||||
let locked = self.inner.read().unwrap();
|
let locked = self.inner.read().unwrap();
|
||||||
let mut targets = Vec::new();
|
let mut targets = Vec::new();
|
||||||
@@ -1180,6 +1243,7 @@ impl Service {
|
|||||||
self.ensure_attached_wait(tenant_id).await?;
|
self.ensure_attached_wait(tenant_id).await?;
|
||||||
|
|
||||||
// TODO: refuse to do this if shard splitting is in progress
|
// TODO: refuse to do this if shard splitting is in progress
|
||||||
|
// (https://github.com/neondatabase/neon/issues/6676)
|
||||||
let targets = {
|
let targets = {
|
||||||
let locked = self.inner.read().unwrap();
|
let locked = self.inner.read().unwrap();
|
||||||
let mut targets = Vec::new();
|
let mut targets = Vec::new();
|
||||||
@@ -1352,6 +1416,326 @@ impl Service {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) async fn tenant_shard_split(
|
||||||
|
&self,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
split_req: TenantShardSplitRequest,
|
||||||
|
) -> Result<TenantShardSplitResponse, ApiError> {
|
||||||
|
let mut policy = None;
|
||||||
|
let mut shard_ident = None;
|
||||||
|
|
||||||
|
// TODO: put a cancellation token on Service for clean shutdown
|
||||||
|
let cancel = CancellationToken::new();
|
||||||
|
|
||||||
|
// A parent shard which will be split
|
||||||
|
struct SplitTarget {
|
||||||
|
parent_id: TenantShardId,
|
||||||
|
node: Node,
|
||||||
|
child_ids: Vec<TenantShardId>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate input, and calculate which shards we will create
|
||||||
|
let (old_shard_count, targets, compute_hook) = {
|
||||||
|
let locked = self.inner.read().unwrap();
|
||||||
|
|
||||||
|
let pageservers = locked.nodes.clone();
|
||||||
|
|
||||||
|
let mut targets = Vec::new();
|
||||||
|
|
||||||
|
// In case this is a retry, count how many already-split shards we found
|
||||||
|
let mut children_found = Vec::new();
|
||||||
|
let mut old_shard_count = None;
|
||||||
|
|
||||||
|
for (tenant_shard_id, shard) in
|
||||||
|
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||||
|
{
|
||||||
|
match shard.shard.count.0.cmp(&split_req.new_shard_count) {
|
||||||
|
Ordering::Equal => {
|
||||||
|
// Already split this
|
||||||
|
children_found.push(*tenant_shard_id);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Ordering::Greater => {
|
||||||
|
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||||
|
"Requested count {} but already have shards at count {}",
|
||||||
|
split_req.new_shard_count,
|
||||||
|
shard.shard.count.0
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
Ordering::Less => {
|
||||||
|
// Fall through: this shard has lower count than requested,
|
||||||
|
// is a candidate for splitting.
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match old_shard_count {
|
||||||
|
None => old_shard_count = Some(shard.shard.count),
|
||||||
|
Some(old_shard_count) => {
|
||||||
|
if old_shard_count != shard.shard.count {
|
||||||
|
// We may hit this case if a caller asked for two splits to
|
||||||
|
// different sizes, before the first one is complete.
|
||||||
|
// e.g. 1->2, 2->4, where the 4 call comes while we have a mixture
|
||||||
|
// of shard_count=1 and shard_count=2 shards in the map.
|
||||||
|
return Err(ApiError::Conflict(
|
||||||
|
"Cannot split, currently mid-split".to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if policy.is_none() {
|
||||||
|
policy = Some(shard.policy.clone());
|
||||||
|
}
|
||||||
|
if shard_ident.is_none() {
|
||||||
|
shard_ident = Some(shard.shard);
|
||||||
|
}
|
||||||
|
|
||||||
|
if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
|
||||||
|
tracing::info!(
|
||||||
|
"Tenant shard {} already has shard count {}",
|
||||||
|
tenant_shard_id,
|
||||||
|
split_req.new_shard_count
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let node_id =
|
||||||
|
shard
|
||||||
|
.intent
|
||||||
|
.attached
|
||||||
|
.ok_or(ApiError::BadRequest(anyhow::anyhow!(
|
||||||
|
"Cannot split a tenant that is not attached"
|
||||||
|
)))?;
|
||||||
|
|
||||||
|
let node = pageservers
|
||||||
|
.get(&node_id)
|
||||||
|
.expect("Pageservers may not be deleted while referenced");
|
||||||
|
|
||||||
|
// TODO: if any reconciliation is currently in progress for this shard, wait for it.
|
||||||
|
|
||||||
|
targets.push(SplitTarget {
|
||||||
|
parent_id: *tenant_shard_id,
|
||||||
|
node: node.clone(),
|
||||||
|
child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if targets.is_empty() {
|
||||||
|
if children_found.len() == split_req.new_shard_count as usize {
|
||||||
|
return Ok(TenantShardSplitResponse {
|
||||||
|
new_shards: children_found,
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
// No shards found to split, and no existing children found: the
|
||||||
|
// tenant doesn't exist at all.
|
||||||
|
return Err(ApiError::NotFound(
|
||||||
|
anyhow::anyhow!("Tenant {} not found", tenant_id).into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
(old_shard_count, targets, locked.compute_hook.clone())
|
||||||
|
};
|
||||||
|
|
||||||
|
// unwrap safety: we would have returned above if we didn't find at least one shard to split
|
||||||
|
let old_shard_count = old_shard_count.unwrap();
|
||||||
|
let shard_ident = shard_ident.unwrap();
|
||||||
|
let policy = policy.unwrap();
|
||||||
|
|
||||||
|
// FIXME: we have dropped self.inner lock, and not yet written anything to the database: another
|
||||||
|
// request could occur here, deleting or mutating the tenant. begin_shard_split checks that the
|
||||||
|
// parent shards exist as expected, but it would be neater to do the above pre-checks within the
|
||||||
|
// same database transaction rather than pre-check in-memory and then maybe-fail the database write.
|
||||||
|
// (https://github.com/neondatabase/neon/issues/6676)
|
||||||
|
|
||||||
|
// Before creating any new child shards in memory or on the pageservers, persist them: this
|
||||||
|
// enables us to ensure that we will always be able to clean up if something goes wrong. This also
|
||||||
|
// acts as the protection against two concurrent attempts to split: one of them will get a database
|
||||||
|
// error trying to insert the child shards.
|
||||||
|
let mut child_tsps = Vec::new();
|
||||||
|
for target in &targets {
|
||||||
|
let mut this_child_tsps = Vec::new();
|
||||||
|
for child in &target.child_ids {
|
||||||
|
let mut child_shard = shard_ident;
|
||||||
|
child_shard.number = child.shard_number;
|
||||||
|
child_shard.count = child.shard_count;
|
||||||
|
|
||||||
|
this_child_tsps.push(TenantShardPersistence {
|
||||||
|
tenant_id: child.tenant_id.to_string(),
|
||||||
|
shard_number: child.shard_number.0 as i32,
|
||||||
|
shard_count: child.shard_count.0 as i32,
|
||||||
|
shard_stripe_size: shard_ident.stripe_size.0 as i32,
|
||||||
|
// Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
|
||||||
|
// populate the correct generation as part of its transaction, to protect us
|
||||||
|
// against racing with changes in the state of the parent.
|
||||||
|
generation: 0,
|
||||||
|
generation_pageserver: target.node.id.0 as i64,
|
||||||
|
placement_policy: serde_json::to_string(&policy).unwrap(),
|
||||||
|
// TODO: get the config out of the map
|
||||||
|
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
|
||||||
|
splitting: SplitState::Splitting,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
child_tsps.push((target.parent_id, this_child_tsps));
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Err(e) = self
|
||||||
|
.persistence
|
||||||
|
.begin_shard_split(old_shard_count, tenant_id, child_tsps)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
match e {
|
||||||
|
DatabaseError::Query(diesel::result::Error::DatabaseError(
|
||||||
|
DatabaseErrorKind::UniqueViolation,
|
||||||
|
_,
|
||||||
|
)) => {
|
||||||
|
// Inserting a child shard violated a unique constraint: we raced with another call to
|
||||||
|
// this function
|
||||||
|
tracing::warn!("Conflicting attempt to split {tenant_id}: {e}");
|
||||||
|
return Err(ApiError::Conflict("Tenant is already splitting".into()));
|
||||||
|
}
|
||||||
|
_ => return Err(ApiError::InternalServerError(e.into())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: we have now committed the shard split state to the database, so any subsequent
|
||||||
|
// failure needs to roll it back. We will later wrap this function in logic to roll back
|
||||||
|
// the split if it fails.
|
||||||
|
// (https://github.com/neondatabase/neon/issues/6676)
|
||||||
|
|
||||||
|
// TODO: issue split calls concurrently (this only matters once we're splitting
|
||||||
|
// N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
|
||||||
|
|
||||||
|
for target in &targets {
|
||||||
|
let SplitTarget {
|
||||||
|
parent_id,
|
||||||
|
node,
|
||||||
|
child_ids,
|
||||||
|
} = target;
|
||||||
|
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||||
|
let response = client
|
||||||
|
.tenant_shard_split(
|
||||||
|
*parent_id,
|
||||||
|
TenantShardSplitRequest {
|
||||||
|
new_shard_count: split_req.new_shard_count,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(|e| ApiError::Conflict(format!("Failed to split {}: {}", parent_id, e)))?;
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
"Split {} into {}",
|
||||||
|
parent_id,
|
||||||
|
response
|
||||||
|
.new_shards
|
||||||
|
.iter()
|
||||||
|
.map(|s| format!("{:?}", s))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(",")
|
||||||
|
);
|
||||||
|
|
||||||
|
if &response.new_shards != child_ids {
|
||||||
|
// This should never happen: the pageserver should agree with us on how shard splits work.
|
||||||
|
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||||
|
"Splitting shard {} resulted in unexpected IDs: {:?} (expected {:?})",
|
||||||
|
parent_id,
|
||||||
|
response.new_shards,
|
||||||
|
child_ids
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: if the pageserver restarted concurrently with our split API call,
|
||||||
|
// the actual generation of the child shard might differ from the generation
|
||||||
|
// we expect it to have. In order for our in-database generation to end up
|
||||||
|
// correct, we should carry the child generation back in the response and apply it here
|
||||||
|
// in complete_shard_split (and apply the correct generation in memory)
|
||||||
|
// (or, we can carry generation in the request and reject the request if
|
||||||
|
// it doesn't match, but that requires more retry logic on this side)
|
||||||
|
|
||||||
|
self.persistence
|
||||||
|
.complete_shard_split(tenant_id, old_shard_count)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// Replace all the shards we just split with their children
|
||||||
|
let mut response = TenantShardSplitResponse {
|
||||||
|
new_shards: Vec::new(),
|
||||||
|
};
|
||||||
|
let mut child_locations = Vec::new();
|
||||||
|
{
|
||||||
|
let mut locked = self.inner.write().unwrap();
|
||||||
|
for target in targets {
|
||||||
|
let SplitTarget {
|
||||||
|
parent_id,
|
||||||
|
node: _node,
|
||||||
|
child_ids,
|
||||||
|
} = target;
|
||||||
|
let (pageserver, generation, config) = {
|
||||||
|
let old_state = locked
|
||||||
|
.tenants
|
||||||
|
.remove(&parent_id)
|
||||||
|
.expect("It was present, we just split it");
|
||||||
|
(
|
||||||
|
old_state.intent.attached.unwrap(),
|
||||||
|
old_state.generation,
|
||||||
|
old_state.config.clone(),
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
locked.tenants.remove(&parent_id);
|
||||||
|
|
||||||
|
for child in child_ids {
|
||||||
|
let mut child_shard = shard_ident;
|
||||||
|
child_shard.number = child.shard_number;
|
||||||
|
child_shard.count = child.shard_count;
|
||||||
|
|
||||||
|
let mut child_observed: HashMap<NodeId, ObservedStateLocation> = HashMap::new();
|
||||||
|
child_observed.insert(
|
||||||
|
pageserver,
|
||||||
|
ObservedStateLocation {
|
||||||
|
conf: Some(attached_location_conf(generation, &child_shard, &config)),
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut child_state = TenantState::new(child, child_shard, policy.clone());
|
||||||
|
child_state.intent = IntentState::single(Some(pageserver));
|
||||||
|
child_state.observed = ObservedState {
|
||||||
|
locations: child_observed,
|
||||||
|
};
|
||||||
|
child_state.generation = generation;
|
||||||
|
child_state.config = config.clone();
|
||||||
|
|
||||||
|
child_locations.push((child, pageserver));
|
||||||
|
|
||||||
|
locked.tenants.insert(child, child_state);
|
||||||
|
response.new_shards.push(child);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send compute notifications for all the new shards
|
||||||
|
let mut failed_notifications = Vec::new();
|
||||||
|
for (child_id, child_ps) in child_locations {
|
||||||
|
if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await {
|
||||||
|
tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
|
||||||
|
child_id, child_ps);
|
||||||
|
failed_notifications.push(child_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we failed any compute notifications, make a note to retry later.
|
||||||
|
if !failed_notifications.is_empty() {
|
||||||
|
let mut locked = self.inner.write().unwrap();
|
||||||
|
for failed in failed_notifications {
|
||||||
|
if let Some(shard) = locked.tenants.get_mut(&failed) {
|
||||||
|
shard.pending_compute_notification = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(response)
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) async fn tenant_shard_migrate(
|
pub(crate) async fn tenant_shard_migrate(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
@@ -1420,6 +1804,45 @@ impl Service {
|
|||||||
Ok(TenantShardMigrateResponse {})
|
Ok(TenantShardMigrateResponse {})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This is for debug/support only: we simply drop all state for a tenant, without
|
||||||
|
/// detaching or deleting it on pageservers.
|
||||||
|
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> {
|
||||||
|
self.persistence.delete_tenant(tenant_id).await?;
|
||||||
|
|
||||||
|
let mut locked = self.inner.write().unwrap();
|
||||||
|
let mut shards = Vec::new();
|
||||||
|
for (tenant_shard_id, _) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
|
||||||
|
shards.push(*tenant_shard_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
for shard in shards {
|
||||||
|
locked.tenants.remove(&shard);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is for debug/support only: we simply drop all state for a tenant, without
|
||||||
|
/// detaching or deleting it on pageservers. We do not try and re-schedule any
|
||||||
|
/// tenants that were on this node.
|
||||||
|
///
|
||||||
|
/// TODO: proper node deletion API that unhooks things more gracefully
|
||||||
|
pub(crate) async fn node_drop(&self, node_id: NodeId) -> Result<(), ApiError> {
|
||||||
|
self.persistence.delete_node(node_id).await?;
|
||||||
|
|
||||||
|
let mut locked = self.inner.write().unwrap();
|
||||||
|
|
||||||
|
for shard in locked.tenants.values_mut() {
|
||||||
|
shard.deref_node(node_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut nodes = (*locked.nodes).clone();
|
||||||
|
nodes.remove(&node_id);
|
||||||
|
locked.nodes = Arc::new(nodes);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
|
pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
|
||||||
// It is convenient to avoid taking the big lock and converting Node to a serializable
|
// It is convenient to avoid taking the big lock and converting Node to a serializable
|
||||||
// structure, by fetching from storage instead of reading in-memory state.
|
// structure, by fetching from storage instead of reading in-memory state.
|
||||||
|
|||||||
@@ -193,6 +193,13 @@ impl IntentState {
|
|||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn single(node_id: Option<NodeId>) -> Self {
|
||||||
|
Self {
|
||||||
|
attached: node_id,
|
||||||
|
secondary: vec![],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// When a node goes offline, we update intents to avoid using it
|
/// When a node goes offline, we update intents to avoid using it
|
||||||
/// as their attached pageserver.
|
/// as their attached pageserver.
|
||||||
///
|
///
|
||||||
@@ -286,6 +293,9 @@ impl TenantState {
|
|||||||
// self.intent refers to pageservers that are offline, and pick other
|
// self.intent refers to pageservers that are offline, and pick other
|
||||||
// pageservers if so.
|
// pageservers if so.
|
||||||
|
|
||||||
|
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
|
||||||
|
// change their attach location.
|
||||||
|
|
||||||
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
// Build the set of pageservers already in use by this tenant, to avoid scheduling
|
||||||
// more work on the same pageservers we're already using.
|
// more work on the same pageservers we're already using.
|
||||||
let mut used_pageservers = self.intent.all_pageservers();
|
let mut used_pageservers = self.intent.all_pageservers();
|
||||||
@@ -524,4 +534,18 @@ impl TenantState {
|
|||||||
seq: self.sequence,
|
seq: self.sequence,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we had any state at all referring to this node ID, drop it. Does not
|
||||||
|
// attempt to reschedule.
|
||||||
|
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
|
||||||
|
if self.intent.attached == Some(node_id) {
|
||||||
|
self.intent.attached = None;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.intent.secondary.retain(|n| n != &node_id);
|
||||||
|
|
||||||
|
self.observed.locations.remove(&node_id);
|
||||||
|
|
||||||
|
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,20 +1,17 @@
|
|||||||
use crate::{background_process, local_env::LocalEnv};
|
use crate::{background_process, local_env::LocalEnv};
|
||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use diesel::{
|
|
||||||
backend::Backend,
|
|
||||||
query_builder::{AstPass, QueryFragment, QueryId},
|
|
||||||
Connection, PgConnection, QueryResult, RunQueryDsl,
|
|
||||||
};
|
|
||||||
use diesel_migrations::{HarnessWithOutput, MigrationHarness};
|
|
||||||
use hyper::Method;
|
use hyper::Method;
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
|
models::{
|
||||||
|
ShardParameters, TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||||
|
TimelineCreateRequest, TimelineInfo,
|
||||||
|
},
|
||||||
shard::TenantShardId,
|
shard::TenantShardId,
|
||||||
};
|
};
|
||||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||||
use postgres_backend::AuthType;
|
use postgres_backend::AuthType;
|
||||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||||
use std::{env, str::FromStr};
|
use std::str::FromStr;
|
||||||
use tokio::process::Command;
|
use tokio::process::Command;
|
||||||
use tracing::instrument;
|
use tracing::instrument;
|
||||||
use url::Url;
|
use url::Url;
|
||||||
@@ -270,37 +267,6 @@ impl AttachmentService {
|
|||||||
.expect("non-Unicode path")
|
.expect("non-Unicode path")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// In order to access database migrations, we need to find the Neon source tree
|
|
||||||
async fn find_source_root(&self) -> anyhow::Result<Utf8PathBuf> {
|
|
||||||
// We assume that either prd or our binary is in the source tree. The former is usually
|
|
||||||
// true for automated test runners, the latter is usually true for developer workstations. Often
|
|
||||||
// both are true, which is fine.
|
|
||||||
let candidate_start_points = [
|
|
||||||
// Current working directory
|
|
||||||
Utf8PathBuf::from_path_buf(std::env::current_dir()?).unwrap(),
|
|
||||||
// Directory containing the binary we're running inside
|
|
||||||
Utf8PathBuf::from_path_buf(env::current_exe()?.parent().unwrap().to_owned()).unwrap(),
|
|
||||||
];
|
|
||||||
|
|
||||||
// For each candidate start point, search through ancestors looking for a neon.git source tree root
|
|
||||||
for start_point in &candidate_start_points {
|
|
||||||
// Start from the build dir: assumes we are running out of a built neon source tree
|
|
||||||
for path in start_point.ancestors() {
|
|
||||||
// A crude approximation: the root of the source tree is whatever contains a "control_plane"
|
|
||||||
// subdirectory.
|
|
||||||
let control_plane = path.join("control_plane");
|
|
||||||
if tokio::fs::try_exists(&control_plane).await? {
|
|
||||||
return Ok(path.to_owned());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fall-through
|
|
||||||
Err(anyhow::anyhow!(
|
|
||||||
"Could not find control_plane src dir, after searching ancestors of {candidate_start_points:?}"
|
|
||||||
))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||||
///
|
///
|
||||||
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
|
/// This usually uses ATTACHMENT_SERVICE_POSTGRES_VERSION of postgres, but will fall back
|
||||||
@@ -340,69 +306,32 @@ impl AttachmentService {
|
|||||||
///
|
///
|
||||||
/// Returns the database url
|
/// Returns the database url
|
||||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||||
let database_url = format!(
|
const DB_NAME: &str = "attachment_service";
|
||||||
"postgresql://localhost:{}/attachment_service",
|
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||||
self.postgres_port
|
|
||||||
);
|
|
||||||
println!("Running attachment service database setup...");
|
|
||||||
fn change_database_of_url(database_url: &str, default_database: &str) -> (String, String) {
|
|
||||||
let base = ::url::Url::parse(database_url).unwrap();
|
|
||||||
let database = base.path_segments().unwrap().last().unwrap().to_owned();
|
|
||||||
let mut new_url = base.join(default_database).unwrap();
|
|
||||||
new_url.set_query(base.query());
|
|
||||||
(database, new_url.into())
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||||
pub struct CreateDatabaseStatement {
|
let createdb_path = pg_bin_dir.join("createdb");
|
||||||
db_name: String,
|
let output = Command::new(&createdb_path)
|
||||||
}
|
.args([
|
||||||
|
"-h",
|
||||||
|
"localhost",
|
||||||
|
"-p",
|
||||||
|
&format!("{}", self.postgres_port),
|
||||||
|
&DB_NAME,
|
||||||
|
])
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
.expect("Failed to spawn createdb");
|
||||||
|
|
||||||
impl CreateDatabaseStatement {
|
if !output.status.success() {
|
||||||
pub fn new(db_name: &str) -> Self {
|
let stderr = String::from_utf8(output.stderr).expect("Non-UTF8 output from createdb");
|
||||||
CreateDatabaseStatement {
|
if stderr.contains("already exists") {
|
||||||
db_name: db_name.to_owned(),
|
tracing::info!("Database {DB_NAME} already exists");
|
||||||
}
|
} else {
|
||||||
|
anyhow::bail!("createdb failed with status {}: {stderr}", output.status);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<DB: Backend> QueryFragment<DB> for CreateDatabaseStatement {
|
|
||||||
fn walk_ast<'b>(&'b self, mut out: AstPass<'_, 'b, DB>) -> QueryResult<()> {
|
|
||||||
out.push_sql("CREATE DATABASE ");
|
|
||||||
out.push_identifier(&self.db_name)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Conn> RunQueryDsl<Conn> for CreateDatabaseStatement {}
|
|
||||||
|
|
||||||
impl QueryId for CreateDatabaseStatement {
|
|
||||||
type QueryId = ();
|
|
||||||
|
|
||||||
const HAS_STATIC_QUERY_ID: bool = false;
|
|
||||||
}
|
|
||||||
if PgConnection::establish(&database_url).is_err() {
|
|
||||||
let (database, postgres_url) = change_database_of_url(&database_url, "postgres");
|
|
||||||
println!("Creating database: {database}");
|
|
||||||
let mut conn = PgConnection::establish(&postgres_url)?;
|
|
||||||
CreateDatabaseStatement::new(&database).execute(&mut conn)?;
|
|
||||||
}
|
|
||||||
let mut conn = PgConnection::establish(&database_url)?;
|
|
||||||
|
|
||||||
let migrations_dir = self
|
|
||||||
.find_source_root()
|
|
||||||
.await?
|
|
||||||
.join("control_plane/attachment_service/migrations");
|
|
||||||
|
|
||||||
let migrations = diesel_migrations::FileBasedMigrations::from_path(migrations_dir)?;
|
|
||||||
println!("Running migrations in {}", migrations.path().display());
|
|
||||||
HarnessWithOutput::write_to_stdout(&mut conn)
|
|
||||||
.run_pending_migrations(migrations)
|
|
||||||
.map(|_| ())
|
|
||||||
.map_err(|e| anyhow::anyhow!(e))?;
|
|
||||||
|
|
||||||
println!("Migrations complete");
|
|
||||||
|
|
||||||
Ok(database_url)
|
Ok(database_url)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -648,7 +577,7 @@ impl AttachmentService {
|
|||||||
) -> anyhow::Result<TenantShardMigrateResponse> {
|
) -> anyhow::Result<TenantShardMigrateResponse> {
|
||||||
self.dispatch(
|
self.dispatch(
|
||||||
Method::PUT,
|
Method::PUT,
|
||||||
format!("tenant/{tenant_shard_id}/migrate"),
|
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||||
Some(TenantShardMigrateRequest {
|
Some(TenantShardMigrateRequest {
|
||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
node_id,
|
node_id,
|
||||||
@@ -657,6 +586,20 @@ impl AttachmentService {
|
|||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
|
||||||
|
pub async fn tenant_split(
|
||||||
|
&self,
|
||||||
|
tenant_id: TenantId,
|
||||||
|
new_shard_count: u8,
|
||||||
|
) -> anyhow::Result<TenantShardSplitResponse> {
|
||||||
|
self.dispatch(
|
||||||
|
Method::PUT,
|
||||||
|
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||||
|
Some(TenantShardSplitRequest { new_shard_count }),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(node_id=%req.node_id))]
|
#[instrument(skip_all, fields(node_id=%req.node_id))]
|
||||||
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
|
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
|
||||||
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
|
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
|
||||||
|
|||||||
@@ -72,7 +72,6 @@ where
|
|||||||
let log_path = datadir.join(format!("{process_name}.log"));
|
let log_path = datadir.join(format!("{process_name}.log"));
|
||||||
let process_log_file = fs::OpenOptions::new()
|
let process_log_file = fs::OpenOptions::new()
|
||||||
.create(true)
|
.create(true)
|
||||||
.write(true)
|
|
||||||
.append(true)
|
.append(true)
|
||||||
.open(&log_path)
|
.open(&log_path)
|
||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
|
|||||||
@@ -575,6 +575,26 @@ async fn handle_tenant(
|
|||||||
println!("{tenant_table}");
|
println!("{tenant_table}");
|
||||||
println!("{shard_table}");
|
println!("{shard_table}");
|
||||||
}
|
}
|
||||||
|
Some(("shard-split", matches)) => {
|
||||||
|
let tenant_id = get_tenant_id(matches, env)?;
|
||||||
|
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
|
||||||
|
|
||||||
|
let attachment_service = AttachmentService::from_env(env);
|
||||||
|
let result = attachment_service
|
||||||
|
.tenant_split(tenant_id, shard_count)
|
||||||
|
.await?;
|
||||||
|
println!(
|
||||||
|
"Split tenant {} into shards {}",
|
||||||
|
tenant_id,
|
||||||
|
result
|
||||||
|
.new_shards
|
||||||
|
.iter()
|
||||||
|
.map(|s| format!("{:?}", s))
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.join(",")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
|
||||||
None => bail!("no tenant subcommand provided"),
|
None => bail!("no tenant subcommand provided"),
|
||||||
}
|
}
|
||||||
@@ -994,12 +1014,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
|
|||||||
.get_one::<String>("endpoint_id")
|
.get_one::<String>("endpoint_id")
|
||||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
|
.ok_or_else(|| anyhow!("No endpoint ID was provided to stop"))?;
|
||||||
let destroy = sub_args.get_flag("destroy");
|
let destroy = sub_args.get_flag("destroy");
|
||||||
|
let mode = sub_args.get_one::<String>("mode").expect("has a default");
|
||||||
|
|
||||||
let endpoint = cplane
|
let endpoint = cplane
|
||||||
.endpoints
|
.endpoints
|
||||||
.get(endpoint_id.as_str())
|
.get(endpoint_id.as_str())
|
||||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||||
endpoint.stop(destroy)?;
|
endpoint.stop(mode, destroy)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
_ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
|
_ => bail!("Unexpected endpoint subcommand '{sub_name}'"),
|
||||||
@@ -1283,7 +1304,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
|||||||
match ComputeControlPlane::load(env.clone()) {
|
match ComputeControlPlane::load(env.clone()) {
|
||||||
Ok(cplane) => {
|
Ok(cplane) => {
|
||||||
for (_k, node) in cplane.endpoints {
|
for (_k, node) in cplane.endpoints {
|
||||||
if let Err(e) = node.stop(false) {
|
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
|
||||||
eprintln!("postgres stop failed: {e:#}");
|
eprintln!("postgres stop failed: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1524,6 +1545,11 @@ fn cli() -> Command {
|
|||||||
.subcommand(Command::new("status")
|
.subcommand(Command::new("status")
|
||||||
.about("Human readable summary of the tenant's shards and attachment locations")
|
.about("Human readable summary of the tenant's shards and attachment locations")
|
||||||
.arg(tenant_id_arg.clone()))
|
.arg(tenant_id_arg.clone()))
|
||||||
|
.subcommand(Command::new("shard-split")
|
||||||
|
.about("Increase the number of shards in the tenant")
|
||||||
|
.arg(tenant_id_arg.clone())
|
||||||
|
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
|
||||||
|
)
|
||||||
)
|
)
|
||||||
.subcommand(
|
.subcommand(
|
||||||
Command::new("pageserver")
|
Command::new("pageserver")
|
||||||
@@ -1627,7 +1653,16 @@ fn cli() -> Command {
|
|||||||
.long("destroy")
|
.long("destroy")
|
||||||
.action(ArgAction::SetTrue)
|
.action(ArgAction::SetTrue)
|
||||||
.required(false)
|
.required(false)
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::new("mode")
|
||||||
|
.help("Postgres shutdown mode, passed to \"pg_ctl -m <mode>\"")
|
||||||
|
.long("mode")
|
||||||
|
.action(ArgAction::Set)
|
||||||
|
.required(false)
|
||||||
|
.value_parser(["smart", "fast", "immediate"])
|
||||||
|
.default_value("fast")
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -761,22 +761,8 @@ impl Endpoint {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
pub fn stop(&self, mode: &str, destroy: bool) -> Result<()> {
|
||||||
// If we are going to destroy data directory,
|
self.pg_ctl(&["-m", mode, "stop"], &None)?;
|
||||||
// use immediate shutdown mode, otherwise,
|
|
||||||
// shutdown gracefully to leave the data directory sane.
|
|
||||||
//
|
|
||||||
// Postgres is always started from scratch, so stop
|
|
||||||
// without destroy only used for testing and debugging.
|
|
||||||
//
|
|
||||||
self.pg_ctl(
|
|
||||||
if destroy {
|
|
||||||
&["-m", "immediate", "stop"]
|
|
||||||
} else {
|
|
||||||
&["stop"]
|
|
||||||
},
|
|
||||||
&None,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// Also wait for the compute_ctl process to die. It might have some
|
// Also wait for the compute_ctl process to die. It might have some
|
||||||
// cleanup work to do after postgres stops, like syncing safekeepers,
|
// cleanup work to do after postgres stops, like syncing safekeepers,
|
||||||
|
|||||||
@@ -90,10 +90,8 @@ pub enum ComputeFeature {
|
|||||||
/// track short-lived connections as user activity.
|
/// track short-lived connections as user activity.
|
||||||
ActivityMonitorExperimental,
|
ActivityMonitorExperimental,
|
||||||
|
|
||||||
// Use latest version of remote extensions
|
/// Pre-install and initialize anon extension for every database in the cluster
|
||||||
// This is needed to allow us to test new versions of extensions before
|
AnonExtension,
|
||||||
// they are merged into the main branch.
|
|
||||||
RemoteExtensionsUseLatest,
|
|
||||||
|
|
||||||
/// This is a special feature flag that is used to represent unknown feature flags.
|
/// This is a special feature flag that is used to represent unknown feature flags.
|
||||||
/// Basically all unknown to enum flags are represented as this one. See unit test
|
/// Basically all unknown to enum flags are represented as this one. See unit test
|
||||||
@@ -157,12 +155,8 @@ impl RemoteExtSpec {
|
|||||||
//
|
//
|
||||||
// Keep it in sync with path generation in
|
// Keep it in sync with path generation in
|
||||||
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
// https://github.com/neondatabase/build-custom-extensions/tree/main
|
||||||
//
|
|
||||||
// if ComputeFeature::RemoteExtensionsUseLatest is enabled
|
|
||||||
// use "latest" as the build_tag
|
|
||||||
let archive_path_str =
|
let archive_path_str =
|
||||||
format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
|
format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
real_ext_name.to_string(),
|
real_ext_name.to_string(),
|
||||||
RemotePath::from_string(&archive_path_str)?,
|
RemotePath::from_string(&archive_path_str)?,
|
||||||
|
|||||||
18
libs/desim/Cargo.toml
Normal file
18
libs/desim/Cargo.toml
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
[package]
|
||||||
|
name = "desim"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition.workspace = true
|
||||||
|
license.workspace = true
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
anyhow.workspace = true
|
||||||
|
rand.workspace = true
|
||||||
|
tracing.workspace = true
|
||||||
|
bytes.workspace = true
|
||||||
|
utils.workspace = true
|
||||||
|
parking_lot.workspace = true
|
||||||
|
hex.workspace = true
|
||||||
|
scopeguard.workspace = true
|
||||||
|
smallvec = { workspace = true, features = ["write"] }
|
||||||
|
|
||||||
|
workspace_hack.workspace = true
|
||||||
7
libs/desim/README.md
Normal file
7
libs/desim/README.md
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# Discrete Event SIMulator
|
||||||
|
|
||||||
|
This is a library for running simulations of distributed systems. The main idea is borrowed from [FoundationDB](https://www.youtube.com/watch?v=4fFDFbi3toc).
|
||||||
|
|
||||||
|
Each node runs as a separate thread. This library was not optimized for speed yet, but it's already much faster than running usual intergration tests in real time, because it uses virtual simulation time and can fast-forward time to skip intervals where all nodes are doing nothing but sleeping or waiting for something.
|
||||||
|
|
||||||
|
The original purpose for this library is to test walproposer and safekeeper implementation working together, in a scenarios close to the real world environment. This simulator is determenistic and can inject failures in networking without waiting minutes of wall-time to trigger timeout, which makes it easier to find bugs in our consensus implementation compared to using integration tests.
|
||||||
108
libs/desim/src/chan.rs
Normal file
108
libs/desim/src/chan.rs
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
use std::{collections::VecDeque, sync::Arc};
|
||||||
|
|
||||||
|
use parking_lot::{Mutex, MutexGuard};
|
||||||
|
|
||||||
|
use crate::executor::{self, PollSome, Waker};
|
||||||
|
|
||||||
|
/// FIFO channel with blocking send and receive. Can be cloned and shared between threads.
|
||||||
|
/// Blocking functions should be used only from threads that are managed by the executor.
|
||||||
|
pub struct Chan<T> {
|
||||||
|
shared: Arc<State<T>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> Clone for Chan<T> {
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
Chan {
|
||||||
|
shared: self.shared.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> Default for Chan<T> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> Chan<T> {
|
||||||
|
pub fn new() -> Chan<T> {
|
||||||
|
Chan {
|
||||||
|
shared: Arc::new(State {
|
||||||
|
queue: Mutex::new(VecDeque::new()),
|
||||||
|
waker: Waker::new(),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a message from the front of the queue, block if the queue is empty.
|
||||||
|
/// If not called from the executor thread, it can block forever.
|
||||||
|
pub fn recv(&self) -> T {
|
||||||
|
self.shared.recv()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Panic if the queue is empty.
|
||||||
|
pub fn must_recv(&self) -> T {
|
||||||
|
self.shared
|
||||||
|
.try_recv()
|
||||||
|
.expect("message should've been ready")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a message from the front of the queue, return None if the queue is empty.
|
||||||
|
/// Never blocks.
|
||||||
|
pub fn try_recv(&self) -> Option<T> {
|
||||||
|
self.shared.try_recv()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send a message to the back of the queue.
|
||||||
|
pub fn send(&self, t: T) {
|
||||||
|
self.shared.send(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct State<T> {
|
||||||
|
queue: Mutex<VecDeque<T>>,
|
||||||
|
waker: Waker,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> State<T> {
    // Push a message and wake every receiver currently subscribed to the waker.
    fn send(&self, t: T) {
        self.queue.lock().push_back(t);
        self.waker.wake_all();
    }

    // Pop the front message, if any. Never blocks.
    fn try_recv(&self) -> Option<T> {
        let mut q = self.queue.lock();
        q.pop_front()
    }

    // Blocking receive. Correctness depends on the subscribe-then-check-then-sleep
    // order below: we register with the waker *before* re-checking the queue, so a
    // message sent between the check and the sleep still produces a wakeup.
    fn recv(&self) -> T {
        // interrupt the receiver to prevent consuming everything at once
        executor::yield_me(0);

        // Fast path: a message is already queued.
        let mut queue = self.queue.lock();
        if let Some(t) = queue.pop_front() {
            return t;
        }
        loop {
            // Subscribe first, then check again while still holding the lock.
            self.waker.wake_me_later();
            if let Some(t) = queue.pop_front() {
                return t;
            }
            // Release the queue lock while sleeping so senders can make progress;
            // the guard is reacquired when the closure returns.
            MutexGuard::unlocked(&mut queue, || {
                executor::yield_me(-1);
            });
        }
    }
}
|
||||||
|
|
||||||
|
// Lets a `Chan` participate in `epoll_chans`-style multiplexed waiting.
impl<T> PollSome for Chan<T> {
    /// Schedules a wakeup for the current thread.
    fn wake_me(&self) {
        self.shared.waker.wake_me_later();
    }

    /// Checks if chan has any pending messages.
    fn has_some(&self) -> bool {
        !self.shared.queue.lock().is_empty()
    }
}
|
||||||
483
libs/desim/src/executor.rs
Normal file
483
libs/desim/src/executor.rs
Normal file
@@ -0,0 +1,483 @@
|
|||||||
|
use std::{
|
||||||
|
panic::AssertUnwindSafe,
|
||||||
|
sync::{
|
||||||
|
atomic::{AtomicBool, AtomicU32, AtomicU8, Ordering},
|
||||||
|
mpsc, Arc, OnceLock,
|
||||||
|
},
|
||||||
|
thread::JoinHandle,
|
||||||
|
};
|
||||||
|
|
||||||
|
use tracing::{debug, error, trace};
|
||||||
|
|
||||||
|
use crate::time::Timing;
|
||||||
|
|
||||||
|
/// Stores status of the running threads. Threads are registered in the runtime upon creation
/// and deregistered upon termination.
pub struct Runtime {
    // stores handles to all threads that are currently running
    threads: Vec<ThreadHandle>,
    // stores current time and pending wakeups
    clock: Arc<Timing>,
    // thread counter; source of monotonically increasing thread ids
    thread_counter: AtomicU32,
    // Thread step counter -- how many times all threads has been actually
    // stepped (note that all world/time/executor/thread have slightly different
    // meaning of steps). For observability.
    pub step_counter: u64,
}
|
||||||
|
|
||||||
|
impl Runtime {
    /// Init new runtime, no running threads.
    pub fn new(clock: Arc<Timing>) -> Self {
        Self {
            threads: Vec::new(),
            clock,
            thread_counter: AtomicU32::new(0),
            step_counter: 0,
        }
    }

    /// Spawn a new thread and register it in the runtime.
    ///
    /// Blocks until the new OS thread has initialized its `ThreadContext` and
    /// parked itself in `Status::Sleep`, so on return the thread is fully
    /// registered and controllable via `step()`.
    pub fn spawn<F>(&mut self, f: F) -> ExternalHandle
    where
        F: FnOnce() + Send + 'static,
    {
        // Channel used once, to hand the thread's context back to the spawner.
        let (tx, rx) = mpsc::channel();

        let clock = self.clock.clone();
        let tid = self.thread_counter.fetch_add(1, Ordering::SeqCst);
        debug!("spawning thread-{}", tid);

        let join = std::thread::spawn(move || {
            let _guard = tracing::info_span!("", tid).entered();

            // Catch panics so a crashing simulated thread can record its result
            // instead of tearing down the whole process (unless panics are
            // disallowed for this thread, see below).
            let res = std::panic::catch_unwind(AssertUnwindSafe(|| {
                with_thread_context(|ctx| {
                    assert!(ctx.clock.set(clock).is_ok());
                    ctx.id.store(tid, Ordering::SeqCst);
                    tx.send(ctx.clone()).expect("failed to send thread context");
                    // suspend thread to put it to `threads` in sleeping state
                    ctx.yield_me(0);
                });

                // start user-provided function
                f();
            }));
            debug!("thread finished");

            if let Err(e) = res {
                with_thread_context(|ctx| {
                    // Unexpected panic (not triggered by exit()/crash_stop()):
                    // treat as a bug in the simulation and abort the process.
                    if !ctx.allow_panic.load(std::sync::atomic::Ordering::SeqCst) {
                        error!("thread panicked, terminating the process: {:?}", e);
                        std::process::exit(1);
                    }

                    debug!("thread panicked: {:?}", e);
                    // -1 means "no result recorded yet"; keep an explicitly set
                    // exit code (e.g. from exit()) if there is one.
                    let mut result = ctx.result.lock();
                    if result.0 == -1 {
                        *result = (256, format!("thread panicked: {:?}", e));
                    }
                });
            }

            with_thread_context(|ctx| {
                ctx.finish_me();
            });
        });

        let ctx = rx.recv().expect("failed to receive thread context");
        let handle = ThreadHandle::new(ctx.clone(), join);

        self.threads.push(handle);

        ExternalHandle { ctx }
    }

    /// Returns true if there are any unfinished activity, such as running thread or pending events.
    /// Otherwise returns false, which means all threads are blocked forever.
    pub fn step(&mut self) -> bool {
        trace!("runtime step");

        // have we run any thread?
        let mut ran = false;

        self.threads.retain(|thread: &ThreadHandle| {
            // Consume the wakeup flag; only threads with a pending wakeup run.
            let res = thread.ctx.wakeup.compare_exchange(
                PENDING_WAKEUP,
                NO_WAKEUP,
                Ordering::SeqCst,
                Ordering::SeqCst,
            );
            if res.is_err() {
                // thread has no pending wakeups, leaving as is
                return true;
            }
            ran = true;

            trace!("entering thread-{}", thread.ctx.tid());
            let status = thread.step();
            self.step_counter += 1;
            trace!(
                "out of thread-{} with status {:?}",
                thread.ctx.tid(),
                status
            );

            if status == Status::Sleep {
                true
            } else {
                trace!("thread has finished");
                // removing the thread from the list
                false
            }
        });

        if !ran {
            // Nothing was runnable: advance virtual time to the next scheduled
            // wakeup. If there is none, the simulation is fully blocked.
            trace!("no threads were run, stepping clock");
            if let Some(ctx_to_wake) = self.clock.step() {
                trace!("waking up thread-{}", ctx_to_wake.tid());
                ctx_to_wake.inc_wake();
            } else {
                return false;
            }
        }

        true
    }

    /// Kill all threads. This is done by setting a flag in each thread context and waking it up.
    pub fn crash_all_threads(&mut self) {
        for thread in self.threads.iter() {
            thread.ctx.crash_stop();
        }

        // all threads should be finished after a few steps
        while !self.threads.is_empty() {
            self.step();
        }
    }
}
|
||||||
|
|
||||||
|
// Tearing down the runtime forcibly terminates every simulated thread, so no
// OS threads outlive the `Runtime` that drives them.
impl Drop for Runtime {
    fn drop(&mut self) {
        debug!("dropping the runtime");
        self.crash_all_threads();
    }
}
|
||||||
|
|
||||||
|
/// Cheap, cloneable handle to a spawned thread, usable from outside the
/// executor (e.g. by the test harness) to observe or crash the thread.
#[derive(Clone)]
pub struct ExternalHandle {
    ctx: Arc<ThreadContext>,
}
|
||||||
|
|
||||||
|
impl ExternalHandle {
    /// Returns true if thread has finished execution.
    pub fn is_finished(&self) -> bool {
        let status = self.ctx.mutex.lock();
        *status == Status::Finished
    }

    /// Returns exitcode and message, which is available after thread has finished execution.
    /// Before the thread records anything, this is the initial `(-1, "")` slot value.
    pub fn result(&self) -> (i32, String) {
        let result = self.ctx.result.lock();
        result.clone()
    }

    /// Returns thread id.
    pub fn id(&self) -> u32 {
        self.ctx.id.load(Ordering::SeqCst)
    }

    /// Sets a flag to crash thread on the next wakeup.
    pub fn crash_stop(&self) {
        self.ctx.crash_stop();
    }
}
|
||||||
|
|
||||||
|
// Executor-internal handle pairing a thread's context with its OS join handle.
// The join handle is kept only to tie the OS thread's lifetime to the runtime.
struct ThreadHandle {
    ctx: Arc<ThreadContext>,
    _join: JoinHandle<()>,
}
|
||||||
|
|
||||||
|
impl ThreadHandle {
    /// Create a new [`ThreadHandle`] and wait until thread will enter [`Status::Sleep`] state.
    fn new(ctx: Arc<ThreadContext>, join: JoinHandle<()>) -> Self {
        let mut status = ctx.mutex.lock();
        // wait until thread will go into the first yield
        while *status != Status::Sleep {
            ctx.condvar.wait(&mut status);
        }
        drop(status);

        Self { ctx, _join: join }
    }

    /// Allows thread to execute one step of its execution.
    /// Returns [`Status`] of the thread after the step.
    ///
    /// Hand-off protocol with `ThreadContext::yield_me`/`finish_me`: flip the
    /// status to Running, notify, then block until the thread flips it back to
    /// Sleep (or Finished). Exactly one of executor/thread runs at a time.
    fn step(&self) -> Status {
        let mut status = self.ctx.mutex.lock();
        assert!(matches!(*status, Status::Sleep));

        *status = Status::Running;
        self.ctx.condvar.notify_all();

        while *status == Status::Running {
            self.ctx.condvar.wait(&mut status);
        }

        *status
    }
}
|
||||||
|
|
||||||
|
/// Lifecycle state of a simulated thread, shared between the executor and the
/// thread itself via `ThreadContext::mutex`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum Status {
    /// Thread is running.
    Running,
    /// Waiting for event to complete, will be resumed by the executor step, once wakeup flag is set.
    Sleep,
    /// Thread finished execution.
    Finished,
}
|
||||||
|
|
||||||
|
// Values of `ThreadContext::wakeup`: the executor steps a thread only after
// swapping PENDING_WAKEUP back to NO_WAKEUP (see `Runtime::step`).
const NO_WAKEUP: u8 = 0;
const PENDING_WAKEUP: u8 = 1;
|
||||||
|
|
||||||
|
/// Per-thread state shared between a simulated thread and the executor.
pub struct ThreadContext {
    // runtime-assigned thread id, set once during spawn
    id: AtomicU32,
    // used to block thread until it is woken up
    mutex: parking_lot::Mutex<Status>,
    condvar: parking_lot::Condvar,
    // used as a flag to indicate runtime that thread is ready to be woken up
    wakeup: AtomicU8,
    // simulation clock, injected once by Runtime::spawn
    clock: OnceLock<Arc<Timing>>,
    // execution result, set by exit() call; (-1, "") means "not set yet"
    result: parking_lot::Mutex<(i32, String)>,
    // determines if process should be killed on receiving panic
    allow_panic: AtomicBool,
    // acts as a signal that thread should crash itself on the next wakeup
    crash_request: AtomicBool,
}
|
||||||
|
|
||||||
|
impl ThreadContext {
    // Fresh context: status starts as Running because the OS thread itself is
    // executing until its first yield; id/clock are filled in by Runtime::spawn.
    pub(crate) fn new() -> Self {
        Self {
            id: AtomicU32::new(0),
            mutex: parking_lot::Mutex::new(Status::Running),
            condvar: parking_lot::Condvar::new(),
            wakeup: AtomicU8::new(NO_WAKEUP),
            clock: OnceLock::new(),
            result: parking_lot::Mutex::new((-1, String::new())),
            allow_panic: AtomicBool::new(false),
            crash_request: AtomicBool::new(false),
        }
    }
}
|
||||||
|
|
||||||
|
// Functions for executor to control thread execution.
impl ThreadContext {
    /// Set atomic flag to indicate that thread is ready to be woken up.
    fn inc_wake(&self) {
        self.wakeup.store(PENDING_WAKEUP, Ordering::SeqCst);
    }

    /// Internal function used for event queues.
    /// Arranges a wakeup for this thread `after_ms` of virtual time later.
    pub(crate) fn schedule_wakeup(self: &Arc<Self>, after_ms: u64) {
        self.clock
            .get()
            .unwrap()
            .schedule_wakeup(after_ms, self.clone());
    }

    /// Runtime-assigned thread id.
    fn tid(&self) -> u32 {
        self.id.load(Ordering::SeqCst)
    }

    /// Request the thread to crash itself: it will panic the next time it is
    /// stepped (see the crash_request check at the end of `yield_me`).
    fn crash_stop(&self) {
        let status = self.mutex.lock();
        if *status == Status::Finished {
            debug!(
                "trying to crash thread-{}, which is already finished",
                self.tid()
            );
            return;
        }
        // Only a sleeping thread can be crashed; Running would mean the
        // executor and the thread are active at the same time.
        assert!(matches!(*status, Status::Sleep));
        drop(status);

        // allow_panic makes the induced panic non-fatal for the process
        self.allow_panic.store(true, Ordering::SeqCst);
        self.crash_request.store(true, Ordering::SeqCst);
        // set a wakeup
        self.inc_wake();
        // it will panic on the next wakeup
    }
}
|
||||||
|
|
||||||
|
// Internal functions.
impl ThreadContext {
    /// Blocks thread until it's woken up by the executor. If `after_ms` is 0, is will be
    /// woken on the next step. If `after_ms` > 0, wakeup is scheduled after that time.
    /// Otherwise wakeup is not scheduled inside `yield_me`, and should be arranged before
    /// calling this function.
    fn yield_me(self: &Arc<Self>, after_ms: i64) {
        let mut status = self.mutex.lock();
        assert!(matches!(*status, Status::Running));

        match after_ms.cmp(&0) {
            std::cmp::Ordering::Less => {
                // block until something wakes us up
            }
            std::cmp::Ordering::Equal => {
                // tell executor that we are ready to be woken up
                self.inc_wake();
            }
            std::cmp::Ordering::Greater => {
                // schedule wakeup
                self.clock
                    .get()
                    .unwrap()
                    .schedule_wakeup(after_ms as u64, self.clone());
            }
        }

        // Hand control back to the executor (counterpart of ThreadHandle::step).
        *status = Status::Sleep;
        self.condvar.notify_all();

        // wait until executor wakes us up
        while *status != Status::Running {
            self.condvar.wait(&mut status);
        }

        // crash_stop() may have been requested while we slept; honor it here.
        if self.crash_request.load(Ordering::SeqCst) {
            panic!("crashed by request");
        }
    }

    /// Called only once, exactly before thread finishes execution.
    fn finish_me(&self) {
        let mut status = self.mutex.lock();
        assert!(matches!(*status, Status::Running));

        *status = Status::Finished;
        {
            // Record a default success result unless exit()/panic already set one.
            let mut result = self.result.lock();
            if result.0 == -1 {
                *result = (0, "finished normally".to_owned());
            }
        }
        self.condvar.notify_all();
    }
}
|
||||||
|
|
||||||
|
/// Invokes the given closure with a reference to the current thread [`ThreadContext`].
/// The context is created lazily on first use, one per OS thread.
#[inline(always)]
fn with_thread_context<T>(f: impl FnOnce(&Arc<ThreadContext>) -> T) -> T {
    thread_local!(static THREAD_DATA: Arc<ThreadContext> = Arc::new(ThreadContext::new()));
    THREAD_DATA.with(f)
}
|
||||||
|
|
||||||
|
/// Waker is used to wake up threads that are blocked on condition.
/// It keeps track of contexts [`Arc<ThreadContext>`] and can increment the counter
/// of several contexts to send a notification.
pub struct Waker {
    // contexts that are waiting for a notification; SmallVec keeps the common
    // few-subscriber case off the heap
    contexts: parking_lot::Mutex<smallvec::SmallVec<[Arc<ThreadContext>; 8]>>,
}
|
||||||
|
|
||||||
|
// `Default` is the empty waker, same as `new()`.
impl Default for Waker {
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
impl Waker {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
contexts: parking_lot::Mutex::new(smallvec::SmallVec::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Subscribe current thread to receive a wake notification later.
|
||||||
|
pub fn wake_me_later(&self) {
|
||||||
|
with_thread_context(|ctx| {
|
||||||
|
self.contexts.lock().push(ctx.clone());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wake up all threads that are waiting for a notification and clear the list.
|
||||||
|
pub fn wake_all(&self) {
|
||||||
|
let mut v = self.contexts.lock();
|
||||||
|
for ctx in v.iter() {
|
||||||
|
ctx.inc_wake();
|
||||||
|
}
|
||||||
|
v.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// See [`ThreadContext::yield_me`].
/// Convenience wrapper operating on the calling thread's own context.
pub fn yield_me(after_ms: i64) {
    with_thread_context(|ctx| ctx.yield_me(after_ms))
}
|
||||||
|
|
||||||
|
/// Get current time.
/// Panics if called from a thread that was not spawned by the [`Runtime`]
/// (its clock is only set during `Runtime::spawn`).
pub fn now() -> u64 {
    with_thread_context(|ctx| ctx.clock.get().unwrap().now())
}
|
||||||
|
|
||||||
|
/// Terminate the current simulated thread with the given exit code and message.
/// Implemented as a deliberate panic: `allow_panic` is set first so the
/// spawn-side `catch_unwind` handler treats it as a normal termination.
/// Never returns.
pub fn exit(code: i32, msg: String) {
    with_thread_context(|ctx| {
        ctx.allow_panic.store(true, Ordering::SeqCst);
        let mut result = ctx.result.lock();
        *result = (code, msg);
        panic!("exit");
    });
}
|
||||||
|
|
||||||
|
/// Clone the calling thread's [`ThreadContext`] handle (crate-internal).
pub(crate) fn get_thread_ctx() -> Arc<ThreadContext> {
    with_thread_context(|ctx| ctx.clone())
}
|
||||||
|
|
||||||
|
/// Trait for polling channels until they have something.
/// Implementors must guarantee that a message arriving after `wake_me` was
/// called (and before the thread sleeps) still triggers a wakeup.
pub trait PollSome {
    /// Schedule wakeup for message arrival.
    fn wake_me(&self);

    /// Check if channel has a ready message.
    fn has_some(&self) -> bool;
}
|
||||||
|
|
||||||
|
/// Blocks current thread until one of the channels has a ready message. Returns
|
||||||
|
/// index of the channel that has a message. If timeout is reached, returns None.
|
||||||
|
///
|
||||||
|
/// Negative timeout means block forever. Zero timeout means check channels and return
|
||||||
|
/// immediately. Positive timeout means block until timeout is reached.
|
||||||
|
pub fn epoll_chans(chans: &[Box<dyn PollSome>], timeout: i64) -> Option<usize> {
|
||||||
|
let deadline = if timeout < 0 {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
now() + timeout as u64
|
||||||
|
};
|
||||||
|
|
||||||
|
loop {
|
||||||
|
for chan in chans {
|
||||||
|
chan.wake_me()
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i, chan) in chans.iter().enumerate() {
|
||||||
|
if chan.has_some() {
|
||||||
|
return Some(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if timeout < 0 {
|
||||||
|
// block until wakeup
|
||||||
|
yield_me(-1);
|
||||||
|
} else {
|
||||||
|
let current_time = now();
|
||||||
|
if current_time >= deadline {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
yield_me((deadline - current_time) as i64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
8
libs/desim/src/lib.rs
Normal file
8
libs/desim/src/lib.rs
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
pub mod chan;
|
||||||
|
pub mod executor;
|
||||||
|
pub mod network;
|
||||||
|
pub mod node_os;
|
||||||
|
pub mod options;
|
||||||
|
pub mod proto;
|
||||||
|
pub mod time;
|
||||||
|
pub mod world;
|
||||||
451
libs/desim/src/network.rs
Normal file
451
libs/desim/src/network.rs
Normal file
@@ -0,0 +1,451 @@
|
|||||||
|
use std::{
|
||||||
|
cmp::Ordering,
|
||||||
|
collections::{BinaryHeap, VecDeque},
|
||||||
|
fmt::{self, Debug},
|
||||||
|
ops::DerefMut,
|
||||||
|
sync::{mpsc, Arc},
|
||||||
|
};
|
||||||
|
|
||||||
|
use parking_lot::{
|
||||||
|
lock_api::{MappedMutexGuard, MutexGuard},
|
||||||
|
Mutex, RawMutex,
|
||||||
|
};
|
||||||
|
use rand::rngs::StdRng;
|
||||||
|
use tracing::debug;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
executor::{self, ThreadContext},
|
||||||
|
options::NetworkOptions,
|
||||||
|
proto::NetEvent,
|
||||||
|
proto::NodeEvent,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::{chan::Chan, proto::AnyMessage};
|
||||||
|
|
||||||
|
/// Background task simulating the network: owns all virtual connections and a
/// schedule of future delivery events, and runs on its own executor thread.
pub struct NetworkTask {
    options: Arc<NetworkOptions>,
    // all connections ever created; index in this Vec is the connection id
    connections: Mutex<Vec<VirtualConnection>>,
    /// min-heap of connections having something to deliver.
    events: Mutex<BinaryHeap<Event>>,
    // context of the network thread itself, used to schedule its wakeups
    task_context: Arc<ThreadContext>,
}
|
||||||
|
|
||||||
|
impl NetworkTask {
    /// Create the network task on the *current* executor thread, hand a handle
    /// to the caller via `tx`, then run the delivery loop forever (does not return).
    pub fn start_new(options: Arc<NetworkOptions>, tx: mpsc::Sender<Arc<NetworkTask>>) {
        let ctx = executor::get_thread_ctx();
        let task = Arc::new(Self {
            options,
            connections: Mutex::new(Vec::new()),
            events: Mutex::new(BinaryHeap::new()),
            task_context: ctx,
        });

        // send the task upstream
        tx.send(task.clone()).unwrap();

        // start the task
        task.start();
    }

    /// Open a new client->server connection. Returns the client-side socket;
    /// the server learns about the connection via an Accept on `dst_accept`
    /// once the (possibly dropped/delayed) handshake is delivered.
    pub fn start_new_connection(self: &Arc<Self>, rng: StdRng, dst_accept: Chan<NodeEvent>) -> TCP {
        let now = executor::now();
        // NOTE(review): id = current len, pushed below after two intervening
        // lock acquisitions; relies on connection creation not racing — confirm
        // the simulator serializes this path.
        let connection_id = self.connections.lock().len();

        let vc = VirtualConnection {
            connection_id,
            dst_accept,
            dst_sockets: [Chan::new(), Chan::new()],
            state: Mutex::new(ConnectionState {
                // buffers[0]: client->server (last_recv None until Accept lands);
                // buffers[1]: server->client, keepalive clock starts now
                buffers: [NetworkBuffer::new(None), NetworkBuffer::new(Some(now))],
                rng,
            }),
        };
        vc.schedule_timeout(self);
        vc.send_connect(self);

        let recv_chan = vc.dst_sockets[0].clone();
        self.connections.lock().push(vc);

        TCP {
            net: self.clone(),
            conn_id: connection_id,
            dir: 0,
            recv_chan,
        }
    }
}
|
||||||
|
|
||||||
|
// private functions
impl NetworkTask {
    /// Schedule to wakeup network task (self) `after_ms` later to deliver
    /// messages of connection `id`.
    fn schedule(&self, id: usize, after_ms: u64) {
        self.events.lock().push(Event {
            time: executor::now() + after_ms,
            conn_id: id,
        });
        self.task_context.schedule_wakeup(after_ms);
    }

    /// Get locked connection `id`.
    /// Note: the returned guard holds the whole `connections` lock, not just
    /// one entry.
    fn get(&self, id: usize) -> MappedMutexGuard<'_, RawMutex, VirtualConnection> {
        MutexGuard::map(self.connections.lock(), |connections| {
            connections.get_mut(id).unwrap()
        })
    }

    /// Move all events due at or before `now` from the heap into `vec`
    /// (earliest first, since the heap is a min-heap). `vec` is reused across
    /// calls to avoid reallocations.
    fn collect_pending_events(&self, now: u64, vec: &mut Vec<Event>) {
        vec.clear();
        let mut events = self.events.lock();
        while let Some(event) = events.peek() {
            if event.time > now {
                break;
            }
            let event = events.pop().unwrap();
            vec.push(event);
        }
    }

    /// Delivery loop: process due events, then sleep until the next scheduled
    /// wakeup. Never returns.
    fn start(self: &Arc<Self>) {
        debug!("started network task");

        let mut events = Vec::new();
        loop {
            let now = executor::now();
            self.collect_pending_events(now, &mut events);

            for event in events.drain(..) {
                let conn = self.get(event.conn_id);
                conn.process(self);
            }

            // block until wakeup
            executor::yield_me(-1);
        }
    }
}
|
||||||
|
|
||||||
|
// Direction of travel inside a VirtualConnection:
// 0 - from node(0) to node(1)  (client -> server)
// 1 - from node(1) to node(0)  (server -> client)
type MessageDirection = u8;
|
||||||
|
|
||||||
|
fn sender_str(dir: MessageDirection) -> &'static str {
|
||||||
|
match dir {
|
||||||
|
0 => "client",
|
||||||
|
1 => "server",
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn receiver_str(dir: MessageDirection) -> &'static str {
|
||||||
|
match dir {
|
||||||
|
0 => "server",
|
||||||
|
1 => "client",
|
||||||
|
_ => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Virtual connection between two nodes.
/// Node 0 is the creator of the connection (client),
/// and node 1 is the acceptor (server).
struct VirtualConnection {
    // index of this connection in NetworkTask::connections
    connection_id: usize,
    /// one-off chan, used to deliver Accept message to dst
    dst_accept: Chan<NodeEvent>,
    /// message sinks; dst_sockets[i] is the receive channel of node i
    dst_sockets: [Chan<NetEvent>; 2],
    state: Mutex<ConnectionState>,
}
|
||||||
|
|
||||||
|
// Mutable connection state: buffers[d] holds in-flight messages travelling in
// direction d, plus the RNG driving this connection's delay/drop decisions.
struct ConnectionState {
    buffers: [NetworkBuffer; 2],
    rng: StdRng,
}
|
||||||
|
|
||||||
|
impl VirtualConnection {
    /// Notify the future about the possible timeout.
    fn schedule_timeout(&self, net: &NetworkTask) {
        if let Some(timeout) = net.options.keepalive_timeout {
            net.schedule(self.connection_id, timeout);
        }
    }

    /// Send the handshake (Accept) to the server.
    /// Like any other message, the connect itself may be dropped or delayed
    /// according to `options.connect_delay`.
    fn send_connect(&self, net: &NetworkTask) {
        let now = executor::now();
        let mut state = self.state.lock();
        let delay = net.options.connect_delay.delay(&mut state.rng);
        let buffer = &mut state.buffers[0];
        // A connect must be the first event on a fresh client->server buffer.
        assert!(buffer.buf.is_empty());
        assert!(!buffer.recv_closed);
        assert!(!buffer.send_closed);
        assert!(buffer.last_recv.is_none());

        // None from delay() means the message is dropped entirely.
        let delay = if let Some(ms) = delay {
            ms
        } else {
            debug!("NET: TCP #{} dropped connect", self.connection_id);
            buffer.send_closed = true;
            return;
        };

        // Send a message into the future.
        buffer
            .buf
            .push_back((now + delay, AnyMessage::InternalConnect));
        net.schedule(self.connection_id, delay);
    }

    /// Transmit some of the messages from the buffer to the nodes.
    /// Also enforces the keepalive timeout: a side that hasn't received
    /// anything for `keepalive_timeout` ms gets its half of the connection closed.
    fn process(&self, net: &Arc<NetworkTask>) {
        let now = executor::now();

        let mut state = self.state.lock();

        // Deliver everything due in both directions.
        for direction in 0..2 {
            self.process_direction(
                net,
                state.deref_mut(),
                now,
                direction as MessageDirection,
                // messages in direction d land on the socket of node d^1
                &self.dst_sockets[direction ^ 1],
            );
        }

        // Close the one side of the connection by timeout if the node
        // has not received any messages for a long time.
        if let Some(timeout) = net.options.keepalive_timeout {
            let mut to_close = [false, false];
            for direction in 0..2 {
                let buffer = &mut state.buffers[direction];
                if buffer.recv_closed {
                    continue;
                }
                if let Some(last_recv) = buffer.last_recv {
                    if now - last_recv >= timeout {
                        debug!(
                            "NET: connection {} timed out at {}",
                            self.connection_id,
                            receiver_str(direction as MessageDirection)
                        );
                        let node_idx = direction ^ 1;
                        to_close[node_idx] = true;
                    }
                }
            }
            // close() re-locks `state`, so release the guard first
            drop(state);

            for (node_idx, should_close) in to_close.iter().enumerate() {
                if *should_close {
                    self.close(node_idx);
                }
            }
        }
    }

    /// Process messages in the buffer in the given direction.
    /// Delivers every message whose scheduled time is <= `now`, in order.
    fn process_direction(
        &self,
        net: &Arc<NetworkTask>,
        state: &mut ConnectionState,
        now: u64,
        direction: MessageDirection,
        to_socket: &Chan<NetEvent>,
    ) {
        let buffer = &mut state.buffers[direction as usize];
        if buffer.recv_closed {
            // close() drains the buffer, so a closed receiver must see nothing
            assert!(buffer.buf.is_empty());
        }

        while !buffer.buf.is_empty() && buffer.buf.front().unwrap().0 <= now {
            let msg = buffer.buf.pop_front().unwrap().1;

            // Every delivery refreshes the keepalive clock for this direction.
            buffer.last_recv = Some(now);
            self.schedule_timeout(net);

            if let AnyMessage::InternalConnect = msg {
                // TODO: assert to_socket is the server
                let server_to_client = TCP {
                    net: net.clone(),
                    conn_id: self.connection_id,
                    dir: direction ^ 1,
                    recv_chan: to_socket.clone(),
                };
                // special case, we need to deliver new connection to a separate channel
                self.dst_accept.send(NodeEvent::Accept(server_to_client));
            } else {
                to_socket.send(NetEvent::Message(msg));
            }
        }
    }

    /// Try to send a message to the buffer, optionally dropping it and
    /// determining delivery timestamp.
    fn send(&self, net: &NetworkTask, direction: MessageDirection, msg: AnyMessage) {
        let now = executor::now();
        let mut state = self.state.lock();

        // None from delay() means this send breaks the pipe.
        let (delay, close) = if let Some(ms) = net.options.send_delay.delay(&mut state.rng) {
            (ms, false)
        } else {
            (0, true)
        };

        let buffer = &mut state.buffers[direction as usize];
        if buffer.send_closed {
            debug!(
                "NET: TCP #{} dropped message {:?} (broken pipe)",
                self.connection_id, msg
            );
            return;
        }

        if close {
            debug!(
                "NET: TCP #{} dropped message {:?} (pipe just broke)",
                self.connection_id, msg
            );
            buffer.send_closed = true;
            return;
        }

        if buffer.recv_closed {
            debug!(
                "NET: TCP #{} dropped message {:?} (recv closed)",
                self.connection_id, msg
            );
            return;
        }

        // Send a message into the future.
        buffer.buf.push_back((now + delay, msg));
        net.schedule(self.connection_id, delay);
    }

    /// Close the connection. Only one side of the connection will be closed,
    /// and no further messages will be delivered. The other side will not be notified.
    fn close(&self, node_idx: usize) {
        let mut state = self.state.lock();
        // node_idx receives messages travelling in direction node_idx^1
        let recv_buffer = &mut state.buffers[1 ^ node_idx];
        if recv_buffer.recv_closed {
            debug!(
                "NET: TCP #{} closed twice at {}",
                self.connection_id,
                sender_str(node_idx as MessageDirection),
            );
            return;
        }

        debug!(
            "NET: TCP #{} closed at {}",
            self.connection_id,
            sender_str(node_idx as MessageDirection),
        );
        recv_buffer.recv_closed = true;
        // In-flight messages towards the closed side are discarded.
        for msg in recv_buffer.buf.drain(..) {
            debug!(
                "NET: TCP #{} dropped message {:?} (closed)",
                self.connection_id, msg
            );
        }

        let send_buffer = &mut state.buffers[node_idx];
        send_buffer.send_closed = true;
        // dst_sockets.send() wakes other threads; don't hold the state lock for it
        drop(state);

        // TODO: notify the other side?

        self.dst_sockets[node_idx].send(NetEvent::Closed);
    }
}
|
||||||
|
|
||||||
|
// One direction's in-flight message queue plus its half-close flags.
struct NetworkBuffer {
    /// Messages paired with time of delivery
    buf: VecDeque<(u64, AnyMessage)>,
    /// True if the connection is closed on the receiving side,
    /// i.e. no more messages from the buffer will be delivered.
    recv_closed: bool,
    /// True if the connection is closed on the sending side,
    /// i.e. no more messages will be added to the buffer.
    send_closed: bool,
    /// Last time a message was delivered from the buffer.
    /// If None, it means that the server is the receiver and
    /// it has not yet aware of this connection (i.e. has not
    /// received the Accept).
    last_recv: Option<u64>,
}
|
||||||
|
|
||||||
|
impl NetworkBuffer {
|
||||||
|
fn new(last_recv: Option<u64>) -> Self {
|
||||||
|
Self {
|
||||||
|
buf: VecDeque::new(),
|
||||||
|
recv_closed: false,
|
||||||
|
send_closed: false,
|
||||||
|
last_recv,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Single end of a bidirectional network stream without reordering (TCP-like).
/// Reads are implemented using channels, writes go to the buffer inside VirtualConnection.
pub struct TCP {
    net: Arc<NetworkTask>,
    // id of the underlying VirtualConnection
    conn_id: usize,
    // direction this end *sends* in (0 = client end, 1 = server end)
    dir: MessageDirection,
    recv_chan: Chan<NetEvent>,
}
|
||||||
|
|
||||||
|
impl Debug for TCP {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
write!(f, "TCP #{} ({})", self.conn_id, sender_str(self.dir),)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TCP {
    /// Send a message to the other side. It's guaranteed that it will not arrive
    /// before the arrival of all messages sent earlier.
    pub fn send(&self, msg: AnyMessage) {
        let conn = self.net.get(self.conn_id);
        conn.send(&self.net, self.dir, msg);
    }

    /// Get a channel to receive incoming messages.
    pub fn recv_chan(&self) -> Chan<NetEvent> {
        self.recv_chan.clone()
    }

    /// Id of the underlying virtual connection.
    pub fn connection_id(&self) -> usize {
        self.conn_id
    }

    /// Close this end of the connection; the peer is not notified
    /// (see `VirtualConnection::close`).
    pub fn close(&self) {
        let conn = self.net.get(self.conn_id);
        conn.close(self.dir as usize);
    }
}
|
||||||
|
/// A scheduled network event: connection `conn_id` has something to do at `time`.
struct Event {
    time: u64,
    conn_id: usize,
}

// BinaryHeap is a max-heap; invert the comparison so that the smallest
// (time, conn_id) pair is popped first, giving min-heap behavior.
impl Ord for Event {
    fn cmp(&self, other: &Self) -> Ordering {
        other
            .time
            .cmp(&self.time)
            .then_with(|| other.conn_id.cmp(&self.conn_id))
    }
}

impl PartialOrd for Event {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for Event {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for Event {}
|
||||||
54
libs/desim/src/node_os.rs
Normal file
54
libs/desim/src/node_os.rs
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use rand::Rng;
|
||||||
|
|
||||||
|
use crate::proto::NodeEvent;
|
||||||
|
|
||||||
|
use super::{
|
||||||
|
chan::Chan,
|
||||||
|
network::TCP,
|
||||||
|
world::{Node, NodeId, World},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Abstraction with all functions (aka syscalls) available to the node.
#[derive(Clone)]
pub struct NodeOs {
    /// The simulation world this node lives in.
    world: Arc<World>,
    /// Internal state of the node itself.
    internal: Arc<Node>,
}

impl NodeOs {
    pub fn new(world: Arc<World>, internal: Arc<Node>) -> NodeOs {
        NodeOs { world, internal }
    }

    /// Get the node id.
    pub fn id(&self) -> NodeId {
        self.internal.id
    }

    /// Opens a bidirectional connection with the other node. Always successful.
    pub fn open_tcp(&self, dst: NodeId) -> TCP {
        self.world.open_tcp(dst)
    }

    /// Returns a channel to receive node events (socket Accept and internal messages).
    pub fn node_events(&self) -> Chan<NodeEvent> {
        self.internal.node_events()
    }

    /// Get current time.
    pub fn now(&self) -> u64 {
        self.world.now()
    }

    /// Generate a random number in range [0, max).
    /// Uses the node's own deterministic RNG.
    pub fn random(&self, max: u64) -> u64 {
        self.internal.rng.lock().gen_range(0..max)
    }

    /// Append a new event to the world event log.
    pub fn log_event(&self, data: String) {
        self.internal.log_event(data)
    }
}
|
||||||
50
libs/desim/src/options.rs
Normal file
50
libs/desim/src/options.rs
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
use rand::{rngs::StdRng, Rng};
|
||||||
|
|
||||||
|
/// Describes random delays and failures. Delay will be uniformly distributed in [min, max].
|
||||||
|
/// Connection failure will occur with the probablity fail_prob.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Delay {
|
||||||
|
pub min: u64,
|
||||||
|
pub max: u64,
|
||||||
|
pub fail_prob: f64, // [0; 1]
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Delay {
|
||||||
|
/// Create a struct with no delay, no failures.
|
||||||
|
pub fn empty() -> Delay {
|
||||||
|
Delay {
|
||||||
|
min: 0,
|
||||||
|
max: 0,
|
||||||
|
fail_prob: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a struct with a fixed delay.
|
||||||
|
pub fn fixed(ms: u64) -> Delay {
|
||||||
|
Delay {
|
||||||
|
min: ms,
|
||||||
|
max: ms,
|
||||||
|
fail_prob: 0.0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate a random delay in range [min, max]. Return None if the
|
||||||
|
/// message should be dropped.
|
||||||
|
pub fn delay(&self, rng: &mut StdRng) -> Option<u64> {
|
||||||
|
if rng.gen_bool(self.fail_prob) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(rng.gen_range(self.min..=self.max))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Describes network settings. All network packets will be subjected to the same delays and failures.
// NOTE(review): delays appear to be drawn independently per packet from the
// `Delay` distributions below — confirm against the NetworkTask implementation.
#[derive(Clone, Debug)]
pub struct NetworkOptions {
    /// Connection will be automatically closed after this timeout if no data is received.
    pub keepalive_timeout: Option<u64>,
    /// New connections will be delayed by this amount of time.
    pub connect_delay: Delay,
    /// Each message will be delayed by this amount of time.
    pub send_delay: Delay,
}
|
||||||
63
libs/desim/src/proto.rs
Normal file
63
libs/desim/src/proto.rs
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
use std::fmt::Debug;
|
||||||
|
|
||||||
|
use bytes::Bytes;
|
||||||
|
use utils::lsn::Lsn;
|
||||||
|
|
||||||
|
use crate::{network::TCP, world::NodeId};
|
||||||
|
|
||||||
|
/// Internal node events.
#[derive(Debug)]
pub enum NodeEvent {
    /// A new incoming TCP connection was accepted for this node.
    Accept(TCP),
    /// A message delivered to the node outside of any socket.
    Internal(AnyMessage),
}

/// Events that are coming from a network socket.
#[derive(Clone, Debug)]
pub enum NetEvent {
    /// A message arrived on the socket.
    Message(AnyMessage),
    /// The connection was closed.
    Closed,
}

/// Custom events generated throughout the simulation. Can be used by the test to verify the correctness.
#[derive(Debug)]
pub struct SimEvent {
    /// Simulation time at which the event was logged.
    pub time: u64,
    /// Node that produced the event.
    pub node: NodeId,
    /// Free-form payload, interpreted by the test.
    pub data: String,
}

/// Umbrella type for all possible flavours of messages. These events can be sent over network
/// or to an internal node events channel.
#[derive(Clone)]
pub enum AnyMessage {
    /// Not used, empty placeholder.
    None,
    /// Used internally for notifying node about new incoming connection.
    InternalConnect,
    /// A bare 32-bit value (used e.g. as a flush-position ack in tests).
    Just32(u32),
    /// A replication cell, used in reliable_copy_test.rs.
    ReplCell(ReplCell),
    /// Raw byte payload.
    Bytes(Bytes),
    /// A log sequence number, formatted via `Lsn` in Debug output.
    LSN(u64),
}

// Manual Debug so byte payloads print as hex and LSNs use Lsn formatting.
impl Debug for AnyMessage {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            AnyMessage::None => write!(f, "None"),
            AnyMessage::InternalConnect => write!(f, "InternalConnect"),
            AnyMessage::Just32(v) => write!(f, "Just32({})", v),
            AnyMessage::ReplCell(v) => write!(f, "ReplCell({:?})", v),
            AnyMessage::Bytes(v) => write!(f, "Bytes({})", hex::encode(v)),
            AnyMessage::LSN(v) => write!(f, "LSN({})", Lsn(*v)),
        }
    }
}

/// Used in reliable_copy_test.rs
#[derive(Clone, Debug)]
pub struct ReplCell {
    /// Payload value to replicate.
    pub value: u32,
    /// Id of the client that produced the cell.
    pub client_id: u32,
    /// Position of the cell in the client's stream, starting from 0.
    pub seqno: u32,
}
|
||||||
129
libs/desim/src/time.rs
Normal file
129
libs/desim/src/time.rs
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
use std::{
|
||||||
|
cmp::Ordering,
|
||||||
|
collections::BinaryHeap,
|
||||||
|
ops::DerefMut,
|
||||||
|
sync::{
|
||||||
|
atomic::{AtomicU32, AtomicU64},
|
||||||
|
Arc,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
use parking_lot::Mutex;
|
||||||
|
use tracing::trace;
|
||||||
|
|
||||||
|
use crate::executor::ThreadContext;
|
||||||
|
|
||||||
|
/// Holds current time and all pending wakeup events.
pub struct Timing {
    /// Current world's time.
    current_time: AtomicU64,
    /// Pending timers.
    queue: Mutex<BinaryHeap<Pending>>,
    /// Global nonce. Makes picking events from binary heap queue deterministic
    /// by appending a number to events with the same timestamp.
    nonce: AtomicU32,
    /// Used to schedule fake events.
    fake_context: Arc<ThreadContext>,
}

impl Default for Timing {
    fn default() -> Self {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
impl Timing {
|
||||||
|
/// Create a new empty clock with time set to 0.
|
||||||
|
pub fn new() -> Timing {
|
||||||
|
Timing {
|
||||||
|
current_time: AtomicU64::new(0),
|
||||||
|
queue: Mutex::new(BinaryHeap::new()),
|
||||||
|
nonce: AtomicU32::new(0),
|
||||||
|
fake_context: Arc::new(ThreadContext::new()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the current world's time.
|
||||||
|
pub fn now(&self) -> u64 {
|
||||||
|
self.current_time.load(std::sync::atomic::Ordering::SeqCst)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Tick-tock the global clock. Return the event ready to be processed
|
||||||
|
/// or move the clock forward and then return the event.
|
||||||
|
pub(crate) fn step(&self) -> Option<Arc<ThreadContext>> {
|
||||||
|
let mut queue = self.queue.lock();
|
||||||
|
|
||||||
|
if queue.is_empty() {
|
||||||
|
// no future events
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
if !self.is_event_ready(queue.deref_mut()) {
|
||||||
|
let next_time = queue.peek().unwrap().time;
|
||||||
|
self.current_time
|
||||||
|
.store(next_time, std::sync::atomic::Ordering::SeqCst);
|
||||||
|
trace!("rewind time to {}", next_time);
|
||||||
|
assert!(self.is_event_ready(queue.deref_mut()));
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(queue.pop().unwrap().wake_context)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Append an event to the queue, to wakeup the thread in `ms` milliseconds.
|
||||||
|
pub(crate) fn schedule_wakeup(&self, ms: u64, wake_context: Arc<ThreadContext>) {
|
||||||
|
self.nonce.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||||
|
let nonce = self.nonce.load(std::sync::atomic::Ordering::SeqCst);
|
||||||
|
self.queue.lock().push(Pending {
|
||||||
|
time: self.now() + ms,
|
||||||
|
nonce,
|
||||||
|
wake_context,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Append a fake event to the queue, to prevent clocks from skipping this time.
|
||||||
|
pub fn schedule_fake(&self, ms: u64) {
|
||||||
|
self.queue.lock().push(Pending {
|
||||||
|
time: self.now() + ms,
|
||||||
|
nonce: 0,
|
||||||
|
wake_context: self.fake_context.clone(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return true if there is a ready event.
|
||||||
|
fn is_event_ready(&self, queue: &mut BinaryHeap<Pending>) -> bool {
|
||||||
|
queue.peek().map_or(false, |x| x.time <= self.now())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Clear all pending events.
|
||||||
|
pub(crate) fn clear(&self) {
|
||||||
|
self.queue.lock().clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Pending {
|
||||||
|
time: u64,
|
||||||
|
nonce: u32,
|
||||||
|
wake_context: Arc<ThreadContext>,
|
||||||
|
}
|
||||||
|
|
||||||
|
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||||
|
// to get that.
|
||||||
|
impl PartialOrd for Pending {
|
||||||
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
|
Some(self.cmp(other))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Ord for Pending {
|
||||||
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
|
(other.time, other.nonce).cmp(&(self.time, self.nonce))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq for Pending {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
(other.time, other.nonce) == (self.time, self.nonce)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Eq for Pending {}
|
||||||
180
libs/desim/src/world.rs
Normal file
180
libs/desim/src/world.rs
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
use parking_lot::Mutex;
|
||||||
|
use rand::{rngs::StdRng, SeedableRng};
|
||||||
|
use std::{
|
||||||
|
ops::DerefMut,
|
||||||
|
sync::{mpsc, Arc},
|
||||||
|
};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
executor::{ExternalHandle, Runtime},
|
||||||
|
network::NetworkTask,
|
||||||
|
options::NetworkOptions,
|
||||||
|
proto::{NodeEvent, SimEvent},
|
||||||
|
time::Timing,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::{chan::Chan, network::TCP, node_os::NodeOs};
|
||||||
|
|
||||||
|
pub type NodeId = u32;
|
||||||
|
|
||||||
|
/// World contains simulation state.
pub struct World {
    /// All nodes created so far; a NodeId doubles as an index into this Vec.
    nodes: Mutex<Vec<Arc<Node>>>,
    /// Random number generator.
    rng: Mutex<StdRng>,
    /// Internal event log.
    events: Mutex<Vec<SimEvent>>,
    /// Separate task that processes all network messages.
    network_task: Arc<NetworkTask>,
    /// Runtime for running threads and moving time.
    runtime: Mutex<Runtime>,
    /// To get current time.
    timing: Arc<Timing>,
}

impl World {
    /// Create a new world for the given RNG seed and network options.
    /// Spawns the network background task and runs the runtime until it
    /// has handed its handle back over the channel.
    pub fn new(seed: u64, options: Arc<NetworkOptions>) -> World {
        let timing = Arc::new(Timing::new());
        let mut runtime = Runtime::new(timing.clone());

        let (tx, rx) = mpsc::channel();

        runtime.spawn(move || {
            // create and start network background thread, and send it back via the channel
            NetworkTask::start_new(options, tx)
        });

        // wait for the network task to start
        while runtime.step() {}

        let network_task = rx.recv().unwrap();

        World {
            nodes: Mutex::new(Vec::new()),
            rng: Mutex::new(StdRng::seed_from_u64(seed)),
            events: Mutex::new(Vec::new()),
            network_task,
            runtime: Mutex::new(runtime),
            timing,
        }
    }

    /// Run a single step of the runtime. Returns the value of `Runtime::step`
    /// (false means there is nothing left to execute).
    pub fn step(&self) -> bool {
        self.runtime.lock().step()
    }

    /// Total number of thread steps executed by the runtime so far.
    pub fn get_thread_step_count(&self) -> u64 {
        self.runtime.lock().step_counter
    }

    /// Create a new random number generator.
    ///
    /// Seeded from the world's RNG, so the whole simulation stays
    /// deterministic for a given seed.
    pub fn new_rng(&self) -> StdRng {
        let mut rng = self.rng.lock();
        StdRng::from_rng(rng.deref_mut()).unwrap()
    }

    /// Create a new node. Ids are assigned sequentially.
    pub fn new_node(self: &Arc<Self>) -> Arc<Node> {
        let mut nodes = self.nodes.lock();
        let id = nodes.len() as NodeId;
        let node = Arc::new(Node::new(id, self.clone(), self.new_rng()));
        nodes.push(node.clone());
        node
    }

    /// Get an internal node state by id.
    fn get_node(&self, id: NodeId) -> Option<Arc<Node>> {
        let nodes = self.nodes.lock();
        let num = id as usize;
        if num < nodes.len() {
            Some(nodes[num].clone())
        } else {
            None
        }
    }

    /// Crash all threads running in the runtime.
    pub fn stop_all(&self) {
        self.runtime.lock().crash_all_threads();
    }

    /// Returns a writable end of a TCP connection, to send src->dst messages.
    pub fn open_tcp(self: &Arc<World>, dst: NodeId) -> TCP {
        // TODO: replace unwrap() with /dev/null socket.
        let dst = self.get_node(dst).unwrap();
        let dst_accept = dst.node_events.lock().clone();

        let rng = self.new_rng();
        self.network_task.start_new_connection(rng, dst_accept)
    }

    /// Get current time.
    pub fn now(&self) -> u64 {
        self.timing.now()
    }

    /// Get a copy of the internal clock.
    pub fn clock(&self) -> Arc<Timing> {
        self.timing.clone()
    }

    /// Record a custom event in the world's event log at the current time.
    pub fn add_event(&self, node: NodeId, data: String) {
        let time = self.now();
        self.events.lock().push(SimEvent { time, node, data });
    }

    /// Drain and return the accumulated event log.
    pub fn take_events(&self) -> Vec<SimEvent> {
        let mut events = self.events.lock();
        let mut res = Vec::new();
        std::mem::swap(&mut res, &mut events);
        res
    }

    /// Tear down the simulation: stop threads, drop timers and nodes.
    pub fn deallocate(&self) {
        self.stop_all();
        self.timing.clear();
        self.nodes.lock().clear();
    }
}
|
||||||
|
|
||||||
|
/// Internal node state.
pub struct Node {
    pub id: NodeId,
    /// Channel receiving Accepts and internal messages; can be replaced
    /// via `replug_node_events`.
    node_events: Mutex<Chan<NodeEvent>>,
    /// Back-reference to the world this node belongs to.
    world: Arc<World>,
    /// Per-node deterministic RNG.
    pub(crate) rng: Mutex<StdRng>,
}

impl Node {
    pub fn new(id: NodeId, world: Arc<World>, rng: StdRng) -> Node {
        Node {
            id,
            node_events: Mutex::new(Chan::new()),
            world,
            rng: Mutex::new(rng),
        }
    }

    /// Spawn a new thread with this node context.
    pub fn launch(self: &Arc<Self>, f: impl FnOnce(NodeOs) + Send + 'static) -> ExternalHandle {
        let node = self.clone();
        let world = self.world.clone();
        self.world.runtime.lock().spawn(move || {
            f(NodeOs::new(world, node.clone()));
        })
    }

    /// Returns a channel to receive Accepts and internal messages.
    pub fn node_events(&self) -> Chan<NodeEvent> {
        self.node_events.lock().clone()
    }

    /// Replace the node-events channel.
    /// This will drop all in-flight Accept messages.
    pub fn replug_node_events(&self, chan: Chan<NodeEvent>) {
        *self.node_events.lock() = chan;
    }

    /// Append event to the world's log.
    pub fn log_event(&self, data: String) {
        self.world.add_event(self.id, data)
    }
}
|
||||||
244
libs/desim/tests/reliable_copy_test.rs
Normal file
244
libs/desim/tests/reliable_copy_test.rs
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
//! Simple test to verify that simulator is working.
|
||||||
|
#[cfg(test)]
|
||||||
|
mod reliable_copy_test {
|
||||||
|
use anyhow::Result;
|
||||||
|
use desim::executor::{self, PollSome};
|
||||||
|
use desim::options::{Delay, NetworkOptions};
|
||||||
|
use desim::proto::{NetEvent, NodeEvent, ReplCell};
|
||||||
|
use desim::world::{NodeId, World};
|
||||||
|
use desim::{node_os::NodeOs, proto::AnyMessage};
|
||||||
|
use parking_lot::Mutex;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
    /// Disk storage trait and implementation.
    pub trait Storage<T> {
        /// Number of entries made durable so far.
        fn flush_pos(&self) -> u32;
        /// Make all written entries durable.
        fn flush(&mut self) -> Result<()>;
        /// Append an entry (not durable until `flush`).
        fn write(&mut self, t: T);
    }
|
||||||
|
|
||||||
|
    /// Cheaply clonable handle over an `InMemoryStorage`, shared between
    /// the server thread and the test harness.
    #[derive(Clone)]
    pub struct SharedStorage<T> {
        pub state: Arc<Mutex<InMemoryStorage<T>>>,
    }

    impl<T> SharedStorage<T> {
        pub fn new() -> Self {
            Self {
                state: Arc::new(Mutex::new(InMemoryStorage::new())),
            }
        }
    }

    impl<T> Storage<T> for SharedStorage<T> {
        fn flush_pos(&self) -> u32 {
            self.state.lock().flush_pos
        }

        fn flush(&mut self) -> Result<()> {
            // yield_me(0) inserts a simulator scheduling point before the
            // storage operation — presumably modeling that I/O can interleave
            // with other threads; confirm against executor::yield_me.
            executor::yield_me(0);
            self.state.lock().flush()
        }

        fn write(&mut self, t: T) {
            executor::yield_me(0);
            self.state.lock().write(t);
        }
    }
|
||||||
|
|
||||||
|
pub struct InMemoryStorage<T> {
|
||||||
|
pub data: Vec<T>,
|
||||||
|
pub flush_pos: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> InMemoryStorage<T> {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
data: Vec::new(),
|
||||||
|
flush_pos: 0,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn flush(&mut self) -> Result<()> {
|
||||||
|
self.flush_pos = self.data.len() as u32;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write(&mut self, t: T) {
|
||||||
|
self.data.push(t);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Server implementation.
    ///
    /// Accepts any number of client connections and appends in-order cells
    /// to `storage`, replying with the new flush position after each write
    /// (and once immediately after accepting a connection).
    pub fn run_server(os: NodeOs, mut storage: Box<dyn Storage<u32>>) {
        info!("started server");

        let node_events = os.node_events();
        // Slot 0 polls for node events (new connections); slots 1.. map to
        // `sockets` (offset by one).
        let mut epoll_vec: Vec<Box<dyn PollSome>> = vec![Box::new(node_events.clone())];
        let mut sockets = vec![];

        loop {
            // Block until some channel has data (-1: no timeout).
            let index = executor::epoll_chans(&epoll_vec, -1).unwrap();

            if index == 0 {
                let node_event = node_events.must_recv();
                info!("got node event: {:?}", node_event);
                if let NodeEvent::Accept(tcp) = node_event {
                    // Tell the new client our flush position so it can resume.
                    tcp.send(AnyMessage::Just32(storage.flush_pos()));
                    epoll_vec.push(Box::new(tcp.recv_chan()));
                    sockets.push(tcp);
                }
                continue;
            }

            let recv_chan = sockets[index - 1].recv_chan();
            let socket = &sockets[index - 1];

            let event = recv_chan.must_recv();
            info!("got event: {:?}", event);
            if let NetEvent::Message(AnyMessage::ReplCell(cell)) = event {
                // Only accept the next expected cell; duplicates and gaps
                // are dropped — the client is expected to retry.
                if cell.seqno != storage.flush_pos() {
                    info!("got out of order data: {:?}", cell);
                    continue;
                }
                storage.write(cell.value);
                storage.flush().unwrap();
                // Acknowledge by reporting the updated flush position.
                socket.send(AnyMessage::Just32(storage.flush_pos()));
            }
        }
    }
|
||||||
|
|
||||||
|
    /// Client copies all data from array to the remote node.
    ///
    /// Stop-and-wait protocol: send one cell, then block until the server's
    /// reported flush position confirms it; reconnect on `Closed`.
    pub fn run_client(os: NodeOs, data: &[ReplCell], dst: NodeId) {
        info!("started client");

        // Number of cells acknowledged by the server so far.
        let mut delivered = 0;

        let mut sock = os.open_tcp(dst);
        let mut recv_chan = sock.recv_chan();

        while delivered < data.len() {
            let num = &data[delivered];
            info!("sending data: {:?}", num.clone());
            sock.send(AnyMessage::ReplCell(num.clone()));

            // loop {
            let event = recv_chan.recv();
            match event {
                NetEvent::Message(AnyMessage::Just32(flush_pos)) => {
                    // Advance only when the server's flush position covers
                    // the cell we just sent; otherwise resend it.
                    if flush_pos == 1 + delivered as u32 {
                        delivered += 1;
                    }
                }
                NetEvent::Closed => {
                    // Connection dropped (e.g. keepalive timeout): reconnect
                    // and retry the current cell.
                    info!("connection closed, reestablishing");
                    sock = os.open_tcp(dst);
                    recv_chan = sock.recv_chan();
                }
                _ => {}
            }

            // }
        }

        // NOTE(review): the block below opens a fresh connection and resends
        // every cell after all of them were already acknowledged. The server
        // ignores out-of-order seqnos so this is harmless, but it looks like
        // leftover debug traffic — confirm whether it is intentional.
        let sock = os.open_tcp(dst);
        for num in data {
            info!("sending data: {:?}", num.clone());
            sock.send(AnyMessage::ReplCell(num.clone()));
        }

        info!("sent all data and finished client");
    }
|
||||||
|
|
||||||
|
    /// Run test simulations.
    ///
    /// Runs the reliable-copy scenario over 20 RNG seeds with a lossy,
    /// delayed network; each run asserts (inside `start_simulation`) that
    /// the server's storage ends up equal to the source data.
    #[test]
    fn sim_example_reliable_copy() {
        utils::logging::init(
            utils::logging::LogFormat::Test,
            utils::logging::TracingErrorLayerEnablement::Disabled,
            utils::logging::Output::Stdout,
        )
        .expect("logging init failed");

        // Each packet is delayed 1..=60 time units and dropped with
        // probability 0.4.
        let delay = Delay {
            min: 1,
            max: 60,
            fail_prob: 0.4,
        };

        let network = NetworkOptions {
            keepalive_timeout: Some(50),
            connect_delay: delay.clone(),
            send_delay: delay.clone(),
        };

        // Different seeds explore different interleavings and drop patterns.
        for seed in 0..20 {
            let u32_data: [u32; 5] = [1, 2, 3, 4, 5];
            let data = u32_to_cells(&u32_data, 1);
            let world = Arc::new(World::new(seed, Arc::new(network.clone())));

            start_simulation(Options {
                world,
                time_limit: 1_000_000,
                client_fn: Box::new(move |os, server_id| run_client(os, &data, server_id)),
                u32_data,
            });
        }
    }
|
||||||
|
|
||||||
|
    /// Parameters for a single simulation run.
    pub struct Options {
        pub world: Arc<World>,
        /// Stop stepping the world once simulated time passes this limit.
        pub time_limit: u64,
        /// Data expected to end up in the server's storage.
        pub u32_data: [u32; 5],
        /// Client body; receives the NodeOs and the server's node id.
        pub client_fn: Box<dyn FnOnce(NodeOs, u32) + Send + 'static>,
    }
|
||||||
|
|
||||||
|
    /// Wire up one client and one server node, run the world until the time
    /// limit (or until nothing is runnable), then verify the server's storage
    /// matches the expected data.
    pub fn start_simulation(options: Options) {
        let world = options.world;

        let client_node = world.new_node();
        let server_node = world.new_node();
        let server_id = server_node.id;

        // start the client thread
        client_node.launch(move |os| {
            let client_fn = options.client_fn;
            client_fn(os, server_id);
        });

        // start the server thread
        let shared_storage = SharedStorage::new();
        let server_storage = shared_storage.clone();
        server_node.launch(move |os| run_server(os, Box::new(server_storage)));

        while world.step() && world.now() < options.time_limit {}

        let disk_data = shared_storage.state.lock().data.clone();
        assert!(verify_data(&disk_data, &options.u32_data[..]));
    }
|
||||||
|
|
||||||
|
pub fn u32_to_cells(data: &[u32], client_id: u32) -> Vec<ReplCell> {
|
||||||
|
let mut res = Vec::new();
|
||||||
|
for (i, _) in data.iter().enumerate() {
|
||||||
|
res.push(ReplCell {
|
||||||
|
client_id,
|
||||||
|
seqno: i as u32,
|
||||||
|
value: data[i],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
res
|
||||||
|
}
|
||||||
|
|
||||||
|
fn verify_data(disk_data: &[u32], data: &[u32]) -> bool {
|
||||||
|
if disk_data.len() != data.len() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for i in 0..data.len() {
|
||||||
|
if disk_data[i] != data[i] {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
true
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -192,6 +192,16 @@ pub struct TimelineCreateRequest {
|
|||||||
pub pg_version: Option<u32>,
|
pub pg_version: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct TenantShardSplitRequest {
|
||||||
|
pub new_shard_count: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize, Deserialize)]
|
||||||
|
pub struct TenantShardSplitResponse {
|
||||||
|
pub new_shards: Vec<TenantShardId>,
|
||||||
|
}
|
||||||
|
|
||||||
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
|
/// Parameters that apply to all shards in a tenant. Used during tenant creation.
|
||||||
#[derive(Serialize, Deserialize, Debug)]
|
#[derive(Serialize, Deserialize, Debug)]
|
||||||
#[serde(deny_unknown_fields)]
|
#[serde(deny_unknown_fields)]
|
||||||
@@ -484,6 +494,8 @@ pub struct TimelineInfo {
|
|||||||
pub current_logical_size: u64,
|
pub current_logical_size: u64,
|
||||||
pub current_logical_size_is_accurate: bool,
|
pub current_logical_size_is_accurate: bool,
|
||||||
|
|
||||||
|
pub directory_entries_counts: Vec<u64>,
|
||||||
|
|
||||||
/// Sum of the size of all layer files.
|
/// Sum of the size of all layer files.
|
||||||
/// If a layer is present in both local FS and S3, it counts only once.
|
/// If a layer is present in both local FS and S3, it counts only once.
|
||||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||||
|
|||||||
@@ -124,6 +124,7 @@ impl RelTag {
|
|||||||
Ord,
|
Ord,
|
||||||
strum_macros::EnumIter,
|
strum_macros::EnumIter,
|
||||||
strum_macros::FromRepr,
|
strum_macros::FromRepr,
|
||||||
|
enum_map::Enum,
|
||||||
)]
|
)]
|
||||||
#[repr(u8)]
|
#[repr(u8)]
|
||||||
pub enum SlruKind {
|
pub enum SlruKind {
|
||||||
|
|||||||
@@ -88,12 +88,36 @@ impl TenantShardId {
|
|||||||
pub fn is_unsharded(&self) -> bool {
|
pub fn is_unsharded(&self) -> bool {
|
||||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convenience for dropping the tenant_id and just getting the ShardIndex: this
|
||||||
|
/// is useful when logging from code that is already in a span that includes tenant ID, to
|
||||||
|
/// keep messages reasonably terse.
|
||||||
pub fn to_index(&self) -> ShardIndex {
|
pub fn to_index(&self) -> ShardIndex {
|
||||||
ShardIndex {
|
ShardIndex {
|
||||||
shard_number: self.shard_number,
|
shard_number: self.shard_number,
|
||||||
shard_count: self.shard_count,
|
shard_count: self.shard_count,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Calculate the children of this TenantShardId when splitting the overall tenant into
|
||||||
|
/// the given number of shards.
|
||||||
|
pub fn split(&self, new_shard_count: ShardCount) -> Vec<TenantShardId> {
|
||||||
|
let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1);
|
||||||
|
let mut child_shards = Vec::new();
|
||||||
|
for shard_number in 0..ShardNumber(new_shard_count.0).0 {
|
||||||
|
// Key mapping is based on a round robin mapping of key hash modulo shard count,
|
||||||
|
// so our child shards are the ones which the same keys would map to.
|
||||||
|
if shard_number % effective_old_shard_count == self.shard_number.0 {
|
||||||
|
child_shards.push(TenantShardId {
|
||||||
|
tenant_id: self.tenant_id,
|
||||||
|
shard_number: ShardNumber(shard_number),
|
||||||
|
shard_count: new_shard_count,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
child_shards
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Formatting helper
|
/// Formatting helper
|
||||||
@@ -793,4 +817,108 @@ mod tests {
|
|||||||
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
|
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
|
||||||
assert_eq!(shard, ShardNumber(8));
|
assert_eq!(shard, ShardNumber(8));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn shard_id_split() {
|
||||||
|
let tenant_id = TenantId::generate();
|
||||||
|
let parent = TenantShardId::unsharded(tenant_id);
|
||||||
|
|
||||||
|
// Unsharded into 2
|
||||||
|
assert_eq!(
|
||||||
|
parent.split(ShardCount(2)),
|
||||||
|
vec![
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(2),
|
||||||
|
shard_number: ShardNumber(0)
|
||||||
|
},
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(2),
|
||||||
|
shard_number: ShardNumber(1)
|
||||||
|
}
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
// Unsharded into 4
|
||||||
|
assert_eq!(
|
||||||
|
parent.split(ShardCount(4)),
|
||||||
|
vec![
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(4),
|
||||||
|
shard_number: ShardNumber(0)
|
||||||
|
},
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(4),
|
||||||
|
shard_number: ShardNumber(1)
|
||||||
|
},
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(4),
|
||||||
|
shard_number: ShardNumber(2)
|
||||||
|
},
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(4),
|
||||||
|
shard_number: ShardNumber(3)
|
||||||
|
}
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
// count=1 into 2 (check this works the same as unsharded.)
|
||||||
|
let parent = TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(1),
|
||||||
|
shard_number: ShardNumber(0),
|
||||||
|
};
|
||||||
|
assert_eq!(
|
||||||
|
parent.split(ShardCount(2)),
|
||||||
|
vec![
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(2),
|
||||||
|
shard_number: ShardNumber(0)
|
||||||
|
},
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(2),
|
||||||
|
shard_number: ShardNumber(1)
|
||||||
|
}
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
// count=2 into count=8
|
||||||
|
let parent = TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(2),
|
||||||
|
shard_number: ShardNumber(1),
|
||||||
|
};
|
||||||
|
assert_eq!(
|
||||||
|
parent.split(ShardCount(8)),
|
||||||
|
vec![
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(8),
|
||||||
|
shard_number: ShardNumber(1)
|
||||||
|
},
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(8),
|
||||||
|
shard_number: ShardNumber(3)
|
||||||
|
},
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(8),
|
||||||
|
shard_number: ShardNumber(5)
|
||||||
|
},
|
||||||
|
TenantShardId {
|
||||||
|
tenant_id,
|
||||||
|
shard_count: ShardCount(8),
|
||||||
|
shard_number: ShardNumber(7)
|
||||||
|
},
|
||||||
|
]
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -431,11 +431,11 @@ pub fn generate_wal_segment(segno: u64, system_id: u64, lsn: Lsn) -> Result<Byte
|
|||||||
|
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct XlLogicalMessage {
|
pub struct XlLogicalMessage {
|
||||||
db_id: Oid,
|
pub db_id: Oid,
|
||||||
transactional: uint32, // bool, takes 4 bytes due to alignment in C structures
|
pub transactional: uint32, // bool, takes 4 bytes due to alignment in C structures
|
||||||
prefix_size: uint64,
|
pub prefix_size: uint64,
|
||||||
message_size: uint64,
|
pub message_size: uint64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl XlLogicalMessage {
|
impl XlLogicalMessage {
|
||||||
|
|||||||
@@ -13,5 +13,6 @@ rand.workspace = true
|
|||||||
tokio.workspace = true
|
tokio.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
|
serde.workspace = true
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ pub mod framed;
|
|||||||
|
|
||||||
use byteorder::{BigEndian, ReadBytesExt};
|
use byteorder::{BigEndian, ReadBytesExt};
|
||||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
|
use std::{borrow::Cow, collections::HashMap, fmt, io, str};
|
||||||
|
|
||||||
// re-export for use in utils pageserver_feedback.rs
|
// re-export for use in utils pageserver_feedback.rs
|
||||||
@@ -123,7 +124,7 @@ impl StartupMessageParams {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
|
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)]
|
||||||
pub struct CancelKeyData {
|
pub struct CancelKeyData {
|
||||||
pub backend_pid: i32,
|
pub backend_pid: i32,
|
||||||
pub cancel_key: i32,
|
pub cancel_key: i32,
|
||||||
|
|||||||
@@ -191,6 +191,7 @@ impl RemoteStorage for AzureBlobStorage {
|
|||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
mode: ListingMode,
|
mode: ListingMode,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
) -> anyhow::Result<Listing, DownloadError> {
|
) -> anyhow::Result<Listing, DownloadError> {
|
||||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||||
let list_prefix = prefix
|
let list_prefix = prefix
|
||||||
@@ -223,6 +224,8 @@ impl RemoteStorage for AzureBlobStorage {
|
|||||||
|
|
||||||
let mut response = builder.into_stream();
|
let mut response = builder.into_stream();
|
||||||
let mut res = Listing::default();
|
let mut res = Listing::default();
|
||||||
|
// NonZeroU32 doesn't support subtraction apparently
|
||||||
|
let mut max_keys = max_keys.map(|mk| mk.get());
|
||||||
while let Some(l) = response.next().await {
|
while let Some(l) = response.next().await {
|
||||||
let entry = l.map_err(to_download_error)?;
|
let entry = l.map_err(to_download_error)?;
|
||||||
let prefix_iter = entry
|
let prefix_iter = entry
|
||||||
@@ -235,7 +238,18 @@ impl RemoteStorage for AzureBlobStorage {
|
|||||||
.blobs
|
.blobs
|
||||||
.blobs()
|
.blobs()
|
||||||
.map(|k| self.name_to_relative_path(&k.name));
|
.map(|k| self.name_to_relative_path(&k.name));
|
||||||
res.keys.extend(blob_iter);
|
|
||||||
|
for key in blob_iter {
|
||||||
|
res.keys.push(key);
|
||||||
|
if let Some(mut mk) = max_keys {
|
||||||
|
assert!(mk > 0);
|
||||||
|
mk -= 1;
|
||||||
|
if mk == 0 {
|
||||||
|
return Ok(res); // limit reached
|
||||||
|
}
|
||||||
|
max_keys = Some(mk);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(res)
|
Ok(res)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,9 +13,15 @@ mod azure_blob;
|
|||||||
mod local_fs;
|
mod local_fs;
|
||||||
mod s3_bucket;
|
mod s3_bucket;
|
||||||
mod simulate_failures;
|
mod simulate_failures;
|
||||||
|
mod support;
|
||||||
|
|
||||||
use std::{
|
use std::{
|
||||||
collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc, time::SystemTime,
|
collections::HashMap,
|
||||||
|
fmt::Debug,
|
||||||
|
num::{NonZeroU32, NonZeroUsize},
|
||||||
|
pin::Pin,
|
||||||
|
sync::Arc,
|
||||||
|
time::SystemTime,
|
||||||
};
|
};
|
||||||
|
|
||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
@@ -154,7 +160,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
|||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
let result = self
|
let result = self
|
||||||
.list(prefix, ListingMode::WithDelimiter)
|
.list(prefix, ListingMode::WithDelimiter, None)
|
||||||
.await?
|
.await?
|
||||||
.prefixes;
|
.prefixes;
|
||||||
Ok(result)
|
Ok(result)
|
||||||
@@ -170,8 +176,17 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
|||||||
/// whereas,
|
/// whereas,
|
||||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||||
/// See `test_real_s3.rs` for more details.
|
/// See `test_real_s3.rs` for more details.
|
||||||
async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
///
|
||||||
let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
|
/// max_keys limits max number of keys returned; None means unlimited.
|
||||||
|
async fn list_files(
|
||||||
|
&self,
|
||||||
|
prefix: Option<&RemotePath>,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
|
let result = self
|
||||||
|
.list(prefix, ListingMode::NoDelimiter, max_keys)
|
||||||
|
.await?
|
||||||
|
.keys;
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -179,7 +194,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
|||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
_mode: ListingMode,
|
_mode: ListingMode,
|
||||||
) -> anyhow::Result<Listing, DownloadError>;
|
max_keys: Option<NonZeroU32>,
|
||||||
|
) -> Result<Listing, DownloadError>;
|
||||||
|
|
||||||
/// Streams the local file contents into remote into the remote storage entry.
|
/// Streams the local file contents into remote into the remote storage entry.
|
||||||
async fn upload(
|
async fn upload(
|
||||||
@@ -269,6 +285,19 @@ impl std::fmt::Display for DownloadError {
|
|||||||
|
|
||||||
impl std::error::Error for DownloadError {}
|
impl std::error::Error for DownloadError {}
|
||||||
|
|
||||||
|
impl DownloadError {
|
||||||
|
/// Returns true if the error should not be retried with backoff
|
||||||
|
pub fn is_permanent(&self) -> bool {
|
||||||
|
use DownloadError::*;
|
||||||
|
match self {
|
||||||
|
BadInput(_) => true,
|
||||||
|
NotFound => true,
|
||||||
|
Cancelled => true,
|
||||||
|
Other(_) => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum TimeTravelError {
|
pub enum TimeTravelError {
|
||||||
/// Validation or other error happened due to user input.
|
/// Validation or other error happened due to user input.
|
||||||
@@ -324,24 +353,31 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
|||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
mode: ListingMode,
|
mode: ListingMode,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
) -> anyhow::Result<Listing, DownloadError> {
|
) -> anyhow::Result<Listing, DownloadError> {
|
||||||
match self {
|
match self {
|
||||||
Self::LocalFs(s) => s.list(prefix, mode).await,
|
Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
|
||||||
Self::AwsS3(s) => s.list(prefix, mode).await,
|
Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
|
||||||
Self::AzureBlob(s) => s.list(prefix, mode).await,
|
Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
|
||||||
Self::Unreliable(s) => s.list(prefix, mode).await,
|
Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// A function for listing all the files in a "directory"
|
// A function for listing all the files in a "directory"
|
||||||
// Example:
|
// Example:
|
||||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
//
|
||||||
|
// max_keys limits max number of keys returned; None means unlimited.
|
||||||
|
pub async fn list_files(
|
||||||
|
&self,
|
||||||
|
folder: Option<&RemotePath>,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
match self {
|
match self {
|
||||||
Self::LocalFs(s) => s.list_files(folder).await,
|
Self::LocalFs(s) => s.list_files(folder, max_keys).await,
|
||||||
Self::AwsS3(s) => s.list_files(folder).await,
|
Self::AwsS3(s) => s.list_files(folder, max_keys).await,
|
||||||
Self::AzureBlob(s) => s.list_files(folder).await,
|
Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
|
||||||
Self::Unreliable(s) => s.list_files(folder).await,
|
Self::Unreliable(s) => s.list_files(folder, max_keys).await,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,9 @@
|
|||||||
//! This storage used in tests, but can also be used in cases when a certain persistent
|
//! This storage used in tests, but can also be used in cases when a certain persistent
|
||||||
//! volume is mounted to the local FS.
|
//! volume is mounted to the local FS.
|
||||||
|
|
||||||
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin, time::SystemTime};
|
use std::{
|
||||||
|
borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
|
||||||
|
};
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context};
|
use anyhow::{bail, ensure, Context};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
@@ -18,9 +20,7 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||||
|
|
||||||
use crate::{
|
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
|
||||||
Download, DownloadError, DownloadStream, Listing, ListingMode, RemotePath, TimeTravelError,
|
|
||||||
};
|
|
||||||
|
|
||||||
use super::{RemoteStorage, StorageMetadata};
|
use super::{RemoteStorage, StorageMetadata};
|
||||||
|
|
||||||
@@ -164,6 +164,7 @@ impl RemoteStorage for LocalFs {
|
|||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
mode: ListingMode,
|
mode: ListingMode,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
) -> Result<Listing, DownloadError> {
|
) -> Result<Listing, DownloadError> {
|
||||||
let mut result = Listing::default();
|
let mut result = Listing::default();
|
||||||
|
|
||||||
@@ -180,6 +181,9 @@ impl RemoteStorage for LocalFs {
|
|||||||
!path.is_dir()
|
!path.is_dir()
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
if let Some(max_keys) = max_keys {
|
||||||
|
result.keys.truncate(max_keys.get() as usize);
|
||||||
|
}
|
||||||
|
|
||||||
return Ok(result);
|
return Ok(result);
|
||||||
}
|
}
|
||||||
@@ -365,27 +369,33 @@ impl RemoteStorage for LocalFs {
|
|||||||
format!("Failed to open source file {target_path:?} to use in the download")
|
format!("Failed to open source file {target_path:?} to use in the download")
|
||||||
})
|
})
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
|
let len = source
|
||||||
|
.metadata()
|
||||||
|
.await
|
||||||
|
.context("query file length")
|
||||||
|
.map_err(DownloadError::Other)?
|
||||||
|
.len();
|
||||||
|
|
||||||
source
|
source
|
||||||
.seek(io::SeekFrom::Start(start_inclusive))
|
.seek(io::SeekFrom::Start(start_inclusive))
|
||||||
.await
|
.await
|
||||||
.context("Failed to seek to the range start in a local storage file")
|
.context("Failed to seek to the range start in a local storage file")
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
let metadata = self
|
let metadata = self
|
||||||
.read_storage_metadata(&target_path)
|
.read_storage_metadata(&target_path)
|
||||||
.await
|
.await
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
let download_stream: DownloadStream = match end_exclusive {
|
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
|
||||||
Some(end_exclusive) => Box::pin(ReaderStream::new(
|
let source = ReaderStream::new(source);
|
||||||
source.take(end_exclusive - start_inclusive),
|
|
||||||
)),
|
|
||||||
None => Box::pin(ReaderStream::new(source)),
|
|
||||||
};
|
|
||||||
Ok(Download {
|
Ok(Download {
|
||||||
metadata,
|
metadata,
|
||||||
last_modified: None,
|
last_modified: None,
|
||||||
etag: None,
|
etag: None,
|
||||||
download_stream,
|
download_stream: Box::pin(source),
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
Err(DownloadError::NotFound)
|
Err(DownloadError::NotFound)
|
||||||
@@ -514,10 +524,8 @@ mod fs_tests {
|
|||||||
use futures_util::Stream;
|
use futures_util::Stream;
|
||||||
use std::{collections::HashMap, io::Write};
|
use std::{collections::HashMap, io::Write};
|
||||||
|
|
||||||
async fn read_and_assert_remote_file_contents(
|
async fn read_and_check_metadata(
|
||||||
storage: &LocalFs,
|
storage: &LocalFs,
|
||||||
#[allow(clippy::ptr_arg)]
|
|
||||||
// have to use &Utf8PathBuf due to `storage.local_path` parameter requirements
|
|
||||||
remote_storage_path: &RemotePath,
|
remote_storage_path: &RemotePath,
|
||||||
expected_metadata: Option<&StorageMetadata>,
|
expected_metadata: Option<&StorageMetadata>,
|
||||||
) -> anyhow::Result<String> {
|
) -> anyhow::Result<String> {
|
||||||
@@ -596,7 +604,7 @@ mod fs_tests {
|
|||||||
let upload_name = "upload_1";
|
let upload_name = "upload_1";
|
||||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||||
|
|
||||||
let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
|
let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
dummy_contents(upload_name),
|
dummy_contents(upload_name),
|
||||||
contents,
|
contents,
|
||||||
@@ -618,7 +626,7 @@ mod fs_tests {
|
|||||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||||
|
|
||||||
let full_range_download_contents =
|
let full_range_download_contents =
|
||||||
read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
|
read_and_check_metadata(&storage, &upload_target, None).await?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
dummy_contents(upload_name),
|
dummy_contents(upload_name),
|
||||||
full_range_download_contents,
|
full_range_download_contents,
|
||||||
@@ -660,6 +668,22 @@ mod fs_tests {
|
|||||||
"Second part bytes should be returned when requested"
|
"Second part bytes should be returned when requested"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let suffix_bytes = storage
|
||||||
|
.download_byte_range(&upload_target, 13, None)
|
||||||
|
.await?
|
||||||
|
.download_stream;
|
||||||
|
let suffix_bytes = aggregate(suffix_bytes).await?;
|
||||||
|
let suffix = std::str::from_utf8(&suffix_bytes)?;
|
||||||
|
assert_eq!(upload_name, suffix);
|
||||||
|
|
||||||
|
let all_bytes = storage
|
||||||
|
.download_byte_range(&upload_target, 0, None)
|
||||||
|
.await?
|
||||||
|
.download_stream;
|
||||||
|
let all_bytes = aggregate(all_bytes).await?;
|
||||||
|
let all_bytes = std::str::from_utf8(&all_bytes)?;
|
||||||
|
assert_eq!(dummy_contents("upload_1"), all_bytes);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -736,7 +760,7 @@ mod fs_tests {
|
|||||||
upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
|
upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
|
||||||
|
|
||||||
let full_range_download_contents =
|
let full_range_download_contents =
|
||||||
read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
|
read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
dummy_contents(upload_name),
|
dummy_contents(upload_name),
|
||||||
full_range_download_contents,
|
full_range_download_contents,
|
||||||
@@ -772,12 +796,12 @@ mod fs_tests {
|
|||||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
|
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
|
||||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
|
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
|
||||||
|
|
||||||
let listing = storage.list(None, ListingMode::NoDelimiter).await?;
|
let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
|
||||||
assert!(listing.prefixes.is_empty());
|
assert!(listing.prefixes.is_empty());
|
||||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||||
|
|
||||||
// Delimiter: should only go one deep
|
// Delimiter: should only go one deep
|
||||||
let listing = storage.list(None, ListingMode::WithDelimiter).await?;
|
let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
listing.prefixes,
|
listing.prefixes,
|
||||||
@@ -790,6 +814,7 @@ mod fs_tests {
|
|||||||
.list(
|
.list(
|
||||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||||
ListingMode::WithDelimiter,
|
ListingMode::WithDelimiter,
|
||||||
|
None,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
use std::{
|
use std::{
|
||||||
borrow::Cow,
|
borrow::Cow,
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
|
num::NonZeroU32,
|
||||||
pin::Pin,
|
pin::Pin,
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
task::{Context, Poll},
|
task::{Context, Poll},
|
||||||
@@ -45,8 +46,9 @@ use utils::backoff;
|
|||||||
|
|
||||||
use super::StorageMetadata;
|
use super::StorageMetadata;
|
||||||
use crate::{
|
use crate::{
|
||||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
|
||||||
S3Config, TimeTravelError, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
|
||||||
|
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||||
};
|
};
|
||||||
|
|
||||||
pub(super) mod metrics;
|
pub(super) mod metrics;
|
||||||
@@ -63,7 +65,6 @@ pub struct S3Bucket {
|
|||||||
concurrency_limiter: ConcurrencyLimiter,
|
concurrency_limiter: ConcurrencyLimiter,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
struct GetObjectRequest {
|
struct GetObjectRequest {
|
||||||
bucket: String,
|
bucket: String,
|
||||||
key: String,
|
key: String,
|
||||||
@@ -232,24 +233,8 @@ impl S3Bucket {
|
|||||||
|
|
||||||
let started_at = ScopeGuard::into_inner(started_at);
|
let started_at = ScopeGuard::into_inner(started_at);
|
||||||
|
|
||||||
match get_object {
|
let object_output = match get_object {
|
||||||
Ok(object_output) => {
|
Ok(object_output) => object_output,
|
||||||
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
|
||||||
let etag = object_output.e_tag.clone();
|
|
||||||
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
|
||||||
|
|
||||||
let body = object_output.body;
|
|
||||||
let body = ByteStreamAsStream::from(body);
|
|
||||||
let body = PermitCarrying::new(permit, body);
|
|
||||||
let body = TimedDownload::new(started_at, body);
|
|
||||||
|
|
||||||
Ok(Download {
|
|
||||||
metadata,
|
|
||||||
etag,
|
|
||||||
last_modified,
|
|
||||||
download_stream: Box::pin(body),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
|
||||||
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
// Count this in the AttemptOutcome::Ok bucket, because 404 is not
|
||||||
// an error: we expect to sometimes fetch an object and find it missing,
|
// an error: we expect to sometimes fetch an object and find it missing,
|
||||||
@@ -259,7 +244,7 @@ impl S3Bucket {
|
|||||||
AttemptOutcome::Ok,
|
AttemptOutcome::Ok,
|
||||||
started_at,
|
started_at,
|
||||||
);
|
);
|
||||||
Err(DownloadError::NotFound)
|
return Err(DownloadError::NotFound);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||||
@@ -268,11 +253,27 @@ impl S3Bucket {
|
|||||||
started_at,
|
started_at,
|
||||||
);
|
);
|
||||||
|
|
||||||
Err(DownloadError::Other(
|
return Err(DownloadError::Other(
|
||||||
anyhow::Error::new(e).context("download s3 object"),
|
anyhow::Error::new(e).context("download s3 object"),
|
||||||
))
|
));
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
|
|
||||||
|
let metadata = object_output.metadata().cloned().map(StorageMetadata);
|
||||||
|
let etag = object_output.e_tag;
|
||||||
|
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
|
||||||
|
|
||||||
|
let body = object_output.body;
|
||||||
|
let body = ByteStreamAsStream::from(body);
|
||||||
|
let body = PermitCarrying::new(permit, body);
|
||||||
|
let body = TimedDownload::new(started_at, body);
|
||||||
|
|
||||||
|
Ok(Download {
|
||||||
|
metadata,
|
||||||
|
etag,
|
||||||
|
last_modified,
|
||||||
|
download_stream: Box::pin(body),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn delete_oids(
|
async fn delete_oids(
|
||||||
@@ -354,33 +355,6 @@ impl Stream for ByteStreamAsStream {
|
|||||||
// sense and Stream::size_hint does not really
|
// sense and Stream::size_hint does not really
|
||||||
}
|
}
|
||||||
|
|
||||||
pin_project_lite::pin_project! {
|
|
||||||
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
|
||||||
struct PermitCarrying<S> {
|
|
||||||
permit: tokio::sync::OwnedSemaphorePermit,
|
|
||||||
#[pin]
|
|
||||||
inner: S,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<S> PermitCarrying<S> {
|
|
||||||
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
|
|
||||||
Self { permit, inner }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
|
|
||||||
type Item = <S as Stream>::Item;
|
|
||||||
|
|
||||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
|
||||||
self.project().inner.poll_next(cx)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
|
||||||
self.inner.size_hint()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pin_project_lite::pin_project! {
|
pin_project_lite::pin_project! {
|
||||||
/// Times and tracks the outcome of the request.
|
/// Times and tracks the outcome of the request.
|
||||||
struct TimedDownload<S> {
|
struct TimedDownload<S> {
|
||||||
@@ -435,8 +409,11 @@ impl RemoteStorage for S3Bucket {
|
|||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
mode: ListingMode,
|
mode: ListingMode,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
) -> Result<Listing, DownloadError> {
|
) -> Result<Listing, DownloadError> {
|
||||||
let kind = RequestKind::List;
|
let kind = RequestKind::List;
|
||||||
|
// s3 sdk wants i32
|
||||||
|
let mut max_keys = max_keys.map(|mk| mk.get() as i32);
|
||||||
let mut result = Listing::default();
|
let mut result = Listing::default();
|
||||||
|
|
||||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||||
@@ -460,13 +437,20 @@ impl RemoteStorage for S3Bucket {
|
|||||||
let _guard = self.permit(kind).await;
|
let _guard = self.permit(kind).await;
|
||||||
let started_at = start_measuring_requests(kind);
|
let started_at = start_measuring_requests(kind);
|
||||||
|
|
||||||
|
// min of two Options, returning Some if one is value and another is
|
||||||
|
// None (None is smaller than anything, so plain min doesn't work).
|
||||||
|
let request_max_keys = self
|
||||||
|
.max_keys_per_list_response
|
||||||
|
.into_iter()
|
||||||
|
.chain(max_keys.into_iter())
|
||||||
|
.min();
|
||||||
let mut request = self
|
let mut request = self
|
||||||
.client
|
.client
|
||||||
.list_objects_v2()
|
.list_objects_v2()
|
||||||
.bucket(self.bucket_name.clone())
|
.bucket(self.bucket_name.clone())
|
||||||
.set_prefix(list_prefix.clone())
|
.set_prefix(list_prefix.clone())
|
||||||
.set_continuation_token(continuation_token)
|
.set_continuation_token(continuation_token)
|
||||||
.set_max_keys(self.max_keys_per_list_response);
|
.set_max_keys(request_max_keys);
|
||||||
|
|
||||||
if let ListingMode::WithDelimiter = mode {
|
if let ListingMode::WithDelimiter = mode {
|
||||||
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||||
@@ -496,6 +480,14 @@ impl RemoteStorage for S3Bucket {
|
|||||||
let object_path = object.key().expect("response does not contain a key");
|
let object_path = object.key().expect("response does not contain a key");
|
||||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||||
result.keys.push(remote_path);
|
result.keys.push(remote_path);
|
||||||
|
if let Some(mut mk) = max_keys {
|
||||||
|
assert!(mk > 0);
|
||||||
|
mk -= 1;
|
||||||
|
if mk == 0 {
|
||||||
|
return Ok(result); // limit reached
|
||||||
|
}
|
||||||
|
max_keys = Some(mk);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
result.prefixes.extend(
|
result.prefixes.extend(
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use futures::stream::Stream;
|
use futures::stream::Stream;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
use std::num::NonZeroU32;
|
||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
use std::time::SystemTime;
|
use std::time::SystemTime;
|
||||||
use std::{collections::hash_map::Entry, sync::Arc};
|
use std::{collections::hash_map::Entry, sync::Arc};
|
||||||
@@ -60,7 +61,7 @@ impl UnreliableWrapper {
|
|||||||
/// On the first attempts of this operation, return an error. After 'attempts_to_fail'
|
/// On the first attempts of this operation, return an error. After 'attempts_to_fail'
|
||||||
/// attempts, let the operation go ahead, and clear the counter.
|
/// attempts, let the operation go ahead, and clear the counter.
|
||||||
///
|
///
|
||||||
fn attempt(&self, op: RemoteOp) -> Result<u64, DownloadError> {
|
fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
|
||||||
let mut attempts = self.attempts.lock().unwrap();
|
let mut attempts = self.attempts.lock().unwrap();
|
||||||
|
|
||||||
match attempts.entry(op) {
|
match attempts.entry(op) {
|
||||||
@@ -78,13 +79,13 @@ impl UnreliableWrapper {
|
|||||||
} else {
|
} else {
|
||||||
let error =
|
let error =
|
||||||
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||||
Err(DownloadError::Other(error))
|
Err(error)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Entry::Vacant(e) => {
|
Entry::Vacant(e) => {
|
||||||
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
|
||||||
e.insert(1);
|
e.insert(1);
|
||||||
Err(DownloadError::Other(error))
|
Err(error)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -105,22 +106,30 @@ impl RemoteStorage for UnreliableWrapper {
|
|||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
|
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
self.inner.list_prefixes(prefix).await
|
self.inner.list_prefixes(prefix).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
async fn list_files(
|
||||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
|
&self,
|
||||||
self.inner.list_files(folder).await
|
folder: Option<&RemotePath>,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
|
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||||
|
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
|
self.inner.list_files(folder, max_keys).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn list(
|
async fn list(
|
||||||
&self,
|
&self,
|
||||||
prefix: Option<&RemotePath>,
|
prefix: Option<&RemotePath>,
|
||||||
mode: ListingMode,
|
mode: ListingMode,
|
||||||
|
max_keys: Option<NonZeroU32>,
|
||||||
) -> Result<Listing, DownloadError> {
|
) -> Result<Listing, DownloadError> {
|
||||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
|
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||||
self.inner.list(prefix, mode).await
|
.map_err(DownloadError::Other)?;
|
||||||
|
self.inner.list(prefix, mode, max_keys).await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn upload(
|
async fn upload(
|
||||||
@@ -137,7 +146,8 @@ impl RemoteStorage for UnreliableWrapper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||||
self.attempt(RemoteOp::Download(from.clone()))?;
|
self.attempt(RemoteOp::Download(from.clone()))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
self.inner.download(from).await
|
self.inner.download(from).await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -150,7 +160,8 @@ impl RemoteStorage for UnreliableWrapper {
|
|||||||
// Note: We treat any download_byte_range as an "attempt" of the same
|
// Note: We treat any download_byte_range as an "attempt" of the same
|
||||||
// operation. We don't pay attention to the ranges. That's good enough
|
// operation. We don't pay attention to the ranges. That's good enough
|
||||||
// for now.
|
// for now.
|
||||||
self.attempt(RemoteOp::Download(from.clone()))?;
|
self.attempt(RemoteOp::Download(from.clone()))
|
||||||
|
.map_err(DownloadError::Other)?;
|
||||||
self.inner
|
self.inner
|
||||||
.download_byte_range(from, start_inclusive, end_exclusive)
|
.download_byte_range(from, start_inclusive, end_exclusive)
|
||||||
.await
|
.await
|
||||||
@@ -193,7 +204,7 @@ impl RemoteStorage for UnreliableWrapper {
|
|||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<(), TimeTravelError> {
|
) -> Result<(), TimeTravelError> {
|
||||||
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
|
self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
|
||||||
.map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
|
.map_err(TimeTravelError::Other)?;
|
||||||
self.inner
|
self.inner
|
||||||
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
.time_travel_recover(prefix, timestamp, done_if_after, cancel)
|
||||||
.await
|
.await
|
||||||
|
|||||||
33
libs/remote_storage/src/support.rs
Normal file
33
libs/remote_storage/src/support.rs
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
use std::{
|
||||||
|
pin::Pin,
|
||||||
|
task::{Context, Poll},
|
||||||
|
};
|
||||||
|
|
||||||
|
use futures_util::Stream;
|
||||||
|
|
||||||
|
pin_project_lite::pin_project! {
|
||||||
|
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
|
||||||
|
pub(crate) struct PermitCarrying<S> {
|
||||||
|
permit: tokio::sync::OwnedSemaphorePermit,
|
||||||
|
#[pin]
|
||||||
|
inner: S,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<S> PermitCarrying<S> {
|
||||||
|
pub(crate) fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
|
||||||
|
Self { permit, inner }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<S: Stream> Stream for PermitCarrying<S> {
|
||||||
|
type Item = <S as Stream>::Item;
|
||||||
|
|
||||||
|
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||||
|
self.project().inner.poll_next(cx)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
|
self.inner.size_hint()
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use remote_storage::RemotePath;
|
use remote_storage::RemotePath;
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::{collections::HashSet, num::NonZeroU32};
|
||||||
use test_context::test_context;
|
use test_context::test_context;
|
||||||
use tracing::debug;
|
use tracing::debug;
|
||||||
|
|
||||||
@@ -103,7 +103,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
|||||||
let base_prefix =
|
let base_prefix =
|
||||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||||
let root_files = test_client
|
let root_files = test_client
|
||||||
.list_files(None)
|
.list_files(None, None)
|
||||||
.await
|
.await
|
||||||
.context("client list root files failure")?
|
.context("client list root files failure")?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@@ -113,8 +113,17 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
|||||||
ctx.remote_blobs.clone(),
|
ctx.remote_blobs.clone(),
|
||||||
"remote storage list_files on root mismatches with the uploads."
|
"remote storage list_files on root mismatches with the uploads."
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Test that max_keys limit works. In total there are about 21 files (see
|
||||||
|
// upload_simple_remote_data call in test_real_s3.rs).
|
||||||
|
let limited_root_files = test_client
|
||||||
|
.list_files(None, Some(NonZeroU32::new(2).unwrap()))
|
||||||
|
.await
|
||||||
|
.context("client list root files failure")?;
|
||||||
|
assert_eq!(limited_root_files.len(), 2);
|
||||||
|
|
||||||
let nested_remote_files = test_client
|
let nested_remote_files = test_client
|
||||||
.list_files(Some(&base_prefix))
|
.list_files(Some(&base_prefix), None)
|
||||||
.await
|
.await
|
||||||
.context("client list nested files failure")?
|
.context("client list nested files failure")?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
|
|||||||
@@ -70,7 +70,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
|
async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
|
||||||
Ok(retry(|| client.list_files(None))
|
Ok(retry(|| client.list_files(None, None))
|
||||||
.await
|
.await
|
||||||
.context("list root files failure")?
|
.context("list root files failure")?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
|
|||||||
@@ -27,6 +27,11 @@ impl Barrier {
|
|||||||
b.wait().await
|
b.wait().await
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return true if a call to wait() would complete immediately
|
||||||
|
pub fn is_ready(&self) -> bool {
|
||||||
|
futures::future::FutureExt::now_or_never(self.0.wait()).is_some()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PartialEq for Barrier {
|
impl PartialEq for Barrier {
|
||||||
|
|||||||
@@ -54,12 +54,10 @@ impl Generation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[track_caller]
|
#[track_caller]
|
||||||
pub fn get_suffix(&self) -> String {
|
pub fn get_suffix(&self) -> impl std::fmt::Display {
|
||||||
match self {
|
match self {
|
||||||
Self::Valid(v) => {
|
Self::Valid(v) => GenerationFileSuffix(Some(*v)),
|
||||||
format!("-{:08x}", v)
|
Self::None => GenerationFileSuffix(None),
|
||||||
}
|
|
||||||
Self::None => "".into(),
|
|
||||||
Self::Broken => {
|
Self::Broken => {
|
||||||
panic!("Tried to use a broken generation");
|
panic!("Tried to use a broken generation");
|
||||||
}
|
}
|
||||||
@@ -90,6 +88,7 @@ impl Generation {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[track_caller]
|
||||||
pub fn next(&self) -> Generation {
|
pub fn next(&self) -> Generation {
|
||||||
match self {
|
match self {
|
||||||
Self::Valid(n) => Self::Valid(*n + 1),
|
Self::Valid(n) => Self::Valid(*n + 1),
|
||||||
@@ -107,6 +106,18 @@ impl Generation {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct GenerationFileSuffix(Option<u32>);
|
||||||
|
|
||||||
|
impl std::fmt::Display for GenerationFileSuffix {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
if let Some(g) = self.0 {
|
||||||
|
write!(f, "-{g:08x}")
|
||||||
|
} else {
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl Serialize for Generation {
|
impl Serialize for Generation {
|
||||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||||
where
|
where
|
||||||
@@ -164,4 +175,24 @@ mod test {
|
|||||||
assert!(Generation::none() < Generation::new(0));
|
assert!(Generation::none() < Generation::new(0));
|
||||||
assert!(Generation::none() < Generation::new(1));
|
assert!(Generation::none() < Generation::new(1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn suffix_is_stable() {
|
||||||
|
use std::fmt::Write as _;
|
||||||
|
|
||||||
|
// the suffix must remain stable through-out the pageserver remote storage evolution and
|
||||||
|
// not be changed accidentially without thinking about migration
|
||||||
|
let examples = [
|
||||||
|
(line!(), Generation::None, ""),
|
||||||
|
(line!(), Generation::Valid(0), "-00000000"),
|
||||||
|
(line!(), Generation::Valid(u32::MAX), "-ffffffff"),
|
||||||
|
];
|
||||||
|
|
||||||
|
let mut s = String::new();
|
||||||
|
for (line, gen, expected) in examples {
|
||||||
|
s.clear();
|
||||||
|
write!(s, "{}", &gen.get_suffix()).expect("string grows");
|
||||||
|
assert_eq!(s, expected, "example on {line}");
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
use std::sync::{
|
use std::sync::{
|
||||||
atomic::{AtomicUsize, Ordering},
|
atomic::{AtomicUsize, Ordering},
|
||||||
Arc,
|
Arc, Mutex, MutexGuard,
|
||||||
};
|
};
|
||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
|
|
||||||
@@ -12,7 +12,7 @@ use tokio::sync::Semaphore;
|
|||||||
///
|
///
|
||||||
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
|
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
|
||||||
pub struct OnceCell<T> {
|
pub struct OnceCell<T> {
|
||||||
inner: tokio::sync::RwLock<Inner<T>>,
|
inner: Mutex<Inner<T>>,
|
||||||
initializers: AtomicUsize,
|
initializers: AtomicUsize,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -50,7 +50,7 @@ impl<T> OnceCell<T> {
|
|||||||
let sem = Semaphore::new(1);
|
let sem = Semaphore::new(1);
|
||||||
sem.close();
|
sem.close();
|
||||||
Self {
|
Self {
|
||||||
inner: tokio::sync::RwLock::new(Inner {
|
inner: Mutex::new(Inner {
|
||||||
init_semaphore: Arc::new(sem),
|
init_semaphore: Arc::new(sem),
|
||||||
value: Some(value),
|
value: Some(value),
|
||||||
}),
|
}),
|
||||||
@@ -61,99 +61,63 @@ impl<T> OnceCell<T> {
|
|||||||
/// Returns a guard to an existing initialized value, or uniquely initializes the value before
|
/// Returns a guard to an existing initialized value, or uniquely initializes the value before
|
||||||
/// returning the guard.
|
/// returning the guard.
|
||||||
///
|
///
|
||||||
/// Initializing might wait on any existing [`GuardMut::take_and_deinit`] deinitialization.
|
/// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
|
||||||
///
|
///
|
||||||
/// Initialization is panic-safe and cancellation-safe.
|
/// Initialization is panic-safe and cancellation-safe.
|
||||||
pub async fn get_mut_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardMut<'_, T>, E>
|
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
|
||||||
where
|
where
|
||||||
F: FnOnce(InitPermit) -> Fut,
|
F: FnOnce(InitPermit) -> Fut,
|
||||||
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
|
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
|
||||||
{
|
{
|
||||||
let sem = {
|
loop {
|
||||||
let guard = self.inner.write().await;
|
let sem = {
|
||||||
if guard.value.is_some() {
|
let guard = self.inner.lock().unwrap();
|
||||||
return Ok(GuardMut(guard));
|
if guard.value.is_some() {
|
||||||
|
return Ok(Guard(guard));
|
||||||
|
}
|
||||||
|
guard.init_semaphore.clone()
|
||||||
|
};
|
||||||
|
|
||||||
|
{
|
||||||
|
let permit = {
|
||||||
|
// increment the count for the duration of queued
|
||||||
|
let _guard = CountWaitingInitializers::start(self);
|
||||||
|
sem.acquire().await
|
||||||
|
};
|
||||||
|
|
||||||
|
let Ok(permit) = permit else {
|
||||||
|
let guard = self.inner.lock().unwrap();
|
||||||
|
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
|
||||||
|
// there was a take_and_deinit in between
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
assert!(
|
||||||
|
guard.value.is_some(),
|
||||||
|
"semaphore got closed, must be initialized"
|
||||||
|
);
|
||||||
|
return Ok(Guard(guard));
|
||||||
|
};
|
||||||
|
|
||||||
|
permit.forget();
|
||||||
}
|
}
|
||||||
guard.init_semaphore.clone()
|
|
||||||
};
|
|
||||||
|
|
||||||
let permit = {
|
let permit = InitPermit(sem);
|
||||||
// increment the count for the duration of queued
|
let (value, _permit) = factory(permit).await?;
|
||||||
let _guard = CountWaitingInitializers::start(self);
|
|
||||||
sem.acquire_owned().await
|
|
||||||
};
|
|
||||||
|
|
||||||
match permit {
|
let guard = self.inner.lock().unwrap();
|
||||||
Ok(permit) => {
|
|
||||||
let permit = InitPermit(permit);
|
|
||||||
let (value, _permit) = factory(permit).await?;
|
|
||||||
|
|
||||||
let guard = self.inner.write().await;
|
return Ok(Self::set0(value, guard));
|
||||||
|
|
||||||
Ok(Self::set0(value, guard))
|
|
||||||
}
|
|
||||||
Err(_closed) => {
|
|
||||||
let guard = self.inner.write().await;
|
|
||||||
assert!(
|
|
||||||
guard.value.is_some(),
|
|
||||||
"semaphore got closed, must be initialized"
|
|
||||||
);
|
|
||||||
return Ok(GuardMut(guard));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a guard to an existing initialized value, or uniquely initializes the value before
|
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||||
/// returning the guard.
|
|
||||||
///
|
|
||||||
/// Initialization is panic-safe and cancellation-safe.
|
|
||||||
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardRef<'_, T>, E>
|
|
||||||
where
|
|
||||||
F: FnOnce(InitPermit) -> Fut,
|
|
||||||
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
|
|
||||||
{
|
|
||||||
let sem = {
|
|
||||||
let guard = self.inner.read().await;
|
|
||||||
if guard.value.is_some() {
|
|
||||||
return Ok(GuardRef(guard));
|
|
||||||
}
|
|
||||||
guard.init_semaphore.clone()
|
|
||||||
};
|
|
||||||
|
|
||||||
let permit = {
|
|
||||||
// increment the count for the duration of queued
|
|
||||||
let _guard = CountWaitingInitializers::start(self);
|
|
||||||
sem.acquire_owned().await
|
|
||||||
};
|
|
||||||
|
|
||||||
match permit {
|
|
||||||
Ok(permit) => {
|
|
||||||
let permit = InitPermit(permit);
|
|
||||||
let (value, _permit) = factory(permit).await?;
|
|
||||||
|
|
||||||
let guard = self.inner.write().await;
|
|
||||||
|
|
||||||
Ok(Self::set0(value, guard).downgrade())
|
|
||||||
}
|
|
||||||
Err(_closed) => {
|
|
||||||
let guard = self.inner.read().await;
|
|
||||||
assert!(
|
|
||||||
guard.value.is_some(),
|
|
||||||
"semaphore got closed, must be initialized"
|
|
||||||
);
|
|
||||||
return Ok(GuardRef(guard));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Assuming a permit is held after previous call to [`GuardMut::take_and_deinit`], it can be used
|
|
||||||
/// to complete initializing the inner value.
|
/// to complete initializing the inner value.
|
||||||
///
|
///
|
||||||
/// # Panics
|
/// # Panics
|
||||||
///
|
///
|
||||||
/// If the inner has already been initialized.
|
/// If the inner has already been initialized.
|
||||||
pub async fn set(&self, value: T, _permit: InitPermit) -> GuardMut<'_, T> {
|
pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
|
||||||
let guard = self.inner.write().await;
|
let guard = self.inner.lock().unwrap();
|
||||||
|
|
||||||
// cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
|
// cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
|
||||||
// give more permits right now.
|
// give more permits right now.
|
||||||
@@ -165,31 +129,21 @@ impl<T> OnceCell<T> {
|
|||||||
Self::set0(value, guard)
|
Self::set0(value, guard)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set0(value: T, mut guard: tokio::sync::RwLockWriteGuard<'_, Inner<T>>) -> GuardMut<'_, T> {
|
fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
|
||||||
if guard.value.is_some() {
|
if guard.value.is_some() {
|
||||||
drop(guard);
|
drop(guard);
|
||||||
unreachable!("we won permit, must not be initialized");
|
unreachable!("we won permit, must not be initialized");
|
||||||
}
|
}
|
||||||
guard.value = Some(value);
|
guard.value = Some(value);
|
||||||
guard.init_semaphore.close();
|
guard.init_semaphore.close();
|
||||||
GuardMut(guard)
|
Guard(guard)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a guard to an existing initialized value, if any.
|
/// Returns a guard to an existing initialized value, if any.
|
||||||
pub async fn get_mut(&self) -> Option<GuardMut<'_, T>> {
|
pub fn get(&self) -> Option<Guard<'_, T>> {
|
||||||
let guard = self.inner.write().await;
|
let guard = self.inner.lock().unwrap();
|
||||||
if guard.value.is_some() {
|
if guard.value.is_some() {
|
||||||
Some(GuardMut(guard))
|
Some(Guard(guard))
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns a guard to an existing initialized value, if any.
|
|
||||||
pub async fn get(&self) -> Option<GuardRef<'_, T>> {
|
|
||||||
let guard = self.inner.read().await;
|
|
||||||
if guard.value.is_some() {
|
|
||||||
Some(GuardRef(guard))
|
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
@@ -221,9 +175,9 @@ impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
|
|||||||
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
|
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
|
||||||
/// initialized value.
|
/// initialized value.
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct GuardMut<'a, T>(tokio::sync::RwLockWriteGuard<'a, Inner<T>>);
|
pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
|
||||||
|
|
||||||
impl<T> std::ops::Deref for GuardMut<'_, T> {
|
impl<T> std::ops::Deref for Guard<'_, T> {
|
||||||
type Target = T;
|
type Target = T;
|
||||||
|
|
||||||
fn deref(&self) -> &Self::Target {
|
fn deref(&self) -> &Self::Target {
|
||||||
@@ -234,7 +188,7 @@ impl<T> std::ops::Deref for GuardMut<'_, T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> std::ops::DerefMut for GuardMut<'_, T> {
|
impl<T> std::ops::DerefMut for Guard<'_, T> {
|
||||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||||
self.0
|
self.0
|
||||||
.value
|
.value
|
||||||
@@ -243,52 +197,48 @@ impl<T> std::ops::DerefMut for GuardMut<'_, T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, T> GuardMut<'a, T> {
|
impl<'a, T> Guard<'a, T> {
|
||||||
/// Take the current value, and a new permit for it's deinitialization.
|
/// Take the current value, and a new permit for it's deinitialization.
|
||||||
///
|
///
|
||||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||||
let mut swapped = Inner::default();
|
let mut swapped = Inner::default();
|
||||||
let permit = swapped
|
let sem = swapped.init_semaphore.clone();
|
||||||
.init_semaphore
|
// acquire and forget right away, moving the control over to InitPermit
|
||||||
.clone()
|
sem.try_acquire().expect("we just created this").forget();
|
||||||
.try_acquire_owned()
|
|
||||||
.expect("we just created this");
|
|
||||||
std::mem::swap(&mut *self.0, &mut swapped);
|
std::mem::swap(&mut *self.0, &mut swapped);
|
||||||
swapped
|
swapped
|
||||||
.value
|
.value
|
||||||
.map(|v| (v, InitPermit(permit)))
|
.map(|v| (v, InitPermit(sem)))
|
||||||
.expect("guard is not created unless value has been initialized")
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn downgrade(self) -> GuardRef<'a, T> {
|
|
||||||
GuardRef(self.0.downgrade())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct GuardRef<'a, T>(tokio::sync::RwLockReadGuard<'a, Inner<T>>);
|
|
||||||
|
|
||||||
impl<T> std::ops::Deref for GuardRef<'_, T> {
|
|
||||||
type Target = T;
|
|
||||||
|
|
||||||
fn deref(&self) -> &Self::Target {
|
|
||||||
self.0
|
|
||||||
.value
|
|
||||||
.as_ref()
|
|
||||||
.expect("guard is not created unless value has been initialized")
|
.expect("guard is not created unless value has been initialized")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Type held by OnceCell (de)initializing task.
|
/// Type held by OnceCell (de)initializing task.
|
||||||
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
|
///
|
||||||
|
/// On drop, this type will return the permit.
|
||||||
|
pub struct InitPermit(Arc<tokio::sync::Semaphore>);
|
||||||
|
|
||||||
|
impl Drop for InitPermit {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
assert_eq!(
|
||||||
|
self.0.available_permits(),
|
||||||
|
0,
|
||||||
|
"InitPermit should only exist as the unique permit"
|
||||||
|
);
|
||||||
|
self.0.add_permits(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use futures::Future;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use std::{
|
use std::{
|
||||||
convert::Infallible,
|
convert::Infallible,
|
||||||
|
pin::{pin, Pin},
|
||||||
sync::atomic::{AtomicUsize, Ordering},
|
sync::atomic::{AtomicUsize, Ordering},
|
||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
@@ -319,7 +269,7 @@ mod tests {
|
|||||||
barrier.wait().await;
|
barrier.wait().await;
|
||||||
let won = {
|
let won = {
|
||||||
let g = cell
|
let g = cell
|
||||||
.get_mut_or_init(|permit| {
|
.get_or_init(|permit| {
|
||||||
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
|
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
|
||||||
async {
|
async {
|
||||||
counters.future_polled.fetch_add(1, Ordering::Relaxed);
|
counters.future_polled.fetch_add(1, Ordering::Relaxed);
|
||||||
@@ -366,11 +316,7 @@ mod tests {
|
|||||||
let cell = cell.clone();
|
let cell = cell.clone();
|
||||||
let deinitialization_started = deinitialization_started.clone();
|
let deinitialization_started = deinitialization_started.clone();
|
||||||
async move {
|
async move {
|
||||||
let (answer, _permit) = cell
|
let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
|
||||||
.get_mut()
|
|
||||||
.await
|
|
||||||
.expect("initialized to value")
|
|
||||||
.take_and_deinit();
|
|
||||||
assert_eq!(answer, initial);
|
assert_eq!(answer, initial);
|
||||||
|
|
||||||
deinitialization_started.wait().await;
|
deinitialization_started.wait().await;
|
||||||
@@ -381,7 +327,7 @@ mod tests {
|
|||||||
deinitialization_started.wait().await;
|
deinitialization_started.wait().await;
|
||||||
|
|
||||||
let started_at = tokio::time::Instant::now();
|
let started_at = tokio::time::Instant::now();
|
||||||
cell.get_mut_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
|
cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
@@ -393,21 +339,21 @@ mod tests {
|
|||||||
|
|
||||||
jh.await.unwrap();
|
jh.await.unwrap();
|
||||||
|
|
||||||
assert_eq!(*cell.get_mut().await.unwrap(), reinit);
|
assert_eq!(*cell.get().unwrap(), reinit);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[test]
|
||||||
async fn reinit_with_deinit_permit() {
|
fn reinit_with_deinit_permit() {
|
||||||
let cell = Arc::new(OnceCell::new(42));
|
let cell = Arc::new(OnceCell::new(42));
|
||||||
|
|
||||||
let (mol, permit) = cell.get_mut().await.unwrap().take_and_deinit();
|
let (mol, permit) = cell.get().unwrap().take_and_deinit();
|
||||||
cell.set(5, permit).await;
|
cell.set(5, permit);
|
||||||
assert_eq!(*cell.get_mut().await.unwrap(), 5);
|
assert_eq!(*cell.get().unwrap(), 5);
|
||||||
|
|
||||||
let (five, permit) = cell.get_mut().await.unwrap().take_and_deinit();
|
let (five, permit) = cell.get().unwrap().take_and_deinit();
|
||||||
assert_eq!(5, five);
|
assert_eq!(5, five);
|
||||||
cell.set(mol, permit).await;
|
cell.set(mol, permit);
|
||||||
assert_eq!(*cell.get_mut().await.unwrap(), 42);
|
assert_eq!(*cell.get().unwrap(), 42);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
@@ -415,13 +361,13 @@ mod tests {
|
|||||||
let cell = OnceCell::default();
|
let cell = OnceCell::default();
|
||||||
|
|
||||||
for _ in 0..10 {
|
for _ in 0..10 {
|
||||||
cell.get_mut_or_init(|_permit| async { Err("whatever error") })
|
cell.get_or_init(|_permit| async { Err("whatever error") })
|
||||||
.await
|
.await
|
||||||
.unwrap_err();
|
.unwrap_err();
|
||||||
}
|
}
|
||||||
|
|
||||||
let g = cell
|
let g = cell
|
||||||
.get_mut_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
|
.get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(*g, "finally success");
|
assert_eq!(*g, "finally success");
|
||||||
@@ -433,7 +379,7 @@ mod tests {
|
|||||||
|
|
||||||
let barrier = tokio::sync::Barrier::new(2);
|
let barrier = tokio::sync::Barrier::new(2);
|
||||||
|
|
||||||
let initializer = cell.get_mut_or_init(|permit| async {
|
let initializer = cell.get_or_init(|permit| async {
|
||||||
barrier.wait().await;
|
barrier.wait().await;
|
||||||
futures::future::pending::<()>().await;
|
futures::future::pending::<()>().await;
|
||||||
|
|
||||||
@@ -447,12 +393,93 @@ mod tests {
|
|||||||
|
|
||||||
// now initializer is dropped
|
// now initializer is dropped
|
||||||
|
|
||||||
assert!(cell.get_mut().await.is_none());
|
assert!(cell.get().is_none());
|
||||||
|
|
||||||
let g = cell
|
let g = cell
|
||||||
.get_mut_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
|
.get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(*g, "now initialized");
|
assert_eq!(*g, "now initialized");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test(start_paused = true)]
|
||||||
|
async fn reproduce_init_take_deinit_race() {
|
||||||
|
init_take_deinit_scenario(|cell, factory| {
|
||||||
|
Box::pin(async {
|
||||||
|
cell.get_or_init(factory).await.unwrap();
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.await;
|
||||||
|
}
|
||||||
|
|
||||||
|
type BoxedInitFuture<T, E> = Pin<Box<dyn Future<Output = Result<(T, InitPermit), E>>>>;
|
||||||
|
type BoxedInitFunction<T, E> = Box<dyn Fn(InitPermit) -> BoxedInitFuture<T, E>>;
|
||||||
|
|
||||||
|
/// Reproduce an assertion failure.
|
||||||
|
///
|
||||||
|
/// This has interesting generics to be generic between `get_or_init` and `get_mut_or_init`.
|
||||||
|
/// We currently only have one, but the structure is kept.
|
||||||
|
async fn init_take_deinit_scenario<F>(init_way: F)
|
||||||
|
where
|
||||||
|
F: for<'a> Fn(
|
||||||
|
&'a OnceCell<&'static str>,
|
||||||
|
BoxedInitFunction<&'static str, Infallible>,
|
||||||
|
) -> Pin<Box<dyn Future<Output = ()> + 'a>>,
|
||||||
|
{
|
||||||
|
let cell = OnceCell::default();
|
||||||
|
|
||||||
|
// acquire the init_semaphore only permit to drive initializing tasks in order to waiting
|
||||||
|
// on the same semaphore.
|
||||||
|
let permit = cell
|
||||||
|
.inner
|
||||||
|
.lock()
|
||||||
|
.unwrap()
|
||||||
|
.init_semaphore
|
||||||
|
.clone()
|
||||||
|
.try_acquire_owned()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut t1 = pin!(init_way(
|
||||||
|
&cell,
|
||||||
|
Box::new(|permit| Box::pin(async move { Ok(("t1", permit)) })),
|
||||||
|
));
|
||||||
|
|
||||||
|
let mut t2 = pin!(init_way(
|
||||||
|
&cell,
|
||||||
|
Box::new(|permit| Box::pin(async move { Ok(("t2", permit)) })),
|
||||||
|
));
|
||||||
|
|
||||||
|
// drive t2 first to the init_semaphore -- the timeout will be hit once t2 future can
|
||||||
|
// no longer make progress
|
||||||
|
tokio::select! {
|
||||||
|
_ = &mut t2 => unreachable!("it cannot get permit"),
|
||||||
|
_ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
// followed by t1 in the init_semaphore
|
||||||
|
tokio::select! {
|
||||||
|
_ = &mut t1 => unreachable!("it cannot get permit"),
|
||||||
|
_ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
// now let t2 proceed and initialize
|
||||||
|
drop(permit);
|
||||||
|
t2.await;
|
||||||
|
|
||||||
|
let (s, permit) = { cell.get().unwrap().take_and_deinit() };
|
||||||
|
assert_eq!("t2", s);
|
||||||
|
|
||||||
|
// now originally t1 would see the semaphore it has as closed. it cannot yet get a permit from
|
||||||
|
// the new one.
|
||||||
|
tokio::select! {
|
||||||
|
_ = &mut t1 => unreachable!("it cannot get permit"),
|
||||||
|
_ = tokio::time::sleep(Duration::from_secs(3600 * 24 * 7 * 365)) => {}
|
||||||
|
}
|
||||||
|
|
||||||
|
// only now we get to initialize it
|
||||||
|
drop(permit);
|
||||||
|
t1.await;
|
||||||
|
|
||||||
|
assert_eq!("t1", *cell.get().unwrap());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,6 +34,9 @@ fn main() -> anyhow::Result<()> {
|
|||||||
println!("cargo:rustc-link-lib=static=walproposer");
|
println!("cargo:rustc-link-lib=static=walproposer");
|
||||||
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
|
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
|
||||||
|
|
||||||
|
// Rebuild crate when libwalproposer.a changes
|
||||||
|
println!("cargo:rerun-if-changed={walproposer_lib_search_str}/libwalproposer.a");
|
||||||
|
|
||||||
let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
|
let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
|
||||||
let inc_server_path: String = if pg_config_bin.exists() {
|
let inc_server_path: String = if pg_config_bin.exists() {
|
||||||
let output = Command::new(pg_config_bin)
|
let output = Command::new(pg_config_bin)
|
||||||
@@ -79,6 +82,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
.allowlist_function("WalProposerBroadcast")
|
.allowlist_function("WalProposerBroadcast")
|
||||||
.allowlist_function("WalProposerPoll")
|
.allowlist_function("WalProposerPoll")
|
||||||
.allowlist_function("WalProposerFree")
|
.allowlist_function("WalProposerFree")
|
||||||
|
.allowlist_function("SafekeeperStateDesiredEvents")
|
||||||
.allowlist_var("DEBUG5")
|
.allowlist_var("DEBUG5")
|
||||||
.allowlist_var("DEBUG4")
|
.allowlist_var("DEBUG4")
|
||||||
.allowlist_var("DEBUG3")
|
.allowlist_var("DEBUG3")
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ use crate::bindings::WalProposerExecStatusType;
|
|||||||
use crate::bindings::WalproposerShmemState;
|
use crate::bindings::WalproposerShmemState;
|
||||||
use crate::bindings::XLogRecPtr;
|
use crate::bindings::XLogRecPtr;
|
||||||
use crate::walproposer::ApiImpl;
|
use crate::walproposer::ApiImpl;
|
||||||
|
use crate::walproposer::StreamingCallback;
|
||||||
use crate::walproposer::WaitResult;
|
use crate::walproposer::WaitResult;
|
||||||
|
|
||||||
extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
|
extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
|
||||||
@@ -36,7 +37,8 @@ extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
|
|||||||
unsafe {
|
unsafe {
|
||||||
let callback_data = (*(*wp).config).callback_data;
|
let callback_data = (*(*wp).config).callback_data;
|
||||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
(*api).start_streaming(startpos)
|
let callback = StreamingCallback::new(wp);
|
||||||
|
(*api).start_streaming(startpos, &callback);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,19 +136,18 @@ extern "C" fn conn_async_read(
|
|||||||
unsafe {
|
unsafe {
|
||||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
let (res, result) = (*api).conn_async_read(&mut (*sk));
|
|
||||||
|
|
||||||
// This function has guarantee that returned buf will be valid until
|
// This function has guarantee that returned buf will be valid until
|
||||||
// the next call. So we can store a Vec in each Safekeeper and reuse
|
// the next call. So we can store a Vec in each Safekeeper and reuse
|
||||||
// it on the next call.
|
// it on the next call.
|
||||||
let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
|
let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
|
||||||
|
|
||||||
inbuf.clear();
|
inbuf.clear();
|
||||||
inbuf.extend_from_slice(res);
|
|
||||||
|
let result = (*api).conn_async_read(&mut (*sk), &mut inbuf);
|
||||||
|
|
||||||
// Put a Vec back to sk->inbuf and return data ptr.
|
// Put a Vec back to sk->inbuf and return data ptr.
|
||||||
|
*amount = inbuf.len() as i32;
|
||||||
*buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
|
*buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
|
||||||
*amount = res.len() as i32;
|
|
||||||
|
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
@@ -182,6 +183,10 @@ extern "C" fn recovery_download(wp: *mut WalProposer, sk: *mut Safekeeper) -> bo
|
|||||||
unsafe {
|
unsafe {
|
||||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||||
|
|
||||||
|
// currently `recovery_download` is always called right after election
|
||||||
|
(*api).after_election(&mut (*wp));
|
||||||
|
|
||||||
(*api).recovery_download(&mut (*wp), &mut (*sk))
|
(*api).recovery_download(&mut (*wp), &mut (*sk))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -277,7 +282,8 @@ extern "C" fn wait_event_set(
|
|||||||
}
|
}
|
||||||
WaitResult::Timeout => {
|
WaitResult::Timeout => {
|
||||||
*event_sk = std::ptr::null_mut();
|
*event_sk = std::ptr::null_mut();
|
||||||
*events = crate::bindings::WL_TIMEOUT;
|
// WaitEventSetWait returns 0 for timeout.
|
||||||
|
*events = 0;
|
||||||
0
|
0
|
||||||
}
|
}
|
||||||
WaitResult::Network(sk, event_mask) => {
|
WaitResult::Network(sk, event_mask) => {
|
||||||
@@ -340,7 +346,7 @@ extern "C" fn log_internal(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub enum Level {
|
pub enum Level {
|
||||||
Debug5,
|
Debug5,
|
||||||
Debug4,
|
Debug4,
|
||||||
|
|||||||
@@ -1,13 +1,13 @@
|
|||||||
use std::ffi::CString;
|
use std::ffi::CString;
|
||||||
|
|
||||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||||
use utils::id::TenantTimelineId;
|
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
api_bindings::{create_api, take_vec_u8, Level},
|
api_bindings::{create_api, take_vec_u8, Level},
|
||||||
bindings::{
|
bindings::{
|
||||||
NeonWALReadResult, Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate,
|
NeonWALReadResult, Safekeeper, WalProposer, WalProposerBroadcast, WalProposerConfig,
|
||||||
WalProposerFree, WalProposerStart,
|
WalProposerCreate, WalProposerFree, WalProposerPoll, WalProposerStart,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -16,11 +16,11 @@ use crate::{
|
|||||||
///
|
///
|
||||||
/// Refer to `pgxn/neon/walproposer.h` for documentation.
|
/// Refer to `pgxn/neon/walproposer.h` for documentation.
|
||||||
pub trait ApiImpl {
|
pub trait ApiImpl {
|
||||||
fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
|
fn get_shmem_state(&self) -> *mut crate::bindings::WalproposerShmemState {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn start_streaming(&self, _startpos: u64) {
|
fn start_streaming(&self, _startpos: u64, _callback: &StreamingCallback) {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -70,7 +70,11 @@ pub trait ApiImpl {
|
|||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
|
fn conn_async_read(
|
||||||
|
&self,
|
||||||
|
_sk: &mut Safekeeper,
|
||||||
|
_vec: &mut Vec<u8>,
|
||||||
|
) -> crate::bindings::PGAsyncReadResult {
|
||||||
todo!()
|
todo!()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,12 +155,14 @@ pub trait ApiImpl {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
pub enum WaitResult {
|
pub enum WaitResult {
|
||||||
Latch,
|
Latch,
|
||||||
Timeout,
|
Timeout,
|
||||||
Network(*mut Safekeeper, u32),
|
Network(*mut Safekeeper, u32),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
pub struct Config {
|
pub struct Config {
|
||||||
/// Tenant and timeline id
|
/// Tenant and timeline id
|
||||||
pub ttid: TenantTimelineId,
|
pub ttid: TenantTimelineId,
|
||||||
@@ -242,6 +248,24 @@ impl Drop for Wrapper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub struct StreamingCallback {
|
||||||
|
wp: *mut WalProposer,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StreamingCallback {
|
||||||
|
pub fn new(wp: *mut WalProposer) -> StreamingCallback {
|
||||||
|
StreamingCallback { wp }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn broadcast(&self, startpos: Lsn, endpos: Lsn) {
|
||||||
|
unsafe { WalProposerBroadcast(self.wp, startpos.0, endpos.0) }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn poll(&self) {
|
||||||
|
unsafe { WalProposerPoll(self.wp) }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use core::panic;
|
use core::panic;
|
||||||
@@ -344,14 +368,13 @@ mod tests {
|
|||||||
fn conn_async_read(
|
fn conn_async_read(
|
||||||
&self,
|
&self,
|
||||||
_: &mut crate::bindings::Safekeeper,
|
_: &mut crate::bindings::Safekeeper,
|
||||||
) -> (&[u8], crate::bindings::PGAsyncReadResult) {
|
vec: &mut Vec<u8>,
|
||||||
|
) -> crate::bindings::PGAsyncReadResult {
|
||||||
println!("conn_async_read");
|
println!("conn_async_read");
|
||||||
let reply = self.next_safekeeper_reply();
|
let reply = self.next_safekeeper_reply();
|
||||||
println!("conn_async_read result: {:?}", reply);
|
println!("conn_async_read result: {:?}", reply);
|
||||||
(
|
vec.extend_from_slice(reply);
|
||||||
reply,
|
crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS
|
||||||
crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
|
fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
|
||||||
|
|||||||
@@ -56,10 +56,18 @@ pub enum ForceAwaitLogicalSize {
|
|||||||
|
|
||||||
impl Client {
|
impl Client {
|
||||||
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
||||||
|
Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn from_client(
|
||||||
|
client: reqwest::Client,
|
||||||
|
mgmt_api_endpoint: String,
|
||||||
|
jwt: Option<&str>,
|
||||||
|
) -> Self {
|
||||||
Self {
|
Self {
|
||||||
mgmt_api_endpoint,
|
mgmt_api_endpoint,
|
||||||
authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
|
authorization_header: jwt.map(|jwt| format!("Bearer {jwt}")),
|
||||||
client: reqwest::Client::new(),
|
client,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -310,6 +318,22 @@ impl Client {
|
|||||||
.map_err(Error::ReceiveBody)
|
.map_err(Error::ReceiveBody)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn tenant_shard_split(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
req: TenantShardSplitRequest,
|
||||||
|
) -> Result<TenantShardSplitResponse> {
|
||||||
|
let uri = format!(
|
||||||
|
"{}/v1/tenant/{}/shard_split",
|
||||||
|
self.mgmt_api_endpoint, tenant_shard_id
|
||||||
|
);
|
||||||
|
self.request(Method::PUT, &uri, req)
|
||||||
|
.await?
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(Error::ReceiveBody)
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn timeline_list(
|
pub async fn timeline_list(
|
||||||
&self,
|
&self,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_shard_id: &TenantShardId,
|
||||||
|
|||||||
@@ -274,6 +274,10 @@ fn start_pageserver(
|
|||||||
set_launch_timestamp_metric(launch_ts);
|
set_launch_timestamp_metric(launch_ts);
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
|
metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
|
||||||
|
metrics::register_internal(Box::new(
|
||||||
|
pageserver::metrics::tokio_epoll_uring::Collector::new(),
|
||||||
|
))
|
||||||
|
.unwrap();
|
||||||
pageserver::preinitialize_metrics();
|
pageserver::preinitialize_metrics();
|
||||||
|
|
||||||
// If any failpoints were set from FAILPOINTS environment variable,
|
// If any failpoints were set from FAILPOINTS environment variable,
|
||||||
|
|||||||
@@ -234,7 +234,7 @@ impl DeletionHeader {
|
|||||||
let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
|
let header_bytes = serde_json::to_vec(self).context("serialize deletion header")?;
|
||||||
let header_path = conf.deletion_header_path();
|
let header_path = conf.deletion_header_path();
|
||||||
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
|
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
|
||||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
|
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, header_bytes)
|
||||||
.await
|
.await
|
||||||
.maybe_fatal_err("save deletion header")?;
|
.maybe_fatal_err("save deletion header")?;
|
||||||
|
|
||||||
@@ -325,7 +325,7 @@ impl DeletionList {
|
|||||||
let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
|
let temp_path = path_with_suffix_extension(&path, TEMP_SUFFIX);
|
||||||
|
|
||||||
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
|
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
|
||||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
|
VirtualFile::crashsafe_overwrite(&path, &temp_path, bytes)
|
||||||
.await
|
.await
|
||||||
.maybe_fatal_err("save deletion list")
|
.maybe_fatal_err("save deletion list")
|
||||||
.map_err(Into::into)
|
.map_err(Into::into)
|
||||||
|
|||||||
@@ -623,6 +623,7 @@ impl std::fmt::Display for EvictionLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
pub(crate) struct DiskUsageEvictionInfo {
|
pub(crate) struct DiskUsageEvictionInfo {
|
||||||
/// Timeline's largest layer (remote or resident)
|
/// Timeline's largest layer (remote or resident)
|
||||||
pub max_layer_size: Option<u64>,
|
pub max_layer_size: Option<u64>,
|
||||||
@@ -854,19 +855,27 @@ async fn collect_eviction_candidates(
|
|||||||
|
|
||||||
let total = tenant_candidates.len();
|
let total = tenant_candidates.len();
|
||||||
|
|
||||||
for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
|
let tenant_candidates =
|
||||||
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
tenant_candidates
|
||||||
// be 1.0; this is for us to evict it last.
|
.into_iter()
|
||||||
candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
|
.enumerate()
|
||||||
|
.map(|(i, mut candidate)| {
|
||||||
|
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
||||||
|
// be 1.0; this is for us to evict it last.
|
||||||
|
candidate.relative_last_activity =
|
||||||
|
eviction_order.relative_last_activity(total, i);
|
||||||
|
|
||||||
let partition = if cumsum > min_resident_size as i128 {
|
let partition = if cumsum > min_resident_size as i128 {
|
||||||
MinResidentSizePartition::Above
|
MinResidentSizePartition::Above
|
||||||
} else {
|
} else {
|
||||||
MinResidentSizePartition::Below
|
MinResidentSizePartition::Below
|
||||||
};
|
};
|
||||||
cumsum += i128::from(candidate.layer.get_file_size());
|
cumsum += i128::from(candidate.layer.get_file_size());
|
||||||
candidates.push((partition, candidate));
|
|
||||||
}
|
(partition, candidate)
|
||||||
|
});
|
||||||
|
|
||||||
|
candidates.extend(tenant_candidates);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Note: the same tenant ID might be hit twice, if it transitions from attached to
|
// Note: the same tenant ID might be hit twice, if it transitions from attached to
|
||||||
@@ -882,21 +891,41 @@ async fn collect_eviction_candidates(
|
|||||||
);
|
);
|
||||||
|
|
||||||
for secondary_tenant in secondary_tenants {
|
for secondary_tenant in secondary_tenants {
|
||||||
let mut layer_info = secondary_tenant.get_layers_for_eviction();
|
// for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
|
||||||
|
// to prevent repeated disk usage based evictions from completely draining less often
|
||||||
|
// updating secondaries.
|
||||||
|
let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
|
||||||
|
|
||||||
|
debug_assert!(
|
||||||
|
total_layers >= layer_info.resident_layers.len(),
|
||||||
|
"total_layers ({total_layers}) must be at least the resident_layers.len() ({})",
|
||||||
|
layer_info.resident_layers.len()
|
||||||
|
);
|
||||||
|
|
||||||
layer_info
|
layer_info
|
||||||
.resident_layers
|
.resident_layers
|
||||||
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
|
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||||
|
|
||||||
candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
|
let tenant_candidates =
|
||||||
(
|
layer_info
|
||||||
// Secondary locations' layers are always considered above the min resident size,
|
.resident_layers
|
||||||
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
.into_iter()
|
||||||
// the layers have sufficiently old access times.
|
.enumerate()
|
||||||
MinResidentSizePartition::Above,
|
.map(|(i, mut candidate)| {
|
||||||
candidate,
|
candidate.relative_last_activity =
|
||||||
)
|
eviction_order.relative_last_activity(total_layers, i);
|
||||||
}));
|
(
|
||||||
|
// Secondary locations' layers are always considered above the min resident size,
|
||||||
|
// i.e. secondary locations are permitted to be trimmed to zero layers if all
|
||||||
|
// the layers have sufficiently old access times.
|
||||||
|
MinResidentSizePartition::Above,
|
||||||
|
candidate,
|
||||||
|
)
|
||||||
|
});
|
||||||
|
|
||||||
|
candidates.extend(tenant_candidates);
|
||||||
|
|
||||||
|
tokio::task::yield_now().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||||
|
|||||||
@@ -19,11 +19,14 @@ use pageserver_api::models::ShardParameters;
|
|||||||
use pageserver_api::models::TenantDetails;
|
use pageserver_api::models::TenantDetails;
|
||||||
use pageserver_api::models::TenantLocationConfigResponse;
|
use pageserver_api::models::TenantLocationConfigResponse;
|
||||||
use pageserver_api::models::TenantShardLocation;
|
use pageserver_api::models::TenantShardLocation;
|
||||||
|
use pageserver_api::models::TenantShardSplitRequest;
|
||||||
|
use pageserver_api::models::TenantShardSplitResponse;
|
||||||
use pageserver_api::models::TenantState;
|
use pageserver_api::models::TenantState;
|
||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||||
TenantLoadRequest, TenantLocationConfigRequest,
|
TenantLoadRequest, TenantLocationConfigRequest,
|
||||||
};
|
};
|
||||||
|
use pageserver_api::shard::ShardCount;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use remote_storage::TimeTravelError;
|
use remote_storage::TimeTravelError;
|
||||||
@@ -419,6 +422,7 @@ async fn build_timeline_info_common(
|
|||||||
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
||||||
tenant::timeline::logical_size::Accuracy::Exact => true,
|
tenant::timeline::logical_size::Accuracy::Exact => true,
|
||||||
},
|
},
|
||||||
|
directory_entries_counts: timeline.get_directory_metrics().to_vec(),
|
||||||
current_physical_size,
|
current_physical_size,
|
||||||
current_logical_size_non_incremental: None,
|
current_logical_size_non_incremental: None,
|
||||||
timeline_dir_layer_file_size_sum: None,
|
timeline_dir_layer_file_size_sum: None,
|
||||||
@@ -485,7 +489,9 @@ async fn timeline_create_handler(
|
|||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
|
|
||||||
async {
|
async {
|
||||||
let tenant = state.tenant_manager.get_attached_tenant_shard(tenant_shard_id, false)?;
|
let tenant = state
|
||||||
|
.tenant_manager
|
||||||
|
.get_attached_tenant_shard(tenant_shard_id, false)?;
|
||||||
|
|
||||||
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
|
||||||
|
|
||||||
@@ -495,48 +501,62 @@ async fn timeline_create_handler(
|
|||||||
tracing::info!("bootstrapping");
|
tracing::info!("bootstrapping");
|
||||||
}
|
}
|
||||||
|
|
||||||
match tenant.create_timeline(
|
match tenant
|
||||||
new_timeline_id,
|
.create_timeline(
|
||||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
new_timeline_id,
|
||||||
request_data.ancestor_start_lsn,
|
request_data.ancestor_timeline_id,
|
||||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
request_data.ancestor_start_lsn,
|
||||||
request_data.existing_initdb_timeline_id,
|
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||||
state.broker_client.clone(),
|
request_data.existing_initdb_timeline_id,
|
||||||
&ctx,
|
state.broker_client.clone(),
|
||||||
)
|
&ctx,
|
||||||
.await {
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
Ok(new_timeline) => {
|
Ok(new_timeline) => {
|
||||||
// Created. Construct a TimelineInfo for it.
|
// Created. Construct a TimelineInfo for it.
|
||||||
let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
|
let timeline_info = build_timeline_info_common(
|
||||||
.await
|
&new_timeline,
|
||||||
.map_err(ApiError::InternalServerError)?;
|
&ctx,
|
||||||
|
tenant::timeline::GetLogicalSizePriority::User,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
json_response(StatusCode::CREATED, timeline_info)
|
json_response(StatusCode::CREATED, timeline_info)
|
||||||
}
|
}
|
||||||
Err(_) if tenant.cancel.is_cancelled() => {
|
Err(_) if tenant.cancel.is_cancelled() => {
|
||||||
// In case we get some ugly error type during shutdown, cast it into a clean 503.
|
// In case we get some ugly error type during shutdown, cast it into a clean 503.
|
||||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg("Tenant shutting down".to_string()))
|
json_response(
|
||||||
}
|
StatusCode::SERVICE_UNAVAILABLE,
|
||||||
Err(tenant::CreateTimelineError::Conflict | tenant::CreateTimelineError::AlreadyCreating) => {
|
HttpErrorBody::from_msg("Tenant shutting down".to_string()),
|
||||||
json_response(StatusCode::CONFLICT, ())
|
)
|
||||||
}
|
|
||||||
Err(tenant::CreateTimelineError::AncestorLsn(err)) => {
|
|
||||||
json_response(StatusCode::NOT_ACCEPTABLE, HttpErrorBody::from_msg(
|
|
||||||
format!("{err:#}")
|
|
||||||
))
|
|
||||||
}
|
|
||||||
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
|
|
||||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
|
|
||||||
}
|
|
||||||
Err(tenant::CreateTimelineError::ShuttingDown) => {
|
|
||||||
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
|
|
||||||
}
|
}
|
||||||
|
Err(
|
||||||
|
tenant::CreateTimelineError::Conflict
|
||||||
|
| tenant::CreateTimelineError::AlreadyCreating,
|
||||||
|
) => json_response(StatusCode::CONFLICT, ()),
|
||||||
|
Err(tenant::CreateTimelineError::AncestorLsn(err)) => json_response(
|
||||||
|
StatusCode::NOT_ACCEPTABLE,
|
||||||
|
HttpErrorBody::from_msg(format!("{err:#}")),
|
||||||
|
),
|
||||||
|
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => json_response(
|
||||||
|
StatusCode::SERVICE_UNAVAILABLE,
|
||||||
|
HttpErrorBody::from_msg(e.to_string()),
|
||||||
|
),
|
||||||
|
Err(tenant::CreateTimelineError::ShuttingDown) => json_response(
|
||||||
|
StatusCode::SERVICE_UNAVAILABLE,
|
||||||
|
HttpErrorBody::from_msg("tenant shutting down".to_string()),
|
||||||
|
),
|
||||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_create",
|
.instrument(info_span!("timeline_create",
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
shard_id = %tenant_shard_id.shard_slug(),
|
shard_id = %tenant_shard_id.shard_slug(),
|
||||||
timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
timeline_id = %new_timeline_id,
|
||||||
|
lsn=?request_data.ancestor_start_lsn,
|
||||||
|
pg_version=?request_data.pg_version
|
||||||
|
))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -875,7 +895,7 @@ async fn tenant_reset_handler(
|
|||||||
let state = get_state(&request);
|
let state = get_state(&request);
|
||||||
state
|
state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
|
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
@@ -1104,6 +1124,25 @@ async fn tenant_size_handler(
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn tenant_shard_split_handler(
|
||||||
|
mut request: Request<Body>,
|
||||||
|
_cancel: CancellationToken,
|
||||||
|
) -> Result<Response<Body>, ApiError> {
|
||||||
|
let req: TenantShardSplitRequest = json_request(&mut request).await?;
|
||||||
|
|
||||||
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
|
let state = get_state(&request);
|
||||||
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
|
|
||||||
|
let new_shards = state
|
||||||
|
.tenant_manager
|
||||||
|
.shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
|
json_response(StatusCode::OK, TenantShardSplitResponse { new_shards })
|
||||||
|
}
|
||||||
|
|
||||||
async fn layer_map_info_handler(
|
async fn layer_map_info_handler(
|
||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
@@ -2063,6 +2102,9 @@ pub fn make_router(
|
|||||||
.put("/v1/tenant/config", |r| {
|
.put("/v1/tenant/config", |r| {
|
||||||
api_handler(r, update_tenant_config_handler)
|
api_handler(r, update_tenant_config_handler)
|
||||||
})
|
})
|
||||||
|
.put("/v1/tenant/:tenant_shard_id/shard_split", |r| {
|
||||||
|
api_handler(r, tenant_shard_split_handler)
|
||||||
|
})
|
||||||
.get("/v1/tenant/:tenant_shard_id/config", |r| {
|
.get("/v1/tenant/:tenant_shard_id/config", |r| {
|
||||||
api_handler(r, get_tenant_config_handler)
|
api_handler(r, get_tenant_config_handler)
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -602,6 +602,15 @@ pub(crate) mod initial_logical_size {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static DIRECTORY_ENTRIES_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||||
|
register_uint_gauge_vec!(
|
||||||
|
"pageserver_directory_entries_count",
|
||||||
|
"Sum of the entries in pageserver-stored directory listings",
|
||||||
|
&["tenant_id", "shard_id", "timeline_id"]
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric")
|
||||||
|
});
|
||||||
|
|
||||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||||
register_uint_gauge_vec!(
|
register_uint_gauge_vec!(
|
||||||
"pageserver_tenant_states_count",
|
"pageserver_tenant_states_count",
|
||||||
@@ -1809,6 +1818,7 @@ pub(crate) struct TimelineMetrics {
|
|||||||
resident_physical_size_gauge: UIntGauge,
|
resident_physical_size_gauge: UIntGauge,
|
||||||
/// copy of LayeredTimeline.current_logical_size
|
/// copy of LayeredTimeline.current_logical_size
|
||||||
pub current_logical_size_gauge: UIntGauge,
|
pub current_logical_size_gauge: UIntGauge,
|
||||||
|
pub directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>>,
|
||||||
pub num_persistent_files_created: IntCounter,
|
pub num_persistent_files_created: IntCounter,
|
||||||
pub persistent_bytes_written: IntCounter,
|
pub persistent_bytes_written: IntCounter,
|
||||||
pub evictions: IntCounter,
|
pub evictions: IntCounter,
|
||||||
@@ -1818,12 +1828,12 @@ pub(crate) struct TimelineMetrics {
|
|||||||
impl TimelineMetrics {
|
impl TimelineMetrics {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_shard_id: &TenantShardId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id_raw: &TimelineId,
|
||||||
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||||
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||||
let timeline_id = timeline_id.to_string();
|
let timeline_id = timeline_id_raw.to_string();
|
||||||
let flush_time_histo = StorageTimeMetrics::new(
|
let flush_time_histo = StorageTimeMetrics::new(
|
||||||
StorageTimeOperation::LayerFlush,
|
StorageTimeOperation::LayerFlush,
|
||||||
&tenant_id,
|
&tenant_id,
|
||||||
@@ -1876,6 +1886,22 @@ impl TimelineMetrics {
|
|||||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
// TODO use impl Trait syntax here once we have ability to use it: https://github.com/rust-lang/rust/issues/63065
|
||||||
|
let directory_entries_count_gauge_closure = {
|
||||||
|
let tenant_shard_id = *tenant_shard_id;
|
||||||
|
let timeline_id_raw = *timeline_id_raw;
|
||||||
|
move || {
|
||||||
|
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||||
|
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||||
|
let timeline_id = timeline_id_raw.to_string();
|
||||||
|
let gauge: UIntGauge = DIRECTORY_ENTRIES_COUNT
|
||||||
|
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||||
|
.unwrap();
|
||||||
|
gauge
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let directory_entries_count_gauge: Lazy<UIntGauge, Box<dyn Send + Fn() -> UIntGauge>> =
|
||||||
|
Lazy::new(Box::new(directory_entries_count_gauge_closure));
|
||||||
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
|
let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
|
||||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -1902,6 +1928,7 @@ impl TimelineMetrics {
|
|||||||
last_record_gauge,
|
last_record_gauge,
|
||||||
resident_physical_size_gauge,
|
resident_physical_size_gauge,
|
||||||
current_logical_size_gauge,
|
current_logical_size_gauge,
|
||||||
|
directory_entries_count_gauge,
|
||||||
num_persistent_files_created,
|
num_persistent_files_created,
|
||||||
persistent_bytes_written,
|
persistent_bytes_written,
|
||||||
evictions,
|
evictions,
|
||||||
@@ -1944,6 +1971,9 @@ impl Drop for TimelineMetrics {
|
|||||||
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||||
}
|
}
|
||||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||||
|
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
|
||||||
|
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||||
|
}
|
||||||
let _ =
|
let _ =
|
||||||
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
|
||||||
@@ -2400,6 +2430,72 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub mod tokio_epoll_uring {
|
||||||
|
use metrics::UIntGauge;
|
||||||
|
|
||||||
|
pub struct Collector {
|
||||||
|
descs: Vec<metrics::core::Desc>,
|
||||||
|
systems_created: UIntGauge,
|
||||||
|
systems_destroyed: UIntGauge,
|
||||||
|
}
|
||||||
|
|
||||||
|
const NMETRICS: usize = 2;
|
||||||
|
|
||||||
|
impl metrics::core::Collector for Collector {
|
||||||
|
fn desc(&self) -> Vec<&metrics::core::Desc> {
|
||||||
|
self.descs.iter().collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
|
||||||
|
let mut mfs = Vec::with_capacity(NMETRICS);
|
||||||
|
let tokio_epoll_uring::metrics::Metrics {
|
||||||
|
systems_created,
|
||||||
|
systems_destroyed,
|
||||||
|
} = tokio_epoll_uring::metrics::global();
|
||||||
|
self.systems_created.set(systems_created);
|
||||||
|
mfs.extend(self.systems_created.collect());
|
||||||
|
self.systems_destroyed.set(systems_destroyed);
|
||||||
|
mfs.extend(self.systems_destroyed.collect());
|
||||||
|
mfs
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Collector {
|
||||||
|
#[allow(clippy::new_without_default)]
|
||||||
|
pub fn new() -> Self {
|
||||||
|
let mut descs = Vec::new();
|
||||||
|
|
||||||
|
let systems_created = UIntGauge::new(
|
||||||
|
"pageserver_tokio_epoll_uring_systems_created",
|
||||||
|
"counter of tokio-epoll-uring systems that were created",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
descs.extend(
|
||||||
|
metrics::core::Collector::desc(&systems_created)
|
||||||
|
.into_iter()
|
||||||
|
.cloned(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let systems_destroyed = UIntGauge::new(
|
||||||
|
"pageserver_tokio_epoll_uring_systems_destroyed",
|
||||||
|
"counter of tokio-epoll-uring systems that were destroyed",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
descs.extend(
|
||||||
|
metrics::core::Collector::desc(&systems_destroyed)
|
||||||
|
.into_iter()
|
||||||
|
.cloned(),
|
||||||
|
);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
descs,
|
||||||
|
systems_created,
|
||||||
|
systems_destroyed,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn preinitialize_metrics() {
|
pub fn preinitialize_metrics() {
|
||||||
// Python tests need these and on some we do alerting.
|
// Python tests need these and on some we do alerting.
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -91,8 +91,8 @@ const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
|
|||||||
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
|
/// `tokio_tar` already read the first such block. Read the second all-zeros block,
|
||||||
/// and check that there is no more data after the EOF marker.
|
/// and check that there is no more data after the EOF marker.
|
||||||
///
|
///
|
||||||
/// XXX: Currently, any trailing data after the EOF marker prints a warning.
|
/// 'tar' command can also write extra blocks of zeros, up to a record
|
||||||
/// Perhaps it should be a hard error?
|
/// size, controlled by the --record-size argument. Ignore them too.
|
||||||
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
|
async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> {
|
||||||
use tokio::io::AsyncReadExt;
|
use tokio::io::AsyncReadExt;
|
||||||
let mut buf = [0u8; 512];
|
let mut buf = [0u8; 512];
|
||||||
@@ -113,17 +113,24 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
|
|||||||
anyhow::bail!("invalid tar EOF marker");
|
anyhow::bail!("invalid tar EOF marker");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Drain any data after the EOF marker
|
// Drain any extra zero-blocks after the EOF marker
|
||||||
let mut trailing_bytes = 0;
|
let mut trailing_bytes = 0;
|
||||||
|
let mut seen_nonzero_bytes = false;
|
||||||
loop {
|
loop {
|
||||||
let nbytes = reader.read(&mut buf).await?;
|
let nbytes = reader.read(&mut buf).await?;
|
||||||
trailing_bytes += nbytes;
|
trailing_bytes += nbytes;
|
||||||
|
if !buf.iter().all(|&x| x == 0) {
|
||||||
|
seen_nonzero_bytes = true;
|
||||||
|
}
|
||||||
if nbytes == 0 {
|
if nbytes == 0 {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if trailing_bytes > 0 {
|
if seen_nonzero_bytes {
|
||||||
warn!("ignored {trailing_bytes} unexpected bytes after the tar archive");
|
anyhow::bail!("unexpected non-zero bytes after the tar archive");
|
||||||
|
}
|
||||||
|
if trailing_bytes % 512 != 0 {
|
||||||
|
anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive");
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_i
|
|||||||
use crate::walrecord::NeonWalRecord;
|
use crate::walrecord::NeonWalRecord;
|
||||||
use anyhow::{ensure, Context};
|
use anyhow::{ensure, Context};
|
||||||
use bytes::{Buf, Bytes, BytesMut};
|
use bytes::{Buf, Bytes, BytesMut};
|
||||||
|
use enum_map::Enum;
|
||||||
use pageserver_api::key::{
|
use pageserver_api::key::{
|
||||||
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
dbdir_key_range, is_rel_block_key, is_slru_block_key, rel_block_to_key, rel_dir_to_key,
|
||||||
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
rel_key_range, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key,
|
||||||
@@ -155,6 +156,7 @@ impl Timeline {
|
|||||||
pending_updates: HashMap::new(),
|
pending_updates: HashMap::new(),
|
||||||
pending_deletions: Vec::new(),
|
pending_deletions: Vec::new(),
|
||||||
pending_nblocks: 0,
|
pending_nblocks: 0,
|
||||||
|
pending_directory_entries: Vec::new(),
|
||||||
lsn,
|
lsn,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -868,6 +870,7 @@ pub struct DatadirModification<'a> {
|
|||||||
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
|
||||||
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
pending_deletions: Vec<(Range<Key>, Lsn)>,
|
||||||
pending_nblocks: i64,
|
pending_nblocks: i64,
|
||||||
|
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DatadirModification<'a> {
|
impl<'a> DatadirModification<'a> {
|
||||||
@@ -899,6 +902,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
let buf = DbDirectory::ser(&DbDirectory {
|
let buf = DbDirectory::ser(&DbDirectory {
|
||||||
dbdirs: HashMap::new(),
|
dbdirs: HashMap::new(),
|
||||||
})?;
|
})?;
|
||||||
|
self.pending_directory_entries.push((DirectoryKind::Db, 0));
|
||||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||||
|
|
||||||
// Create AuxFilesDirectory
|
// Create AuxFilesDirectory
|
||||||
@@ -907,16 +911,24 @@ impl<'a> DatadirModification<'a> {
|
|||||||
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
|
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
|
||||||
xids: HashSet::new(),
|
xids: HashSet::new(),
|
||||||
})?;
|
})?;
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::TwoPhase, 0));
|
||||||
self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
|
self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));
|
||||||
|
|
||||||
let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
|
let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
|
||||||
let empty_dir = Value::Image(buf);
|
let empty_dir = Value::Image(buf);
|
||||||
self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
|
self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
|
||||||
self.put(
|
self.put(
|
||||||
slru_dir_to_key(SlruKind::MultiXactMembers),
|
slru_dir_to_key(SlruKind::MultiXactMembers),
|
||||||
empty_dir.clone(),
|
empty_dir.clone(),
|
||||||
);
|
);
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
|
||||||
self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
|
self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1017,6 +1029,7 @@ impl<'a> DatadirModification<'a> {
|
|||||||
let buf = RelDirectory::ser(&RelDirectory {
|
let buf = RelDirectory::ser(&RelDirectory {
|
||||||
rels: HashSet::new(),
|
rels: HashSet::new(),
|
||||||
})?;
|
})?;
|
||||||
|
self.pending_directory_entries.push((DirectoryKind::Rel, 0));
|
||||||
self.put(
|
self.put(
|
||||||
rel_dir_to_key(spcnode, dbnode),
|
rel_dir_to_key(spcnode, dbnode),
|
||||||
Value::Image(Bytes::from(buf)),
|
Value::Image(Bytes::from(buf)),
|
||||||
@@ -1039,6 +1052,8 @@ impl<'a> DatadirModification<'a> {
|
|||||||
if !dir.xids.insert(xid) {
|
if !dir.xids.insert(xid) {
|
||||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||||
}
|
}
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::TwoPhase, dir.xids.len()));
|
||||||
self.put(
|
self.put(
|
||||||
TWOPHASEDIR_KEY,
|
TWOPHASEDIR_KEY,
|
||||||
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
|
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
|
||||||
@@ -1074,6 +1089,8 @@ impl<'a> DatadirModification<'a> {
|
|||||||
let mut dir = DbDirectory::des(&buf)?;
|
let mut dir = DbDirectory::des(&buf)?;
|
||||||
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
|
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
|
||||||
let buf = DbDirectory::ser(&dir)?;
|
let buf = DbDirectory::ser(&dir)?;
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::Db, dir.dbdirs.len()));
|
||||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||||
} else {
|
} else {
|
||||||
warn!(
|
warn!(
|
||||||
@@ -1111,6 +1128,8 @@ impl<'a> DatadirModification<'a> {
|
|||||||
// Didn't exist. Update dbdir
|
// Didn't exist. Update dbdir
|
||||||
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
|
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
|
||||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::Db, dbdir.dbdirs.len()));
|
||||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||||
|
|
||||||
// and create the RelDirectory
|
// and create the RelDirectory
|
||||||
@@ -1125,6 +1144,10 @@ impl<'a> DatadirModification<'a> {
|
|||||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||||
return Err(RelationError::AlreadyExists);
|
return Err(RelationError::AlreadyExists);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::Rel, rel_dir.rels.len()));
|
||||||
|
|
||||||
self.put(
|
self.put(
|
||||||
rel_dir_key,
|
rel_dir_key,
|
||||||
Value::Image(Bytes::from(
|
Value::Image(Bytes::from(
|
||||||
@@ -1216,6 +1239,9 @@ impl<'a> DatadirModification<'a> {
|
|||||||
let buf = self.get(dir_key, ctx).await?;
|
let buf = self.get(dir_key, ctx).await?;
|
||||||
let mut dir = RelDirectory::des(&buf)?;
|
let mut dir = RelDirectory::des(&buf)?;
|
||||||
|
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::Rel, dir.rels.len()));
|
||||||
|
|
||||||
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
|
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
|
||||||
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
|
self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
|
||||||
} else {
|
} else {
|
||||||
@@ -1251,6 +1277,8 @@ impl<'a> DatadirModification<'a> {
|
|||||||
if !dir.segments.insert(segno) {
|
if !dir.segments.insert(segno) {
|
||||||
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
|
anyhow::bail!("slru segment {kind:?}/{segno} already exists");
|
||||||
}
|
}
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
|
||||||
self.put(
|
self.put(
|
||||||
dir_key,
|
dir_key,
|
||||||
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
|
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
|
||||||
@@ -1295,6 +1323,8 @@ impl<'a> DatadirModification<'a> {
|
|||||||
if !dir.segments.remove(&segno) {
|
if !dir.segments.remove(&segno) {
|
||||||
warn!("slru segment {:?}/{} does not exist", kind, segno);
|
warn!("slru segment {:?}/{} does not exist", kind, segno);
|
||||||
}
|
}
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
|
||||||
self.put(
|
self.put(
|
||||||
dir_key,
|
dir_key,
|
||||||
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
|
Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
|
||||||
@@ -1325,6 +1355,8 @@ impl<'a> DatadirModification<'a> {
|
|||||||
if !dir.xids.remove(&xid) {
|
if !dir.xids.remove(&xid) {
|
||||||
warn!("twophase file for xid {} does not exist", xid);
|
warn!("twophase file for xid {} does not exist", xid);
|
||||||
}
|
}
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::TwoPhase, dir.xids.len()));
|
||||||
self.put(
|
self.put(
|
||||||
TWOPHASEDIR_KEY,
|
TWOPHASEDIR_KEY,
|
||||||
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
|
Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)),
|
||||||
@@ -1340,6 +1372,8 @@ impl<'a> DatadirModification<'a> {
|
|||||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||||
files: HashMap::new(),
|
files: HashMap::new(),
|
||||||
})?;
|
})?;
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::AuxFiles, 0));
|
||||||
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -1366,6 +1400,9 @@ impl<'a> DatadirModification<'a> {
|
|||||||
} else {
|
} else {
|
||||||
dir.files.insert(path, Bytes::copy_from_slice(content));
|
dir.files.insert(path, Bytes::copy_from_slice(content));
|
||||||
}
|
}
|
||||||
|
self.pending_directory_entries
|
||||||
|
.push((DirectoryKind::AuxFiles, dir.files.len()));
|
||||||
|
|
||||||
self.put(
|
self.put(
|
||||||
AUX_FILES_KEY,
|
AUX_FILES_KEY,
|
||||||
Value::Image(Bytes::from(
|
Value::Image(Bytes::from(
|
||||||
@@ -1427,6 +1464,10 @@ impl<'a> DatadirModification<'a> {
|
|||||||
self.pending_nblocks = 0;
|
self.pending_nblocks = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
|
||||||
|
writer.update_directory_entries_count(kind, count as u64);
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1464,6 +1505,10 @@ impl<'a> DatadirModification<'a> {
|
|||||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
|
||||||
|
writer.update_directory_entries_count(kind, count as u64);
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1588,6 +1633,23 @@ struct SlruSegmentDirectory {
|
|||||||
segments: HashSet<u32>,
|
segments: HashSet<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Copy, Clone, PartialEq, Eq, Debug, enum_map::Enum)]
|
||||||
|
#[repr(u8)]
|
||||||
|
pub(crate) enum DirectoryKind {
|
||||||
|
Db,
|
||||||
|
TwoPhase,
|
||||||
|
Rel,
|
||||||
|
AuxFiles,
|
||||||
|
SlruSegment(SlruKind),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DirectoryKind {
|
||||||
|
pub(crate) const KINDS_NUM: usize = <DirectoryKind as Enum>::LENGTH;
|
||||||
|
pub(crate) fn offset(&self) -> usize {
|
||||||
|
self.into_usize()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||||
|
|
||||||
#[allow(clippy::bool_assert_comparison)]
|
#[allow(clippy::bool_assert_comparison)]
|
||||||
|
|||||||
@@ -576,8 +576,8 @@ pub fn shutdown_token() -> CancellationToken {
|
|||||||
|
|
||||||
/// Has the current task been requested to shut down?
|
/// Has the current task been requested to shut down?
|
||||||
pub fn is_shutdown_requested() -> bool {
|
pub fn is_shutdown_requested() -> bool {
|
||||||
if let Ok(cancel) = SHUTDOWN_TOKEN.try_with(|t| t.clone()) {
|
if let Ok(true_or_false) = SHUTDOWN_TOKEN.try_with(|t| t.is_cancelled()) {
|
||||||
cancel.is_cancelled()
|
true_or_false
|
||||||
} else {
|
} else {
|
||||||
if !cfg!(test) {
|
if !cfg!(test) {
|
||||||
warn!("is_shutdown_requested() called in an unexpected task or thread");
|
warn!("is_shutdown_requested() called in an unexpected task or thread");
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ use self::metadata::TimelineMetadata;
|
|||||||
use self::mgr::GetActiveTenantError;
|
use self::mgr::GetActiveTenantError;
|
||||||
use self::mgr::GetTenantError;
|
use self::mgr::GetTenantError;
|
||||||
use self::mgr::TenantsMap;
|
use self::mgr::TenantsMap;
|
||||||
|
use self::remote_timeline_client::upload::upload_index_part;
|
||||||
use self::remote_timeline_client::RemoteTimelineClient;
|
use self::remote_timeline_client::RemoteTimelineClient;
|
||||||
use self::timeline::uninit::TimelineExclusionError;
|
use self::timeline::uninit::TimelineExclusionError;
|
||||||
use self::timeline::uninit::TimelineUninitMark;
|
use self::timeline::uninit::TimelineUninitMark;
|
||||||
@@ -643,10 +644,10 @@ impl Tenant {
|
|||||||
|
|
||||||
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
|
// The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if
|
||||||
// we shut down while attaching.
|
// we shut down while attaching.
|
||||||
let Ok(attach_gate_guard) = tenant.gate.enter() else {
|
let attach_gate_guard = tenant
|
||||||
// We just created the Tenant: nothing else can have shut it down yet
|
.gate
|
||||||
unreachable!();
|
.enter()
|
||||||
};
|
.expect("We just created the Tenant: nothing else can have shut it down yet");
|
||||||
|
|
||||||
// Do all the hard work in the background
|
// Do all the hard work in the background
|
||||||
let tenant_clone = Arc::clone(&tenant);
|
let tenant_clone = Arc::clone(&tenant);
|
||||||
@@ -754,36 +755,27 @@ impl Tenant {
|
|||||||
AttachType::Normal
|
AttachType::Normal
|
||||||
};
|
};
|
||||||
|
|
||||||
let preload_timer = TENANT.preload.start_timer();
|
let preload = match (&mode, &remote_storage) {
|
||||||
let preload = match mode {
|
(SpawnMode::Create, _) => {
|
||||||
SpawnMode::Create => {
|
|
||||||
// Don't count the skipped preload into the histogram of preload durations
|
|
||||||
preload_timer.stop_and_discard();
|
|
||||||
None
|
None
|
||||||
},
|
},
|
||||||
SpawnMode::Normal => {
|
(SpawnMode::Normal, Some(remote_storage)) => {
|
||||||
match &remote_storage {
|
let _preload_timer = TENANT.preload.start_timer();
|
||||||
Some(remote_storage) => Some(
|
let res = tenant_clone
|
||||||
match tenant_clone
|
.preload(remote_storage, task_mgr::shutdown_token())
|
||||||
.preload(remote_storage, task_mgr::shutdown_token())
|
.await;
|
||||||
.instrument(
|
match res {
|
||||||
tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
|
Ok(p) => Some(p),
|
||||||
)
|
Err(e) => {
|
||||||
.await {
|
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||||
Ok(p) => {
|
return Ok(());
|
||||||
preload_timer.observe_duration();
|
}
|
||||||
p
|
|
||||||
}
|
|
||||||
,
|
|
||||||
Err(e) => {
|
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
},
|
|
||||||
),
|
|
||||||
None => None,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
(SpawnMode::Normal, None) => {
|
||||||
|
let _preload_timer = TENANT.preload.start_timer();
|
||||||
|
None
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Remote preload is complete.
|
// Remote preload is complete.
|
||||||
@@ -819,36 +811,37 @@ impl Tenant {
|
|||||||
info!("ready for backgound jobs barrier");
|
info!("ready for backgound jobs barrier");
|
||||||
}
|
}
|
||||||
|
|
||||||
match DeleteTenantFlow::resume_from_attach(
|
let deleted = DeleteTenantFlow::resume_from_attach(
|
||||||
deletion,
|
deletion,
|
||||||
&tenant_clone,
|
&tenant_clone,
|
||||||
preload,
|
preload,
|
||||||
tenants,
|
tenants,
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.await
|
.await;
|
||||||
{
|
|
||||||
Err(err) => {
|
if let Err(e) = deleted {
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||||
return Ok(());
|
|
||||||
}
|
|
||||||
Ok(()) => return Ok(()),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
|
// We will time the duration of the attach phase unless this is a creation (attach will do no work)
|
||||||
let attach_timer = match mode {
|
let attached = {
|
||||||
SpawnMode::Create => None,
|
let _attach_timer = match mode {
|
||||||
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
SpawnMode::Create => None,
|
||||||
|
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
|
||||||
|
};
|
||||||
|
tenant_clone.attach(preload, mode, &ctx).await
|
||||||
};
|
};
|
||||||
match tenant_clone.attach(preload, mode, &ctx).await {
|
|
||||||
|
match attached {
|
||||||
Ok(()) => {
|
Ok(()) => {
|
||||||
info!("attach finished, activating");
|
info!("attach finished, activating");
|
||||||
if let Some(t)= attach_timer {t.observe_duration();}
|
|
||||||
tenant_clone.activate(broker_client, None, &ctx);
|
tenant_clone.activate(broker_client, None, &ctx);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
if let Some(t)= attach_timer {t.observe_duration();}
|
|
||||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -861,34 +854,26 @@ impl Tenant {
|
|||||||
// logical size calculations: if logical size calculation semaphore is saturated,
|
// logical size calculations: if logical size calculation semaphore is saturated,
|
||||||
// then warmup will wait for that before proceeding to the next tenant.
|
// then warmup will wait for that before proceeding to the next tenant.
|
||||||
if let AttachType::Warmup(_permit) = attach_type {
|
if let AttachType::Warmup(_permit) = attach_type {
|
||||||
let mut futs = FuturesUnordered::new();
|
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
|
||||||
let timelines: Vec<_> = tenant_clone.timelines.lock().unwrap().values().cloned().collect();
|
|
||||||
for t in timelines {
|
|
||||||
futs.push(t.await_initial_logical_size())
|
|
||||||
}
|
|
||||||
tracing::info!("Waiting for initial logical sizes while warming up...");
|
tracing::info!("Waiting for initial logical sizes while warming up...");
|
||||||
while futs.next().await.is_some() {
|
while futs.next().await.is_some() {}
|
||||||
|
|
||||||
}
|
|
||||||
tracing::info!("Warm-up complete");
|
tracing::info!("Warm-up complete");
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument({
|
.instrument(tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation)),
|
||||||
let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation);
|
|
||||||
span.follows_from(Span::current());
|
|
||||||
span
|
|
||||||
}),
|
|
||||||
);
|
);
|
||||||
Ok(tenant)
|
Ok(tenant)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[instrument(skip_all)]
|
||||||
pub(crate) async fn preload(
|
pub(crate) async fn preload(
|
||||||
self: &Arc<Tenant>,
|
self: &Arc<Tenant>,
|
||||||
remote_storage: &GenericRemoteStorage,
|
remote_storage: &GenericRemoteStorage,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> anyhow::Result<TenantPreload> {
|
) -> anyhow::Result<TenantPreload> {
|
||||||
|
span::debug_assert_current_span_has_tenant_id();
|
||||||
// Get list of remote timelines
|
// Get list of remote timelines
|
||||||
// download index files for every tenant timeline
|
// download index files for every tenant timeline
|
||||||
info!("listing remote timelines");
|
info!("listing remote timelines");
|
||||||
@@ -1376,7 +1361,7 @@ impl Tenant {
|
|||||||
async move {
|
async move {
|
||||||
debug!("starting index part download");
|
debug!("starting index part download");
|
||||||
|
|
||||||
let index_part = client.download_index_file(cancel_clone).await;
|
let index_part = client.download_index_file(&cancel_clone).await;
|
||||||
|
|
||||||
debug!("finished index part download");
|
debug!("finished index part download");
|
||||||
|
|
||||||
@@ -2397,6 +2382,67 @@ impl Tenant {
|
|||||||
pub(crate) fn get_generation(&self) -> Generation {
|
pub(crate) fn get_generation(&self) -> Generation {
|
||||||
self.generation
|
self.generation
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This function partially shuts down the tenant (it shuts down the Timelines) and is fallible,
|
||||||
|
/// and can leave the tenant in a bad state if it fails. The caller is responsible for
|
||||||
|
/// resetting this tenant to a valid state if we fail.
|
||||||
|
pub(crate) async fn split_prepare(
|
||||||
|
&self,
|
||||||
|
child_shards: &Vec<TenantShardId>,
|
||||||
|
) -> anyhow::Result<()> {
|
||||||
|
let timelines = self.timelines.lock().unwrap().clone();
|
||||||
|
for timeline in timelines.values() {
|
||||||
|
let Some(tl_client) = &timeline.remote_client else {
|
||||||
|
anyhow::bail!("Remote storage is mandatory");
|
||||||
|
};
|
||||||
|
|
||||||
|
let Some(remote_storage) = &self.remote_storage else {
|
||||||
|
anyhow::bail!("Remote storage is mandatory");
|
||||||
|
};
|
||||||
|
|
||||||
|
// We do not block timeline creation/deletion during splits inside the pageserver: it is up to higher levels
|
||||||
|
// to ensure that they do not start a split if currently in the process of doing these.
|
||||||
|
|
||||||
|
// Upload an index from the parent: this is partly to provide freshness for the
|
||||||
|
// child tenants that will copy it, and partly for general ease-of-debugging: there will
|
||||||
|
// always be a parent shard index in the same generation as we wrote the child shard index.
|
||||||
|
tl_client.schedule_index_upload_for_file_changes()?;
|
||||||
|
tl_client.wait_completion().await?;
|
||||||
|
|
||||||
|
// Shut down the timeline's remote client: this means that the indices we write
|
||||||
|
// for child shards will not be invalidated by the parent shard deleting layers.
|
||||||
|
tl_client.shutdown().await?;
|
||||||
|
|
||||||
|
// Download methods can still be used after shutdown, as they don't flow through the remote client's
|
||||||
|
// queue. In principal the RemoteTimelineClient could provide this without downloading it, but this
|
||||||
|
// operation is rare, so it's simpler to just download it (and robustly guarantees that the index
|
||||||
|
// we use here really is the remotely persistent one).
|
||||||
|
let result = tl_client
|
||||||
|
.download_index_file(&self.cancel)
|
||||||
|
.instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))
|
||||||
|
.await?;
|
||||||
|
let index_part = match result {
|
||||||
|
MaybeDeletedIndexPart::Deleted(_) => {
|
||||||
|
anyhow::bail!("Timeline deletion happened concurrently with split")
|
||||||
|
}
|
||||||
|
MaybeDeletedIndexPart::IndexPart(p) => p,
|
||||||
|
};
|
||||||
|
|
||||||
|
for child_shard in child_shards {
|
||||||
|
upload_index_part(
|
||||||
|
remote_storage,
|
||||||
|
child_shard,
|
||||||
|
&timeline.timeline_id,
|
||||||
|
self.generation,
|
||||||
|
&index_part,
|
||||||
|
&self.cancel,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
|
/// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
|
||||||
@@ -2834,7 +2880,7 @@ impl Tenant {
|
|||||||
let config_path = config_path.to_owned();
|
let config_path = config_path.to_owned();
|
||||||
tokio::task::spawn_blocking(move || {
|
tokio::task::spawn_blocking(move || {
|
||||||
Handle::current().block_on(async move {
|
Handle::current().block_on(async move {
|
||||||
let conf_content = conf_content.as_bytes();
|
let conf_content = conf_content.into_bytes();
|
||||||
VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
|
VirtualFile::crashsafe_overwrite(&config_path, &temp_path, conf_content)
|
||||||
.await
|
.await
|
||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
@@ -2871,7 +2917,7 @@ impl Tenant {
|
|||||||
let target_config_path = target_config_path.to_owned();
|
let target_config_path = target_config_path.to_owned();
|
||||||
tokio::task::spawn_blocking(move || {
|
tokio::task::spawn_blocking(move || {
|
||||||
Handle::current().block_on(async move {
|
Handle::current().block_on(async move {
|
||||||
let conf_content = conf_content.as_bytes();
|
let conf_content = conf_content.into_bytes();
|
||||||
VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
|
VirtualFile::crashsafe_overwrite(&target_config_path, &temp_path, conf_content)
|
||||||
.await
|
.await
|
||||||
.with_context(|| {
|
.with_context(|| {
|
||||||
@@ -3732,6 +3778,10 @@ impl Tenant {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
|
||||||
|
self.tenant_conf.read().unwrap().tenant_conf
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn remove_timeline_and_uninit_mark(
|
fn remove_timeline_and_uninit_mark(
|
||||||
@@ -3916,6 +3966,8 @@ pub(crate) mod harness {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
#[derive(Debug)]
|
||||||
enum LoadMode {
|
enum LoadMode {
|
||||||
Local,
|
Local,
|
||||||
Remote,
|
Remote,
|
||||||
@@ -3998,7 +4050,7 @@ pub(crate) mod harness {
|
|||||||
info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
|
info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
|
||||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||||
(
|
(
|
||||||
self.try_load(&ctx)
|
self.try_load(&ctx)
|
||||||
@@ -4008,31 +4060,31 @@ pub(crate) mod harness {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn remote_empty(&self) -> bool {
|
/// For tests that specifically want to exercise the local load path, which does
|
||||||
let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
|
/// not use remote storage.
|
||||||
let remote_tenant_dir = self
|
pub(crate) async fn try_load_local(
|
||||||
.remote_fs_dir
|
&self,
|
||||||
.join(tenant_path.strip_prefix(&self.conf.workdir).unwrap());
|
ctx: &RequestContext,
|
||||||
if std::fs::metadata(&remote_tenant_dir).is_err() {
|
) -> anyhow::Result<Arc<Tenant>> {
|
||||||
return true;
|
self.do_try_load(ctx, LoadMode::Local).await
|
||||||
}
|
|
||||||
|
|
||||||
match std::fs::read_dir(remote_tenant_dir)
|
|
||||||
.unwrap()
|
|
||||||
.flatten()
|
|
||||||
.next()
|
|
||||||
{
|
|
||||||
Some(entry) => {
|
|
||||||
tracing::debug!(
|
|
||||||
"remote_empty: not empty, found file {}",
|
|
||||||
entry.file_name().to_string_lossy(),
|
|
||||||
);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
None => true,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The 'load' in this function is either a local load or a normal attachment,
|
||||||
|
pub(crate) async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
||||||
|
// If we have nothing in remote storage, must use load_local instead of attach: attach
|
||||||
|
// will error out if there are no timelines.
|
||||||
|
//
|
||||||
|
// See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
|
||||||
|
// this weird state of a Tenant which exists but doesn't have any timelines.
|
||||||
|
let mode = match self.remote_empty() {
|
||||||
|
true => LoadMode::Local,
|
||||||
|
false => LoadMode::Remote,
|
||||||
|
};
|
||||||
|
|
||||||
|
self.do_try_load(ctx, mode).await
|
||||||
|
}
|
||||||
|
|
||||||
|
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), ?mode))]
|
||||||
async fn do_try_load(
|
async fn do_try_load(
|
||||||
&self,
|
&self,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
@@ -4059,20 +4111,13 @@ pub(crate) mod harness {
|
|||||||
|
|
||||||
match mode {
|
match mode {
|
||||||
LoadMode::Local => {
|
LoadMode::Local => {
|
||||||
tenant
|
tenant.load_local(ctx).await?;
|
||||||
.load_local(ctx)
|
|
||||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
|
||||||
.await?;
|
|
||||||
}
|
}
|
||||||
LoadMode::Remote => {
|
LoadMode::Remote => {
|
||||||
let preload = tenant
|
let preload = tenant
|
||||||
.preload(&self.remote_storage, CancellationToken::new())
|
.preload(&self.remote_storage, CancellationToken::new())
|
||||||
.instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
|
||||||
.await?;
|
|
||||||
tenant
|
|
||||||
.attach(Some(preload), SpawnMode::Normal, ctx)
|
|
||||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
|
||||||
.await?;
|
.await?;
|
||||||
|
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4083,25 +4128,29 @@ pub(crate) mod harness {
|
|||||||
Ok(tenant)
|
Ok(tenant)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// For tests that specifically want to exercise the local load path, which does
|
fn remote_empty(&self) -> bool {
|
||||||
/// not use remote storage.
|
let tenant_path = self.conf.tenant_path(&self.tenant_shard_id);
|
||||||
pub async fn try_load_local(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
let remote_tenant_dir = self
|
||||||
self.do_try_load(ctx, LoadMode::Local).await
|
.remote_fs_dir
|
||||||
}
|
.join(tenant_path.strip_prefix(&self.conf.workdir).unwrap());
|
||||||
|
if std::fs::metadata(&remote_tenant_dir).is_err() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/// The 'load' in this function is either a local load or a normal attachment,
|
match std::fs::read_dir(remote_tenant_dir)
|
||||||
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
|
.unwrap()
|
||||||
// If we have nothing in remote storage, must use load_local instead of attach: attach
|
.flatten()
|
||||||
// will error out if there are no timelines.
|
.next()
|
||||||
//
|
{
|
||||||
// See https://github.com/neondatabase/neon/issues/5456 for how we will eliminate
|
Some(entry) => {
|
||||||
// this weird state of a Tenant which exists but doesn't have any timelines.
|
tracing::debug!(
|
||||||
let mode = match self.remote_empty() {
|
"remote_empty: not empty, found file {}",
|
||||||
true => LoadMode::Local,
|
entry.file_name().to_string_lossy(),
|
||||||
false => LoadMode::Remote,
|
);
|
||||||
};
|
false
|
||||||
|
}
|
||||||
self.do_try_load(ctx, mode).await
|
None => true,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
|
pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
|
||||||
|
|||||||
@@ -11,6 +11,9 @@
|
|||||||
//! len < 128: 0XXXXXXX
|
//! len < 128: 0XXXXXXX
|
||||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||||
//!
|
//!
|
||||||
|
use bytes::{BufMut, BytesMut};
|
||||||
|
use tokio_epoll_uring::{BoundedBuf, Slice};
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::page_cache::PAGE_SZ;
|
use crate::page_cache::PAGE_SZ;
|
||||||
use crate::tenant::block_io::BlockCursor;
|
use crate::tenant::block_io::BlockCursor;
|
||||||
@@ -100,6 +103,8 @@ pub struct BlobWriter<const BUFFERED: bool> {
|
|||||||
offset: u64,
|
offset: u64,
|
||||||
/// A buffer to save on write calls, only used if BUFFERED=true
|
/// A buffer to save on write calls, only used if BUFFERED=true
|
||||||
buf: Vec<u8>,
|
buf: Vec<u8>,
|
||||||
|
/// We do tiny writes for the length headers; they need to be in an owned buffer;
|
||||||
|
io_buf: Option<BytesMut>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||||
@@ -108,6 +113,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
inner,
|
inner,
|
||||||
offset: start_offset,
|
offset: start_offset,
|
||||||
buf: Vec::with_capacity(Self::CAPACITY),
|
buf: Vec::with_capacity(Self::CAPACITY),
|
||||||
|
io_buf: Some(BytesMut::new()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -117,21 +123,31 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
|
|
||||||
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
|
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
/// Writes the given buffer directly to the underlying `VirtualFile`.
|
/// Writes the given buffer directly to the underlying `VirtualFile`.
|
||||||
/// You need to make sure that the internal buffer is empty, otherwise
|
/// You need to make sure that the internal buffer is empty, otherwise
|
||||||
/// data will be written in wrong order.
|
/// data will be written in wrong order.
|
||||||
async fn write_all_unbuffered(&mut self, src_buf: &[u8]) -> Result<(), Error> {
|
#[inline(always)]
|
||||||
self.inner.write_all(src_buf).await?;
|
async fn write_all_unbuffered<B: BoundedBuf>(
|
||||||
self.offset += src_buf.len() as u64;
|
&mut self,
|
||||||
Ok(())
|
src_buf: B,
|
||||||
|
) -> (B::Buf, Result<(), Error>) {
|
||||||
|
let (src_buf, res) = self.inner.write_all(src_buf).await;
|
||||||
|
let nbytes = match res {
|
||||||
|
Ok(nbytes) => nbytes,
|
||||||
|
Err(e) => return (src_buf, Err(e)),
|
||||||
|
};
|
||||||
|
self.offset += nbytes as u64;
|
||||||
|
(src_buf, Ok(()))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
/// Flushes the internal buffer to the underlying `VirtualFile`.
|
||||||
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
|
pub async fn flush_buffer(&mut self) -> Result<(), Error> {
|
||||||
self.inner.write_all(&self.buf).await?;
|
let buf = std::mem::take(&mut self.buf);
|
||||||
self.buf.clear();
|
let (mut buf, res) = self.inner.write_all(buf).await;
|
||||||
|
res?;
|
||||||
|
buf.clear();
|
||||||
|
self.buf = buf;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -146,62 +162,91 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Internal, possibly buffered, write function
|
/// Internal, possibly buffered, write function
|
||||||
async fn write_all(&mut self, mut src_buf: &[u8]) -> Result<(), Error> {
|
async fn write_all<B: BoundedBuf>(&mut self, src_buf: B) -> (B::Buf, Result<(), Error>) {
|
||||||
if !BUFFERED {
|
if !BUFFERED {
|
||||||
assert!(self.buf.is_empty());
|
assert!(self.buf.is_empty());
|
||||||
self.write_all_unbuffered(src_buf).await?;
|
return self.write_all_unbuffered(src_buf).await;
|
||||||
return Ok(());
|
|
||||||
}
|
}
|
||||||
let remaining = Self::CAPACITY - self.buf.len();
|
let remaining = Self::CAPACITY - self.buf.len();
|
||||||
|
let src_buf_len = src_buf.bytes_init();
|
||||||
|
if src_buf_len == 0 {
|
||||||
|
return (Slice::into_inner(src_buf.slice_full()), Ok(()));
|
||||||
|
}
|
||||||
|
let mut src_buf = src_buf.slice(0..src_buf_len);
|
||||||
// First try to copy as much as we can into the buffer
|
// First try to copy as much as we can into the buffer
|
||||||
if remaining > 0 {
|
if remaining > 0 {
|
||||||
let copied = self.write_into_buffer(src_buf);
|
let copied = self.write_into_buffer(&src_buf);
|
||||||
src_buf = &src_buf[copied..];
|
src_buf = src_buf.slice(copied..);
|
||||||
}
|
}
|
||||||
// Then, if the buffer is full, flush it out
|
// Then, if the buffer is full, flush it out
|
||||||
if self.buf.len() == Self::CAPACITY {
|
if self.buf.len() == Self::CAPACITY {
|
||||||
self.flush_buffer().await?;
|
if let Err(e) = self.flush_buffer().await {
|
||||||
|
return (Slice::into_inner(src_buf), Err(e));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Finally, write the tail of src_buf:
|
// Finally, write the tail of src_buf:
|
||||||
// If it wholly fits into the buffer without
|
// If it wholly fits into the buffer without
|
||||||
// completely filling it, then put it there.
|
// completely filling it, then put it there.
|
||||||
// If not, write it out directly.
|
// If not, write it out directly.
|
||||||
if !src_buf.is_empty() {
|
let src_buf = if !src_buf.is_empty() {
|
||||||
assert_eq!(self.buf.len(), 0);
|
assert_eq!(self.buf.len(), 0);
|
||||||
if src_buf.len() < Self::CAPACITY {
|
if src_buf.len() < Self::CAPACITY {
|
||||||
let copied = self.write_into_buffer(src_buf);
|
let copied = self.write_into_buffer(&src_buf);
|
||||||
// We just verified above that src_buf fits into our internal buffer.
|
// We just verified above that src_buf fits into our internal buffer.
|
||||||
assert_eq!(copied, src_buf.len());
|
assert_eq!(copied, src_buf.len());
|
||||||
|
Slice::into_inner(src_buf)
|
||||||
} else {
|
} else {
|
||||||
self.write_all_unbuffered(src_buf).await?;
|
let (src_buf, res) = self.write_all_unbuffered(src_buf).await;
|
||||||
|
if let Err(e) = res {
|
||||||
|
return (src_buf, Err(e));
|
||||||
|
}
|
||||||
|
src_buf
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
Ok(())
|
Slice::into_inner(src_buf)
|
||||||
|
};
|
||||||
|
(src_buf, Ok(()))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Write a blob of data. Returns the offset that it was written to,
|
/// Write a blob of data. Returns the offset that it was written to,
|
||||||
/// which can be used to retrieve the data later.
|
/// which can be used to retrieve the data later.
|
||||||
pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
pub async fn write_blob<B: BoundedBuf>(&mut self, srcbuf: B) -> (B::Buf, Result<u64, Error>) {
|
||||||
let offset = self.offset;
|
let offset = self.offset;
|
||||||
|
|
||||||
if srcbuf.len() < 128 {
|
let len = srcbuf.bytes_init();
|
||||||
// Short blob. Write a 1-byte length header
|
|
||||||
let len_buf = srcbuf.len() as u8;
|
let mut io_buf = self.io_buf.take().expect("we always put it back below");
|
||||||
self.write_all(&[len_buf]).await?;
|
io_buf.clear();
|
||||||
} else {
|
let (io_buf, hdr_res) = async {
|
||||||
// Write a 4-byte length header
|
if len < 128 {
|
||||||
if srcbuf.len() > 0x7fff_ffff {
|
// Short blob. Write a 1-byte length header
|
||||||
return Err(Error::new(
|
io_buf.put_u8(len as u8);
|
||||||
ErrorKind::Other,
|
self.write_all(io_buf).await
|
||||||
format!("blob too large ({} bytes)", srcbuf.len()),
|
} else {
|
||||||
));
|
// Write a 4-byte length header
|
||||||
|
if len > 0x7fff_ffff {
|
||||||
|
return (
|
||||||
|
io_buf,
|
||||||
|
Err(Error::new(
|
||||||
|
ErrorKind::Other,
|
||||||
|
format!("blob too large ({} bytes)", len),
|
||||||
|
)),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
let mut len_buf = (len as u32).to_be_bytes();
|
||||||
|
len_buf[0] |= 0x80;
|
||||||
|
io_buf.extend_from_slice(&len_buf[..]);
|
||||||
|
self.write_all(io_buf).await
|
||||||
}
|
}
|
||||||
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
|
|
||||||
len_buf[0] |= 0x80;
|
|
||||||
self.write_all(&len_buf).await?;
|
|
||||||
}
|
}
|
||||||
self.write_all(srcbuf).await?;
|
.await;
|
||||||
Ok(offset)
|
self.io_buf = Some(io_buf);
|
||||||
|
match hdr_res {
|
||||||
|
Ok(_) => (),
|
||||||
|
Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)),
|
||||||
|
}
|
||||||
|
let (srcbuf, res) = self.write_all(srcbuf).await;
|
||||||
|
(srcbuf, res.map(|_| offset))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -248,12 +293,14 @@ mod tests {
|
|||||||
let file = VirtualFile::create(pathbuf.as_path()).await?;
|
let file = VirtualFile::create(pathbuf.as_path()).await?;
|
||||||
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
|
||||||
for blob in blobs.iter() {
|
for blob in blobs.iter() {
|
||||||
let offs = wtr.write_blob(blob).await?;
|
let (_, res) = wtr.write_blob(blob.clone()).await;
|
||||||
|
let offs = res?;
|
||||||
offsets.push(offs);
|
offsets.push(offs);
|
||||||
}
|
}
|
||||||
// Write out one page worth of zeros so that we can
|
// Write out one page worth of zeros so that we can
|
||||||
// read again with read_blk
|
// read again with read_blk
|
||||||
let offs = wtr.write_blob(&vec![0; PAGE_SZ]).await?;
|
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ]).await;
|
||||||
|
let offs = res?;
|
||||||
println!("Writing final blob at offs={offs}");
|
println!("Writing final blob at offs={offs}");
|
||||||
wtr.flush_buffer().await?;
|
wtr.flush_buffer().await?;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use pageserver_api::{models::TenantState, shard::TenantShardId};
|
|||||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||||
use tokio::sync::OwnedMutexGuard;
|
use tokio::sync::OwnedMutexGuard;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
use tracing::{error, instrument, Instrument, Span};
|
use tracing::{error, instrument, Instrument};
|
||||||
|
|
||||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
|
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
|
||||||
|
|
||||||
@@ -496,11 +496,7 @@ impl DeleteTenantFlow {
|
|||||||
};
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument({
|
.instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
|
||||||
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
|
|
||||||
span.follows_from(Span::current());
|
|
||||||
span
|
|
||||||
}),
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -279,7 +279,7 @@ pub async fn save_metadata(
|
|||||||
let path = conf.metadata_path(tenant_shard_id, timeline_id);
|
let path = conf.metadata_path(tenant_shard_id, timeline_id);
|
||||||
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
|
||||||
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
|
||||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
|
VirtualFile::crashsafe_overwrite(&path, &temp_path, metadata_bytes)
|
||||||
.await
|
.await
|
||||||
.context("write metadata")?;
|
.context("write metadata")?;
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
//! page server.
|
//! page server.
|
||||||
|
|
||||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||||
|
use itertools::Itertools;
|
||||||
use pageserver_api::key::Key;
|
use pageserver_api::key::Key;
|
||||||
use pageserver_api::models::ShardParameters;
|
use pageserver_api::models::ShardParameters;
|
||||||
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
|
||||||
@@ -22,7 +23,7 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
|
|
||||||
use remote_storage::GenericRemoteStorage;
|
use remote_storage::GenericRemoteStorage;
|
||||||
use utils::crashsafe;
|
use utils::{completion, crashsafe};
|
||||||
|
|
||||||
use crate::config::PageServerConf;
|
use crate::config::PageServerConf;
|
||||||
use crate::context::{DownloadBehavior, RequestContext};
|
use crate::context::{DownloadBehavior, RequestContext};
|
||||||
@@ -644,8 +645,6 @@ pub(crate) async fn shutdown_all_tenants() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||||
use utils::completion;
|
|
||||||
|
|
||||||
let mut join_set = JoinSet::new();
|
let mut join_set = JoinSet::new();
|
||||||
|
|
||||||
// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
|
// Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
|
||||||
@@ -1200,7 +1199,7 @@ impl TenantManager {
|
|||||||
&self,
|
&self,
|
||||||
tenant_shard_id: TenantShardId,
|
tenant_shard_id: TenantShardId,
|
||||||
drop_cache: bool,
|
drop_cache: bool,
|
||||||
ctx: RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||||
let Some(old_slot) = slot_guard.get_old_value() else {
|
let Some(old_slot) = slot_guard.get_old_value() else {
|
||||||
@@ -1253,7 +1252,7 @@ impl TenantManager {
|
|||||||
None,
|
None,
|
||||||
self.tenants,
|
self.tenants,
|
||||||
SpawnMode::Normal,
|
SpawnMode::Normal,
|
||||||
&ctx,
|
ctx,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||||
@@ -1375,6 +1374,164 @@ impl TenantManager {
|
|||||||
slot_guard.revert();
|
slot_guard.revert();
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
|
||||||
|
pub(crate) async fn shard_split(
|
||||||
|
&self,
|
||||||
|
tenant_shard_id: TenantShardId,
|
||||||
|
new_shard_count: ShardCount,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> anyhow::Result<Vec<TenantShardId>> {
|
||||||
|
let tenant = get_tenant(tenant_shard_id, true)?;
|
||||||
|
|
||||||
|
// Plan: identify what the new child shards will be
|
||||||
|
let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
|
||||||
|
if new_shard_count <= ShardCount(effective_old_shard_count) {
|
||||||
|
anyhow::bail!("Requested shard count is not an increase");
|
||||||
|
}
|
||||||
|
let expansion_factor = new_shard_count.0 / effective_old_shard_count;
|
||||||
|
if !expansion_factor.is_power_of_two() {
|
||||||
|
anyhow::bail!("Requested split is not a power of two");
|
||||||
|
}
|
||||||
|
|
||||||
|
let parent_shard_identity = tenant.shard_identity;
|
||||||
|
let parent_tenant_conf = tenant.get_tenant_conf();
|
||||||
|
let parent_generation = tenant.generation;
|
||||||
|
|
||||||
|
let child_shards = tenant_shard_id.split(new_shard_count);
|
||||||
|
tracing::info!(
|
||||||
|
"Shard {} splits into: {}",
|
||||||
|
tenant_shard_id.to_index(),
|
||||||
|
child_shards
|
||||||
|
.iter()
|
||||||
|
.map(|id| format!("{}", id.to_index()))
|
||||||
|
.join(",")
|
||||||
|
);
|
||||||
|
|
||||||
|
// Phase 1: Write out child shards' remote index files, in the parent tenant's current generation
|
||||||
|
if let Err(e) = tenant.split_prepare(&child_shards).await {
|
||||||
|
// If [`Tenant::split_prepare`] fails, we must reload the tenant, because it might
|
||||||
|
// have been left in a partially-shut-down state.
|
||||||
|
tracing::warn!("Failed to prepare for split: {e}, reloading Tenant before returning");
|
||||||
|
self.reset_tenant(tenant_shard_id, false, ctx).await?;
|
||||||
|
return Err(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.resources.deletion_queue_client.flush_advisory();
|
||||||
|
|
||||||
|
// Phase 2: Put the parent shard to InProgress and grab a reference to the parent Tenant
|
||||||
|
drop(tenant);
|
||||||
|
let mut parent_slot_guard =
|
||||||
|
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||||
|
let parent = match parent_slot_guard.get_old_value() {
|
||||||
|
Some(TenantSlot::Attached(t)) => t,
|
||||||
|
Some(TenantSlot::Secondary(_)) => anyhow::bail!("Tenant location in secondary mode"),
|
||||||
|
Some(TenantSlot::InProgress(_)) => {
|
||||||
|
// tenant_map_acquire_slot never returns InProgress, if a slot was InProgress
|
||||||
|
// it would return an error.
|
||||||
|
unreachable!()
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// We don't actually need the parent shard to still be attached to do our work, but it's
|
||||||
|
// a weird enough situation that the caller probably didn't want us to continue working
|
||||||
|
// if they had detached the tenant they requested the split on.
|
||||||
|
anyhow::bail!("Detached parent shard in the middle of split!")
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
|
||||||
|
// TODO: erase the dentries from the parent
|
||||||
|
|
||||||
|
// Take a snapshot of where the parent's WAL ingest had got to: we will wait for
|
||||||
|
// child shards to reach this point.
|
||||||
|
let mut target_lsns = HashMap::new();
|
||||||
|
for timeline in parent.timelines.lock().unwrap().clone().values() {
|
||||||
|
target_lsns.insert(timeline.timeline_id, timeline.get_last_record_lsn());
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: we should have the parent shard stop its WAL ingest here, it's a waste of resources
|
||||||
|
// and could slow down the children trying to catch up.
|
||||||
|
|
||||||
|
// Phase 3: Spawn the child shards
|
||||||
|
for child_shard in &child_shards {
|
||||||
|
let mut child_shard_identity = parent_shard_identity;
|
||||||
|
child_shard_identity.count = child_shard.shard_count;
|
||||||
|
child_shard_identity.number = child_shard.shard_number;
|
||||||
|
|
||||||
|
let child_location_conf = LocationConf {
|
||||||
|
mode: LocationMode::Attached(AttachedLocationConfig {
|
||||||
|
generation: parent_generation,
|
||||||
|
attach_mode: AttachmentMode::Single,
|
||||||
|
}),
|
||||||
|
shard: child_shard_identity,
|
||||||
|
tenant_conf: parent_tenant_conf,
|
||||||
|
};
|
||||||
|
|
||||||
|
self.upsert_location(
|
||||||
|
*child_shard,
|
||||||
|
child_location_conf,
|
||||||
|
None,
|
||||||
|
SpawnMode::Normal,
|
||||||
|
ctx,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 4: wait for child chards WAL ingest to catch up to target LSN
|
||||||
|
for child_shard_id in &child_shards {
|
||||||
|
let child_shard = {
|
||||||
|
let locked = TENANTS.read().unwrap();
|
||||||
|
let peek_slot =
|
||||||
|
tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
|
||||||
|
peek_slot.and_then(|s| s.get_attached()).cloned()
|
||||||
|
};
|
||||||
|
if let Some(t) = child_shard {
|
||||||
|
let timelines = t.timelines.lock().unwrap().clone();
|
||||||
|
for timeline in timelines.values() {
|
||||||
|
let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
"Waiting for child shard {}/{} to reach target lsn {}...",
|
||||||
|
child_shard_id,
|
||||||
|
timeline.timeline_id,
|
||||||
|
target_lsn
|
||||||
|
);
|
||||||
|
if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await {
|
||||||
|
// Failure here might mean shutdown, in any case this part is an optimization
|
||||||
|
// and we shouldn't hold up the split operation.
|
||||||
|
tracing::warn!(
|
||||||
|
"Failed to wait for timeline {} to reach lsn {target_lsn}: {e}",
|
||||||
|
timeline.timeline_id
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
tracing::info!(
|
||||||
|
"Child shard {}/{} reached target lsn {}",
|
||||||
|
child_shard_id,
|
||||||
|
timeline.timeline_id,
|
||||||
|
target_lsn
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Phase 5: Shut down the parent shard.
|
||||||
|
let (_guard, progress) = completion::channel();
|
||||||
|
match parent.shutdown(progress, false).await {
|
||||||
|
Ok(()) => {}
|
||||||
|
Err(other) => {
|
||||||
|
other.wait().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parent_slot_guard.drop_old_value()?;
|
||||||
|
|
||||||
|
// Phase 6: Release the InProgress on the parent shard
|
||||||
|
drop(parent_slot_guard);
|
||||||
|
|
||||||
|
Ok(child_shards)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
@@ -2209,8 +2366,6 @@ async fn remove_tenant_from_memory<V, F>(
|
|||||||
where
|
where
|
||||||
F: std::future::Future<Output = anyhow::Result<V>>,
|
F: std::future::Future<Output = anyhow::Result<V>>,
|
||||||
{
|
{
|
||||||
use utils::completion;
|
|
||||||
|
|
||||||
let mut slot_guard =
|
let mut slot_guard =
|
||||||
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
|
tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;
|
||||||
|
|
||||||
|
|||||||
@@ -217,6 +217,7 @@ use crate::metrics::{
|
|||||||
};
|
};
|
||||||
use crate::task_mgr::shutdown_token;
|
use crate::task_mgr::shutdown_token;
|
||||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||||
|
use crate::tenant::remote_timeline_client::download::download_retry;
|
||||||
use crate::tenant::storage_layer::AsLayerDesc;
|
use crate::tenant::storage_layer::AsLayerDesc;
|
||||||
use crate::tenant::upload_queue::Delete;
|
use crate::tenant::upload_queue::Delete;
|
||||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||||
@@ -262,6 +263,11 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
|
|||||||
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
/// Default buffer size when interfacing with [`tokio::fs::File`].
|
||||||
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||||
|
|
||||||
|
/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
|
||||||
|
/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
|
||||||
|
pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
||||||
|
pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
||||||
|
|
||||||
pub enum MaybeDeletedIndexPart {
|
pub enum MaybeDeletedIndexPart {
|
||||||
IndexPart(IndexPart),
|
IndexPart(IndexPart),
|
||||||
Deleted(IndexPart),
|
Deleted(IndexPart),
|
||||||
@@ -325,11 +331,6 @@ pub struct RemoteTimelineClient {
|
|||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows. It is not
|
|
||||||
/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
|
|
||||||
const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
|
||||||
const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
|
|
||||||
|
|
||||||
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
|
/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
|
||||||
///
|
///
|
||||||
/// This is a convenience for the various upload functions. In future
|
/// This is a convenience for the various upload functions. In future
|
||||||
@@ -506,7 +507,7 @@ impl RemoteTimelineClient {
|
|||||||
/// Download index file
|
/// Download index file
|
||||||
pub async fn download_index_file(
|
pub async fn download_index_file(
|
||||||
&self,
|
&self,
|
||||||
cancel: CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<MaybeDeletedIndexPart, DownloadError> {
|
) -> Result<MaybeDeletedIndexPart, DownloadError> {
|
||||||
let _unfinished_gauge_guard = self.metrics.call_begin(
|
let _unfinished_gauge_guard = self.metrics.call_begin(
|
||||||
&RemoteOpFileKind::Index,
|
&RemoteOpFileKind::Index,
|
||||||
@@ -1147,22 +1148,17 @@ impl RemoteTimelineClient {
|
|||||||
|
|
||||||
let cancel = shutdown_token();
|
let cancel = shutdown_token();
|
||||||
|
|
||||||
let remaining = backoff::retry(
|
let remaining = download_retry(
|
||||||
|| async {
|
|| async {
|
||||||
self.storage_impl
|
self.storage_impl
|
||||||
.list_files(Some(&timeline_storage_path))
|
.list_files(Some(&timeline_storage_path), None)
|
||||||
.await
|
.await
|
||||||
},
|
},
|
||||||
|_e| false,
|
"list remaining files",
|
||||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
|
||||||
FAILED_REMOTE_OP_RETRIES,
|
|
||||||
"list_prefixes",
|
|
||||||
&cancel,
|
&cancel,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.ok_or_else(|| anyhow::anyhow!("Cancelled!"))
|
.context("list files remaining files")?;
|
||||||
.and_then(|x| x)
|
|
||||||
.context("list prefixes")?;
|
|
||||||
|
|
||||||
// We will delete the current index_part object last, since it acts as a deletion
|
// We will delete the current index_part object last, since it acts as a deletion
|
||||||
// marker via its deleted_at attribute
|
// marker via its deleted_at attribute
|
||||||
@@ -1351,6 +1347,7 @@ impl RemoteTimelineClient {
|
|||||||
/// queue.
|
/// queue.
|
||||||
///
|
///
|
||||||
async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
|
async fn perform_upload_task(self: &Arc<Self>, task: Arc<UploadTask>) {
|
||||||
|
let cancel = shutdown_token();
|
||||||
// Loop to retry until it completes.
|
// Loop to retry until it completes.
|
||||||
loop {
|
loop {
|
||||||
// If we're requested to shut down, close up shop and exit.
|
// If we're requested to shut down, close up shop and exit.
|
||||||
@@ -1362,7 +1359,7 @@ impl RemoteTimelineClient {
|
|||||||
// the Future, but we're not 100% sure if the remote storage library
|
// the Future, but we're not 100% sure if the remote storage library
|
||||||
// is cancellation safe, so we don't dare to do that. Hopefully, the
|
// is cancellation safe, so we don't dare to do that. Hopefully, the
|
||||||
// upload finishes or times out soon enough.
|
// upload finishes or times out soon enough.
|
||||||
if task_mgr::is_shutdown_requested() {
|
if cancel.is_cancelled() {
|
||||||
info!("upload task cancelled by shutdown request");
|
info!("upload task cancelled by shutdown request");
|
||||||
match self.stop() {
|
match self.stop() {
|
||||||
Ok(()) => {}
|
Ok(()) => {}
|
||||||
@@ -1473,7 +1470,7 @@ impl RemoteTimelineClient {
|
|||||||
retries,
|
retries,
|
||||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||||
&shutdown_token(),
|
&cancel,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
@@ -1703,23 +1700,6 @@ impl RemoteTimelineClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_layers_metadata(
|
|
||||||
&self,
|
|
||||||
layers: Vec<LayerFileName>,
|
|
||||||
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
|
|
||||||
let q = self.upload_queue.lock().unwrap();
|
|
||||||
let q = match &*q {
|
|
||||||
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
|
|
||||||
anyhow::bail!("queue is in state {}", q.as_str())
|
|
||||||
}
|
|
||||||
UploadQueue::Initialized(inner) => inner,
|
|
||||||
};
|
|
||||||
|
|
||||||
let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
|
|
||||||
|
|
||||||
Ok(decorated.collect())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||||
@@ -1990,7 +1970,7 @@ mod tests {
|
|||||||
|
|
||||||
// Download back the index.json, and check that the list of files is correct
|
// Download back the index.json, and check that the list of files is correct
|
||||||
let initial_index_part = match client
|
let initial_index_part = match client
|
||||||
.download_index_file(CancellationToken::new())
|
.download_index_file(&CancellationToken::new())
|
||||||
.await
|
.await
|
||||||
.unwrap()
|
.unwrap()
|
||||||
{
|
{
|
||||||
@@ -2084,7 +2064,7 @@ mod tests {
|
|||||||
|
|
||||||
// Download back the index.json, and check that the list of files is correct
|
// Download back the index.json, and check that the list of files is correct
|
||||||
let index_part = match client
|
let index_part = match client
|
||||||
.download_index_file(CancellationToken::new())
|
.download_index_file(&CancellationToken::new())
|
||||||
.await
|
.await
|
||||||
.unwrap()
|
.unwrap()
|
||||||
{
|
{
|
||||||
@@ -2286,7 +2266,7 @@ mod tests {
|
|||||||
let client = test_state.build_client(get_generation);
|
let client = test_state.build_client(get_generation);
|
||||||
|
|
||||||
let download_r = client
|
let download_r = client
|
||||||
.download_index_file(CancellationToken::new())
|
.download_index_file(&CancellationToken::new())
|
||||||
.await
|
.await
|
||||||
.expect("download should always succeed");
|
.expect("download should always succeed");
|
||||||
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
|
assert!(matches!(download_r, MaybeDeletedIndexPart::IndexPart(_)));
|
||||||
|
|||||||
@@ -216,16 +216,15 @@ pub async fn list_remote_timelines(
|
|||||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||||
});
|
});
|
||||||
|
|
||||||
let cancel_inner = cancel.clone();
|
|
||||||
let listing = download_retry_forever(
|
let listing = download_retry_forever(
|
||||||
|| {
|
|| {
|
||||||
download_cancellable(
|
download_cancellable(
|
||||||
&cancel_inner,
|
&cancel,
|
||||||
storage.list(Some(&remote_path), ListingMode::WithDelimiter),
|
storage.list(Some(&remote_path), ListingMode::WithDelimiter, None),
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
&format!("list timelines for {tenant_shard_id}"),
|
&format!("list timelines for {tenant_shard_id}"),
|
||||||
cancel,
|
&cancel,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -258,19 +257,18 @@ async fn do_download_index_part(
|
|||||||
tenant_shard_id: &TenantShardId,
|
tenant_shard_id: &TenantShardId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
index_generation: Generation,
|
index_generation: Generation,
|
||||||
cancel: CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<IndexPart, DownloadError> {
|
) -> Result<IndexPart, DownloadError> {
|
||||||
use futures::stream::StreamExt;
|
use futures::stream::StreamExt;
|
||||||
|
|
||||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
||||||
|
|
||||||
let cancel_inner = cancel.clone();
|
|
||||||
let index_part_bytes = download_retry_forever(
|
let index_part_bytes = download_retry_forever(
|
||||||
|| async {
|
|| async {
|
||||||
// Cancellation: if is safe to cancel this future because we're just downloading into
|
// Cancellation: if is safe to cancel this future because we're just downloading into
|
||||||
// a memory buffer, not touching local disk.
|
// a memory buffer, not touching local disk.
|
||||||
let index_part_download =
|
let index_part_download =
|
||||||
download_cancellable(&cancel_inner, storage.download(&remote_path)).await?;
|
download_cancellable(cancel, storage.download(&remote_path)).await?;
|
||||||
|
|
||||||
let mut index_part_bytes = Vec::new();
|
let mut index_part_bytes = Vec::new();
|
||||||
let mut stream = std::pin::pin!(index_part_download.download_stream);
|
let mut stream = std::pin::pin!(index_part_download.download_stream);
|
||||||
@@ -288,7 +286,7 @@ async fn do_download_index_part(
|
|||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||||
.with_context(|| format!("download index part file at {remote_path:?}"))
|
.with_context(|| format!("deserialize index part file at {remote_path:?}"))
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
Ok(index_part)
|
Ok(index_part)
|
||||||
@@ -305,7 +303,7 @@ pub(super) async fn download_index_part(
|
|||||||
tenant_shard_id: &TenantShardId,
|
tenant_shard_id: &TenantShardId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
my_generation: Generation,
|
my_generation: Generation,
|
||||||
cancel: CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<IndexPart, DownloadError> {
|
) -> Result<IndexPart, DownloadError> {
|
||||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
|
||||||
@@ -325,14 +323,8 @@ pub(super) async fn download_index_part(
|
|||||||
// index in our generation.
|
// index in our generation.
|
||||||
//
|
//
|
||||||
// This is an optimization to avoid doing the listing for the general case below.
|
// This is an optimization to avoid doing the listing for the general case below.
|
||||||
let res = do_download_index_part(
|
let res =
|
||||||
storage,
|
do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||||
tenant_shard_id,
|
|
||||||
timeline_id,
|
|
||||||
my_generation,
|
|
||||||
cancel.clone(),
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
match res {
|
match res {
|
||||||
Ok(index_part) => {
|
Ok(index_part) => {
|
||||||
tracing::debug!(
|
tracing::debug!(
|
||||||
@@ -357,7 +349,7 @@ pub(super) async fn download_index_part(
|
|||||||
tenant_shard_id,
|
tenant_shard_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
my_generation.previous(),
|
my_generation.previous(),
|
||||||
cancel.clone(),
|
cancel,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
match res {
|
match res {
|
||||||
@@ -379,18 +371,13 @@ pub(super) async fn download_index_part(
|
|||||||
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
|
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
|
||||||
// to constructing a full index path with no generation, because the generation is a suffix.
|
// to constructing a full index path with no generation, because the generation is a suffix.
|
||||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||||
let indices = backoff::retry(
|
|
||||||
|| async { storage.list_files(Some(&index_prefix)).await },
|
let indices = download_retry(
|
||||||
|_| false,
|
|| async { storage.list_files(Some(&index_prefix), None).await },
|
||||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
"list index_part files",
|
||||||
FAILED_REMOTE_OP_RETRIES,
|
cancel,
|
||||||
"listing index_part files",
|
|
||||||
&cancel,
|
|
||||||
)
|
)
|
||||||
.await
|
.await?;
|
||||||
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
|
|
||||||
.and_then(|x| x)
|
|
||||||
.map_err(DownloadError::Other)?;
|
|
||||||
|
|
||||||
// General case logic for which index to use: the latest index whose generation
|
// General case logic for which index to use: the latest index whose generation
|
||||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||||
@@ -447,8 +434,6 @@ pub(crate) async fn download_initdb_tar_zst(
|
|||||||
"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
|
"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
|
||||||
));
|
));
|
||||||
|
|
||||||
let cancel_inner = cancel.clone();
|
|
||||||
|
|
||||||
let file = download_retry(
|
let file = download_retry(
|
||||||
|| async {
|
|| async {
|
||||||
let file = OpenOptions::new()
|
let file = OpenOptions::new()
|
||||||
@@ -461,13 +446,11 @@ pub(crate) async fn download_initdb_tar_zst(
|
|||||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||||
.map_err(DownloadError::Other)?;
|
.map_err(DownloadError::Other)?;
|
||||||
|
|
||||||
let download = match download_cancellable(&cancel_inner, storage.download(&remote_path))
|
let download = match download_cancellable(cancel, storage.download(&remote_path)).await
|
||||||
.await
|
|
||||||
{
|
{
|
||||||
Ok(dl) => dl,
|
Ok(dl) => dl,
|
||||||
Err(DownloadError::NotFound) => {
|
Err(DownloadError::NotFound) => {
|
||||||
download_cancellable(&cancel_inner, storage.download(&remote_preserved_path))
|
download_cancellable(cancel, storage.download(&remote_preserved_path)).await?
|
||||||
.await?
|
|
||||||
}
|
}
|
||||||
Err(other) => Err(other)?,
|
Err(other) => Err(other)?,
|
||||||
};
|
};
|
||||||
@@ -516,7 +499,7 @@ pub(crate) async fn download_initdb_tar_zst(
|
|||||||
/// with backoff.
|
/// with backoff.
|
||||||
///
|
///
|
||||||
/// (See similar logic for uploads in `perform_upload_task`)
|
/// (See similar logic for uploads in `perform_upload_task`)
|
||||||
async fn download_retry<T, O, F>(
|
pub(super) async fn download_retry<T, O, F>(
|
||||||
op: O,
|
op: O,
|
||||||
description: &str,
|
description: &str,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
@@ -527,7 +510,7 @@ where
|
|||||||
{
|
{
|
||||||
backoff::retry(
|
backoff::retry(
|
||||||
op,
|
op,
|
||||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
DownloadError::is_permanent,
|
||||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||||
FAILED_REMOTE_OP_RETRIES,
|
FAILED_REMOTE_OP_RETRIES,
|
||||||
description,
|
description,
|
||||||
@@ -541,7 +524,7 @@ where
|
|||||||
async fn download_retry_forever<T, O, F>(
|
async fn download_retry_forever<T, O, F>(
|
||||||
op: O,
|
op: O,
|
||||||
description: &str,
|
description: &str,
|
||||||
cancel: CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> Result<T, DownloadError>
|
) -> Result<T, DownloadError>
|
||||||
where
|
where
|
||||||
O: FnMut() -> F,
|
O: FnMut() -> F,
|
||||||
@@ -549,11 +532,11 @@ where
|
|||||||
{
|
{
|
||||||
backoff::retry(
|
backoff::retry(
|
||||||
op,
|
op,
|
||||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
DownloadError::is_permanent,
|
||||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||||
u32::MAX,
|
u32::MAX,
|
||||||
description,
|
description,
|
||||||
&cancel,
|
cancel,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.ok_or_else(|| DownloadError::Cancelled)
|
.ok_or_else(|| DownloadError::Cancelled)
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ use super::index::LayerFileMetadata;
|
|||||||
use tracing::info;
|
use tracing::info;
|
||||||
|
|
||||||
/// Serializes and uploads the given index part data to the remote storage.
|
/// Serializes and uploads the given index part data to the remote storage.
|
||||||
pub(super) async fn upload_index_part<'a>(
|
pub(crate) async fn upload_index_part<'a>(
|
||||||
storage: &'a GenericRemoteStorage,
|
storage: &'a GenericRemoteStorage,
|
||||||
tenant_shard_id: &TenantShardId,
|
tenant_shard_id: &TenantShardId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
|
|||||||
@@ -160,7 +160,7 @@ impl SecondaryTenant {
|
|||||||
&self.tenant_shard_id
|
&self.tenant_shard_id
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> DiskUsageEvictionInfo {
|
pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> (DiskUsageEvictionInfo, usize) {
|
||||||
self.detail.lock().unwrap().get_layers_for_eviction(self)
|
self.detail.lock().unwrap().get_layers_for_eviction(self)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -146,14 +146,15 @@ impl SecondaryDetail {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Additionally returns the total number of layers, used for more stable relative access time
|
||||||
|
/// based eviction.
|
||||||
pub(super) fn get_layers_for_eviction(
|
pub(super) fn get_layers_for_eviction(
|
||||||
&self,
|
&self,
|
||||||
parent: &Arc<SecondaryTenant>,
|
parent: &Arc<SecondaryTenant>,
|
||||||
) -> DiskUsageEvictionInfo {
|
) -> (DiskUsageEvictionInfo, usize) {
|
||||||
let mut result = DiskUsageEvictionInfo {
|
let mut result = DiskUsageEvictionInfo::default();
|
||||||
max_layer_size: None,
|
let mut total_layers = 0;
|
||||||
resident_layers: Vec::new(),
|
|
||||||
};
|
|
||||||
for (timeline_id, timeline_detail) in &self.timelines {
|
for (timeline_id, timeline_detail) in &self.timelines {
|
||||||
result
|
result
|
||||||
.resident_layers
|
.resident_layers
|
||||||
@@ -169,6 +170,10 @@ impl SecondaryDetail {
|
|||||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
// total might be missing currently downloading layers, but as a lower than actual
|
||||||
|
// value it is good enough approximation.
|
||||||
|
total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len();
|
||||||
}
|
}
|
||||||
result.max_layer_size = result
|
result.max_layer_size = result
|
||||||
.resident_layers
|
.resident_layers
|
||||||
@@ -183,7 +188,7 @@ impl SecondaryDetail {
|
|||||||
result.resident_layers.len()
|
result.resident_layers.len()
|
||||||
);
|
);
|
||||||
|
|
||||||
result
|
(result, total_layers)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -312,9 +317,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
|||||||
.tenant_manager
|
.tenant_manager
|
||||||
.get_secondary_tenant_shard(*tenant_shard_id);
|
.get_secondary_tenant_shard(*tenant_shard_id);
|
||||||
let Some(tenant) = tenant else {
|
let Some(tenant) = tenant else {
|
||||||
{
|
return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
|
||||||
return Err(anyhow::anyhow!("Not found or not in Secondary mode"));
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(PendingDownload {
|
Ok(PendingDownload {
|
||||||
@@ -389,9 +392,9 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
|||||||
}
|
}
|
||||||
|
|
||||||
CompleteDownload {
|
CompleteDownload {
|
||||||
secondary_state,
|
secondary_state,
|
||||||
completed_at: Instant::now(),
|
completed_at: Instant::now(),
|
||||||
}
|
}
|
||||||
}.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
|
}.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -483,7 +486,7 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
let heatmap_path_bg = heatmap_path.clone();
|
let heatmap_path_bg = heatmap_path.clone();
|
||||||
tokio::task::spawn_blocking(move || {
|
tokio::task::spawn_blocking(move || {
|
||||||
tokio::runtime::Handle::current().block_on(async move {
|
tokio::runtime::Handle::current().block_on(async move {
|
||||||
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
|
VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, heatmap_bytes).await
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
.await
|
.await
|
||||||
@@ -530,7 +533,7 @@ impl<'a> TenantDownloader<'a> {
|
|||||||
.map_err(UpdateError::from)?;
|
.map_err(UpdateError::from)?;
|
||||||
let mut heatmap_bytes = Vec::new();
|
let mut heatmap_bytes = Vec::new();
|
||||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||||
let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
|
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||||
Ok(heatmap_bytes)
|
Ok(heatmap_bytes)
|
||||||
},
|
},
|
||||||
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
|e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
|
||||||
|
|||||||
@@ -257,6 +257,12 @@ impl LayerAccessStats {
|
|||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the latest access timestamp, falling back to latest residence event, further falling
|
||||||
|
/// back to `SystemTime::now` for a usable timestamp for eviction.
|
||||||
|
pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
|
||||||
|
self.latest_activity().unwrap_or_else(SystemTime::now)
|
||||||
|
}
|
||||||
|
|
||||||
/// Get the latest access timestamp, falling back to latest residence event.
|
/// Get the latest access timestamp, falling back to latest residence event.
|
||||||
///
|
///
|
||||||
/// This function can only return `None` if there has not yet been a call to the
|
/// This function can only return `None` if there has not yet been a call to the
|
||||||
@@ -271,7 +277,7 @@ impl LayerAccessStats {
|
|||||||
/// that that type can only be produced by inserting into the layer map.
|
/// that that type can only be produced by inserting into the layer map.
|
||||||
///
|
///
|
||||||
/// [`record_residence_event`]: Self::record_residence_event
|
/// [`record_residence_event`]: Self::record_residence_event
|
||||||
pub(crate) fn latest_activity(&self) -> Option<SystemTime> {
|
fn latest_activity(&self) -> Option<SystemTime> {
|
||||||
let locked = self.0.lock().unwrap();
|
let locked = self.0.lock().unwrap();
|
||||||
let inner = &locked.for_eviction_policy;
|
let inner = &locked.for_eviction_policy;
|
||||||
match inner.last_accesses.recent() {
|
match inner.last_accesses.recent() {
|
||||||
|
|||||||
@@ -416,27 +416,31 @@ impl DeltaLayerWriterInner {
|
|||||||
/// The values must be appended in key, lsn order.
|
/// The values must be appended in key, lsn order.
|
||||||
///
|
///
|
||||||
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
|
||||||
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
|
let (_, res) = self
|
||||||
.await
|
.put_value_bytes(key, lsn, Value::ser(&val)?, val.will_init())
|
||||||
|
.await;
|
||||||
|
res
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn put_value_bytes(
|
async fn put_value_bytes(
|
||||||
&mut self,
|
&mut self,
|
||||||
key: Key,
|
key: Key,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
val: &[u8],
|
val: Vec<u8>,
|
||||||
will_init: bool,
|
will_init: bool,
|
||||||
) -> anyhow::Result<()> {
|
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||||
assert!(self.lsn_range.start <= lsn);
|
assert!(self.lsn_range.start <= lsn);
|
||||||
|
let (val, res) = self.blob_writer.write_blob(val).await;
|
||||||
let off = self.blob_writer.write_blob(val).await?;
|
let off = match res {
|
||||||
|
Ok(off) => off,
|
||||||
|
Err(e) => return (val, Err(anyhow::anyhow!(e))),
|
||||||
|
};
|
||||||
|
|
||||||
let blob_ref = BlobRef::new(off, will_init);
|
let blob_ref = BlobRef::new(off, will_init);
|
||||||
|
|
||||||
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
|
let delta_key = DeltaKey::from_key_lsn(&key, lsn);
|
||||||
self.tree.append(&delta_key.0, blob_ref.0)?;
|
let res = self.tree.append(&delta_key.0, blob_ref.0);
|
||||||
|
(val, res.map_err(|e| anyhow::anyhow!(e)))
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn size(&self) -> u64 {
|
fn size(&self) -> u64 {
|
||||||
@@ -457,7 +461,8 @@ impl DeltaLayerWriterInner {
|
|||||||
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
|
||||||
.await?;
|
.await?;
|
||||||
for buf in block_buf.blocks {
|
for buf in block_buf.blocks {
|
||||||
file.write_all(buf.as_ref()).await?;
|
let (_buf, res) = file.write_all(buf).await;
|
||||||
|
res?;
|
||||||
}
|
}
|
||||||
assert!(self.lsn_range.start < self.lsn_range.end);
|
assert!(self.lsn_range.start < self.lsn_range.end);
|
||||||
// Fill in the summary on blk 0
|
// Fill in the summary on blk 0
|
||||||
@@ -472,17 +477,12 @@ impl DeltaLayerWriterInner {
|
|||||||
index_root_blk,
|
index_root_blk,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||||
|
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||||
Summary::ser_into(&summary, &mut buf)?;
|
Summary::ser_into(&summary, &mut buf)?;
|
||||||
if buf.spilled() {
|
|
||||||
// This is bad as we only have one free block for the summary
|
|
||||||
warn!(
|
|
||||||
"Used more than one page size for summary buffer: {}",
|
|
||||||
buf.len()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
file.seek(SeekFrom::Start(0)).await?;
|
file.seek(SeekFrom::Start(0)).await?;
|
||||||
file.write_all(&buf).await?;
|
let (_buf, res) = file.write_all(buf).await;
|
||||||
|
res?;
|
||||||
|
|
||||||
let metadata = file
|
let metadata = file
|
||||||
.metadata()
|
.metadata()
|
||||||
@@ -587,9 +587,9 @@ impl DeltaLayerWriter {
|
|||||||
&mut self,
|
&mut self,
|
||||||
key: Key,
|
key: Key,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
val: &[u8],
|
val: Vec<u8>,
|
||||||
will_init: bool,
|
will_init: bool,
|
||||||
) -> anyhow::Result<()> {
|
) -> (Vec<u8>, anyhow::Result<()>) {
|
||||||
self.inner
|
self.inner
|
||||||
.as_mut()
|
.as_mut()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
@@ -675,18 +675,12 @@ impl DeltaLayer {
|
|||||||
|
|
||||||
let new_summary = rewrite(actual_summary);
|
let new_summary = rewrite(actual_summary);
|
||||||
|
|
||||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||||
|
// TODO: could use smallvec here, but it's a pain with Slice<T>
|
||||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||||
if buf.spilled() {
|
|
||||||
// The code in DeltaLayerWriterInner just warn!()s for this.
|
|
||||||
// It should probably error out as well.
|
|
||||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
|
||||||
"Used more than one page size for summary buffer: {}",
|
|
||||||
buf.len()
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
file.seek(SeekFrom::Start(0)).await?;
|
file.seek(SeekFrom::Start(0)).await?;
|
||||||
file.write_all(&buf).await?;
|
let (_buf, res) = file.write_all(buf).await;
|
||||||
|
res?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -341,18 +341,12 @@ impl ImageLayer {
|
|||||||
|
|
||||||
let new_summary = rewrite(actual_summary);
|
let new_summary = rewrite(actual_summary);
|
||||||
|
|
||||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||||
|
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||||
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||||
if buf.spilled() {
|
|
||||||
// The code in ImageLayerWriterInner just warn!()s for this.
|
|
||||||
// It should probably error out as well.
|
|
||||||
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
|
||||||
"Used more than one page size for summary buffer: {}",
|
|
||||||
buf.len()
|
|
||||||
)));
|
|
||||||
}
|
|
||||||
file.seek(SeekFrom::Start(0)).await?;
|
file.seek(SeekFrom::Start(0)).await?;
|
||||||
file.write_all(&buf).await?;
|
let (_buf, res) = file.write_all(buf).await;
|
||||||
|
res?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -528,9 +522,11 @@ impl ImageLayerWriterInner {
|
|||||||
///
|
///
|
||||||
/// The page versions must be appended in blknum order.
|
/// The page versions must be appended in blknum order.
|
||||||
///
|
///
|
||||||
async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
|
||||||
ensure!(self.key_range.contains(&key));
|
ensure!(self.key_range.contains(&key));
|
||||||
let off = self.blob_writer.write_blob(img).await?;
|
let (_img, res) = self.blob_writer.write_blob(img).await;
|
||||||
|
// TODO: re-use the buffer for `img` further upstack
|
||||||
|
let off = res?;
|
||||||
|
|
||||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||||
key.write_to_byte_slice(&mut keybuf);
|
key.write_to_byte_slice(&mut keybuf);
|
||||||
@@ -553,7 +549,8 @@ impl ImageLayerWriterInner {
|
|||||||
.await?;
|
.await?;
|
||||||
let (index_root_blk, block_buf) = self.tree.finish()?;
|
let (index_root_blk, block_buf) = self.tree.finish()?;
|
||||||
for buf in block_buf.blocks {
|
for buf in block_buf.blocks {
|
||||||
file.write_all(buf.as_ref()).await?;
|
let (_buf, res) = file.write_all(buf).await;
|
||||||
|
res?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fill in the summary on blk 0
|
// Fill in the summary on blk 0
|
||||||
@@ -568,17 +565,12 @@ impl ImageLayerWriterInner {
|
|||||||
index_root_blk,
|
index_root_blk,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
let mut buf = Vec::with_capacity(PAGE_SZ);
|
||||||
|
// TODO: could use smallvec here but it's a pain with Slice<T>
|
||||||
Summary::ser_into(&summary, &mut buf)?;
|
Summary::ser_into(&summary, &mut buf)?;
|
||||||
if buf.spilled() {
|
|
||||||
// This is bad as we only have one free block for the summary
|
|
||||||
warn!(
|
|
||||||
"Used more than one page size for summary buffer: {}",
|
|
||||||
buf.len()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
file.seek(SeekFrom::Start(0)).await?;
|
file.seek(SeekFrom::Start(0)).await?;
|
||||||
file.write_all(&buf).await?;
|
let (_buf, res) = file.write_all(buf).await;
|
||||||
|
res?;
|
||||||
|
|
||||||
let metadata = file
|
let metadata = file
|
||||||
.metadata()
|
.metadata()
|
||||||
@@ -659,7 +651,7 @@ impl ImageLayerWriter {
|
|||||||
///
|
///
|
||||||
/// The page versions must be appended in blknum order.
|
/// The page versions must be appended in blknum order.
|
||||||
///
|
///
|
||||||
pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
|
pub async fn put_image(&mut self, key: Key, img: Bytes) -> anyhow::Result<()> {
|
||||||
self.inner.as_mut().unwrap().put_image(key, img).await
|
self.inner.as_mut().unwrap().put_image(key, img).await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -383,9 +383,11 @@ impl InMemoryLayer {
|
|||||||
for (lsn, pos) in vec_map.as_slice() {
|
for (lsn, pos) in vec_map.as_slice() {
|
||||||
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
|
||||||
let will_init = Value::des(&buf)?.will_init();
|
let will_init = Value::des(&buf)?.will_init();
|
||||||
delta_layer_writer
|
let res;
|
||||||
.put_value_bytes(key, *lsn, &buf, will_init)
|
(buf, res) = delta_layer_writer
|
||||||
.await?;
|
.put_value_bytes(key, *lsn, buf, will_init)
|
||||||
|
.await;
|
||||||
|
res?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -300,8 +300,8 @@ impl Layer {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||||
self.0.info(reset).await
|
self.0.info(reset)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
||||||
@@ -612,10 +612,10 @@ impl LayerInner {
|
|||||||
let mut rx = self.status.subscribe();
|
let mut rx = self.status.subscribe();
|
||||||
|
|
||||||
let strong = {
|
let strong = {
|
||||||
match self.inner.get_mut().await {
|
match self.inner.get() {
|
||||||
Some(mut either) => {
|
Some(mut either) => {
|
||||||
self.wanted_evicted.store(true, Ordering::Relaxed);
|
self.wanted_evicted.store(true, Ordering::Relaxed);
|
||||||
ResidentOrWantedEvicted::downgrade(&mut either)
|
either.downgrade()
|
||||||
}
|
}
|
||||||
None => return Err(EvictionError::NotFound),
|
None => return Err(EvictionError::NotFound),
|
||||||
}
|
}
|
||||||
@@ -641,7 +641,7 @@ impl LayerInner {
|
|||||||
// use however late (compared to the initial expressing of wanted) as the
|
// use however late (compared to the initial expressing of wanted) as the
|
||||||
// "outcome" now
|
// "outcome" now
|
||||||
LAYER_IMPL_METRICS.inc_broadcast_lagged();
|
LAYER_IMPL_METRICS.inc_broadcast_lagged();
|
||||||
match self.inner.get_mut().await {
|
match self.inner.get() {
|
||||||
Some(_) => Err(EvictionError::Downloaded),
|
Some(_) => Err(EvictionError::Downloaded),
|
||||||
None => Ok(()),
|
None => Ok(()),
|
||||||
}
|
}
|
||||||
@@ -759,7 +759,7 @@ impl LayerInner {
|
|||||||
// use the already held initialization permit because it is impossible to hit the
|
// use the already held initialization permit because it is impossible to hit the
|
||||||
// below paths anymore essentially limiting the max loop iterations to 2.
|
// below paths anymore essentially limiting the max loop iterations to 2.
|
||||||
let (value, init_permit) = download(init_permit).await?;
|
let (value, init_permit) = download(init_permit).await?;
|
||||||
let mut guard = self.inner.set(value, init_permit).await;
|
let mut guard = self.inner.set(value, init_permit);
|
||||||
let (strong, _upgraded) = guard
|
let (strong, _upgraded) = guard
|
||||||
.get_and_upgrade()
|
.get_and_upgrade()
|
||||||
.expect("init creates strong reference, we held the init permit");
|
.expect("init creates strong reference, we held the init permit");
|
||||||
@@ -767,7 +767,7 @@ impl LayerInner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let (weak, permit) = {
|
let (weak, permit) = {
|
||||||
let mut locked = self.inner.get_mut_or_init(download).await?;
|
let mut locked = self.inner.get_or_init(download).await?;
|
||||||
|
|
||||||
if let Some((strong, upgraded)) = locked.get_and_upgrade() {
|
if let Some((strong, upgraded)) = locked.get_and_upgrade() {
|
||||||
if upgraded {
|
if upgraded {
|
||||||
@@ -989,12 +989,12 @@ impl LayerInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
|
||||||
let layer_file_name = self.desc.filename().file_name();
|
let layer_file_name = self.desc.filename().file_name();
|
||||||
|
|
||||||
// this is not accurate: we could have the file locally but there was a cancellation
|
// this is not accurate: we could have the file locally but there was a cancellation
|
||||||
// and now we are not in sync, or we are currently downloading it.
|
// and now we are not in sync, or we are currently downloading it.
|
||||||
let remote = self.inner.get_mut().await.is_none();
|
let remote = self.inner.get().is_none();
|
||||||
|
|
||||||
let access_stats = self.access_stats.as_api_model(reset);
|
let access_stats = self.access_stats.as_api_model(reset);
|
||||||
|
|
||||||
@@ -1053,7 +1053,7 @@ impl LayerInner {
|
|||||||
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
|
match this.evict_blocking(version) {
|
||||||
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
|
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
|
||||||
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
|
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
|
||||||
}
|
}
|
||||||
@@ -1061,7 +1061,7 @@ impl LayerInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
|
fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
|
||||||
// deleted or detached timeline, don't do anything.
|
// deleted or detached timeline, don't do anything.
|
||||||
let Some(timeline) = self.timeline.upgrade() else {
|
let Some(timeline) = self.timeline.upgrade() else {
|
||||||
return Err(EvictionCancelled::TimelineGone);
|
return Err(EvictionCancelled::TimelineGone);
|
||||||
@@ -1070,7 +1070,7 @@ impl LayerInner {
|
|||||||
// to avoid starting a new download while we evict, keep holding on to the
|
// to avoid starting a new download while we evict, keep holding on to the
|
||||||
// permit.
|
// permit.
|
||||||
let _permit = {
|
let _permit = {
|
||||||
let maybe_downloaded = self.inner.get_mut().await;
|
let maybe_downloaded = self.inner.get();
|
||||||
|
|
||||||
let (_weak, permit) = match maybe_downloaded {
|
let (_weak, permit) = match maybe_downloaded {
|
||||||
Some(mut guard) => {
|
Some(mut guard) => {
|
||||||
@@ -1413,10 +1413,6 @@ impl ResidentLayer {
|
|||||||
&self.owner.0.path
|
&self.owner.0.path
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
|
|
||||||
self.owner.access_stats()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
pub(crate) fn metadata(&self) -> LayerFileMetadata {
|
||||||
self.owner.metadata()
|
self.owner.metadata()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -12,7 +12,9 @@ use bytes::Bytes;
|
|||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use enumset::EnumSet;
|
use enumset::EnumSet;
|
||||||
use fail::fail_point;
|
use fail::fail_point;
|
||||||
|
use futures::stream::StreamExt;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
use pageserver_api::{
|
use pageserver_api::{
|
||||||
keyspace::{key_range_size, KeySpaceAccum},
|
keyspace::{key_range_size, KeySpaceAccum},
|
||||||
models::{
|
models::{
|
||||||
@@ -33,17 +35,22 @@ use tokio_util::sync::CancellationToken;
|
|||||||
use tracing::*;
|
use tracing::*;
|
||||||
use utils::sync::gate::Gate;
|
use utils::sync::gate::Gate;
|
||||||
|
|
||||||
use std::collections::{BTreeMap, BinaryHeap, HashMap, HashSet};
|
|
||||||
use std::ops::{Deref, Range};
|
use std::ops::{Deref, Range};
|
||||||
use std::pin::pin;
|
use std::pin::pin;
|
||||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||||
use std::time::{Duration, Instant, SystemTime};
|
use std::time::{Duration, Instant, SystemTime};
|
||||||
|
use std::{
|
||||||
|
array,
|
||||||
|
collections::{BTreeMap, BinaryHeap, HashMap, HashSet},
|
||||||
|
sync::atomic::AtomicU64,
|
||||||
|
};
|
||||||
use std::{
|
use std::{
|
||||||
cmp::{max, min, Ordering},
|
cmp::{max, min, Ordering},
|
||||||
ops::ControlFlow,
|
ops::ControlFlow,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use crate::pgdatadir_mapping::DirectoryKind;
|
||||||
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
|
||||||
use crate::tenant::{
|
use crate::tenant::{
|
||||||
layer_map::{LayerMap, SearchResult},
|
layer_map::{LayerMap, SearchResult},
|
||||||
@@ -105,7 +112,7 @@ use self::logical_size::LogicalSize;
|
|||||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||||
|
|
||||||
use super::config::TenantConf;
|
use super::config::TenantConf;
|
||||||
use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
|
use super::remote_timeline_client::index::IndexPart;
|
||||||
use super::remote_timeline_client::RemoteTimelineClient;
|
use super::remote_timeline_client::RemoteTimelineClient;
|
||||||
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
|
||||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||||
@@ -257,6 +264,8 @@ pub struct Timeline {
|
|||||||
// in `crate::page_service` writes these metrics.
|
// in `crate::page_service` writes these metrics.
|
||||||
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
|
pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
|
||||||
|
|
||||||
|
directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],
|
||||||
|
|
||||||
/// Ensures layers aren't frozen by checkpointer between
|
/// Ensures layers aren't frozen by checkpointer between
|
||||||
/// [`Timeline::get_layer_for_write`] and layer reads.
|
/// [`Timeline::get_layer_for_write`] and layer reads.
|
||||||
/// Locked automatically by [`TimelineWriter`] and checkpointer.
|
/// Locked automatically by [`TimelineWriter`] and checkpointer.
|
||||||
@@ -789,6 +798,10 @@ impl Timeline {
|
|||||||
self.metrics.resident_physical_size_get()
|
self.metrics.resident_physical_size_get()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] {
|
||||||
|
array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed))
|
||||||
|
}
|
||||||
|
|
||||||
///
|
///
|
||||||
/// Wait until WAL has been received and processed up to this LSN.
|
/// Wait until WAL has been received and processed up to this LSN.
|
||||||
///
|
///
|
||||||
@@ -1268,7 +1281,7 @@ impl Timeline {
|
|||||||
let mut historic_layers = Vec::new();
|
let mut historic_layers = Vec::new();
|
||||||
for historic_layer in layer_map.iter_historic_layers() {
|
for historic_layer in layer_map.iter_historic_layers() {
|
||||||
let historic_layer = guard.get_from_desc(&historic_layer);
|
let historic_layer = guard.get_from_desc(&historic_layer);
|
||||||
historic_layers.push(historic_layer.info(reset).await);
|
historic_layers.push(historic_layer.info(reset));
|
||||||
}
|
}
|
||||||
|
|
||||||
LayerMapInfo {
|
LayerMapInfo {
|
||||||
@@ -1458,7 +1471,7 @@ impl Timeline {
|
|||||||
generation,
|
generation,
|
||||||
shard_identity,
|
shard_identity,
|
||||||
pg_version,
|
pg_version,
|
||||||
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
|
layers: Default::default(),
|
||||||
wanted_image_layers: Mutex::new(None),
|
wanted_image_layers: Mutex::new(None),
|
||||||
|
|
||||||
walredo_mgr,
|
walredo_mgr,
|
||||||
@@ -1495,6 +1508,8 @@ impl Timeline {
|
|||||||
&timeline_id,
|
&timeline_id,
|
||||||
),
|
),
|
||||||
|
|
||||||
|
directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
|
||||||
|
|
||||||
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
|
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
|
||||||
|
|
||||||
layer_flush_start_tx,
|
layer_flush_start_tx,
|
||||||
@@ -2263,6 +2278,29 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
|
||||||
|
self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
|
||||||
|
let aux_metric =
|
||||||
|
self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
|
||||||
|
|
||||||
|
let sum_of_entries = self
|
||||||
|
.directory_metrics
|
||||||
|
.iter()
|
||||||
|
.map(|v| v.load(AtomicOrdering::Relaxed))
|
||||||
|
.sum();
|
||||||
|
// Set a high general threshold and a lower threshold for the auxiliary files,
|
||||||
|
// as we can have large numbers of relations in the db directory.
|
||||||
|
const SUM_THRESHOLD: u64 = 5000;
|
||||||
|
const AUX_THRESHOLD: u64 = 1000;
|
||||||
|
if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
|
||||||
|
self.metrics
|
||||||
|
.directory_entries_count_gauge
|
||||||
|
.set(sum_of_entries);
|
||||||
|
} else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
|
||||||
|
metric.set(sum_of_entries);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
|
async fn find_layer(&self, layer_file_name: &str) -> Option<Layer> {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
for historic_layer in guard.layer_map().iter_historic_layers() {
|
for historic_layer in guard.layer_map().iter_historic_layers() {
|
||||||
@@ -2283,45 +2321,28 @@ impl Timeline {
|
|||||||
/// should treat this as a cue to simply skip doing any heatmap uploading
|
/// should treat this as a cue to simply skip doing any heatmap uploading
|
||||||
/// for this timeline.
|
/// for this timeline.
|
||||||
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
|
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
|
||||||
let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
|
// no point in heatmaps without remote client
|
||||||
|
let _remote_client = self.remote_client.as_ref()?;
|
||||||
|
|
||||||
let remote_client = match &self.remote_client {
|
if !self.is_active() {
|
||||||
Some(c) => c,
|
return None;
|
||||||
None => return None,
|
}
|
||||||
};
|
|
||||||
|
|
||||||
let layer_file_names = eviction_info
|
let guard = self.layers.read().await;
|
||||||
.resident_layers
|
|
||||||
.iter()
|
|
||||||
.map(|l| l.layer.get_name())
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
let decorated = match remote_client.get_layers_metadata(layer_file_names) {
|
let resident = guard.resident_layers().map(|layer| {
|
||||||
Ok(d) => d,
|
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||||
Err(_) => {
|
|
||||||
// Getting metadata only fails on Timeline in bad state.
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let heatmap_layers = std::iter::zip(
|
HeatMapLayer::new(
|
||||||
eviction_info.resident_layers.into_iter(),
|
layer.layer_desc().filename(),
|
||||||
decorated.into_iter(),
|
layer.metadata().into(),
|
||||||
)
|
last_activity_ts,
|
||||||
.filter_map(|(layer, remote_info)| {
|
)
|
||||||
remote_info.map(|remote_info| {
|
|
||||||
HeatMapLayer::new(
|
|
||||||
layer.layer.get_name(),
|
|
||||||
IndexLayerMetadata::from(remote_info),
|
|
||||||
layer.last_activity_ts,
|
|
||||||
)
|
|
||||||
})
|
|
||||||
});
|
});
|
||||||
|
|
||||||
Some(HeatMapTimeline::new(
|
let layers = resident.collect().await;
|
||||||
self.timeline_id,
|
|
||||||
heatmap_layers.collect(),
|
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||||
))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3328,7 +3349,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
image_layer_writer.put_image(img_key, &img).await?;
|
image_layer_writer.put_image(img_key, img).await?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -4662,41 +4683,24 @@ impl Timeline {
|
|||||||
/// Returns non-remote layers for eviction.
|
/// Returns non-remote layers for eviction.
|
||||||
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||||
let guard = self.layers.read().await;
|
let guard = self.layers.read().await;
|
||||||
let layers = guard.layer_map();
|
|
||||||
|
|
||||||
let mut max_layer_size: Option<u64> = None;
|
let mut max_layer_size: Option<u64> = None;
|
||||||
let mut resident_layers = Vec::new();
|
|
||||||
|
|
||||||
for l in layers.iter_historic_layers() {
|
let resident_layers = guard
|
||||||
let file_size = l.file_size();
|
.resident_layers()
|
||||||
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
.map(|layer| {
|
||||||
|
let file_size = layer.layer_desc().file_size;
|
||||||
|
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
|
||||||
|
|
||||||
let l = guard.get_from_desc(&l);
|
let last_activity_ts = layer.access_stats().latest_activity_or_now();
|
||||||
|
|
||||||
let l = match l.keep_resident().await {
|
EvictionCandidate {
|
||||||
Ok(Some(l)) => l,
|
layer: layer.into(),
|
||||||
Ok(None) => continue,
|
last_activity_ts,
|
||||||
Err(e) => {
|
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
||||||
// these should not happen, but we cannot make them statically impossible right
|
|
||||||
// now.
|
|
||||||
tracing::warn!(layer=%l, "failed to keep the layer resident: {e:#}");
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
};
|
})
|
||||||
|
.collect()
|
||||||
let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| {
|
.await;
|
||||||
// We only use this fallback if there's an implementation error.
|
|
||||||
// `latest_activity` already does rate-limited warn!() log.
|
|
||||||
debug!(layer=%l, "last_activity returns None, using SystemTime::now");
|
|
||||||
SystemTime::now()
|
|
||||||
});
|
|
||||||
|
|
||||||
resident_layers.push(EvictionCandidate {
|
|
||||||
layer: l.drop_eviction_guard().into(),
|
|
||||||
last_activity_ts,
|
|
||||||
relative_last_activity: finite_f32::FiniteF32::ZERO,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
DiskUsageEvictionInfo {
|
DiskUsageEvictionInfo {
|
||||||
max_layer_size,
|
max_layer_size,
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use std::{
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||||
use tokio::sync::OwnedMutexGuard;
|
use tokio::sync::OwnedMutexGuard;
|
||||||
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
|
use tracing::{debug, error, info, instrument, warn, Instrument};
|
||||||
use utils::{crashsafe, fs_ext, id::TimelineId};
|
use utils::{crashsafe, fs_ext, id::TimelineId};
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -541,12 +541,7 @@ impl DeleteTimelineFlow {
|
|||||||
};
|
};
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
.instrument({
|
.instrument(tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id)),
|
||||||
let span =
|
|
||||||
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
|
|
||||||
span.follows_from(Span::current());
|
|
||||||
span
|
|
||||||
}),
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -239,12 +239,7 @@ impl Timeline {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
|
let last_activity_ts = hist_layer.access_stats().latest_activity_or_now();
|
||||||
// We only use this fallback if there's an implementation error.
|
|
||||||
// `latest_activity` already does rate-limited warn!() log.
|
|
||||||
debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
|
|
||||||
SystemTime::now()
|
|
||||||
});
|
|
||||||
|
|
||||||
let no_activity_for = match now.duration_since(last_activity_ts) {
|
let no_activity_for = match now.duration_since(last_activity_ts) {
|
||||||
Ok(d) => d,
|
Ok(d) => d,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
|
use futures::StreamExt;
|
||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use std::{collections::HashMap, sync::Arc};
|
use std::{collections::HashMap, sync::Arc};
|
||||||
use tracing::trace;
|
use tracing::trace;
|
||||||
@@ -20,19 +21,13 @@ use crate::{
|
|||||||
};
|
};
|
||||||
|
|
||||||
/// Provides semantic APIs to manipulate the layer map.
|
/// Provides semantic APIs to manipulate the layer map.
|
||||||
|
#[derive(Default)]
|
||||||
pub(crate) struct LayerManager {
|
pub(crate) struct LayerManager {
|
||||||
layer_map: LayerMap,
|
layer_map: LayerMap,
|
||||||
layer_fmgr: LayerFileManager<Layer>,
|
layer_fmgr: LayerFileManager<Layer>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LayerManager {
|
impl LayerManager {
|
||||||
pub(crate) fn create() -> Self {
|
|
||||||
Self {
|
|
||||||
layer_map: LayerMap::default(),
|
|
||||||
layer_fmgr: LayerFileManager::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
|
||||||
self.layer_fmgr.get_from_desc(desc)
|
self.layer_fmgr.get_from_desc(desc)
|
||||||
}
|
}
|
||||||
@@ -246,6 +241,32 @@ impl LayerManager {
|
|||||||
layer.delete_on_drop();
|
layer.delete_on_drop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn resident_layers(&self) -> impl futures::stream::Stream<Item = Layer> + '_ {
|
||||||
|
// for small layer maps, we most likely have all resident, but for larger more are likely
|
||||||
|
// to be evicted assuming lots of layers correlated with longer lifespan.
|
||||||
|
|
||||||
|
let layers = self
|
||||||
|
.layer_map()
|
||||||
|
.iter_historic_layers()
|
||||||
|
.map(|desc| self.get_from_desc(&desc));
|
||||||
|
|
||||||
|
let layers = futures::stream::iter(layers);
|
||||||
|
|
||||||
|
layers.filter_map(|layer| async move {
|
||||||
|
// TODO(#6028): this query does not really need to see the ResidentLayer
|
||||||
|
match layer.keep_resident().await {
|
||||||
|
Ok(Some(layer)) => Some(layer.drop_eviction_guard()),
|
||||||
|
Ok(None) => None,
|
||||||
|
Err(e) => {
|
||||||
|
// these should not happen, but we cannot make them statically impossible right
|
||||||
|
// now.
|
||||||
|
tracing::warn!(%layer, "failed to keep the layer resident: {e:#}");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
pub(crate) fn contains(&self, layer: &Layer) -> bool {
|
||||||
self.layer_fmgr.contains(layer)
|
self.layer_fmgr.contains(layer)
|
||||||
}
|
}
|
||||||
@@ -253,6 +274,12 @@ impl LayerManager {
|
|||||||
|
|
||||||
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
|
||||||
|
|
||||||
|
impl<T> Default for LayerFileManager<T> {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self(HashMap::default())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
||||||
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
fn get_from_desc(&self, desc: &PersistentLayerDesc) -> T {
|
||||||
// The assumption for the `expect()` is that all code maintains the following invariant:
|
// The assumption for the `expect()` is that all code maintains the following invariant:
|
||||||
@@ -275,10 +302,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
|
|||||||
self.0.contains_key(&layer.layer_desc().key())
|
self.0.contains_key(&layer.layer_desc().key())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn new() -> Self {
|
|
||||||
Self(HashMap::new())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn remove(&mut self, layer: &T) {
|
pub(crate) fn remove(&mut self, layer: &T) {
|
||||||
let present = self.0.remove(&layer.layer_desc().key());
|
let present = self.0.remove(&layer.layer_desc().key());
|
||||||
if present.is_none() && cfg!(debug_assertions) {
|
if present.is_none() && cfg!(debug_assertions) {
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ use once_cell::sync::OnceCell;
|
|||||||
use pageserver_api::shard::TenantShardId;
|
use pageserver_api::shard::TenantShardId;
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||||
use tokio_epoll_uring::IoBufMut;
|
use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
|
||||||
|
|
||||||
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
|
use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd};
|
||||||
use std::os::unix::fs::FileExt;
|
use std::os::unix::fs::FileExt;
|
||||||
@@ -410,10 +410,10 @@ impl VirtualFile {
|
|||||||
/// step, the tmp path is renamed to the final path. As renames are
|
/// step, the tmp path is renamed to the final path. As renames are
|
||||||
/// atomic, a crash during the write operation will never leave behind a
|
/// atomic, a crash during the write operation will never leave behind a
|
||||||
/// partially written file.
|
/// partially written file.
|
||||||
pub async fn crashsafe_overwrite(
|
pub async fn crashsafe_overwrite<B: BoundedBuf>(
|
||||||
final_path: &Utf8Path,
|
final_path: &Utf8Path,
|
||||||
tmp_path: &Utf8Path,
|
tmp_path: &Utf8Path,
|
||||||
content: &[u8],
|
content: B,
|
||||||
) -> std::io::Result<()> {
|
) -> std::io::Result<()> {
|
||||||
let Some(final_path_parent) = final_path.parent() else {
|
let Some(final_path_parent) = final_path.parent() else {
|
||||||
return Err(std::io::Error::from_raw_os_error(
|
return Err(std::io::Error::from_raw_os_error(
|
||||||
@@ -430,7 +430,8 @@ impl VirtualFile {
|
|||||||
.create_new(true),
|
.create_new(true),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
file.write_all(content).await?;
|
let (_content, res) = file.write_all(content).await;
|
||||||
|
res?;
|
||||||
file.sync_all().await?;
|
file.sync_all().await?;
|
||||||
drop(file); // before the rename, that's important!
|
drop(file); // before the rename, that's important!
|
||||||
// renames are atomic
|
// renames are atomic
|
||||||
@@ -601,23 +602,36 @@ impl VirtualFile {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
|
/// Writes `buf.slice(0..buf.bytes_init())`.
|
||||||
|
/// Returns the IoBuf that is underlying the BoundedBuf `buf`.
|
||||||
|
/// I.e., the returned value's `bytes_init()` method returns something different than the `bytes_init()` that was passed in.
|
||||||
|
/// It's quite brittle and easy to mis-use, so, we return the size in the Ok() variant.
|
||||||
|
pub async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> (B::Buf, Result<usize, Error>) {
|
||||||
|
let nbytes = buf.bytes_init();
|
||||||
|
if nbytes == 0 {
|
||||||
|
return (Slice::into_inner(buf.slice_full()), Ok(0));
|
||||||
|
}
|
||||||
|
let mut buf = buf.slice(0..nbytes);
|
||||||
while !buf.is_empty() {
|
while !buf.is_empty() {
|
||||||
match self.write(buf).await {
|
// TODO: push `Slice` further down
|
||||||
|
match self.write(&buf).await {
|
||||||
Ok(0) => {
|
Ok(0) => {
|
||||||
return Err(Error::new(
|
return (
|
||||||
std::io::ErrorKind::WriteZero,
|
Slice::into_inner(buf),
|
||||||
"failed to write whole buffer",
|
Err(Error::new(
|
||||||
));
|
std::io::ErrorKind::WriteZero,
|
||||||
|
"failed to write whole buffer",
|
||||||
|
)),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
Ok(n) => {
|
Ok(n) => {
|
||||||
buf = &buf[n..];
|
buf = buf.slice(n..);
|
||||||
}
|
}
|
||||||
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
|
||||||
Err(e) => return Err(e),
|
Err(e) => return (Slice::into_inner(buf), Err(e)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
(Slice::into_inner(buf), Ok(nbytes))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
async fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||||
@@ -676,7 +690,6 @@ where
|
|||||||
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
|
F: FnMut(tokio_epoll_uring::Slice<B>, u64) -> Fut,
|
||||||
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
|
Fut: std::future::Future<Output = (tokio_epoll_uring::Slice<B>, std::io::Result<usize>)>,
|
||||||
{
|
{
|
||||||
use tokio_epoll_uring::BoundedBuf;
|
|
||||||
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
|
let mut buf: tokio_epoll_uring::Slice<B> = buf.slice_full(); // includes all the uninitialized memory
|
||||||
while buf.bytes_total() != 0 {
|
while buf.bytes_total() != 0 {
|
||||||
let res;
|
let res;
|
||||||
@@ -1063,10 +1076,19 @@ mod tests {
|
|||||||
MaybeVirtualFile::File(file) => file.seek(pos),
|
MaybeVirtualFile::File(file) => file.seek(pos),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
async fn write_all(&mut self, buf: &[u8]) -> Result<(), Error> {
|
async fn write_all<B: BoundedBuf>(&mut self, buf: B) -> Result<(), Error> {
|
||||||
match self {
|
match self {
|
||||||
MaybeVirtualFile::VirtualFile(file) => file.write_all(buf).await,
|
MaybeVirtualFile::VirtualFile(file) => {
|
||||||
MaybeVirtualFile::File(file) => file.write_all(buf),
|
let (_buf, res) = file.write_all(buf).await;
|
||||||
|
res.map(|_| ())
|
||||||
|
}
|
||||||
|
MaybeVirtualFile::File(file) => {
|
||||||
|
let buf_len = buf.bytes_init();
|
||||||
|
if buf_len == 0 {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
file.write_all(&buf.slice(0..buf_len))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1141,7 +1163,7 @@ mod tests {
|
|||||||
.to_owned(),
|
.to_owned(),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
file_a.write_all(b"foobar").await?;
|
file_a.write_all(b"foobar".to_vec()).await?;
|
||||||
|
|
||||||
// cannot read from a file opened in write-only mode
|
// cannot read from a file opened in write-only mode
|
||||||
let _ = file_a.read_string().await.unwrap_err();
|
let _ = file_a.read_string().await.unwrap_err();
|
||||||
@@ -1150,7 +1172,7 @@ mod tests {
|
|||||||
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
|
let mut file_a = openfunc(path_a, OpenOptions::new().read(true).to_owned()).await?;
|
||||||
|
|
||||||
// cannot write to a file opened in read-only mode
|
// cannot write to a file opened in read-only mode
|
||||||
let _ = file_a.write_all(b"bar").await.unwrap_err();
|
let _ = file_a.write_all(b"bar".to_vec()).await.unwrap_err();
|
||||||
|
|
||||||
// Try simple read
|
// Try simple read
|
||||||
assert_eq!("foobar", file_a.read_string().await?);
|
assert_eq!("foobar", file_a.read_string().await?);
|
||||||
@@ -1293,7 +1315,7 @@ mod tests {
|
|||||||
let path = testdir.join("myfile");
|
let path = testdir.join("myfile");
|
||||||
let tmp_path = testdir.join("myfile.tmp");
|
let tmp_path = testdir.join("myfile.tmp");
|
||||||
|
|
||||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
|
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
|
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
|
||||||
@@ -1302,7 +1324,7 @@ mod tests {
|
|||||||
assert!(!tmp_path.exists());
|
assert!(!tmp_path.exists());
|
||||||
drop(file);
|
drop(file);
|
||||||
|
|
||||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar")
|
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"bar".to_vec())
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
|
let mut file = MaybeVirtualFile::from(VirtualFile::open(&path).await.unwrap());
|
||||||
@@ -1324,7 +1346,7 @@ mod tests {
|
|||||||
std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
|
std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap();
|
||||||
assert!(tmp_path.exists());
|
assert!(tmp_path.exists());
|
||||||
|
|
||||||
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo")
|
VirtualFile::crashsafe_overwrite(&path, &tmp_path, b"foo".to_vec())
|
||||||
.await
|
.await
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
|||||||
@@ -346,7 +346,7 @@ impl WalIngest {
|
|||||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||||
|
|
||||||
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
||||||
let xlrec = XlLogicalMessage::decode(&mut buf);
|
let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf);
|
||||||
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
||||||
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
|
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
|
||||||
if prefix == "neon-test" {
|
if prefix == "neon-test" {
|
||||||
|
|||||||
@@ -314,6 +314,9 @@ lfc_change_limit_hook(int newval, void *extra)
|
|||||||
lfc_ctl->used -= 1;
|
lfc_ctl->used -= 1;
|
||||||
}
|
}
|
||||||
lfc_ctl->limit = new_size;
|
lfc_ctl->limit = new_size;
|
||||||
|
if (new_size == 0) {
|
||||||
|
lfc_ctl->generation += 1;
|
||||||
|
}
|
||||||
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
|
neon_log(DEBUG1, "set local file cache limit to %d", new_size);
|
||||||
|
|
||||||
LWLockRelease(lfc_lock);
|
LWLockRelease(lfc_lock);
|
||||||
|
|||||||
133
pgxn/neon/neon.c
133
pgxn/neon/neon.c
@@ -11,16 +11,23 @@
|
|||||||
#include "postgres.h"
|
#include "postgres.h"
|
||||||
#include "fmgr.h"
|
#include "fmgr.h"
|
||||||
|
|
||||||
|
#include "miscadmin.h"
|
||||||
#include "access/xact.h"
|
#include "access/xact.h"
|
||||||
#include "access/xlog.h"
|
#include "access/xlog.h"
|
||||||
#include "storage/buf_internals.h"
|
#include "storage/buf_internals.h"
|
||||||
#include "storage/bufmgr.h"
|
#include "storage/bufmgr.h"
|
||||||
#include "catalog/pg_type.h"
|
#include "catalog/pg_type.h"
|
||||||
|
#include "postmaster/bgworker.h"
|
||||||
|
#include "postmaster/interrupt.h"
|
||||||
|
#include "replication/slot.h"
|
||||||
#include "replication/walsender.h"
|
#include "replication/walsender.h"
|
||||||
|
#include "storage/procsignal.h"
|
||||||
|
#include "tcop/tcopprot.h"
|
||||||
#include "funcapi.h"
|
#include "funcapi.h"
|
||||||
#include "access/htup_details.h"
|
#include "access/htup_details.h"
|
||||||
#include "utils/pg_lsn.h"
|
#include "utils/pg_lsn.h"
|
||||||
#include "utils/guc.h"
|
#include "utils/guc.h"
|
||||||
|
#include "utils/wait_event.h"
|
||||||
|
|
||||||
#include "neon.h"
|
#include "neon.h"
|
||||||
#include "walproposer.h"
|
#include "walproposer.h"
|
||||||
@@ -30,6 +37,130 @@
|
|||||||
PG_MODULE_MAGIC;
|
PG_MODULE_MAGIC;
|
||||||
void _PG_init(void);
|
void _PG_init(void);
|
||||||
|
|
||||||
|
static int logical_replication_max_time_lag = 3600;
|
||||||
|
|
||||||
|
static void
|
||||||
|
InitLogicalReplicationMonitor(void)
|
||||||
|
{
|
||||||
|
BackgroundWorker bgw;
|
||||||
|
|
||||||
|
DefineCustomIntVariable(
|
||||||
|
"neon.logical_replication_max_time_lag",
|
||||||
|
"Threshold for dropping unused logical replication slots",
|
||||||
|
NULL,
|
||||||
|
&logical_replication_max_time_lag,
|
||||||
|
3600, 0, INT_MAX,
|
||||||
|
PGC_SIGHUP,
|
||||||
|
GUC_UNIT_S,
|
||||||
|
NULL, NULL, NULL);
|
||||||
|
|
||||||
|
memset(&bgw, 0, sizeof(bgw));
|
||||||
|
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
||||||
|
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
|
||||||
|
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
|
||||||
|
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain");
|
||||||
|
snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor");
|
||||||
|
snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor");
|
||||||
|
bgw.bgw_restart_time = 5;
|
||||||
|
bgw.bgw_notify_pid = 0;
|
||||||
|
bgw.bgw_main_arg = (Datum) 0;
|
||||||
|
|
||||||
|
RegisterBackgroundWorker(&bgw);
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
NameData name;
|
||||||
|
bool dropped;
|
||||||
|
XLogRecPtr confirmed_flush_lsn;
|
||||||
|
TimestampTz last_updated;
|
||||||
|
} SlotStatus;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Unused logical replication slots pins WAL and prevents deletion of snapshots.
|
||||||
|
*/
|
||||||
|
PGDLLEXPORT void
|
||||||
|
LogicalSlotsMonitorMain(Datum main_arg)
|
||||||
|
{
|
||||||
|
SlotStatus* slots;
|
||||||
|
TimestampTz now, last_checked;
|
||||||
|
|
||||||
|
/* Establish signal handlers. */
|
||||||
|
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||||
|
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||||
|
pqsignal(SIGTERM, die);
|
||||||
|
|
||||||
|
BackgroundWorkerUnblockSignals();
|
||||||
|
|
||||||
|
slots = (SlotStatus*)calloc(max_replication_slots, sizeof(SlotStatus));
|
||||||
|
last_checked = GetCurrentTimestamp();
|
||||||
|
|
||||||
|
for (;;)
|
||||||
|
{
|
||||||
|
(void) WaitLatch(MyLatch,
|
||||||
|
WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT,
|
||||||
|
logical_replication_max_time_lag*1000/2,
|
||||||
|
PG_WAIT_EXTENSION);
|
||||||
|
ResetLatch(MyLatch);
|
||||||
|
CHECK_FOR_INTERRUPTS();
|
||||||
|
|
||||||
|
now = GetCurrentTimestamp();
|
||||||
|
|
||||||
|
if (now - last_checked > logical_replication_max_time_lag*USECS_PER_SEC)
|
||||||
|
{
|
||||||
|
int n_active_slots = 0;
|
||||||
|
last_checked = now;
|
||||||
|
|
||||||
|
LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
|
||||||
|
for (int i = 0; i < max_replication_slots; i++)
|
||||||
|
{
|
||||||
|
ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
|
||||||
|
|
||||||
|
/* Consider only logical repliction slots */
|
||||||
|
if (!s->in_use || !SlotIsLogical(s))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (s->active_pid != 0)
|
||||||
|
{
|
||||||
|
n_active_slots += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check if there was some activity with the slot since last check */
|
||||||
|
if (s->data.confirmed_flush != slots[i].confirmed_flush_lsn)
|
||||||
|
{
|
||||||
|
slots[i].confirmed_flush_lsn = s->data.confirmed_flush;
|
||||||
|
slots[i].last_updated = now;
|
||||||
|
}
|
||||||
|
else if (now - slots[i].last_updated > logical_replication_max_time_lag*USECS_PER_SEC)
|
||||||
|
{
|
||||||
|
slots[i].name = s->data.name;
|
||||||
|
slots[i].dropped = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LWLockRelease(ReplicationSlotControlLock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If there are no active subscriptions, then no new snapshots are generated
|
||||||
|
* and so no need to force slot deletion.
|
||||||
|
*/
|
||||||
|
if (n_active_slots != 0)
|
||||||
|
{
|
||||||
|
for (int i = 0; i < max_replication_slots; i++)
|
||||||
|
{
|
||||||
|
if (slots[i].dropped)
|
||||||
|
{
|
||||||
|
elog(LOG, "Drop logical replication slot because it was not update more than %ld seconds",
|
||||||
|
(now - slots[i].last_updated)/USECS_PER_SEC);
|
||||||
|
ReplicationSlotDrop(slots[i].name.data, true);
|
||||||
|
slots[i].dropped = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
_PG_init(void)
|
_PG_init(void)
|
||||||
{
|
{
|
||||||
@@ -44,6 +175,8 @@ _PG_init(void)
|
|||||||
pg_init_libpagestore();
|
pg_init_libpagestore();
|
||||||
pg_init_walproposer();
|
pg_init_walproposer();
|
||||||
|
|
||||||
|
InitLogicalReplicationMonitor();
|
||||||
|
|
||||||
InitControlPlaneConnector();
|
InitControlPlaneConnector();
|
||||||
|
|
||||||
pg_init_extension_server();
|
pg_init_extension_server();
|
||||||
|
|||||||
@@ -3079,14 +3079,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
|||||||
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
|
XLogRecGetBlockTag(record, block_id, &rinfo, &forknum, &blkno);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
|
||||||
* Out of an abundance of caution, we always run redo on shared catalogs,
|
|
||||||
* regardless of whether the block is stored in shared buffers. See also
|
|
||||||
* this function's top comment.
|
|
||||||
*/
|
|
||||||
if (!OidIsValid(NInfoGetDbOid(rinfo)))
|
|
||||||
return false;
|
|
||||||
|
|
||||||
CopyNRelFileInfoToBufTag(tag, rinfo);
|
CopyNRelFileInfoToBufTag(tag, rinfo);
|
||||||
tag.forkNum = forknum;
|
tag.forkNum = forknum;
|
||||||
tag.blockNum = blkno;
|
tag.blockNum = blkno;
|
||||||
@@ -3100,17 +3092,28 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
|||||||
*/
|
*/
|
||||||
LWLockAcquire(partitionLock, LW_SHARED);
|
LWLockAcquire(partitionLock, LW_SHARED);
|
||||||
|
|
||||||
/* Try to find the relevant buffer */
|
/*
|
||||||
buffer = BufTableLookup(&tag, hash);
|
* Out of an abundance of caution, we always run redo on shared catalogs,
|
||||||
|
* regardless of whether the block is stored in shared buffers. See also
|
||||||
no_redo_needed = buffer < 0;
|
* this function's top comment.
|
||||||
|
*/
|
||||||
|
if (!OidIsValid(NInfoGetDbOid(rinfo)))
|
||||||
|
{
|
||||||
|
no_redo_needed = false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Try to find the relevant buffer */
|
||||||
|
buffer = BufTableLookup(&tag, hash);
|
||||||
|
|
||||||
|
no_redo_needed = buffer < 0;
|
||||||
|
}
|
||||||
/* In both cases st lwlsn past this WAL record */
|
/* In both cases st lwlsn past this WAL record */
|
||||||
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
|
SetLastWrittenLSNForBlock(end_recptr, rinfo, forknum, blkno);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* we don't have the buffer in memory, update lwLsn past this record, also
|
* we don't have the buffer in memory, update lwLsn past this record, also
|
||||||
* evict page fro file cache
|
* evict page from file cache
|
||||||
*/
|
*/
|
||||||
if (no_redo_needed)
|
if (no_redo_needed)
|
||||||
lfc_evict(rinfo, forknum, blkno);
|
lfc_evict(rinfo, forknum, blkno);
|
||||||
|
|||||||
@@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk)
|
|||||||
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
|
if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
|
wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s, term=" INT64_FORMAT, sk->host, sk->port, sk->greetResponse.term);
|
||||||
|
|
||||||
/* Protocol is all good, move to voting. */
|
/* Protocol is all good, move to voting. */
|
||||||
sk->state = SS_VOTING;
|
sk->state = SS_VOTING;
|
||||||
@@ -922,6 +922,7 @@ static void
|
|||||||
DetermineEpochStartLsn(WalProposer *wp)
|
DetermineEpochStartLsn(WalProposer *wp)
|
||||||
{
|
{
|
||||||
TermHistory *dth;
|
TermHistory *dth;
|
||||||
|
int n_ready = 0;
|
||||||
|
|
||||||
wp->propEpochStartLsn = InvalidXLogRecPtr;
|
wp->propEpochStartLsn = InvalidXLogRecPtr;
|
||||||
wp->donorEpoch = 0;
|
wp->donorEpoch = 0;
|
||||||
@@ -932,6 +933,8 @@ DetermineEpochStartLsn(WalProposer *wp)
|
|||||||
{
|
{
|
||||||
if (wp->safekeeper[i].state == SS_IDLE)
|
if (wp->safekeeper[i].state == SS_IDLE)
|
||||||
{
|
{
|
||||||
|
n_ready++;
|
||||||
|
|
||||||
if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch ||
|
if (GetEpoch(&wp->safekeeper[i]) > wp->donorEpoch ||
|
||||||
(GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch &&
|
(GetEpoch(&wp->safekeeper[i]) == wp->donorEpoch &&
|
||||||
wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn))
|
wp->safekeeper[i].voteResponse.flushLsn > wp->propEpochStartLsn))
|
||||||
@@ -958,6 +961,16 @@ DetermineEpochStartLsn(WalProposer *wp)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (n_ready < wp->quorum)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* This is a rare case that can be triggered if safekeeper has voted and disconnected.
|
||||||
|
* In this case, its state will not be SS_IDLE and its vote cannot be used, because
|
||||||
|
* we clean up `voteResponse` in `ShutdownConnection`.
|
||||||
|
*/
|
||||||
|
wp_log(FATAL, "missing majority of votes, collected %d, expected %d, got %d", wp->n_votes, wp->quorum, n_ready);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
|
* If propEpochStartLsn is 0, it means flushLsn is 0 everywhere, we are bootstrapping
|
||||||
* and nothing was committed yet. Start streaming then from the basebackup LSN.
|
* and nothing was committed yet. Start streaming then from the basebackup LSN.
|
||||||
|
|||||||
@@ -486,6 +486,8 @@ typedef struct walproposer_api
|
|||||||
*
|
*
|
||||||
* On success, the data is placed in *buf. It is valid until the next call
|
* On success, the data is placed in *buf. It is valid until the next call
|
||||||
* to this function.
|
* to this function.
|
||||||
|
*
|
||||||
|
* Returns PG_ASYNC_READ_FAIL on closed connection.
|
||||||
*/
|
*/
|
||||||
PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
|
PGAsyncReadResult (*conn_async_read) (Safekeeper *sk, char **buf, int *amount);
|
||||||
|
|
||||||
@@ -532,6 +534,13 @@ typedef struct walproposer_api
|
|||||||
* Returns 0 if timeout is reached, 1 if some event happened. Updates
|
* Returns 0 if timeout is reached, 1 if some event happened. Updates
|
||||||
* events mask to indicate events and sets sk to the safekeeper which has
|
* events mask to indicate events and sets sk to the safekeeper which has
|
||||||
* an event.
|
* an event.
|
||||||
|
*
|
||||||
|
* On timeout, events is set to WL_NO_EVENTS. On socket event, events is
|
||||||
|
* set to WL_SOCKET_READABLE and/or WL_SOCKET_WRITEABLE. When socket is
|
||||||
|
* closed, events is set to WL_SOCKET_READABLE.
|
||||||
|
*
|
||||||
|
* WL_SOCKET_WRITEABLE is usually set only when we need to flush the buffer.
|
||||||
|
* It can be returned only if caller asked for this event in the last *_event_set call.
|
||||||
*/
|
*/
|
||||||
int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
|
int (*wait_event_set) (WalProposer *wp, long timeout, Safekeeper **sk, uint32 *events);
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ chrono.workspace = true
|
|||||||
clap.workspace = true
|
clap.workspace = true
|
||||||
consumption_metrics.workspace = true
|
consumption_metrics.workspace = true
|
||||||
dashmap.workspace = true
|
dashmap.workspace = true
|
||||||
|
env_logger.workspace = true
|
||||||
futures.workspace = true
|
futures.workspace = true
|
||||||
git-version.workspace = true
|
git-version.workspace = true
|
||||||
hashbrown.workspace = true
|
hashbrown.workspace = true
|
||||||
@@ -59,6 +60,8 @@ scopeguard.workspace = true
|
|||||||
serde.workspace = true
|
serde.workspace = true
|
||||||
serde_json.workspace = true
|
serde_json.workspace = true
|
||||||
sha2.workspace = true
|
sha2.workspace = true
|
||||||
|
smol_str.workspace = true
|
||||||
|
smallvec.workspace = true
|
||||||
socket2.workspace = true
|
socket2.workspace = true
|
||||||
sync_wrapper.workspace = true
|
sync_wrapper.workspace = true
|
||||||
task-local-extensions.workspace = true
|
task-local-extensions.workspace = true
|
||||||
@@ -75,6 +78,7 @@ tracing-subscriber.workspace = true
|
|||||||
tracing-utils.workspace = true
|
tracing-utils.workspace = true
|
||||||
tracing.workspace = true
|
tracing.workspace = true
|
||||||
url.workspace = true
|
url.workspace = true
|
||||||
|
urlencoding.workspace = true
|
||||||
utils.workspace = true
|
utils.workspace = true
|
||||||
uuid.workspace = true
|
uuid.workspace = true
|
||||||
webpki-roots.workspace = true
|
webpki-roots.workspace = true
|
||||||
@@ -83,7 +87,6 @@ native-tls.workspace = true
|
|||||||
postgres-native-tls.workspace = true
|
postgres-native-tls.workspace = true
|
||||||
postgres-protocol.workspace = true
|
postgres-protocol.workspace = true
|
||||||
redis.workspace = true
|
redis.workspace = true
|
||||||
smol_str.workspace = true
|
|
||||||
|
|
||||||
workspace_hack.workspace = true
|
workspace_hack.workspace = true
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,8 @@ pub use backend::BackendType;
|
|||||||
|
|
||||||
mod credentials;
|
mod credentials;
|
||||||
pub use credentials::{
|
pub use credentials::{
|
||||||
check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint, IpPattern,
|
check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint,
|
||||||
|
ComputeUserInfoParseError, IpPattern,
|
||||||
};
|
};
|
||||||
|
|
||||||
mod password_hack;
|
mod password_hack;
|
||||||
@@ -14,8 +15,12 @@ use password_hack::PasswordHackPayload;
|
|||||||
|
|
||||||
mod flow;
|
mod flow;
|
||||||
pub use flow::*;
|
pub use flow::*;
|
||||||
|
use tokio::time::error::Elapsed;
|
||||||
|
|
||||||
use crate::{console, error::UserFacingError};
|
use crate::{
|
||||||
|
console,
|
||||||
|
error::{ReportableError, UserFacingError},
|
||||||
|
};
|
||||||
use std::io;
|
use std::io;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
@@ -31,9 +36,6 @@ pub enum AuthErrorImpl {
|
|||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
GetAuthInfo(#[from] console::errors::GetAuthInfoError),
|
GetAuthInfo(#[from] console::errors::GetAuthInfoError),
|
||||||
|
|
||||||
#[error(transparent)]
|
|
||||||
WakeCompute(#[from] console::errors::WakeComputeError),
|
|
||||||
|
|
||||||
/// SASL protocol errors (includes [SCRAM](crate::scram)).
|
/// SASL protocol errors (includes [SCRAM](crate::scram)).
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
Sasl(#[from] crate::sasl::Error),
|
Sasl(#[from] crate::sasl::Error),
|
||||||
@@ -67,6 +69,9 @@ pub enum AuthErrorImpl {
|
|||||||
|
|
||||||
#[error("Too many connections to this endpoint. Please try again later.")]
|
#[error("Too many connections to this endpoint. Please try again later.")]
|
||||||
TooManyConnections,
|
TooManyConnections,
|
||||||
|
|
||||||
|
#[error("Authentication timed out")]
|
||||||
|
UserTimeout(Elapsed),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
@@ -93,6 +98,10 @@ impl AuthError {
|
|||||||
pub fn is_auth_failed(&self) -> bool {
|
pub fn is_auth_failed(&self) -> bool {
|
||||||
matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
|
matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn user_timeout(elapsed: Elapsed) -> Self {
|
||||||
|
AuthErrorImpl::UserTimeout(elapsed).into()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
|
impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
|
||||||
@@ -107,7 +116,6 @@ impl UserFacingError for AuthError {
|
|||||||
match self.0.as_ref() {
|
match self.0.as_ref() {
|
||||||
Link(e) => e.to_string_client(),
|
Link(e) => e.to_string_client(),
|
||||||
GetAuthInfo(e) => e.to_string_client(),
|
GetAuthInfo(e) => e.to_string_client(),
|
||||||
WakeCompute(e) => e.to_string_client(),
|
|
||||||
Sasl(e) => e.to_string_client(),
|
Sasl(e) => e.to_string_client(),
|
||||||
AuthFailed(_) => self.to_string(),
|
AuthFailed(_) => self.to_string(),
|
||||||
BadAuthMethod(_) => self.to_string(),
|
BadAuthMethod(_) => self.to_string(),
|
||||||
@@ -116,6 +124,26 @@ impl UserFacingError for AuthError {
|
|||||||
Io(_) => "Internal error".to_string(),
|
Io(_) => "Internal error".to_string(),
|
||||||
IpAddressNotAllowed => self.to_string(),
|
IpAddressNotAllowed => self.to_string(),
|
||||||
TooManyConnections => self.to_string(),
|
TooManyConnections => self.to_string(),
|
||||||
|
UserTimeout(_) => self.to_string(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ReportableError for AuthError {
|
||||||
|
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||||
|
use AuthErrorImpl::*;
|
||||||
|
match self.0.as_ref() {
|
||||||
|
Link(e) => e.get_error_kind(),
|
||||||
|
GetAuthInfo(e) => e.get_error_kind(),
|
||||||
|
Sasl(e) => e.get_error_kind(),
|
||||||
|
AuthFailed(_) => crate::error::ErrorKind::User,
|
||||||
|
BadAuthMethod(_) => crate::error::ErrorKind::User,
|
||||||
|
MalformedPassword(_) => crate::error::ErrorKind::User,
|
||||||
|
MissingEndpointName => crate::error::ErrorKind::User,
|
||||||
|
Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||||
|
IpAddressNotAllowed => crate::error::ErrorKind::User,
|
||||||
|
TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||||
|
UserTimeout(_) => crate::error::ErrorKind::User,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,9 +10,9 @@ use crate::auth::validate_password_and_exchange;
|
|||||||
use crate::cache::Cached;
|
use crate::cache::Cached;
|
||||||
use crate::console::errors::GetAuthInfoError;
|
use crate::console::errors::GetAuthInfoError;
|
||||||
use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
|
use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
|
||||||
use crate::console::AuthSecret;
|
use crate::console::{AuthSecret, NodeInfo};
|
||||||
use crate::context::RequestMonitoring;
|
use crate::context::RequestMonitoring;
|
||||||
use crate::proxy::wake_compute::wake_compute;
|
use crate::proxy::connect_compute::ComputeConnectBackend;
|
||||||
use crate::proxy::NeonOptions;
|
use crate::proxy::NeonOptions;
|
||||||
use crate::stream::Stream;
|
use crate::stream::Stream;
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -26,7 +26,6 @@ use crate::{
|
|||||||
stream, url,
|
stream, url,
|
||||||
};
|
};
|
||||||
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
|
use crate::{scram, EndpointCacheKey, EndpointId, RoleName};
|
||||||
use futures::TryFutureExt;
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use tokio::io::{AsyncRead, AsyncWrite};
|
use tokio::io::{AsyncRead, AsyncWrite};
|
||||||
use tracing::info;
|
use tracing::info;
|
||||||
@@ -56,11 +55,11 @@ impl<T> std::ops::Deref for MaybeOwned<'_, T> {
|
|||||||
/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
|
/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
|
||||||
/// this helps us provide the credentials only to those auth
|
/// this helps us provide the credentials only to those auth
|
||||||
/// backends which require them for the authentication process.
|
/// backends which require them for the authentication process.
|
||||||
pub enum BackendType<'a, T> {
|
pub enum BackendType<'a, T, D> {
|
||||||
/// Cloud API (V2).
|
/// Cloud API (V2).
|
||||||
Console(MaybeOwned<'a, ConsoleBackend>, T),
|
Console(MaybeOwned<'a, ConsoleBackend>, T),
|
||||||
/// Authentication via a web browser.
|
/// Authentication via a web browser.
|
||||||
Link(MaybeOwned<'a, url::ApiUrl>),
|
Link(MaybeOwned<'a, url::ApiUrl>, D),
|
||||||
}
|
}
|
||||||
|
|
||||||
pub trait TestBackend: Send + Sync + 'static {
|
pub trait TestBackend: Send + Sync + 'static {
|
||||||
@@ -68,9 +67,10 @@ pub trait TestBackend: Send + Sync + 'static {
|
|||||||
fn get_allowed_ips_and_secret(
|
fn get_allowed_ips_and_secret(
|
||||||
&self,
|
&self,
|
||||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
|
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
|
||||||
|
fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for BackendType<'_, ()> {
|
impl std::fmt::Display for BackendType<'_, (), ()> {
|
||||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
use BackendType::*;
|
use BackendType::*;
|
||||||
match self {
|
match self {
|
||||||
@@ -85,51 +85,50 @@ impl std::fmt::Display for BackendType<'_, ()> {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
|
ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
|
||||||
},
|
},
|
||||||
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
|
Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> BackendType<'_, T> {
|
impl<T, D> BackendType<'_, T, D> {
|
||||||
/// Very similar to [`std::option::Option::as_ref`].
|
/// Very similar to [`std::option::Option::as_ref`].
|
||||||
/// This helps us pass structured config to async tasks.
|
/// This helps us pass structured config to async tasks.
|
||||||
pub fn as_ref(&self) -> BackendType<'_, &T> {
|
pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
|
||||||
use BackendType::*;
|
use BackendType::*;
|
||||||
match self {
|
match self {
|
||||||
Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
|
Console(c, x) => Console(MaybeOwned::Borrowed(c), x),
|
||||||
Link(c) => Link(MaybeOwned::Borrowed(c)),
|
Link(c, x) => Link(MaybeOwned::Borrowed(c), x),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, T> BackendType<'a, T> {
|
impl<'a, T, D> BackendType<'a, T, D> {
|
||||||
/// Very similar to [`std::option::Option::map`].
|
/// Very similar to [`std::option::Option::map`].
|
||||||
/// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
|
/// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
|
||||||
/// a function to a contained value.
|
/// a function to a contained value.
|
||||||
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> {
|
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
|
||||||
use BackendType::*;
|
use BackendType::*;
|
||||||
match self {
|
match self {
|
||||||
Console(c, x) => Console(c, f(x)),
|
Console(c, x) => Console(c, f(x)),
|
||||||
Link(c) => Link(c),
|
Link(c, x) => Link(c, x),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
|
||||||
impl<'a, T, E> BackendType<'a, Result<T, E>> {
|
|
||||||
/// Very similar to [`std::option::Option::transpose`].
|
/// Very similar to [`std::option::Option::transpose`].
|
||||||
/// This is most useful for error handling.
|
/// This is most useful for error handling.
|
||||||
pub fn transpose(self) -> Result<BackendType<'a, T>, E> {
|
pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
|
||||||
use BackendType::*;
|
use BackendType::*;
|
||||||
match self {
|
match self {
|
||||||
Console(c, x) => x.map(|x| Console(c, x)),
|
Console(c, x) => x.map(|x| Console(c, x)),
|
||||||
Link(c) => Ok(Link(c)),
|
Link(c, x) => Ok(Link(c, x)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ComputeCredentials<T> {
|
pub struct ComputeCredentials {
|
||||||
pub info: ComputeUserInfo,
|
pub info: ComputeUserInfo,
|
||||||
pub keys: T,
|
pub keys: ComputeCredentialKeys,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
@@ -152,7 +151,6 @@ impl ComputeUserInfo {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub enum ComputeCredentialKeys {
|
pub enum ComputeCredentialKeys {
|
||||||
#[cfg(any(test, feature = "testing"))]
|
|
||||||
Password(Vec<u8>),
|
Password(Vec<u8>),
|
||||||
AuthKeys(AuthKeys),
|
AuthKeys(AuthKeys),
|
||||||
}
|
}
|
||||||
@@ -187,19 +185,21 @@ async fn auth_quirks(
|
|||||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||||
allow_cleartext: bool,
|
allow_cleartext: bool,
|
||||||
config: &'static AuthenticationConfig,
|
config: &'static AuthenticationConfig,
|
||||||
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
|
) -> auth::Result<ComputeCredentials> {
|
||||||
// If there's no project so far, that entails that client doesn't
|
// If there's no project so far, that entails that client doesn't
|
||||||
// support SNI or other means of passing the endpoint (project) name.
|
// support SNI or other means of passing the endpoint (project) name.
|
||||||
// We now expect to see a very specific payload in the place of password.
|
// We now expect to see a very specific payload in the place of password.
|
||||||
let (info, unauthenticated_password) = match user_info.try_into() {
|
let (info, unauthenticated_password) = match user_info.try_into() {
|
||||||
Err(info) => {
|
Err(info) => {
|
||||||
let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
|
let res = hacks::password_hack_no_authentication(ctx, info, client).await?;
|
||||||
.await?;
|
|
||||||
|
|
||||||
ctx.set_endpoint_id(res.info.endpoint.clone());
|
ctx.set_endpoint_id(res.info.endpoint.clone());
|
||||||
tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
|
tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
|
||||||
|
let password = match res.keys {
|
||||||
(res.info, Some(res.keys))
|
ComputeCredentialKeys::Password(p) => p,
|
||||||
|
_ => unreachable!("password hack should return a password"),
|
||||||
|
};
|
||||||
|
(res.info, Some(password))
|
||||||
}
|
}
|
||||||
Ok(info) => (info, None),
|
Ok(info) => (info, None),
|
||||||
};
|
};
|
||||||
@@ -253,7 +253,7 @@ async fn authenticate_with_secret(
|
|||||||
unauthenticated_password: Option<Vec<u8>>,
|
unauthenticated_password: Option<Vec<u8>>,
|
||||||
allow_cleartext: bool,
|
allow_cleartext: bool,
|
||||||
config: &'static AuthenticationConfig,
|
config: &'static AuthenticationConfig,
|
||||||
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
|
) -> auth::Result<ComputeCredentials> {
|
||||||
if let Some(password) = unauthenticated_password {
|
if let Some(password) = unauthenticated_password {
|
||||||
let auth_outcome = validate_password_and_exchange(&password, secret)?;
|
let auth_outcome = validate_password_and_exchange(&password, secret)?;
|
||||||
let keys = match auth_outcome {
|
let keys = match auth_outcome {
|
||||||
@@ -275,21 +275,22 @@ async fn authenticate_with_secret(
|
|||||||
// Perform cleartext auth if we're allowed to do that.
|
// Perform cleartext auth if we're allowed to do that.
|
||||||
// Currently, we use it for websocket connections (latency).
|
// Currently, we use it for websocket connections (latency).
|
||||||
if allow_cleartext {
|
if allow_cleartext {
|
||||||
return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await;
|
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||||
|
return hacks::authenticate_cleartext(ctx, info, client, secret).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finally, proceed with the main auth flow (SCRAM-based).
|
// Finally, proceed with the main auth flow (SCRAM-based).
|
||||||
classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
|
classic::authenticate(ctx, info, client, config, secret).await
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||||
/// Get compute endpoint name from the credentials.
|
/// Get compute endpoint name from the credentials.
|
||||||
pub fn get_endpoint(&self) -> Option<EndpointId> {
|
pub fn get_endpoint(&self) -> Option<EndpointId> {
|
||||||
use BackendType::*;
|
use BackendType::*;
|
||||||
|
|
||||||
match self {
|
match self {
|
||||||
Console(_, user_info) => user_info.endpoint_id.clone(),
|
Console(_, user_info) => user_info.endpoint_id.clone(),
|
||||||
Link(_) => Some("link".into()),
|
Link(_, _) => Some("link".into()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -299,7 +300,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
|||||||
|
|
||||||
match self {
|
match self {
|
||||||
Console(_, user_info) => &user_info.user,
|
Console(_, user_info) => &user_info.user,
|
||||||
Link(_) => "link",
|
Link(_, _) => "link",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -311,7 +312,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
|||||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||||
allow_cleartext: bool,
|
allow_cleartext: bool,
|
||||||
config: &'static AuthenticationConfig,
|
config: &'static AuthenticationConfig,
|
||||||
) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> {
|
) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
|
||||||
use BackendType::*;
|
use BackendType::*;
|
||||||
|
|
||||||
let res = match self {
|
let res = match self {
|
||||||
@@ -322,33 +323,17 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
|||||||
"performing authentication using the console"
|
"performing authentication using the console"
|
||||||
);
|
);
|
||||||
|
|
||||||
let compute_credentials =
|
let credentials =
|
||||||
auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
|
auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
|
||||||
|
BackendType::Console(api, credentials)
|
||||||
let mut num_retries = 0;
|
|
||||||
let mut node =
|
|
||||||
wake_compute(&mut num_retries, ctx, &api, &compute_credentials.info).await?;
|
|
||||||
|
|
||||||
ctx.set_project(node.aux.clone());
|
|
||||||
|
|
||||||
match compute_credentials.keys {
|
|
||||||
#[cfg(any(test, feature = "testing"))]
|
|
||||||
ComputeCredentialKeys::Password(password) => node.config.password(password),
|
|
||||||
ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys),
|
|
||||||
};
|
|
||||||
|
|
||||||
(node, BackendType::Console(api, compute_credentials.info))
|
|
||||||
}
|
}
|
||||||
// NOTE: this auth backend doesn't use client credentials.
|
// NOTE: this auth backend doesn't use client credentials.
|
||||||
Link(url) => {
|
Link(url, _) => {
|
||||||
info!("performing link authentication");
|
info!("performing link authentication");
|
||||||
|
|
||||||
let node_info = link::authenticate(ctx, &url, client).await?;
|
let info = link::authenticate(ctx, &url, client).await?;
|
||||||
|
|
||||||
(
|
BackendType::Link(url, info)
|
||||||
CachedNodeInfo::new_uncached(node_info),
|
|
||||||
BackendType::Link(url),
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -357,7 +342,18 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BackendType<'_, ComputeUserInfo> {
|
impl BackendType<'_, ComputeUserInfo, &()> {
|
||||||
|
pub async fn get_role_secret(
|
||||||
|
&self,
|
||||||
|
ctx: &mut RequestMonitoring,
|
||||||
|
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||||
|
use BackendType::*;
|
||||||
|
match self {
|
||||||
|
Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
|
||||||
|
Link(_, _) => Ok(Cached::new_uncached(None)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn get_allowed_ips_and_secret(
|
pub async fn get_allowed_ips_and_secret(
|
||||||
&self,
|
&self,
|
||||||
ctx: &mut RequestMonitoring,
|
ctx: &mut RequestMonitoring,
|
||||||
@@ -365,21 +361,51 @@ impl BackendType<'_, ComputeUserInfo> {
|
|||||||
use BackendType::*;
|
use BackendType::*;
|
||||||
match self {
|
match self {
|
||||||
Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||||
Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
/// When applicable, wake the compute node, gaining its connection info in the process.
|
|
||||||
/// The link auth flow doesn't support this, so we return [`None`] in that case.
|
#[async_trait::async_trait]
|
||||||
pub async fn wake_compute(
|
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||||
&self,
|
async fn wake_compute(
|
||||||
ctx: &mut RequestMonitoring,
|
&self,
|
||||||
) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
|
ctx: &mut RequestMonitoring,
|
||||||
use BackendType::*;
|
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||||
|
use BackendType::*;
|
||||||
match self {
|
|
||||||
Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await,
|
match self {
|
||||||
Link(_) => Ok(None),
|
Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||||
|
Link(_, info) => Ok(Cached::new_uncached(info.clone())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
|
||||||
|
match self {
|
||||||
|
BackendType::Console(_, creds) => Some(&creds.keys),
|
||||||
|
BackendType::Link(_, _) => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait::async_trait]
|
||||||
|
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||||
|
async fn wake_compute(
|
||||||
|
&self,
|
||||||
|
ctx: &mut RequestMonitoring,
|
||||||
|
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||||
|
use BackendType::*;
|
||||||
|
|
||||||
|
match self {
|
||||||
|
Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||||
|
Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_keys(&self) -> Option<&ComputeCredentialKeys> {
|
||||||
|
match self {
|
||||||
|
BackendType::Console(_, creds) => Some(&creds.keys),
|
||||||
|
BackendType::Link(_, _) => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use crate::{
|
|||||||
compute,
|
compute,
|
||||||
config::AuthenticationConfig,
|
config::AuthenticationConfig,
|
||||||
console::AuthSecret,
|
console::AuthSecret,
|
||||||
metrics::LatencyTimer,
|
context::RequestMonitoring,
|
||||||
sasl,
|
sasl,
|
||||||
stream::{PqStream, Stream},
|
stream::{PqStream, Stream},
|
||||||
};
|
};
|
||||||
@@ -12,12 +12,12 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
|||||||
use tracing::{info, warn};
|
use tracing::{info, warn};
|
||||||
|
|
||||||
pub(super) async fn authenticate(
|
pub(super) async fn authenticate(
|
||||||
|
ctx: &mut RequestMonitoring,
|
||||||
creds: ComputeUserInfo,
|
creds: ComputeUserInfo,
|
||||||
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||||
config: &'static AuthenticationConfig,
|
config: &'static AuthenticationConfig,
|
||||||
latency_timer: &mut LatencyTimer,
|
|
||||||
secret: AuthSecret,
|
secret: AuthSecret,
|
||||||
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
|
) -> auth::Result<ComputeCredentials> {
|
||||||
let flow = AuthFlow::new(client);
|
let flow = AuthFlow::new(client);
|
||||||
let scram_keys = match secret {
|
let scram_keys = match secret {
|
||||||
#[cfg(any(test, feature = "testing"))]
|
#[cfg(any(test, feature = "testing"))]
|
||||||
@@ -27,13 +27,11 @@ pub(super) async fn authenticate(
|
|||||||
}
|
}
|
||||||
AuthSecret::Scram(secret) => {
|
AuthSecret::Scram(secret) => {
|
||||||
info!("auth endpoint chooses SCRAM");
|
info!("auth endpoint chooses SCRAM");
|
||||||
let scram = auth::Scram(&secret);
|
let scram = auth::Scram(&secret, &mut *ctx);
|
||||||
|
|
||||||
let auth_outcome = tokio::time::timeout(
|
let auth_outcome = tokio::time::timeout(
|
||||||
config.scram_protocol_timeout,
|
config.scram_protocol_timeout,
|
||||||
async {
|
async {
|
||||||
// pause the timer while we communicate with the client
|
|
||||||
let _paused = latency_timer.pause();
|
|
||||||
|
|
||||||
flow.begin(scram).await.map_err(|error| {
|
flow.begin(scram).await.map_err(|error| {
|
||||||
warn!(?error, "error sending scram acknowledgement");
|
warn!(?error, "error sending scram acknowledgement");
|
||||||
@@ -45,9 +43,9 @@ pub(super) async fn authenticate(
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.map_err(|error| {
|
.map_err(|e| {
|
||||||
warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
|
warn!("error processing scram messages error = authentication timed out, execution time exeeded {} seconds", config.scram_protocol_timeout.as_secs());
|
||||||
auth::io::Error::new(auth::io::ErrorKind::TimedOut, error)
|
auth::AuthError::user_timeout(e)
|
||||||
})??;
|
})??;
|
||||||
|
|
||||||
let client_key = match auth_outcome {
|
let client_key = match auth_outcome {
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use super::{
|
|||||||
use crate::{
|
use crate::{
|
||||||
auth::{self, AuthFlow},
|
auth::{self, AuthFlow},
|
||||||
console::AuthSecret,
|
console::AuthSecret,
|
||||||
metrics::LatencyTimer,
|
context::RequestMonitoring,
|
||||||
sasl,
|
sasl,
|
||||||
stream::{self, Stream},
|
stream::{self, Stream},
|
||||||
};
|
};
|
||||||
@@ -16,15 +16,16 @@ use tracing::{info, warn};
|
|||||||
/// These properties are benefical for serverless JS workers, so we
|
/// These properties are benefical for serverless JS workers, so we
|
||||||
/// use this mechanism for websocket connections.
|
/// use this mechanism for websocket connections.
|
||||||
pub async fn authenticate_cleartext(
|
pub async fn authenticate_cleartext(
|
||||||
|
ctx: &mut RequestMonitoring,
|
||||||
info: ComputeUserInfo,
|
info: ComputeUserInfo,
|
||||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||||
latency_timer: &mut LatencyTimer,
|
|
||||||
secret: AuthSecret,
|
secret: AuthSecret,
|
||||||
) -> auth::Result<ComputeCredentials<ComputeCredentialKeys>> {
|
) -> auth::Result<ComputeCredentials> {
|
||||||
warn!("cleartext auth flow override is enabled, proceeding");
|
warn!("cleartext auth flow override is enabled, proceeding");
|
||||||
|
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||||
|
|
||||||
// pause the timer while we communicate with the client
|
// pause the timer while we communicate with the client
|
||||||
let _paused = latency_timer.pause();
|
let _paused = ctx.latency_timer.pause();
|
||||||
|
|
||||||
let auth_outcome = AuthFlow::new(client)
|
let auth_outcome = AuthFlow::new(client)
|
||||||
.begin(auth::CleartextPassword(secret))
|
.begin(auth::CleartextPassword(secret))
|
||||||
@@ -47,14 +48,15 @@ pub async fn authenticate_cleartext(
|
|||||||
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
|
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
|
||||||
/// and passwords are not yet validated (we don't know how to validate them!)
|
/// and passwords are not yet validated (we don't know how to validate them!)
|
||||||
pub async fn password_hack_no_authentication(
|
pub async fn password_hack_no_authentication(
|
||||||
|
ctx: &mut RequestMonitoring,
|
||||||
info: ComputeUserInfoNoEndpoint,
|
info: ComputeUserInfoNoEndpoint,
|
||||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||||
latency_timer: &mut LatencyTimer,
|
) -> auth::Result<ComputeCredentials> {
|
||||||
) -> auth::Result<ComputeCredentials<Vec<u8>>> {
|
|
||||||
warn!("project not specified, resorting to the password hack auth flow");
|
warn!("project not specified, resorting to the password hack auth flow");
|
||||||
|
ctx.set_auth_method(crate::context::AuthMethod::Cleartext);
|
||||||
|
|
||||||
// pause the timer while we communicate with the client
|
// pause the timer while we communicate with the client
|
||||||
let _paused = latency_timer.pause();
|
let _paused = ctx.latency_timer.pause();
|
||||||
|
|
||||||
let payload = AuthFlow::new(client)
|
let payload = AuthFlow::new(client)
|
||||||
.begin(auth::PasswordHack)
|
.begin(auth::PasswordHack)
|
||||||
@@ -71,6 +73,6 @@ pub async fn password_hack_no_authentication(
|
|||||||
options: info.options,
|
options: info.options,
|
||||||
endpoint: payload.endpoint,
|
endpoint: payload.endpoint,
|
||||||
},
|
},
|
||||||
keys: payload.password,
|
keys: ComputeCredentialKeys::Password(payload.password),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ use crate::{
|
|||||||
auth, compute,
|
auth, compute,
|
||||||
console::{self, provider::NodeInfo},
|
console::{self, provider::NodeInfo},
|
||||||
context::RequestMonitoring,
|
context::RequestMonitoring,
|
||||||
error::UserFacingError,
|
error::{ReportableError, UserFacingError},
|
||||||
stream::PqStream,
|
stream::PqStream,
|
||||||
waiters,
|
waiters,
|
||||||
};
|
};
|
||||||
@@ -14,10 +14,6 @@ use tracing::{info, info_span};
|
|||||||
|
|
||||||
#[derive(Debug, Error)]
|
#[derive(Debug, Error)]
|
||||||
pub enum LinkAuthError {
|
pub enum LinkAuthError {
|
||||||
/// Authentication error reported by the console.
|
|
||||||
#[error("Authentication failed: {0}")]
|
|
||||||
AuthFailed(String),
|
|
||||||
|
|
||||||
#[error(transparent)]
|
#[error(transparent)]
|
||||||
WaiterRegister(#[from] waiters::RegisterError),
|
WaiterRegister(#[from] waiters::RegisterError),
|
||||||
|
|
||||||
@@ -30,10 +26,16 @@ pub enum LinkAuthError {
|
|||||||
|
|
||||||
impl UserFacingError for LinkAuthError {
|
impl UserFacingError for LinkAuthError {
|
||||||
fn to_string_client(&self) -> String {
|
fn to_string_client(&self) -> String {
|
||||||
use LinkAuthError::*;
|
"Internal error".to_string()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ReportableError for LinkAuthError {
|
||||||
|
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||||
match self {
|
match self {
|
||||||
AuthFailed(_) => self.to_string(),
|
LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service,
|
||||||
_ => "Internal error".to_string(),
|
LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service,
|
||||||
|
LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -59,6 +61,8 @@ pub(super) async fn authenticate(
|
|||||||
link_uri: &reqwest::Url,
|
link_uri: &reqwest::Url,
|
||||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||||
) -> auth::Result<NodeInfo> {
|
) -> auth::Result<NodeInfo> {
|
||||||
|
ctx.set_auth_method(crate::context::AuthMethod::Web);
|
||||||
|
|
||||||
// registering waiter can fail if we get unlucky with rng.
|
// registering waiter can fail if we get unlucky with rng.
|
||||||
// just try again.
|
// just try again.
|
||||||
let (psql_session_id, waiter) = loop {
|
let (psql_session_id, waiter) = loop {
|
||||||
|
|||||||
@@ -1,8 +1,12 @@
|
|||||||
//! User credentials used in authentication.
|
//! User credentials used in authentication.
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
|
auth::password_hack::parse_endpoint_param,
|
||||||
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
|
context::RequestMonitoring,
|
||||||
|
error::{ReportableError, UserFacingError},
|
||||||
|
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI,
|
||||||
|
proxy::NeonOptions,
|
||||||
|
serverless::SERVERLESS_DRIVER_SNI,
|
||||||
EndpointId, RoleName,
|
EndpointId, RoleName,
|
||||||
};
|
};
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
@@ -39,6 +43,12 @@ pub enum ComputeUserInfoParseError {
|
|||||||
|
|
||||||
impl UserFacingError for ComputeUserInfoParseError {}
|
impl UserFacingError for ComputeUserInfoParseError {}
|
||||||
|
|
||||||
|
impl ReportableError for ComputeUserInfoParseError {
|
||||||
|
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||||
|
crate::error::ErrorKind::User
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Various client credentials which we use for authentication.
|
/// Various client credentials which we use for authentication.
|
||||||
/// Note that we don't store any kind of client key or password here.
|
/// Note that we don't store any kind of client key or password here.
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
@@ -89,6 +99,9 @@ impl ComputeUserInfoMaybeEndpoint {
|
|||||||
// record the values if we have them
|
// record the values if we have them
|
||||||
ctx.set_application(params.get("application_name").map(SmolStr::from));
|
ctx.set_application(params.get("application_name").map(SmolStr::from));
|
||||||
ctx.set_user(user.clone());
|
ctx.set_user(user.clone());
|
||||||
|
if let Some(dbname) = params.get("database") {
|
||||||
|
ctx.set_dbname(dbname.into());
|
||||||
|
}
|
||||||
|
|
||||||
// Project name might be passed via PG's command-line options.
|
// Project name might be passed via PG's command-line options.
|
||||||
let endpoint_option = params
|
let endpoint_option = params
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user