mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 01:50:38 +00:00
Compare commits
1 Commits
jcsp/issue
...
diko/metri
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
45a44c9826 |
12
.github/scripts/generate_image_maps.py
vendored
12
.github/scripts/generate_image_maps.py
vendored
@@ -39,18 +39,12 @@ registries = {
|
||||
],
|
||||
}
|
||||
|
||||
release_branches = ["release", "release-proxy", "release-compute"]
|
||||
|
||||
outputs: dict[str, dict[str, list[str]]] = {}
|
||||
|
||||
target_tags = (
|
||||
[target_tag, "latest"]
|
||||
if branch == "main"
|
||||
else [target_tag, "released"]
|
||||
if branch in release_branches
|
||||
else [target_tag]
|
||||
target_tags = [target_tag, "latest"] if branch == "main" else [target_tag]
|
||||
target_stages = (
|
||||
["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"]
|
||||
)
|
||||
target_stages = ["dev", "prod"] if branch in release_branches else ["dev"]
|
||||
|
||||
for component_name, component_images in components.items():
|
||||
for stage in target_stages:
|
||||
|
||||
31
.github/scripts/push_with_image_map.py
vendored
31
.github/scripts/push_with_image_map.py
vendored
@@ -11,27 +11,12 @@ try:
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError("Failed to parse IMAGE_MAP as JSON") from e
|
||||
|
||||
failures = []
|
||||
for source, targets in parsed_image_map.items():
|
||||
for target in targets:
|
||||
cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
|
||||
print(f"Running: {' '.join(cmd)}")
|
||||
result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||
|
||||
pending = [(source, target) for source, targets in parsed_image_map.items() for target in targets]
|
||||
|
||||
while len(pending) > 0:
|
||||
if len(failures) > 10:
|
||||
print("Error: more than 10 failures!")
|
||||
for failure in failures:
|
||||
print(f'"{failure[0]}" failed with the following output:')
|
||||
print(failure[1])
|
||||
raise RuntimeError("Retry limit reached.")
|
||||
|
||||
source, target = pending.pop(0)
|
||||
cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
|
||||
print(f"Running: {' '.join(cmd)}")
|
||||
result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||
|
||||
if result.returncode != 0:
|
||||
failures.append((" ".join(cmd), result.stdout))
|
||||
pending.append((source, target))
|
||||
|
||||
if len(failures) > 0 and (github_output := os.getenv("GITHUB_OUTPUT")):
|
||||
with open(github_output, "a") as f:
|
||||
f.write("slack_notify=true\n")
|
||||
if result.returncode != 0:
|
||||
print(f"Error: {result.stdout}")
|
||||
raise RuntimeError(f"Command failed: {' '.join(cmd)}")
|
||||
|
||||
@@ -104,18 +104,6 @@ jobs:
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Copy docker images to target registries
|
||||
id: push
|
||||
run: python3 .github/scripts/push_with_image_map.py
|
||||
env:
|
||||
IMAGE_MAP: ${{ inputs.image-map }}
|
||||
|
||||
- name: Notify Slack if container image pushing fails
|
||||
if: steps.push.outputs.slack_notify == 'true' || failure()
|
||||
uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
|
||||
with:
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
|
||||
text: |
|
||||
Pushing container images failed in <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
|
||||
25
.github/workflows/build_and_test.yml
vendored
25
.github/workflows/build_and_test.yml
vendored
@@ -89,8 +89,8 @@ jobs:
|
||||
|
||||
check-codestyle-python:
|
||||
needs: [ meta, check-permissions, build-build-tools-image ]
|
||||
# No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
# No need to run on `main` because we this in the merge queue
|
||||
if: ${{ needs.meta.outputs.run-kind == 'pr' }}
|
||||
uses: ./.github/workflows/_check-codestyle-python.yml
|
||||
with:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
@@ -98,8 +98,7 @@ jobs:
|
||||
|
||||
check-codestyle-jsonnet:
|
||||
needs: [ meta, check-permissions, build-build-tools-image ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }}
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
@@ -182,8 +181,8 @@ jobs:
|
||||
|
||||
check-codestyle-rust:
|
||||
needs: [ meta, check-permissions, build-build-tools-image ]
|
||||
# No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
# No need to run on `main` because we this in the merge queue
|
||||
if: ${{ needs.meta.outputs.run-kind == 'pr' }}
|
||||
uses: ./.github/workflows/_check-codestyle-rust.yml
|
||||
with:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
@@ -192,8 +191,7 @@ jobs:
|
||||
|
||||
check-dependencies-rust:
|
||||
needs: [ meta, files-changed, build-build-tools-image ]
|
||||
# No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr' }}
|
||||
uses: ./.github/workflows/cargo-deny.yml
|
||||
with:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
@@ -201,8 +199,7 @@ jobs:
|
||||
|
||||
build-and-test-locally:
|
||||
needs: [ meta, build-build-tools-image ]
|
||||
# We do need to run this in `.*-rc-pr` because of hotfixes.
|
||||
if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
|
||||
if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -1568,10 +1565,10 @@ jobs:
|
||||
if: |
|
||||
contains(needs.*.result, 'failure')
|
||||
|| contains(needs.*.result, 'cancelled')
|
||||
|| (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|| (needs.build-and-test-locally.result == 'skipped' && contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|| (needs.check-codestyle-python.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|| (needs.check-codestyle-rust.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|| (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr')
|
||||
|| (needs.build-and-test-locally.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
|
||||
|| (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
|
||||
|| (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
|
||||
|| needs.files-changed.result == 'skipped'
|
||||
|| (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|| (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))
|
||||
|
||||
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.86.0
|
||||
ENV RUSTC_VERSION=1.85.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
ARG RUSTFILT_VERSION=0.2.1
|
||||
|
||||
@@ -369,7 +369,7 @@ FROM build-deps AS plv8-src
|
||||
ARG PG_VERSION
|
||||
WORKDIR /ext-src
|
||||
|
||||
COPY compute/patches/plv8* .
|
||||
COPY compute/patches/plv8-3.1.10.patch .
|
||||
|
||||
# plv8 3.2.3 supports v17
|
||||
# last release v3.2.3 - Sep 7, 2024
|
||||
@@ -393,7 +393,7 @@ RUN case "${PG_VERSION:?}" in \
|
||||
git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \
|
||||
tar -czf plv8.tar.gz --exclude .git plv8-src && \
|
||||
cd plv8-src && \
|
||||
if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8_v3.1.10.patch; else patch -p1 < /ext-src/plv8_v3.2.3.patch; fi
|
||||
if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
|
||||
|
||||
# Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use
|
||||
# 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds.
|
||||
@@ -1055,6 +1055,34 @@ RUN if [ -d pg_embedding-src ]; then \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install; \
|
||||
fi
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg_anon-build"
|
||||
# compile anon extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg_anon-src
|
||||
ARG PG_VERSION
|
||||
|
||||
# This is an experimental extension, never got to real production.
|
||||
# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
|
||||
WORKDIR /ext-src
|
||||
RUN case "${PG_VERSION:?}" in "v17") \
|
||||
echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9 pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C .
|
||||
|
||||
FROM pg-build AS pg_anon-build
|
||||
COPY --from=pg_anon-src /ext-src/ /ext-src/
|
||||
WORKDIR /ext-src
|
||||
RUN if [ -d pg_anon-src ]; then \
|
||||
cd pg_anon-src && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \
|
||||
fi
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg build with nonroot user and cargo installed"
|
||||
@@ -1338,8 +1366,8 @@ ARG PG_VERSION
|
||||
# Do not update without approve from proxy team
|
||||
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
|
||||
WORKDIR /ext-src
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
|
||||
@@ -1649,6 +1677,7 @@ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
|
||||
COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
@@ -33,7 +33,6 @@
|
||||
import 'sql_exporter/lfc_hits.libsonnet',
|
||||
import 'sql_exporter/lfc_misses.libsonnet',
|
||||
import 'sql_exporter/lfc_used.libsonnet',
|
||||
import 'sql_exporter/lfc_used_pages.libsonnet',
|
||||
import 'sql_exporter/lfc_writes.libsonnet',
|
||||
import 'sql_exporter/logical_slot_restart_lsn.libsonnet',
|
||||
import 'sql_exporter/max_cluster_size.libsonnet',
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
{
|
||||
metric_name: 'lfc_used_pages',
|
||||
type: 'gauge',
|
||||
help: 'LFC pages used',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'lfc_used_pages',
|
||||
],
|
||||
query: importstr 'sql_exporter/lfc_used_pages.sql',
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages';
|
||||
@@ -1,6 +1,12 @@
|
||||
commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e
|
||||
Author: Alexander Bayandin <alexander@neon.tech>
|
||||
Date: Sat Nov 30 18:29:32 2024 +0000
|
||||
|
||||
Fix v8 9.7.37 compilation on Debian 12
|
||||
|
||||
diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
|
||||
new file mode 100644
|
||||
index 0000000..fae1cb3
|
||||
index 0000000..f0a5dc7
|
||||
--- /dev/null
|
||||
+++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
|
||||
@@ -0,0 +1,30 @@
|
||||
@@ -29,21 +35,8 @@ index 0000000..fae1cb3
|
||||
+@@ -5,6 +5,7 @@
|
||||
+ #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
|
||||
+ #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
|
||||
+
|
||||
+
|
||||
++#include <utility>
|
||||
+ #include <vector>
|
||||
+
|
||||
+
|
||||
+ #include "include/cppgc/prefinalizer.h"
|
||||
diff --git a/plv8.cc b/plv8.cc
|
||||
index c1ce883..6e47e94 100644
|
||||
--- a/plv8.cc
|
||||
+++ b/plv8.cc
|
||||
@@ -379,7 +379,7 @@ _PG_init(void)
|
||||
NULL,
|
||||
&plv8_v8_flags,
|
||||
NULL,
|
||||
- PGC_USERSET, 0,
|
||||
+ PGC_SUSET, 0,
|
||||
#if PG_VERSION_NUM >= 90100
|
||||
NULL,
|
||||
#endif
|
||||
@@ -1,13 +0,0 @@
|
||||
diff --git a/plv8.cc b/plv8.cc
|
||||
index edfa2aa..623e7f2 100644
|
||||
--- a/plv8.cc
|
||||
+++ b/plv8.cc
|
||||
@@ -385,7 +385,7 @@ _PG_init(void)
|
||||
NULL,
|
||||
&plv8_v8_flags,
|
||||
NULL,
|
||||
- PGC_USERSET, 0,
|
||||
+ PGC_SUSET, 0,
|
||||
#if PG_VERSION_NUM >= 90100
|
||||
NULL,
|
||||
#endif
|
||||
@@ -188,7 +188,7 @@ impl ComputeState {
|
||||
|
||||
COMPUTE_CTL_UP.reset();
|
||||
COMPUTE_CTL_UP
|
||||
.with_label_values(&[&BUILD_TAG, status.to_string().as_str()])
|
||||
.with_label_values(&[&BUILD_TAG, format!("{}", status).as_str()])
|
||||
.set(1);
|
||||
}
|
||||
|
||||
@@ -360,14 +360,6 @@ impl ComputeNode {
|
||||
this.prewarm_postgres()?;
|
||||
}
|
||||
|
||||
// Set the up metric with Empty status before starting the HTTP server.
|
||||
// That way on the first metric scrape, an external observer will see us
|
||||
// as 'up' and 'empty' (unless the compute was started with a spec or
|
||||
// already configured by control plane).
|
||||
COMPUTE_CTL_UP
|
||||
.with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()])
|
||||
.set(1);
|
||||
|
||||
// Launch the external HTTP server first, so that we can serve control plane
|
||||
// requests while configuration is still in progress.
|
||||
crate::http::server::Server::External {
|
||||
@@ -377,13 +369,19 @@ impl ComputeNode {
|
||||
}
|
||||
.launch(&this);
|
||||
|
||||
// The internal HTTP server could be launched later, but there isn't much
|
||||
// sense in waiting.
|
||||
// The internal HTTP server is needed for a further activation by control plane
|
||||
// if compute was started for a pool, so we have to start server before hanging
|
||||
// waiting for a spec.
|
||||
crate::http::server::Server::Internal {
|
||||
port: this.params.internal_http_port,
|
||||
}
|
||||
.launch(&this);
|
||||
|
||||
// HTTP server is running, so we can officially declare compute_ctl as 'up'
|
||||
COMPUTE_CTL_UP
|
||||
.with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()])
|
||||
.set(1);
|
||||
|
||||
// If we got a spec from the CLI already, use that. Otherwise wait for the
|
||||
// control plane to pass it to us with a /configure HTTP request
|
||||
let pspec = if let Some(cli_spec) = cli_spec {
|
||||
|
||||
@@ -59,12 +59,9 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
Box::pin(async move {
|
||||
let request_id = request.extract_parts::<RequestId>().await.unwrap();
|
||||
|
||||
// TODO: Remove this stanza after teaching neon_local and the
|
||||
// regression tests to use a JWT + JWKS.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/11316
|
||||
if cfg!(feature = "testing") {
|
||||
warn!(%request_id, "Skipping compute_ctl authorization check");
|
||||
// TODO: Remove this check after a successful rollout
|
||||
if jwks.keys.is_empty() {
|
||||
warn!(%request_id, "Authorization has not been configured");
|
||||
|
||||
return Ok(request);
|
||||
}
|
||||
@@ -113,6 +110,8 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
|
||||
impl Authorize {
|
||||
/// Verify the token using the JSON Web Key set and return the token data.
|
||||
fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
|
||||
debug_assert!(!jwks.keys.is_empty());
|
||||
|
||||
for jwk in jwks.keys.iter() {
|
||||
let decoding_key = match DecodingKey::from_jwk(jwk) {
|
||||
Ok(key) => key,
|
||||
|
||||
@@ -419,7 +419,7 @@ impl ComputeNode {
|
||||
.iter()
|
||||
.filter_map(|val| val.parse::<usize>().ok())
|
||||
.map(|val| if val > 1 { val - 1 } else { 1 })
|
||||
.next_back()
|
||||
.last()
|
||||
.unwrap_or(3)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,6 @@ in this repository.
|
||||
- [WAL Redo](./pageserver-walredo.md)
|
||||
- [Page cache](./pageserver-pagecache.md)
|
||||
- [Storage](./pageserver-storage.md)
|
||||
- [Compaction](./pageserver-compaction.md)
|
||||
- [Processing a GetPage request](./pageserver-processing-getpage.md)
|
||||
- [Processing WAL](./pageserver-processing-wal.md)
|
||||
|
||||
|
||||
@@ -1,110 +0,0 @@
|
||||
# Pageserver Compaction
|
||||
|
||||
Lifted from <https://www.notion.so/neondatabase/Rough-Notes-on-Compaction-1baf189e004780859e65ef63b85cfa81?pvs=4>.
|
||||
|
||||
Updated 2025-03-26.
|
||||
|
||||
## Pages and WAL
|
||||
|
||||
Postgres stores data in 8 KB pages, identified by a page number.
|
||||
|
||||
The WAL contains a sequence of page writes: either images (complete page contents) or deltas (patches applied to images). Each write is identified by its byte position in the WAL, aka LSN.
|
||||
|
||||
Each page version is thus identified by page@LSN. Postgres may read pages at past LSNs.
|
||||
|
||||
Pageservers ingest WAL by writing WAL records into a key/value store keyed by page@LSN.
|
||||
|
||||
Pageservers materialize pages for Postgres reads by finding the most recent page image and applying all subsequent page deltas, up to the read LSN.
|
||||
|
||||
## Compaction: Why?
|
||||
|
||||
Pageservers store page@LSN keys in a key/value store using a custom variant of an LSM tree. Each timeline on each tenant shard has its own LSM tree.
|
||||
|
||||
When Pageservers write new page@LSN entries, they are appended unordered to an ephemeral layer file. When the ephemeral layer file exceeds `checkpoint_distance` (default 256 MB), the key/value pairs are sorted by key and written out to a layer file (for efficient lookups).
|
||||
|
||||
As WAL writes continue, more layer files accumulate.
|
||||
|
||||
Reads must search through the layer files to find the page’s image and deltas. The more layer files accumulate, the more la yer files reads must search through before they find a page image, aka read amplification.
|
||||
|
||||
Compaction’s job is to:
|
||||
|
||||
- Reduce read amplification by reorganizing and combining layer files.
|
||||
- Remove old garbage from layer files.
|
||||
|
||||
As part of this, it may combine several page deltas into a single page image where possible.
|
||||
|
||||
## Compaction: How?
|
||||
|
||||
Neon uses a non-standard variant of an LSM tree made up of two levels of layer files: L0 and L1.
|
||||
|
||||
Compaction runs in two phases: L0→L1 compaction, and L1 image compaction.
|
||||
|
||||
L0 contains a stack of L0 layers at decreasing LSN ranges. These have been flushed sequentially from ephemeral layers. Each L0 layer covers the entire page space (page 0 to ~infinity) and the LSN range that was ingested into it. L0 layers are therefore particularly bad for read amp, since every read must search all L0 layers below the read LSN. For example:
|
||||
|
||||
```
|
||||
| Page 0-99 @ LSN 0400-04ff |
|
||||
| Page 0-99 @ LSN 0300-03ff |
|
||||
| Page 0-99 @ LSN 0200-02ff |
|
||||
| Page 0-99 @ LSN 0100-01ff |
|
||||
| Page 0-99 @ LSN 0000-00ff |
|
||||
```
|
||||
|
||||
L0→L1 compaction takes the bottom-most chunk of L0 layer files of between `compaction_threshold` (default 10) and `compaction_upper_limit` (default 20) layers. It uses merge-sort to write out sorted L1 delta layers of size `compaction_target_size` (default 128 MB).
|
||||
|
||||
L1 typically consists of a “bed” of image layers with materialized page images at a specific LSN, and then delta layers of various page/LSN ranges above them with page deltas. For example:
|
||||
|
||||
```
|
||||
Delta layers: | 30-84@0310-04ff |
|
||||
Delta layers: | 10-42@0200-02ff | | 65-92@0174-02aa |
|
||||
Image layers: | 0-39@0100 | 40-79@0100 | 80-99@0100 |
|
||||
```
|
||||
|
||||
L1 image compaction scans across the L1 keyspace at some LSN, materializes page images by reading the image and delta layers below the LSN (via vectored reads), and writes out new sorted image layers of roughly size `compaction_target_size` (default 128 MB) at that LSN.
|
||||
|
||||
Layer files below the new image files’ LSN can be garbage collected when they are no longer needed for PITR.
|
||||
|
||||
Even though the old layer files are not immediately garbage collected, the new image layers help with read amp because reads can stop traversing the layer stack as soon as they encounter a page image.
|
||||
|
||||
## Compaction: When?
|
||||
|
||||
Pageservers run a `compaction_loop` background task for each tenant shard. Every `compaction_period` (default 20 seconds) it will wake up and check if any of the shard’s timelines need compaction. Additionally, L0 layer flushes will eagerly wake the compaction loop if the L0 count exceeds `compaction_threshold` (default 10).
|
||||
|
||||
L0 compaction runs if the number of L0 layers exceeds `compaction_threshold` (default 10).
|
||||
|
||||
L1 image compaction runs across sections of the L1 keyspace that have at least `image_creation_threshold` (default 3) delta layers overlapping image layers.
|
||||
|
||||
At most `CONCURRENT_BACKGROUND_TASKS` (default 3 / 4 * CPUs = 6) background tasks can run concurrently on a Pageserver, including compaction. Further compaction tasks must wait.
|
||||
|
||||
Because L0 layers cause the most read amp (they overlap the entire keyspace and only contain page deltas), they are aggressively compacted down:
|
||||
|
||||
- L0 is compacted down across all tenant timelines before L1 compaction is attempted (`compaction_l0_first`).
|
||||
- L0 compaction uses a separate concurrency limit of `CONCURRENT_L0_COMPACTION_TASKS` (default 3 / 4 * CPUs = 6) to avoid waiting for other tasks (`compaction_l0_semaphore`).
|
||||
- If L0 compaction is needed on any tenant timeline, L1 image compaction will yield to start an immediate L0 compaction run (except for compaction run via admin APIs).
|
||||
|
||||
## Backpressure
|
||||
|
||||
With sustained heavy write loads, new L0 layers may be flushed faster than they can be compacted down. This can cause an unbounded buildup of read amplification and compaction debt, which can take hours to resolve even after the writes stop.
|
||||
|
||||
To avoid this and allow compaction to keep up, layer flushes will slow writes down to apply backpressure on the workload:
|
||||
|
||||
- At `l0_flush_delay_threshold` (default 30) L0 layers, layer flushes are delayed by the flush duration, such that they take 2x as long.
|
||||
- At `l0_flush_stall_threshold` (default disabled) L0 layers, layer flushes stall entirely until the L0 count falls back below the threshold. This is currently disabled because we don’t trust L0 compaction to be responsive enough.
|
||||
|
||||
This backpressure is propagated to the compute by waiting for layer flushes when WAL ingestion rolls the ephemeral layer. The compute will significantly slow down WAL writes at:
|
||||
|
||||
- `max_replication_write_lag` (default 500 MB), when Pageserver WAL ingestion lags
|
||||
- `max_replication_flush_lag` (default 10 GB), when Pageserver L0 flushes lag
|
||||
|
||||
Combined, this means that the compute will backpressure when there are 30 L0 layers (30 * 256 MB = 7.7 GB) and the Pageserver WAL ingestion lags the compute by 500 MB, for a total of ~8 GB L0+ephemeral compaction debt on a single shard.
|
||||
|
||||
Since we only delay L0 flushes by 2x when backpressuring, and haven’t enabled stalls, it is still possible for read amp to increase unbounded if compaction is too slow (although we haven’t seen this in practice). But this is considered better than stalling flushes and causing unavailability for as long as it takes L0 compaction to react, since we don’t trust it to be fast enough — at the expense of continually increasing read latency and CPU usage for this tenant. We should either enable stalls when we have enough confidence in L0 compaction, or scale the flush delay by the number of L0 layers to apply increasing backpressure.
|
||||
|
||||
## Circuit Breaker
|
||||
|
||||
Compaction can fail, often repeatedly. This can happen e.g. due to data corruption, faulty hardware, S3 outages, etc.
|
||||
|
||||
If compaction fails, the compaction loop will naïvely try and fail again almost immediately. It may only fail after doing a significant amount of wasted work, while holding onto the background task semaphore.
|
||||
|
||||
To avoid repeatedly doing wasted work and starving out other compaction jobs, each tenant has a compaction circuit breaker. After 5 repeated compaction failures, the circuit breaker trips and disables compaction for the next 24 hours, before resetting the breaker and trying again. This disables compaction across all tenant timelines (faulty or not).
|
||||
|
||||
Disabling compaction for a long time is dangerous, since it can lead to unbounded read amp and compaction debt, and continuous workload backpressure. However, continually failing would not help either. Tripped circuit breakers trigger an alert and must be investigated promptly.
|
||||
@@ -30,6 +30,20 @@ static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
static SERVE_METRICS_COUNT_2: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"libmetrics_metric_handler_requests_2_total",
|
||||
"Number of metric requests made"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
static SERVE_METRICS_COUNT_3: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"libmetrics_metric_handler_requests_total_3",
|
||||
"Number of metric requests made"
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static X_REQUEST_ID_HEADER_STR: &str = "x-request-id";
|
||||
|
||||
@@ -251,6 +265,8 @@ impl std::io::Write for ChannelWriter {
|
||||
|
||||
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
SERVE_METRICS_COUNT_2.inc();
|
||||
SERVE_METRICS_COUNT_3.inc();
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
|
||||
@@ -91,14 +91,14 @@ impl Server {
|
||||
Ok(tls_stream) => tls_stream,
|
||||
Err(err) => {
|
||||
if !suppress_io_error(&err) {
|
||||
info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
|
||||
info!("Failed to accept TLS connection: {err:#}");
|
||||
}
|
||||
return;
|
||||
}
|
||||
};
|
||||
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
|
||||
info!("Failed to serve HTTPS connection: {err:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -106,7 +106,7 @@ impl Server {
|
||||
// Handle HTTP connection.
|
||||
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
|
||||
info!("Failed to serve HTTP connection: {err:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1418,6 +1418,11 @@ pub struct TimelineInfo {
|
||||
pub last_record_lsn: Lsn,
|
||||
pub prev_record_lsn: Option<Lsn>,
|
||||
|
||||
/// Legacy field, retained for one version to enable old storage controller to
|
||||
/// decode (it was a mandatory field).
|
||||
#[serde(default, rename = "latest_gc_cutoff_lsn")]
|
||||
pub _unused: Lsn,
|
||||
|
||||
/// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
|
||||
/// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
|
||||
/// as it is easier to reason about.
|
||||
|
||||
@@ -558,7 +558,7 @@ async fn upload_large_enough_file(
|
||||
) -> usize {
|
||||
let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
|
||||
let body = bytes::Bytes::from(vec![0u8; 1024]);
|
||||
let contents = std::iter::once(header).chain(std::iter::repeat_n(body, 128));
|
||||
let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
|
||||
|
||||
let len = contents.clone().fold(0, |acc, next| acc + next.len());
|
||||
|
||||
|
||||
@@ -86,17 +86,17 @@ impl Client {
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// Send an HTTP request to an arbitrary path with a desired HTTP method and returning a streaming
|
||||
/// Response. This function is suitable for pass-through/proxy use cases where we don't care
|
||||
/// what the response content looks like.
|
||||
/// Get an arbitrary path and returning a streaming Response. This function is suitable
|
||||
/// for pass-through/proxy use cases where we don't care what the response content looks
|
||||
/// like.
|
||||
///
|
||||
/// Use/add one of the properly typed methods below if you know aren't proxying, and
|
||||
/// know what kind of response you expect.
|
||||
pub async fn op_raw(&self, method: Method, path: String) -> Result<reqwest::Response> {
|
||||
pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
|
||||
debug_assert!(path.starts_with('/'));
|
||||
let uri = format!("{}{}", self.mgmt_api_endpoint, path);
|
||||
|
||||
let mut req = self.client.request(method, uri);
|
||||
let mut req = self.client.request(Method::GET, uri);
|
||||
if let Some(value) = &self.authorization_header {
|
||||
req = req.header(reqwest::header::AUTHORIZATION, value);
|
||||
}
|
||||
|
||||
@@ -74,8 +74,8 @@ use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
|
||||
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
|
||||
use crate::tenant::timeline::{
|
||||
CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
|
||||
WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
|
||||
CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout,
|
||||
WaitLsnWaiter, import_pgdata,
|
||||
};
|
||||
use crate::tenant::{
|
||||
GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
|
||||
@@ -445,9 +445,6 @@ async fn build_timeline_info_common(
|
||||
|
||||
let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();
|
||||
|
||||
// Externally, expose the lowest LSN that can be used to create a branch.
|
||||
// Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
|
||||
// actually trimmed data to), which can pass each other when PITR is changed.
|
||||
let min_readable_lsn = std::cmp::max(
|
||||
timeline.get_gc_cutoff_lsn(),
|
||||
*timeline.get_applied_gc_cutoff_lsn(),
|
||||
@@ -464,6 +461,7 @@ async fn build_timeline_info_common(
|
||||
initdb_lsn,
|
||||
last_record_lsn,
|
||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||
_unused: Default::default(), // Unused, for legacy decode only
|
||||
min_readable_lsn,
|
||||
applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
|
||||
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
|
||||
@@ -2337,31 +2335,21 @@ async fn timeline_compact_handler(
|
||||
}
|
||||
|
||||
async fn timeline_mark_invisible_handler(
|
||||
mut request: Request<Body>,
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let compact_request = json_request_maybe::<Option<MarkInvisibleRequest>>(&mut request).await?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
let visibility = match compact_request {
|
||||
Some(req) => match req.is_visible {
|
||||
Some(true) => TimelineVisibilityState::Visible,
|
||||
Some(false) | None => TimelineVisibilityState::Invisible,
|
||||
},
|
||||
None => TimelineVisibilityState::Invisible,
|
||||
};
|
||||
|
||||
async {
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(visibility).map_err(ApiError::InternalServerError)?;
|
||||
timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(TimelineVisibilityState::Invisible).map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_timeline_mark_invisible", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||
@@ -3188,8 +3176,7 @@ async fn list_aux_files(
|
||||
timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
|
||||
);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
|
||||
.with_scope_timeline(&timeline);
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let files = timeline
|
||||
.list_aux_files(body.lsn, &ctx, io_concurrency)
|
||||
.await?;
|
||||
|
||||
@@ -219,7 +219,8 @@ pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
|
||||
pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
|
||||
pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
|
||||
// Bump this number when adding a new pageserver_runtime!
|
||||
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = NonZeroUsize::new(4).unwrap();
|
||||
// SAFETY: it's obviously correct
|
||||
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
@@ -3248,23 +3248,17 @@ impl Tenant {
|
||||
async fn housekeeping(&self) {
|
||||
// Call through to all timelines to freeze ephemeral layers as needed. This usually happens
|
||||
// during ingest, but we don't want idle timelines to hold open layers for too long.
|
||||
//
|
||||
// We don't do this if the tenant can't upload layers (i.e. it's in stale attachment mode).
|
||||
// We don't run compaction in this case either, and don't want to keep flushing tiny L0
|
||||
// layers that won't be compacted down.
|
||||
if self.tenant_conf.load().location.may_upload_layers_hint() {
|
||||
let timelines = self
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter(|tli| tli.is_active())
|
||||
.cloned()
|
||||
.collect_vec();
|
||||
let timelines = self
|
||||
.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.values()
|
||||
.filter(|tli| tli.is_active())
|
||||
.cloned()
|
||||
.collect_vec();
|
||||
|
||||
for timeline in timelines {
|
||||
timeline.maybe_freeze_ephemeral_layer().await;
|
||||
}
|
||||
for timeline in timelines {
|
||||
timeline.maybe_freeze_ephemeral_layer().await;
|
||||
}
|
||||
|
||||
// Shut down walredo if idle.
|
||||
@@ -3689,7 +3683,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
}
|
||||
TenantState::Active => {
|
||||
TenantState::Active { .. } => {
|
||||
return Ok(());
|
||||
}
|
||||
TenantState::Broken { reason, .. } => {
|
||||
|
||||
@@ -53,7 +53,7 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
///
|
||||
/// Complexity: O(log N)
|
||||
fn add_node(&mut self, key: i128) {
|
||||
let value = match self.nodes.range(..=key).next_back() {
|
||||
let value = match self.nodes.range(..=key).last() {
|
||||
Some((_, Some(v))) => Some(v.clone()),
|
||||
Some((_, None)) => None,
|
||||
None => None,
|
||||
|
||||
@@ -58,7 +58,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
|
||||
|
||||
/// For a tenant that appears in TenantsMap, it may either be
|
||||
/// - `Attached`: has a full Tenant object, is elegible to service
|
||||
/// reads and ingest WAL.
|
||||
/// reads and ingest WAL.
|
||||
/// - `Secondary`: is only keeping a local cache warm.
|
||||
///
|
||||
/// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because
|
||||
|
||||
@@ -130,7 +130,7 @@ impl IndexPart {
|
||||
/// Version history
|
||||
/// - 2: added `deleted_at`
|
||||
/// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
|
||||
/// is always generated from the keys of `layer_metadata`)
|
||||
/// is always generated from the keys of `layer_metadata`)
|
||||
/// - 4: timeline_layers is fully removed.
|
||||
/// - 5: lineage was added
|
||||
/// - 6: last_aux_file_policy is added.
|
||||
|
||||
@@ -895,12 +895,6 @@ pub(crate) struct CompactRequest {
|
||||
pub sub_compaction_max_job_size_mb: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, serde::Deserialize)]
|
||||
pub(crate) struct MarkInvisibleRequest {
|
||||
#[serde(default)]
|
||||
pub is_visible: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub(crate) struct CompactOptions {
|
||||
pub flags: EnumSet<CompactFlags>,
|
||||
@@ -2247,7 +2241,7 @@ impl Timeline {
|
||||
.await
|
||||
.expect("holding a reference to self");
|
||||
}
|
||||
TimelineState::Active => {
|
||||
TimelineState::Active { .. } => {
|
||||
return Ok(());
|
||||
}
|
||||
TimelineState::Broken { .. } | TimelineState::Stopping => {
|
||||
|
||||
@@ -2,14 +2,10 @@ use std::collections::HashSet;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use http_utils::error::ApiError;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::DetachBehavior;
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
@@ -26,10 +22,7 @@ use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::Tenant;
|
||||
use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
|
||||
use crate::tenant::storage_layer::layer::local_layer_path;
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
|
||||
ValuesReconstructState,
|
||||
};
|
||||
use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer};
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -177,92 +170,6 @@ impl Attempt {
|
||||
}
|
||||
}
|
||||
|
||||
async fn generate_tombstone_image_layer(
|
||||
detached: &Arc<Timeline>,
|
||||
ancestor: &Arc<Timeline>,
|
||||
ancestor_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<ResidentLayer>, Error> {
|
||||
tracing::info!(
|
||||
"removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
|
||||
);
|
||||
let io_concurrency = IoConcurrency::spawn_from_conf(
|
||||
detached.conf,
|
||||
detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
|
||||
);
|
||||
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
|
||||
// Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
|
||||
// not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute.
|
||||
let key_range = Key::sparse_non_inherited_keyspace();
|
||||
// avoid generating a "future layer" which will then be removed
|
||||
let image_lsn = ancestor_lsn;
|
||||
|
||||
{
|
||||
let layers = detached.layers.read().await;
|
||||
for layer in layers.all_persistent_layers() {
|
||||
if !layer.is_delta
|
||||
&& layer.lsn_range.start == image_lsn
|
||||
&& overlaps_with(&key_range, &layer.key_range)
|
||||
{
|
||||
tracing::warn!(
|
||||
layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files"
|
||||
);
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let data = ancestor
|
||||
.get_vectored_impl(
|
||||
KeySpace::single(key_range.clone()),
|
||||
image_lsn,
|
||||
&mut reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("failed to retrieve aux keys")
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
if !data.is_empty() {
|
||||
// TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated
|
||||
// upon compaction but theoretically possible.
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
detached.conf,
|
||||
detached.timeline_id,
|
||||
detached.tenant_shard_id,
|
||||
&key_range,
|
||||
image_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.context("failed to create image layer writer")
|
||||
.map_err(Error::Prepare)?;
|
||||
for key in data.keys() {
|
||||
image_layer_writer
|
||||
.put_image(*key, Bytes::new(), ctx)
|
||||
.await
|
||||
.context("failed to write key")
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
}
|
||||
let (desc, path) = image_layer_writer
|
||||
.finish(ctx)
|
||||
.await
|
||||
.context("failed to finish image layer writer for removing the metadata keys")
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
let generated = Layer::finish_creating(detached.conf, detached, desc, &path)
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
detached
|
||||
.remote_client
|
||||
.upload_layer_file(&generated, &detached.cancel)
|
||||
.await
|
||||
.map_err(|e| Error::launder(e, Error::Prepare))?;
|
||||
tracing::info!(layer=%generated, "wrote image layer");
|
||||
Ok(Some(generated))
|
||||
} else {
|
||||
tracing::info!("no aux keys found in ancestor");
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`Timeline::prepare_to_detach_from_ancestor`]
|
||||
pub(super) async fn prepare(
|
||||
detached: &Arc<Timeline>,
|
||||
@@ -445,16 +352,10 @@ pub(super) async fn prepare(
|
||||
|
||||
// TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
|
||||
let mut new_layers: Vec<Layer> =
|
||||
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);
|
||||
|
||||
if let Some(tombstone_layer) =
|
||||
generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
|
||||
{
|
||||
new_layers.push(tombstone_layer.into());
|
||||
}
|
||||
Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
|
||||
|
||||
{
|
||||
tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
|
||||
tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
|
||||
|
||||
let mut tasks = tokio::task::JoinSet::new();
|
||||
|
||||
|
||||
@@ -25,8 +25,8 @@ impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
|
||||
/// * `align` must be a power of two,
|
||||
///
|
||||
/// * `capacity`, when rounded up to the nearest multiple of `align`,
|
||||
/// must not overflow isize (i.e., the rounded value must be
|
||||
/// less than or equal to `isize::MAX`).
|
||||
/// must not overflow isize (i.e., the rounded value must be
|
||||
/// less than or equal to `isize::MAX`).
|
||||
pub fn with_capacity(capacity: usize) -> Self {
|
||||
AlignedBufferMut {
|
||||
raw: RawAlignedBuffer::with_capacity(capacity),
|
||||
|
||||
@@ -37,8 +37,8 @@ impl<const A: usize> RawAlignedBuffer<ConstAlign<A>> {
|
||||
/// * `align` must be a power of two,
|
||||
///
|
||||
/// * `capacity`, when rounded up to the nearest multiple of `align`,
|
||||
/// must not overflow isize (i.e., the rounded value must be
|
||||
/// less than or equal to `isize::MAX`).
|
||||
/// must not overflow isize (i.e., the rounded value must be
|
||||
/// less than or equal to `isize::MAX`).
|
||||
pub fn with_capacity(capacity: usize) -> Self {
|
||||
let align = ConstAlign::<A>;
|
||||
let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout");
|
||||
|
||||
@@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
|
||||
use crate::metrics::Metrics;
|
||||
|
||||
pub(crate) const EXT_NAME: &str = "pg_session_jwt";
|
||||
pub(crate) const EXT_VERSION: &str = "0.3.0";
|
||||
pub(crate) const EXT_VERSION: &str = "0.2.0";
|
||||
pub(crate) const EXT_SCHEMA: &str = "auth";
|
||||
|
||||
#[derive(Clone)]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.86.0"
|
||||
channel = "1.85.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -94,10 +94,10 @@ impl WalReceivers {
|
||||
|
||||
/// Get reference to locked slot contents. Slot must exist (registered
|
||||
/// earlier).
|
||||
fn get_slot(
|
||||
self: &Arc<WalReceivers>,
|
||||
fn get_slot<'a>(
|
||||
self: &'a Arc<WalReceivers>,
|
||||
id: WalReceiverId,
|
||||
) -> MappedMutexGuard<'_, WalReceiverState> {
|
||||
) -> MappedMutexGuard<'a, WalReceiverState> {
|
||||
MutexGuard::map(self.mutex.lock(), |locked| {
|
||||
locked.slots[id]
|
||||
.as_mut()
|
||||
|
||||
@@ -699,7 +699,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Take a writing mutual exclusive lock on timeline shared_state.
|
||||
pub async fn write_shared_state(self: &Arc<Self>) -> WriteGuardSharedState<'_> {
|
||||
pub async fn write_shared_state<'a>(self: &'a Arc<Self>) -> WriteGuardSharedState<'a> {
|
||||
WriteGuardSharedState::new(self.clone(), self.mutex.write().await)
|
||||
}
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ fn test_many_tx() -> anyhow::Result<()> {
|
||||
}
|
||||
None
|
||||
})
|
||||
.next_back()
|
||||
.last()
|
||||
.unwrap();
|
||||
|
||||
let initdb_lsn = 21623024;
|
||||
|
||||
@@ -253,7 +253,7 @@ impl HeartBeat<Node, PageserverState> for HeartbeaterTask<Node, PageserverState>
|
||||
PageserverState::WarmingUp { .. } => {
|
||||
warming_up += 1;
|
||||
}
|
||||
PageserverState::Offline => offline += 1,
|
||||
PageserverState::Offline { .. } => offline += 1,
|
||||
PageserverState::Available { .. } => {}
|
||||
}
|
||||
}
|
||||
@@ -391,7 +391,7 @@ impl HeartBeat<Safekeeper, SafekeeperState> for HeartbeaterTask<Safekeeper, Safe
|
||||
let mut offline = 0;
|
||||
for state in new_state.values() {
|
||||
match state {
|
||||
SafekeeperState::Offline => offline += 1,
|
||||
SafekeeperState::Offline { .. } => offline += 1,
|
||||
SafekeeperState::Available { .. } => {}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -639,15 +639,6 @@ async fn handle_tenant_timeline_passthrough(
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
|
||||
};
|
||||
|
||||
let method = match *req.method() {
|
||||
hyper::Method::GET => reqwest::Method::GET,
|
||||
hyper::Method::POST => reqwest::Method::POST,
|
||||
hyper::Method::PUT => reqwest::Method::PUT,
|
||||
hyper::Method::DELETE => reqwest::Method::DELETE,
|
||||
hyper::Method::PATCH => reqwest::Method::PATCH,
|
||||
_ => return Err(ApiError::BadRequest(anyhow::anyhow!("Unsupported method"))),
|
||||
};
|
||||
|
||||
tracing::info!(
|
||||
"Proxying request for tenant {} ({})",
|
||||
tenant_or_shard_id.tenant_id,
|
||||
@@ -695,7 +686,7 @@ async fn handle_tenant_timeline_passthrough(
|
||||
node.base_url(),
|
||||
service.get_config().pageserver_jwt_token.as_deref(),
|
||||
);
|
||||
let resp = client.op_raw(method, path).await.map_err(|e|
|
||||
let resp = client.get_raw(path).await.map_err(|e|
|
||||
// We return 503 here because if we can't successfully send a request to the pageserver,
|
||||
// either we aren't available or the pageserver is unavailable.
|
||||
ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;
|
||||
@@ -1416,12 +1407,6 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
|
||||
)));
|
||||
}
|
||||
|
||||
if id <= 0 {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"id not allowed to be zero or negative: {id}"
|
||||
)));
|
||||
}
|
||||
|
||||
let req = match maybe_forward(req).await {
|
||||
ForwardOutcome::Forwarded(res) => {
|
||||
return res;
|
||||
@@ -2262,17 +2247,6 @@ pub fn make_router(
|
||||
RequestName("v1_tenant_passthrough"),
|
||||
)
|
||||
})
|
||||
// Tenant timeline mark_invisible passthrough to shard zero
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible",
|
||||
|r| {
|
||||
tenant_service_handler(
|
||||
r,
|
||||
handle_tenant_timeline_passthrough,
|
||||
RequestName("v1_tenant_timeline_mark_invisible_passthrough"),
|
||||
)
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -115,17 +115,19 @@ struct Cli {
|
||||
#[arg(long)]
|
||||
split_threshold: Option<u64>,
|
||||
|
||||
/// Maximum number of shards during autosplits. 0 disables autosplits. Defaults
|
||||
/// to 16 as a safety to avoid too many shards by accident.
|
||||
#[arg(long, default_value = "16")]
|
||||
/// Maximum number of shards during autosplits. 0 disables autosplits.
|
||||
// TODO: defaults to 8 for backwards compatibility, should default to 255.
|
||||
#[arg(long, default_value = "8")]
|
||||
max_split_shards: u8,
|
||||
|
||||
/// Size threshold for initial shard splits of unsharded tenants. 0 disables initial splits.
|
||||
#[arg(long)]
|
||||
initial_split_threshold: Option<u64>,
|
||||
// TODO: defaults to 64 GB for backwards compatibility. Should default to None.
|
||||
#[arg(long, default_value = "68719476736")]
|
||||
initial_split_threshold: u64,
|
||||
|
||||
/// Number of target shards for initial splits. 0 or 1 disables initial splits. Defaults to 2.
|
||||
#[arg(long, default_value = "2")]
|
||||
/// Number of target shards for initial splits. 0 or 1 disables initial splits.
|
||||
// TODO: defaults to 8 for backwards compatibility. Should default to 2.
|
||||
#[arg(long, default_value = "8")]
|
||||
initial_split_shards: u8,
|
||||
|
||||
/// Maximum number of normal-priority reconcilers that may run in parallel
|
||||
@@ -283,8 +285,10 @@ impl Secrets {
|
||||
fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
|
||||
if let Some(v) = cli {
|
||||
Some(v.clone())
|
||||
} else if let Ok(v) = std::env::var(env_name) {
|
||||
Some(v)
|
||||
} else {
|
||||
std::env::var(env_name).ok()
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -413,7 +417,7 @@ async fn async_main() -> anyhow::Result<()> {
|
||||
tenant_rate_limit: args.tenant_rate_limit,
|
||||
split_threshold: args.split_threshold,
|
||||
max_split_shards: args.max_split_shards,
|
||||
initial_split_threshold: args.initial_split_threshold,
|
||||
initial_split_threshold: Some(args.initial_split_threshold),
|
||||
initial_split_shards: args.initial_split_shards,
|
||||
neon_local_repo_dir: args.neon_local_repo_dir,
|
||||
max_secondary_lag_bytes: args.max_secondary_lag_bytes,
|
||||
|
||||
@@ -686,8 +686,6 @@ impl Reconciler {
|
||||
.await?,
|
||||
);
|
||||
|
||||
pausable_failpoint!("reconciler-live-migrate-post-generation-inc");
|
||||
|
||||
let dest_conf = build_location_config(
|
||||
&self.shard,
|
||||
&self.config,
|
||||
@@ -762,9 +760,7 @@ impl Reconciler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns true if the observed state of the attached location was refreshed
|
||||
/// and false otherwise.
|
||||
async fn maybe_refresh_observed(&mut self) -> Result<bool, ReconcileError> {
|
||||
async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> {
|
||||
// If the attached node has uncertain state, read it from the pageserver before proceeding: this
|
||||
// is important to avoid spurious generation increments.
|
||||
//
|
||||
@@ -774,7 +770,7 @@ impl Reconciler {
|
||||
|
||||
let Some(attached_node) = self.intent.attached.as_ref() else {
|
||||
// Nothing to do
|
||||
return Ok(false);
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
if matches!(
|
||||
@@ -819,7 +815,7 @@ impl Reconciler {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(true)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reconciling a tenant makes API calls to pageservers until the observed state
|
||||
@@ -835,7 +831,7 @@ impl Reconciler {
|
||||
/// state where it still requires later reconciliation.
|
||||
pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
|
||||
// Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it
|
||||
let refreshed = self.maybe_refresh_observed().await?;
|
||||
self.maybe_refresh_observed().await?;
|
||||
|
||||
// Special case: live migration
|
||||
self.maybe_live_migrate().await?;
|
||||
@@ -859,14 +855,8 @@ impl Reconciler {
|
||||
);
|
||||
match self.observed.locations.get(&node.get_id()) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
|
||||
if refreshed {
|
||||
tracing::info!(
|
||||
node_id=%node.get_id(), "Observed configuration correct after refresh. Notifying compute.");
|
||||
self.compute_notify().await?;
|
||||
} else {
|
||||
// Nothing to do
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.");
|
||||
}
|
||||
// Nothing to do
|
||||
tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
|
||||
}
|
||||
observed => {
|
||||
// In all cases other than a matching observed configuration, we will
|
||||
|
||||
@@ -622,7 +622,7 @@ impl TenantShard {
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
attached_locs.sort_by_key(|i| i.1);
|
||||
if let Some((node_id, _gen)) = attached_locs.into_iter().next_back() {
|
||||
if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
|
||||
self.intent.set_attached(scheduler, Some(*node_id));
|
||||
}
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ enum LargeObjectKind {
|
||||
|
||||
impl LargeObjectKind {
|
||||
fn from_key(key: &str) -> Self {
|
||||
let fname = key.split('/').next_back().unwrap();
|
||||
let fname = key.split('/').last().unwrap();
|
||||
|
||||
let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else {
|
||||
return LargeObjectKind::Other;
|
||||
|
||||
@@ -1344,8 +1344,6 @@ class NeonEnv:
|
||||
and self.storage_controller_config.get("timelines_onto_safekeepers") is True
|
||||
):
|
||||
for sk_id, sk in enumerate(self.safekeepers):
|
||||
# 0 is an invalid safekeeper id
|
||||
sk_id = sk_id + 1
|
||||
body = {
|
||||
"id": sk_id,
|
||||
"created_at": "2023-10-25T09:11:25Z",
|
||||
|
||||
@@ -118,7 +118,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
|
||||
# failing to connect to them.
|
||||
".*Call to node.*management API.*failed.*receive body.*",
|
||||
".*Call to node.*management API.*failed.*ReceiveBody.*",
|
||||
".*Call to node.*management API.*failed.*Timeout.*",
|
||||
".*Failed to update node .+ after heartbeat round.*error sending request for url.*",
|
||||
".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*",
|
||||
# Many tests will start up with a node offline
|
||||
|
||||
@@ -853,25 +853,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
res_json = res.json()
|
||||
return res_json
|
||||
|
||||
def timeline_mark_invisible(
|
||||
self,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
is_visible: bool | None = None,
|
||||
):
|
||||
data = {
|
||||
"is_visible": is_visible,
|
||||
}
|
||||
|
||||
log.info(
|
||||
f"Requesting marking timeline invisible for {is_visible=}, {tenant_id=}, {timeline_id=}"
|
||||
)
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/mark_invisible",
|
||||
json=data,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
|
||||
def timeline_get_timestamp_of_lsn(
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
|
||||
):
|
||||
@@ -1192,28 +1173,3 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
log.info(f"Got perf info response code: {res.status_code}")
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def ingest_aux_files(
|
||||
self,
|
||||
tenant_id: TenantId | TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
aux_files: dict[str, bytes],
|
||||
):
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/ingest_aux_files",
|
||||
json={
|
||||
"aux_files": aux_files,
|
||||
},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
def list_aux_files(
|
||||
self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
|
||||
) -> Any:
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/list_aux_files",
|
||||
json={"lsn": str(lsn)},
|
||||
)
|
||||
self.verbose_error(res)
|
||||
return res.json()
|
||||
|
||||
@@ -43,7 +43,7 @@ def single_timeline(
|
||||
f"template tenant is template_tenant={template_tenant} template_timeline={template_timeline}"
|
||||
)
|
||||
|
||||
log.info("detach template tenant from pageserver")
|
||||
log.info("detach template tenant form pageserver")
|
||||
env.pageserver.tenant_detach(template_tenant)
|
||||
|
||||
log.info(f"duplicating template tenant {ncopies} times in remote storage")
|
||||
@@ -65,13 +65,11 @@ def single_timeline(
|
||||
assert ps_http.tenant_list() == []
|
||||
|
||||
def attach(tenant):
|
||||
# NB: create the new tenant in the storage controller with the correct tenant config. This
|
||||
# will pick up the existing tenant data from remote storage. If we just attach it to the
|
||||
# Pageserver, the storage controller will reset the tenant config to the default.
|
||||
env.create_tenant(
|
||||
tenant_id=tenant,
|
||||
timeline_id=template_timeline,
|
||||
conf=template_config,
|
||||
env.pageserver.tenant_attach(
|
||||
tenant,
|
||||
config=template_config.copy(),
|
||||
generation=100,
|
||||
override_storage_controller_generation=True,
|
||||
)
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
|
||||
|
||||
@@ -15,7 +15,9 @@ from fixtures.neon_fixtures import (
|
||||
)
|
||||
from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci
|
||||
|
||||
from performance.pageserver.util import setup_pageserver_with_tenants
|
||||
from performance.pageserver.util import (
|
||||
setup_pageserver_with_tenants,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Any
|
||||
@@ -124,11 +126,14 @@ def setup_and_run_pagebench_benchmark(
|
||||
for param, (value, kwargs) in params.items():
|
||||
record(param, metric_value=value, report=MetricReport.TEST_PARAM, **kwargs)
|
||||
|
||||
def setup_wrapper(env: NeonEnv):
|
||||
return setup_tenant_template(env, pg_bin, pgbench_scale)
|
||||
|
||||
env = setup_pageserver_with_tenants(
|
||||
neon_env_builder,
|
||||
f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
|
||||
n_tenants,
|
||||
lambda env: setup_tenant_template(env, pg_bin, pgbench_scale),
|
||||
setup_wrapper,
|
||||
# https://github.com/neondatabase/neon/issues/8070
|
||||
timeout_in_seconds=60,
|
||||
)
|
||||
@@ -155,8 +160,14 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int):
|
||||
"gc_period": "0s", # disable periodic gc
|
||||
"checkpoint_timeout": "10 years",
|
||||
"compaction_period": "0s", # disable periodic compaction
|
||||
"compaction_threshold": 10,
|
||||
"compaction_target_size": 134217728,
|
||||
"checkpoint_distance": 268435456,
|
||||
"image_creation_threshold": 3,
|
||||
}
|
||||
template_tenant, template_timeline = env.create_tenant(set_default=True, conf=config)
|
||||
template_tenant, template_timeline = env.create_tenant(set_default=True)
|
||||
env.pageserver.tenant_detach(template_tenant)
|
||||
env.pageserver.tenant_attach(template_tenant, config)
|
||||
ps_http = env.pageserver.http_client()
|
||||
with env.endpoints.create_start("main", tenant_id=template_tenant) as ep:
|
||||
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", "-I", "dtGvp", ep.connstr()])
|
||||
|
||||
@@ -166,7 +166,6 @@ def test_pageserver_compaction_preempt(
|
||||
|
||||
|
||||
@skip_in_debug_build("only run with release build")
|
||||
@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM
|
||||
@pytest.mark.parametrize(
|
||||
"with_branches",
|
||||
["with_branches", "no_branches"],
|
||||
|
||||
@@ -249,7 +249,6 @@ def test_forward_compatibility(
|
||||
top_output_dir: Path,
|
||||
pg_version: PgVersion,
|
||||
compatibility_snapshot_dir: Path,
|
||||
compute_reconfigure_listener: ComputeReconfigure,
|
||||
):
|
||||
"""
|
||||
Test that the old binaries can read new data
|
||||
@@ -258,7 +257,6 @@ def test_forward_compatibility(
|
||||
os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
|
||||
)
|
||||
|
||||
neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api
|
||||
neon_env_builder.test_may_use_compatibility_snapshot_binaries = True
|
||||
|
||||
try:
|
||||
|
||||
@@ -276,33 +276,3 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
|
||||
if i > 1:
|
||||
before_timestamp = tbl[i - step_size][1]
|
||||
assert timestamp >= before_timestamp, "before_timestamp before timestamp"
|
||||
|
||||
|
||||
def test_timestamp_of_lsn_empty_branch(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
Test that getting the timestamp of the head LSN of a newly created branch works.
|
||||
This verifies that we don't get a 404 error when trying to get the timestamp
|
||||
of the head LSN of a branch that was just created.
|
||||
|
||||
Reproducer for https://github.com/neondatabase/neon/issues/11439
|
||||
"""
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# Create a new branch
|
||||
new_timeline_id = env.create_branch("test_timestamp_of_lsn_empty_branch")
|
||||
|
||||
# Retrieve the commit LSN of the empty branch, which we have never run postgres on
|
||||
detail = env.pageserver.http_client().timeline_detail(
|
||||
tenant_id=env.initial_tenant, timeline_id=new_timeline_id
|
||||
)
|
||||
head_lsn = detail["last_record_lsn"]
|
||||
|
||||
# Verify we can get the timestamp for this LSN
|
||||
with env.pageserver.http_client() as client:
|
||||
result = client.timeline_get_timestamp_of_lsn(
|
||||
env.initial_tenant,
|
||||
new_timeline_id,
|
||||
head_lsn,
|
||||
)
|
||||
# If we get here without a 404 error, the test passes
|
||||
assert result is not None
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
import os
|
||||
import ssl
|
||||
import time
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, StorageControllerApiException
|
||||
from fixtures.utils import wait_until
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
def test_pageserver_https_api(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -151,3 +153,29 @@ def test_certificate_rotation(neon_env_builder: NeonEnvBuilder):
|
||||
requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
|
||||
cur_cert = ssl.get_server_certificate(("localhost", port))
|
||||
assert cur_cert == sk_cert
|
||||
|
||||
def test_metrics(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
ps_metrics = env.pageserver.http_client().get_metrics();
|
||||
|
||||
h_count = int(
|
||||
ps_metrics.query_one(
|
||||
"libmetrics_metric_handler_requests_total"
|
||||
).value
|
||||
)
|
||||
log.info(f"h_count: {h_count}")
|
||||
|
||||
h2_count = int(
|
||||
ps_metrics.query_one(
|
||||
"libmetrics_metric_handler_requests_2_total"
|
||||
).value
|
||||
)
|
||||
log.info(f"h2_count: {h2_count}")
|
||||
|
||||
h3_count = int(
|
||||
ps_metrics.query_one(
|
||||
"libmetrics_metric_handler_requests_total_3"
|
||||
).value
|
||||
)
|
||||
log.info(f"h2_count: {h3_count}")
|
||||
|
||||
@@ -4176,121 +4176,3 @@ def test_storage_controller_graceful_migration(neon_env_builder: NeonEnvBuilder,
|
||||
)
|
||||
else:
|
||||
assert initial_ps.http_client().tenant_list_locations()["tenant_shards"] == []
|
||||
|
||||
|
||||
@run_only_on_default_postgres("this is like a 'unit test' against storcon db")
|
||||
def test_storage_controller_migrate_with_pageserver_restart(
|
||||
neon_env_builder: NeonEnvBuilder, make_httpserver
|
||||
):
|
||||
"""
|
||||
Test that live migrations which fail right after incrementing the generation
|
||||
due to the destination going offline eventually send a compute notification
|
||||
after the destination re-attaches.
|
||||
"""
|
||||
neon_env_builder.num_pageservers = 2
|
||||
|
||||
neon_env_builder.storage_controller_config = {
|
||||
# Disable transitions to offline
|
||||
"max_offline": "600s",
|
||||
"use_local_compute_notifications": False,
|
||||
}
|
||||
|
||||
neon_env_builder.control_plane_hooks_api = (
|
||||
f"http://{make_httpserver.host}:{make_httpserver.port}/"
|
||||
)
|
||||
|
||||
notifications = []
|
||||
|
||||
def notify(request: Request):
|
||||
log.info(f"Received notify-attach: {request}")
|
||||
notifications.append(request.json)
|
||||
|
||||
make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler(notify)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
[
|
||||
".*Call to node.*management API failed.*",
|
||||
".*Call to node.*management API still failed.*",
|
||||
".*Reconcile error.*",
|
||||
".*request.*PUT.*migrate.*",
|
||||
]
|
||||
)
|
||||
|
||||
env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
initial_desc = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0]
|
||||
log.info(f"{initial_desc=}")
|
||||
primary = env.get_pageserver(initial_desc["node_attached"])
|
||||
secondary = env.get_pageserver(initial_desc["node_secondary"][0])
|
||||
|
||||
# Pause the migration after incrementing the generation in the database
|
||||
env.storage_controller.configure_failpoints(
|
||||
("reconciler-live-migrate-post-generation-inc", "pause")
|
||||
)
|
||||
|
||||
tenant_shard_id = TenantShardId(env.initial_tenant, 0, 0)
|
||||
|
||||
try:
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
||||
migrate_fut = executor.submit(
|
||||
env.storage_controller.tenant_shard_migrate,
|
||||
tenant_shard_id,
|
||||
secondary.id,
|
||||
config=StorageControllerMigrationConfig(prewarm=False, override_scheduler=True),
|
||||
)
|
||||
|
||||
def has_hit_migration_failpoint():
|
||||
expr = "at failpoint reconciler-live-migrate-post-generation-inc"
|
||||
log.info(expr)
|
||||
assert env.storage_controller.log_contains(expr)
|
||||
|
||||
wait_until(has_hit_migration_failpoint)
|
||||
|
||||
secondary.stop()
|
||||
|
||||
# Eventually migration completes
|
||||
env.storage_controller.configure_failpoints(
|
||||
("reconciler-live-migrate-post-generation-inc", "off")
|
||||
)
|
||||
try:
|
||||
migrate_fut.result()
|
||||
except StorageControllerApiException as err:
|
||||
log.info(f"Migration failed: {err}")
|
||||
except:
|
||||
env.storage_controller.configure_failpoints(
|
||||
("reconciler-live-migrate-post-generation-inc", "off")
|
||||
)
|
||||
raise
|
||||
|
||||
def process_migration_result():
|
||||
dump = env.storage_controller.tenant_shard_dump()
|
||||
observed = dump[0]["observed"]["locations"]
|
||||
|
||||
log.info(f"{observed=} primary={primary.id} secondary={secondary.id}")
|
||||
|
||||
assert observed[str(primary.id)]["conf"]["mode"] == "AttachedStale"
|
||||
assert observed[str(secondary.id)]["conf"] is None
|
||||
|
||||
wait_until(process_migration_result)
|
||||
|
||||
# Start and wait for re-attach to be processed
|
||||
secondary.start()
|
||||
env.storage_controller.poll_node_status(
|
||||
secondary.id,
|
||||
desired_availability=PageserverAvailability.ACTIVE,
|
||||
desired_scheduling_policy=None,
|
||||
max_attempts=10,
|
||||
backoff=1,
|
||||
)
|
||||
|
||||
env.storage_controller.reconcile_until_idle()
|
||||
|
||||
assert notifications[-1] == {
|
||||
"tenant_id": str(env.initial_tenant),
|
||||
"stripe_size": None,
|
||||
"shards": [{"node_id": int(secondary.id), "shard_number": 0}],
|
||||
"preferred_az": DEFAULT_AZ_ID,
|
||||
}
|
||||
|
||||
@@ -782,21 +782,6 @@ def test_lsn_lease_storcon(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
|
||||
|
||||
def test_mark_invisible_storcon(neon_env_builder: NeonEnvBuilder):
|
||||
conf = {
|
||||
"pitr_interval": "0s",
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
}
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=conf)
|
||||
env.storage_controller.pageserver_api().timeline_mark_invisible(
|
||||
env.initial_tenant, env.initial_timeline
|
||||
)
|
||||
env.storage_controller.pageserver_api().timeline_mark_invisible(
|
||||
env.initial_tenant, env.initial_timeline, True
|
||||
)
|
||||
|
||||
|
||||
def insert_with_action(
|
||||
env: NeonEnv,
|
||||
tenant: TenantId,
|
||||
|
||||
@@ -1768,87 +1768,6 @@ def test_pageserver_compaction_detach_ancestor_smoke(neon_env_builder: NeonEnvBu
|
||||
workload_child.validate(env.pageserver.id)
|
||||
|
||||
|
||||
def test_timeline_detach_with_aux_files_with_detach_v1(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
Validate that "branches do not inherit their parent" is invariant over detach_ancestor.
|
||||
|
||||
Branches hide parent branch aux files etc by stopping lookup of non-inherited keyspace at the parent-child boundary.
|
||||
We had a bug where detach_ancestor running on a child branch would copy aux files key range from child to parent,
|
||||
thereby making parent aux files reappear.
|
||||
"""
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
"gc_period": "1s",
|
||||
"lsn_lease_length": "0s",
|
||||
}
|
||||
)
|
||||
|
||||
env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
|
||||
|
||||
http = env.pageserver.http_client()
|
||||
|
||||
endpoint = env.endpoints.create_start("main", tenant_id=env.initial_tenant)
|
||||
lsn0 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
endpoint.safe_psql(
|
||||
"SELECT pg_create_logical_replication_slot('test_slot_parent_1', 'pgoutput')"
|
||||
)
|
||||
lsn1 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
endpoint.safe_psql(
|
||||
"SELECT pg_create_logical_replication_slot('test_slot_parent_2', 'pgoutput')"
|
||||
)
|
||||
lsn2 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn0).keys()) == set(
|
||||
[]
|
||||
)
|
||||
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn1).keys()) == set(
|
||||
["pg_replslot/test_slot_parent_1/state"]
|
||||
)
|
||||
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
|
||||
["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
|
||||
)
|
||||
|
||||
# Restore at LSN1
|
||||
branch_timeline_id = env.create_branch("restore", env.initial_tenant, "main", lsn1)
|
||||
endpoint2 = env.endpoints.create_start("restore", tenant_id=env.initial_tenant)
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
|
||||
|
||||
# Add a new slot file to the restore branch (This won't happen in reality because cplane immediately detaches the branch on restore,
|
||||
# but we want to ensure that aux files on the detached branch are NOT inherited during ancestor detach. We could change the behavior
|
||||
# in the future.
|
||||
# TL;DR we should NEVER automatically detach a branch as a background optimization for those tenants that already used the restore
|
||||
# feature before branch detach was introduced because it will clean up the aux files and stop logical replication.
|
||||
endpoint2.safe_psql(
|
||||
"SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')"
|
||||
)
|
||||
lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
|
||||
["pg_replslot/test_slot_restore/state"]
|
||||
)
|
||||
|
||||
print("lsn0=", lsn0)
|
||||
print("lsn1=", lsn1)
|
||||
print("lsn2=", lsn2)
|
||||
print("lsn3=", lsn3)
|
||||
# Detach the restore branch so that main doesn't have any child branches.
|
||||
all_reparented = http.detach_ancestor(
|
||||
env.initial_tenant, branch_timeline_id, detach_behavior="v1"
|
||||
)
|
||||
assert all_reparented == set([])
|
||||
|
||||
# We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN.
|
||||
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id)
|
||||
assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set(
|
||||
["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"]
|
||||
), "main branch unaffected"
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set(
|
||||
["pg_replslot/test_slot_restore/state"]
|
||||
)
|
||||
assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([])
|
||||
|
||||
|
||||
# TODO:
|
||||
# - branch near existing L1 boundary, image layers?
|
||||
# - investigate: why are layers started at uneven lsn? not just after branching, but in general.
|
||||
|
||||
Reference in New Issue
Block a user