Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-12 15:10:38 +00:00

Compare commits: refactor-c ... asher/sk-a (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 85b3964000 | |
| | 06b97c60f3 | |
| | 744428229b | |

.github/workflows/build_and_test.yml (vendored, 42 changes)
@@ -551,48 +551,6 @@ jobs:
      - name: Cleanup ECR folder
        run: rm -rf ~/.ecr

  neon-image-depot:
    # For testing this will run side-by-side for a few merges.
    # This action is not really optimized yet, but gets the job done
    runs-on: [ self-hosted, gen3, small ]
    needs: [ tag ]
    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
    permissions:
      contents: read
      id-token: write

    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          submodules: true
          fetch-depth: 0

      - name: Setup go
        uses: actions/setup-go@v3
        with:
          go-version: '1.19'

      - name: Set up Depot CLI
        uses: depot/setup-action@v1

      - name: Install Crane & ECR helper
        run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0

      - name: Configure ECR login
        run: |
          mkdir /github/home/.docker/
          echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json

      - name: Build and push
        uses: depot/build-push-action@v1
        with:
          # if no depot.json file is at the root of your repo, you must specify the project id
          project: nrdv0s4kcs
          push: true
          tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}

  compute-tools-image:
    runs-on: [ self-hosted, gen3, large ]
    needs: [ tag ]
6
Cargo.lock
generated
6
Cargo.lock
generated
@@ -851,7 +851,6 @@ dependencies = [
|
||||
"futures",
|
||||
"hyper",
|
||||
"notify",
|
||||
"num_cpus",
|
||||
"opentelemetry",
|
||||
"postgres",
|
||||
"regex",
|
||||
@@ -3324,7 +3323,6 @@ dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 4.1.4",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
@@ -3334,6 +3332,7 @@ dependencies = [
|
||||
"humantime",
|
||||
"hyper",
|
||||
"metrics",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"parking_lot",
|
||||
"postgres",
|
||||
@@ -4534,6 +4533,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"rand",
|
||||
"routerify",
|
||||
"rustls",
|
||||
"sentry",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -4544,6 +4544,7 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"url",
|
||||
@@ -4884,7 +4885,6 @@ dependencies = [
|
||||
"socket2",
|
||||
"syn",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"tonic",
|
||||
"tower",
|
||||
|
||||
@@ -64,7 +64,6 @@ md5 = "0.7.0"
|
||||
memoffset = "0.8"
|
||||
nix = "0.26"
|
||||
notify = "5.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.18.0"
|
||||
|
||||
@@ -39,7 +39,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev

COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --chown=nonroot . .
COPY . .

# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
@@ -225,81 +225,6 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rum-pg-build"
|
||||
# compile rum extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rum-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
||||
mkdir rum-src && cd rum-src && tar xvzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pgtap-pg-build"
|
||||
# compile pgTAP extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pgtap-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \
|
||||
mkdir pgtap-src && cd pgtap-src && tar xvzf ../pgtap.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgtap.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "prefix-pg-build"
|
||||
# compile Prefix extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS prefix-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.9.tar.gz -O prefix.tar.gz && \
|
||||
mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/prefix.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "hll-pg-build"
|
||||
# compile hll extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS hll-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.17.tar.gz -O hll.tar.gz && \
|
||||
mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hll.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "plpgsql-check-pg-build"
|
||||
# compile plpgsql_check extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS plpgsql-check-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz -O plpgsql_check.tar.gz && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
@@ -323,7 +248,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.7.3 cargo-pgx && \
|
||||
cargo install --git https://github.com/vadim2404/pgx --branch neon_abi_v0.6.1 --locked cargo-pgx && \
|
||||
/bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
@@ -337,11 +262,11 @@ USER root
|
||||
|
||||
FROM rust-extensions-build AS pg-jsonschema-pg-build
|
||||
|
||||
# there is no release tag yet, but we need it due to the superuser fix in the control file
|
||||
RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
|
||||
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \
|
||||
cd pg_jsonschema && \
|
||||
cargo pgx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_jsonschema.control && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
#########################################################################################
|
||||
@@ -353,32 +278,13 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421e
|
||||
|
||||
FROM rust-extensions-build AS pg-graphql-pg-build
|
||||
|
||||
# Currently pgx version bump to >= 0.7.2 causes "call to unsafe function" compliation errors in
|
||||
# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
|
||||
# same 1.1 version we've used before.
|
||||
RUN git clone -b remove-pgx-contrib-spiext --single-branch https://github.com/yrashk/pg_graphql && \
|
||||
cd pg_graphql && \
|
||||
sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
|
||||
RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \
|
||||
cd pg_graphql && \
|
||||
cargo pgx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-tiktoken-build"
|
||||
# Compile "pg_tiktoken" extension
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build AS pg-tiktoken-pg-build
|
||||
|
||||
RUN git clone --depth=1 --single-branch https://github.com/kelvich/pg_tiktoken && \
|
||||
cd pg_tiktoken && \
|
||||
cargo pgx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "neon-pg-ext-build"
|
||||
@@ -396,23 +302,13 @@ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rum-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon \
|
||||
-s install && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon_utils \
|
||||
-s install
|
||||
|
||||
#########################################################################################
|
||||
@@ -467,7 +363,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
|
||||
# Install:
|
||||
# libreadline8 for psql
|
||||
# libicu67, locales for collations (including ICU and plpgsql_check)
|
||||
# libicu67, locales for collations (including ICU)
|
||||
# libossp-uuid16 for extension ossp-uuid
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
|
||||
@@ -11,15 +11,22 @@ RUN set -e \
    && touch /etc/inittab

RUN set -e \
    && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart'" >> /etc/inittab
    && echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
    && echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab

# Combine, starting from non-VM compute node image.
FROM $SRC_IMAGE as base

# Temporarily set user back to root so we can run adduser
# Temporarily set user back to root so we can run apt update and adduser
USER root
RUN apt update && \
    apt install --no-install-recommends -y \
        cgroup-tools
RUN adduser vm-informant --disabled-password --no-create-home
USER postgres

ADD vm-cgconfig.conf /etc/cgconfig.conf
COPY --from=informant /etc/inittab /etc/inittab
COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant

ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
Makefile (8 changes)

@@ -133,11 +133,6 @@ neon-pg-ext-%: postgres-%
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
	+@echo "Compiling neon_utils $*"
	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$*
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install

.PHONY: neon-pg-ext-clean-%
neon-pg-ext-clean-%:
@@ -150,9 +145,6 @@ neon-pg-ext-clean-%:
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
		-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean

.PHONY: neon-pg-ext
neon-pg-ext: \
@@ -11,7 +11,6 @@ clap.workspace = true
|
||||
futures.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
notify.workspace = true
|
||||
num_cpus.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
|
||||
@@ -25,7 +25,6 @@ use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::{Serialize, Serializer};
|
||||
use tokio_postgres;
|
||||
use tracing::{info, instrument, warn};
|
||||
|
||||
use crate::checker::create_writability_check_data;
|
||||
@@ -285,7 +284,6 @@ impl ComputeNode {
|
||||
handle_role_deletions(self, &mut client)?;
|
||||
handle_grants(self, &mut client)?;
|
||||
create_writability_check_data(&mut client)?;
|
||||
handle_extensions(&self.spec, &mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
@@ -402,43 +400,4 @@ impl ComputeNode {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Select `pg_stat_statements` data and return it as a stringified JSON
|
||||
pub async fn collect_insights(&self) -> String {
|
||||
let mut result_rows: Vec<String> = Vec::new();
|
||||
let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await;
|
||||
let (client, connection) = connect_result.unwrap();
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
let result = client
|
||||
.simple_query(
|
||||
"SELECT
|
||||
row_to_json(pg_stat_statements)
|
||||
FROM
|
||||
pg_stat_statements
|
||||
WHERE
|
||||
userid != 'cloud_admin'::regrole::oid
|
||||
ORDER BY
|
||||
(mean_exec_time + mean_plan_time) DESC
|
||||
LIMIT 100",
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Ok(raw_rows) = result {
|
||||
for message in raw_rows.iter() {
|
||||
if let postgres::SimpleQueryMessage::Row(row) = message {
|
||||
if let Some(json) = row.get(0) {
|
||||
result_rows.push(json.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
format!("{{\"pg_stat_statements\": [{}]}}", result_rows.join(","))
|
||||
} else {
|
||||
"{{\"pg_stat_statements\": []}}".to_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@ use crate::compute::ComputeNode;
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use num_cpus;
|
||||
use serde_json;
|
||||
use tracing::{error, info};
|
||||
use tracing_utils::http::OtelName;
|
||||
@@ -34,13 +33,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
|
||||
}
|
||||
|
||||
// Collect Postgres current usage insights
|
||||
(&Method::GET, "/insights") => {
|
||||
info!("serving /insights GET request");
|
||||
let insights = compute.collect_insights().await;
|
||||
Response::new(Body::from(insights))
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
info!("serving /check_writability POST request");
|
||||
let res = crate::checker::check_writability(compute).await;
|
||||
@@ -50,17 +42,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::GET, "/info") => {
|
||||
let num_cpus = num_cpus::get_physical();
|
||||
info!("serving /info GET request. num_cpus: {}", num_cpus);
|
||||
Response::new(Body::from(
|
||||
serde_json::json!({
|
||||
"num_cpus": num_cpus,
|
||||
})
|
||||
.to_string(),
|
||||
))
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
|
||||
@@ -10,12 +10,12 @@ paths:
|
||||
/status:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
- "info"
|
||||
summary: Get compute node internal status
|
||||
description: ""
|
||||
operationId: getComputeStatus
|
||||
responses:
|
||||
200:
|
||||
"200":
|
||||
description: ComputeState
|
||||
content:
|
||||
application/json:
|
||||
@@ -25,58 +25,27 @@ paths:
|
||||
/metrics.json:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
- "info"
|
||||
summary: Get compute node startup metrics in JSON format
|
||||
description: ""
|
||||
operationId: getComputeMetricsJSON
|
||||
responses:
|
||||
200:
|
||||
"200":
|
||||
description: ComputeMetrics
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeMetrics"
|
||||
|
||||
/insights:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get current compute insights in JSON format
|
||||
description: |
|
||||
Note, that this doesn't include any historical data
|
||||
operationId: getComputeInsights
|
||||
responses:
|
||||
200:
|
||||
description: Compute insights
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeInsights"
|
||||
|
||||
/info:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get info about the compute Pod/VM
|
||||
description: ""
|
||||
operationId: getInfo
|
||||
responses:
|
||||
"200":
|
||||
description: Info
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Info"
|
||||
|
||||
/check_writability:
|
||||
post:
|
||||
tags:
|
||||
- Check
|
||||
- "check"
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritability
|
||||
responses:
|
||||
200:
|
||||
"200":
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
@@ -111,15 +80,6 @@ components:
|
||||
total_startup_ms:
|
||||
type: integer
|
||||
|
||||
Info:
|
||||
type: object
|
||||
description: Information about VM/Pod
|
||||
required:
|
||||
- num_cpus
|
||||
properties:
|
||||
num_cpus:
|
||||
type: integer
|
||||
|
||||
ComputeState:
|
||||
type: object
|
||||
required:
|
||||
@@ -136,15 +96,6 @@ components:
|
||||
type: string
|
||||
description: Text of the error during compute startup, if any
|
||||
|
||||
ComputeInsights:
|
||||
type: object
|
||||
properties:
|
||||
pg_stat_statements:
|
||||
description: Contains raw output from pg_stat_statements in JSON format
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
|
||||
ComputeStatus:
|
||||
type: string
|
||||
enum:
|
||||
|
||||
@@ -47,23 +47,12 @@ pub struct GenericOption {
/// declare a `trait` on it.
pub type GenericOptions = Option<Vec<GenericOption>>;

/// Escape a string for including it in a SQL literal
fn escape_literal(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
}

/// Escape a string so that it can be used in postgresql.conf.
/// Same as escape_literal, currently.
fn escape_conf_value(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
}

impl GenericOption {
    /// Represent `GenericOption` as SQL statement parameter.
    pub fn to_pg_option(&self) -> String {
        if let Some(val) = &self.value {
            match self.vartype.as_ref() {
                "string" => format!("{} '{}'", self.name, escape_literal(val)),
                "string" => format!("{} '{}'", self.name, val),
                _ => format!("{} {}", self.name, val),
            }
        } else {
@@ -74,8 +63,6 @@ impl GenericOption {
    /// Represent `GenericOption` as configuration option.
    pub fn to_pg_setting(&self) -> String {
        if let Some(val) = &self.value {
            // TODO: check in the console DB that we don't have these settings
            // set for any non-deleted project and drop this override.
            let name = match self.name.as_str() {
                "safekeepers" => "neon.safekeepers",
                "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
@@ -84,7 +71,7 @@ impl GenericOption {
            };

            match self.vartype.as_ref() {
                "string" => format!("{} = '{}'", name, escape_conf_value(val)),
                "string" => format!("{} = '{}'", name, val),
                _ => format!("{} = {}", name, val),
            }
        } else {
@@ -120,7 +107,6 @@ impl PgOptionsSerialize for GenericOptions {
            .map(|op| op.to_pg_setting())
            .collect::<Vec<String>>()
            .join("\n")
            + "\n" // newline after last setting
        } else {
            "".to_string()
        }
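For a concrete sense of the escaping rule implemented by `escape_literal`/`escape_conf_value`, here is a small self-contained sketch; the sample value mirrors the `test.escaping` setting used in the spec test further down in this diff, and the surrounding `main` is purely illustrative.

```
/// Same rule as `escape_conf_value` above: double single quotes, double backslashes.
fn escape_conf_value(s: &str) -> String {
    s.replace('\'', "''").replace('\\', "\\\\")
}

fn main() {
    let raw = r#"here's a backslash \ and a quote ' and a double-quote " hooray"#;
    // Prints:
    // test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hooray'
    println!("test.escaping = '{}'", escape_conf_value(raw));
}
```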
@@ -515,18 +515,3 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {

    Ok(())
}

/// Create required system extensions
#[instrument(skip_all)]
pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
        if libs.contains("pg_stat_statements") {
            // Create extension only if this compute really needs it
            let query = "CREATE EXTENSION IF NOT EXISTS pg_stat_statements";
            info!("creating system extensions with query: {}", query);
            client.simple_query(query)?;
        }
    }

    Ok(())
}
@@ -178,11 +178,6 @@
|
||||
"name": "neon.pageserver_connstring",
|
||||
"value": "host=127.0.0.1 port=6400",
|
||||
"vartype": "string"
|
||||
},
|
||||
{
|
||||
"name": "test.escaping",
|
||||
"value": "here's a backslash \\ and a quote ' and a double-quote \" hooray",
|
||||
"vartype": "string"
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@@ -28,30 +28,7 @@ mod pg_helpers_tests {
|
||||
|
||||
assert_eq!(
|
||||
spec.cluster.settings.as_pg_settings(),
|
||||
r#"fsync = off
|
||||
wal_level = replica
|
||||
hot_standby = on
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
wal_log_hints = on
|
||||
log_connections = on
|
||||
shared_buffers = 32768
|
||||
port = 55432
|
||||
max_connections = 100
|
||||
max_wal_senders = 10
|
||||
listen_addresses = '0.0.0.0'
|
||||
wal_sender_timeout = 0
|
||||
password_encryption = md5
|
||||
maintenance_work_mem = 65536
|
||||
max_parallel_workers = 8
|
||||
max_worker_processes = 8
|
||||
neon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'
|
||||
max_replication_slots = 10
|
||||
neon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'
|
||||
shared_preload_libraries = 'neon'
|
||||
synchronous_standby_names = 'walproposer'
|
||||
neon.pageserver_connstring = 'host=127.0.0.1 port=6400'
|
||||
test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hooray'
|
||||
"#
|
||||
"fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -29,41 +29,6 @@ These components should not have access to the private key and may only get toke
The key pair is generated once for an installation of compute/pageserver/safekeeper, e.g. by `neon_local init`.
There is currently no way to rotate the key without bringing down all components.

### Token format

The JWT tokens in Neon use RSA as the algorithm. Example:

Header:

```
{
  "alg": "RS512", # RS256, RS384, or RS512
  "typ": "JWT"
}
```

Payload:

```
{
  "scope": "tenant", # "tenant", "pageserverapi", or "safekeeperdata"
  "tenant_id": "5204921ff44f09de8094a1390a6a50f6",
}
```

Meanings of scope:

"tenant": Provides access to all data for a specific tenant

"pageserverapi": Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
Should only be used e.g. for status check/tenant creation/list.

"safekeeperdata": Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
Should only be used e.g. for status check.
Currently also used for connection from any pageserver to any safekeeper.
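To make the format above concrete, here is a minimal sketch of signing and validating such a token with the `jsonwebtoken` crate, which the Rust auth code later in this diff uses. The PEM file names and the string-typed `tenant_id` are illustrative assumptions, not the repository's actual paths or types; the hunks below show both a single-algorithm (RS256) and a multi-algorithm (RS256/RS384/RS512) validation setup, and this sketch uses RS256 only.

```
use jsonwebtoken::{decode, encode, Algorithm, DecodingKey, EncodingKey, Header, Validation};
use serde::{Deserialize, Serialize};

// Payload shape from the docs above; the real code uses a TenantId type,
// a plain String is used here to keep the sketch self-contained.
#[derive(Debug, Serialize, Deserialize)]
struct Claims {
    scope: String, // "tenant", "pageserverapi", or "safekeeperdata"
    tenant_id: Option<String>,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical PEM files; `neon_local init` generates an RSA key pair like this.
    let priv_pem = std::fs::read("auth_private_key.pem")?;
    let pub_pem = std::fs::read("auth_public_key.pem")?;

    let claims = Claims {
        scope: "tenant".to_string(),
        tenant_id: Some("5204921ff44f09de8094a1390a6a50f6".to_string()),
    };

    // Sign with one of the RSA algorithms shown above.
    let token = encode(
        &Header::new(Algorithm::RS256),
        &claims,
        &EncodingKey::from_rsa_pem(&priv_pem)?,
    )?;

    // Validate without requiring an `exp` claim, matching the Validation setup in this diff.
    let mut validation = Validation::new(Algorithm::RS256);
    validation.required_spec_claims = Default::default();
    let data = decode::<Claims>(&token, &DecodingKey::from_rsa_pem(&pub_pem)?, &validation)?;
    println!("scope={} tenant_id={:?}", data.claims.scope, data.claims.tenant_id);

    Ok(())
}
```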
### CLI

CLI generates a key pair during call to `neon_local init` with the following commands:
@@ -13,8 +13,9 @@ use std::sync::Arc;
|
||||
use std::task::{ready, Poll};
|
||||
use std::{fmt, io};
|
||||
use std::{future::Future, str::FromStr};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio::io::{AsyncRead, AsyncWrite, ReadHalf, WriteHalf};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
|
||||
use tracing::{debug, error, info, trace};
|
||||
|
||||
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
|
||||
@@ -194,7 +195,7 @@ impl fmt::Display for AuthType {
|
||||
/// (Arc and kinda mutex inside polling) for all uses (e.g. pageserver).
|
||||
enum MaybeWriteOnly {
|
||||
Full(Framed<MaybeTlsStream>),
|
||||
WriteOnly(FramedWriter<MaybeTlsStream>),
|
||||
WriteOnly(FramedWriter<WriteHalf<MaybeTlsStream>>),
|
||||
Broken, // temporary value palmed off during the split
|
||||
}
|
||||
|
||||
@@ -776,7 +777,7 @@ impl PostgresBackend {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresBackendReader(FramedReader<MaybeTlsStream>);
|
||||
pub struct PostgresBackendReader(FramedReader<ReadHalf<MaybeTlsStream>>);
|
||||
|
||||
impl PostgresBackendReader {
|
||||
/// Read full message or return None if connection is cleanly closed with no
|
||||
|
||||
@@ -118,7 +118,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
    /// Split into owned read and write parts. Beware of potential issues with
    /// using halves in different tasks on TLS stream:
    /// https://github.com/tokio-rs/tls/issues/40
    pub fn split(self) -> (FramedReader<S>, FramedWriter<S>) {
    pub fn split(self) -> (FramedReader<ReadHalf<S>>, FramedWriter<WriteHalf<S>>) {
        let (read_half, write_half) = tokio::io::split(self.stream);
        let reader = FramedReader {
            stream: read_half,
@@ -132,7 +132,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {
    }

    /// Join read and write parts back.
    pub fn unsplit(reader: FramedReader<S>, writer: FramedWriter<S>) -> Self {
    pub fn unsplit(reader: FramedReader<ReadHalf<S>>, writer: FramedWriter<WriteHalf<S>>) -> Self {
        Self {
            stream: reader.stream.unsplit(writer.stream),
            read_buf: reader.read_buf,
@@ -143,7 +143,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Framed<S> {

/// Read-only version of `Framed`.
pub struct FramedReader<S> {
    stream: ReadHalf<S>,
    stream: S,
    read_buf: BytesMut,
}

@@ -155,10 +155,17 @@ impl<S: AsyncRead + Unpin> FramedReader<S> {

/// Write-only version of `Framed`.
pub struct FramedWriter<S> {
    stream: WriteHalf<S>,
    stream: S,
    write_buf: BytesMut,
}

impl<S> FramedWriter<S> {
    /// Get a mut reference to the underlying stream.
    pub fn get_mut(&mut self) -> &mut S {
        &mut self.stream
    }
}

impl<S: AsyncWrite + Unpin> FramedWriter<S> {
    /// Write next message to the output buffer; doesn't flush.
    pub fn write_message_noflush(&mut self, msg: &BeMessage<'_>) -> Result<(), ProtocolError> {
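The hunk above moves the `ReadHalf`/`WriteHalf` wrapping out of `FramedReader`/`FramedWriter` and into the signatures of `split`/`unsplit`. For readers unfamiliar with tokio's owned split, here is a small self-contained sketch of `tokio::io::split` and `unsplit` on an in-memory duplex stream; the duplex stream and buffer sizes are illustrative only, not code from the repository.

```
use tokio::io::{duplex, split, AsyncReadExt, AsyncWriteExt};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // An in-memory duplex pipe stands in for the TCP/TLS stream.
    let (client, mut server) = duplex(1024);

    // Owned read and write halves, as Framed::split now returns.
    let (mut rd, mut wr) = split(client);

    wr.write_all(b"ping").await?;
    let mut buf = [0u8; 4];
    server.read_exact(&mut buf).await?;
    server.write_all(b"pong").await?;

    let mut reply = [0u8; 4];
    rd.read_exact(&mut reply).await?;
    assert_eq!(&reply, b"pong");

    // The halves can be joined back into the original stream, as Framed::unsplit does.
    let _stream = rd.unsplit(wr);
    Ok(())
}
```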
@@ -818,7 +818,7 @@ impl<'a> BeMessage<'a> {
|
||||
BeMessage::ErrorResponse(error_msg, pg_error_code) => {
|
||||
// 'E' signalizes ErrorResponse messages
|
||||
buf.put_u8(b'E');
|
||||
write_body(buf, |buf| {
|
||||
write_body(buf, |buf| -> Result<(), ProtocolError> {
|
||||
buf.put_u8(b'S'); // severity
|
||||
buf.put_slice(b"ERROR\0");
|
||||
|
||||
@@ -843,7 +843,7 @@ impl<'a> BeMessage<'a> {
|
||||
|
||||
// 'N' signalizes NoticeResponse messages
|
||||
buf.put_u8(b'N');
|
||||
write_body(buf, |buf| {
|
||||
write_body(buf, |buf| -> Result<(), ProtocolError> {
|
||||
buf.put_u8(b'S'); // severity
|
||||
buf.put_slice(b"NOTICE\0");
|
||||
|
||||
@@ -898,7 +898,7 @@ impl<'a> BeMessage<'a> {
|
||||
|
||||
BeMessage::RowDescription(rows) => {
|
||||
buf.put_u8(b'T');
|
||||
write_body(buf, |buf| {
|
||||
write_body(buf, |buf| -> Result<(), ProtocolError> {
|
||||
buf.put_i16(rows.len() as i16); // # of fields
|
||||
for row in rows.iter() {
|
||||
write_cstr(row.name, buf)?;
|
||||
|
||||
@@ -24,9 +24,11 @@ serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
||||
rand.workspace = true
|
||||
rustls.workspace = true
|
||||
serde_with.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
@@ -9,28 +9,16 @@ use std::path::Path;
|
||||
|
||||
use anyhow::Result;
|
||||
use jsonwebtoken::{
|
||||
decode, encode, Algorithm, Algorithm::*, DecodingKey, EncodingKey, Header, TokenData,
|
||||
Validation,
|
||||
decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use crate::id::TenantId;
|
||||
|
||||
/// Algorithms accepted during validation.
|
||||
///
|
||||
/// Accept all RSA-based algorithms. We pass this list to jsonwebtoken::decode,
|
||||
/// which checks that the algorithm in the token is one of these.
|
||||
///
|
||||
/// XXX: It also fails the validation if there are any algorithms in this list that belong
|
||||
/// to different family than the token's algorithm. In other words, we can *not* list any
|
||||
/// non-RSA algorithms here, or the validation always fails with InvalidAlgorithm error.
|
||||
const ACCEPTED_ALGORITHMS: &[Algorithm] = &[RS256, RS384, RS512];
|
||||
const JWT_ALGORITHM: Algorithm = Algorithm::RS256;
|
||||
|
||||
/// Algorithm to use when generating a new token in [`encode_from_key_file`]
|
||||
const ENCODE_ALGORITHM: Algorithm = Algorithm::RS256;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum Scope {
|
||||
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
|
||||
@@ -45,9 +33,8 @@ pub enum Scope {
|
||||
SafekeeperData,
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct Claims {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
@@ -68,8 +55,7 @@ pub struct JwtAuth {
|
||||
|
||||
impl JwtAuth {
|
||||
pub fn new(decoding_key: DecodingKey) -> Self {
|
||||
let mut validation = Validation::default();
|
||||
validation.algorithms = ACCEPTED_ALGORITHMS.into();
|
||||
let mut validation = Validation::new(JWT_ALGORITHM);
|
||||
// The default 'required_spec_claims' is 'exp'. But we don't want to require
|
||||
// expiration.
|
||||
validation.required_spec_claims = [].into();
|
||||
@@ -100,113 +86,5 @@ impl std::fmt::Debug for JwtAuth {
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
|
||||
let key = EncodingKey::from_rsa_pem(key_data)?;
|
||||
Ok(encode(&Header::new(ENCODE_ALGORITHM), claims, &key)?)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::str::FromStr;
|
||||
|
||||
// generated with:
|
||||
//
|
||||
// openssl genpkey -algorithm rsa -out storage-auth-priv.pem
|
||||
// openssl pkey -in storage-auth-priv.pem -pubout -out storage-auth-pub.pem
|
||||
const TEST_PUB_KEY_RSA: &[u8] = br#"
|
||||
-----BEGIN PUBLIC KEY-----
|
||||
MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAy6OZ+/kQXcueVJA/KTzO
|
||||
v4ljxylc/Kcb0sXWuXg1GB8k3nDA1gK66LFYToH0aTnqrnqG32Vu6wrhwuvqsZA7
|
||||
jQvP0ZePAbWhpEqho7EpNunDPcxZ/XDy5TQlB1P58F9I3lkJXDC+DsHYLuuzwhAv
|
||||
vo2MtWRdYlVHblCVLyZtANHhUMp2HUhgjHnJh5UrLIKOl4doCBxkM3rK0wjKsNCt
|
||||
M92PCR6S9rvYzldfeAYFNppBkEQrXt2CgUqZ4KaS4LXtjTRUJxljijA4HWffhxsr
|
||||
euRu3ufq8kVqie7fum0rdZZSkONmce0V0LesQ4aE2jB+2Sn48h6jb4dLXGWdq8TV
|
||||
wQIDAQAB
|
||||
-----END PUBLIC KEY-----
|
||||
"#;
|
||||
const TEST_PRIV_KEY_RSA: &[u8] = br#"
|
||||
-----BEGIN PRIVATE KEY-----
|
||||
MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDLo5n7+RBdy55U
|
||||
kD8pPM6/iWPHKVz8pxvSxda5eDUYHyTecMDWArrosVhOgfRpOequeobfZW7rCuHC
|
||||
6+qxkDuNC8/Rl48BtaGkSqGjsSk26cM9zFn9cPLlNCUHU/nwX0jeWQlcML4Owdgu
|
||||
67PCEC++jYy1ZF1iVUduUJUvJm0A0eFQynYdSGCMecmHlSssgo6Xh2gIHGQzesrT
|
||||
CMqw0K0z3Y8JHpL2u9jOV194BgU2mkGQRCte3YKBSpngppLgte2NNFQnGWOKMDgd
|
||||
Z9+HGyt65G7e5+ryRWqJ7t+6bSt1llKQ42Zx7RXQt6xDhoTaMH7ZKfjyHqNvh0tc
|
||||
ZZ2rxNXBAgMBAAECggEAVz3u4Wlx3o02dsoZlSQs+xf0PEX3RXKeU+1YMbtTG9Nz
|
||||
6yxpIQaoZrpbt76rJE2gwkFR+PEu1NmjoOuLb6j4KlQuI4AHz1auOoGSwFtM6e66
|
||||
K4aZ4x95oEJ3vqz2fkmEIWYJwYpMUmwvnuJx76kZm0xvROMLsu4QHS2+zCVtO5Tr
|
||||
hvS05IMVuZ2TdQBZw0+JaFdwXbgDjQnQGY5n9MoTWSx1a4s/FF4Eby65BbDutcpn
|
||||
Vt3jQAOmO1X2kbPeWSGuPJRzyUs7Kg8qfeglBIR3ppGP3vPYAdWX+ho00bmsVkSp
|
||||
Q8vjul6C3WiM+kjwDxotHSDgbl/xldAl7OqPh0bfAQKBgQDnycXuq14Vg8nZvyn9
|
||||
rTnvucO8RBz5P6G+FZ+44cAS2x79+85onARmMnm+9MKYLSMo8fOvsK034NDI68XM
|
||||
04QQ/vlfouvFklMTGJIurgEImTZbGCmlMYCvFyIxaEWixon8OpeI4rFe4Hmbiijh
|
||||
PxhxWg221AwvBS2sco8J/ylEkQKBgQDg6Rh2QYb/j0Wou1rJPbuy3NhHofd5Rq35
|
||||
4YV3f2lfVYcPrgRhwe3T9SVII7Dx8LfwzsX5TAlf48ESlI3Dzv40uOCDM+xdtBRI
|
||||
r96SfSm+jup6gsXU3AsdNkrRK3HoOG9Z/TkrUp213QAIlVnvIx65l4ckFMlpnPJ0
|
||||
lo1LDXZWMQKBgFArzjZ7N5OhfdO+9zszC3MLgdRAivT7OWqR+CjujIz5FYMr8Xzl
|
||||
WfAvTUTrS9Nu6VZkObFvHrrRG+YjBsuN7YQjbQXTSFGSBwH34bgbn2fl9pMTjHQC
|
||||
50uoaL9GHa/rlBaV/YvvPQJgCi/uXa1rMX0jdNLkDULGO8IF7cu7Yf7BAoGBAIUU
|
||||
J29BkpmAst0GDs/ogTlyR18LTR0rXyHt+UUd1MGeH859TwZw80JpWWf4BmkB4DTS
|
||||
hH3gKePdJY7S65ci0XNsuRupC4DeXuorde0DtkGU2tUmr9wlX0Ynq9lcdYfMbMa4
|
||||
eK1TsxG69JwfkxlWlIWITWRiEFM3lJa7xlrUWmLhAoGAFpKWF/hn4zYg3seU9gai
|
||||
EYHKSbhxA4mRb+F0/9IlCBPMCqFrL5yftUsYIh2XFKn8+QhO97Nmk8wJSK6TzQ5t
|
||||
ZaSRmgySrUUhx4nZ/MgqWCFv8VUbLM5MBzwxPKhXkSTfR4z2vLYLJwVY7Tb4kZtp
|
||||
8ismApXVGHpOCstzikV9W7k=
|
||||
-----END PRIVATE KEY-----
|
||||
"#;
|
||||
|
||||
#[test]
|
||||
fn test_decode() -> Result<(), anyhow::Error> {
|
||||
let expected_claims = Claims {
|
||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
|
||||
scope: Scope::Tenant,
|
||||
};
|
||||
|
||||
// Here are tokens containing the following payload, signed using TEST_PRIV_KEY_RSA
|
||||
// using RS512, RS384 and RS256 algorithms:
|
||||
//
|
||||
// ```
|
||||
// {
|
||||
// "scope": "tenant",
|
||||
// "tenant_id": "3d1f7595b468230304e0b73cecbcb081",
|
||||
// "iss": "neon.controlplane",
|
||||
// "exp": 1709200879,
|
||||
// "iat": 1678442479
|
||||
// }
|
||||
// ```
|
||||
//
|
||||
// These were encoded with the online debugger at https://jwt.io
|
||||
//
|
||||
let encoded_rs512 = "eyJhbGciOiJSUzUxMiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.QmqfteDQmDGoxQ5EFkasbt35Lx0W0Nh63muQnYZvFq93DSh4ZbOG9Mc4yaiXZoiS5HgeKtFKv3mbWkDqjz3En06aY17hWwguBtAsGASX48lYeCPADYGlGAuaWnOnVRwe3iiOC7tvPFvwX_45S84X73sNUXyUiXv6nLdcDqVXudtNrGST_DnZDnjuUJX11w7sebtKqQQ8l9-iGHiXOl5yevpMCoB1OcTWcT6DfDtffoNuMHDC3fyhmEGG5oKAt1qBybqAIiyC9-UBAowRZXhdfxrzUl-I9jzKWvk85c5ulhVRwbPeP6TTTlPKwFzBNHg1i2U-1GONew5osQ3aoptwsA";
|
||||
|
||||
let encoded_rs384 = "eyJhbGciOiJSUzM4NCIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.qqk4nkxKzOJP38c_g57_w_SfdQVmCsDT_bsLmdFj_N6LIB22gr6U6_P_5mvk3pIAsp0VCTDwPrCU908TxqjibEkwvQoJwbogHamSGHpD7eJBxGblSnA-Nr3MlEMxpFtec8QokSm6C5mH7DoBYjB2xzeOlxAmpR2GAzInKiMkU4kZ_OcqqrmVcMXY_6VnbxZWMekuw56zE1-PP_qNF1HvYOH-P08ONP8qdo5UPtBG7QBEFlCqZXJZCFihQaI4Vzil9rDuZGCm3I7xQJ8-yh1PX3BTbGo8EzqLdRyBeTpr08UTuRbp_MJDWevHpP3afvJetAItqZXIoZQrbJjcByHqKw";
|
||||
|
||||
let encoded_rs256 = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.dF2N9KXG8ftFKHYbd5jQtXMQqv0Ej8FISGp1b_dmqOCotXj5S1y2AWjwyB_EXHM77JXfbEoJPAPrFFBNfd8cWtkCSTvpxWoHaecGzegDFGv5ZSc5AECFV1Daahc3PI3jii9wEiGkFOiwiBNfZ5INomOAsV--XXxlqIwKbTcgSYI7lrOTfecXAbAHiMKQlQYiIBSGnytRCgafhRkyGzPAL8ismthFJ9RHfeejyskht-9GbVHURw02bUyijuHEulpf9eEY3ZiB28de6jnCdU7ftIYaUMaYWt0nZQGkzxKPSfSLZNy14DTOYLDS04DVstWQPqnCUW_ojg0wJETOOfo9Zw";
|
||||
|
||||
// Check that RS512, RS384 and RS256 tokens can all be validated
|
||||
let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?);
|
||||
|
||||
for encoded in [encoded_rs512, encoded_rs384, encoded_rs256] {
|
||||
let claims_from_token = auth.decode(encoded)?.claims;
|
||||
assert_eq!(claims_from_token, expected_claims);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode() -> Result<(), anyhow::Error> {
|
||||
let claims = Claims {
|
||||
tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
|
||||
scope: Scope::Tenant,
|
||||
};
|
||||
|
||||
let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_RSA)?;
|
||||
|
||||
// decode it back
|
||||
let auth = JwtAuth::new(DecodingKey::from_rsa_pem(TEST_PUB_KEY_RSA)?);
|
||||
let decoded = auth.decode(&encoded)?;
|
||||
|
||||
assert_eq!(decoded.claims, claims);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Ok(encode(&Header::new(JWT_ALGORITHM), claims, &key)?)
|
||||
}
|
||||
|
||||
@@ -3,14 +3,14 @@ use crate::http::error;
|
||||
use anyhow::{anyhow, Context};
|
||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::Method;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||
use hyper::{Method, StatusCode};
|
||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
use tracing;
|
||||
|
||||
use std::future::Future;
|
||||
use std::net::TcpListener;
|
||||
@@ -32,77 +32,31 @@ static X_REQUEST_ID_HEADER: HeaderName = HeaderName::from_static(X_REQUEST_ID_HE
|
||||
#[derive(Debug, Default, Clone)]
|
||||
struct RequestId(String);
|
||||
|
||||
/// Adds a tracing info_span! instrumentation around the handler events,
|
||||
/// logs the request start and end events for non-GET requests and non-200 responses.
|
||||
///
|
||||
/// Use this to distinguish between logs of different HTTP requests: every request handler wrapped
|
||||
/// in this type will get request info logged in the wrapping span, including the unique request ID.
|
||||
///
|
||||
/// There could be other ways to implement similar functionality:
|
||||
///
|
||||
/// * procmacros placed on top of all handler methods
|
||||
/// With all the drawbacks of procmacros, brings no difference implementation-wise,
|
||||
/// and little code reduction compared to the existing approach.
|
||||
///
|
||||
/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
|
||||
/// implemented for [`RouterBuilder`].
|
||||
/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
|
||||
///
|
||||
/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
|
||||
/// later, in a post-response middleware.
|
||||
/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
|
||||
/// tries to achive with its `.instrument` used in the current approach.
|
||||
///
|
||||
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
|
||||
pub struct RequestSpan<E, R, H>(pub H)
|
||||
where
|
||||
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
|
||||
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
|
||||
H: Fn(Request<Body>) -> R + Send + Sync + 'static;
|
||||
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
|
||||
let request_id = info.context::<RequestId>().unwrap_or_default().0;
|
||||
|
||||
impl<E, R, H> RequestSpan<E, R, H>
|
||||
where
|
||||
E: Into<Box<dyn std::error::Error + Send + Sync>> + 'static,
|
||||
R: Future<Output = Result<Response<Body>, E>> + Send + 'static,
|
||||
H: Fn(Request<Body>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
/// Creates a tracing span around inner request handler and executes the request handler in the contex of that span.
|
||||
/// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled.
|
||||
pub async fn handle(self, request: Request<Body>) -> Result<Response<Body>, E> {
|
||||
let request_id = request.context::<RequestId>().unwrap_or_default().0;
|
||||
let method = request.method();
|
||||
let path = request.uri().path();
|
||||
let request_span = info_span!("request", %method, %path, %request_id);
|
||||
// cannot factor out the Level to avoid the repetition
|
||||
// because tracing can only work with const Level
|
||||
// which is not the case here
|
||||
|
||||
let log_quietly = method == Method::GET;
|
||||
async move {
|
||||
if log_quietly {
|
||||
debug!("Handling request");
|
||||
} else {
|
||||
info!("Handling request");
|
||||
}
|
||||
|
||||
// Note that we reuse `error::handler` here and not returning and error at all,
|
||||
// yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation.
|
||||
// Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call.
|
||||
//
|
||||
// Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally.
|
||||
match (self.0)(request).await {
|
||||
Ok(response) => {
|
||||
let response_status = response.status();
|
||||
if log_quietly && response_status.is_success() {
|
||||
debug!("Request handled, status: {response_status}");
|
||||
} else {
|
||||
info!("Request handled, status: {response_status}");
|
||||
}
|
||||
Ok(response)
|
||||
}
|
||||
Err(e) => Ok(error::handler(e.into()).await),
|
||||
}
|
||||
}
|
||||
.instrument(request_span)
|
||||
.await
|
||||
if info.method() == Method::GET && res.status() == StatusCode::OK {
|
||||
tracing::debug!(
|
||||
"{} {} {} {}",
|
||||
info.method(),
|
||||
info.uri().path(),
|
||||
request_id,
|
||||
res.status()
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
"{} {} {} {}",
|
||||
info.method(),
|
||||
info.uri().path(),
|
||||
request_id,
|
||||
res.status()
|
||||
);
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -142,6 +96,12 @@ pub fn add_request_id_middleware<B: hyper::body::HttpBody + Send + Sync + 'stati
|
||||
request_id.to_string()
|
||||
}
|
||||
};
|
||||
|
||||
if req.method() == Method::GET {
|
||||
tracing::debug!("{} {} {}", req.method(), req.uri().path(), request_id);
|
||||
} else {
|
||||
tracing::info!("{} {} {}", req.method(), req.uri().path(), request_id);
|
||||
}
|
||||
req.set_context(RequestId(request_id));
|
||||
|
||||
Ok(req)
|
||||
@@ -165,12 +125,11 @@ async fn add_request_id_header_to_response(
|
||||
pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
|
||||
Router::builder()
|
||||
.middleware(add_request_id_middleware())
|
||||
.middleware(Middleware::post_with_info(logger))
|
||||
.middleware(Middleware::post_with_info(
|
||||
add_request_id_header_to_response,
|
||||
))
|
||||
.get("/metrics", |r| {
|
||||
RequestSpan(prometheus_metrics_handler).handle(r)
|
||||
})
|
||||
.get("/metrics", prometheus_metrics_handler)
|
||||
.err_handler(error::handler)
|
||||
}
|
||||
|
||||
@@ -180,43 +139,40 @@ pub fn attach_openapi_ui(
|
||||
spec_mount_path: &'static str,
|
||||
ui_mount_path: &'static str,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
router_builder
|
||||
.get(spec_mount_path, move |r| {
|
||||
RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) })
|
||||
.handle(r)
|
||||
})
|
||||
.get(ui_mount_path, move |r| RequestSpan( move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(format!(r#"
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>rweb</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui.css" rel="stylesheet">
|
||||
</head>
|
||||
<body>
|
||||
<div id="swagger-ui"></div>
|
||||
<script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui-bundle.js" charset="UTF-8"> </script>
|
||||
<script>
|
||||
window.onload = function() {{
|
||||
const ui = SwaggerUIBundle({{
|
||||
"dom_id": "\#swagger-ui",
|
||||
presets: [
|
||||
SwaggerUIBundle.presets.apis,
|
||||
SwaggerUIBundle.SwaggerUIStandalonePreset
|
||||
],
|
||||
layout: "BaseLayout",
|
||||
deepLinking: true,
|
||||
showExtensions: true,
|
||||
showCommonExtensions: true,
|
||||
url: "{}",
|
||||
}})
|
||||
window.ui = ui;
|
||||
}};
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
"#, spec_mount_path))).unwrap())
|
||||
}).handle(r))
|
||||
router_builder.get(spec_mount_path, move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(spec)).unwrap())
|
||||
}).get(ui_mount_path, move |_| async move {
|
||||
Ok(Response::builder().body(Body::from(format!(r#"
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>rweb</title>
|
||||
<link href="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui.css" rel="stylesheet">
|
||||
</head>
|
||||
<body>
|
||||
<div id="swagger-ui"></div>
|
||||
<script src="https://cdn.jsdelivr.net/npm/swagger-ui-dist@3/swagger-ui-bundle.js" charset="UTF-8"> </script>
|
||||
<script>
|
||||
window.onload = function() {{
|
||||
const ui = SwaggerUIBundle({{
|
||||
"dom_id": "\#swagger-ui",
|
||||
presets: [
|
||||
SwaggerUIBundle.presets.apis,
|
||||
SwaggerUIBundle.SwaggerUIStandalonePreset
|
||||
],
|
||||
layout: "BaseLayout",
|
||||
deepLinking: true,
|
||||
showExtensions: true,
|
||||
showCommonExtensions: true,
|
||||
url: "{}",
|
||||
}})
|
||||
window.ui = ui;
|
||||
}};
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
"#, spec_mount_path))).unwrap())
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_token(header_value: &str) -> Result<&str, ApiError> {
|
||||
@@ -278,7 +234,7 @@ where
|
||||
async move {
|
||||
let headers = response.headers_mut();
|
||||
if headers.contains_key(&name) {
|
||||
warn!(
|
||||
tracing::warn!(
|
||||
"{} response already contains header {:?}",
|
||||
request_info.uri(),
|
||||
&name,
|
||||
@@ -318,7 +274,7 @@ pub fn serve_thread_main<S>(
|
||||
where
|
||||
S: Future<Output = ()> + Send + Sync,
|
||||
{
|
||||
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
||||
tracing::info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
||||
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::fmt::Display;
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::Buf;
|
||||
use hyper::{header, Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::error::ApiError;
|
||||
|
||||
@@ -33,12 +31,3 @@ pub fn json_response<T: Serialize>(
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
/// Serialize through Display trait.
|
||||
pub fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
F: Display,
|
||||
{
|
||||
s.serialize_str(&format!("{}", z))
|
||||
}
|
||||
|
||||
@@ -280,17 +280,33 @@ fn start_pageserver(
    };
    info!("Using auth: {:#?}", conf.auth_type);

    match var("NEON_AUTH_TOKEN") {
        Ok(v) => {
    // TODO: remove ZENITH_AUTH_TOKEN once it's not used anywhere in development/staging/prod configuration.
    match (var("ZENITH_AUTH_TOKEN"), var("NEON_AUTH_TOKEN")) {
        (old, Ok(v)) => {
            info!("Loaded JWT token for authentication with Safekeeper");
            if let Ok(v_old) = old {
                warn!(
                    "JWT token for Safekeeper is specified twice, ZENITH_AUTH_TOKEN is deprecated"
                );
                if v_old != v {
                    warn!("JWT token for Safekeeper has two different values, choosing NEON_AUTH_TOKEN");
                }
            }
            pageserver::config::SAFEKEEPER_AUTH_TOKEN
                .set(Arc::new(v))
                .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
        }
        Err(VarError::NotPresent) => {
        (Ok(v), _) => {
            info!("Loaded JWT token for authentication with Safekeeper");
            warn!("Please update pageserver configuration: the JWT token should be NEON_AUTH_TOKEN, not ZENITH_AUTH_TOKEN");
            pageserver::config::SAFEKEEPER_AUTH_TOKEN
                .set(Arc::new(v))
                .map_err(|_| anyhow!("Could not initialize SAFEKEEPER_AUTH_TOKEN"))?;
        }
        (_, Err(VarError::NotPresent)) => {
            info!("No JWT token for authentication with Safekeeper detected");
        }
        Err(e) => {
        (_, Err(e)) => {
            return Err(e).with_context(|| {
                "Failed to either load to detect non-present NEON_AUTH_TOKEN environment variable"
            })
@@ -698,12 +698,6 @@ impl PageServerConf {
|
||||
Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
|
||||
}
|
||||
|
||||
if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
|
||||
t_conf.image_creation_threshold = Some(
|
||||
parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(gc_horizon) = item.get("gc_horizon") {
|
||||
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
|
||||
}
|
||||
|
||||
@@ -245,53 +245,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
put:
|
||||
description: Garbage collect given timeline
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/{tenant_id}/attach:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
|
||||
@@ -10,7 +10,6 @@ use remote_storage::GenericRemoteStorage;
|
||||
use tenant_size_model::{SizeResult, StorageModel};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::http::endpoint::RequestSpan;
|
||||
use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
|
||||
|
||||
use super::models::{
|
||||
@@ -972,22 +971,19 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
|
||||
.await
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
timeline
|
||||
.freeze_and_flush()
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact(&ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_download_remote_layers_handler_post(
|
||||
@@ -1092,8 +1088,7 @@ pub fn make_router(
|
||||
let handler = $handler;
|
||||
#[cfg(not(feature = "testing"))]
|
||||
let handler = cfg_disabled;
|
||||
|
||||
move |r| RequestSpan(handler).handle(r)
|
||||
handler
|
||||
}};
|
||||
}
|
||||
|
||||
@@ -1101,55 +1096,35 @@ pub fn make_router(
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_storage).context("Failed to initialize router state")?,
|
||||
))
|
||||
.get("/v1/status", |r| RequestSpan(status_handler).handle(r))
|
||||
.get("/v1/status", status_handler)
|
||||
.put(
|
||||
"/v1/failpoints",
|
||||
testing_api!("manage failpoints", failpoints_handler),
|
||||
)
|
||||
.get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r))
|
||||
.post("/v1/tenant", |r| {
|
||||
RequestSpan(tenant_create_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id", |r| {
|
||||
RequestSpan(tenant_status).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
||||
RequestSpan(tenant_size_handler).handle(r)
|
||||
})
|
||||
.put("/v1/tenant/config", |r| {
|
||||
RequestSpan(update_tenant_config_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
||||
RequestSpan(get_tenant_config_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
RequestSpan(timeline_list_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
RequestSpan(timeline_create_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/attach", |r| {
|
||||
RequestSpan(tenant_attach_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/detach", |r| {
|
||||
RequestSpan(tenant_detach_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/load", |r| {
|
||||
RequestSpan(tenant_load_handler).handle(r)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||
RequestSpan(tenant_ignore_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
RequestSpan(timeline_detail_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.get("/v1/tenant/:tenant_id", tenant_status)
|
||||
.get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler)
|
||||
.put("/v1/tenant/config", update_tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
|
||||
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
|
||||
.post("/v1/tenant/:tenant_id/load", tenant_load_handler)
|
||||
.post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
||||
|r| RequestSpan(get_lsn_by_timestamp_handler).handle(r),
|
||||
get_lsn_by_timestamp_handler,
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
|
||||
timeline_gc_handler,
|
||||
)
|
||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
|
||||
RequestSpan(timeline_gc_handler).handle(r)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
|
||||
testing_api!("run timeline compaction", timeline_compact_handler),
|
||||
@@ -1160,26 +1135,28 @@ pub fn make_router(
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||
|r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r),
|
||||
timeline_download_remote_layers_handler_post,
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
||||
|r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r),
|
||||
timeline_download_remote_layers_handler_get,
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_delete_handler,
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer",
|
||||
layer_map_info_handler,
|
||||
)
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
RequestSpan(timeline_delete_handler).handle(r)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
|
||||
RequestSpan(layer_map_info_handler).handle(r)
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
|r| RequestSpan(layer_download_handler).handle(r),
|
||||
layer_download_handler,
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||
|r| RequestSpan(evict_timeline_layer_handler).handle(r),
|
||||
evict_timeline_layer_handler,
|
||||
)
|
||||
.get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r))
|
||||
.get("/v1/panic", always_panic_handler)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -123,22 +123,6 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_layers_total",
|
||||
"Total on-demand downloaded layers"
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_bytes_total",
|
||||
"Total bytes of layers on-demand downloaded",
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_current_logical_size",
|
||||
|
||||
@@ -103,7 +103,6 @@ pub struct TenantConfOpt {
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(with = "humantime_serde")]
|
||||
#[serde(default)]
|
||||
pub checkpoint_timeout: Option<Duration>,
|
||||
|
||||
|
||||
@@ -218,10 +218,9 @@ use tracing::{debug, info, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::metrics::{
|
||||
MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
|
||||
REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
|
||||
};
|
||||
use crate::metrics::RemoteOpFileKind;
|
||||
use crate::metrics::RemoteOpKind;
|
||||
use crate::metrics::{MeasureRemoteOp, RemoteTimelineClientMetrics};
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
@@ -447,10 +446,6 @@ impl RemoteTimelineClient {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
|
||||
REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);
|
||||
|
||||
Ok(downloaded_size)
|
||||
}
|
||||
|
||||
|
||||
@@ -6,13 +6,11 @@
|
||||
use std::collections::HashSet;
|
||||
use std::future::Future;
|
||||
use std::path::Path;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
|
||||
use tracing::{info, warn};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
@@ -28,8 +26,6 @@ async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Er
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
|
||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||
|
||||
///
|
||||
/// If 'metadata' is given, we will validate that the downloaded file's size matches that
|
||||
/// in the metadata. (In the future, we might do more cross-checks, like CRC validation)
|
||||
@@ -68,28 +64,22 @@ pub async fn download_layer_file<'a>(
|
||||
// TODO: this doesn't use the cached fd for some reason?
|
||||
let mut destination_file = fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"create a destination file for layer '{}'",
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
let mut download = storage.download(&remote_path).await.with_context(|| {
|
||||
format!(
|
||||
"open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let bytes_amount = tokio::time::timeout(MAX_DOWNLOAD_DURATION, tokio::io::copy(&mut download.download_stream, &mut destination_file))
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
|
||||
.with_context(|| {
|
||||
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok((destination_file, bytes_amount))
|
||||
|
||||
},
|
||||
&format!("download {remote_path:?}"),
|
||||
).await?;
|
||||
@@ -310,7 +300,7 @@ where
|
||||
}
|
||||
Err(DownloadError::Other(ref err)) => {
|
||||
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
|
||||
warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
|
||||
error!("{description} still failed after {attempts} retries, giving up: {err:?}");
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -364,7 +364,7 @@ pub trait PersistentLayer: Layer {
|
||||
}
|
||||
|
||||
/// Permanently remove this layer from disk.
|
||||
fn delete_resident_layer_file(&self) -> Result<()>;
|
||||
fn delete(&self) -> Result<()>;
|
||||
|
||||
fn downcast_remote_layer(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
None
|
||||
|
||||
@@ -438,7 +438,7 @@ impl PersistentLayer for DeltaLayer {
|
||||
))
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
|
||||
@@ -252,7 +252,7 @@ impl PersistentLayer for ImageLayer {
|
||||
unimplemented!();
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
Ok(())
|
||||
|
||||
@@ -155,8 +155,8 @@ impl PersistentLayer for RemoteLayer {
|
||||
bail!("cannot iterate a remote layer");
|
||||
}
|
||||
|
||||
fn delete_resident_layer_file(&self) -> Result<()> {
|
||||
bail!("remote layer has no layer file");
|
||||
fn delete(&self) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn downcast_remote_layer<'a>(self: Arc<Self>) -> Option<std::sync::Arc<RemoteLayer>> {
|
||||
|
||||
@@ -662,8 +662,8 @@ impl Timeline {
|
||||
// update the index file on next flush iteration too. But it
|
||||
// could take a while until that happens.
|
||||
//
|
||||
// Additionally, only do this once before we return from this function.
|
||||
if last_round || res.is_ok() {
|
||||
// Additionally, only do this on the terminal round before sleeping.
|
||||
if last_round {
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
remote_client.schedule_index_upload_for_file_changes()?;
|
||||
}
|
||||
@@ -1047,12 +1047,11 @@ impl Timeline {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let layer_file_size = local_layer
|
||||
.file_size()
|
||||
.expect("Local layer should have a file size");
|
||||
|
||||
let layer_metadata = LayerFileMetadata::new(layer_file_size);
|
||||
|
||||
let layer_metadata = LayerFileMetadata::new(
|
||||
local_layer
|
||||
.file_size()
|
||||
.expect("Local layer should have a file size"),
|
||||
);
|
||||
let new_remote_layer = Arc::new(match local_layer.filename() {
|
||||
LayerFileName::Image(image_name) => RemoteLayer::new_img(
|
||||
self.tenant_id,
|
||||
@@ -1076,22 +1075,15 @@ impl Timeline {
|
||||
|
||||
let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
|
||||
Replacement::Replaced { .. } => {
|
||||
if let Err(e) = local_layer.delete_resident_layer_file() {
|
||||
let layer_size = local_layer.file_size();
|
||||
|
||||
if let Err(e) = local_layer.delete() {
|
||||
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
||||
}
|
||||
// Always decrement the physical size gauge, even if we failed to delete the file.
|
||||
// Rationale: we already replaced the layer with a remote layer in the layer map,
|
||||
// and any subsequent download_remote_layer will
|
||||
// 1. overwrite the file on disk and
|
||||
// 2. add the downloaded size to the resident size gauge.
|
||||
//
|
||||
// If there is no re-download, and we restart the pageserver, then load_layer_map
|
||||
// will treat the file as a local layer again, count it towards resident size,
|
||||
// and it'll be like the layer removal never happened.
|
||||
// The bump in resident size is perhaps unexpected but overall a robust behavior.
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
|
||||
if let Some(layer_size) = layer_size {
|
||||
self.metrics.resident_physical_size_gauge.sub(layer_size);
|
||||
}
|
||||
|
||||
true
|
||||
}
|
||||
@@ -1950,14 +1942,11 @@ impl Timeline {
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
updates: &mut BatchedUpdates<'_, dyn PersistentLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
if !layer.is_remote_layer() {
|
||||
layer.delete_resident_layer_file()?;
|
||||
let layer_file_size = layer
|
||||
.file_size()
|
||||
.expect("Local layer should have a file size");
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
let layer_size = layer.file_size();
|
||||
|
||||
layer.delete()?;
|
||||
if let Some(layer_size) = layer_size {
|
||||
self.metrics.resident_physical_size_gauge.sub(layer_size);
|
||||
}
|
||||
|
||||
// TODO Removing from the bottom of the layer map is expensive.
|
||||
@@ -3819,7 +3808,7 @@ impl Timeline {
|
||||
remote_layer.ongoing_download.close();
|
||||
} else {
|
||||
// Keep semaphore open. We'll drop the permit at the end of the function.
|
||||
error!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
|
||||
info!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
|
||||
}
|
||||
|
||||
// Don't treat it as an error if the task that triggered the download
|
||||
|
||||
@@ -23,11 +23,13 @@ use bytes::{BufMut, Bytes, BytesMut};
|
||||
use nix::poll::*;
|
||||
use serde::Serialize;
|
||||
use std::collections::VecDeque;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::io::{AsRawFd, RawFd};
|
||||
use std::os::unix::prelude::CommandExt;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
@@ -254,53 +256,52 @@ impl PostgresRedoManager {
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
|
||||
let start_time = Instant::now();
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let mut proc = self.stdin.lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
if proc.is_none() {
|
||||
self.launch(&mut proc, pg_version)?;
|
||||
}
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
let mut proc = self.stdin.lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
let result = self
|
||||
.apply_wal_records(proc, buf_tag, &base_img, records, wal_redo_timeout)
|
||||
.map_err(WalRedoError::IoError);
|
||||
// launch the WAL redo process on first use
|
||||
if proc.is_none() {
|
||||
self.launch(&mut proc, pg_version)?;
|
||||
}
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
|
||||
let end_time = Instant::now();
|
||||
let duration = end_time.duration_since(lock_time);
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
let result = self
|
||||
.apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout)
|
||||
.map_err(WalRedoError::IoError);
|
||||
|
||||
let len = records.len();
|
||||
let nbytes = records.iter().fold(0, |acumulator, record| {
|
||||
acumulator
|
||||
+ match &record.1 {
|
||||
NeonWalRecord::Postgres { rec, .. } => rec.len(),
|
||||
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
|
||||
}
|
||||
});
|
||||
let end_time = Instant::now();
|
||||
let duration = end_time.duration_since(lock_time);
|
||||
|
||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
|
||||
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
|
||||
let len = records.len();
|
||||
let nbytes = records.iter().fold(0, |acumulator, record| {
|
||||
acumulator
|
||||
+ match &record.1 {
|
||||
NeonWalRecord::Postgres { rec, .. } => rec.len(),
|
||||
_ => unreachable!("Only PostgreSQL records are accepted in this batch"),
|
||||
}
|
||||
});
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
|
||||
len,
|
||||
nbytes,
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
);
|
||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(len as f64);
|
||||
WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);
|
||||
|
||||
// If something went wrong, don't try to reuse the process. Kill it, and
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
error!(
|
||||
debug!(
|
||||
"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
|
||||
len,
|
||||
nbytes,
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
);
|
||||
|
||||
// If something went wrong, don't try to reuse the process. Kill it, and
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
error!(
|
||||
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
@@ -309,28 +310,24 @@ impl PostgresRedoManager {
|
||||
base_img_lsn,
|
||||
lsn
|
||||
);
|
||||
// self.stdin only holds stdin & stderr as_raw_fd().
|
||||
// Dropping it as part of take() doesn't close them.
|
||||
// The owning objects (ChildStdout and ChildStderr) are stored in
|
||||
// self.stdout and self.stderr, respsectively.
|
||||
// We intentionally keep them open here to avoid a race between
|
||||
// currently running `apply_wal_records()` and a `launch()` call
|
||||
// after we return here.
|
||||
// The currently running `apply_wal_records()` must not read from
|
||||
// the newly launched process.
|
||||
// By keeping self.stdout and self.stderr open here, `launch()` will
|
||||
// get other file descriptors for the new child's stdout and stderr,
|
||||
// and hence the current `apply_wal_records()` calls will observe
|
||||
// `output.stdout.as_raw_fd() != stdout_fd` .
|
||||
if let Some(proc) = self.stdin.lock().unwrap().take() {
|
||||
proc.child.kill_and_wait();
|
||||
}
|
||||
}
|
||||
n_attempts += 1;
|
||||
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
|
||||
return result;
|
||||
// self.stdin only holds stdin & stderr as_raw_fd().
|
||||
// Dropping it as part of take() doesn't close them.
|
||||
// The owning objects (ChildStdout and ChildStderr) are stored in
|
||||
// self.stdout and self.stderr, respsectively.
|
||||
// We intentionally keep them open here to avoid a race between
|
||||
// currently running `apply_wal_records()` and a `launch()` call
|
||||
// after we return here.
|
||||
// The currently running `apply_wal_records()` must not read from
|
||||
// the newly launched process.
|
||||
// By keeping self.stdout and self.stderr open here, `launch()` will
|
||||
// get other file descriptors for the new child's stdout and stderr,
|
||||
// and hence the current `apply_wal_records()` calls will observe
|
||||
// `output.stdout.as_raw_fd() != stdout_fd` .
|
||||
if let Some(proc) = self.stdin.lock().unwrap().take() {
|
||||
proc.child.kill_and_wait();
|
||||
}
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
///
|
||||
@@ -637,26 +634,26 @@ impl PostgresRedoManager {
|
||||
input: &mut MutexGuard<Option<ProcessInput>>,
|
||||
pg_version: u32,
|
||||
) -> Result<(), Error> {
|
||||
// Previous versions of wal-redo required data directory and that directories
|
||||
// occupied some space on disk. Remove it if we face it.
|
||||
//
|
||||
// This code could be dropped after one release cycle.
|
||||
let legacy_datadir = path_with_suffix_extension(
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = path_with_suffix_extension(
|
||||
self.conf
|
||||
.tenant_path(&self.tenant_id)
|
||||
.join("wal-redo-datadir"),
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
if legacy_datadir.exists() {
|
||||
info!("legacy wal-redo datadir {legacy_datadir:?} exists, removing");
|
||||
fs::remove_dir_all(&legacy_datadir).map_err(|e| {
|
||||
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
info!("old temporary datadir {datadir:?} exists, removing");
|
||||
fs::remove_dir_all(&datadir).map_err(|e| {
|
||||
Error::new(
|
||||
e.kind(),
|
||||
format!("legacy wal-redo datadir {legacy_datadir:?} removal failure: {e}"),
|
||||
format!("Old temporary dir {datadir:?} removal failure: {e}"),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
let pg_bin_dir_path = self
|
||||
.conf
|
||||
.pg_bin_dir(pg_version)
|
||||
@@ -666,6 +663,35 @@ impl PostgresRedoManager {
|
||||
.pg_lib_dir(pg_version)
|
||||
.map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?;
|
||||
|
||||
info!("running initdb in {}", datadir.display());
|
||||
let initdb = Command::new(pg_bin_dir_path.join("initdb"))
|
||||
.args(["-D", &datadir.to_string_lossy()])
|
||||
.arg("-N")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path) // macOS
|
||||
.close_fds()
|
||||
.output()
|
||||
.map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?;
|
||||
|
||||
if !initdb.status.success() {
|
||||
return Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"initdb failed\nstdout: {}\nstderr:\n{}",
|
||||
String::from_utf8_lossy(&initdb.stdout),
|
||||
String::from_utf8_lossy(&initdb.stderr)
|
||||
),
|
||||
));
|
||||
} else {
|
||||
// Limit shared cache for wal-redo-postgres
|
||||
let mut config = OpenOptions::new()
|
||||
.append(true)
|
||||
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
|
||||
config.write_all(b"shared_buffers=128kB\n")?;
|
||||
config.write_all(b"fsync=off\n")?;
|
||||
}
|
||||
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
.arg("--wal-redo")
|
||||
@@ -675,6 +701,7 @@ impl PostgresRedoManager {
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("PGDATA", &datadir)
|
||||
// The redo process is not trusted, and runs in seccomp mode that
|
||||
// doesn't allow it to open any files. We have to also make sure it
|
||||
// doesn't inherit any file descriptors from the pageserver, that
|
||||
@@ -744,7 +771,7 @@ impl PostgresRedoManager {
|
||||
&self,
|
||||
mut input: MutexGuard<Option<ProcessInput>>,
|
||||
tag: BufferTag,
|
||||
base_img: &Option<Bytes>,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
@@ -760,7 +787,7 @@ impl PostgresRedoManager {
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
build_push_page_msg(tag, img, &mut writebuf);
|
||||
build_push_page_msg(tag, &img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
|
||||
@@ -32,9 +32,6 @@
|
||||
|
||||
#define PageStoreTrace DEBUG5
|
||||
|
||||
#define MAX_RECONNECT_ATTEMPTS 5
|
||||
#define RECONNECT_INTERVAL_USEC 1000000
|
||||
|
||||
bool connected = false;
|
||||
PGconn *pageserver_conn = NULL;
|
||||
|
||||
@@ -55,8 +52,8 @@ int readahead_buffer_size = 128;
|
||||
|
||||
static void pageserver_flush(void);
|
||||
|
||||
static bool
|
||||
pageserver_connect(int elevel)
|
||||
static void
|
||||
pageserver_connect()
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
@@ -72,11 +69,10 @@ pageserver_connect(int elevel)
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
|
||||
ereport(elevel,
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "could not establish connection to pageserver"),
|
||||
errdetail_internal("%s", msg)));
|
||||
return false;
|
||||
}
|
||||
|
||||
query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
@@ -85,8 +81,7 @@ pageserver_connect(int elevel)
|
||||
{
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
neon_log(elevel, "could not send pagestream command to pageserver");
|
||||
return false;
|
||||
neon_log(ERROR, "could not send pagestream command to pageserver");
|
||||
}
|
||||
|
||||
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
|
||||
@@ -118,9 +113,8 @@ pageserver_connect(int elevel)
|
||||
FreeWaitEventSet(pageserver_conn_wes);
|
||||
pageserver_conn_wes = NULL;
|
||||
|
||||
neon_log(elevel, "could not complete handshake with pageserver: %s",
|
||||
neon_log(ERROR, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -128,7 +122,6 @@ pageserver_connect(int elevel)
|
||||
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw);
|
||||
|
||||
connected = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -156,11 +149,8 @@ retry:
|
||||
if (event.events & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(pageserver_conn))
|
||||
{
|
||||
neon_log(LOG, "could not get response from pageserver: %s",
|
||||
neon_log(ERROR, "could not get response from pageserver: %s",
|
||||
PQerrorMessage(pageserver_conn));
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
goto retry;
|
||||
@@ -200,62 +190,31 @@ static void
|
||||
pageserver_send(NeonRequest * request)
|
||||
{
|
||||
StringInfoData req_buff;
|
||||
int n_reconnect_attempts = 0;
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
pageserver_disconnect();
|
||||
|
||||
if (!connected)
|
||||
pageserver_connect();
|
||||
|
||||
req_buff = nm_pack_request(request);
|
||||
|
||||
/*
|
||||
* If pageserver is stopped, the connections from compute node are broken.
|
||||
* The compute node doesn't notice that immediately, but it will cause the next request to fail, usually on the next query.
|
||||
* That causes user-visible errors if pageserver is restarted, or the tenant is moved from one pageserver to another.
|
||||
* See https://github.com/neondatabase/neon/issues/1138
|
||||
* So try to reestablish connection in case of failure.
|
||||
* Send request.
|
||||
*
|
||||
* In principle, this could block if the output buffer is full, and we
|
||||
* should use async mode and check for interrupts while waiting. In
|
||||
* practice, our requests are small enough to always fit in the output and
|
||||
* TCP buffer.
|
||||
*/
|
||||
while (true)
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
if (!connected)
|
||||
{
|
||||
if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR))
|
||||
{
|
||||
n_reconnect_attempts += 1;
|
||||
pg_usleep(RECONNECT_INTERVAL_USEC);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
/*
|
||||
* Send request.
|
||||
*
|
||||
* In principle, this could block if the output buffer is full, and we
|
||||
* should use async mode and check for interrupts while waiting. In
|
||||
* practice, our requests are small enough to always fit in the output and
|
||||
* TCP buffer.
|
||||
*/
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
if (n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS)
|
||||
{
|
||||
neon_log(LOG, "failed to send page request (try to reconnect): %s", msg);
|
||||
if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */
|
||||
pg_usleep(RECONNECT_INTERVAL_USEC);
|
||||
n_reconnect_attempts += 1;
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "failed to send page request: %s", msg);
|
||||
}
|
||||
}
|
||||
break;
|
||||
pageserver_disconnect();
|
||||
neon_log(ERROR, "failed to send page request: %s", msg);
|
||||
}
|
||||
|
||||
pfree(req_buff.data);
|
||||
|
||||
n_unflushed_requests++;
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
# pgxs/neon_utils/Makefile
|
||||
|
||||
|
||||
MODULE_big = neon_utils
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
neon_utils.o
|
||||
|
||||
EXTENSION = neon_utils
|
||||
DATA = neon_utils--1.0.sql
|
||||
PGFILEDESC = "neon_utils - small useful functions"
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
@@ -1,6 +0,0 @@
|
||||
CREATE FUNCTION num_cpus()
|
||||
RETURNS int
|
||||
AS 'MODULE_PATHNAME', 'num_cpus'
|
||||
LANGUAGE C STRICT
|
||||
PARALLEL UNSAFE
|
||||
VOLATILE;
|
||||
@@ -1,35 +0,0 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* neon_utils.c
|
||||
* neon_utils - small useful functions
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* contrib/neon_utils/neon_utils.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include "postgres.h"
|
||||
#include "fmgr.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
|
||||
PG_FUNCTION_INFO_V1(num_cpus);
|
||||
|
||||
Datum
|
||||
num_cpus(PG_FUNCTION_ARGS)
|
||||
{
|
||||
#ifdef _WIN32
|
||||
SYSTEM_INFO sysinfo;
|
||||
GetSystemInfo(&sysinfo);
|
||||
uint32 num_cpus = (uint32) sysinfo.dwNumberOfProcessors;
|
||||
#else
|
||||
uint32 num_cpus = (uint32) sysconf(_SC_NPROCESSORS_ONLN);
|
||||
#endif
|
||||
PG_RETURN_UINT32(num_cpus);
|
||||
}
|
||||
@@ -1,6 +0,0 @@
|
||||
# neon_utils extension
|
||||
comment = 'neon_utils - small useful functions'
|
||||
default_version = '1.0'
|
||||
module_pathname = '$libdir/neon_utils'
|
||||
relocatable = true
|
||||
trusted = true
|
||||
@@ -65,14 +65,6 @@
|
||||
#include "rusagestub.h"
|
||||
#endif
|
||||
|
||||
#include "access/clog.h"
|
||||
#include "access/commit_ts.h"
|
||||
#include "access/heapam.h"
|
||||
#include "access/multixact.h"
|
||||
#include "access/nbtree.h"
|
||||
#include "access/subtrans.h"
|
||||
#include "access/syncscan.h"
|
||||
#include "access/twophase.h"
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlog_internal.h"
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
@@ -80,36 +72,18 @@
|
||||
#endif
|
||||
#include "access/xlogutils.h"
|
||||
#include "catalog/pg_class.h"
|
||||
#include "commands/async.h"
|
||||
#include "libpq/libpq.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "miscadmin.h"
|
||||
#include "pgstat.h"
|
||||
#include "postmaster/autovacuum.h"
|
||||
#include "postmaster/bgworker_internals.h"
|
||||
#include "postmaster/bgwriter.h"
|
||||
#include "postmaster/postmaster.h"
|
||||
#include "replication/logicallauncher.h"
|
||||
#include "replication/origin.h"
|
||||
#include "replication/slot.h"
|
||||
#include "replication/walreceiver.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/dsm.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/pg_shmem.h"
|
||||
#include "storage/pmsignal.h"
|
||||
#include "storage/predicate.h"
|
||||
#include "storage/proc.h"
|
||||
#include "storage/procarray.h"
|
||||
#include "storage/procsignal.h"
|
||||
#include "storage/sinvaladt.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "storage/spin.h"
|
||||
#include "tcop/tcopprot.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "utils/ps_status.h"
|
||||
#include "utils/snapmgr.h"
|
||||
|
||||
#include "inmem_smgr.h"
|
||||
|
||||
@@ -127,7 +101,6 @@ static void apply_error_callback(void *arg);
|
||||
static bool redo_block_filter(XLogReaderState *record, uint8 block_id);
|
||||
static void GetPage(StringInfo input_message);
|
||||
static ssize_t buffered_read(void *buf, size_t count);
|
||||
static void CreateFakeSharedMemoryAndSemaphores();
|
||||
|
||||
static BufferTag target_redo_tag;
|
||||
|
||||
@@ -168,7 +141,7 @@ enter_seccomp_mode(void)
|
||||
PG_SCMP_ALLOW(shmctl),
|
||||
PG_SCMP_ALLOW(shmdt),
|
||||
PG_SCMP_ALLOW(unlink), // shm_unlink
|
||||
*/
|
||||
*/
|
||||
};
|
||||
|
||||
#ifdef MALLOC_NO_MMAP
|
||||
@@ -204,7 +177,6 @@ WalRedoMain(int argc, char *argv[])
|
||||
* buffers. So let's keep it small (default value is 1024)
|
||||
*/
|
||||
num_temp_buffers = 4;
|
||||
NBuffers = 4;
|
||||
|
||||
/*
|
||||
* install the simple in-memory smgr
|
||||
@@ -212,33 +184,49 @@ WalRedoMain(int argc, char *argv[])
|
||||
smgr_hook = smgr_inmem;
|
||||
smgr_init_hook = smgr_init_inmem;
|
||||
|
||||
/*
|
||||
* Validate we have been given a reasonable-looking DataDir and change into it.
|
||||
*/
|
||||
checkDataDir();
|
||||
ChangeToDataDir();
|
||||
|
||||
/*
|
||||
* Create lockfile for data directory.
|
||||
*/
|
||||
CreateDataDirLockFile(false);
|
||||
|
||||
/* read control file (error checking and contains config ) */
|
||||
LocalProcessControlFile(false);
|
||||
|
||||
/*
|
||||
* process any libraries that should be preloaded at postmaster start
|
||||
*/
|
||||
process_shared_preload_libraries();
|
||||
|
||||
/* Initialize MaxBackends (if under postmaster, was done already) */
|
||||
MaxConnections = 1;
|
||||
max_worker_processes = 0;
|
||||
max_parallel_workers = 0;
|
||||
max_wal_senders = 0;
|
||||
InitializeMaxBackends();
|
||||
|
||||
/* Disable lastWrittenLsnCache */
|
||||
lastWrittenLsnCacheSize = 0;
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
/*
|
||||
* Give preloaded libraries a chance to request additional shared memory.
|
||||
*/
|
||||
process_shmem_requests();
|
||||
|
||||
/*
|
||||
* Now that loadable modules have had their chance to request additional
|
||||
* shared memory, determine the value of any runtime-computed GUCs that
|
||||
* depend on the amount of shared memory required.
|
||||
*/
|
||||
InitializeShmemGUCs();
|
||||
|
||||
/*
|
||||
* This will try to access data directory which we do not set.
|
||||
* Seems to be pretty safe to disable.
|
||||
* Now that modules have been loaded, we can process any custom resource
|
||||
* managers specified in the wal_consistency_checking GUC.
|
||||
*/
|
||||
/* InitializeWalConsistencyChecking(); */
|
||||
InitializeWalConsistencyChecking();
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We have our own version of CreateSharedMemoryAndSemaphores() that
|
||||
* sets up local memory instead of shared one.
|
||||
*/
|
||||
CreateFakeSharedMemoryAndSemaphores();
|
||||
CreateSharedMemoryAndSemaphores();
|
||||
|
||||
/*
|
||||
* Remember stand-alone backend startup time,roughly at the same point
|
||||
@@ -366,172 +354,6 @@ WalRedoMain(int argc, char *argv[])
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Initialize dummy shmem.
|
||||
*
|
||||
* This code follows CreateSharedMemoryAndSemaphores() but manually sets up
|
||||
* the shmem header and skips few initialization steps that are not needed for
|
||||
* WAL redo.
|
||||
*
|
||||
* I've also tried removing most of initialization functions that request some
|
||||
* memory (like ApplyLauncherShmemInit and friends) but in reality it haven't had
|
||||
* any sizeable effect on RSS, so probably such clean up not worth the risk of having
|
||||
* half-initialized postgres.
|
||||
*/
|
||||
static void
|
||||
CreateFakeSharedMemoryAndSemaphores()
|
||||
{
|
||||
PGShmemHeader *shim = NULL;
|
||||
PGShmemHeader *hdr;
|
||||
Size size;
|
||||
int numSemas;
|
||||
char cwd[MAXPGPATH];
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
size = CalculateShmemSize(&numSemas);
|
||||
#else
|
||||
/*
|
||||
* Postgres v14 doesn't have a separate CalculateShmemSize(). Use result of the
|
||||
* corresponging calculation in CreateSharedMemoryAndSemaphores()
|
||||
*/
|
||||
size = 1409024;
|
||||
numSemas = 10;
|
||||
#endif
|
||||
|
||||
/* Dummy implementation of PGSharedMemoryCreate() */
|
||||
{
|
||||
hdr = (PGShmemHeader *) malloc(size);
|
||||
if (!hdr)
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("[neon-wal-redo] can not allocate (pseudo-) shared memory")));
|
||||
|
||||
hdr->creatorPID = getpid();
|
||||
hdr->magic = PGShmemMagic;
|
||||
hdr->dsm_control = 0;
|
||||
hdr->device = 42; /* not relevant for non-shared memory */
|
||||
hdr->inode = 43; /* not relevant for non-shared memory */
|
||||
hdr->totalsize = size;
|
||||
hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
|
||||
|
||||
shim = hdr;
|
||||
UsedShmemSegAddr = hdr;
|
||||
UsedShmemSegID = (unsigned long) 42; /* not relevant for non-shared memory */
|
||||
}
|
||||
|
||||
InitShmemAccess(hdr);
|
||||
|
||||
/*
|
||||
* Reserve semaphores uses dir name as a source of entropy. Set it to cwd(). Rest
|
||||
* of the code does not need DataDir access so nullify DataDir after
|
||||
* PGReserveSemaphores() to error out if something will try to access it.
|
||||
*/
|
||||
if (!getcwd(cwd, MAXPGPATH))
|
||||
ereport(FATAL,
|
||||
(errcode(ERRCODE_INTERNAL_ERROR),
|
||||
errmsg("[neon-wal-redo] can not read current directory name")));
|
||||
DataDir = cwd;
|
||||
PGReserveSemaphores(numSemas);
|
||||
DataDir = NULL;
|
||||
|
||||
/*
|
||||
* The rest of function follows CreateSharedMemoryAndSemaphores() closely,
|
||||
* skipped parts are marked with comments.
|
||||
*/
|
||||
InitShmemAllocation();
|
||||
|
||||
/*
|
||||
* Now initialize LWLocks, which do shared memory allocation and are
|
||||
* needed for InitShmemIndex.
|
||||
*/
|
||||
CreateLWLocks();
|
||||
|
||||
/*
|
||||
* Set up shmem.c index hashtable
|
||||
*/
|
||||
InitShmemIndex();
|
||||
|
||||
dsm_shmem_init();
|
||||
|
||||
/*
|
||||
* Set up xlog, clog, and buffers
|
||||
*/
|
||||
XLOGShmemInit();
|
||||
CLOGShmemInit();
|
||||
CommitTsShmemInit();
|
||||
SUBTRANSShmemInit();
|
||||
MultiXactShmemInit();
|
||||
InitBufferPool();
|
||||
|
||||
/*
|
||||
* Set up lock manager
|
||||
*/
|
||||
InitLocks();
|
||||
|
||||
/*
|
||||
* Set up predicate lock manager
|
||||
*/
|
||||
InitPredicateLocks();
|
||||
|
||||
/*
|
||||
* Set up process table
|
||||
*/
|
||||
if (!IsUnderPostmaster)
|
||||
InitProcGlobal();
|
||||
CreateSharedProcArray();
|
||||
CreateSharedBackendStatus();
|
||||
TwoPhaseShmemInit();
|
||||
BackgroundWorkerShmemInit();
|
||||
|
||||
/*
|
||||
* Set up shared-inval messaging
|
||||
*/
|
||||
CreateSharedInvalidationState();
|
||||
|
||||
/*
|
||||
* Set up interprocess signaling mechanisms
|
||||
*/
|
||||
PMSignalShmemInit();
|
||||
ProcSignalShmemInit();
|
||||
CheckpointerShmemInit();
|
||||
AutoVacuumShmemInit();
|
||||
ReplicationSlotsShmemInit();
|
||||
ReplicationOriginShmemInit();
|
||||
WalSndShmemInit();
|
||||
WalRcvShmemInit();
|
||||
PgArchShmemInit();
|
||||
ApplyLauncherShmemInit();
|
||||
|
||||
/*
|
||||
* Set up other modules that need some shared memory space
|
||||
*/
|
||||
SnapMgrInit();
|
||||
BTreeShmemInit();
|
||||
SyncScanShmemInit();
|
||||
/* Skip due to the 'pg_notify' directory check */
|
||||
/* AsyncShmemInit(); */
|
||||
|
||||
#ifdef EXEC_BACKEND
|
||||
|
||||
/*
|
||||
* Alloc the win32 shared backend array
|
||||
*/
|
||||
if (!IsUnderPostmaster)
|
||||
ShmemBackendArrayAllocation();
|
||||
#endif
|
||||
|
||||
/* Initialize dynamic shared memory facilities. */
|
||||
if (!IsUnderPostmaster)
|
||||
dsm_postmaster_startup(shim);
|
||||
|
||||
/*
|
||||
* Now give loadable modules a chance to set up their shmem allocations
|
||||
*/
|
||||
if (shmem_startup_hook)
|
||||
shmem_startup_hook();
|
||||
}
|
||||
|
||||
|
||||
/* Version compatility wrapper for ReadBufferWithoutRelcache */
|
||||
static inline Buffer
|
||||
NeonRedoReadBuffer(RelFileNode rnode,
|
||||
|
||||
@@ -25,11 +25,12 @@ impl CancelMap {
|
||||
cancel_closure.try_cancel_query().await
|
||||
}
|
||||
|
||||
/// Create a new session, with a new client-facing random cancellation key.
|
||||
///
|
||||
/// Use `enable_query_cancellation` to register the Postgres backend's cancellation
|
||||
/// key with it.
|
||||
pub fn new_session<'a>(&'a self) -> anyhow::Result<Session<'a>> {
|
||||
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
|
||||
pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result<V>
|
||||
where
|
||||
F: FnOnce(Session<'a>) -> R,
|
||||
R: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
// HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
|
||||
// expose it and we don't want to do another roundtrip to query
|
||||
// for it. The client will be able to notice that this is not the
|
||||
@@ -43,9 +44,17 @@ impl CancelMap {
|
||||
.write()
|
||||
.try_insert(key, None)
|
||||
.map_err(|_| anyhow!("query cancellation key already exists: {key}"))?;
|
||||
info!("registered new query cancellation key {key}");
|
||||
|
||||
Ok(Session::new(key, self))
|
||||
// This will guarantee that the session gets dropped
|
||||
// as soon as the future is finished.
|
||||
scopeguard::defer! {
|
||||
self.0.write().remove(&key);
|
||||
info!("dropped query cancellation key {key}");
|
||||
}
|
||||
|
||||
info!("registered new query cancellation key {key}");
|
||||
let session = Session::new(key, self);
|
||||
f(session).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -102,7 +111,7 @@ impl<'a> Session<'a> {
|
||||
impl Session<'_> {
|
||||
/// Store the cancel token for the given session.
|
||||
/// This enables query cancellation in [`crate::proxy::handshake`].
|
||||
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
info!("enabling query cancellation for this session");
|
||||
self.cancel_map
|
||||
.0
|
||||
@@ -113,14 +122,6 @@ impl Session<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Drop for Session<'a> {
|
||||
fn drop(&mut self) {
|
||||
let key = &self.key;
|
||||
self.cancel_map.0.write().remove(key);
|
||||
info!("dropped query cancellation key {key}");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -131,14 +132,14 @@ mod tests {
|
||||
static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
let session = CANCEL_MAP.new_session()?;
|
||||
let task = tokio::spawn(async move {
|
||||
let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
|
||||
assert!(CANCEL_MAP.contains(&session));
|
||||
|
||||
tx.send(()).expect("failed to send");
|
||||
futures::future::pending::<()>().await; // sleep forever
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}));
|
||||
|
||||
// Wait until the task has been spawned.
|
||||
rx.await.context("failed to hear from the task")?;
|
||||
|
||||
@@ -21,7 +21,6 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
|
||||
pub struct Ids {
|
||||
pub endpoint_id: String,
|
||||
pub branch_id: String,
|
||||
}
|
||||
|
||||
pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
|
||||
@@ -75,23 +74,12 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
.find(|l| l.get_name() == "endpoint_id")
|
||||
.unwrap()
|
||||
.get_value();
|
||||
let branch_id = ms
|
||||
.get_label()
|
||||
.iter()
|
||||
.find(|l| l.get_name() == "branch_id")
|
||||
.unwrap()
|
||||
.get_value();
|
||||
|
||||
let value = ms.get_counter().get_value() as u64;
|
||||
|
||||
debug!(
|
||||
"branch_id {} endpoint_id {} val: {}",
|
||||
branch_id, endpoint_id, value
|
||||
);
|
||||
debug!("endpoint_id:val - {}: {}", endpoint_id, value);
|
||||
current_metrics.push((
|
||||
Ids {
|
||||
endpoint_id: endpoint_id.to_string(),
|
||||
branch_id: "".to_string(),
|
||||
},
|
||||
(value, Utc::now()),
|
||||
));
|
||||
@@ -143,7 +131,6 @@ async fn collect_metrics_iteration(
|
||||
value,
|
||||
extra: Ids {
|
||||
endpoint_id: curr_key.endpoint_id.clone(),
|
||||
branch_id: curr_key.branch_id.clone(),
|
||||
},
|
||||
})
|
||||
})
|
||||
@@ -185,7 +172,6 @@ async fn collect_metrics_iteration(
|
||||
cached_metrics
|
||||
.entry(Ids {
|
||||
endpoint_id: send_metric.extra.endpoint_id.clone(),
|
||||
branch_id: send_metric.extra.branch_id.clone(),
|
||||
})
|
||||
// update cached value (add delta) and time
|
||||
.and_modify(|e| {
|
||||
|
||||
@@ -133,14 +133,10 @@ pub async fn handle_ws_client(
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let client = Client::new(
|
||||
stream,
|
||||
creds,
|
||||
¶ms,
|
||||
session_id,
|
||||
cancel_map.new_session()?,
|
||||
);
|
||||
client.connect_to_db(true).await
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session, true))
|
||||
.await
|
||||
}
|
||||
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
@@ -176,14 +172,10 @@ async fn handle_client(
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let client = Client::new(
|
||||
stream,
|
||||
creds,
|
||||
¶ms,
|
||||
session_id,
|
||||
cancel_map.new_session()?,
|
||||
);
|
||||
client.connect_to_db(false).await
|
||||
let client = Client::new(stream, creds, ¶ms, session_id);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session, false))
|
||||
.await
|
||||
}
|
||||
|
||||
/// Establish a (most probably, secure) connection with the client.
|
||||
@@ -389,8 +381,6 @@ struct Client<'a, S> {
|
||||
params: &'a StartupMessageParams,
|
||||
/// Unique connection ID.
|
||||
session_id: uuid::Uuid,
|
||||
|
||||
session: cancellation::Session<'a>,
|
||||
}
|
||||
|
||||
impl<'a, S> Client<'a, S> {
|
||||
@@ -400,27 +390,28 @@ impl<'a, S> Client<'a, S> {
|
||||
creds: auth::BackendType<'a, auth::ClientCredentials<'a>>,
|
||||
params: &'a StartupMessageParams,
|
||||
session_id: uuid::Uuid,
|
||||
session: cancellation::Session<'a>,
|
||||
) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
creds,
|
||||
params,
|
||||
session_id,
|
||||
session,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
/// Let the client authenticate and connect to the designated compute node.
|
||||
async fn connect_to_db(self, allow_cleartext: bool) -> anyhow::Result<()> {
|
||||
async fn connect_to_db(
|
||||
self,
|
||||
session: cancellation::Session<'_>,
|
||||
allow_cleartext: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let Self {
|
||||
mut stream,
|
||||
mut creds,
|
||||
params,
|
||||
session_id,
|
||||
session,
|
||||
} = self;
|
||||
|
||||
let extra = console::ConsoleReqExtra {
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# script that checks every feature.
|
||||
#
|
||||
# manual-range-contains wants
|
||||
# !(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)
|
||||
# !(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)
|
||||
# instead of
|
||||
# len < 4 || len > MAX_STARTUP_PACKET_LENGTH
|
||||
# , let's disagree.
|
||||
|
||||
@@ -10,7 +10,6 @@ anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono.workspace = true
|
||||
clap = { workspace = true, features = ["derive"] }
|
||||
const_format.workspace = true
|
||||
crc32c.workspace = true
|
||||
@@ -19,6 +18,7 @@ git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
postgres.workspace = true
|
||||
|
||||
@@ -1,264 +0,0 @@
|
||||
//! Utils for dumping full state of the safekeeper.
|
||||
|
||||
use std::fs;
|
||||
use std::fs::DirEntry;
|
||||
use std::io::BufReader;
|
||||
use std::io::Read;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use serde::Serialize;
|
||||
|
||||
use utils::http::json::display_serialize;
|
||||
use utils::id::NodeId;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::safekeeper::SafeKeeperState;
|
||||
use crate::safekeeper::SafekeeperMemState;
|
||||
use crate::safekeeper::TermHistory;
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
use crate::timeline::ReplicaState;
|
||||
use crate::GlobalTimelines;
|
||||
|
||||
/// Various filters that influence the resulting JSON output.
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Args {
|
||||
/// Dump all available safekeeper state. False by default.
|
||||
pub dump_all: bool,
|
||||
|
||||
/// Dump control_file content. Uses value of `dump_all` by default.
|
||||
pub dump_control_file: bool,
|
||||
|
||||
/// Dump in-memory state. Uses value of `dump_all` by default.
|
||||
pub dump_memory: bool,
|
||||
|
||||
/// Dump all disk files in a timeline directory. Uses value of `dump_all` by default.
|
||||
pub dump_disk_content: bool,
|
||||
|
||||
/// Dump full term history. True by default.
|
||||
pub dump_term_history: bool,
|
||||
|
||||
/// Filter timelines by tenant_id.
|
||||
pub tenant_id: Option<TenantId>,
|
||||
|
||||
/// Filter timelines by timeline_id.
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
}
|
||||
|
||||
/// Response for debug dump request.
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Response {
|
||||
pub start_time: DateTime<Utc>,
|
||||
pub finish_time: DateTime<Utc>,
|
||||
pub timelines: Vec<Timeline>,
|
||||
pub timelines_count: usize,
|
||||
pub config: Config,
|
||||
}
|
||||
|
||||
/// Safekeeper configuration.
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Config {
|
||||
pub id: NodeId,
|
||||
pub workdir: PathBuf,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub no_sync: bool,
|
||||
pub max_offloader_lag_bytes: u64,
|
||||
pub wal_backup_enabled: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Timeline {
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
pub timeline_id: TimelineId,
|
||||
pub control_file: Option<SafeKeeperState>,
|
||||
pub memory: Option<Memory>,
|
||||
pub disk_content: Option<DiskContent>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Memory {
|
||||
pub is_cancelled: bool,
|
||||
pub peers_info_len: usize,
|
||||
pub replicas: Vec<Option<ReplicaState>>,
|
||||
pub wal_backup_active: bool,
|
||||
pub active: bool,
|
||||
pub num_computes: u32,
|
||||
pub last_removed_segno: XLogSegNo,
|
||||
pub epoch_start_lsn: Lsn,
|
||||
pub mem_state: SafekeeperMemState,
|
||||
|
||||
// PhysicalStorage state.
|
||||
pub write_lsn: Lsn,
|
||||
pub write_record_lsn: Lsn,
|
||||
pub flush_lsn: Lsn,
|
||||
pub file_open: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct DiskContent {
|
||||
pub files: Vec<FileInfo>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct FileInfo {
|
||||
pub name: String,
|
||||
pub size: u64,
|
||||
pub created: DateTime<Utc>,
|
||||
pub modified: DateTime<Utc>,
|
||||
pub start_zeroes: u64,
|
||||
pub end_zeroes: u64,
|
||||
// TODO: add sha256 checksum
|
||||
}
|
||||
|
||||
/// Build debug dump response, using the provided [`Args`] filters.
|
||||
pub fn build(args: Args) -> Result<Response> {
|
||||
let start_time = Utc::now();
|
||||
let timelines_count = GlobalTimelines::timelines_count();
|
||||
|
||||
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
|
||||
// If both tenant_id and timeline_id are specified, we can just get the
|
||||
// timeline directly, without taking a snapshot of the whole list.
|
||||
let ttid = TenantTimelineId::new(args.tenant_id.unwrap(), args.timeline_id.unwrap());
|
||||
if let Ok(tli) = GlobalTimelines::get(ttid) {
|
||||
vec![tli]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
} else {
|
||||
// Otherwise, take a snapshot of the whole list.
|
||||
GlobalTimelines::get_all()
|
||||
};
|
||||
|
||||
// TODO: return Stream instead of Vec
|
||||
let mut timelines = Vec::new();
|
||||
for tli in ptrs_snapshot {
|
||||
let ttid = tli.ttid;
|
||||
if let Some(tenant_id) = args.tenant_id {
|
||||
if tenant_id != ttid.tenant_id {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if let Some(timeline_id) = args.timeline_id {
|
||||
if timeline_id != ttid.timeline_id {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
let control_file = if args.dump_control_file {
|
||||
let mut state = tli.get_state().1;
|
||||
if !args.dump_term_history {
|
||||
state.acceptor_state.term_history = TermHistory(vec![]);
|
||||
}
|
||||
Some(state)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let memory = if args.dump_memory {
|
||||
Some(tli.memory_dump())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let disk_content = if args.dump_disk_content {
|
||||
// build_disk_content can fail, but we don't want to fail the whole
|
||||
// request because of that.
|
||||
build_disk_content(&tli.timeline_dir).ok()
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let timeline = Timeline {
|
||||
tenant_id: ttid.tenant_id,
|
||||
timeline_id: ttid.timeline_id,
|
||||
control_file,
|
||||
memory,
|
||||
disk_content,
|
||||
};
|
||||
timelines.push(timeline);
|
||||
}
|
||||
|
||||
let config = GlobalTimelines::get_global_config();
|
||||
|
||||
Ok(Response {
|
||||
start_time,
|
||||
finish_time: Utc::now(),
|
||||
timelines,
|
||||
timelines_count,
|
||||
config: build_config(config),
|
||||
})
|
||||
}
|
||||
|
||||
/// Builds DiskContent from a directory path. It can fail if the directory
|
||||
/// is deleted between the time we get the path and the time we try to open it.
|
||||
fn build_disk_content(path: &std::path::Path) -> Result<DiskContent> {
|
||||
let mut files = Vec::new();
|
||||
for entry in fs::read_dir(path)? {
|
||||
if entry.is_err() {
|
||||
continue;
|
||||
}
|
||||
let file = build_file_info(entry?);
|
||||
if file.is_err() {
|
||||
continue;
|
||||
}
|
||||
files.push(file?);
|
||||
}
|
||||
|
||||
Ok(DiskContent { files })
|
||||
}
|
||||
|
||||
/// Builds FileInfo from DirEntry. Sometimes it can return an error
|
||||
/// if the file is deleted between the time we get the DirEntry
|
||||
/// and the time we try to open it.
|
||||
fn build_file_info(entry: DirEntry) -> Result<FileInfo> {
|
||||
let metadata = entry.metadata()?;
|
||||
let path = entry.path();
|
||||
let name = path
|
||||
.file_name()
|
||||
.and_then(|x| x.to_str())
|
||||
.unwrap_or("")
|
||||
.to_owned();
|
||||
let mut file = fs::File::open(path)?;
|
||||
let mut reader = BufReader::new(&mut file).bytes().filter_map(|x| x.ok());
|
||||
|
||||
let start_zeroes = reader.by_ref().take_while(|&x| x == 0).count() as u64;
|
||||
let mut end_zeroes = 0;
|
||||
for b in reader {
|
||||
if b == 0 {
|
||||
end_zeroes += 1;
|
||||
} else {
|
||||
end_zeroes = 0;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(FileInfo {
|
||||
name,
|
||||
size: metadata.len(),
|
||||
created: DateTime::from(metadata.created()?),
|
||||
modified: DateTime::from(metadata.modified()?),
|
||||
start_zeroes,
|
||||
end_zeroes,
|
||||
})
|
||||
}
|
||||
|
||||
/// Converts SafeKeeperConf to Config, filtering out the fields that are not
|
||||
/// supposed to be exposed.
|
||||
fn build_config(config: SafeKeeperConf) -> Config {
|
||||
Config {
|
||||
id: config.my_id,
|
||||
workdir: config.workdir,
|
||||
listen_pg_addr: config.listen_pg_addr,
|
||||
listen_http_addr: config.listen_http_addr,
|
||||
no_sync: config.no_sync,
|
||||
max_offloader_lag_bytes: config.max_offloader_lag_bytes,
|
||||
wal_backup_enabled: config.wal_backup_enabled,
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,6 @@ use tracing::{info, info_span, Instrument};
|
||||
use crate::auth::check_permission;
|
||||
use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
|
||||
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
use postgres_backend::QueryError;
|
||||
use postgres_backend::{self, PostgresBackend};
|
||||
@@ -29,8 +28,6 @@ pub struct SafekeeperPostgresHandler {
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub ttid: TenantTimelineId,
|
||||
/// Unique connection id is logged in spans for observability.
|
||||
pub conn_id: ConnectionId,
|
||||
claims: Option<Claims>,
|
||||
}
|
||||
|
||||
@@ -40,9 +37,11 @@ enum SafekeeperPostgresCommand {
|
||||
StartReplication { start_lsn: Lsn },
|
||||
IdentifySystem,
|
||||
JSONCtrl { cmd: AppendLogicalMessage },
|
||||
Show { guc: String },
|
||||
}
|
||||
|
||||
fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
let cmd_lowercase = cmd.to_ascii_lowercase();
|
||||
if cmd.starts_with("START_WAL_PUSH") {
|
||||
Ok(SafekeeperPostgresCommand::StartWalPush)
|
||||
} else if cmd.starts_with("START_REPLICATION") {
|
||||
@@ -61,6 +60,14 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
|
||||
Ok(SafekeeperPostgresCommand::JSONCtrl {
|
||||
cmd: serde_json::from_str(cmd)?,
|
||||
})
|
||||
} else if cmd_lowercase.starts_with("show") {
|
||||
let re = Regex::new(r"show ((?:[[:alpha:]]|_)+)").unwrap();
|
||||
let mut caps = re.captures_iter(&cmd_lowercase);
|
||||
let guc = caps
|
||||
.next()
|
||||
.map(|cap| cap[1].parse::<String>())
|
||||
.context("parse guc in SHOW command")??;
|
||||
Ok(SafekeeperPostgresCommand::Show { guc })
|
||||
} else {
|
||||
anyhow::bail!("unsupported command {cmd}");
|
||||
}
|
||||
@@ -176,6 +183,7 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
.await
|
||||
}
|
||||
SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await,
|
||||
SafekeeperPostgresCommand::Show { guc } => self.handle_show(guc, pgb).await,
|
||||
SafekeeperPostgresCommand::JSONCtrl { ref cmd } => {
|
||||
handle_json_ctrl(self, pgb, cmd).await
|
||||
}
|
||||
@@ -184,14 +192,13 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
}
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
pub fn new(conf: SafeKeeperConf, conn_id: u32) -> Self {
|
||||
pub fn new(conf: SafeKeeperConf) -> Self {
|
||||
SafekeeperPostgresHandler {
|
||||
conf,
|
||||
appname: None,
|
||||
tenant_id: None,
|
||||
timeline_id: None,
|
||||
ttid: TenantTimelineId::empty(),
|
||||
conn_id,
|
||||
claims: None,
|
||||
}
|
||||
}
|
||||
@@ -220,7 +227,7 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
) -> Result<(), QueryError> {
|
||||
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
|
||||
let tli = GlobalTimelines::get(self.ttid)?;
|
||||
|
||||
let lsn = if self.is_walproposer_recovery() {
|
||||
// walproposer should get all local WAL until flush_lsn
|
||||
@@ -273,6 +280,40 @@ impl SafekeeperPostgresHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_show(
|
||||
&mut self,
|
||||
guc: String,
|
||||
pgb: &mut PostgresBackend,
|
||||
) -> Result<(), QueryError> {
|
||||
match guc.as_str() {
|
||||
// pg_receivewal wants it
|
||||
"data_directory_mode" => {
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::int8_col(
|
||||
b"data_directory_mode",
|
||||
)]))?
|
||||
// xxx we could return real one, not just 0700
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some("0700".as_bytes())]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
// pg_receivewal wants it
|
||||
"wal_segment_size" => {
|
||||
let tli = GlobalTimelines::get(self.ttid)?;
|
||||
let wal_seg_size = tli.get_state().1.server.wal_seg_size;
|
||||
let wal_seg_size_mb = (wal_seg_size / 1024 / 1024).to_string() + "MB";
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"wal_segment_size",
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(wal_seg_size_mb.as_bytes())]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
_ => {
|
||||
return Err(anyhow::anyhow!("SHOW of unknown setting").into());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns true if current connection is a replication connection, originating
|
||||
/// from a walproposer recovery function. This connection gets a special handling:
|
||||
/// safekeeper must stream all local WAL till the flush_lsn, whether committed or not.
|
||||
|
||||
@@ -119,12 +119,6 @@ paths:
|
||||
$ref: "#/components/responses/ForbiddenError"
|
||||
default:
|
||||
$ref: "#/components/responses/GenericError"
|
||||
"404":
|
||||
description: Timeline not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
|
||||
delete:
|
||||
tags:
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
use hyper::{Body, Request, Response, StatusCode, Uri};
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use safekeeper_api::models::SkTimelineInfo;
|
||||
use serde::Serialize;
|
||||
use serde::Serializer;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
use std::fmt::Display;
|
||||
|
||||
use std::sync::Arc;
|
||||
use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use tokio::task::JoinError;
|
||||
use utils::http::json::display_serialize;
|
||||
|
||||
use crate::debug_dump;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::safekeeper::Term;
|
||||
|
||||
@@ -55,6 +55,15 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Serialize through Display trait.
|
||||
fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
F: Display,
|
||||
{
|
||||
s.serialize_str(&format!("{}", z))
|
||||
}
|
||||
|
||||
/// Same as TermSwitchEntry, but serializes LSN using display serializer
|
||||
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
|
||||
#[derive(Debug, Serialize)]
|
||||
@@ -111,7 +120,12 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid)
|
||||
// FIXME: Currently, the only errors from `GlobalTimelines::get` will be client errors
|
||||
// because the provided timeline isn't there. However, the method can in theory change and
|
||||
// fail from internal errors later. Remove this comment once it the method returns
|
||||
// something other than `anyhow::Result`.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let (inmem, state) = tli.get_state();
|
||||
let flush_lsn = tli.get_flush_lsn();
|
||||
|
||||
@@ -244,7 +258,15 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
local_start_lsn: sk_info.local_start_lsn.0,
|
||||
};
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let tli = GlobalTimelines::get(ttid)
|
||||
// `GlobalTimelines::get` returns an error when it can't find the timeline.
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Couldn't get timeline {} for tenant {}",
|
||||
ttid.timeline_id, ttid.tenant_id
|
||||
)
|
||||
})
|
||||
.map_err(ApiError::NotFound)?;
|
||||
tli.record_safekeeper_info(&proto_sk_info)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
@@ -252,69 +274,6 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
fn parse_kv_str<E: fmt::Display, T: FromStr<Err = E>>(k: &str, v: &str) -> Result<T, ApiError> {
|
||||
v.parse()
|
||||
.map_err(|e| ApiError::BadRequest(anyhow::anyhow!("cannot parse {k}: {e}")))
|
||||
}
|
||||
|
||||
/// Dump debug info about all available safekeeper state.
|
||||
async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
|
||||
let mut dump_all: Option<bool> = None;
|
||||
let mut dump_control_file: Option<bool> = None;
|
||||
let mut dump_memory: Option<bool> = None;
|
||||
let mut dump_disk_content: Option<bool> = None;
|
||||
let mut dump_term_history: Option<bool> = None;
|
||||
let mut tenant_id: Option<TenantId> = None;
|
||||
let mut timeline_id: Option<TimelineId> = None;
|
||||
|
||||
let query = request.uri().query().unwrap_or("");
|
||||
let mut values = url::form_urlencoded::parse(query.as_bytes());
|
||||
|
||||
for (k, v) in &mut values {
|
||||
match k.as_ref() {
|
||||
"dump_all" => dump_all = Some(parse_kv_str(&k, &v)?),
|
||||
"dump_control_file" => dump_control_file = Some(parse_kv_str(&k, &v)?),
|
||||
"dump_memory" => dump_memory = Some(parse_kv_str(&k, &v)?),
|
||||
"dump_disk_content" => dump_disk_content = Some(parse_kv_str(&k, &v)?),
|
||||
"dump_term_history" => dump_term_history = Some(parse_kv_str(&k, &v)?),
|
||||
"tenant_id" => tenant_id = Some(parse_kv_str(&k, &v)?),
|
||||
"timeline_id" => timeline_id = Some(parse_kv_str(&k, &v)?),
|
||||
_ => Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Unknown query parameter: {}",
|
||||
k
|
||||
)))?,
|
||||
}
|
||||
}
|
||||
|
||||
let dump_all = dump_all.unwrap_or(false);
|
||||
let dump_control_file = dump_control_file.unwrap_or(dump_all);
|
||||
let dump_memory = dump_memory.unwrap_or(dump_all);
|
||||
let dump_disk_content = dump_disk_content.unwrap_or(dump_all);
|
||||
let dump_term_history = dump_term_history.unwrap_or(true);
|
||||
|
||||
let args = debug_dump::Args {
|
||||
dump_all,
|
||||
dump_control_file,
|
||||
dump_memory,
|
||||
dump_disk_content,
|
||||
dump_term_history,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
let resp = tokio::task::spawn_blocking(move || {
|
||||
debug_dump::build(args).map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
// TODO: use streaming response
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
|
||||
/// Safekeeper http router.
|
||||
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router();
|
||||
@@ -355,7 +314,6 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
"/v1/record_safekeeper_info/:tenant_id/:timeline_id",
|
||||
record_safekeeper_info,
|
||||
)
|
||||
.get("/v1/debug_dump", dump_debug_handler)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use storage_broker::Uri;
|
||||
@@ -10,7 +9,6 @@ mod auth;
|
||||
pub mod broker;
|
||||
pub mod control_file;
|
||||
pub mod control_file_upgrade;
|
||||
pub mod debug_dump;
|
||||
pub mod handler;
|
||||
pub mod http;
|
||||
pub mod json_ctrl;
|
||||
|
||||
@@ -7,10 +7,10 @@ use crate::safekeeper::AcceptorProposerMessage;
|
||||
use crate::safekeeper::ProposerAcceptorMessage;
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::timeline::Timeline;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::BytesMut;
|
||||
use nix::unistd::gettid;
|
||||
use postgres_backend::CopyStreamHandlerEnd;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::PostgresBackendReader;
|
||||
@@ -68,17 +68,10 @@ impl SafekeeperPostgresHandler {
|
||||
// sends, so this avoids deadlocks.
|
||||
let mut pgb_reader = pgb.split().context("START_WAL_PUSH split")?;
|
||||
let peer_addr = *pgb.get_peer_addr();
|
||||
let network_reader = NetworkReader {
|
||||
ttid: self.ttid,
|
||||
conn_id: self.conn_id,
|
||||
pgb_reader: &mut pgb_reader,
|
||||
peer_addr,
|
||||
acceptor_handle: &mut acceptor_handle,
|
||||
};
|
||||
let res = tokio::select! {
|
||||
// todo: add read|write .context to these errors
|
||||
r = network_reader.run(msg_tx, msg_rx, reply_tx) => r,
|
||||
r = network_write(pgb, reply_rx) => r,
|
||||
r = read_network(self.ttid, &mut pgb_reader, peer_addr, msg_tx, &mut acceptor_handle, msg_rx, reply_tx) => r,
|
||||
r = write_network(pgb, reply_rx) => r,
|
||||
};
|
||||
|
||||
// Join pg backend back.
|
||||
@@ -111,55 +104,6 @@ impl SafekeeperPostgresHandler {
|
||||
}
|
||||
}
|
||||
|
||||
struct NetworkReader<'a> {
|
||||
ttid: TenantTimelineId,
|
||||
conn_id: ConnectionId,
|
||||
pgb_reader: &'a mut PostgresBackendReader,
|
||||
peer_addr: SocketAddr,
|
||||
// WalAcceptor is spawned when we learn server info from walproposer and
|
||||
// create timeline; handle is put here.
|
||||
acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
|
||||
}
|
||||
|
||||
impl<'a> NetworkReader<'a> {
|
||||
async fn run(
|
||||
self,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
// Receive information about server to create timeline, if not yet.
|
||||
let next_msg = read_message(self.pgb_reader).await?;
|
||||
let tli = match next_msg {
|
||||
ProposerAcceptorMessage::Greeting(ref greeting) => {
|
||||
info!(
|
||||
"start handshake with walproposer {} sysid {} timeline {}",
|
||||
self.peer_addr, greeting.system_id, greeting.tli,
|
||||
);
|
||||
let server_info = ServerInfo {
|
||||
pg_version: greeting.pg_version,
|
||||
system_id: greeting.system_id,
|
||||
wal_seg_size: greeting.wal_seg_size,
|
||||
};
|
||||
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID).await?
|
||||
}
|
||||
_ => {
|
||||
return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
|
||||
"unexpected message {next_msg:?} instead of greeting"
|
||||
)))
|
||||
}
|
||||
};
|
||||
|
||||
*self.acceptor_handle = Some(
|
||||
WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id)
|
||||
.context("spawn WalAcceptor thread")?,
|
||||
);
|
||||
|
||||
// Forward all messages to WalAcceptor
|
||||
read_network_loop(self.pgb_reader, msg_tx, next_msg).await
|
||||
}
|
||||
}
|
||||
|
||||
/// Read next message from walproposer.
|
||||
/// TODO: Return Ok(None) on graceful termination.
|
||||
async fn read_message(
|
||||
@@ -170,6 +114,50 @@ async fn read_message(
|
||||
Ok(msg)
|
||||
}
|
||||
|
||||
/// Read messages from socket and pass it to WalAcceptor thread. Returns Ok(())
|
||||
/// if msg_tx closed; it must mean WalAcceptor terminated, joining it should
|
||||
/// tell the error.
|
||||
async fn read_network(
|
||||
ttid: TenantTimelineId,
|
||||
pgb_reader: &mut PostgresBackendReader,
|
||||
peer_addr: SocketAddr,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
// WalAcceptor is spawned when we learn server info from walproposer and
|
||||
// create timeline; handle is put here.
|
||||
acceptor_handle: &mut Option<JoinHandle<anyhow::Result<()>>>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
// Receive information about server to create timeline, if not yet.
|
||||
let next_msg = read_message(pgb_reader).await?;
|
||||
let tli = match next_msg {
|
||||
ProposerAcceptorMessage::Greeting(ref greeting) => {
|
||||
info!(
|
||||
"start handshake with walproposer {} sysid {} timeline {}",
|
||||
peer_addr, greeting.system_id, greeting.tli,
|
||||
);
|
||||
let server_info = ServerInfo {
|
||||
pg_version: greeting.pg_version,
|
||||
system_id: greeting.system_id,
|
||||
wal_seg_size: greeting.wal_seg_size,
|
||||
};
|
||||
GlobalTimelines::create(ttid, server_info, Lsn::INVALID, Lsn::INVALID).await?
|
||||
}
|
||||
_ => {
|
||||
return Err(CopyStreamHandlerEnd::Other(anyhow::anyhow!(
|
||||
"unexpected message {next_msg:?} instead of greeting"
|
||||
)))
|
||||
}
|
||||
};
|
||||
|
||||
*acceptor_handle = Some(
|
||||
WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx).context("spawn WalAcceptor thread")?,
|
||||
);
|
||||
|
||||
// Forward all messages to WalAcceptor
|
||||
read_network_loop(pgb_reader, msg_tx, next_msg).await
|
||||
}
|
||||
|
||||
async fn read_network_loop(
|
||||
pgb_reader: &mut PostgresBackendReader,
|
||||
msg_tx: Sender<ProposerAcceptorMessage>,
|
||||
@@ -186,7 +174,7 @@ async fn read_network_loop(
|
||||
/// Read replies from WalAcceptor and pass them back to socket. Returns Ok(())
|
||||
/// if reply_rx closed; it must mean WalAcceptor terminated, joining it should
|
||||
/// tell the error.
|
||||
async fn network_write(
|
||||
async fn write_network(
|
||||
pgb_writer: &mut PostgresBackend,
|
||||
mut reply_rx: Receiver<AcceptorProposerMessage>,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
@@ -217,7 +205,6 @@ impl WalAcceptor {
|
||||
tli: Arc<Timeline>,
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: ConnectionId,
|
||||
) -> anyhow::Result<JoinHandle<anyhow::Result<()>>> {
|
||||
let thread_name = format!("WAL acceptor {}", tli.ttid);
|
||||
thread::Builder::new()
|
||||
@@ -236,7 +223,7 @@ impl WalAcceptor {
|
||||
let span_ttid = wa.tli.ttid; // satisfy borrow checker
|
||||
runtime.block_on(
|
||||
wa.run()
|
||||
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)),
|
||||
.instrument(info_span!("WAL acceptor", tid = %gettid(), ttid = %span_ttid)),
|
||||
)
|
||||
})
|
||||
.map_err(anyhow::Error::from)
|
||||
|
||||
@@ -191,8 +191,7 @@ pub struct SafeKeeperState {
|
||||
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||
/// of last record streamed to everyone). Persisting it helps skipping
|
||||
/// recovery in walproposer, generally we compute it from peers. In
|
||||
/// walproposer proto called 'truncate_lsn'. Updates are currently drived
|
||||
/// only by walproposer.
|
||||
/// walproposer proto called 'truncate_lsn'.
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
/// LSN of the oldest known checkpoint made by pageserver and successfully
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
@@ -205,7 +204,7 @@ pub struct SafeKeeperState {
|
||||
pub peers: PersistedPeers,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[derive(Debug, Clone)]
|
||||
// In memory safekeeper state. Fields mirror ones in `SafeKeeperState`; values
|
||||
// are not flushed yet.
|
||||
pub struct SafekeeperMemState {
|
||||
@@ -213,7 +212,6 @@ pub struct SafekeeperMemState {
|
||||
pub backup_lsn: Lsn,
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
#[serde(with = "hex")]
|
||||
pub proposer_uuid: PgUuid,
|
||||
}
|
||||
|
||||
@@ -683,7 +681,7 @@ where
|
||||
term: self.state.acceptor_state.term,
|
||||
vote_given: false as u64,
|
||||
flush_lsn: self.flush_lsn(),
|
||||
truncate_lsn: self.inmem.peer_horizon_lsn,
|
||||
truncate_lsn: self.state.peer_horizon_lsn,
|
||||
term_history: self.get_term_history(),
|
||||
timeline_start_lsn: self.state.timeline_start_lsn,
|
||||
};
|
||||
@@ -879,13 +877,7 @@ where
|
||||
if msg.h.commit_lsn != Lsn(0) {
|
||||
self.update_commit_lsn(msg.h.commit_lsn)?;
|
||||
}
|
||||
// Value calculated by walproposer can always lag:
|
||||
// - safekeepers can forget inmem value and send to proposer lower
|
||||
// persisted one on restart;
|
||||
// - if we make safekeepers always send persistent value,
|
||||
// any compute restart would pull it down.
|
||||
// Thus, take max before adopting.
|
||||
self.inmem.peer_horizon_lsn = max(self.inmem.peer_horizon_lsn, msg.h.truncate_lsn);
|
||||
self.inmem.peer_horizon_lsn = msg.h.truncate_lsn;
|
||||
|
||||
// Update truncate and commit LSN in control file.
|
||||
// To avoid negative impact on performance of extra fsync, do it only
|
||||
|
||||
@@ -92,8 +92,7 @@ impl SafekeeperPostgresHandler {
|
||||
start_pos: Lsn,
|
||||
) -> Result<(), CopyStreamHandlerEnd> {
|
||||
let appname = self.appname.clone();
|
||||
let tli =
|
||||
GlobalTimelines::get(self.ttid).map_err(|e| CopyStreamHandlerEnd::Other(e.into()))?;
|
||||
let tli = GlobalTimelines::get(self.ttid)?;
|
||||
|
||||
let state = ReplicaState::new();
|
||||
// This replica_id is used below to check if it's time to stop replication.
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
//! This module implements Timeline lifecycle management and has all necessary code
|
||||
//! to glue together SafeKeeper and all other background services.
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use anyhow::{bail, Result};
|
||||
use parking_lot::{Mutex, MutexGuard};
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use pq_proto::ReplicationFeedback;
|
||||
use serde::Serialize;
|
||||
use std::cmp::{max, min};
|
||||
use std::path::PathBuf;
|
||||
use tokio::{
|
||||
@@ -13,7 +12,6 @@ use tokio::{
|
||||
time::Instant,
|
||||
};
|
||||
use tracing::*;
|
||||
use utils::http::error::ApiError;
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -30,9 +28,9 @@ use crate::send_wal::HotStandbyFeedback;
|
||||
use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION};
|
||||
|
||||
use crate::metrics::FullTimelineInfo;
|
||||
use crate::wal_storage;
|
||||
use crate::wal_storage::Storage as wal_storage_iface;
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::{debug_dump, wal_storage};
|
||||
|
||||
/// Things safekeeper should know about timeline state on peers.
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -82,7 +80,7 @@ impl PeersInfo {
|
||||
}
|
||||
|
||||
/// Replica status update + hot standby feedback
|
||||
#[derive(Debug, Clone, Copy, Serialize)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct ReplicaState {
|
||||
/// last known lsn received by replica
|
||||
pub last_received_lsn: Lsn, // None means we don't know
|
||||
@@ -357,18 +355,6 @@ pub enum TimelineError {
|
||||
UninitialinzedPgVersion(TenantTimelineId),
|
||||
}
|
||||
|
||||
// Convert to HTTP API error.
|
||||
impl From<TimelineError> for ApiError {
|
||||
fn from(te: TimelineError) -> ApiError {
|
||||
match te {
|
||||
TimelineError::NotFound(ttid) => {
|
||||
ApiError::NotFound(anyhow!("timeline {} not found", ttid))
|
||||
}
|
||||
_ => ApiError::InternalServerError(anyhow!("{}", te)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline.
|
||||
/// It also holds SharedState and provides mutually exclusive access to it.
|
||||
pub struct Timeline {
|
||||
@@ -395,7 +381,7 @@ pub struct Timeline {
|
||||
cancellation_rx: watch::Receiver<bool>,
|
||||
|
||||
/// Directory where timeline state is stored.
|
||||
pub timeline_dir: PathBuf,
|
||||
timeline_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
@@ -607,6 +593,38 @@ impl Timeline {
|
||||
self.write_shared_state().wal_backup_attend()
|
||||
}
|
||||
|
||||
/// Returns full timeline info, required for the metrics. If the timeline is
|
||||
/// not active, returns None instead.
|
||||
pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
|
||||
if self.is_cancelled() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let state = self.write_shared_state();
|
||||
if state.active {
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
replicas: state
|
||||
.replicas
|
||||
.iter()
|
||||
.filter_map(|r| r.as_ref())
|
||||
.copied()
|
||||
.collect(),
|
||||
wal_backup_active: state.wal_backup_active,
|
||||
timeline_is_active: state.active,
|
||||
num_computes: state.num_computes,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
mem_state: state.sk.inmem.clone(),
|
||||
persisted_state: state.sk.state.clone(),
|
||||
flush_lsn: state.sk.wal_store.flush_lsn(),
|
||||
wal_storage: state.sk.wal_store.get_metrics(),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns commit_lsn watch channel.
|
||||
pub fn get_commit_lsn_watch_rx(&self) -> watch::Receiver<Lsn> {
|
||||
self.commit_lsn_watch_rx.clone()
|
||||
@@ -771,62 +789,6 @@ impl Timeline {
|
||||
shared_state.last_removed_segno = horizon_segno;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns full timeline info, required for the metrics. If the timeline is
|
||||
/// not active, returns None instead.
|
||||
pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
|
||||
if self.is_cancelled() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let state = self.write_shared_state();
|
||||
if state.active {
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
replicas: state
|
||||
.replicas
|
||||
.iter()
|
||||
.filter_map(|r| r.as_ref())
|
||||
.copied()
|
||||
.collect(),
|
||||
wal_backup_active: state.wal_backup_active,
|
||||
timeline_is_active: state.active,
|
||||
num_computes: state.num_computes,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
mem_state: state.sk.inmem.clone(),
|
||||
persisted_state: state.sk.state.clone(),
|
||||
flush_lsn: state.sk.wal_store.flush_lsn(),
|
||||
wal_storage: state.sk.wal_store.get_metrics(),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns in-memory timeline state to build a full debug dump.
|
||||
pub fn memory_dump(&self) -> debug_dump::Memory {
|
||||
let state = self.write_shared_state();
|
||||
|
||||
let (write_lsn, write_record_lsn, flush_lsn, file_open) =
|
||||
state.sk.wal_store.internal_state();
|
||||
|
||||
debug_dump::Memory {
|
||||
is_cancelled: self.is_cancelled(),
|
||||
peers_info_len: state.peers_info.0.len(),
|
||||
replicas: state.replicas.clone(),
|
||||
wal_backup_active: state.wal_backup_active,
|
||||
active: state.active,
|
||||
num_computes: state.num_computes,
|
||||
last_removed_segno: state.last_removed_segno,
|
||||
epoch_start_lsn: state.sk.epoch_start_lsn,
|
||||
mem_state: state.sk.inmem.clone(),
|
||||
write_lsn,
|
||||
write_record_lsn,
|
||||
flush_lsn,
|
||||
file_open,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Deletes directory and it's contents. Returns false if directory does not exist.
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::timeline::{Timeline, TimelineError};
|
||||
use crate::SafeKeeperConf;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
@@ -50,11 +50,11 @@ impl GlobalTimelinesState {
|
||||
}
|
||||
|
||||
/// Get timeline from the map. Returns error if timeline doesn't exist.
|
||||
fn get(&self, ttid: &TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
|
||||
fn get(&self, ttid: &TenantTimelineId) -> Result<Arc<Timeline>> {
|
||||
self.timelines
|
||||
.get(ttid)
|
||||
.cloned()
|
||||
.ok_or(TimelineError::NotFound(*ttid))
|
||||
.ok_or_else(|| anyhow!(TimelineError::NotFound(*ttid)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -159,16 +159,6 @@ impl GlobalTimelines {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Get the number of timelines in the map.
|
||||
pub fn timelines_count() -> usize {
|
||||
TIMELINES_STATE.lock().unwrap().timelines.len()
|
||||
}
|
||||
|
||||
/// Get the global safekeeper config.
|
||||
pub fn get_global_config() -> SafeKeeperConf {
|
||||
TIMELINES_STATE.lock().unwrap().get_conf().clone()
|
||||
}
|
||||
|
||||
/// Create a new timeline with the given id. If the timeline already exists, returns
|
||||
/// an existing timeline.
|
||||
pub async fn create(
|
||||
@@ -236,17 +226,17 @@ impl GlobalTimelines {
|
||||
/// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
|
||||
/// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
|
||||
/// i.e. loaded in memory and not cancelled.
|
||||
pub fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
|
||||
pub fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
|
||||
let res = TIMELINES_STATE.lock().unwrap().get(&ttid);
|
||||
|
||||
match res {
|
||||
Ok(tli) => {
|
||||
if tli.is_cancelled() {
|
||||
return Err(TimelineError::Cancelled(ttid));
|
||||
anyhow::bail!(TimelineError::Cancelled(ttid));
|
||||
}
|
||||
Ok(tli)
|
||||
}
|
||||
_ => res,
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
//! receive WAL from wal_proposer and send it to WAL receivers
|
||||
//!
|
||||
use anyhow::{Context, Result};
|
||||
use nix::unistd::gettid;
|
||||
use postgres_backend::QueryError;
|
||||
use std::{future, thread};
|
||||
use tokio::net::TcpStream;
|
||||
@@ -26,19 +27,17 @@ pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) {
|
||||
// Tokio's from_std won't do this for us, per its comment.
|
||||
pg_listener.set_nonblocking(true)?;
|
||||
let listener = tokio::net::TcpListener::from_std(pg_listener)?;
|
||||
let mut connection_count: ConnectionCount = 0;
|
||||
|
||||
loop {
|
||||
match listener.accept().await {
|
||||
Ok((socket, peer_addr)) => {
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let conf = conf.clone();
|
||||
let conn_id = issue_connection_id(&mut connection_count);
|
||||
|
||||
let _ = thread::Builder::new()
|
||||
.name("WAL service thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = handle_socket(socket, conf, conn_id) {
|
||||
if let Err(err) = handle_socket(socket, conf) {
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
})
|
||||
@@ -55,12 +54,8 @@ pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) {
|
||||
|
||||
/// This is run by `thread_main` above, inside a background thread.
|
||||
///
|
||||
fn handle_socket(
|
||||
socket: TcpStream,
|
||||
conf: SafeKeeperConf,
|
||||
conn_id: ConnectionId,
|
||||
) -> Result<(), QueryError> {
|
||||
let _enter = info_span!("", cid = %conn_id).entered();
|
||||
fn handle_socket(socket: TcpStream, conf: SafeKeeperConf) -> Result<(), QueryError> {
|
||||
let _enter = info_span!("", tid = %gettid()).entered();
|
||||
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
@@ -73,7 +68,7 @@ fn handle_socket(
|
||||
None => AuthType::Trust,
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let mut conn_handler = SafekeeperPostgresHandler::new(conf, conn_id);
|
||||
let mut conn_handler = SafekeeperPostgresHandler::new(conf);
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
@@ -84,12 +79,3 @@ fn handle_socket(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Unique WAL service connection ids are logged in spans for observability.
|
||||
pub type ConnectionId = u32;
|
||||
pub type ConnectionCount = u32;
|
||||
|
||||
pub fn issue_connection_id(count: &mut ConnectionCount) -> ConnectionId {
|
||||
*count = count.wrapping_add(1);
|
||||
*count
|
||||
}
|
||||
|
||||
@@ -165,16 +165,6 @@ impl PhysicalStorage {
|
||||
})
|
||||
}
|
||||
|
||||
/// Get all known state of the storage.
|
||||
pub fn internal_state(&self) -> (Lsn, Lsn, Lsn, bool) {
|
||||
(
|
||||
self.write_lsn,
|
||||
self.write_record_lsn,
|
||||
self.flush_record_lsn,
|
||||
self.file.is_some(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Call fdatasync if config requires so.
|
||||
fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
|
||||
if !self.conf.no_sync {
|
||||
|
||||
@@ -354,26 +354,29 @@ class NeonBenchmarker:
|
||||
"""
|
||||
Fetch the "cumulative # of bytes written" metric from the pageserver
|
||||
"""
|
||||
return self.get_int_counter_value(
|
||||
pageserver, "libmetrics_disk_io_bytes_total", {"io_operation": "write"}
|
||||
)
|
||||
metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}'
|
||||
return self.get_int_counter_value(pageserver, metric_name)
|
||||
|
||||
def get_peak_mem(self, pageserver: NeonPageserver) -> int:
|
||||
"""
|
||||
Fetch the "maxrss" metric from the pageserver
|
||||
"""
|
||||
return self.get_int_counter_value(pageserver, "libmetrics_maxrss_kb")
|
||||
metric_name = r"libmetrics_maxrss_kb"
|
||||
return self.get_int_counter_value(pageserver, metric_name)
|
||||
|
||||
def get_int_counter_value(
|
||||
self,
|
||||
pageserver: NeonPageserver,
|
||||
metric_name: str,
|
||||
label_filters: Optional[Dict[str, str]] = None,
|
||||
) -> int:
|
||||
def get_int_counter_value(self, pageserver: NeonPageserver, metric_name: str) -> int:
|
||||
"""Fetch the value of given int counter from pageserver metrics."""
|
||||
# TODO: If we start to collect more of the prometheus metrics in the
|
||||
# performance test suite like this, we should refactor this to load and
|
||||
# parse all the metrics into a more convenient structure in one go.
|
||||
#
|
||||
# The metric should be an integer, as it's a number of bytes. But in general
|
||||
# all prometheus metrics are floats. So to be pedantic, read it as a float
|
||||
# and round to integer.
|
||||
all_metrics = pageserver.http_client().get_metrics()
|
||||
sample = all_metrics.query_one(metric_name, label_filters)
|
||||
return int(round(sample.value))
|
||||
matches = re.search(rf"^{metric_name} (\S+)$", all_metrics, re.MULTILINE)
|
||||
assert matches, f"metric {metric_name} not found"
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
def get_timeline_size(
|
||||
self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId
|
||||
|
||||
@@ -144,12 +144,12 @@ class NeonCompare(PgCompare):
|
||||
"size", timeline_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
|
||||
)
|
||||
|
||||
metric_filters = {"tenant_id": str(self.tenant), "timeline_id": str(self.timeline)}
|
||||
params = f'{{tenant_id="{self.tenant}",timeline_id="{self.timeline}"}}'
|
||||
total_files = self.zenbenchmark.get_int_counter_value(
|
||||
self.env.pageserver, "pageserver_created_persistent_files_total", metric_filters
|
||||
self.env.pageserver, "pageserver_created_persistent_files_total" + params
|
||||
)
|
||||
total_bytes = self.zenbenchmark.get_int_counter_value(
|
||||
self.env.pageserver, "pageserver_written_persistent_bytes_total", metric_filters
|
||||
self.env.pageserver, "pageserver_written_persistent_bytes_total" + params
|
||||
)
|
||||
self.zenbenchmark.record(
|
||||
"data_uploaded", total_bytes / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
|
||||
|
||||
@@ -13,8 +13,7 @@ class Metrics:
|
||||
self.metrics = defaultdict(list)
|
||||
self.name = name
|
||||
|
||||
def query_all(self, name: str, filter: Optional[Dict[str, str]] = None) -> List[Sample]:
|
||||
filter = filter or {}
|
||||
def query_all(self, name: str, filter: Dict[str, str]) -> List[Sample]:
|
||||
res = []
|
||||
for sample in self.metrics[name]:
|
||||
try:
|
||||
|
||||
@@ -14,7 +14,6 @@ import tempfile
|
||||
import textwrap
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict
|
||||
from contextlib import closing, contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Flag, auto
|
||||
@@ -29,6 +28,7 @@ import asyncpg
|
||||
import backoff # type: ignore
|
||||
import boto3
|
||||
import jwt
|
||||
import prometheus_client
|
||||
import psycopg2
|
||||
import pytest
|
||||
import requests
|
||||
@@ -36,7 +36,7 @@ from _pytest.config import Config
|
||||
from _pytest.config.argparsing import Parser
|
||||
from _pytest.fixtures import FixtureRequest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import Metrics, parse_metrics
|
||||
from fixtures.metrics import parse_metrics
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import (
|
||||
ATTACHMENT_NAME_REGEX,
|
||||
@@ -45,6 +45,7 @@ from fixtures.utils import (
|
||||
get_self_dir,
|
||||
subprocess_capture,
|
||||
)
|
||||
from prometheus_client.parser import text_string_to_metric_families
|
||||
|
||||
# Type-related stuff
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
@@ -1435,27 +1436,22 @@ class PageserverHttpClient(requests.Session):
|
||||
assert completed["successful_download_count"] > 0
|
||||
return completed
|
||||
|
||||
def get_metrics_str(self) -> str:
|
||||
"""You probably want to use get_metrics() instead."""
|
||||
def get_metrics(self) -> str:
|
||||
res = self.get(f"http://localhost:{self.port}/metrics")
|
||||
self.verbose_error(res)
|
||||
return res.text
|
||||
|
||||
def get_metrics(self) -> Metrics:
|
||||
res = self.get_metrics_str()
|
||||
return parse_metrics(res)
|
||||
|
||||
def get_timeline_metric(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str
|
||||
) -> float:
|
||||
metrics = self.get_metrics()
|
||||
return metrics.query_one(
|
||||
metric_name,
|
||||
filter={
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
},
|
||||
).value
|
||||
def get_timeline_metric(self, tenant_id: TenantId, timeline_id: TimelineId, metric_name: str):
|
||||
raw = self.get_metrics()
|
||||
family: List[prometheus_client.Metric] = list(text_string_to_metric_families(raw))
|
||||
[metric] = [m for m in family if m.name == metric_name]
|
||||
[sample] = [
|
||||
s
|
||||
for s in metric.samples
|
||||
if s.labels["tenant_id"] == str(tenant_id)
|
||||
and s.labels["timeline_id"] == str(timeline_id)
|
||||
]
|
||||
return sample.value
|
||||
|
||||
def get_remote_timeline_client_metric(
|
||||
self,
|
||||
@@ -1465,7 +1461,7 @@ class PageserverHttpClient(requests.Session):
|
||||
file_kind: str,
|
||||
op_kind: str,
|
||||
) -> Optional[float]:
|
||||
metrics = self.get_metrics()
|
||||
metrics = parse_metrics(self.get_metrics(), "pageserver")
|
||||
matches = metrics.query_all(
|
||||
name=metric_name,
|
||||
filter={
|
||||
@@ -1484,16 +1480,14 @@ class PageserverHttpClient(requests.Session):
|
||||
assert len(matches) < 2, "above filter should uniquely identify metric"
|
||||
return value
|
||||
|
||||
def get_metric_value(
|
||||
self, name: str, filter: Optional[Dict[str, str]] = None
|
||||
) -> Optional[float]:
|
||||
def get_metric_value(self, name: str) -> Optional[str]:
|
||||
metrics = self.get_metrics()
|
||||
results = metrics.query_all(name, filter=filter)
|
||||
if not results:
|
||||
relevant = [line for line in metrics.splitlines() if line.startswith(name)]
|
||||
if len(relevant) == 0:
|
||||
log.info(f'could not find metric "{name}"')
|
||||
return None
|
||||
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
|
||||
return results[0].value
|
||||
assert len(relevant) == 1
|
||||
return relevant[0].lstrip(name).strip()
|
||||
|
||||
def layer_map_info(
|
||||
self,
|
||||
@@ -1522,11 +1516,6 @@ class PageserverHttpClient(requests.Session):
|
||||
|
||||
assert res.status_code == 200
|
||||
|
||||
def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
info = self.layer_map_info(tenant_id, timeline_id)
|
||||
for layer in info.historic_layers:
|
||||
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TenantConfig:
|
||||
@@ -1562,14 +1551,6 @@ class LayerMapInfo:
|
||||
|
||||
return info
|
||||
|
||||
def kind_count(self) -> Dict[str, int]:
|
||||
counts: Dict[str, int] = defaultdict(int)
|
||||
for inmem_layer in self.in_memory_layers:
|
||||
counts[inmem_layer.kind] += 1
|
||||
for hist_layer in self.historic_layers:
|
||||
counts[hist_layer.kind] += 1
|
||||
return counts
|
||||
|
||||
|
||||
@dataclass
|
||||
class InMemoryLayerInfo:
|
||||
@@ -1586,7 +1567,7 @@ class InMemoryLayerInfo:
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@dataclass
|
||||
class HistoricLayerInfo:
|
||||
kind: str
|
||||
layer_file_name: str
|
||||
@@ -3009,13 +2990,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
def check_status(self):
|
||||
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
|
||||
|
||||
def debug_dump(self, params: Dict[str, str] = {}) -> Dict[str, Any]:
|
||||
res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
|
||||
res.raise_for_status()
|
||||
res_json = res.json()
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def timeline_create(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId, pg_version: int, commit_lsn: Lsn
|
||||
):
|
||||
@@ -3544,23 +3518,3 @@ def wait_for_sk_commit_lsn_to_reach_remote_storage(
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
|
||||
return lsn
|
||||
|
||||
|
||||
def wait_for_upload_queue_empty(
|
||||
pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
ps_http = pageserver.http_client()
|
||||
while True:
|
||||
all_metrics = ps_http.get_metrics()
|
||||
tl = all_metrics.query_all(
|
||||
"pageserver_remote_timeline_client_calls_unfinished",
|
||||
{
|
||||
"tenant_id": str(tenant_id),
|
||||
"timeline_id": str(timeline_id),
|
||||
},
|
||||
)
|
||||
assert len(tl) > 0
|
||||
log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}")
|
||||
if all(m.value == 0 for m in tl):
|
||||
return
|
||||
time.sleep(0.2)
|
||||
|
||||
@@ -1,58 +0,0 @@
|
||||
from contextlib import closing
|
||||
|
||||
import pytest
|
||||
from fixtures.compare_fixtures import NeonCompare
|
||||
from fixtures.neon_fixtures import wait_for_last_flush_lsn
|
||||
|
||||
|
||||
#
|
||||
# Test compaction and image layer creation performance.
|
||||
#
|
||||
# This creates a few tables and runs some simple INSERTs and UPDATEs on them to generate
|
||||
# some delta layers. Then it runs manual compaction, measuring how long it takes.
|
||||
#
|
||||
@pytest.mark.timeout(1000)
|
||||
def test_compaction(neon_compare: NeonCompare):
|
||||
env = neon_compare.env
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# Disable background GC and compaction, we'll run compaction manually.
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
# Make checkpoint distance somewhat smaller than default, to create
|
||||
# more delta layers quicker, to trigger compaction.
|
||||
"checkpoint_distance": "25000000", # 25 MB
|
||||
# Force image layer creation when we run compaction.
|
||||
"image_creation_threshold": "1",
|
||||
}
|
||||
)
|
||||
neon_compare.tenant = tenant_id
|
||||
neon_compare.timeline = timeline_id
|
||||
|
||||
# Create some tables, and run a bunch of INSERTs and UPDATes on them,
|
||||
# to generate WAL and layers
|
||||
pg = env.postgres.create_start(
|
||||
"main", tenant_id=tenant_id, config_lines=["shared_buffers=512MB"]
|
||||
)
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
for i in range(100):
|
||||
cur.execute(f"create table tbl{i} (i int, j int);")
|
||||
cur.execute(f"insert into tbl{i} values (generate_series(1, 1000), 0);")
|
||||
for j in range(100):
|
||||
cur.execute(f"update tbl{i} set j = {j};")
|
||||
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
|
||||
# First compaction generates L1 layers
|
||||
with neon_compare.zenbenchmark.record_duration("compaction"):
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
|
||||
# And second compaction triggers image layer creation
|
||||
with neon_compare.zenbenchmark.record_duration("image_creation"):
|
||||
pageserver_http.timeline_compact(tenant_id, timeline_id)
|
||||
|
||||
neon_compare.report_size()
|
||||
@@ -8,7 +8,7 @@ def test_build_info_metric(neon_env_builder: NeonEnvBuilder, link_proxy: NeonPro
|
||||
|
||||
parsed_metrics = {}
|
||||
|
||||
parsed_metrics["pageserver"] = parse_metrics(env.pageserver.http_client().get_metrics_str())
|
||||
parsed_metrics["pageserver"] = parse_metrics(env.pageserver.http_client().get_metrics())
|
||||
parsed_metrics["safekeeper"] = parse_metrics(env.safekeepers[0].http_client().get_metrics_str())
|
||||
parsed_metrics["proxy"] = parse_metrics(link_proxy.get_metrics())
|
||||
|
||||
|
||||
@@ -220,12 +220,9 @@ def prepare_snapshot(
|
||||
for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"):
|
||||
shutil.rmtree(tenant)
|
||||
|
||||
# Remove wal-redo temp directory if it exists. Newer pageserver versions don't create
|
||||
# them anymore, but old versions did.
|
||||
# Remove wal-redo temp directory
|
||||
for tenant in (repo_dir / "tenants").glob("*"):
|
||||
wal_redo_dir = tenant / "wal-redo-datadir.___temp"
|
||||
if wal_redo_dir.exists() and wal_redo_dir.is_dir():
|
||||
shutil.rmtree(wal_redo_dir)
|
||||
shutil.rmtree(tenant / "wal-redo-datadir.___temp")
|
||||
|
||||
# Update paths and ports in config files
|
||||
pageserver_toml = repo_dir / "pageserver.toml"
|
||||
|
||||
@@ -4,6 +4,7 @@ import random
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.metrics import parse_metrics
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnv,
|
||||
NeonEnvBuilder,
|
||||
@@ -133,7 +134,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
|
||||
|
||||
# Helper function that gets the number of given kind of remote ops from the metrics
|
||||
def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
|
||||
ps_metrics = env.pageserver.http_client().get_metrics()
|
||||
ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
|
||||
total = 0.0
|
||||
for sample in ps_metrics.query_all(
|
||||
name="pageserver_remote_operation_seconds_count",
|
||||
|
||||
@@ -1,13 +1,8 @@
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
RemoteStorageKind,
|
||||
wait_for_last_flush_lsn,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_sk_commit_lsn_to_reach_remote_storage,
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
@@ -143,160 +138,3 @@ def test_basic_eviction(
|
||||
assert (
|
||||
redownloaded_layer_map_info == initial_layer_map_info
|
||||
), "Should have the same layer map after redownloading the evicted layers"
|
||||
|
||||
|
||||
def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
|
||||
test_name="test_gc_of_remote_layers",
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenant_config = {
|
||||
"pitr_interval": "1s", # set to non-zero, so GC actually does something
|
||||
"gc_period": "0s", # we want to control when GC runs
|
||||
"compaction_period": "0s", # we want to control when compaction runs
|
||||
"checkpoint_timeout": "24h", # something we won't reach
|
||||
"checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually
|
||||
"compaction_threshold": "3",
|
||||
# "image_creation_threshold": set at runtime
|
||||
"compaction_target_size": f"{128 * (1024**2)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
|
||||
}
|
||||
|
||||
def tenant_update_config(changes):
|
||||
tenant_config.update(changes)
|
||||
env.neon_cli.config_tenant(tenant_id, tenant_config)
|
||||
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config)
|
||||
log.info("tenant id is %s", tenant_id)
|
||||
env.initial_tenant = tenant_id # update_and_gc relies on this
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
log.info("fill with data, creating delta & image layers, some of which are GC'able after")
|
||||
# no particular reason to create the layers like this, but we are sure
|
||||
# not to hit the image_creation_threshold here.
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
|
||||
cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# Create delta layers, then turn them into image layers.
|
||||
# Do it multiple times so that there's something to GC.
|
||||
for k in range(0, 2):
|
||||
# produce delta layers => disable image layer creation by setting high threshold
|
||||
tenant_update_config({"image_creation_threshold": "100"})
|
||||
for i in range(0, 2):
|
||||
for j in range(0, 3):
|
||||
# create a minimal amount of "delta difficulty" for this table
|
||||
with pg.cursor() as cur:
|
||||
cur.execute("update a set some_value = -some_value + %s", (j,))
|
||||
|
||||
with pg.cursor() as cur:
|
||||
# vacuuming should aid to reuse keys, though it's not really important
|
||||
# with image_creation_threshold=1 which we will use on the last compaction
|
||||
cur.execute("vacuum")
|
||||
|
||||
wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
|
||||
|
||||
if i == 1 and j == 2 and k == 1:
|
||||
# last iteration; stop before checkpoint to avoid leaving an inmemory layer
|
||||
pg.stop_and_destroy()
|
||||
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id)
|
||||
|
||||
# images should not yet be created, because threshold is too high,
|
||||
# but these will be reshuffled to L1 layers
ps_http.timeline_compact(tenant_id, timeline_id)

for _ in range(0, 20):
# loop in case flushing is still in progress
layers = ps_http.layer_map_info(tenant_id, timeline_id)
if not layers.in_memory_layers:
break
time.sleep(0.2)

# now that we've grown some delta layers, turn them into image layers
tenant_update_config({"image_creation_threshold": "1"})
ps_http.timeline_compact(tenant_id, timeline_id)

# wait for all uploads to finish
wait_for_sk_commit_lsn_to_reach_remote_storage(
tenant_id, timeline_id, env.safekeepers, env.pageserver
)

# shutdown safekeepers to avoid on-demand downloads from walreceiver
for sk in env.safekeepers:
sk.stop()

ps_http.timeline_checkpoint(tenant_id, timeline_id)

log.info("ensure the code above produced image and delta layers")
pre_evict_info = ps_http.layer_map_info(tenant_id, timeline_id)
log.info("layer map dump: %s", pre_evict_info)
by_kind = pre_evict_info.kind_count()
log.info("by kind: %s", by_kind)
assert by_kind["Image"] > 0
assert by_kind["Delta"] > 0
assert by_kind["InMemory"] == 0
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
log.info("resident layers count before eviction: %s", len(resident_layers))

log.info("evict all layers")
ps_http.evict_all_layers(tenant_id, timeline_id)

def ensure_resident_and_remote_size_metrics():
log.info("ensure that all the layers are gone")
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
# we have disabled all background loops, so, this should hold
assert len(resident_layers) == 0

info = ps_http.layer_map_info(tenant_id, timeline_id)
log.info("layer map dump: %s", info)

log.info("ensure that resident_physical_size metric is zero")
resident_physical_size_metric = ps_http.get_timeline_metric(
tenant_id, timeline_id, "pageserver_resident_physical_size"
)
assert resident_physical_size_metric == 0
log.info("ensure that resident_physical_size metric corresponds to layer map dump")
assert resident_physical_size_metric == sum(
[layer.layer_file_size or 0 for layer in info.historic_layers if not layer.remote]
)

log.info("ensure that remote_physical_size metric matches layer map")
remote_physical_size_metric = ps_http.get_timeline_metric(
tenant_id, timeline_id, "pageserver_remote_physical_size"
)
log.info("ensure that remote_physical_size metric corresponds to layer map dump")
assert remote_physical_size_metric == sum(
layer.layer_file_size or 0 for layer in info.historic_layers if layer.remote
)

log.info("before runnning GC, ensure that remote_physical size is zero")
ensure_resident_and_remote_size_metrics()

log.info("run GC")
time.sleep(2) # let pitr_interval + 1 second pass
ps_http.timeline_gc(tenant_id, timeline_id, 0)
time.sleep(1)
assert not env.pageserver.log_contains("Nothing to GC")

log.info("ensure GC deleted some layers, otherwise this test is pointless")
post_gc_info = ps_http.layer_map_info(tenant_id, timeline_id)
log.info("layer map dump: %s", post_gc_info)
log.info("by kind: %s", post_gc_info.kind_count())
pre_evict_layers = set([layer.layer_file_name for layer in pre_evict_info.historic_layers])
post_gc_layers = set([layer.layer_file_name for layer in post_gc_info.historic_layers])
assert post_gc_layers.issubset(pre_evict_layers)
assert len(post_gc_layers) < len(pre_evict_layers)

log.info("update_gc_info might download some layers. Evict them again.")
ps_http.evict_all_layers(tenant_id, timeline_id)

log.info("after running GC, ensure that resident size is still zero")
ensure_resident_and_remote_size_metrics()

@@ -9,6 +9,7 @@ from typing import Iterator

import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.neon_fixtures import (
PSQL,
NeonEnvBuilder,
@@ -142,7 +143,7 @@ def test_metric_collection(

# Helper function that gets the number of given kind of remote ops from the metrics
def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
ps_metrics = env.pageserver.http_client().get_metrics()
ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
total = 0.0
for sample in ps_metrics.query_all(
name="pageserver_remote_operation_seconds_count",

@@ -4,14 +4,13 @@
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, DefaultDict, Dict, Tuple
from typing import Any, DefaultDict, Dict

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverApiException,
PageserverHttpClient,
RemoteStorageKind,
assert_tenant_status,
available_remote_storages,
@@ -26,16 +25,9 @@ from fixtures.types import Lsn
from fixtures.utils import query_scalar

def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id):
def get_num_downloaded_layers(client, tenant_id, timeline_id):
value = client.get_metric_value(
"pageserver_remote_operation_seconds_count",
{
"file_kind": "layer",
"op_kind": "download",
"status": "success",
"tenant_id": tenant_id,
"timeline_id": timeline_id,
},
f'pageserver_remote_operation_seconds_count{{file_kind="layer",op_kind="download",status="success",tenant_id="{tenant_id}",timeline_id="{timeline_id}"}}'
)
if value is None:
return 0
@@ -497,17 +489,6 @@ def test_compaction_downloads_on_demand_without_image_creation(
# pitr_interval and gc_horizon are not interesting because we dont run gc
}

def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[int, int]:
m = pageserver_http.get_metrics()
# these are global counters
total_bytes = m.query_one("pageserver_remote_ondemand_downloaded_bytes_total").value
assert (
total_bytes < 2**53 and total_bytes.is_integer()
), "bytes should still be safe integer-in-f64"
count = m.query_one("pageserver_remote_ondemand_downloaded_layers_total").value
assert count < 2**53 and count.is_integer(), "count should still be safe integer-in-f64"
return (int(total_bytes), int(count))

# Override defaults, to create more layers
tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
env.initial_tenant = tenant_id
@@ -528,14 +509,10 @@ def test_compaction_downloads_on_demand_without_image_creation(

layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
assert len(layers.historic_layers) == 1 + 2, "should have initdb layer and 2 deltas"

layer_sizes = 0
assert len(layers.historic_layers) == 1 + 2, "should have inidb layer and 2 deltas"

for layer in layers.historic_layers:
log.info(f"pre-compact: {layer}")
assert layer.layer_file_size is not None, "we must know layer file sizes"
layer_sizes += layer.layer_file_size
pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)

env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"})
@@ -546,12 +523,6 @@ def test_compaction_downloads_on_demand_without_image_creation(
log.info(f"post compact: {layer}")
assert len(layers.historic_layers) == 1, "should have compacted to single layer"

post_compact = downloaded_bytes_and_count(pageserver_http)

# use gte to allow pageserver to do other random stuff; this test could be run on a shared pageserver
assert post_compact[0] >= layer_sizes
assert post_compact[1] >= 3, "should had downloaded the three layers"

@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
def test_compaction_downloads_on_demand_with_image_creation(

@@ -45,6 +45,14 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
env.pageserver.stop()
env.pageserver.start()

# Stopping the pageserver breaks the connection from the postgres backend to
# the page server, and causes the next query on the connection to fail. Start a new
# postgres connection too, to avoid that error. (Ideally, the compute node would
# handle that and retry internally, without propagating the error to the user, but
# currently it doesn't...)
pg_conn = pg.connect()
cur = pg_conn.cursor()

cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)

@@ -62,6 +70,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
assert tenant_status["state"] == "Loading"

# Try to read. This waits until the loading finishes, and then return normally.
pg_conn = pg.connect()
cur = pg_conn.cursor()
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)

@@ -122,6 +132,14 @@ def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
env.pageserver.stop(immediate=True)
env.pageserver.start()

# Stopping the pageserver breaks the connection from the postgres backend to
# the page server, and causes the next query on the connection to fail. Start a new
# postgres connection too, to avoid that error. (Ideally, the compute node would
# handle that and retry internally, without propagating the error to the user, but
# currently it doesn't...)
pg_conn = pg.connect()
cur = pg_conn.cursor()

# Check that all the updates are visible
num_updates = pg.safe_psql("SELECT sum(updates) FROM foo")[0][0]
assert num_updates == i * 100000

@@ -1,35 +0,0 @@
# This test spawns pgbench in a thread in the background and concurrently restarts pageserver,
# checking how client is able to transparently restore connection to pageserver
#
import threading
import time

from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres

# Test restarting page server, while safekeeper and compute node keep
# running.
def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgBin):
env = neon_simple_env
env.neon_cli.create_branch("test_pageserver_restarts")
pg = env.postgres.create_start("test_pageserver_restarts")
n_restarts = 10
scale = 10

def run_pgbench(pg: Postgres):
connstr = pg.connstr()
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", f"-T{n_restarts}", connstr])

thread = threading.Thread(target=run_pgbench, args=(pg,), daemon=True)
thread.start()

for i in range(n_restarts):
# Stop the pageserver gracefully and restart it.
time.sleep(1)
env.pageserver.stop()
env.pageserver.start()

thread.join()
@@ -233,8 +233,8 @@ def test_remote_storage_upload_queue_retries(
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
# don't create image layers, that causes just noise
"image_creation_threshold": "10000",
}
)

@@ -301,7 +301,7 @@ def test_remote_storage_upload_queue_retries(

# Create more churn to generate all upload ops.
# The checkpoint / compact / gc ops will block because they call remote_client.wait_completion().
# So, run this in a different thread.
# So, run this in a differen thread.
churn_thread_result = [False]

def churn_while_failpoints_active(result):
@@ -395,8 +395,8 @@ def test_remote_timeline_client_calls_started_metric(
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
# don't create image layers, that causes just noise
"image_creation_threshold": "10000",
}
)

@@ -618,9 +618,6 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
# checkpoint operations. Hence, checkpoint is allowed to fail now.
log.info("sending delete request")
checkpoint_allowed_to_fail.set()
env.pageserver.allowed_errors.append(
".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
)
client.timeline_delete(tenant_id, timeline_id)

assert not timeline_path.exists()

@@ -129,7 +129,6 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"checkpoint_distance": "15000",
"gc_period": "80sec",
"compaction_period": "80sec",
"image_creation_threshold": "2",
}
env.neon_cli.config_tenant(
tenant_id=tenant,
@@ -150,7 +149,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"compaction_threshold": 10,
"gc_horizon": 67108864,
"gc_period": 80,
"image_creation_threshold": 2,
"image_creation_threshold": 3,
"pitr_interval": 604800,
}.items()
), f"Unexpected res: {res}"
@@ -175,7 +174,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
assert updated_effective_config["compaction_target_size"] == 1048576
assert updated_effective_config["compaction_threshold"] == 10
assert updated_effective_config["gc_horizon"] == 67108864
assert updated_effective_config["image_creation_threshold"] == 2
assert updated_effective_config["image_creation_threshold"] == 3
assert updated_effective_config["pitr_interval"] == "7days"

# restart the pageserver and ensure that the config is still correct
@@ -196,7 +195,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}"""
"compaction_threshold": 10,
"gc_horizon": 67108864,
"gc_period": 80,
"image_creation_threshold": 2,
"image_creation_threshold": 3,
"pitr_interval": 604800,
}.items()
), f"Unexpected res: {res}"

@@ -6,6 +6,7 @@ from threading import Thread
import asyncpg
import pytest
from fixtures.log_helper import log
from fixtures.metrics import parse_metrics
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
@@ -78,7 +79,7 @@ def test_tenant_reattach(
".*failed to perform remote task UploadMetadata.*, will retry.*"
)

ps_metrics = pageserver_http.get_metrics()
ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
tenant_metric_filter = {
"tenant_id": str(tenant_id),
"timeline_id": str(timeline_id),
@@ -92,7 +93,7 @@ def test_tenant_reattach(

time.sleep(1) # for metrics propagation

ps_metrics = pageserver_http.get_metrics()
ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver")
pageserver_last_record_lsn = int(
ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
)

@@ -50,22 +50,16 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder):
wait_until(10, 0.2, lambda: assert_active(tenant_id))

# Assert that all tasks finish quickly after tenant is detached
task_starts = client.get_metric_value("pageserver_tenant_task_events_total", {"event": "start"})
task_starts = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
assert task_starts is not None
assert int(task_starts) > 0
client.tenant_detach(tenant)
client.tenant_detach(env.initial_tenant)

def assert_tasks_finish():
tasks_started = client.get_metric_value(
"pageserver_tenant_task_events_total", {"event": "start"}
)
tasks_ended = client.get_metric_value(
"pageserver_tenant_task_events_total", {"event": "stop"}
)
tasks_panicked = client.get_metric_value(
"pageserver_tenant_task_events_total", {"event": "panic"}
)
tasks_started = client.get_metric_value('pageserver_tenant_task_events{event="start"}')
tasks_ended = client.get_metric_value('pageserver_tenant_task_events{event="stop"}')
tasks_panicked = client.get_metric_value('pageserver_tenant_task_events{event="panic"}')
log.info(f"started {tasks_started}, ended {tasks_ended}, panicked {tasks_panicked}")
assert tasks_started == tasks_ended
assert tasks_panicked is None or int(tasks_panicked) == 0

@@ -107,7 +107,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
assert cur.fetchone() == (5000050000,)

collected_metrics = {
"pageserver": env.pageserver.http_client().get_metrics_str(),
"pageserver": env.pageserver.http_client().get_metrics(),
}
for sk in env.safekeepers:
collected_metrics[f"safekeeper{sk.id}"] = sk.http_client().get_metrics_str()
@@ -207,7 +207,7 @@ def test_pageserver_metrics_removed_after_detach(
assert cur.fetchone() == (5000050000,)

def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]:
ps_metrics = env.pageserver.http_client().get_metrics()
ps_metrics = parse_metrics(env.pageserver.http_client().get_metrics(), "pageserver")
samples = []
for metric_name in ps_metrics.metrics:
for sample in ps_metrics.query_all(
@@ -307,7 +307,7 @@ def test_pageserver_with_empty_tenants(

time.sleep(1) # to allow metrics propagation

ps_metrics = client.get_metrics()
ps_metrics = parse_metrics(client.get_metrics(), "pageserver")
broken_tenants_metric_filter = {
"tenant_id": str(tenant_without_timelines_dir),
"state": "broken",

@@ -1,11 +1,11 @@
import math
import queue
import random
import re
import threading
import time
from contextlib import closing
from pathlib import Path
from typing import Optional

import psycopg2.errors
import psycopg2.extras
@@ -19,11 +19,9 @@ from fixtures.neon_fixtures import (
PgBin,
PortDistributor,
Postgres,
RemoteStorageKind,
VanillaPostgres,
assert_tenant_status,
wait_for_last_flush_lsn,
wait_for_upload_queue_empty,
wait_until,
)
from fixtures.types import TenantId, TimelineId
@@ -304,18 +302,8 @@ def test_timeline_initial_logical_size_calculation_cancellation(
# message emitted by the code behind failpoint "timeline-calculate-logical-size-check-dir-exists"

@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_timeline_physical_size_init(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):

if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)

env = neon_env_builder.init_start()

def test_timeline_physical_size_init(neon_simple_env: NeonEnv):
env = neon_simple_env
new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init")
pg = env.postgres.create_start("test_timeline_physical_size_init")

@@ -343,22 +331,12 @@ def test_timeline_physical_size_init(
)

assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
get_physical_size_values(env, env.initial_tenant, new_timeline_id)
)

@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_timeline_physical_size_post_checkpoint(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)

env = neon_env_builder.init_start()

def test_timeline_physical_size_post_checkpoint(neon_simple_env: NeonEnv):
env = neon_simple_env
pageserver_http = env.pageserver.http_client()
new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint")
pg = env.postgres.create_start("test_timeline_physical_size_post_checkpoint")
@@ -376,21 +354,11 @@ def test_timeline_physical_size_post_checkpoint(
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)

assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
get_physical_size_values(env, env.initial_tenant, new_timeline_id)
)

@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_timeline_physical_size_post_compaction(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):

if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)

def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder):
# Disable background compaction as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
neon_env_builder.pageserver_config_override = (
@@ -419,33 +387,15 @@ def test_timeline_physical_size_post_compaction(
)

wait_for_last_flush_lsn(env, pg, env.initial_tenant, new_timeline_id)

# shutdown safekeepers to prevent new data from coming in
for sk in env.safekeepers:
sk.stop()

pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id)

if remote_storage_kind is not None:
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id)

assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
get_physical_size_values(env, env.initial_tenant, new_timeline_id)
)

@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_timeline_physical_size_post_gc(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):

if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)

def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder):
# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance=100000, compaction_period='0s', gc_period='0s', pitr_interval='1s'}"
@@ -481,12 +431,8 @@ def test_timeline_physical_size_post_gc(
pageserver_http.timeline_checkpoint(env.initial_tenant, new_timeline_id)
pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None)

if remote_storage_kind is not None:
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id)

assert_physical_size_invariants(
get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind),
remote_storage_kind,
get_physical_size_values(env, env.initial_tenant, new_timeline_id)
)

@@ -519,26 +465,26 @@ def test_timeline_size_metrics(

# get the metrics and parse the metric for the current timeline's physical size
metrics = env.pageserver.http_client().get_metrics()
tl_physical_size_metric = metrics.query_one(
name="pageserver_resident_physical_size",
filter={
"tenant_id": str(env.initial_tenant),
"timeline_id": str(new_timeline_id),
},
).value
matches = re.search(
f'^pageserver_resident_physical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
metrics,
re.MULTILINE,
)
assert matches
tl_physical_size_metric = int(matches.group(1))

# assert that the physical size metric matches the actual physical size on disk
timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)

# Check that the logical size metric is sane, and matches
tl_logical_size_metric = metrics.query_one(
name="pageserver_current_logical_size",
filter={
"tenant_id": str(env.initial_tenant),
"timeline_id": str(new_timeline_id),
},
).value
matches = re.search(
f'^pageserver_current_logical_size{{tenant_id="{env.initial_tenant}",timeline_id="{new_timeline_id}"}} (\\S+)$',
metrics,
re.MULTILINE,
)
assert matches
tl_logical_size_metric = int(matches.group(1))

pgdatadir = test_output_dir / "pgdata-vanilla"
pg_bin = PgBin(test_output_dir, pg_distrib_dir, pg_version)
@@ -570,29 +516,18 @@ def test_timeline_size_metrics(
assert math.isclose(dbsize_sum, tl_logical_size_metric, abs_tol=2 * 1024 * 1024)

@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
def test_tenant_physical_size(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
def test_tenant_physical_size(neon_simple_env: NeonEnv):
random.seed(100)

if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)

env = neon_env_builder.init_start()

env = neon_simple_env
pageserver_http = env.pageserver.http_client()
client = env.pageserver.http_client()

tenant, timeline = env.neon_cli.create_tenant()
if remote_storage_kind is not None:
wait_for_upload_queue_empty(env.pageserver, tenant, timeline)

def get_timeline_resident_physical_size(timeline: TimelineId):
sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind)
assert_physical_size_invariants(sizes, remote_storage_kind)
sizes = get_physical_size_values(env, tenant, timeline)
assert_physical_size_invariants(sizes)
return sizes.prometheus_resident_physical

timeline_total_resident_physical_size = get_timeline_resident_physical_size(timeline)
@@ -612,9 +547,6 @@ def test_tenant_physical_size(
wait_for_last_flush_lsn(env, pg, tenant, timeline)
pageserver_http.timeline_checkpoint(tenant, timeline)

if remote_storage_kind is not None:
wait_for_upload_queue_empty(env.pageserver, tenant, timeline)

timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline)

pg.stop()
@@ -632,39 +564,21 @@ def test_tenant_physical_size(

class TimelinePhysicalSizeValues:
api_current_physical: int
prometheus_resident_physical: float
prometheus_remote_physical: Optional[float] = None
prometheus_resident_physical: int
python_timelinedir_layerfiles_physical: int
layer_map_file_size_sum: int

def get_physical_size_values(
env: NeonEnv,
tenant_id: TenantId,
timeline_id: TimelineId,
remote_storage_kind: Optional[RemoteStorageKind],
env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId
) -> TimelinePhysicalSizeValues:
res = TimelinePhysicalSizeValues()

client = env.pageserver.http_client()

res.layer_map_file_size_sum = sum(
layer.layer_file_size or 0
for layer in client.layer_map_info(tenant_id, timeline_id).historic_layers
res.prometheus_resident_physical = client.get_timeline_metric(
tenant_id, timeline_id, "pageserver_resident_physical_size"
)

metrics = client.get_metrics()
metrics_filter = {"tenant_id": str(tenant_id), "timeline_id": str(timeline_id)}
res.prometheus_resident_physical = metrics.query_one(
"pageserver_resident_physical_size", metrics_filter
).value
if remote_storage_kind is not None:
res.prometheus_remote_physical = metrics.query_one(
"pageserver_remote_physical_size", metrics_filter
).value
else:
res.prometheus_remote_physical = None

detail = client.timeline_detail(
tenant_id, timeline_id, include_timeline_dir_layer_file_size_sum=True
)
@@ -676,20 +590,11 @@ def get_physical_size_values(
return res

def assert_physical_size_invariants(
sizes: TimelinePhysicalSizeValues, remote_storage_kind: Optional[RemoteStorageKind]
):
def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
# resident phyiscal size is defined as
assert sizes.python_timelinedir_layerfiles_physical == sizes.prometheus_resident_physical
assert sizes.python_timelinedir_layerfiles_physical == sizes.layer_map_file_size_sum

# we don't do layer eviction, so, all layers are resident
assert sizes.api_current_physical == sizes.prometheus_resident_physical
if remote_storage_kind is not None:
assert sizes.prometheus_resident_physical == sizes.prometheus_remote_physical
# XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
else:
assert sizes.prometheus_remote_physical is None

# Timeline logical size initialization is an asynchronous background task that runs once,

@@ -775,9 +775,6 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
if not auth_enabled:
wa_http_cli = wa.http_client()
wa_http_cli.check_status()

wa_http_cli_debug = wa.http_client()
wa_http_cli_debug.check_status()
else:
wa_http_cli = wa.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
wa_http_cli.check_status()
@@ -788,10 +785,6 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
wa_http_cli_noauth = wa.http_client()
wa_http_cli_noauth.check_status()

# debug endpoint requires safekeeper scope
wa_http_cli_debug = wa.http_client(auth_token=env.auth_keys.generate_safekeeper_token())
wa_http_cli_debug.check_status()

# fetch something sensible from status
tli_status = wa_http_cli.timeline_status(tenant_id, timeline_id)
epoch = tli_status.acceptor_epoch
@@ -802,12 +795,6 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
with pytest.raises(cli.HTTPError, match="Forbidden|Unauthorized"):
cli.timeline_status(tenant_id, timeline_id)

# fetch debug_dump endpoint
debug_dump_0 = wa_http_cli_debug.debug_dump({"dump_all": "true"})
log.info(f"debug_dump before reboot {debug_dump_0}")
assert debug_dump_0["timelines_count"] == 1
assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id)

pg.safe_psql("create table t(i int)")

# ensure epoch goes up after reboot
@@ -821,25 +808,6 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
# and timeline_start_lsn stays the same
assert tli_status.timeline_start_lsn == timeline_start_lsn

# fetch debug_dump after reboot
debug_dump_1 = wa_http_cli_debug.debug_dump({"dump_all": "true"})
log.info(f"debug_dump after reboot {debug_dump_1}")
assert debug_dump_1["timelines_count"] == 1
assert debug_dump_1["timelines"][0]["timeline_id"] == str(timeline_id)

# check that commit_lsn and flush_lsn not decreased
assert (
debug_dump_1["timelines"][0]["memory"]["mem_state"]["commit_lsn"]
>= debug_dump_0["timelines"][0]["memory"]["mem_state"]["commit_lsn"]
)
assert (
debug_dump_1["timelines"][0]["memory"]["flush_lsn"]
>= debug_dump_0["timelines"][0]["memory"]["flush_lsn"]
)

# check .config in response
assert debug_dump_1["config"]["id"] == env.safekeepers[0].id

class SafekeeperEnv:
def __init__(

2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 9fd9794436...5fb2e0bba0
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 257aaefb25...919851e781
12
vm-cgconfig.conf
Normal file
@@ -0,0 +1,12 @@
# Configuration for cgroups in VM compute nodes
group neon-postgres {
perm {
admin {
uid = vm-informant;
}
task {
gid = users;
}
}
memory {}
}
@@ -50,7 +50,6 @@ serde = { version = "1", features = ["alloc", "derive"] }
serde_json = { version = "1", features = ["raw_value"] }
socket2 = { version = "0.4", default-features = false, features = ["all"] }
tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] }
tokio-rustls = { version = "0.23" }
tokio-util = { version = "0.7", features = ["codec", "io"] }
tonic = { version = "0.8", features = ["tls-roots"] }
tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }