Compare commits

..

1 Commit

Author: Anastasia Lubennikova
SHA1: 81faefd3d8
Message: Update synthetic_size.md documentation:
describe /synthetic_size HTTP endpoint with examples of use
Date: 2023-02-20 20:23:09 +02:00
50 changed files with 369 additions and 1119 deletions

View File

@@ -32,6 +32,8 @@ storage:
hosts:
safekeeper-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-0d6f1dc5161eef894
safekeeper-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0e338adda8eb2d19f
safekeeper-2.ap-southeast-1.aws.neon.tech:
ansible_host: i-04fb63634e4679eb9
safekeeper-3.ap-southeast-1.aws.neon.tech:

View File

@@ -27,8 +27,6 @@ storage:
ansible_host: i-062227ba7f119eb8c
pageserver-1.us-east-2.aws.neon.tech:
ansible_host: i-0b3ec0afab5968938
pageserver-2.us-east-2.aws.neon.tech:
ansible_host: i-0d7a1c4325e71421d
safekeepers:
hosts:

View File

@@ -1,21 +1,6 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 100%
maxUnavailable: 50%
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "sleep 604800"]
terminationGracePeriodSeconds: 604800
image:
repository: neondatabase/neon

View File

@@ -1,21 +1,6 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 100%
maxUnavailable: 50%
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "sleep 604800"]
terminationGracePeriodSeconds: 604800
image:
repository: neondatabase/neon

View File

@@ -1,22 +1,6 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 100%
maxUnavailable: 50%
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "sleep 604800"]
terminationGracePeriodSeconds: 604800
image:
repository: neondatabase/neon

View File

@@ -1,22 +1,6 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 100%
maxUnavailable: 50%
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "sleep 604800"]
terminationGracePeriodSeconds: 604800
image:
repository: neondatabase/neon

View File

@@ -1,22 +1,6 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 100%
maxUnavailable: 50%
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "sleep 604800"]
terminationGracePeriodSeconds: 604800
image:
repository: neondatabase/neon

View File

@@ -1,22 +1,6 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
deploymentStrategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 100%
maxUnavailable: 50%
# Delay the kill signal by 7 days (7 * 24 * 60 * 60)
# The pod(s) will stay in Terminating, keeps the existing connections
# but doesn't receive new ones
containerLifecycle:
preStop:
exec:
command: ["/bin/sh", "-c", "sleep 604800"]
terminationGracePeriodSeconds: 604800
image:
repository: neondatabase/neon

View File

@@ -67,7 +67,7 @@ jobs:
./get_binaries.sh
ansible-galaxy collection install sivel.toiletwater
ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
rm -f neon_install.tar.gz .neon_current_version
- name: Cleanup ansible folder

View File

@@ -68,7 +68,7 @@ jobs:
./get_binaries.sh
ansible-galaxy collection install sivel.toiletwater
ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy-prod-new:

Cargo.lock (generated)
View File

@@ -854,7 +854,6 @@ dependencies = [
"opentelemetry",
"postgres",
"regex",
"reqwest",
"serde",
"serde_json",
"tar",
@@ -3055,7 +3054,6 @@ dependencies = [
"hyper",
"metrics",
"once_cell",
"pin-project-lite",
"serde",
"serde_json",
"tempfile",
@@ -3067,6 +3065,15 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "remove_dir_all"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
dependencies = [
"winapi",
]
[[package]]
name = "reqwest"
version = "0.11.14"
@@ -3840,15 +3847,16 @@ dependencies = [
[[package]]
name = "tempfile"
version = "3.4.0"
version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95"
checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
dependencies = [
"cfg-if",
"fastrand",
"libc",
"redox_syscall",
"rustix",
"windows-sys 0.42.0",
"remove_dir_all",
"winapi",
]
[[package]]

View File

@@ -150,7 +150,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
criterion = "0.4"
rcgen = "0.10"
rstest = "0.16"
tempfile = "3.4"
tempfile = "3.2"
tonic-build = "0.8"
# This is only needed for proxy's tests.

View File

@@ -1,4 +1,3 @@
ARG PG_VERSION
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=rust
ARG TAG=pinned
@@ -12,7 +11,7 @@ FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
libicu-dev libxslt1-dev
libicu-dev
#########################################################################################
#
@@ -24,24 +23,18 @@ FROM build-deps AS pg-build
ARG PG_VERSION
COPY vendor/postgres-${PG_VERSION} postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \
--with-libxml --with-libxslt && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
# Enable some of contrib extensions
echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
#########################################################################################
#
@@ -64,11 +57,10 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
make clean && cp -R /sfcgal/* /
ENV PATH "/usr/local/pgsql/bin:$PATH"
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
./autogen.sh && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
cd extensions/postgis && \
@@ -82,15 +74,6 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
mkdir build && \
cd build && \
cmake .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
#########################################################################################
#
# Layer "plv8-build"
@@ -195,96 +178,6 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
#########################################################################################
#
# Layer "hypopg-pg-build"
# compile hypopg extension
#
#########################################################################################
FROM build-deps AS hypopg-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.3.1.tar.gz -O hypopg.tar.gz && \
mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/hypopg.control
#########################################################################################
#
# Layer "pg-hashids-pg-build"
# compile pg_hashids extension
#
#########################################################################################
FROM build-deps AS pg-hashids-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz -O pg_hashids.tar.gz && \
mkdir pg_hashids-src && cd pg_hashids-src && tar xvzf ../pg_hashids.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_hashids.control
#########################################################################################
#
# Layer "rust extensions"
# This layer is used to build `pgx` deps
#
#########################################################################################
FROM build-deps AS rust-extensions-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt-get update && \
apt-get install -y curl libclang-dev cmake && \
useradd -ms /bin/bash nonroot -b /home
ENV HOME=/home/nonroot
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
USER nonroot
WORKDIR /home/nonroot
ARG PG_VERSION
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
chmod +x rustup-init && \
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
rm rustup-init && \
cargo install --git https://github.com/vadim2404/pgx --branch neon_abi_v0.6.1 --locked cargo-pgx && \
/bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
USER root
#########################################################################################
#
# Layer "pg-jsonschema-pg-build"
# Compile "pg_jsonschema" extension
#
#########################################################################################
FROM rust-extensions-build AS pg-jsonschema-pg-build
RUN git clone --depth=1 --single-branch --branch neon_abi_v0.1.4 https://github.com/vadim2404/pg_jsonschema/ && \
cd pg_jsonschema && \
cargo pgx install --release && \
# it's needed to enable extension because it uses untrusted C language
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_jsonschema.control && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
#########################################################################################
#
# Layer "pg-graphql-pg-build"
# Compile "pg_graphql" extension
#
#########################################################################################
FROM rust-extensions-build AS pg-graphql-pg-build
RUN git clone --depth=1 --single-branch --branch neon_abi_v1.1.0 https://github.com/vadim2404/pg_graphql && \
cd pg_graphql && \
cargo pgx install --release && \
# it's needed to enable extension because it uses untrusted C language
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
#########################################################################################
#
# Layer "neon-pg-ext-build"
@@ -300,10 +193,6 @@ COPY --from=h3-pg-build /h3/usr /
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=hypopg-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hashids-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -366,7 +255,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
# libicu67, locales for collations (including ICU)
# libossp-uuid16 for extension ossp-uuid
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
# libxml2, libxslt1.1 for xml2
RUN apt update && \
apt install --no-install-recommends -y \
locales \
@@ -378,8 +266,6 @@ RUN apt update && \
libproj19 \
libprotobuf-c1 \
libsfcgal1 \
libxml2 \
libxslt1.1 \
gdb && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

View File

@@ -34,11 +34,6 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
protobuf-devel
```
* On Arch based systems, these packages are needed:
```bash
pacman -S base-devel readline zlib libseccomp openssl clang \
postgresql-libs cmake postgresql protobuf
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
```

View File

@@ -17,7 +17,6 @@ regex.workspace = true
serde.workspace = true
serde_json.workspace = true
tar.workspace = true
reqwest = { workspace = true, features = ["json"] }
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tokio-postgres.workspace = true
tracing.workspace = true

View File

@@ -65,9 +65,6 @@ fn main() -> Result<()> {
let spec = matches.get_one::<String>("spec");
let spec_path = matches.get_one::<String>("spec-path");
let compute_id = matches.get_one::<String>("compute-id");
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.get_one::<String>("pgbin").unwrap();
@@ -80,27 +77,8 @@ fn main() -> Result<()> {
let path = Path::new(sp);
let file = File::open(path)?;
serde_json::from_reader(file)?
} else if let Some(id) = compute_id {
if let Some(cp_base) = control_plane_uri {
let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec");
let jwt: String = match std::env::var("NEON_CONSOLE_JWT") {
Ok(v) => v,
Err(_) => "".to_string(),
};
reqwest::blocking::Client::new()
.get(cp_uri)
.header("Authorization", jwt)
.send()?
.json()?
} else {
panic!(
"must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"",
control_plane_uri, compute_id
);
}
} else {
panic!("compute spec should be provided via --spec or --spec-path argument");
panic!("cluster spec should be provided via --spec or --spec-path argument");
}
}
};
@@ -249,18 +227,6 @@ fn cli() -> clap::Command {
.long("spec-path")
.value_name("SPEC_PATH"),
)
.arg(
Arg::new("compute-id")
.short('i')
.long("compute-id")
.value_name("COMPUTE_ID"),
)
.arg(
Arg::new("control-plane-uri")
.short('p')
.long("control-plane-uri")
.value_name("CONTROL_PLANE"),
)
}
#[test]

View File

@@ -1,5 +1,41 @@
# Synthetic size
## How to get the data
The pageserver provides an HTTP API for getting the synthetic size of a tenant,
along with the data that was used to calculate it. Usage examples:
1. This query returns the synthetic size of the tenant, along with the "raw" input data,
which is returned in the `segments` and `timeline_inputs` fields.
```
curl localhost:9898/v1/tenant/5e1de642394b00a0a583a088e8276b98/synthetic_size | jq
```
2. If `inputs_only=true` is passed, the response contains only the raw data;
the synthetic size itself is not calculated.
```
curl localhost:9898/v1/tenant/5e1de642394b00a0a583a088e8276b98/synthetic_size?inputs_only=true | jq
```
3. `retention_period` is a cutoff (in bytes) that overrides the cutoff used in the size calculation.
Note that the override is applied only if the provided `retention_period` is shorter than the real cutoff.
```
curl localhost:9898/v1/tenant/5e1de642394b00a0a583a088e8276b98/synthetic_size?retention_period=1048576 | jq
```
4. If the `Accept: text/html` header is passed, the response is returned in HTML format.
The HTML embeds the same JSON data as in the previous examples, plus an SVG diagram of the tenant's timelines.
```
curl -H "Accept: text/html" localhost:9898/v1/tenant/5e1de642394b00a0a583a088e8276b98/synthetic_size > ./size.html | google-chrome ./size.html
```
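For programmatic access, the same endpoint can be queried from code. Below is a minimal sketch in Rust, illustrative only: it assumes the `reqwest` crate with the `blocking` and `json` features plus `serde_json`, and reuses the placeholder tenant ID and port from the curl examples above.
```
use serde_json::Value;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder tenant ID and pageserver address, matching the curl examples above.
    let tenant_id = "5e1de642394b00a0a583a088e8276b98";
    let url = format!("http://localhost:9898/v1/tenant/{tenant_id}/synthetic_size");

    // Fetch and parse the JSON response.
    let body: Value = reqwest::blocking::get(url)?.json()?;

    // The response carries the calculated size together with the raw
    // `segments` and `timeline_inputs` used to compute it.
    if let Some(segments) = body.get("segments").and_then(Value::as_array) {
        println!("segments used in the calculation: {}", segments.len());
    }
    println!("{}", serde_json::to_string_pretty(&body)?);
    Ok(())
}
```
The `inputs_only` and `retention_period` options described above are ordinary query parameters and can simply be appended to the URL.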
## Overview
Neon storage has copy-on-write branching, which makes it difficult to
answer the question "how large is my database?". To give one reasonable
answer, we calculate a _synthetic size_ for each project.

View File

@@ -98,15 +98,6 @@ impl RelTag {
name
}
pub fn with_forknum(&self, forknum: u8) -> Self {
RelTag {
forknum,
spcnode: self.spcnode,
dbnode: self.dbnode,
relnode: self.relnode,
}
}
}
///

View File

@@ -21,7 +21,7 @@ toml_edit.workspace = true
tracing.workspace = true
metrics.workspace = true
utils.workspace = true
pin-project-lite.workspace = true
workspace_hack.workspace = true
[dev-dependencies]

View File

@@ -20,10 +20,7 @@ use aws_sdk_s3::{
};
use aws_smithy_http::body::SdkBody;
use hyper::Body;
use tokio::{
io::{self, AsyncRead},
sync::Semaphore,
};
use tokio::{io, sync::Semaphore};
use tokio_util::io::ReaderStream;
use tracing::debug;
@@ -105,7 +102,7 @@ pub struct S3Bucket {
// Every request to S3 can be throttled or cancelled if a certain number of requests per second is exceeded.
// The same goes for IAM, which is queried before every S3 request, if enabled. IAM has an even lower RPS threshold.
// This helps to ensure we don't exceed the thresholds.
concurrency_limiter: Arc<Semaphore>,
concurrency_limiter: Semaphore,
}
#[derive(Default)]
@@ -165,7 +162,7 @@ impl S3Bucket {
client,
bucket_name: aws_config.bucket_name.clone(),
prefix_in_bucket,
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
})
}
@@ -197,10 +194,9 @@ impl S3Bucket {
}
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let permit = self
let _guard = self
.concurrency_limiter
.clone()
.acquire_owned()
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 download")
.map_err(DownloadError::Other)?;
@@ -221,10 +217,9 @@ impl S3Bucket {
let metadata = object_output.metadata().cloned().map(StorageMetadata);
Ok(Download {
metadata,
download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
permit,
download_stream: Box::pin(io::BufReader::new(
object_output.body.into_async_read(),
))),
)),
})
}
Err(SdkError::ServiceError {
@@ -245,32 +240,6 @@ impl S3Bucket {
}
}
pin_project_lite::pin_project! {
/// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
struct RatelimitedAsyncRead<S> {
permit: tokio::sync::OwnedSemaphorePermit,
#[pin]
inner: S,
}
}
impl<S: AsyncRead> RatelimitedAsyncRead<S> {
fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
RatelimitedAsyncRead { permit, inner }
}
}
impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
fn poll_read(
self: std::pin::Pin<&mut Self>,
cx: &mut std::task::Context<'_>,
buf: &mut io::ReadBuf<'_>,
) -> std::task::Poll<std::io::Result<()>> {
let this = self.project();
this.inner.poll_read(cx, buf)
}
}
#[async_trait::async_trait]
impl RemoteStorage for S3Bucket {
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
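The comments in the hunk above describe a semaphore used as a concurrency limiter; the two sides of the diff differ in how the permit is held across a download. A minimal, self-contained sketch of the two tokio patterns involved (names are illustrative, not the pageserver's; assumes tokio with the `sync`, `macros`, and `rt-multi-thread` features):
```
use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

// Wrapper that keeps the permit alive for as long as the wrapped value is,
// so the semaphore slot is only released once the whole download is consumed.
struct Limited<T> {
    _permit: OwnedSemaphorePermit,
    inner: T,
}

#[tokio::main]
async fn main() {
    let limiter = Arc::new(Semaphore::new(10));

    // `acquire()` gives a guard borrowed from the semaphore: the slot is freed
    // when the guard goes out of scope, i.e. when the enclosing function returns.
    let guard = limiter.acquire().await.expect("semaphore closed");
    drop(guard);

    // `acquire_owned()` needs an `Arc<Semaphore>` and yields a permit that can be
    // moved into a struct, extending the limit to the lifetime of that value.
    let permit = limiter.clone().acquire_owned().await.expect("semaphore closed");
    let download = Limited { _permit: permit, inner: "pretend this is a byte stream" };
    println!("{}", download.inner);
}
```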

View File

@@ -4,14 +4,13 @@ use anyhow::{anyhow, Context};
use hyper::header::{HeaderName, AUTHORIZATION};
use hyper::http::HeaderValue;
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
use hyper::{Method, StatusCode};
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::RequestInfo;
use routerify::{Middleware, Router, RouterBuilder, RouterService};
use tokio::task::JoinError;
use tracing;
use tracing::info;
use std::future::Future;
use std::net::TcpListener;
@@ -28,14 +27,7 @@ static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
});
async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
// cannot factor out the Level to avoid the repetition
// because tracing can only work with const Level
// which is not the case here
if info.method() == Method::GET && res.status() == StatusCode::OK {
tracing::debug!("{} {} {}", info.method(), info.uri().path(), res.status());
} else {
tracing::info!("{} {} {}", info.method(), info.uri().path(), res.status());
}
info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
Ok(res)
}
@@ -211,7 +203,7 @@ pub fn serve_thread_main<S>(
where
S: Future<Output = ()> + Send + Sync,
{
tracing::info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
// Create a Service from the router above to handle incoming requests.
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();

View File

@@ -45,115 +45,3 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
Ok(())
}
/// Disable the default rust panic hook by using `set_hook`.
///
/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
/// that sentry is configured (if needed). sentry will install its own on top of this, always
/// processing the panic before we log it.
///
/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
/// If the assumptions about the initialization order do not hold, use
/// [`TracingPanicHookGuard::forget`], but keep in mind that if tracing is stopped, panics will be
/// lost.
#[must_use]
pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
std::panic::set_hook(Box::new(tracing_panic_hook));
TracingPanicHookGuard::new()
}
/// Drop guard which restores the std panic hook on drop.
///
/// Tracing should not be used when it's not configured, but we cannot really latch on to any
/// imaginary lifetime of tracing.
pub struct TracingPanicHookGuard {
act: bool,
}
impl TracingPanicHookGuard {
fn new() -> Self {
TracingPanicHookGuard { act: true }
}
/// Make this hook guard not do anything when dropped.
pub fn forget(&mut self) {
self.act = false;
}
}
impl Drop for TracingPanicHookGuard {
fn drop(&mut self) {
if self.act {
let _ = std::panic::take_hook();
}
}
}
/// Named symbol for our panic hook, which logs the panic.
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
// following rust 1.66.1 std implementation:
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
let location = info.location();
let msg = match info.payload().downcast_ref::<&'static str>() {
Some(s) => *s,
None => match info.payload().downcast_ref::<String>() {
Some(s) => &s[..],
None => "Box<dyn Any>",
},
};
let thread = std::thread::current();
let thread = thread.name().unwrap_or("<unnamed>");
let backtrace = std::backtrace::Backtrace::capture();
let _entered = if let Some(location) = location {
tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
} else {
// very unlikely to hit here, but the guarantees of std could change
tracing::error_span!("panic", %thread)
}
.entered();
if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
// this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
// get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
// string, maybe even to a TLS one but tracing already does that.
tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
} else {
tracing::error!("{msg}");
}
// ensure that we log something on the panic if this hook is left after tracing has been
// unconfigured. worst case when teardown is racing the panic is to log the panic twice.
tracing::dispatcher::get_default(|d| {
if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
let location = location.map(PrettyLocation);
log_panic_to_stderr(thread, msg, location, &backtrace);
}
});
}
#[cold]
fn log_panic_to_stderr(
thread: &str,
msg: &str,
location: Option<PrettyLocation<'_, '_>>,
backtrace: &std::backtrace::Backtrace,
) {
eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
}
struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
impl std::fmt::Display for PrettyLocation<'_, '_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
}
}
impl std::fmt::Debug for PrettyLocation<'_, '_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
<Self as std::fmt::Display>::fmt(self, f)
}
}
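The doc comments above prescribe a specific initialization order: tracing first, then the panic hook, then sentry on top. A self-contained sketch of that idea using plain `std::panic` and `tracing_subscriber` (the guard type from the hunk is not reproduced; assumes the `tracing` and `tracing-subscriber` crates):
```
use std::panic;

fn main() {
    // 1. Configure tracing first, so the hook installed below has a subscriber to write to.
    tracing_subscriber::fmt().init();

    // 2. Replace the default std panic hook with one that logs through tracing.
    panic::set_hook(Box::new(|info| {
        let thread = std::thread::current();
        let thread = thread.name().unwrap_or("<unnamed>").to_owned();
        tracing::error!(%thread, "{info}");
    }));

    // 3. Anything installed later (e.g. sentry) wraps this hook and forwards
    //    the panic to it after doing its own processing.
    panic!("demo panic"); // trigger a panic to see the hook in action
}
```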

View File

@@ -33,7 +33,6 @@ use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM};
use postgres_ffi::TransactionId;
use postgres_ffi::XLogFileName;
use postgres_ffi::PG_TLI;
@@ -191,31 +190,14 @@ where
{
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
// If full backup is requested, include all relation files.
// Otherwise only include init forks of unlogged relations.
let rels = self
.timeline
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
// contents of UNLOGGED relations. Postgres copies it in
// `reinit.c` during recovery.
if rel.forknum == INIT_FORKNUM {
// I doubt we need the _init fork itself, but having it at least
// serves as a marker that the relation is unlogged.
self.add_rel(rel, rel).await?;
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
continue;
}
if self.full_backup {
if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM))
{
// skip this, will include it when we reach the init fork
continue;
}
self.add_rel(rel, rel).await?;
// Gather and send relational files in each database if full backup is requested.
if self.full_backup {
for rel in self
.timeline
.list_rels(spcnode, dbnode, self.lsn, self.ctx)
.await?
{
self.add_rel(rel).await?;
}
}
}
@@ -238,16 +220,15 @@ where
Ok(())
}
/// Add contents of relfilenode `src`, naming it as `dst`.
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
let nblocks = self
.timeline
.get_rel_size(src, self.lsn, false, self.ctx)
.get_rel_size(tag, self.lsn, false, self.ctx)
.await?;
// If the relation is empty, create an empty file
if nblocks == 0 {
let file_name = dst.to_segfile_name(0);
let file_name = tag.to_segfile_name(0);
let header = new_tar_header(&file_name, 0)?;
self.ar.append(&header, &mut io::empty()).await?;
return Ok(());
@@ -263,12 +244,12 @@ where
for blknum in startblk..endblk {
let img = self
.timeline
.get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
.get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx)
.await?;
segment_data.extend_from_slice(&img[..]);
}
let file_name = dst.to_segfile_name(seg as u32);
let file_name = tag.to_segfile_name(seg as u32);
let header = new_tar_header(&file_name, segment_data.len() as u64)?;
self.ar.append(&header, segment_data.as_slice()).await?;

View File

@@ -91,9 +91,9 @@ fn main() -> anyhow::Result<()> {
// Initialize logging, which must be initialized before the custom panic hook is installed.
logging::init(conf.log_format)?;
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
// disarming this hook on pageserver, because we never tear down tracing.
logging::replace_panic_hook_with_tracing_panic_hook().forget();
// disable the default rust panic hook by using `set_hook`. sentry will install its own on top
// of this, always processing the panic before we log it.
std::panic::set_hook(Box::new(tracing_panic_hook));
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(
@@ -499,6 +499,50 @@ fn cli() -> Command {
)
}
/// Named symbol for our panic hook, which logs the panic.
fn tracing_panic_hook(info: &std::panic::PanicInfo) {
// following rust 1.66.1 std implementation:
// https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
let location = info.location();
let msg = match info.payload().downcast_ref::<&'static str>() {
Some(s) => *s,
None => match info.payload().downcast_ref::<String>() {
Some(s) => &s[..],
None => "Box<dyn Any>",
},
};
let thread = std::thread::current();
let thread = thread.name().unwrap_or("<unnamed>");
let backtrace = std::backtrace::Backtrace::capture();
struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
impl std::fmt::Display for PrettyLocation<'_, '_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
}
}
let _entered = if let Some(location) = location {
tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
} else {
// very unlikely to hit here, but the guarantees of std could change
tracing::error_span!("panic", %thread)
}
.entered();
if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
// this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
// get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
// string, maybe even to a TLS one but tracing already does that.
tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
} else {
tracing::error!("{msg}");
}
}
#[test]
fn verify_cli() {
cli().debug_assert();

View File

@@ -3,7 +3,7 @@
use std::ops::ControlFlow;
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::time::Duration;
use crate::context::{DownloadBehavior, RequestContext};
use crate::metrics::TENANT_TASK_EVENTS;
@@ -11,7 +11,6 @@ use crate::task_mgr;
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::mgr;
use crate::tenant::{Tenant, TenantState};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TenantId;
@@ -54,55 +53,37 @@ async fn compaction_loop(tenant_id: TenantId) {
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let cancel = task_mgr::shutdown_token();
let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
let mut first = true;
loop {
trace!("waking up");
let tenant = tokio::select! {
_ = cancel.cancelled() => {
_ = task_mgr::shutdown_watcher() => {
info!("received cancellation request");
return;
return;
},
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
ControlFlow::Break(()) => return,
ControlFlow::Continue(tenant) => tenant,
},
};
};
let period = tenant.get_compaction_period();
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
// loop, #3501. There are also additional "allowed_errors" in tests.
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
}
}
let started_at = Instant::now();
let sleep_duration = if period == Duration::ZERO {
let mut sleep_duration = tenant.get_compaction_period();
if sleep_duration == Duration::ZERO {
info!("automatic compaction is disabled");
// check again in 10 seconds, in case it's been enabled again.
Duration::from_secs(10)
sleep_duration = Duration::from_secs(10);
} else {
// Run compaction
if let Err(e) = tenant.compaction_iteration(&ctx).await {
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
} else {
period
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
}
};
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
}
// Sleep
tokio::select! {
_ = cancel.cancelled() => {
_ = task_mgr::shutdown_watcher() => {
info!("received cancellation request during idling");
break;
},
@@ -124,16 +105,14 @@ async fn gc_loop(tenant_id: TenantId) {
info!("starting");
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
async {
let cancel = task_mgr::shutdown_token();
// GC might require downloading, to find the cutoff LSN that corresponds to the
// cutoff specified as time.
let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
let mut first = true;
loop {
trace!("waking up");
let tenant = tokio::select! {
_ = cancel.cancelled() => {
_ = task_mgr::shutdown_watcher() => {
info!("received cancellation request");
return;
},
@@ -143,38 +122,27 @@ async fn gc_loop(tenant_id: TenantId) {
},
};
let period = tenant.get_gc_period();
if first {
first = false;
if random_init_delay(period, &cancel).await.is_err() {
break;
let gc_period = tenant.get_gc_period();
let gc_horizon = tenant.get_gc_horizon();
let mut sleep_duration = gc_period;
if sleep_duration == Duration::ZERO {
info!("automatic GC is disabled");
// check again in 10 seconds, in case it's been enabled again.
sleep_duration = Duration::from_secs(10);
} else {
// Run gc
if gc_horizon > 0 {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
{
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
}
}
}
let started_at = Instant::now();
let gc_horizon = tenant.get_gc_horizon();
let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
info!("automatic GC is disabled");
// check again in 10 seconds, in case it's been enabled again.
Duration::from_secs(10)
} else {
// Run gc
let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
if let Err(e) = res {
error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
wait_duration
} else {
period
}
};
warn_when_period_overrun(started_at.elapsed(), period, "gc");
// Sleep
tokio::select! {
_ = cancel.cancelled() => {
_ = task_mgr::shutdown_watcher() => {
info!("received cancellation request during idling");
break;
},
@@ -229,49 +197,3 @@ async fn wait_for_active_tenant(
}
}
}
#[derive(thiserror::Error, Debug)]
#[error("cancelled")]
pub(crate) struct Cancelled;
/// Provide a random delay for background task initialization.
///
/// This delay prevents a thundering herd of background tasks and will likely keep them running on
/// different periods for more stable load.
pub(crate) async fn random_init_delay(
period: Duration,
cancel: &CancellationToken,
) -> Result<(), Cancelled> {
use rand::Rng;
let d = {
let mut rng = rand::thread_rng();
// gen_range asserts that the range cannot be empty, which it could be because period can
// be set to zero to disable gc or compaction, so lets set it to be at least 10s.
let period = std::cmp::max(period, Duration::from_secs(10));
// semi-ok default as the source of jitter
rng.gen_range(Duration::ZERO..=period)
};
tokio::select! {
_ = cancel.cancelled() => Err(Cancelled),
_ = tokio::time::sleep(d) => Ok(()),
}
}
pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
// Duration::ZERO will happen because it's the "disable [bgtask]" value.
if elapsed >= period && period != Duration::ZERO {
// humantime does no significant digits clamping whereas Duration's debug is a bit more
// intelligent. however it makes sense to keep the "configuration format" for period, even
// though there's no way to output the actual config value.
warn!(
?elapsed,
period = %humantime::format_duration(period),
task,
"task iteration took longer than the configured period"
);
}
}
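The doc comment on `random_init_delay` above explains the jitter's purpose. As an illustration only, a compact, self-contained version of such a jittered background loop with cancellation might look like this (assumes the `tokio`, `tokio-util`, and `rand` crates; none of the pageserver's own types are used):
```
use rand::Rng;
use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn background_loop(period: Duration, cancel: CancellationToken) {
    // Jitter only the first wake-up so tasks started together spread out,
    // clamping the range to at least 10s in case the period is zero.
    let jitter = rand::thread_rng().gen_range(Duration::ZERO..=period.max(Duration::from_secs(10)));
    tokio::select! {
        _ = cancel.cancelled() => return,
        _ = tokio::time::sleep(jitter) => {}
    }

    loop {
        // ... one iteration of the periodic work would go here ...
        tokio::select! {
            _ = cancel.cancelled() => return,
            _ = tokio::time::sleep(period) => {}
        }
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let task = tokio::spawn(background_loop(Duration::from_secs(20), cancel.clone()));
    cancel.cancel();
    task.await.unwrap();
}
```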

View File

@@ -19,7 +19,6 @@ use tracing::*;
use utils::id::TenantTimelineId;
use std::cmp::{max, min, Ordering};
use std::collections::BinaryHeap;
use std::collections::HashMap;
use std::fs;
use std::ops::{Deref, Range};
@@ -83,25 +82,6 @@ enum FlushLoopState {
Exited,
}
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Hole {
key_range: Range<Key>,
coverage_size: usize,
}
impl Ord for Hole {
fn cmp(&self, other: &Self) -> Ordering {
other.coverage_size.cmp(&self.coverage_size) // inverse order
}
}
impl PartialOrd for Hole {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -1790,9 +1770,15 @@ impl Timeline {
let calculation = async {
let cancel = cancel.child_token();
let ctx = ctx.attached_child();
self_calculation
.calculate_logical_size(init_lsn, cancel, &ctx)
.await
tokio::task::spawn_blocking(move || {
// Run in a separate thread since this can do a lot of
// synchronous file IO without .await in between
// if there are no RemoteLayers that would require downloading.
let h = tokio::runtime::Handle::current();
h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx))
})
.await
.context("Failed to spawn calculation result task")?
};
let timeline_state_cancellation = async {
loop {
@@ -1825,7 +1811,7 @@ impl Timeline {
tokio::pin!(calculation);
loop {
tokio::select! {
res = &mut calculation => { return res }
res = &mut calculation => { return res }
reason = timeline_state_cancellation => {
debug!(reason = reason, "cancelling calculation");
cancel.cancel();
@@ -2961,47 +2947,6 @@ impl Timeline {
},
)?;
// Determine N largest holes where N is number of compacted layers.
let max_holes = deltas_to_compact.len();
let last_record_lsn = self.get_last_record_lsn();
let layers = self.layers.read().unwrap(); // Isn't it better to hold the original layers lock till here?
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
let min_hole_coverage_size = 3; // TODO: something more flexible?
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
let mut prev: Option<Key> = None;
for (next_key, _next_lsn, _size) in itertools::process_results(
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
)? {
if let Some(prev_key) = prev {
// just first fast filter
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range {
let key_range = prev_key..next_key;
// Measuring hole by just subtraction of i128 representation of key range boundaries
// has not so much sense, because largest holes will corresponds field1/field2 changes.
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
// That is why it is better to measure size of hole as number of covering image layers.
let coverage_size = layers.image_coverage(&key_range, last_record_lsn)?.len();
if coverage_size >= min_hole_coverage_size {
heap.push(Hole {
key_range,
coverage_size,
});
if heap.len() > max_holes {
heap.pop(); // remove smallest hole
}
}
}
}
prev = Some(next_key.next());
}
drop(layers);
let mut holes = heap.into_vec();
holes.sort_unstable_by_key(|hole| hole.key_range.start);
let mut next_hole = 0; // index of next hole in holes vector
// Merge the contents of all the input delta layers into a new set
// of delta layers, based on the current partitioning.
//
@@ -3096,22 +3041,14 @@ impl Timeline {
}
if writer.is_some() {
let written_size = writer.as_mut().unwrap().size();
let contains_hole =
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
// check if key cause layer overflow or contains hole...
// check if key cause layer overflow...
if is_dup_layer
|| dup_end_lsn.is_valid()
|| written_size + key_values_total_size > target_file_size
|| contains_hole
{
// ... if so, flush previous layer and prepare to write new one
new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
writer = None;
if contains_hole {
// skip hole
next_hole += 1;
}
}
}
// Remember size of key value because at next iteration we will access next item
@@ -3808,7 +3745,6 @@ impl Timeline {
remote_layer.ongoing_download.close();
} else {
// Keep semaphore open. We'll drop the permit at the end of the function.
info!("on-demand download failed: {:?}", result.as_ref().unwrap_err());
}
// Don't treat it as an error if the task that triggered the download

View File

@@ -41,23 +41,9 @@ impl Timeline {
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
use crate::tenant::tasks::random_init_delay;
{
let policy = self.get_eviction_policy();
let period = match policy {
EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
EvictionPolicy::NoEviction => Duration::from_secs(10),
};
if random_init_delay(period, &cancel).await.is_err() {
info!("shutting down");
return;
}
}
loop {
let policy = self.get_eviction_policy();
let cf = self.eviction_iteration(&policy, cancel.clone()).await;
match cf {
ControlFlow::Break(()) => break,
ControlFlow::Continue(sleep_until) => {
@@ -92,7 +78,13 @@ impl Timeline {
ControlFlow::Continue(()) => (),
}
let elapsed = start.elapsed();
crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
if elapsed > p.period {
warn!(
configured_period = %humantime::format_duration(p.period),
last_period = %humantime::format_duration(elapsed),
"this eviction period took longer than the configured period"
);
}
ControlFlow::Continue(start + p.period)
}
}

View File

@@ -37,7 +37,7 @@ use crate::walrecord::*;
use crate::ZERO_PAGE;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
use postgres_ffi::v14::xlog_utils::*;
use postgres_ffi::v14::CheckPoint;
@@ -762,7 +762,7 @@ impl<'a> WalIngest<'a> {
)?;
for xnode in &parsed.xnodes {
for forknum in MAIN_FORKNUM..=INIT_FORKNUM {
for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM {
let rel = RelTag {
forknum,
spcnode: xnode.spcnode,

View File

@@ -3,7 +3,7 @@
pub mod backend;
pub use backend::BackendType;
pub mod credentials;
mod credentials;
pub use credentials::ClientCredentials;
mod password_hack;

View File

@@ -11,7 +11,7 @@ use crate::{
provider::{CachedNodeInfo, ConsoleReqExtra},
Api,
},
scram, stream, url,
stream, url,
};
use futures::TryFutureExt;
use std::borrow::Cow;
@@ -59,8 +59,8 @@ impl std::fmt::Display for BackendType<'_, ()> {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use BackendType::*;
match self {
Console(api, _) => fmt.debug_tuple("Console").field(&api.url()).finish(),
Postgres(api, _) => fmt.debug_tuple("Postgres").field(&api.url()).finish(),
Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(),
Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(),
Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
}
}
@@ -106,23 +106,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
}
}
impl console::AuthInfo {
/// Either it's our way ([SCRAM](crate::scram)) or the highway :)
/// But seriously, we don't aim to support anything but SCRAM for now.
fn scram_or_goodbye(self) -> auth::Result<scram::ServerSecret> {
match self {
Self::Md5(_) => {
info!("auth endpoint chooses MD5");
Err(auth::AuthError::bad_auth_method("MD5"))
}
Self::Scram(secret) => {
info!("auth endpoint chooses SCRAM");
Ok(secret)
}
}
}
}
/// True to its name, this function encapsulates our current auth trade-offs.
/// Here, we choose the appropriate auth flow based on circumstances.
async fn auth_quirks(
@@ -200,9 +183,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
info!("user successfully authenticated");
Ok(res)
}
}
impl BackendType<'_, ClientCredentials<'_>> {
/// When applicable, wake the compute node, gaining its connection info in the process.
/// The link auth flow doesn't support this, so we return [`None`] in that case.
pub async fn wake_compute(

View File

@@ -2,54 +2,57 @@ use super::AuthSuccess;
use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
console::{self, CachedNodeInfo, ConsoleReqExtra},
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
sasl, scram,
stream::PqStream,
};
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_postgres::config::AuthKeys;
use tracing::info;
async fn do_scram(
secret: scram::ServerSecret,
creds: &ClientCredentials<'_>,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<compute::ScramKeys> {
let outcome = AuthFlow::new(client)
.begin(auth::Scram(&secret))
.await?
.authenticate()
.await?;
let client_key = match outcome {
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(creds.user));
}
};
let keys = compute::ScramKeys {
client_key: client_key.as_bytes(),
server_key: secret.server_key.as_bytes(),
};
Ok(keys)
}
pub async fn authenticate(
pub(super) async fn authenticate(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials<'_>,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
let info = console::get_auth_info(api, extra, creds).await?;
info!("fetching user's authentication info");
let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
// If we don't have an authentication secret, we mock one to
// prevent malicious probing (possible due to missing protocol steps).
// This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it");
AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
});
let secret = info.scram_or_goodbye()?;
let scram_keys = do_scram(secret, creds, client).await?;
let flow = AuthFlow::new(client);
let scram_keys = match info {
AuthInfo::Md5(_) => {
info!("auth endpoint chooses MD5");
return Err(auth::AuthError::bad_auth_method("MD5"));
}
AuthInfo::Scram(secret) => {
info!("auth endpoint chooses SCRAM");
let scram = auth::Scram(&secret);
let client_key = match flow.begin(scram).await?.authenticate().await? {
sasl::Outcome::Success(key) => key,
sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}");
return Err(auth::AuthError::auth_failed(creds.user));
}
};
Some(compute::ScramKeys {
client_key: client_key.as_bytes(),
server_key: secret.server_key.as_bytes(),
})
}
};
let mut node = api.wake_compute(extra, creds).await?;
node.config.auth_keys(AuthKeys::ScramSha256(scram_keys));
if let Some(keys) = scram_keys {
use tokio_postgres::config::AuthKeys;
node.config.auth_keys(AuthKeys::ScramSha256(keys));
}
Ok(AuthSuccess {
reported_auth_ok: false,

View File

@@ -5,33 +5,11 @@ use crate::{
self,
provider::{CachedNodeInfo, ConsoleReqExtra},
},
stream::PqStream,
stream,
};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
/// Wake the compute node, but only if the password is valid.
async fn get_compute(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
password: Vec<u8>,
) -> auth::Result<CachedNodeInfo> {
// TODO: this will slow down both "hacks" below; we probably need a cache.
let info = console::get_auth_info(api, extra, creds).await?;
let secret = info.scram_or_goodbye()?;
if !secret.matches_password(&password) {
info!("our obscure magic indicates that the password doesn't match");
return Err(auth::AuthError::auth_failed(creds.user));
}
let mut node = api.wake_compute(extra, creds).await?;
node.config.password(password);
Ok(node)
}
/// Compared to [SCRAM](crate::scram), cleartext password auth saves
/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
/// These properties are beneficial for serverless JS workers, so we
@@ -40,7 +18,7 @@ pub async fn cleartext_hack(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
warn!("cleartext auth flow override is enabled, proceeding");
let password = AuthFlow::new(client)
@@ -49,7 +27,8 @@ pub async fn cleartext_hack(
.authenticate()
.await?;
let node = get_compute(api, extra, creds, password).await?;
let mut node = api.wake_compute(extra, creds).await?;
node.config.password(password);
// Report tentative success; compute node will check the password anyway.
Ok(AuthSuccess {
@@ -64,7 +43,7 @@ pub async fn password_hack(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &mut ClientCredentials<'_>,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
warn!("project not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
@@ -76,7 +55,8 @@ pub async fn password_hack(
info!(project = &payload.project, "received missing parameter");
creds.project = Some(payload.project.into());
let node = get_compute(api, extra, creds, payload.password).await?;
let mut node = api.wake_compute(extra, creds).await?;
node.config.password(payload.password);
// Report tentative success; compute node will check the password anyway.
Ok(AuthSuccess {

View File

@@ -31,22 +31,12 @@ pub enum ClientCredsParseError {
impl UserFacingError for ClientCredsParseError {}
/// eSNI parameters which might contain endpoint/project name.
#[derive(Default)]
pub struct SniParams<'a> {
/// Server Name Indication (TLS jargon).
pub sni: Option<&'a str>,
/// Common Name from a TLS certificate.
pub common_name: Option<&'a str>,
}
/// Various client credentials which we use for authentication.
/// Note that we don't store any kind of client key or password here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ClientCredentials<'a> {
/// Name of postgres role.
pub user: &'a str,
/// Also known as endpoint in the console.
// TODO: this is a severe misnomer! We should think of a new name ASAP.
pub project: Option<Cow<'a, str>>,
}
@@ -59,17 +49,18 @@ impl ClientCredentials<'_> {
impl<'a> ClientCredentials<'a> {
pub fn parse(
startup_params: &'a StartupMessageParams,
&SniParams { sni, common_name }: &SniParams<'_>,
params: &'a StartupMessageParams,
sni: Option<&str>,
common_name: Option<&str>,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
// Some parameters are stored in the startup message.
let get_param = |key| startup_params.get(key).ok_or(MissingKey(key));
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user = get_param("user")?;
// Project name might be passed via PG's command-line options.
let project_option = startup_params.options_raw().and_then(|mut options| {
let project_option = params.options_raw().and_then(|mut options| {
options
.find_map(|opt| opt.strip_prefix("project="))
.map(Cow::Borrowed)
@@ -131,9 +122,7 @@ mod tests {
// According to postgresql, only `user` should be required.
let options = StartupMessageParams::new([("user", "john_doe")]);
let sni = SniParams::default();
let creds = ClientCredentials::parse(&options, &sni)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
@@ -142,15 +131,13 @@ mod tests {
#[test]
fn parse_excessive() -> anyhow::Result<()> {
let startup = StartupMessageParams::new([
let options = StartupMessageParams::new([
("user", "john_doe"),
("database", "world"), // should be ignored
("foo", "bar"), // should be ignored
]);
let sni = SniParams::default();
let creds = ClientCredentials::parse(&startup, &sni)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
@@ -159,14 +146,12 @@ mod tests {
#[test]
fn parse_project_from_sni() -> anyhow::Result<()> {
let startup = StartupMessageParams::new([("user", "john_doe")]);
let options = StartupMessageParams::new([("user", "john_doe")]);
let sni = SniParams {
sni: Some("foo.localhost"),
common_name: Some("localhost"),
};
let sni = Some("foo.localhost");
let common_name = Some("localhost");
let creds = ClientCredentials::parse(&startup, &sni)?;
let creds = ClientCredentials::parse(&options, sni, common_name)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -175,14 +160,12 @@ mod tests {
#[test]
fn parse_project_from_options() -> anyhow::Result<()> {
let startup = StartupMessageParams::new([
let options = StartupMessageParams::new([
("user", "john_doe"),
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let sni = SniParams::default();
let creds = ClientCredentials::parse(&startup, &sni)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -191,17 +174,12 @@ mod tests {
#[test]
fn parse_projects_identical() -> anyhow::Result<()> {
let startup = StartupMessageParams::new([
("user", "john_doe"),
("options", "project=baz"), // fmt
]);
let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
let sni = SniParams {
sni: Some("baz.localhost"),
common_name: Some("localhost"),
};
let sni = Some("baz.localhost");
let common_name = Some("localhost");
let creds = ClientCredentials::parse(&startup, &sni)?;
let creds = ClientCredentials::parse(&options, sni, common_name)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -210,17 +188,13 @@ mod tests {
#[test]
fn parse_projects_different() {
let startup = StartupMessageParams::new([
("user", "john_doe"),
("options", "project=first"), // fmt
]);
let options =
StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]);
let sni = SniParams {
sni: Some("second.localhost"),
common_name: Some("localhost"),
};
let sni = Some("second.localhost");
let common_name = Some("localhost");
let err = ClientCredentials::parse(&startup, &sni).expect_err("should fail");
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
assert_eq!(option, "first");
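
The two tests above pin down the reconciliation rule: when both the startup `options` and SNI carry a project name, they must agree, otherwise parsing fails with `InconsistentProjectNames`. A minimal sketch of that rule, with a stand-in error type rather than the real `ClientCredsParseError` variant:

#[derive(Debug, PartialEq)]
struct InconsistentProjectNames {
    domain: String,
    option: String,
}

fn reconcile_project(
    from_option: Option<&str>,
    from_sni: Option<&str>,
) -> Result<Option<String>, InconsistentProjectNames> {
    match (from_option, from_sni) {
        // Both sources present but disagreeing: hard error.
        (Some(option), Some(domain)) if option != domain => Err(InconsistentProjectNames {
            domain: domain.to_owned(),
            option: option.to_owned(),
        }),
        // Otherwise take whichever source is available (possibly neither).
        (option, domain) => Ok(option.or(domain).map(str::to_owned)),
    }
}

#[test]
fn projects_must_agree() {
    assert_eq!(
        reconcile_project(Some("baz"), Some("baz")),
        Ok(Some("baz".to_owned()))
    );
    assert!(reconcile_project(Some("first"), Some("second")).is_err());
}
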
@@ -232,14 +206,12 @@ mod tests {
#[test]
fn parse_inconsistent_sni() {
let startup = StartupMessageParams::new([("user", "john_doe")]);
let options = StartupMessageParams::new([("user", "john_doe")]);
let sni = SniParams {
sni: Some("project.localhost"),
common_name: Some("example.com"),
};
let sni = Some("project.localhost");
let common_name = Some("example.com");
let err = ClientCredentials::parse(&startup, &sni).expect_err("should fail");
let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
match err {
InconsistentSni { sni, cn } => {
assert_eq!(sni, "project.localhost");

View File

@@ -6,9 +6,7 @@ pub mod messages;
/// Wrappers for console APIs and their mocks.
pub mod provider;
pub use provider::{
errors, get_auth_info, Api, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
pub use provider::{errors, Api, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo};
/// Various cache-related types.
pub mod caches {

View File

@@ -9,7 +9,6 @@ use crate::{
};
use async_trait::async_trait;
use std::sync::Arc;
use tracing::info;
pub mod errors {
use crate::{
@@ -176,12 +175,6 @@ pub struct NodeInfo {
pub type NodeInfoCache = TimedLru<Arc<str>, NodeInfo>;
pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>;
/// Various caches for [`console`].
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub node_info: NodeInfoCache,
}
/// This will allocate on each call, but the HTTP requests alone
/// already require a few allocations, so it should be fine.
#[async_trait]
@@ -201,21 +194,8 @@ pub trait Api {
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}
/// A more insightful version of [`Api::get_auth_info`] which
/// knows what to do when we get [`None`] instead of [`AuthInfo`].
pub async fn get_auth_info(
api: &impl Api,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials<'_>,
) -> Result<AuthInfo, errors::GetAuthInfoError> {
info!("fetching user's authentication info");
let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
// If we don't have an authentication secret, we mock one to
// prevent malicious probing (possible due to missing protocol steps).
// This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it");
AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
});
Ok(info)
/// Various caches for [`console`].
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub node_info: NodeInfoCache,
}

View File

@@ -43,7 +43,6 @@ async fn flatten_err(
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let _logging_guard = logging::init().await?;
let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
info!("Version: {GIT_VERSION}");

View File

@@ -2,7 +2,7 @@
mod tests;
use crate::{
auth::{self, backend::AuthSuccess, credentials},
auth::{self, backend::AuthSuccess},
cancellation::{self, CancelMap},
compute::{self, PostgresConnection},
config::{ProxyConfig, TlsConfig},
@@ -112,6 +112,7 @@ pub async fn handle_ws_client(
}
let tls = config.tls_config.as_ref();
let hostname = hostname.as_deref();
// TLS is None here, because the connection is already encrypted.
let do_handshake = handshake(stream, None, cancel_map);
@@ -120,17 +121,13 @@ pub async fn handle_ws_client(
None => return Ok(()), // it's a cancellation request
};
let sni = credentials::SniParams {
sni: hostname.as_deref(),
common_name: tls.and_then(|tls| tls.common_name.as_deref()),
};
// Extract credentials which we're going to use for auth.
let creds = {
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, &sni))
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_name))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?
@@ -162,17 +159,14 @@ async fn handle_client(
None => return Ok(()), // it's a cancellation request
};
let sni = credentials::SniParams {
sni: stream.get_ref().sni_hostname(),
common_name: tls.and_then(|tls| tls.common_name.as_deref()),
};
// Extract credentials which we're going to use for auth.
let creds = {
let sni = stream.get_ref().sni_hostname();
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, &sni))
.map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?

View File

@@ -92,10 +92,10 @@ impl TestAuth for NoAuth {}
struct Scram(scram::ServerSecret);
impl Scram {
fn new(password: &[u8]) -> anyhow::Result<Self> {
fn new(password: &str) -> anyhow::Result<Self> {
let salt = rand::random::<[u8; 16]>();
let secret = scram::ServerSecret::build(password, &salt, 256);
let secret = scram::ServerSecret::build(password, &salt, 256)
.context("failed to generate scram secret")?;
Ok(Scram(secret))
}
@@ -230,11 +230,11 @@ async fn keepalive_is_inherited() -> anyhow::Result<()> {
}
#[rstest]
#[case(b"password_foo")]
#[case(b"pwd-bar")]
#[case(b"")]
#[case("password_foo")]
#[case("pwd-bar")]
#[case("")]
#[tokio::test]
async fn scram_auth_good(#[case] password: &[u8]) -> anyhow::Result<()> {
async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> {
let (client, server) = tokio::io::duplex(1024);
let (client_config, server_config) =

View File

@@ -12,6 +12,7 @@ mod messages;
mod secret;
mod signature;
#[cfg(test)]
mod password;
pub use exchange::Exchange;

View File

@@ -73,7 +73,7 @@ impl sasl::Mechanism for Exchange<'_> {
let server_first_message = client_first_message.build_server_first_message(
&(self.nonce)(),
&self.secret.salt,
&self.secret.salt_base64,
self.secret.iterations,
);
let msg = server_first_message.as_str().to_owned();

View File

@@ -75,27 +75,19 @@ impl<'a> ClientFirstMessage<'a> {
pub fn build_server_first_message(
&self,
nonce: &[u8; SCRAM_RAW_NONCE_LEN],
salt: &[u8],
salt_base64: &str,
iterations: u32,
) -> OwnedServerFirstMessage {
use std::fmt::Write;
let mut message = String::new();
// Write base64-encoded combined nonce.
write!(&mut message, "r={}", self.nonce).unwrap();
base64::encode_config_buf(nonce, base64::STANDARD, &mut message);
let combined_nonce = 2..message.len();
// Write base64-encoded salt.
write!(&mut message, ",s=").unwrap();
base64::encode_config_buf(salt, base64::STANDARD, &mut message);
// Write number of iterations.
write!(&mut message, ",i={iterations}").unwrap();
write!(&mut message, ",s={},i={}", salt_base64, iterations).unwrap();
// This design guarantees that it's impossible to create a
// server-first-message without receiving a client-first-message.
// server-first-message without receiving a client-first-message
OwnedServerFirstMessage {
message,
nonce: combined_nonce,
@@ -237,49 +229,4 @@ mod tests {
"SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI="
);
}
#[test]
fn build_server_messages() {
let input = "n,,n=pepe,r=t8JwklwKecDLwSsA72rHmVju";
let client_first_message = ClientFirstMessage::parse(input).unwrap();
let nonce = [0; 18];
let salt = [1, 2, 3];
let iterations = 4096;
let server_first_message =
client_first_message.build_server_first_message(&nonce, &salt, iterations);
assert_eq!(
server_first_message.message,
"r=t8JwklwKecDLwSsA72rHmVjuAAAAAAAAAAAAAAAAAAAAAAAA,s=AQID,i=4096"
);
assert_eq!(
server_first_message.nonce(),
"t8JwklwKecDLwSsA72rHmVjuAAAAAAAAAAAAAAAAAAAAAAAA"
);
let input = [
"c=eSws",
"r=iiYEfS3rOgn8S3rtpSdrOsHtPLWvIkdgmHxA0hf3JNOAG4dU",
"p=SRpfsIVS4Gk11w1LqQ4QvCUBZYQmqXNSDEcHqbQ3CHI=",
]
.join(",");
let client_final_message = ClientFinalMessage::parse(&input).unwrap();
let signature_builder = SignatureBuilder {
client_first_message_bare: client_first_message.bare,
server_first_message: server_first_message.as_str(),
client_final_message_without_proof: client_final_message.without_proof,
};
let server_key = ScramKey::default();
let server_final_message =
client_final_message.build_server_final_message(signature_builder, &server_key);
assert_eq!(
server_final_message,
"v=XEL4X1vy5LnqIgOo4hOjm7zd1Ceyo9+nBUE+/zVnqLE="
);
}
}

View File

@@ -1,7 +1,6 @@
//! Password hashing routines.
use super::key::ScramKey;
use tracing::warn;
pub const SALTED_PASSWORD_LEN: usize = 32;
@@ -14,12 +13,7 @@ pub struct SaltedPassword {
impl SaltedPassword {
/// See `scram-common.c : scram_SaltedPassword` for details.
/// Further reading: <https://datatracker.ietf.org/doc/html/rfc2898> (see `PBKDF2`).
/// TODO: implement proper password normalization required by the RFC!
pub fn new(password: &[u8], salt: &[u8], iterations: u32) -> SaltedPassword {
if !password.is_ascii() {
warn!("found non-ascii symbols in password! salted password might be broken");
}
let one = 1_u32.to_be_bytes(); // magic
let mut current = super::hmac_sha256(password, [salt, &one]);
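
For context, the routine above is the standard PBKDF2 construction from RFC 2898 with HMAC-SHA-256 as the PRF and a single output block, mirroring Postgres' scram_SaltedPassword. A self-contained sketch, using the hmac and sha2 crates instead of the crate-local hmac_sha256 helper, might look like this:

use hmac::{Hmac, Mac};
use sha2::Sha256;

fn hmac_sha256(key: &[u8], parts: &[&[u8]]) -> [u8; 32] {
    let mut mac = Hmac::<Sha256>::new_from_slice(key).expect("HMAC accepts any key length");
    for part in parts {
        mac.update(part);
    }
    let mut out = [0u8; 32];
    out.copy_from_slice(&mac.finalize().into_bytes());
    out
}

/// PBKDF2 with a single 32-byte output block:
///   U1 = HMAC(password, salt || INT(1))
///   Ui = HMAC(password, U(i-1))
///   result = U1 xor U2 xor ... xor Uc
fn salted_password(password: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] {
    let one = 1_u32.to_be_bytes();
    let mut current = hmac_sha256(password, &[salt, &one]);
    let mut result = current;
    for _ in 1..iterations {
        current = hmac_sha256(password, &[current.as_slice()]);
        for (res, cur) in result.iter_mut().zip(current.iter()) {
            *res ^= *cur;
        }
    }
    result
}
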
@@ -36,7 +30,6 @@ impl SaltedPassword {
}
/// Derive `ClientKey` from a salted hashed password.
#[cfg(test)]
pub fn client_key(&self) -> ScramKey {
super::hmac_sha256(&self.bytes, [b"Client Key".as_ref()]).into()
}

View File

@@ -1,14 +1,15 @@
//! Tools for SCRAM server secret management.
use super::{base64_decode_array, key::ScramKey, password::SaltedPassword};
use super::base64_decode_array;
use super::key::ScramKey;
/// Server secret is produced from [password](SaltedPassword)
/// Server secret is produced from [password](super::password::SaltedPassword)
/// and is used throughout the authentication process.
pub struct ServerSecret {
/// Number of iterations for `PBKDF2` function.
pub iterations: u32,
/// Salt used to hash user's password.
pub salt: Vec<u8>,
pub salt_base64: String,
/// Hashed `ClientKey`.
pub stored_key: ScramKey,
/// Used by client to verify server's signature.
@@ -29,7 +30,7 @@ impl ServerSecret {
let secret = ServerSecret {
iterations: iterations.parse().ok()?,
salt: base64::decode(salt).ok()?,
salt_base64: salt.to_owned(),
stored_key: base64_decode_array(stored_key)?.into(),
server_key: base64_decode_array(server_key)?.into(),
doomed: false,
@@ -47,31 +48,31 @@ impl ServerSecret {
Self {
iterations: 4096,
salt: mocked_salt.into(),
salt_base64: base64::encode(mocked_salt),
stored_key: ScramKey::default(),
server_key: ScramKey::default(),
doomed: true,
}
}
/// Check if this secret was derived from the given password.
pub fn matches_password(&self, password: &[u8]) -> bool {
let password = SaltedPassword::new(password, &self.salt, self.iterations);
self.server_key == password.server_key()
}
/// Build a new server secret from the prerequisites.
/// XXX: We only use this function in tests.
#[cfg(test)]
pub fn build(password: &[u8], salt: &[u8], iterations: u32) -> Self {
let password = SaltedPassword::new(password, salt, iterations);
pub fn build(password: &str, salt: &[u8], iterations: u32) -> Option<Self> {
// TODO: implement proper password normalization required by the RFC
if !password.is_ascii() {
return None;
}
Self {
let password = super::password::SaltedPassword::new(password.as_bytes(), salt, iterations);
Some(Self {
iterations,
salt: salt.into(),
salt_base64: base64::encode(salt),
stored_key: password.client_key().sha256(),
server_key: password.server_key(),
doomed: false,
}
})
}
}
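
The fallibility introduced above is purely input validation: until the normalization required by the RFC (SASLprep) is implemented, non-ASCII passwords are rejected up front. An illustrative test, assuming the `ServerSecret` defined above is in scope:

#[test]
fn build_rejects_non_ascii_passwords() {
    // ASCII passwords produce a secret...
    assert!(ServerSecret::build("password", b"salt", 4096).is_some());
    // ...while non-ASCII input is refused until proper normalization exists.
    assert!(ServerSecret::build("pässword", b"salt", 4096).is_none());
}
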
@@ -86,11 +87,17 @@ mod tests {
let stored_key = "D5h6KTMBlUvDJk2Y8ELfC1Sjtc6k9YHjRyuRZyBNJns=";
let server_key = "Pi3QHbcluX//NDfVkKlFl88GGzlJ5LkyPwcdlN/QBvI=";
let secret = format!("SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}");
let secret = format!(
"SCRAM-SHA-256${iterations}:{salt}${stored_key}:{server_key}",
iterations = iterations,
salt = salt,
stored_key = stored_key,
server_key = server_key,
);
let parsed = ServerSecret::parse(&secret).unwrap();
assert_eq!(parsed.iterations, iterations);
assert_eq!(base64::encode(parsed.salt), salt);
assert_eq!(parsed.salt_base64, salt);
assert_eq!(base64::encode(parsed.stored_key), stored_key);
assert_eq!(base64::encode(parsed.server_key), server_key);
@@ -99,9 +106,9 @@ mod tests {
#[test]
fn build_scram_secret() {
let salt = b"salt";
let secret = ServerSecret::build(b"password", salt, 4096);
let secret = ServerSecret::build("password", salt, 4096).unwrap();
assert_eq!(secret.iterations, 4096);
assert_eq!(secret.salt, salt);
assert_eq!(secret.salt_base64, base64::encode(salt));
assert_eq!(
base64::encode(secret.stored_key.as_ref()),
"lF4cRm/Jky763CN4HtxdHnjV4Q8AWTNlKvGmEFFU8IQ="
@@ -111,12 +118,4 @@ mod tests {
"ub8OgRsftnk2ccDMOt7ffHXNcikRkQkq1lh4xaAqrSw="
);
}
#[test]
fn secret_match_password() {
let password = b"password";
let secret = ServerSecret::build(password, b"salt", 2);
assert!(secret.matches_password(password));
assert!(!secret.matches_password(b"different"));
}
}

View File

@@ -126,12 +126,7 @@ fn main() -> anyhow::Result<()> {
return Ok(());
}
// important to keep the order of:
// 1. init logging
// 2. tracing panic hook
// 3. sentry
logging::init(LogFormat::from_config(&args.log_format)?)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
info!("version: {GIT_VERSION}");
let args_workdir = &args.datadir;

View File

@@ -424,16 +424,12 @@ async fn http1_handler(
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
// important to keep the order of:
// 1. init logging
// 2. tracing panic hook
// 3. sentry
logging::init(LogFormat::from_config(&args.log_format)?)?;
logging::replace_panic_hook_with_tracing_panic_hook().forget();
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
let args = Args::parse();
logging::init(LogFormat::from_config(&args.log_format)?)?;
info!("version: {GIT_VERSION}");
::metrics::set_build_info_metric(GIT_VERSION);

View File

@@ -1669,7 +1669,7 @@ class AbstractNeonCli(abc.ABC):
timeout=timeout,
)
if not res.returncode:
log.info(f"Run {res.args} success: {res.stdout}")
log.info(f"Run success: {res.stdout}")
elif check_return_code:
# this way the command output will be recorded and shown in the CI failure message
msg = f"""\
@@ -2079,9 +2079,6 @@ class NeonPageserver(PgProtocol):
".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs
".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress
".*task iteration took longer than the configured period.*",
# this is until #3501
".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant",
]
def start(
@@ -3463,14 +3460,6 @@ def wait_for_last_flush_lsn(
return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)
def wait_for_wal_insert_lsn(
env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId
) -> Lsn:
"""Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn."""
last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0])
return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn)
def fork_at_current_lsn(
env: NeonEnv,
pg: Postgres,

View File

@@ -3,15 +3,8 @@ from typing import List, Tuple
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PageserverHttpClient,
Postgres,
wait_for_last_flush_lsn,
wait_for_wal_insert_lsn,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn
from fixtures.types import Lsn
def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
@@ -331,7 +324,7 @@ def test_single_branch_get_tenant_size_grows(
# inserts is larger than gc_horizon. for example 0x20000 here hid the fact
# that the next_gc_cutoff could be smaller than initdb_lsn, which will
# obviously lead to issues when calculating the size.
gc_horizon = 0x38000
gc_horizon = 0x30000
neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}"
env = neon_env_builder.init_start()
@@ -341,75 +334,29 @@ def test_single_branch_get_tenant_size_grows(
http_client = env.pageserver.http_client()
collected_responses: List[Tuple[str, Lsn, int]] = []
collected_responses: List[Tuple[Lsn, int]] = []
size_debug_file = open(test_output_dir / "size_debug.html", "w")
def check_size_change(
current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int
):
if current_lsn - initdb_lsn >= gc_horizon:
def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int):
if current_lsn - initdb_lsn > gc_horizon:
assert (
size >= prev_size
size >= prev
), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size"
else:
assert (
size > prev_size
size > prev
), "tenant_size should grow, because we continue to add WAL to initial snapshot size"
def get_current_consistent_size(
env: NeonEnv,
pg: Postgres,
size_debug_file, # apparently there is no public signature for open()...
http_client: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Tuple[Lsn, int]:
consistent = False
size_debug = None
current_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id)
# We want to make sure we have a self-consistent set of values.
# Size changes with WAL, so we only use a (size, LSN) combination
# if the WAL insert LSN reported before and after fetching the
# tenant size is the same.
# Note that 'wait_for_last_flush_lsn' is not accurate enough: more
# WAL past the flush LSN can still arrive on the pageserver before
# we request the tenant size.
# In practice this usually takes a single iteration, so it's fine.
while not consistent:
size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
size_debug = http_client.tenant_size_debug(tenant_id)
after_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id)
consistent = current_lsn == after_lsn
current_lsn = after_lsn
size_debug_file.write(size_debug)
return (current_lsn, size)
with env.postgres.create_start(
branch_name,
tenant_id=tenant_id,
### autovacuum is disabled to limit WAL logging.
config_lines=["autovacuum=off"],
) as pg:
(initdb_lsn, size) = get_current_consistent_size(
env, pg, size_debug_file, http_client, tenant_id, timeline_id
)
collected_responses.append(("INITDB", initdb_lsn, size))
with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
with pg.cursor() as cur:
cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL) WITH (fillfactor = 40)")
(current_lsn, size) = get_current_consistent_size(
env, pg, size_debug_file, http_client, tenant_id, timeline_id
)
collected_responses.append(("CREATE", current_lsn, size))
cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)")
batch_size = 100
for i in range(3):
i = 0
while True:
with pg.cursor() as cur:
cur.execute(
f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)",
@@ -418,24 +365,27 @@ def test_single_branch_get_tenant_size_grows(
i += 1
(current_lsn, size) = get_current_consistent_size(
env, pg, size_debug_file, http_client, tenant_id, timeline_id
)
current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
prev_size = collected_responses[-1][2]
if size == 0:
assert prev_size == 0
else:
# branch start shouldn't be past gc_horizon yet
# thus the size should grow as we insert more data
# "gc_horizon" is tuned so that it kicks in _after_ the
# insert phase, but before the update phase ends.
assert (
current_lsn - initdb_lsn <= gc_horizon
), "Tuning of GC window is likely out-of-date"
assert size > prev_size
size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
collected_responses.append(("INSERT", current_lsn, size))
size_debug = http_client.tenant_size_debug(tenant_id)
size_debug_file.write(size_debug)
if len(collected_responses) > 0:
prev = collected_responses[-1][1]
if size == 0:
assert prev == 0
else:
# branch start shouldn't be past gc_horizon yet
# thus the size should grow as we insert more data
assert current_lsn - initdb_lsn <= gc_horizon
assert size > prev
collected_responses.append((current_lsn, size))
if len(collected_responses) > 2:
break
while True:
with pg.cursor() as cur:
@@ -447,15 +397,18 @@ def test_single_branch_get_tenant_size_grows(
if updated == 0:
break
(current_lsn, size) = get_current_consistent_size(
env, pg, size_debug_file, http_client, tenant_id, timeline_id
)
current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
prev_size = collected_responses[-1][2]
size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
size_debug = http_client.tenant_size_debug(tenant_id)
size_debug_file.write(size_debug)
collected_responses.append(("UPDATE", current_lsn, size))
prev = collected_responses[-1][1]
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
collected_responses.append((current_lsn, size))
while True:
with pg.cursor() as cur:
@@ -465,47 +418,40 @@ def test_single_branch_get_tenant_size_grows(
if deleted == 0:
break
(current_lsn, size) = get_current_consistent_size(
env, pg, size_debug_file, http_client, tenant_id, timeline_id
)
current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
prev_size = collected_responses[-1][2]
size = http_client.tenant_size(tenant_id)
prev = collected_responses[-1][1]
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
collected_responses.append(("DELETE", current_lsn, size))
collected_responses.append((current_lsn, size))
with pg.cursor() as cur:
cur.execute("DROP TABLE t0")
# The size of the tenant should still be as large as before we dropped
# the table, because the drop operation can still be undone in the PITR
# defined by gc_horizon.
(current_lsn, size) = get_current_consistent_size(
env, pg, size_debug_file, http_client, tenant_id, timeline_id
)
current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
prev_size = collected_responses[-1][2]
size = http_client.tenant_size(tenant_id)
prev = collected_responses[-1][1]
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size)
check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
collected_responses.append(("DROP", current_lsn, size))
collected_responses.append((current_lsn, size))
# this isn't too many lines to log for later reference. while developing
# these tests we observed that locally the value is a bit larger than
# what we get in the CI.
for phase, lsn, size in collected_responses:
log.info(f"collected: {phase}, {lsn}, {size}")
for lsn, size in collected_responses:
log.info(f"collected: {lsn}, {size}")
env.pageserver.stop()
env.pageserver.start()
size_after = http_client.tenant_size(tenant_id)
size_debug = http_client.tenant_size_debug(tenant_id)
size_debug_file.write(size_debug)
size_debug_file.close()
prev = collected_responses[-1][2]
size_after = http_client.tenant_size(tenant_id)
prev = collected_responses[-1][1]
assert size_after == prev, "size after restarting pageserver should not have changed"

View File

@@ -1,34 +0,0 @@
from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn
#
# Test UNLOGGED tables/relations. Postgres copies init fork contents to main
# fork to reset them during recovery. In Neon, pageserver directly sends init
# fork contents as main fork during basebackup.
#
def test_unlogged(neon_simple_env: NeonEnv):
env = neon_simple_env
env.neon_cli.create_branch("test_unlogged", "empty")
pg = env.postgres.create_start("test_unlogged")
conn = pg.connect()
cur = conn.cursor()
cur.execute("CREATE UNLOGGED TABLE iut (id int);")
# create index to test unlogged index relation as well
cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);")
cur.execute("INSERT INTO iut values (42);")
# create another compute to fetch the initial empty contents from the pageserver
fork_at_current_lsn(env, pg, "test_unlogged_basebackup", "test_unlogged")
pg2 = env.postgres.create_start(
"test_unlogged_basebackup",
)
conn2 = pg2.connect()
cur2 = conn2.cursor()
# after restart table should be empty but valid
cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)")
cur2.execute("EXECUTE iut_plan (43);")
cur2.execute("SELECT * FROM iut")
assert cur2.fetchall() == [(43,)]