Compare commits

..

23 Commits

Author SHA1 Message Date
Bojan Serafimov 87d75f6070 Test on real data 2022-12-02 15:28:18 -05:00
Bojan Serafimov f7201cd3cf refactor test 2022-12-02 15:07:59 -05:00
Bojan Serafimov 2dcbdd9e47 Use i128 2022-12-02 14:52:55 -05:00
Bojan Serafimov 72db121a8a Add todo 2022-12-02 09:17:20 -05:00
Bojan Serafimov 5444e6ff32 Add todo 2022-12-01 23:04:06 -05:00
Bojan Serafimov 809b04eccb Add todo 2022-12-01 22:46:57 -05:00
Bojan Serafimov e7b2b5ae12 Remove comment 2022-12-01 21:59:34 -05:00
Bojan Serafimov f476e56315 Actually implement 2022-12-01 21:58:34 -05:00
Bojan Serafimov 72835371cc Add mock persistent BST implementation 2022-12-01 20:49:43 -05:00
Bojan Serafimov 924d91c47d Bench layer map using persistent range queries 2022-12-01 16:42:24 -05:00
Bojan Serafimov 4e61edef7c wip 2022-12-01 15:05:21 -05:00
Bojan Serafimov b20df9b90a Merge branch 'main' into persistent_range_query 2022-12-01 14:30:37 -05:00
Egor Suvorov 8261455019 persistent_range_query: add layer_map_test 2022-11-24 04:47:19 +02:00
Egor Suvorov aad88d6c39 persistent_range_query: add stress test 2022-11-24 03:50:18 +02:00
Egor Suvorov 6188315b51 persistent_range_query: more refs 2022-11-24 03:45:02 +02:00
Egor Suvorov 3a4b932d8a Draft generic persistent segment tree 2022-11-24 02:31:48 +02:00
Egor Suvorov cc2b3c986c Simplify code: fewer lifetimes, auto-impl VecFrozenVersion 2022-11-24 02:11:06 +02:00
Egor Suvorov c250c2664b Always require Clone for RangeQueryResult 2022-11-24 01:42:14 +02:00
Egor Suvorov e5550a01b0 VecVersion: make it read-only, only the latest version can be modified 2022-11-24 01:28:13 +02:00
Egor Suvorov 45617ceaef RangeModification: add is_no_op/is_reinitialization 2022-11-24 00:12:05 +02:00
Egor Suvorov 29b39301fe RangeQueryResult: do not require Clone and do not provide default combine/add implementations 2022-11-24 00:11:52 +02:00
Egor Suvorov b01a93be60 persistent_range_query: second draft 2022-11-23 23:09:25 +02:00
    Move some generic parameters to associated types.
    Work around private fields of SumResult<T>
Egor Suvorov 4c68d019e3 persistent_range_query: first draft 2022-11-23 20:52:34 +02:00
88 changed files with 3300 additions and 3129 deletions
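Taken together, the commit messages describe a small generic persistent range query library: query results that must be `Clone` (c250c2664b), range modifications with `is_no_op`/`is_reinitialization` hooks (45617ceaef), associated types instead of some generic parameters (b01a93be60), and `i128` arithmetic for sums over real data (2dcbdd9e47). The diffs below don't include the library's source, so here is a minimal sketch of what those traits might look like; the trait and method names come from the commit subjects, while the signatures and the toy `Sum` aggregate are assumptions for illustration only:

```rust
/// An aggregate computed over a range (e.g. a sum), stored in each
/// segment tree node. Per commit c250c2664b, results must be Clone.
pub trait RangeQueryResult: Clone {
    /// Neutral result for an empty range.
    fn new_for_empty_range() -> Self;
    /// Combine the results of two adjacent ranges into one.
    fn combine(left: &Self, right: &Self) -> Self;
}

/// A modification that can be lazily applied to a whole range
/// (cf. commit 45617ceaef). The associated type follows the
/// "move generic parameters to associated types" commit.
pub trait RangeModification {
    type Result: RangeQueryResult;
    /// Nothing to do; the subtree can be left untouched.
    fn is_no_op(&self) -> bool;
    /// Overwrites the whole range, so pending modifications
    /// below this node can be discarded.
    fn is_reinitialization(&self) -> bool;
    /// Apply the modification to an already-computed result.
    fn apply(&self, result: &mut Self::Result);
}

/// Toy aggregate: sums over integer ranges, kept in i128 so that
/// summing many large values cannot overflow (cf. "Use i128").
#[derive(Clone)]
pub struct Sum(pub i128);

impl RangeQueryResult for Sum {
    fn new_for_empty_range() -> Self {
        Sum(0)
    }
    fn combine(left: &Self, right: &Self) -> Self {
        Sum(left.0 + right.0)
    }
}
```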

View File

@@ -11,3 +11,6 @@ opt-level = 3
[profile.dev]
# Turn on a small amount of optimization in Development mode.
opt-level = 1
[alias]
build_testing = ["build", "--features", "testing"]
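With this alias in place, `cargo build_testing` expands to `cargo build --features testing`, matching the `--features testing` flags that the CI workflow later in this compare passes explicitly.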

View File

@@ -1,8 +1,7 @@
#!/bin/sh
# fetch params from meta-data service
# get instance id from meta-data service
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone)
# store fqdn hostname in var
HOST=$(hostname -f)
@@ -17,8 +16,7 @@ cat <<EOF | tee /tmp/payload
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 9898,
"active": false,
"availability_zone_id": "${AZ_ID}"
"active": false
}
EOF

View File

@@ -5,7 +5,7 @@ After=network.target auditd.service
[Service]
Type=simple
User=pageserver
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_PAGESERVER }}
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed

View File

@@ -5,7 +5,7 @@ After=network.target auditd.service
[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib SENTRY_DSN={{ SENTRY_URL_SAFEKEEPER }}
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}{{ hostname_suffix }}:6500 --listen-http {{ inventory_hostname }}{{ hostname_suffix }}:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ safekeeper_s3_prefix }}"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed

View File

@@ -1,53 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: staging
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx-internal
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
hosts:
- host: storage-broker-zeta.eu-west-1.aws.neon.build
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- storage-broker-zeta.eu-west-1.aws.neon.build
secretName: storage-broker-tls
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,53 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: staging
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx-internal
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
hosts:
- host: storage-broker-beta.us-east-2.aws.neon.build
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- storage-broker-beta.us-east-2.aws.neon.build
secretName: storage-broker-tls
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,54 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: neon-stress
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: alb
alb.ingress.kubernetes.io/healthcheck-path: /status
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
alb.ingress.kubernetes.io/scheme: "internal"
alb.ingress.kubernetes.io/target-type: "ip"
alb.ingress.kubernetes.io/ssl-redirect: "443"
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
hosts:
- host: storage-broker-stress.stage.neon.tech
paths:
- path: /
pathType: Prefix
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,53 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx-internal
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
hosts:
- host: storage-broker-epsilon.ap-southeast-1.aws.neon.tech
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- storage-broker-epsilon.ap-southeast-1.aws.neon.tech
secretName: storage-broker-tls
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,53 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx-internal
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
hosts:
- host: storage-broker-gamma.eu-central-1.aws.neon.tech
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- storage-broker-gamma.eu-central-1.aws.neon.tech
secretName: storage-broker-tls
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,53 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx-internal
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
hosts:
- host: storage-broker-delta.us-east-2.aws.neon.tech
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- storage-broker-delta.us-east-2.aws.neon.tech
secretName: storage-broker-tls
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,31 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-west-2.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-west-2
zenith_region_slug: us-west-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-west-2.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -1,53 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: nginx-internal
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
cert-manager.io/cluster-issuer: "cert-manager-clusterissuer"
hosts:
- host: storage-broker-eta.us-west-2.aws.neon.tech
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- storage-broker-eta.us-west-2.aws.neon.tech
secretName: storage-broker-tls
metrics:
enabled: false
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,54 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: production
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: alb
alb.ingress.kubernetes.io/healthcheck-path: /status
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
alb.ingress.kubernetes.io/scheme: "internal"
alb.ingress.kubernetes.io/target-type: "ip"
alb.ingress.kubernetes.io/ssl-redirect: "443"
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
hosts:
- host: storage-broker.neon.tech
paths:
- path: /
pathType: Prefix
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -1,54 +0,0 @@
# Helm chart values for neon-storage-broker
podLabels:
neon_env: staging
neon_service: storage-broker
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: alb
alb.ingress.kubernetes.io/healthcheck-path: /status
alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
alb.ingress.kubernetes.io/scheme: "internal"
alb.ingress.kubernetes.io/target-type: "ip"
alb.ingress.kubernetes.io/ssl-redirect: "443"
alb.ingress.kubernetes.io/backend-protocol-version: "GRPC"
hosts:
- host: storage-broker.stage.neon.tech
paths:
- path: /
pathType: Prefix
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack
extraManifests:
- apiVersion: operator.victoriametrics.com/v1beta1
kind: VMServiceScrape
metadata:
name: "{{ include \"neon-storage-broker.fullname\" . }}"
labels:
helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
app.kubernetes.io/name: neon-storage-broker
app.kubernetes.io/instance: neon-storage-broker
app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
app.kubernetes.io/managed-by: Helm
namespace: "{{ .Release.Namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/name: "neon-storage-broker"
endpoints:
- port: broker
path: /metrics
interval: 10s
scrapeTimeout: 10s
namespaceSelector:
matchNames:
- "{{ .Release.Namespace }}"

View File

@@ -231,11 +231,8 @@ jobs:
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE neondb SET seqscan_prefetch_buffers=10"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
@@ -377,11 +374,8 @@ jobs:
- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

View File

@@ -100,11 +100,11 @@ jobs:
run: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES=""
CARGO_FEATURES="--features testing"
CARGO_FLAGS="--locked --timings $CARGO_FEATURES"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FEATURES="--features profiling"
CARGO_FEATURES="--features testing,profiling"
CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
@@ -539,9 +539,9 @@ jobs:
# `neondatabase/neon` contains multiple binaries; all of them feed the same version input into the same version formatting library.
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
# Regular pageserver version string looks like
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c features: []
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
# Bad versions might look like:
# Neon page server git-env:local features: [""]
# Neon page server git-env:local failpoints: true, features: ["testing"]
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
@@ -555,6 +555,11 @@ jobs:
exit 1
fi
if ! echo "$pageserver_version" | grep -qv '"testing"' ; then
echo "Pageserver version should have no testing feature enabled"
exit 1
fi
- name: Verify docker-compose example
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
@@ -663,11 +668,11 @@ jobs:
- id: set-matrix
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "staging.neon-storage-broker", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "storage_broker_ns": "neon-stress-storage-broker", "storage_broker_config": "neon-stress.neon-storage-broker", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA", "console_api_key_secret": "NEON_STAGING_API_KEY"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA", "console_api_key_secret": "NEON_CAPTEST_API_KEY"}'
echo "include=[$STAGING, $NEON_STRESS]" >> $GITHUB_OUTPUT
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}'
echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
@@ -727,7 +732,7 @@ jobs:
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }}
rm -f neon_install.tar.gz .neon_current_version
deploy-new:
@@ -765,7 +770,7 @@ jobs:
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-pr-test-new:
@@ -798,7 +803,7 @@ jobs:
./get_binaries.sh
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-prod-new:
@@ -838,7 +843,7 @@ jobs:
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
@@ -880,49 +885,8 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-staging:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
env:
KUBECONFIG: .kubeconfig
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Add curl
run: apt update && apt install curl -y
- name: Store kubeconfig file
run: |
echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- name: Setup helm v3
run: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add neondatabase https://neondatabase.github.io/helm-charts
- name: Deploy storage-broker
run:
DOCKER_TAG=${{ needs.tag.outputs.build-tag }}
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-new:
runs-on: [ self-hosted, dev, x64 ]
@@ -961,54 +925,19 @@ jobs:
- name: Re-deploy scram proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
- name: Re-deploy link proxy
if: matrix.deploy_link_proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
- name: Re-deploy legacy scram proxy
if: matrix.deploy_legacy_scram_proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-dev-new:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Deploy storage-broker
run:
DOCKER_TAG=${{ needs.tag.outputs.build-tag }}
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-prod-new:
runs-on: prod
@@ -1026,8 +955,6 @@ jobs:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: us-west-2
target_cluster: prod-us-west-2-eta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
@@ -1047,46 +974,7 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s
deploy-storage-broker-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: us-west-2
target_cluster: prod-us-west-2-eta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Deploy storage-broker
run:
DOCKER_TAG=${{ needs.tag.outputs.build-tag }}
helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
promote-compatibility-data:
runs-on: [ self-hosted, dev, x64 ]

Cargo.lock (generated, 347 changes)
View File

@@ -66,6 +66,15 @@ dependencies = [
"backtrace",
]
[[package]]
name = "archery"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02"
dependencies = [
"static_assertions",
]
[[package]]
name = "arrayvec"
version = "0.7.2"
@@ -85,7 +94,7 @@ dependencies = [
"num-traits",
"rusticata-macros",
"thiserror",
"time",
"time 0.3.15",
]
[[package]]
@@ -190,7 +199,7 @@ dependencies = [
"http",
"hyper",
"ring",
"time",
"time 0.3.15",
"tokio",
"tower",
"tracing",
@@ -331,7 +340,7 @@ dependencies = [
"percent-encoding",
"regex",
"ring",
"time",
"time 0.3.15",
"tracing",
]
@@ -468,7 +477,7 @@ dependencies = [
"itoa",
"num-integer",
"ryu",
"time",
"time 0.3.15",
]
[[package]]
@@ -713,23 +722,20 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.23"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
checksum = "bfd4d1b31faaa3a89d7934dbded3111da0d2ef28e3ebccdb4f0179f5929d1ef1"
dependencies = [
"iana-time-zone",
"js-sys",
"num-integer",
"num-traits",
"serde",
"time 0.1.44",
"wasm-bindgen",
"winapi",
]
[[package]]
name = "chunked_transfer"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fff857943da45f546682664a79488be82e69e43c1a7a2307679ab9afb3a66d2e"
[[package]]
name = "ciborium"
version = "0.2.0"
@@ -1231,16 +1237,6 @@ dependencies = [
"uuid 0.8.2",
]
[[package]]
name = "debugid"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d"
dependencies = [
"serde",
"uuid 1.2.1",
]
[[package]]
name = "der-parser"
version = "8.1.0"
@@ -1410,21 +1406,6 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.1.0"
@@ -1566,7 +1547,7 @@ checksum = "4eb1a864a501629691edf6c15a593b7a51eebaa1e8468e9ddc623de7c9b58ec6"
dependencies = [
"cfg-if",
"libc",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
]
[[package]]
@@ -1698,17 +1679,6 @@ dependencies = [
"digest",
]
[[package]]
name = "hostname"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867"
dependencies = [
"libc",
"match_cfg",
"winapi",
]
[[package]]
name = "http"
version = "0.2.8"
@@ -1816,19 +1786,6 @@ dependencies = [
"tokio-io-timeout",
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper",
"native-tls",
"tokio",
"tokio-native-tls",
]
[[package]]
name = "iana-time-zone"
version = "0.1.51"
@@ -2054,12 +2011,6 @@ dependencies = [
"serde",
]
[[package]]
name = "match_cfg"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4"
[[package]]
name = "matchers"
version = "0.1.0"
@@ -2162,7 +2113,7 @@ checksum = "57ee1c23c7c63b0c9250c339ffdc69255f110b298b901b9f6c82547b7b87caaf"
dependencies = [
"libc",
"log",
"wasi",
"wasi 0.11.0+wasi-snapshot-preview1",
"windows-sys",
]
@@ -2172,24 +2123,6 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "native-tls"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e"
dependencies = [
"lazy_static",
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "nb"
version = "0.1.3"
@@ -2360,62 +2293,12 @@ version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "openssl"
version = "0.10.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "020433887e44c27ff16365eaa2d380547a94544ad509aff6eb5b6e3e0b27b376"
dependencies = [
"bitflags",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b501e44f11665960c7e7fcf062c7d96a14ade4aa98116c004b2e37b5be7d736c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "openssl-probe"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
[[package]]
name = "openssl-sys"
version = "0.9.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07d5c8cb6e57b3a3612064d7b18b117912b4ce70955c2504d4b741c9e244b132"
dependencies = [
"autocfg",
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "os_info"
version = "3.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4750134fb6a5d49afc80777394ad5d95b04bc12068c6abb92fae8f43817270f"
dependencies = [
"log",
"serde",
"winapi",
]
[[package]]
name = "os_str_bytes"
version = "6.3.0"
@@ -2460,6 +2343,7 @@ dependencies = [
"num-traits",
"once_cell",
"pageserver_api",
"persistent_range_query",
"pin-project-lite",
"postgres",
"postgres-protocol",
@@ -2471,6 +2355,7 @@ dependencies = [
"rand",
"regex",
"remote_storage",
"rpds",
"rstar",
"scopeguard",
"serde",
@@ -2577,6 +2462,14 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
[[package]]
name = "persistent_range_query"
version = "0.1.0"
dependencies = [
"rand",
"workspace_hack",
]
[[package]]
name = "petgraph"
version = "0.6.2"
@@ -2637,12 +2530,6 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkg-config"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
[[package]]
name = "plotters"
version = "0.3.4"
@@ -3117,7 +3004,7 @@ checksum = "ffbe84efe2f38dea12e9bfc1f65377fdf03e53a18cb3b995faedf7934c7e785b"
dependencies = [
"pem",
"ring",
"time",
"time 0.3.15",
"yasna",
]
@@ -3205,12 +3092,10 @@ dependencies = [
"http-body",
"hyper",
"hyper-rustls",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
@@ -3220,7 +3105,6 @@ dependencies = [
"serde_json",
"serde_urlencoded",
"tokio",
"tokio-native-tls",
"tokio-rustls",
"tower-service",
"url",
@@ -3289,6 +3173,15 @@ dependencies = [
"regex",
]
[[package]]
name = "rpds"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000"
dependencies = [
"archery",
]
[[package]]
name = "rstar"
version = "0.9.3"
@@ -3557,89 +3450,6 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
[[package]]
name = "sentry"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6425e2a14006415449fb0a3e9a119df5032f59e7a2d9350cf8738eca290dfc5"
dependencies = [
"httpdate",
"native-tls",
"reqwest",
"sentry-backtrace",
"sentry-contexts",
"sentry-core",
"sentry-panic",
"tokio",
"ureq",
]
[[package]]
name = "sentry-backtrace"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04d79c194e5c20fe602e81faf39f3cff0f275ec61283f437a892cfd6544da592"
dependencies = [
"backtrace",
"once_cell",
"regex",
"sentry-core",
]
[[package]]
name = "sentry-contexts"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e1c2a57601eeb870521cc241caee27e57a012f297ece3c1b7eee87f2a531edb5"
dependencies = [
"hostname",
"libc",
"os_info",
"rustc_version 0.4.0",
"sentry-core",
"uname",
]
[[package]]
name = "sentry-core"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8be90ea119c6d0664c8ab534013bc9e90355e7004d782d5d1492ca513393b929"
dependencies = [
"once_cell",
"rand",
"sentry-types",
"serde",
"serde_json",
]
[[package]]
name = "sentry-panic"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec217c3290e3f0d128154da731c28efa8f62cf8e3c3a006fd4bc3407c959176"
dependencies = [
"sentry-backtrace",
"sentry-core",
]
[[package]]
name = "sentry-types"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67ad85f0addf16310a1fbcf3facc7acb17ef5dbf6ae059d2f3c38442a471404d"
dependencies = [
"debugid 0.8.0",
"getrandom",
"hex",
"serde",
"serde_json",
"thiserror",
"time",
"url",
"uuid 1.2.1",
]
[[package]]
name = "serde"
version = "1.0.145"
@@ -3696,7 +3506,7 @@ dependencies = [
"serde",
"serde_json",
"serde_with_macros",
"time",
"time 0.3.15",
]
[[package]]
@@ -3787,7 +3597,7 @@ dependencies = [
"num-bigint",
"num-traits",
"thiserror",
"time",
"time 0.3.15",
]
[[package]]
@@ -3842,6 +3652,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "storage_broker"
version = "0.1.0"
@@ -3927,7 +3743,7 @@ version = "8.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f551f902d5642e58039aee6a9021a61037926af96e071816361644983966f540"
dependencies = [
"debugid 0.7.3",
"debugid",
"memmap2",
"stable_deref_trait",
"uuid 0.8.2",
@@ -4049,6 +3865,17 @@ dependencies = [
"once_cell",
]
[[package]]
name = "time"
version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255"
dependencies = [
"libc",
"wasi 0.10.0+wasi-snapshot-preview1",
"winapi",
]
[[package]]
name = "time"
version = "0.3.15"
@@ -4134,16 +3961,6 @@ dependencies = [
"syn",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f7d995660bd2b7f8c1568414c1126076c13fbb725c40112dc0120b78eb9b717b"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.6"
@@ -4476,15 +4293,6 @@ version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
[[package]]
name = "uname"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b72f89f0ca32e4db1c04e2a72f5345d59796d4866a1ee0609084569f73683dc8"
dependencies = [
"libc",
]
[[package]]
name = "unicode-bidi"
version = "0.3.8"
@@ -4524,20 +4332,6 @@ version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a"
[[package]]
name = "ureq"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97acb4c28a254fd7a4aeec976c46a7fa404eac4d7c134b30c75144846d7cb8f"
dependencies = [
"base64",
"chunked_transfer",
"log",
"native-tls",
"once_cell",
"url",
]
[[package]]
name = "url"
version = "2.3.1"
@@ -4547,7 +4341,6 @@ dependencies = [
"form_urlencoded",
"idna",
"percent-encoding",
"serde",
]
[[package]]
@@ -4580,7 +4373,6 @@ dependencies = [
"rustls",
"rustls-pemfile",
"rustls-split",
"sentry",
"serde",
"serde_json",
"serde_with",
@@ -4624,12 +4416,6 @@ version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.4"
@@ -4687,6 +4473,12 @@ dependencies = [
"try-lock",
]
[[package]]
name = "wasi"
version = "0.10.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
@@ -4892,6 +4684,7 @@ dependencies = [
"clap 4.0.15",
"crossbeam-utils",
"either",
"fail",
"futures-channel",
"futures-task",
"futures-util",
@@ -4914,12 +4707,12 @@ dependencies = [
"serde",
"stable_deref_trait",
"syn",
"time 0.3.15",
"tokio",
"tokio-util",
"tower",
"tracing",
"tracing-core",
"url",
]
[[package]]
@@ -4937,7 +4730,7 @@ dependencies = [
"oid-registry",
"rusticata-macros",
"thiserror",
"time",
"time 0.3.15",
]
[[package]]
@@ -4961,7 +4754,7 @@ version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "346d34a236c9d3e5f3b9b74563f238f955bbd05fa0b8b4efa53c130c43982f4c"
dependencies = [
"time",
"time 0.3.15",
]
[[package]]

View File

@@ -2,20 +2,29 @@
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
The project used to be called "Zenith". Many of the commands and code comments
still refer to "zenith", but we are in the process of renaming things.
## Quick start
Try the [Neon Free Tier](https://neon.tech/docs/introduction/technical-preview-free-tier/) to create a serverless Postgres instance. Then connect to it with your preferred Postgres client (psql, dbeaver, etc) or use the online [SQL Editor](https://neon.tech/docs/get-started-with-neon/query-with-neon-sql-editor/). See [Connect from any application](https://neon.tech/docs/connect/connect-from-any-app/) for connection instructions.
[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor.
Alternatively, compile and run the project [locally](#running-local-installation).
## Architecture overview
A Neon installation consists of compute nodes and the Neon storage engine. Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
A Neon installation consists of compute nodes and a Neon storage engine.
Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes.
- Safekeepers. The safekeepers form a redundant WAL service that receives WAL from the compute node and stores it durably until it has been processed by the pageserver and uploaded to cloud storage.
- WAL service. The service receives WAL from the compute node and ensures that it is stored durably.
See developer documentation in [/docs/SUMMARY.md](/docs/SUMMARY.md) for more information.
Pageserver consists of:
- Repository - Neon storage implementation.
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
- Page service - service that communicates with compute nodes and responds with pages from the repository.
- WAL redo - service that builds pages from base images and WAL records on Page service request
## Running local installation
@@ -213,27 +222,19 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
```sh
git clone --recursive https://github.com/neondatabase/neon.git
make
CARGO_BUILD_FLAGS="--features=testing" make
./scripts/pytest
```
## Documentation
[/docs/](/docs/) Contains a top-level overview of all available markdown documentation.
We now use README files to cover design ideas and overall architecture for each module, plus `rustdoc`-style documentation comments. See also [/docs/](/docs/), a top-level overview of all available markdown documentation.
- [/docs/sourcetree.md](/docs/sourcetree.md) contains overview of source tree layout.
To view your `rustdoc` documentation in a browser, try running `cargo doc --no-deps --open`
See also README files in some source directories, and `rustdoc` style documentation comments.
Other resources:
- [SELECT 'Hello, World'](https://neon.tech/blog/hello-world/): Blog post by Nikita Shamgunov on the high level architecture
- [Architecture decisions in Neon](https://neon.tech/blog/architecture-decisions-in-neon/): Blog post by Heikki Linnakangas
- [Neon: Serverless PostgreSQL!](https://www.youtube.com/watch?v=rES0yzeERns): Presentation on storage system by Heikki Linnakangas in the CMU Database Group seminar series
### Postgres-specific terms
Due to Neon's very close relationship with PostgreSQL internals, numerous specific terms are used.

View File

@@ -5,7 +5,7 @@ edition = "2021"
[dependencies]
anyhow = "1.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
chrono = "0.4"
clap = "4.0"
env_logger = "0.9"
futures = "0.3.13"

View File

@@ -14,19 +14,17 @@
use std::ffi::OsStr;
use std::io::Write;
use std::os::unix::prelude::AsRawFd;
use std::os::unix::process::CommandExt;
use std::path::{Path, PathBuf};
use std::path::Path;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
use anyhow::Context;
use anyhow::{anyhow, bail, Context, Result};
use nix::errno::Errno;
use nix::fcntl::{FcntlArg, FdFlag};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use utils::pid_file::{self, PidFileRead};
use utils::lock_file;
// These constants control the loop used to poll for process start / stop.
//
@@ -88,14 +86,6 @@ where
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(path) => {
pre_exec_create_pidfile(filled_cmd, path);
path
}
InitialPidFile::Expect(path) => path,
};
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
@@ -105,8 +95,29 @@ where
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(target_pid_file_path) => {
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
lock_file::LockCreationResult::Created { .. } => {
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
// as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
}
lock_file::LockCreationResult::AlreadyLocked { .. } => {
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
}
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!(
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
)))
}
}
None
}
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
};
for retries in 0..RETRIES {
match process_started(pid, Some(pid_file_to_check), &process_status_check) {
match process_started(pid, pid_file_to_check, &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
@@ -154,27 +165,12 @@ pub fn send_stop_child_process(child: &std::process::Child) -> anyhow::Result<()
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
let pid = match pid_file::read(pid_file)
.with_context(|| format!("read pid_file {pid_file:?}"))?
{
PidFileRead::NotExist => {
println!("{process_name} is already stopped: no pid file present at {pid_file:?}");
return Ok(());
}
PidFileRead::NotHeldByAnyProcess(_) => {
// Don't try to kill according to file contents because the pid might have been re-used by another process.
// Don't delete the file either, it can race with new pid file creation.
// Read `pid_file` module comment for details.
println!(
"No process is holding the pidfile. The process must have already exited. Leave in place to avoid race conditions: {pid_file:?}"
);
return Ok(());
}
PidFileRead::LockedByOtherProcess(pid) => pid,
};
// XXX the pid could become invalid (and recycled) at any time before the kill() below.
if !pid_file.exists() {
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
return Ok(());
}
let pid = read_pidfile(pid_file)?;
// send signal
let sig = if immediate {
print!("Stopping {process_name} with pid {pid} immediately..");
Signal::SIGQUIT
@@ -186,9 +182,8 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
match kill(pid, sig) {
Ok(()) => (),
Err(Errno::ESRCH) => {
// Again, don't delete the pid file. The unlink can race with a new pid file being created.
println!(
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found. Likely the pid got recycled. Lucky we didn't harm anyone."
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
);
return Ok(());
}
@@ -257,69 +252,6 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd
}
/// Add a `pre_exec` to the cmd that, in between fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
/// will remain held until the cmd exits.
fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
where
P: Into<PathBuf>,
{
let path: PathBuf = path.into();
// SAFETY
// pre_exec is marked unsafe because it runs between fork and exec.
// Why is that dangerous in various ways?
// Long answer: https://github.com/rust-lang/rust/issues/39575
// Short answer: in a multi-threaded program, other threads may have
// been inside of critical sections at the time of fork. In the
// original process, that was all right, assuming they protected
// the critical sections appropriately, e.g., through locks.
// Fork adds another process to the mix that
// 1. Has a single thread T
// 2. In an exact copy of the address space at the time of fork.
// A variety of problems can occur now:
// 1. T tries to grab a lock that was locked at the time of fork.
// It will wait forever since in its address space, the lock
// is in state 'taken' but the thread that would unlock it is
// not there.
// 2. A rust object that represented some external resource in the
// parent now got implicitly copied by the fork, even though
// the object's type is not `Copy`. The parent program may use
// non-copyability as way to enforce unique ownership of an
// external resource in the typesystem. The fork breaks that
// assumption, as now both parent and child process have an
// owned instance of the object that represents the same
// underlying resource.
// While these seem like niche problems, (1) in particular is
// highly relevant. For example, `malloc()` may grab a mutex internally,
// and so, if we forked while another thread was malloc'ing and our
// pre_exec closure allocates as well, it will block on the malloc
// mutex forever
//
// The proper solution is to only use C library functions that are marked
// "async-signal-safe": https://man7.org/linux/man-pages/man7/signal-safety.7.html
//
// With this specific pre_exec() closure, the non-error path doesn't allocate.
// The error path uses `anyhow`, and hence does allocate.
// We take our chances there, hoping that any potential disaster is constrained
// to the child process (e.g., malloc has no state outside of the child process).
// Last, `expect` prints to stderr, and stdio is not async-signal-safe.
// Again, we take our chances, making the same assumptions as for malloc.
unsafe {
cmd.pre_exec(move || {
let file = pid_file::claim_for_current_process(&path).expect("claim pid file");
// Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile
// remains locked after exec.
nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty()))
.expect("remove FD_CLOEXEC");
// Don't run drop(file), it would close the file before we actually exec.
std::mem::forget(file);
Ok(())
});
}
cmd
}
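// Editor's usage sketch (not part of this patch; the binary and pidfile
// paths are placeholders): the child claims the pidfile between fork()
// and exec(), and the fcntl lock stays held across exec because
// FD_CLOEXEC was cleared above.
#[allow(dead_code)]
fn spawn_with_pidfile() -> std::io::Result<std::process::Child> {
let mut cmd = std::process::Command::new("/usr/local/bin/pageserver");
pre_exec_create_pidfile(&mut cmd, "/storage/pageserver/pageserver.pid");
cmd.spawn()
}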
fn process_started<F>(
pid: Pid,
pid_file_to_check: Option<&Path>,
@@ -330,11 +262,14 @@ where
{
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => match pid_file::read(pid_file_path)? {
PidFileRead::NotExist => Ok(false),
PidFileRead::LockedByOtherProcess(pid_in_file) => Ok(pid_in_file == pid),
PidFileRead::NotHeldByAnyProcess(_) => Ok(false),
},
Some(pid_file_path) => {
if pid_file_path.exists() {
let pid_in_file = read_pidfile(pid_file_path)?;
Ok(pid_in_file == pid)
} else {
Ok(false)
}
}
None => Ok(true),
},
Ok(false) => Ok(false),
@@ -342,6 +277,21 @@ where
}
}
/// Read a PID file
///
/// We expect a file that contains a single integer.
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
if pid < 1 {
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
}
Ok(Pid::from_raw(pid))
}
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
match kill(pid, None) {
// Process exists, keep waiting

View File

@@ -324,7 +324,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
pg_version,
)
.unwrap_or_else(|e| {
eprintln!("pageserver init failed: {e:?}");
eprintln!("pageserver init failed: {e}");
exit(1);
});

View File

@@ -156,8 +156,6 @@ pub struct PageServerConf {
// jwt auth token used for communication with pageserver
pub auth_token: String,
pub testing_mode: bool,
}
impl Default for PageServerConf {
@@ -168,7 +166,6 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
auth_type: AuthType::Trust,
auth_token: String::new(),
testing_mode: false,
}
}
}

View File

@@ -141,9 +141,6 @@ impl PageServerNode {
init_config_overrides.push(&listen_http_addr_param);
init_config_overrides.push(&listen_pg_addr_param);
init_config_overrides.push(&broker_endpoints_param);
if self.env.pageserver.testing_mode {
init_config_overrides.push("testing_mode=true");
}
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
init_config_overrides.push(broker_etcd_prefix_param);

View File

@@ -45,9 +45,9 @@ and create new databases and accounts (control plane API in our case).
Integration tests, written in Python using the `pytest` framework.
`/vendor/postgres-v14` and `/vendor/postgres-v15`:
`/vendor/postgres-v14`:
PostgreSQL source tree per version, with the modifications needed for Neon.
PostgreSQL source tree, with the modifications needed for Neon.
`/pgxn/neon`:

View File

@@ -201,6 +201,8 @@ pub struct TimelineInfo {
pub last_received_msg_ts: Option<u128>,
pub pg_version: u32,
pub awaits_download: bool,
pub state: TimelineState,
// Some of the above fields are duplicated in 'local' and 'remote', for backwards-

View File

@@ -0,0 +1,12 @@
[package]
name = "persistent_range_query"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]
rand = "0.8.3"

View File

@@ -0,0 +1,78 @@
use std::ops::Range;
pub mod naive;
pub mod ops;
pub mod segment_tree;
/// Should be a monoid:
/// * Identity element: for all a: combine(new_for_empty_range(), a) = combine(a, new_for_empty_range()) = a
/// * Associativity: for all a, b, c: combine(combine(a, b), c) == combine(a, combine(b, c))
pub trait RangeQueryResult<Key>: Sized + Clone {
// Clone is equivalent to combining with an empty range.
fn new_for_empty_range() -> Self;
// Contract: left_range.end == right_range.start
// left_range.start == left_range.end == right_range.start == right_range.end is still possible
fn combine(
left: &Self,
left_range: &Range<Key>,
right: &Self,
right_range: &Range<Key>,
) -> Self;
fn add(left: &mut Self, left_range: &Range<Key>, right: &Self, right_range: &Range<Key>);
}
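// Editor's sketch (not part of this patch): the smallest lawful
// implementation of the trait just counts elements per range. The zero
// count is the identity element and `combine` is integer addition, so
// both monoid laws above follow from the laws of addition.
#[derive(Clone)]
pub struct CountResult(pub usize);
impl<Key> RangeQueryResult<Key> for CountResult {
fn new_for_empty_range() -> Self {
CountResult(0)
}
fn combine(
left: &Self,
_left_range: &Range<Key>,
right: &Self,
_right_range: &Range<Key>,
) -> Self {
CountResult(left.0 + right.0)
}
fn add(left: &mut Self, _left_range: &Range<Key>, right: &Self, _right_range: &Range<Key>) {
left.0 += right.0;
}
}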
pub trait LazyRangeInitializer<Result: RangeQueryResult<Key>, Key> {
fn get(&self, range: &Range<Key>) -> Result;
}
/// Should be a monoid:
/// * Identity element: for all op: compose(no_op(), op) == compose(op, no_op()) == op
/// * Associativity: for all op_1, op_2, op_3: compose(compose(op_1, op_2), op_3) == compose(op_1, compose(op_2, op_3))
///
/// Should left act on Result:
/// * Identity operation: for all r: no_op().apply(r) == r
/// * Compatibility: for all op_1, op_2, r: op_1.apply(op_2.apply(r)) == compose(op_1, op_2).apply(r)
pub trait RangeModification<Key> {
type Result: RangeQueryResult<Key>;
fn no_op() -> Self;
fn is_no_op(&self) -> bool;
fn is_reinitialization(&self) -> bool;
fn apply(&self, result: &mut Self::Result, range: &Range<Key>);
fn compose(later: &Self, earlier: &mut Self);
}
pub trait VecReadableVersion<Modification: RangeModification<Key>, Key> {
fn get(&self, keys: &Range<Key>) -> Modification::Result;
}
// TODO: use trait alias when stabilized
pub trait VecFrozenVersion<Modification: RangeModification<Key>, Key>:
Clone + VecReadableVersion<Modification, Key>
{
}
impl<
T: Clone + VecReadableVersion<Modification, Key>,
Modification: RangeModification<Key>,
Key,
> VecFrozenVersion<Modification, Key> for T
{
}
pub trait PersistentVecStorage<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key,
>: VecReadableVersion<Modification, Key>
{
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self;
type FrozenVersion: VecFrozenVersion<Modification, Key>;
fn modify(&mut self, keys: &Range<Key>, modification: &Modification);
fn freeze(&mut self) -> Self::FrozenVersion;
}
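// Editor's sketch (not part of this patch) of the lifecycle these traits
// are designed around: `freeze` takes a cheap snapshot, after which the
// head version keeps mutating while the snapshot stays readable.
#[allow(dead_code)]
fn snapshot_then_diverge<Modification, Initializer, Key, S>(
mut s: S,
keys: Range<Key>,
modification: Modification,
) -> (Modification::Result, Modification::Result)
where
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
S: PersistentVecStorage<Modification, Initializer, Key>,
{
let frozen = s.freeze();
s.modify(&keys, &modification);
// The frozen version still answers with the pre-modification result.
(frozen.get(&keys), s.get(&keys))
}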

View File

@@ -0,0 +1,115 @@
use crate::{
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
VecReadableVersion,
};
use std::marker::PhantomData;
use std::ops::Range;
use std::rc::Rc;
pub struct NaiveFrozenVersion<Modification: RangeModification<Key>, Key> {
all_keys: Range<Key>,
values: Rc<Box<Vec<Modification::Result>>>,
}
pub trait IndexableKey: Clone {
fn index(all_keys: &Range<Self>, key: &Self) -> usize;
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self>;
}
fn get<Modification: RangeModification<Key>, Key: IndexableKey>(
all_keys: &Range<Key>,
values: &Vec<Modification::Result>,
keys: &Range<Key>,
) -> Modification::Result {
let mut result = Modification::Result::new_for_empty_range();
let mut result_range = keys.start.clone()..keys.start.clone();
for index in
IndexableKey::index(&all_keys, &keys.start)..IndexableKey::index(&all_keys, &keys.end)
{
let element_range = IndexableKey::element_range(&all_keys, index);
Modification::Result::add(&mut result, &result_range, &values[index], &element_range);
result_range.end = element_range.end;
}
result
}
impl<Modification: RangeModification<Key>, Key: IndexableKey> VecReadableVersion<Modification, Key>
for NaiveFrozenVersion<Modification, Key>
{
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get::<Modification, Key>(&self.all_keys, &self.values, keys)
}
}
// Manual implementation of `Clone` because `derive` requires `Modification: Clone`
impl<Modification: RangeModification<Key>, Key: Clone> Clone
for NaiveFrozenVersion<Modification, Key>
{
fn clone(&self) -> Self {
Self {
all_keys: self.all_keys.clone(),
values: self.values.clone(),
}
}
}
// TODO: is it at all possible to store previous versions in this struct,
// without any Rc<>?
pub struct NaiveVecStorage<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: IndexableKey,
> {
all_keys: Range<Key>,
last_version: Vec<Modification::Result>,
_initializer: PhantomData<Initializer>,
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: IndexableKey,
> VecReadableVersion<Modification, Key> for NaiveVecStorage<Modification, Initializer, Key>
{
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get::<Modification, Key>(&self.all_keys, &self.last_version, keys)
}
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: IndexableKey,
> PersistentVecStorage<Modification, Initializer, Key>
for NaiveVecStorage<Modification, Initializer, Key>
{
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
// `Vec::with_capacity` may over-allocate, so loop over the requested
// length rather than `values.capacity()`.
let len = IndexableKey::index(&all_keys, &all_keys.end);
let mut values = Vec::with_capacity(len);
for index in 0..len {
values.push(initializer.get(&IndexableKey::element_range(&all_keys, index)));
}
NaiveVecStorage {
all_keys,
last_version: values,
_initializer: PhantomData,
}
}
type FrozenVersion = NaiveFrozenVersion<Modification, Key>;
fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
for index in IndexableKey::index(&self.all_keys, &keys.start)
..IndexableKey::index(&self.all_keys, &keys.end)
{
let element_range = IndexableKey::element_range(&self.all_keys, index);
modification.apply(&mut self.last_version[index], &element_range);
}
}
fn freeze(&mut self) -> Self::FrozenVersion {
NaiveFrozenVersion::<Modification, Key> {
all_keys: self.all_keys.clone(),
values: Rc::new(Box::new(self.last_version.clone())),
}
}
}

View File

@@ -0,0 +1,14 @@
pub mod rsq;
#[derive(Copy, Clone, Debug)]
pub struct SameElementsInitializer<T> {
initial_element_value: T,
}
impl<T> SameElementsInitializer<T> {
pub fn new(initial_element_value: T) -> Self {
SameElementsInitializer {
initial_element_value,
}
}
}

View File

@@ -0,0 +1,118 @@
//! # Range Sum Query
//!
//! A classic instantiation of the traits from lib.rs: results are sums over a key range,
//! and modifications either add a value to, or assign a value to, every element in a range.
use crate::ops::SameElementsInitializer;
use crate::{LazyRangeInitializer, RangeModification, RangeQueryResult};
use std::borrow::Borrow;
use std::ops::{Add, AddAssign, Range};
// TODO: commutative Add
#[derive(Clone, Copy, Debug)]
pub struct SumResult<T> {
sum: T,
}
impl<T> SumResult<T> {
pub fn sum(&self) -> &T {
&self.sum
}
}
impl<T: Clone + for<'a> AddAssign<&'a T> + From<u8>, Key> RangeQueryResult<Key> for SumResult<T>
where
for<'a> &'a T: Add<&'a T, Output = T>,
{
fn new_for_empty_range() -> Self {
SumResult { sum: 0.into() }
}
fn combine(
left: &Self,
_left_range: &Range<Key>,
right: &Self,
_right_range: &Range<Key>,
) -> Self {
SumResult {
sum: &left.sum + &right.sum,
}
}
fn add(left: &mut Self, _left_range: &Range<Key>, right: &Self, _right_range: &Range<Key>) {
left.sum += &right.sum
}
}
pub trait SumOfSameElements<Key> {
fn sum(initial_element_value: &Self, keys: &Range<Key>) -> Self;
}
impl<T: SumOfSameElements<Key>, TB: Borrow<T>, Key> LazyRangeInitializer<SumResult<T>, Key>
for SameElementsInitializer<TB>
where
SumResult<T>: RangeQueryResult<Key>,
{
fn get(&self, range: &Range<Key>) -> SumResult<T> {
SumResult {
sum: SumOfSameElements::sum(self.initial_element_value.borrow(), range),
}
}
}
#[derive(Copy, Clone, Debug)]
pub enum AddAssignModification<T> {
None,
Add(T),
Assign(T),
}
impl<T: Clone + for<'a> AddAssign<&'a T>, Key> RangeModification<Key> for AddAssignModification<T>
where
SumResult<T>: RangeQueryResult<Key>,
for<'a> SameElementsInitializer<&'a T>: LazyRangeInitializer<SumResult<T>, Key>,
{
type Result = SumResult<T>;
fn no_op() -> Self {
AddAssignModification::None
}
fn is_no_op(&self) -> bool {
match self {
AddAssignModification::None => true,
_ => false,
}
}
fn is_reinitialization(&self) -> bool {
match self {
AddAssignModification::Assign(_) => true,
_ => false,
}
}
fn apply(&self, result: &mut SumResult<T>, range: &Range<Key>) {
use AddAssignModification::*;
match self {
None => {}
Add(x) | Assign(x) => {
let to_add = SameElementsInitializer::new(x).get(range).sum;
if let Assign(_) = self {
result.sum = to_add;
} else {
result.sum += &to_add;
}
}
}
}
fn compose(later: &Self, earlier: &mut Self) {
use AddAssignModification::*;
match (later, earlier) {
(_, e @ None) => *e = later.clone(),
(None, _) => {}
(Assign(_), e) => *e = later.clone(),
(Add(x), Add(y)) => *y += x,
(Add(x), Assign(value)) => *value += x,
}
}
}
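// Editor's sketch (not part of this patch) of the compatibility law from
// lib.rs, specialized to sums: applying Assign(10) and then Add(2) to a
// 3-element range must equal applying compose(Add(2), Assign(10)), i.e.
// Assign(12), in one step. The `SumOfSameElements` impl for a `u32` key
// exists only for this illustration.
#[cfg(test)]
mod compose_law_sketch {
use super::*;
impl SumOfSameElements<u32> for i32 {
fn sum(initial_element_value: &Self, keys: &Range<u32>) -> Self {
initial_element_value * (keys.end - keys.start) as i32
}
}
#[test]
fn compose_matches_sequential_apply() {
let range = 0u32..3;
let init = SameElementsInitializer::new(0i32);
let mut direct: SumResult<i32> = init.get(&range);
let mut sequential = direct.clone();
let earlier = AddAssignModification::Assign(10);
let later = AddAssignModification::Add(2);
earlier.apply(&mut sequential, &range); // sum == 30
later.apply(&mut sequential, &range); // sum == 36
let mut composed = earlier;
<AddAssignModification<i32> as RangeModification<u32>>::compose(&later, &mut composed);
composed.apply(&mut direct, &range); // Assign(12) over 3 elements == 36
assert_eq!(*sequential.sum(), 36);
assert_eq!(*direct.sum(), *sequential.sum());
}
}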

View File

@@ -0,0 +1,255 @@
//! # Segment Tree
//! A folklore data structure from competitive programming; not to be confused with the interval tree.
//! Nodes are materialized lazily from the initializer, and persistence comes from `Rc` path copying,
//! so range reads and range updates each touch only logarithmically many materialized nodes.
use crate::{LazyRangeInitializer, PersistentVecStorage, RangeQueryResult, VecReadableVersion};
use std::ops::Range;
use std::rc::Rc;
pub trait MidpointableKey: Clone + Ord + Sized {
fn midpoint(range: &Range<Self>) -> Self;
}
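// Editor's example (not in this patch): for a plain integer key the
// midpoint is integer halving, written in the overflow-safe form.
impl MidpointableKey for u32 {
fn midpoint(range: &Range<Self>) -> Self {
range.start + (range.end - range.start) / 2
}
}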
pub trait RangeModification<Key>: Clone + crate::RangeModification<Key> {}
// TODO: use trait alias when stabilized
impl<T: Clone + crate::RangeModification<Key>, Key> RangeModification<Key> for T {}
#[derive(Debug)]
struct Node<Modification: RangeModification<Key>, Key> {
result: Modification::Result,
modify_children: Modification,
left: Option<Rc<Self>>,
right: Option<Rc<Self>>,
}
// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
impl<Modification: RangeModification<Key>, Key> Clone for Node<Modification, Key> {
fn clone(&self) -> Self {
Node {
result: self.result.clone(),
modify_children: self.modify_children.clone(),
left: self.left.clone(),
right: self.right.clone(),
}
}
}
impl<Modification: RangeModification<Key>, Key> Node<Modification, Key> {
fn new<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
range: &Range<Key>,
initializer: &Initializer,
) -> Self {
Node {
result: initializer.get(range),
modify_children: Modification::no_op(),
left: None,
right: None,
}
}
pub fn apply(&mut self, modification: &Modification, range: &Range<Key>) {
modification.apply(&mut self.result, range);
Modification::compose(modification, &mut self.modify_children);
if self.modify_children.is_reinitialization() {
self.left = None;
self.right = None;
}
}
pub fn force_children<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
&mut self,
initializer: &Initializer,
range_left: &Range<Key>,
range_right: &Range<Key>,
) {
let left = Rc::make_mut(
self.left
.get_or_insert_with(|| Rc::new(Node::new(&range_left, initializer))),
);
let right = Rc::make_mut(
self.right
.get_or_insert_with(|| Rc::new(Node::new(&range_right, initializer))),
);
left.apply(&self.modify_children, &range_left);
right.apply(&self.modify_children, &range_right);
self.modify_children = Modification::no_op();
}
pub fn recalculate_from_children(&mut self, range_left: &Range<Key>, range_right: &Range<Key>) {
assert!(self.modify_children.is_no_op());
assert!(self.left.is_some());
assert!(self.right.is_some());
self.result = Modification::Result::combine(
&self.left.as_ref().unwrap().result,
&range_left,
&self.right.as_ref().unwrap().result,
&range_right,
);
}
}
fn split_range<Key: MidpointableKey>(range: &Range<Key>) -> (Range<Key>, Range<Key>) {
let range_left = range.start.clone()..MidpointableKey::midpoint(range);
let range_right = range_left.end.clone()..range.end.clone();
(range_left, range_right)
}
pub struct PersistentSegmentTreeVersion<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: Clone,
> {
root: Rc<Node<Modification, Key>>,
all_keys: Range<Key>,
initializer: Rc<Initializer>,
}
// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: Clone,
> Clone for PersistentSegmentTreeVersion<Modification, Initializer, Key>
{
fn clone(&self) -> Self {
Self {
root: self.root.clone(),
all_keys: self.all_keys.clone(),
initializer: self.initializer.clone(),
}
}
}
fn get<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
>(
node: &mut Rc<Node<Modification, Key>>,
node_keys: &Range<Key>,
initializer: &Initializer,
keys: &Range<Key>,
) -> Modification::Result {
if node_keys.end <= keys.start || keys.end <= node_keys.start {
return Modification::Result::new_for_empty_range();
}
if keys.start <= node_keys.start && node_keys.end <= keys.end {
return node.result.clone();
}
let node = Rc::make_mut(node);
let (left_keys, right_keys) = split_range(node_keys);
node.force_children(initializer, &left_keys, &right_keys);
let mut result = get(node.left.as_mut().unwrap(), &left_keys, initializer, keys);
Modification::Result::add(
&mut result,
&left_keys,
&get(node.right.as_mut().unwrap(), &right_keys, initializer, keys),
&right_keys,
);
result
}
fn modify<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
>(
node: &mut Rc<Node<Modification, Key>>,
node_keys: &Range<Key>,
initializer: &Initializer,
keys: &Range<Key>,
modification: &Modification,
) {
if modification.is_no_op() || node_keys.end <= keys.start || keys.end <= node_keys.start {
return;
}
let node = Rc::make_mut(node);
if keys.start <= node_keys.start && node_keys.end <= keys.end {
node.apply(modification, node_keys);
return;
}
let (left_keys, right_keys) = split_range(node_keys);
node.force_children(initializer, &left_keys, &right_keys);
modify(
node.left.as_mut().unwrap(),
&left_keys,
initializer,
keys,
&modification,
);
modify(
node.right.as_mut().unwrap(),
&right_keys,
initializer,
keys,
&modification,
);
node.recalculate_from_children(&left_keys, &right_keys);
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
> VecReadableVersion<Modification, Key>
for PersistentSegmentTreeVersion<Modification, Initializer, Key>
{
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get(
&mut self.root.clone(), // TODO: do not always force a branch
&self.all_keys,
self.initializer.as_ref(),
keys,
)
}
}
pub struct PersistentSegmentTree<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
>(PersistentSegmentTreeVersion<Modification, Initializer, Key>);
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
> VecReadableVersion<Modification, Key>
for PersistentSegmentTree<Modification, Initializer, Key>
{
fn get(&self, keys: &Range<Key>) -> Modification::Result {
self.0.get(keys)
}
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
> PersistentVecStorage<Modification, Initializer, Key>
for PersistentSegmentTree<Modification, Initializer, Key>
{
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
PersistentSegmentTree(PersistentSegmentTreeVersion {
root: Rc::new(Node::new(&all_keys, &initializer)),
all_keys,
initializer: Rc::new(initializer),
})
}
type FrozenVersion = PersistentSegmentTreeVersion<Modification, Initializer, Key>;
fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
modify(
&mut self.0.root, // TODO: do not always force a branch
&self.0.all_keys,
self.0.initializer.as_ref(),
keys,
modification,
)
}
fn freeze(&mut self) -> Self::FrozenVersion {
self.0.clone()
}
}

View File

@@ -0,0 +1,295 @@
use persistent_range_query::naive::{IndexableKey, NaiveVecStorage};
use persistent_range_query::ops::SameElementsInitializer;
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
use persistent_range_query::{
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
VecReadableVersion,
};
use std::cmp::Ordering;
use std::ops::Range;
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
struct PageIndex(u32);
type LayerId = String;
impl IndexableKey for PageIndex {
fn index(all_keys: &Range<Self>, key: &Self) -> usize {
(key.0 as usize) - (all_keys.start.0 as usize)
}
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
PageIndex(all_keys.start.0 + index as u32)..PageIndex(all_keys.start.0 + index as u32 + 1)
}
}
impl MidpointableKey for PageIndex {
fn midpoint(range: &Range<Self>) -> Self {
PageIndex(range.start.0 + (range.end.0 - range.start.0) / 2)
}
}
#[derive(Clone, Debug, Eq, PartialEq)]
struct LayerMapInformation {
// Only makes sense for ranges of length 1.
last_layer: Option<LayerId>,
last_image_layer: Option<LayerId>,
// Works for all ranges
max_delta_layers: (usize, Range<PageIndex>),
}
impl LayerMapInformation {
fn last_layers(&self) -> (&Option<LayerId>, &Option<LayerId>) {
(&self.last_layer, &self.last_image_layer)
}
fn max_delta_layers(&self) -> &(usize, Range<PageIndex>) {
&self.max_delta_layers
}
}
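// Editor's note: merges two ranges only when they are adjacent
// (left.end == right.start); an empty side yields the other side, and for
// non-adjacent ranges the left one wins, so `max_delta_layers` always
// reports a single contiguous witness range.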
fn merge_ranges(left: &Range<PageIndex>, right: &Range<PageIndex>) -> Range<PageIndex> {
if left.is_empty() {
right.clone()
} else if right.is_empty() {
left.clone()
} else if left.end == right.start {
left.start..right.end
} else {
left.clone()
}
}
impl RangeQueryResult<PageIndex> for LayerMapInformation {
fn new_for_empty_range() -> Self {
LayerMapInformation {
last_layer: None,
last_image_layer: None,
max_delta_layers: (0, PageIndex(0)..PageIndex(0)),
}
}
fn combine(
left: &Self,
_left_range: &Range<PageIndex>,
right: &Self,
_right_range: &Range<PageIndex>,
) -> Self {
// Note that either range may be empty.
LayerMapInformation {
last_layer: left
.last_layer
.as_ref()
.or_else(|| right.last_layer.as_ref())
.cloned(),
last_image_layer: left
.last_image_layer
.as_ref()
.or_else(|| right.last_image_layer.as_ref())
.cloned(),
max_delta_layers: match left.max_delta_layers.0.cmp(&right.max_delta_layers.0) {
Ordering::Less => right.max_delta_layers.clone(),
Ordering::Greater => left.max_delta_layers.clone(),
Ordering::Equal => (
left.max_delta_layers.0,
merge_ranges(&left.max_delta_layers.1, &right.max_delta_layers.1),
),
},
}
}
fn add(
left: &mut Self,
left_range: &Range<PageIndex>,
right: &Self,
right_range: &Range<PageIndex>,
) {
*left = Self::combine(&left, left_range, right, right_range);
}
}
#[derive(Clone, Debug)]
struct AddDeltaLayers {
last_layer: LayerId,
count: usize,
}
#[derive(Clone, Debug)]
struct LayerMapModification {
add_image_layer: Option<LayerId>,
add_delta_layers: Option<AddDeltaLayers>,
}
impl LayerMapModification {
fn add_image_layer(layer: impl Into<LayerId>) -> Self {
LayerMapModification {
add_image_layer: Some(layer.into()),
add_delta_layers: None,
}
}
fn add_delta_layer(layer: impl Into<LayerId>) -> Self {
LayerMapModification {
add_image_layer: None,
add_delta_layers: Some(AddDeltaLayers {
last_layer: layer.into(),
count: 1,
}),
}
}
}
impl RangeModification<PageIndex> for LayerMapModification {
type Result = LayerMapInformation;
fn no_op() -> Self {
LayerMapModification {
add_image_layer: None,
add_delta_layers: None,
}
}
fn is_no_op(&self) -> bool {
self.add_image_layer.is_none() && self.add_delta_layers.is_none()
}
fn is_reinitialization(&self) -> bool {
self.add_image_layer.is_some()
}
fn apply(&self, result: &mut Self::Result, range: &Range<PageIndex>) {
if let Some(layer) = &self.add_image_layer {
result.last_layer = Some(layer.clone());
result.last_image_layer = Some(layer.clone());
result.max_delta_layers = (0, range.clone());
}
if let Some(AddDeltaLayers { last_layer, count }) = &self.add_delta_layers {
result.last_layer = Some(last_layer.clone());
result.max_delta_layers.0 += count;
}
}
fn compose(later: &Self, earlier: &mut Self) {
if later.add_image_layer.is_some() {
*earlier = later.clone();
return;
}
if let Some(AddDeltaLayers { last_layer, count }) = &later.add_delta_layers {
let res = earlier.add_delta_layers.get_or_insert(AddDeltaLayers {
last_layer: LayerId::default(),
count: 0,
});
res.last_layer = last_layer.clone();
res.count += count;
}
}
}
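// Editor's worked example of `compose` above: composing
// add_delta_layer("D2") after add_delta_layer("D1") folds into one
// modification with count == 2 and last_layer == "D2", while composing
// add_image_layer(..) after anything discards the earlier modification,
// since an image layer reinitializes the whole range.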
impl LazyRangeInitializer<LayerMapInformation, PageIndex> for SameElementsInitializer<()> {
fn get(&self, range: &Range<PageIndex>) -> LayerMapInformation {
LayerMapInformation {
last_layer: None,
last_image_layer: None,
max_delta_layers: (0, range.clone()),
}
}
}
fn test_layer_map<
S: PersistentVecStorage<LayerMapModification, SameElementsInitializer<()>, PageIndex>,
>() {
let mut s = S::new(
PageIndex(0)..PageIndex(100),
SameElementsInitializer::new(()),
);
s.modify(
&(PageIndex(0)..PageIndex(70)),
&LayerMapModification::add_image_layer("Img0..70"),
);
s.modify(
&(PageIndex(50)..PageIndex(100)),
&LayerMapModification::add_image_layer("Img50..100"),
);
s.modify(
&(PageIndex(10)..PageIndex(60)),
&LayerMapModification::add_delta_layer("Delta10..60"),
);
let s_before_last_delta = s.freeze();
s.modify(
&(PageIndex(20)..PageIndex(80)),
&LayerMapModification::add_delta_layer("Delta20..80"),
);
assert_eq!(
s.get(&(PageIndex(5)..PageIndex(6))).last_layers(),
(&Some("Img0..70".to_owned()), &Some("Img0..70".to_owned()))
);
assert_eq!(
s.get(&(PageIndex(15)..PageIndex(16))).last_layers(),
(
&Some("Delta10..60".to_owned()),
&Some("Img0..70".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(25)..PageIndex(26))).last_layers(),
(
&Some("Delta20..80".to_owned()),
&Some("Img0..70".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(65)..PageIndex(66))).last_layers(),
(
&Some("Delta20..80".to_owned()),
&Some("Img50..100".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(95)..PageIndex(96))).last_layers(),
(
&Some("Img50..100".to_owned()),
&Some("Img50..100".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(0)..PageIndex(100))).max_delta_layers(),
&(2, PageIndex(20)..PageIndex(60)),
);
assert_eq!(
*s_before_last_delta
.get(&(PageIndex(0)..PageIndex(100)))
.max_delta_layers(),
(1, PageIndex(10)..PageIndex(60)),
);
assert_eq!(
*s.get(&(PageIndex(10)..PageIndex(30))).max_delta_layers(),
(2, PageIndex(20)..PageIndex(30))
);
assert_eq!(
*s.get(&(PageIndex(10)..PageIndex(20))).max_delta_layers(),
(1, PageIndex(10)..PageIndex(20))
);
assert_eq!(
*s.get(&(PageIndex(70)..PageIndex(80))).max_delta_layers(),
(1, PageIndex(70)..PageIndex(80))
);
assert_eq!(
*s_before_last_delta
.get(&(PageIndex(70)..PageIndex(80)))
.max_delta_layers(),
(0, PageIndex(70)..PageIndex(80))
);
}
#[test]
fn test_naive() {
test_layer_map::<NaiveVecStorage<_, _, _>>();
}
#[test]
fn test_segment_tree() {
test_layer_map::<PersistentSegmentTree<_, _, _>>();
}

View File

@@ -0,0 +1,116 @@
use persistent_range_query::naive::*;
use persistent_range_query::ops::rsq::AddAssignModification::Add;
use persistent_range_query::ops::rsq::*;
use persistent_range_query::ops::SameElementsInitializer;
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
use persistent_range_query::{PersistentVecStorage, VecReadableVersion};
use rand::{Rng, SeedableRng};
use std::ops::Range;
#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
struct K(u16);
impl IndexableKey for K {
fn index(all_keys: &Range<Self>, key: &Self) -> usize {
(key.0 as usize) - (all_keys.start.0 as usize)
}
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
K(all_keys.start.0 + index as u16)..K(all_keys.start.0 + index as u16 + 1)
}
}
impl SumOfSameElements<K> for i32 {
fn sum(initial_element_value: &Self, keys: &Range<K>) -> Self {
initial_element_value * (keys.end.0 - keys.start.0) as Self
}
}
impl MidpointableKey for K {
fn midpoint(range: &Range<Self>) -> Self {
K(range.start.0 + (range.end.0 - range.start.0) / 2)
}
}
fn test_storage<
S: PersistentVecStorage<AddAssignModification<i32>, SameElementsInitializer<i32>, K>,
>() {
let mut s = S::new(K(0)..K(12), SameElementsInitializer::new(0i32));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 0);
s.modify(&(K(2)..K(5)), &AddAssignModification::Add(3));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 3 + 3);
let s_old = s.freeze();
s.modify(&(K(3)..K(6)), &AddAssignModification::Assign(10));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 10 + 10);
s.modify(&(K(4)..K(7)), &AddAssignModification::Add(2));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 12 + 12 + 2);
assert_eq!(*s.get(&(K(4)..K(6))).sum(), 12 + 12);
assert_eq!(*s_old.get(&(K(4)..K(6))).sum(), 3);
}
#[test]
fn test_naive() {
test_storage::<NaiveVecStorage<_, _, _>>();
}
#[test]
fn test_segment_tree() {
test_storage::<PersistentSegmentTree<_, _, _>>();
}
#[test]
fn test_stress() {
const LEN: u16 = 17_238;
const OPERATIONS: i32 = 20_000;
let mut rng = rand::rngs::StdRng::seed_from_u64(0);
let mut naive: NaiveVecStorage<AddAssignModification<i32>, _, _> =
NaiveVecStorage::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
let mut segm_tree: PersistentSegmentTree<AddAssignModification<i32>, _, _> =
PersistentSegmentTree::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
fn gen_range(rng: &mut impl Rng) -> Range<K> {
let l: u16 = rng.gen_range(0..LEN);
let r: u16 = rng.gen_range(0..LEN);
if l <= r {
K(l)..K(r)
} else {
K(r)..K(l)
}
}
for _ in 0..2 {
let checksum_range = gen_range(&mut rng);
let checksum_before: i32 = *naive.get(&checksum_range).sum();
assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());
let naive_before = naive.freeze();
let segm_tree_before = segm_tree.freeze();
assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
assert_eq!(checksum_before, *segm_tree_before.get(&checksum_range).sum());
for _ in 0..OPERATIONS {
{
let range = gen_range(&mut rng);
assert_eq!(naive.get(&range).sum(), segm_tree.get(&range).sum());
}
{
let range = gen_range(&mut rng);
let val = rng.gen_range(-10i32..=10i32);
let op = Add(val);
naive.modify(&range, &op);
segm_tree.modify(&range, &op);
}
}
assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
assert_eq!(
checksum_before,
*segm_tree_before.get(&checksum_range).sum()
);
}
}

View File

@@ -10,7 +10,7 @@ mod s3_bucket;
use std::{
collections::HashMap,
fmt::Debug,
fmt::{Debug, Display},
num::{NonZeroU32, NonZeroUsize},
ops::Deref,
path::{Path, PathBuf},
@@ -41,27 +41,44 @@ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
/// Path on the remote storage, relative to some inner prefix.
/// The prefix is an implementation detail that allows representing local paths
/// as remote ones, stripping the local storage prefix away.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct RemotePath(PathBuf);
impl RemotePath {
pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
anyhow::ensure!(
relative_path.is_relative(),
"Path {relative_path:?} is not relative"
);
Ok(Self(relative_path.to_path_buf()))
}
pub fn with_base(&self, base_path: &Path) -> PathBuf {
base_path.join(&self.0)
}
///
/// A key that refers to an object in remote storage. It works much like a Path,
/// but it's a separate datatype so that you don't accidentally mix local paths
/// and remote keys.
///
#[derive(Clone, PartialEq, Eq)]
pub struct RemoteObjectId(String);
impl RemoteObjectId {
// Needed to retrieve the last component of a RemoteObjectId,
// in other words, a file name.
/// Turn a/b/c or a/b/c/ into c
pub fn object_name(&self) -> Option<&str> {
self.0.file_name().and_then(|os_str| os_str.to_str())
// corner case: char::to_string is not const, that's why this is more verbose than it needs to be
// see https://github.com/rust-lang/rust/issues/88674
if self.0.len() == 1 && self.0.chars().next().unwrap() == REMOTE_STORAGE_PREFIX_SEPARATOR {
return None;
}
if self.0.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
self.0.rsplit(REMOTE_STORAGE_PREFIX_SEPARATOR).nth(1)
} else {
self.0
.rsplit_once(REMOTE_STORAGE_PREFIX_SEPARATOR)
.map(|(_, last)| last)
}
}
}
impl Debug for RemoteObjectId {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
Debug::fmt(&self.0, fmt)
}
}
impl Display for RemoteObjectId {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
Display::fmt(&self.0, fmt)
}
}
@@ -70,40 +87,49 @@ impl RemotePath {
/// providing basic CRUD operations for storage files.
#[async_trait::async_trait]
pub trait RemoteStorage: Send + Sync + 'static {
/// Attempts to derive the storage path out of the local path, if the latter is correct.
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId>;
/// Gets the download path of the given storage file.
fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf>;
/// Lists all items the storage has right now.
async fn list(&self) -> anyhow::Result<Vec<RemotePath>>;
async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>>;
/// Lists all top level subdirectories for a given prefix
/// Note: here we assume that if a prefix is passed, it was obtained via remote_object_id,
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS),
/// so this method doesn't need to.
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
async fn list_prefixes(
&self,
prefix: Option<&RemoteObjectId>,
) -> anyhow::Result<Vec<RemoteObjectId>>;
/// Streams the local file contents into the remote storage entry.
async fn upload(
&self,
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
// S3 PUT requests require the content length to be specified,
// otherwise they start to fail as the concurrent connection count increases.
data_size_bytes: usize,
to: &RemotePath,
from_size_bytes: usize,
to: &RemoteObjectId,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()>;
/// Streams the remote storage entry contents as a download.
/// Returns the metadata, if any was stored with the file previously.
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError>;
/// Streams a given byte range of the remote storage entry contents as a download.
/// Returns the metadata, if any was stored with the file previously.
async fn download_byte_range(
&self,
from: &RemotePath,
from: &RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
) -> Result<Download, DownloadError>;
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()>;
/// Downcast to LocalFs implementation. For tests.
fn as_local(&self) -> Option<&LocalFs> {
@@ -152,35 +178,34 @@ impl std::error::Error for DownloadError {}
/// Every storage kind currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
#[derive(Clone)]
pub enum GenericRemoteStorage {
LocalFs(LocalFs),
AwsS3(Arc<S3Bucket>),
}
pub struct GenericRemoteStorage(Arc<dyn RemoteStorage>);
impl Deref for GenericRemoteStorage {
type Target = dyn RemoteStorage;
fn deref(&self) -> &Self::Target {
match self {
GenericRemoteStorage::LocalFs(local_fs) => local_fs,
GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(),
}
self.0.as_ref()
}
}
impl GenericRemoteStorage {
pub fn new(storage: impl RemoteStorage) -> Self {
Self(Arc::new(storage))
}
pub fn from_config(
working_directory: PathBuf,
storage_config: &RemoteStorageConfig,
) -> anyhow::Result<GenericRemoteStorage> {
Ok(match &storage_config.storage {
RemoteStorageKind::LocalFs(root) => {
info!("Using fs root '{}' as a remote storage", root.display());
GenericRemoteStorage::LocalFs(LocalFs::new(root.clone())?)
GenericRemoteStorage::new(LocalFs::new(root.clone(), working_directory)?)
}
RemoteStorageKind::AwsS3(s3_config) => {
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
GenericRemoteStorage::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
GenericRemoteStorage::new(S3Bucket::new(s3_config, working_directory)?)
}
})
}
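// Editor's note: thanks to the Deref impl above, callers can invoke any
// RemoteStorage method directly on a GenericRemoteStorage (e.g.
// storage.list().await), and cloning it only bumps the Arc refcount.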
@@ -194,12 +219,23 @@ impl GenericRemoteStorage {
&self,
from: Box<dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static>,
from_size_bytes: usize,
to: &RemotePath,
from_path: &Path,
) -> anyhow::Result<()> {
self.upload(from, from_size_bytes, to, None)
let target_storage_path = self.remote_object_id(from_path).with_context(|| {
format!(
"Failed to get the storage path for source local path '{}'",
from_path.display()
)
})?;
self.upload(from, from_size_bytes, &target_storage_path, None)
.await
.with_context(|| {
format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
format!(
"Failed to upload from '{}' to storage path '{:?}'",
from_path.display(),
target_storage_path
)
})
}
@@ -208,11 +244,24 @@ impl GenericRemoteStorage {
pub async fn download_storage_object(
&self,
byte_range: Option<(u64, Option<u64>)>,
from: &RemotePath,
to_path: &Path,
) -> Result<Download, DownloadError> {
let remote_object_path = self
.remote_object_id(to_path)
.with_context(|| {
format!(
"Failed to get the storage path for target local path '{}'",
to_path.display()
)
})
.map_err(DownloadError::BadInput)?;
match byte_range {
Some((start, end)) => self.download_byte_range(from, start, end).await,
None => self.download(from).await,
Some((start, end)) => {
self.download_byte_range(&remote_object_path, start, end)
.await
}
None => self.download(&remote_object_path).await,
}
}
}
@@ -222,6 +271,23 @@ impl GenericRemoteStorage {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>);
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
if prefix == path {
anyhow::bail!(
"Prefix and the path are equal, cannot strip: '{}'",
prefix.display()
)
} else {
path.strip_prefix(prefix).with_context(|| {
format!(
"Path '{}' is not prefixed with '{}'",
path.display(),
prefix.display(),
)
})
}
}
/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {
@@ -365,24 +431,21 @@ mod tests {
use super::*;
#[test]
fn test_object_name() {
let k = RemotePath::new(Path::new("a/b/c")).unwrap();
fn object_name() {
let k = RemoteObjectId("a/b/c".to_owned());
assert_eq!(k.object_name(), Some("c"));
let k = RemotePath::new(Path::new("a/b/c/")).unwrap();
let k = RemoteObjectId("a/b/c/".to_owned());
assert_eq!(k.object_name(), Some("c"));
let k = RemotePath::new(Path::new("a/")).unwrap();
let k = RemoteObjectId("a/".to_owned());
assert_eq!(k.object_name(), Some("a"));
// XXX is it impossible to have an empty key?
let k = RemotePath::new(Path::new("")).unwrap();
let k = RemoteObjectId("".to_owned());
assert_eq!(k.object_name(), None);
let k = RemoteObjectId("/".to_owned());
assert_eq!(k.object_name(), None);
}
#[test]
fn remote_path_cannot_be_created_from_absolute_ones() {
let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths");
assert_eq!(err.to_string(), "Path \"/\" is not relative");
}
}

View File

@@ -5,7 +5,6 @@
//! volume is mounted to the local FS.
use std::{
borrow::Cow,
future::Future,
path::{Path, PathBuf},
pin::Pin,
@@ -19,33 +18,60 @@ use tokio::{
use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
use crate::{Download, DownloadError, RemotePath};
use crate::{Download, DownloadError, RemoteObjectId};
use super::{RemoteStorage, StorageMetadata};
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
#[derive(Debug, Clone)]
/// Convert a Path in the remote storage into a RemoteObjectId
fn remote_object_id_from_path(path: &Path) -> anyhow::Result<RemoteObjectId> {
Ok(RemoteObjectId(
path.to_str()
.ok_or_else(|| anyhow::anyhow!("unexpected characters found in path"))?
.to_string(),
))
}
pub struct LocalFs {
working_directory: PathBuf,
storage_root: PathBuf,
}
impl LocalFs {
/// Attempts to create local FS storage, along with its root directory.
/// Storage root will be created (if it does not exist) and transformed into an absolute path (if passed as relative).
pub fn new(mut storage_root: PathBuf) -> anyhow::Result<Self> {
if !storage_root.exists() {
std::fs::create_dir_all(&storage_root).with_context(|| {
format!("Failed to create all directories in the given root path {storage_root:?}")
})?;
}
if !storage_root.is_absolute() {
storage_root = storage_root.canonicalize().with_context(|| {
format!("Failed to represent path {storage_root:?} as an absolute path")
pub fn new(root: PathBuf, working_directory: PathBuf) -> anyhow::Result<Self> {
if !root.exists() {
std::fs::create_dir_all(&root).with_context(|| {
format!(
"Failed to create all directories in the given root path '{}'",
root.display(),
)
})?;
}
Ok(Self {
working_directory,
storage_root: root,
})
}
Ok(Self { storage_root })
///
/// Get the absolute path in the local filesystem to the given remote object.
///
/// This is public so that it can be used in tests. Should not be used elsewhere.
///
pub fn resolve_in_storage(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf> {
let path = PathBuf::from(&remote_object_id.0);
if path.is_relative() {
Ok(self.storage_root.join(path))
} else if path.starts_with(&self.storage_root) {
Ok(path)
} else {
bail!(
"Path '{}' does not belong to the current storage",
path.display()
)
}
}
async fn read_storage_metadata(
@@ -77,48 +103,45 @@ impl LocalFs {
#[async_trait::async_trait]
impl RemoteStorage for LocalFs {
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
Ok(get_all_files(&self.storage_root, true)
.await?
.into_iter()
.map(|path| {
path.strip_prefix(&self.storage_root)
.context("Failed to strip storage root prefix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
)
})
.collect())
/// Convert a "local" path into a "remote path"
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId> {
let path = self.storage_root.join(
strip_path_prefix(&self.working_directory, local_path)
.context("local path does not belong to this storage")?,
);
remote_object_id_from_path(&path)
}
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
fn local_path(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<PathBuf> {
let storage_path = PathBuf::from(&remote_object_id.0);
let relative_path = strip_path_prefix(&self.storage_root, &storage_path)
.context("local path does not belong to this storage")?;
Ok(self.working_directory.join(relative_path))
}
async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>> {
get_all_files(&self.storage_root, true).await
}
async fn list_prefixes(
&self,
prefix: Option<&RemoteObjectId>,
) -> anyhow::Result<Vec<RemoteObjectId>> {
let path = match prefix {
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
None => Cow::Borrowed(&self.storage_root),
Some(prefix) => Path::new(&prefix.0),
None => &self.storage_root,
};
Ok(get_all_files(path.as_ref(), false)
.await?
.into_iter()
.map(|path| {
path.strip_prefix(&self.storage_root)
.context("Failed to strip preifix")
.and_then(RemotePath::new)
.expect(
"We list files for storage root, hence should be able to remote the prefix",
)
})
.collect())
get_all_files(path, false).await
}
async fn upload(
&self,
data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
data_size_bytes: usize,
to: &RemotePath,
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
from_size_bytes: usize,
to: &RemoteObjectId,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()> {
let target_file_path = to.with_base(&self.storage_root);
let target_file_path = self.resolve_in_storage(to)?;
create_target_directory(&target_file_path).await?;
// We need this dance with sort of durable rename (without fsyncs)
// to prevent partial uploads. This was really hit when pageserver shutdown
@@ -139,8 +162,8 @@ impl RemoteStorage for LocalFs {
})?,
);
let from_size_bytes = data_size_bytes as u64;
let mut buffer_to_read = data.take(from_size_bytes);
let from_size_bytes = from_size_bytes as u64;
let mut buffer_to_read = from.take(from_size_bytes);
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
.await
@@ -197,22 +220,27 @@ impl RemoteStorage for LocalFs {
Ok(())
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError> {
let file_path = self
.resolve_in_storage(from)
.map_err(DownloadError::BadInput)?;
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
let source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
.open(&file_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
format!(
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})
.map_err(DownloadError::Other)?,
);
let metadata = self
.read_storage_metadata(&target_path)
.read_storage_metadata(&file_path)
.await
.map_err(DownloadError::Other)?;
Ok(Download {
@@ -226,7 +254,7 @@ impl RemoteStorage for LocalFs {
async fn download_byte_range(
&self,
from: &RemotePath,
from: &RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
) -> Result<Download, DownloadError> {
@@ -238,15 +266,20 @@ impl RemoteStorage for LocalFs {
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
}
}
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let file_path = self
.resolve_in_storage(from)
.map_err(DownloadError::BadInput)?;
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
let mut source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
.open(&file_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
format!(
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})
.map_err(DownloadError::Other)?,
);
@@ -256,7 +289,7 @@ impl RemoteStorage for LocalFs {
.context("Failed to seek to the range start in a local storage file")
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&target_path)
.read_storage_metadata(&file_path)
.await
.map_err(DownloadError::Other)?;
@@ -275,12 +308,15 @@ impl RemoteStorage for LocalFs {
}
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
let file_path = path.with_base(&self.storage_root);
async fn delete(&self, path: &RemoteObjectId) -> anyhow::Result<()> {
let file_path = self.resolve_in_storage(path)?;
if file_path.exists() && file_path.is_file() {
Ok(fs::remove_file(file_path).await?)
} else {
bail!("File {file_path:?} either does not exist or is not a file")
bail!(
"File '{}' either does not exist or is not a file",
file_path.display()
)
}
}
@@ -296,7 +332,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf {
fn get_all_files<'a, P>(
directory_path: P,
recursive: bool,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<RemoteObjectId>>> + Send + Sync + 'a>>
where
P: AsRef<Path> + Send + Sync + 'a,
{
@@ -310,20 +346,20 @@ where
let file_type = dir_entry.file_type().await?;
let entry_path = dir_entry.path();
if file_type.is_symlink() {
debug!("{entry_path:?} us a symlink, skipping")
debug!("{:?} us a symlink, skipping", entry_path)
} else if file_type.is_dir() {
if recursive {
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
} else {
paths.push(entry_path)
paths.push(remote_object_id_from_path(&dir_entry.path())?)
}
} else {
paths.push(entry_path);
paths.push(remote_object_id_from_path(&dir_entry.path())?);
}
}
Ok(paths)
} else {
bail!("Path {directory_path:?} is not a directory")
bail!("Path '{}' is not a directory", directory_path.display())
}
} else {
Ok(Vec::new())
@@ -358,6 +394,173 @@ fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
}
}
#[cfg(test)]
mod pure_tests {
use tempfile::tempdir;
use super::*;
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
working_directory: workdir.clone(),
storage_root: storage_root.clone(),
};
let local_path = workdir
.join("timelines")
.join("some_timeline")
.join("file_name");
let expected_path = storage_root.join(local_path.strip_prefix(&workdir)?);
let actual_path = PathBuf::from(
storage
.remote_object_id(&local_path)
.expect("Matching path should map to storage path normally")
.0,
);
assert_eq!(
expected_path,
actual_path,
"File paths from workdir should be stored in local fs storage with the same path they have relative to the workdir"
);
Ok(())
}
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
match storage.remote_object_id(mismatching_path) {
Ok(wrong_path) => panic!(
"Expected path '{}' to error, but got storage path: {:?}",
mismatching_path.display(),
wrong_path,
),
Err(e) => format!("{:?}", e),
}
}
let workdir = tempdir()?.path().to_owned();
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
working_directory: workdir.clone(),
storage_root,
};
let error_string = storage_path_error(&storage, &workdir);
assert!(error_string.contains("does not belong to this storage"));
assert!(error_string.contains(workdir.to_str().unwrap()));
let mismatching_path_str = "/something/else";
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
assert!(
error_message.contains(mismatching_path_str),
"Error should mention wrong path"
);
assert!(
error_message.contains(workdir.to_str().unwrap()),
"Error should mention server workdir"
);
assert!(error_message.contains("does not belong to this storage"));
Ok(())
}
#[test]
fn local_path_positive() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
working_directory: workdir.clone(),
storage_root: storage_root.clone(),
};
let name = "not a metadata";
let local_path = workdir.join("timelines").join("some_timeline").join(name);
assert_eq!(
local_path,
storage
.local_path(&remote_object_id_from_path(
&storage_root.join(local_path.strip_prefix(&workdir)?)
)?)
.expect("For a valid input, valid local path should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta file"
);
let local_metadata_path = workdir
.join("timelines")
.join("some_timeline")
.join("metadata");
let remote_metadata_path = storage.remote_object_id(&local_metadata_path)?;
assert_eq!(
local_metadata_path,
storage
.local_path(&remote_metadata_path)
.expect("For a valid input, valid local path should be parsed"),
"Should be able to parse metadata out of the correctly named remote metadata file"
);
Ok(())
}
#[test]
fn local_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn local_path_error(storage: &LocalFs, storage_path: &RemoteObjectId) -> String {
match storage.local_path(storage_path) {
Ok(wrong_path) => panic!(
"Expected local path input {:?} to cause an error, but got file path: {:?}",
storage_path, wrong_path,
),
Err(e) => format!("{:?}", e),
}
}
let storage_root = PathBuf::from("somewhere").join("else");
let storage = LocalFs {
working_directory: tempdir()?.path().to_owned(),
storage_root,
};
let totally_wrong_path = "wrong_wrong_wrong";
let error_message =
local_path_error(&storage, &RemoteObjectId(totally_wrong_path.to_string()));
assert!(error_message.contains(totally_wrong_path));
Ok(())
}
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let original_path = workdir
.join("timelines")
.join("some_timeline")
.join("some name");
let storage_root = PathBuf::from("somewhere").join("else");
let dummy_storage = LocalFs {
working_directory: workdir,
storage_root,
};
let storage_path = dummy_storage.remote_object_id(&original_path)?;
let download_destination = dummy_storage.local_path(&storage_path)?;
assert_eq!(
original_path, download_destination,
"'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
);
Ok(())
}
}
#[cfg(test)]
mod fs_tests {
use super::*;
@@ -369,7 +572,7 @@ mod fs_tests {
storage: &LocalFs,
#[allow(clippy::ptr_arg)]
// have to use &PathBuf due to `storage.local_path` parameter requirements
remote_storage_path: &RemotePath,
remote_storage_path: &RemoteObjectId,
expected_metadata: Option<&StorageMetadata>,
) -> anyhow::Result<String> {
let mut download = storage
@@ -392,16 +595,41 @@ mod fs_tests {
#[tokio::test]
async fn upload_file() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage = create_storage()?;
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
let (file, size) = create_file_for_upload(
&storage.working_directory.join("whatever"),
"whatever_contents",
)
.await?;
let target_path = "/somewhere/else";
match storage
.upload(
Box::new(file),
size,
&RemoteObjectId(target_path.to_string()),
None,
)
.await
{
Ok(()) => panic!("Should not allow storing files with wrong target path"),
Err(e) => {
let message = format!("{:?}", e);
assert!(message.contains(target_path));
assert!(message.contains("does not belong to the current storage"));
}
}
assert!(storage.list().await?.is_empty());
let target_path_1 = upload_dummy_file(&workdir, &storage, "upload_1", None).await?;
assert_eq!(
storage.list().await?,
vec![target_path_1.clone()],
"Should list a single file after first upload"
);
let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?;
let target_path_2 = upload_dummy_file(&workdir, &storage, "upload_2", None).await?;
assert_eq!(
list_files_sorted(&storage).await?,
vec![target_path_1.clone(), target_path_2.clone()],
@@ -415,7 +643,7 @@ mod fs_tests {
async fn upload_file_negatives() -> anyhow::Result<()> {
let storage = create_storage()?;
let id = RemotePath::new(Path::new("dummy"))?;
let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?;
let content = std::io::Cursor::new(b"12345");
// Check that you get an error if the size parameter doesn't match the actual
@@ -440,14 +668,16 @@ mod fs_tests {
}
fn create_storage() -> anyhow::Result<LocalFs> {
LocalFs::new(tempdir()?.path().to_owned())
LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
}
#[tokio::test]
async fn download_file() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
assert_eq!(
@@ -457,7 +687,7 @@ mod fs_tests {
);
let non_existing_path = "somewhere/else";
match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await {
match storage.download(&RemoteObjectId(non_existing_path.to_string())).await {
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
}
@@ -466,9 +696,11 @@ mod fs_tests {
#[tokio::test]
async fn download_file_range_positive() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let full_range_download_contents =
read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
@@ -534,9 +766,11 @@ mod fs_tests {
#[tokio::test]
async fn download_file_range_negative() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let start = 1_000_000_000;
let end = start + 1;
@@ -578,9 +812,11 @@ mod fs_tests {
#[tokio::test]
async fn delete_file() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage = create_storage()?;
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
storage.delete(&upload_target).await?;
assert!(storage.list().await?.is_empty());
@@ -590,8 +826,7 @@ mod fs_tests {
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
let expected_path = upload_target.with_base(&storage.storage_root);
assert!(error_string.contains(expected_path.to_str().unwrap()));
assert!(error_string.contains(&upload_target.0));
}
}
Ok(())
@@ -599,6 +834,8 @@ mod fs_tests {
#[tokio::test]
async fn file_with_metadata() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage = create_storage()?;
let upload_name = "upload_1";
let metadata = StorageMetadata(HashMap::from([
@@ -606,7 +843,7 @@ mod fs_tests {
("two".to_string(), "2".to_string()),
]));
let upload_target =
upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;
let full_range_download_contents =
read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
@@ -646,32 +883,23 @@ mod fs_tests {
}
async fn upload_dummy_file(
workdir: &Path,
storage: &LocalFs,
name: &str,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<RemotePath> {
let from_path = storage
.storage_root
.join("timelines")
.join("some_timeline")
.join(name);
) -> anyhow::Result<RemoteObjectId> {
let timeline_path = workdir.join("timelines").join("some_timeline");
let relative_timeline_path = timeline_path.strip_prefix(&workdir)?;
let storage_path = storage.storage_root.join(relative_timeline_path).join(name);
let remote_object_id = RemoteObjectId(storage_path.to_str().unwrap().to_string());
let from_path = storage.working_directory.join(name);
let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
let relative_path = from_path
.strip_prefix(&storage.storage_root)
.context("Failed to strip storage root prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {:?} for base {:?}",
from_path, storage.storage_root
)
})?;
storage
.upload(Box::new(file), size, &relative_path, metadata)
.upload(Box::new(file), size, &remote_object_id, metadata)
.await?;
Ok(relative_path)
remote_object_id_from_path(&storage_path)
}
async fn create_file_for_upload(
@@ -696,7 +924,7 @@ mod fs_tests {
format!("contents for {name}")
}
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemoteObjectId>> {
let mut files = storage.list().await?;
files.sort_by(|a, b| a.0.cmp(&b.0));
Ok(files)

View File

@@ -5,6 +5,7 @@
//! their bucket prefixes are both specified and different.
use std::env::var;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
@@ -28,7 +29,8 @@ use tracing::debug;
use super::StorageMetadata;
use crate::{
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
strip_path_prefix, Download, DownloadError, RemoteObjectId, RemoteStorage, S3Config,
REMOTE_STORAGE_PREFIX_SEPARATOR,
};
const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10);
@@ -98,8 +100,31 @@ pub(super) mod metrics {
}
}
fn download_destination(
id: &RemoteObjectId,
workdir: &Path,
prefix_to_strip: Option<&str>,
) -> PathBuf {
let path_without_prefix = match prefix_to_strip {
Some(prefix) => id.0.strip_prefix(prefix).unwrap_or_else(|| {
panic!(
"Could not strip prefix '{}' from S3 object key '{}'",
prefix, id.0
)
}),
None => &id.0,
};
workdir.join(
path_without_prefix
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
.collect::<PathBuf>(),
)
}
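// A minimal sketch of the mapping above (illustrative values only, assuming
// REMOTE_STORAGE_PREFIX_SEPARATOR is '/'):
//
//     let id = RemoteObjectId("pageserver/timelines/tl1/metadata".to_string());
//     let dest = download_destination(&id, Path::new("/work"), Some("pageserver/"));
//     assert_eq!(dest, PathBuf::from("/work/timelines/tl1/metadata"));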
/// AWS S3 storage.
pub struct S3Bucket {
workdir: PathBuf,
client: Client,
bucket_name: String,
prefix_in_bucket: Option<String>,
@@ -117,7 +142,7 @@ struct GetObjectRequest {
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
pub fn new(aws_config: &S3Config, workdir: PathBuf) -> anyhow::Result<Self> {
debug!(
"Creating s3 remote storage for S3 bucket {}",
aws_config.bucket_name
@@ -171,39 +196,13 @@ impl S3Bucket {
});
Ok(Self {
client,
workdir,
bucket_name: aws_config.bucket_name.clone(),
prefix_in_bucket,
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
})
}
fn s3_object_to_relative_path(&self, key: &str) -> RemotePath {
let relative_path =
match key.strip_prefix(self.prefix_in_bucket.as_deref().unwrap_or_default()) {
Some(stripped) => stripped,
// we rely on AWS to return properly prefixed paths
// for requests with a certain prefix
None => panic!(
"Key {} does not start with bucket prefix {:?}",
key, self.prefix_in_bucket
),
};
RemotePath(
relative_path
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
.collect(),
)
}
fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default();
for segment in path.0.iter() {
full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
full_path.push_str(segment.to_str().unwrap_or_default());
}
full_path
}
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let _guard = self
.concurrency_limiter
@@ -253,7 +252,25 @@ impl S3Bucket {
#[async_trait::async_trait]
impl RemoteStorage for S3Bucket {
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<RemoteObjectId> {
let relative_path = strip_path_prefix(&self.workdir, local_path)?;
let mut key = self.prefix_in_bucket.clone().unwrap_or_default();
for segment in relative_path {
key.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
key.push_str(&segment.to_string_lossy());
}
Ok(RemoteObjectId(key))
}
fn local_path(&self, storage_path: &RemoteObjectId) -> anyhow::Result<PathBuf> {
Ok(download_destination(
storage_path,
&self.workdir,
self.prefix_in_bucket.as_deref(),
))
}
async fn list(&self) -> anyhow::Result<Vec<RemoteObjectId>> {
let mut document_keys = Vec::new();
let mut continuation_token = None;
@@ -283,7 +300,7 @@ impl RemoteStorage for S3Bucket {
.contents
.unwrap_or_default()
.into_iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
.filter_map(|o| Some(RemoteObjectId(o.key?))),
);
match fetch_response.continuation_token {
@@ -297,10 +314,13 @@ impl RemoteStorage for S3Bucket {
/// See the doc for `RemoteStorage::list_prefixes`
/// Note: it won't include empty "directories"
async fn list_prefixes(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
async fn list_prefixes(
&self,
prefix: Option<&RemoteObjectId>,
) -> anyhow::Result<Vec<RemoteObjectId>> {
// use the passed prefix, or fall back to the prefix_in_bucket value if it is not set
let list_prefix = prefix
.map(|p| self.relative_path_to_s3_object(p))
.map(|p| p.0.clone())
.or_else(|| self.prefix_in_bucket.clone())
.map(|mut p| {
// required to end with a separator
@@ -342,7 +362,7 @@ impl RemoteStorage for S3Bucket {
.common_prefixes
.unwrap_or_default()
.into_iter()
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
.filter_map(|o| Some(RemoteObjectId(o.prefix?))),
);
match fetch_response.continuation_token {
@@ -358,7 +378,7 @@ impl RemoteStorage for S3Bucket {
&self,
from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>,
from_size_bytes: usize,
to: &RemotePath,
to: &RemoteObjectId,
metadata: Option<StorageMetadata>,
) -> anyhow::Result<()> {
let _guard = self
@@ -375,7 +395,7 @@ impl RemoteStorage for S3Bucket {
self.client
.put_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(to))
.key(to.0.to_owned())
.set_metadata(metadata.map(|m| m.0))
.content_length(from_size_bytes.try_into()?)
.body(bytes_stream)
@@ -388,10 +408,10 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
async fn download(&self, from: &RemoteObjectId) -> Result<Download, DownloadError> {
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
key: from.0.to_owned(),
..GetObjectRequest::default()
})
.await
@@ -399,7 +419,7 @@ impl RemoteStorage for S3Bucket {
async fn download_byte_range(
&self,
from: &RemotePath,
from: &RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
) -> Result<Download, DownloadError> {
@@ -407,19 +427,19 @@ impl RemoteStorage for S3Bucket {
// and needs both ends to be exclusive
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
let range = Some(match end_inclusive {
Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"),
None => format!("bytes={start_inclusive}-"),
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
None => format!("bytes={}-", start_inclusive),
});
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: self.relative_path_to_s3_object(from),
key: from.0.to_owned(),
range,
})
.await
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
async fn delete(&self, remote_object_id: &RemoteObjectId) -> anyhow::Result<()> {
let _guard = self
.concurrency_limiter
.acquire()
@@ -431,7 +451,7 @@ impl RemoteStorage for S3Bucket {
self.client
.delete_object()
.bucket(self.bucket_name.clone())
.key(self.relative_path_to_s3_object(path))
.key(remote_object_id.0.to_owned())
.send()
.await
.map_err(|e| {
@@ -441,3 +461,181 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
}
#[cfg(test)]
mod tests {
use tempfile::tempdir;
use super::*;
#[test]
fn test_download_destination() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let local_path = workdir.join("one").join("two").join("test_name");
let relative_path = local_path.strip_prefix(&workdir)?;
let key = RemoteObjectId(format!(
"{}{}",
REMOTE_STORAGE_PREFIX_SEPARATOR,
relative_path
.iter()
.map(|segment| segment.to_str().unwrap())
.collect::<Vec<_>>()
.join(&REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()),
));
assert_eq!(
local_path,
download_destination(&key, &workdir, None),
"Download destination should consist of s3 path joined with the workdir prefix"
);
Ok(())
}
#[test]
fn storage_path_positive() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let segment_1 = "matching";
let segment_2 = "file";
let local_path = &workdir.join(segment_1).join(segment_2);
let storage = dummy_storage(workdir);
let expected_key = RemoteObjectId(format!(
"{}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_1}{REMOTE_STORAGE_PREFIX_SEPARATOR}{segment_2}",
storage.prefix_in_bucket.as_deref().unwrap_or_default(),
));
let actual_key = storage
.remote_object_id(local_path)
.expect("Matching path should map to S3 path normally");
assert_eq!(
expected_key,
actual_key,
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
);
Ok(())
}
#[test]
fn storage_path_negatives() -> anyhow::Result<()> {
#[track_caller]
fn storage_path_error(storage: &S3Bucket, mismatching_path: &Path) -> String {
match storage.remote_object_id(mismatching_path) {
Ok(wrong_key) => panic!(
"Expected path '{}' to error, but got S3 key: {:?}",
mismatching_path.display(),
wrong_key,
),
Err(e) => e.to_string(),
}
}
let workdir = tempdir()?.path().to_owned();
let storage = dummy_storage(workdir.clone());
let error_message = storage_path_error(&storage, &workdir);
assert!(
error_message.contains("Prefix and the path are equal"),
"Message '{}' does not contain the required string",
error_message
);
let mismatching_path = PathBuf::from("somewhere").join("else");
let error_message = storage_path_error(&storage, &mismatching_path);
assert!(
error_message.contains(mismatching_path.to_str().unwrap()),
"Error should mention wrong path"
);
assert!(
error_message.contains(workdir.to_str().unwrap()),
"Error should mention server workdir"
);
assert!(
error_message.contains("is not prefixed with"),
"Message '{}' does not contain a required string",
error_message
);
Ok(())
}
#[test]
fn local_path_positive() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let storage = dummy_storage(workdir.clone());
let timeline_dir = workdir.join("timelines").join("test_timeline");
let relative_timeline_path = timeline_dir.strip_prefix(&workdir)?;
let s3_key = create_s3_key(
&relative_timeline_path.join("not a metadata"),
storage.prefix_in_bucket.as_deref(),
);
assert_eq!(
download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()),
storage
.local_path(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote delta file"
);
let s3_key = create_s3_key(
&relative_timeline_path.join("metadata"),
storage.prefix_in_bucket.as_deref(),
);
assert_eq!(
download_destination(&s3_key, &workdir, storage.prefix_in_bucket.as_deref()),
storage
.local_path(&s3_key)
.expect("For a valid input, valid S3 info should be parsed"),
"Should be able to parse metadata out of the correctly named remote metadata file"
);
Ok(())
}
#[test]
fn download_destination_matches_original_path() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
let original_path = workdir
.join("timelines")
.join("some_timeline")
.join("some name");
let dummy_storage = dummy_storage(workdir);
let key = dummy_storage.remote_object_id(&original_path)?;
let download_destination = dummy_storage.local_path(&key)?;
assert_eq!(
original_path, download_destination,
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
);
Ok(())
}
fn dummy_storage(workdir: PathBuf) -> S3Bucket {
S3Bucket {
workdir,
client: Client::new(&aws_config::SdkConfig::builder().build()),
bucket_name: "dummy-bucket".to_string(),
prefix_in_bucket: Some("dummy_prefix/".to_string()),
concurrency_limiter: Semaphore::new(1),
}
}
fn create_s3_key(relative_file_path: &Path, prefix: Option<&str>) -> RemoteObjectId {
RemoteObjectId(relative_file_path.iter().fold(
prefix.unwrap_or_default().to_string(),
|mut path_string, segment| {
path_string.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
path_string.push_str(segment.to_str().unwrap());
path_string
},
))
}
}

View File

@@ -4,7 +4,6 @@ version = "0.1.0"
edition = "2021"
[dependencies]
sentry = "0.29.0"
async-trait = "0.1"
anyhow = "1.0"
bincode = "1.3"

View File

@@ -34,7 +34,6 @@ pub mod sock_split;
pub mod logging;
pub mod lock_file;
pub mod pid_file;
// Misc
pub mod accum;
@@ -47,7 +46,6 @@ pub mod tcp_listener;
pub mod nonblock;
// Default signal handling
pub mod sentry_init;
pub mod signals;
pub mod fs_ext;

View File

@@ -1,133 +1,81 @@
//! A module to create and read lock files.
//! A module to create and read lock files. A lock file ensures that only one
//! process is running at a time in a particular directory.
//!
//! File locking is done using [`fcntl::flock`] exclusive locks.
//! The only consumer of this module is currently [`pid_file`].
//! See the module-level comment there for potential pitfalls
//! with lock files that are used to store PIDs (pidfiles).
//! File locking is done using [`fcntl::flock`], which means that holding the
//! lock on a file only prevents acquiring another lock on it; all other
//! operations on the file are still possible. Other processes can still open,
//! read, write, or remove the file, for example.
//! If the file is removed while a process is holding a lock on it,
//! the process that holds the lock does not get any error or notification.
//! Furthermore, you can create a new file with the same name and lock the new file,
//! while the old process is still running.
//! Deleting the lock file while the locking process is still running is a bad idea!
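//!
//! A minimal sketch of that pitfall, with an illustrative file name:
//!
//!     // process A: create_lock_file("service.lock", ...) succeeds and holds the flock
//!     // someone runs `rm service.lock`
//!     // process B: create_lock_file("service.lock", ...) creates a *new* inode and
//!     // locks it successfully, so A and B both believe they hold the lock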
use std::{
fs,
io::{Read, Write},
ops::Deref,
os::unix::prelude::AsRawFd,
path::{Path, PathBuf},
};
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
use anyhow::Context;
use nix::{errno::Errno::EAGAIN, fcntl};
use nix::fcntl;
use crate::crashsafe;
/// A handle to an open and unlocked, but not-yet-written lock file.
/// Returned by [`create_exclusive`].
#[must_use]
pub struct UnwrittenLockFile {
path: PathBuf,
file: fs::File,
pub enum LockCreationResult {
Created {
new_lock_contents: String,
file: fs::File,
},
AlreadyLocked {
existing_lock_contents: String,
},
CreationFailed(anyhow::Error),
}
/// Returned by [`UnwrittenLockFile::write_content`].
#[must_use]
pub struct LockFileGuard(fs::File);
impl Deref for LockFileGuard {
type Target = fs::File;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl UnwrittenLockFile {
/// Replace the content of this lock file with the byte representation of `contents`.
pub fn write_content(mut self, contents: String) -> anyhow::Result<LockFileGuard> {
self.file
.set_len(0)
.context("Failed to truncate lockfile")?;
self.file
.write_all(contents.as_bytes())
.with_context(|| format!("Failed to write '{contents}' contents into lockfile"))?;
crashsafe::fsync_file_and_parent(&self.path).context("fsync lockfile")?;
Ok(LockFileGuard(self.file))
}
}
/// Creates and opens a lock file in the path, grabs an exclusive flock on it, and returns
/// a handle that allows overwriting the locked file's content.
///
/// The exclusive lock is released when dropping the returned handle.
///
/// It is not an error if the file already exists.
/// It is an error if the file is already locked.
pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFile> {
let lock_file = fs::OpenOptions::new()
/// Creates a lock file at the given path and writes the given contents into the file.
/// Note: the lock is automatically released when the file is closed. You might want to
/// use `Box::leak` to make sure it lives until the end of the program.
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
let lock_file = match fs::OpenOptions::new()
.create(true) // O_CREAT
.write(true)
.open(lock_file_path)
.context("open lock file")?;
let res = fcntl::flock(
lock_file.as_raw_fd(),
fcntl::FlockArg::LockExclusiveNonblock,
);
match res {
Ok(()) => Ok(UnwrittenLockFile {
path: lock_file_path.to_owned(),
file: lock_file,
}),
Err(EAGAIN) => anyhow::bail!("file is already locked"),
Err(e) => Err(e).context("flock error"),
}
}
/// Returned by [`read_and_hold_lock_file`].
/// Check out the [`pid_file`] module for what the variants mean
/// and potential caveats of lock files that are used to store PIDs.
pub enum LockFileRead {
/// No file exists at the given path.
NotExist,
/// No other process held the lock file, so we grabbed an flock
/// on it and read its contents.
/// Release the flock by dropping the [`LockFileGuard`].
NotHeldByAnyProcess(LockFileGuard, String),
/// The file exists but another process was holding an flock on it.
LockedByOtherProcess {
not_locked_file: fs::File,
content: String,
},
}
/// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
/// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
/// Check the [`LockFileRead`] variants for details.
pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result<LockFileRead> {
let res = fs::OpenOptions::new().read(true).open(path);
let mut lock_file = match res {
Ok(f) => f,
Err(e) => match e.kind() {
std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist),
_ => return Err(e).context("open lock file"),
},
.context("Failed to open lock file")
{
Ok(file) => file,
Err(e) => return LockCreationResult::CreationFailed(e),
};
let res = fcntl::flock(
match fcntl::flock(
lock_file.as_raw_fd(),
fcntl::FlockArg::LockExclusiveNonblock,
);
// We need the content regardless of lock success / failure.
// But, read it after flock so that, if it succeeded, the content is consistent.
let mut content = String::new();
lock_file
.read_to_string(&mut content)
.context("read lock file")?;
match res {
Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess(
LockFileGuard(lock_file),
content,
)),
Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess {
not_locked_file: lock_file,
content,
}),
Err(e) => Err(e).context("flock error"),
) {
Ok(()) => {
match lock_file
.set_len(0)
.context("Failed to truncate lockfile")
.and_then(|()| {
fs::write(lock_file_path, &contents).with_context(|| {
format!("Failed to write '{contents}' contents into lockfile")
})
})
.and_then(|()| {
crashsafe::fsync_file_and_parent(lock_file_path)
.context("Failed to fsync lockfile")
}) {
Ok(()) => LockCreationResult::Created {
new_lock_contents: contents,
file: lock_file,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(nix::errno::Errno::EAGAIN) => {
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
existing_lock_contents,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(e) => {
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
}
}
}
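// A sketch of how a caller might consume `create_lock_file` (hypothetical caller;
// `start_pageserver` below does essentially this):
//
//     match create_lock_file(&lock_file_path, contents) {
//         LockCreationResult::Created { file, .. } => {
//             // hold the flock for the rest of the process lifetime
//             let _ = Box::leak(Box::new(file));
//         }
//         LockCreationResult::AlreadyLocked { existing_lock_contents } => {
//             anyhow::bail!("already locked by: {existing_lock_contents}")
//         }
//         LockCreationResult::CreationFailed(e) => return Err(e),
//     }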

View File

@@ -1,165 +0,0 @@
//! Abstraction to create & read pidfiles.
//!
//! A pidfile is a file in the filesystem that stores a process's PID.
//! Its purpose is to implement a singleton behavior where only
//! one process of some "kind" is supposed to be running at a given time.
//! The "kind" is identified by the pidfile.
//!
//! During process startup, the process that is supposed to be a singleton
//! must [claim][`claim_for_current_process`] the pidfile first.
//! If that is unsuccessful, the process must not act as the singleton, i.e.,
//! it must not access any of the resources that only the singleton may access.
//!
//! A common need is to signal a running singleton process, e.g., to make
//! it shut down and exit.
//! For that, we have to [`read`] the pidfile. The result of the `read` operation
//! tells us if there is any singleton process, and if so, what PID it has.
//! We can then proceed to signal it, although some caveats still apply.
//! Read the function-level documentation of [`read`] for that.
//!
//! ## Never Remove Pidfiles
//!
//! It would be natural to assume that the process that claimed the pidfile
//! should remove it upon exit to avoid leaving a stale pidfile in place.
//! However, we already have a reliable way to detect staleness of the pidfile,
//! i.e., the `flock` that [claiming][`claim_for_current_process`] puts on it.
//!
//! And further, removing pidfiles would introduce a **catastrophic race condition**
//! where two processes are running that are supposed to be singletons.
//! Suppose we were to remove our pidfile during process shutdown.
//! Here is how the race plays out:
//! - Suppose we have a service called `myservice` with pidfile `myservice.pidfile`.
//! - Process `A` starts to shut down.
//! - Process `B` is just starting up
//! - It opens the file with `open("myservice.pid", O_WRONLY|O_CREAT)`
//! - It blocks on `flock`
//! - Process `A` removes the pidfile as the last step of its shutdown procedure
//! - `unlink("myservice.pid")`
//! - Process `A` exits
//! - This releases its `flock` and unblocks `B`
//! - Process `B` still has the file descriptor for `myservice.pid` open
//! - Process `B` writes its PID into `myservice.pid`.
//! - But the `myservice.pid` file has been unlinked, so there is no `myservice.pid`
//! in the directory.
//! - Process `C` starts
//! - It opens the path with `open("myservice.pid", O_WRONLY|O_CREAT)`, which creates a new file (new inode)
//! - It `flock`s the file, which, since it's a different file, does not block
//! - It writes its PID into the file
//!
//! At this point, `B` and `C` are running, which is hazardous.
//! Moral of the story: don't unlink pidfiles, ever.
use std::{ops::Deref, path::Path};
use anyhow::Context;
use nix::unistd::Pid;
use crate::lock_file::{self, LockFileRead};
/// Keeps a claim on a pidfile alive until it is dropped.
/// Returned by [`claim_for_current_process`].
#[must_use]
pub struct PidFileGuard(lock_file::LockFileGuard);
impl Deref for PidFileGuard {
type Target = lock_file::LockFileGuard;
fn deref(&self) -> &Self::Target {
&self.0
}
}
/// Try to claim `path` as a pidfile for the current process.
///
/// If another process has already claimed the pidfile, and it is still running,
//! this function returns an error.
/// Otherwise, the function `flock`s the file and updates its contents to the
/// current process's PID.
/// If the update fails, the flock is released and an error returned.
/// On success, the function returns a [`PidFileGuard`] to keep the flock alive.
///
/// ### Maintaining A Claim
///
/// It is the caller's responsibility to maintain the claim.
/// The claim ends as soon as the returned guard object is dropped.
/// To maintain the claim for the remaining lifetime of the current process,
/// use [`std::mem::forget`] or similar.
pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
// if any of the next steps fail, we drop the file descriptor and thereby release the lock
let guard = unwritten_lock_file
.write_content(Pid::this().to_string())
.context("write pid to lock file")?;
Ok(PidFileGuard(guard))
}
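// A short usage sketch (hypothetical path; this mirrors how the pageserver used the API):
//
//     let guard = claim_for_current_process(Path::new("myservice.pid"))?;
//     // keep the claim alive for the remaining lifetime of the process
//     std::mem::forget(guard);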
/// Returned by [`read`].
pub enum PidFileRead {
/// No file exists at the given path.
NotExist,
/// The given pidfile is currently not claimed by any process.
/// To determine this, the [`read`] operation acquired
/// an exclusive flock on the file. The lock is still held and responsibility
/// to release it is returned through the guard object.
/// Until it is released, other [`claim_for_current_process`] or [`read`] calls
/// will fail.
///
/// ### Caveats
///
/// Do not unlink the pidfile from the filesystem. See module-comment for why.
NotHeldByAnyProcess(PidFileGuard),
/// The given pidfile is still claimed by another process whose PID is given
/// as part of this variant.
///
/// ### Caveats
///
/// 1. The other process might exit at any time, turning the given PID stale.
/// 2. There is a small window in which `claim_for_current_process` has already
/// locked the file but not yet updated its contents. [`read`] will return
/// this variant here, but with the old file contents, i.e., a stale PID.
///
/// The kernel is free to recycle a PID once it has been `wait(2)`ed upon by
/// its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
/// system call on it, bears the risk of killing an unrelated process.
/// This is an inherent limitation of using pidfiles.
/// The only race-free solution is to have a supervisor-process with a lifetime
/// that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
LockedByOtherProcess(Pid),
}
/// Try to read the file at the given path as a pidfile that was previously created
/// through [`claim_for_current_process`].
///
/// On success, this function returns a [`PidFileRead`].
/// Check its docs for a description of the meaning of its different variants.
pub fn read(pidfile: &Path) -> anyhow::Result<PidFileRead> {
let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
let ret = match res {
LockFileRead::NotExist => PidFileRead::NotExist,
LockFileRead::NotHeldByAnyProcess(guard, _) => {
PidFileRead::NotHeldByAnyProcess(PidFileGuard(guard))
}
LockFileRead::LockedByOtherProcess {
not_locked_file: _not_locked_file,
content,
} => {
// XXX the read races with the write in claim_pid_file_for_pid().
// But pids are smaller than a page, so the kernel page cache will lock for us.
// The only problem is that we might get the old contents here.
// Can only fix that by implementing some scheme that downgrades the
// exclusive lock to shared lock in claim_pid_file_for_pid().
PidFileRead::LockedByOtherProcess(parse_pidfile_content(&content)?)
}
};
Ok(ret)
}
fn parse_pidfile_content(content: &str) -> anyhow::Result<Pid> {
let pid: i32 = content
.parse()
.map_err(|_| anyhow::anyhow!("parse pidfile content to PID"))?;
if pid < 1 {
anyhow::bail!("bad value in pidfile '{pid}'");
}
Ok(Pid::from_raw(pid))
}
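// A sketch of consuming `read` (hypothetical caller, e.g. a "stop" command):
//
//     match read(Path::new("myservice.pid"))? {
//         PidFileRead::NotExist => println!("not running"),
//         PidFileRead::NotHeldByAnyProcess(_guard) => println!("stale pidfile, no live process"),
//         PidFileRead::LockedByOtherProcess(pid) => {
//             // the caveats above apply: the PID may already be stale
//             nix::sys::signal::kill(pid, nix::sys::signal::Signal::SIGTERM)?;
//         }
//     }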

View File

@@ -1,27 +0,0 @@
use sentry::ClientInitGuard;
use std::borrow::Cow;
use std::env;
pub use sentry::release_name;
#[must_use]
pub fn init_sentry(
release_name: Option<Cow<'static, str>>,
extra_options: &[(&str, &str)],
) -> Option<ClientInitGuard> {
let dsn = env::var("SENTRY_DSN").ok()?;
let guard = sentry::init((
dsn,
sentry::ClientOptions {
release: release_name,
..Default::default()
},
));
sentry::configure_scope(|scope| {
for &(key, value) in extra_options {
scope.set_extra(key, value.into());
}
});
Some(guard)
}
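// A short usage sketch (hypothetical extra options; requires SENTRY_DSN to be set):
//
//     let _sentry_guard = init_sentry(release_name!(), &[("node_id", "1")]);
//     // keep the guard alive for the program's lifetime so queued events are flushed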

View File

@@ -5,6 +5,10 @@ edition = "2021"
[features]
default = []
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost, in order to run tests on outage conditions
testing = ["fail/failpoints"]
profiling = ["pprof"]
[dependencies]
@@ -14,13 +18,13 @@ async-stream = "0.3"
async-trait = "0.1"
byteorder = "1.4.3"
bytes = "1.0.1"
chrono = { version = "0.4.23", default-features = false, features = ["clock"] }
chrono = "0.4.19"
clap = { version = "4.0", features = ["string"] }
close_fds = "0.3.2"
const_format = "0.2.21"
crc32c = "0.6.0"
crossbeam-utils = "0.8.5"
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
fail = "0.5.0"
futures = "0.3.13"
git-version = "0.3.5"
hex = "0.4.3"
@@ -55,6 +59,7 @@ tracing = "0.1.36"
url = "2"
walkdir = "2.3.2"
persistent_range_query = { path = "../libs/persistent_range_query" }
etcd_broker = { path = "../libs/etcd_broker" }
metrics = { path = "../libs/metrics" }
pageserver_api = { path = "../libs/pageserver_api" }
@@ -65,6 +70,7 @@ remote_storage = { path = "../libs/remote_storage" }
tenant_size_model = { path = "../libs/tenant_size_model" }
utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
rpds = "0.12.0"
[dev-dependencies]
criterion = "0.4"

View File

@@ -1,7 +1,10 @@
use anyhow::Result;
use num_traits::ToPrimitive;
use pageserver::repository::{Key, Value};
use pageserver::tenant::bst_layer_map::BSTLM;
use pageserver::tenant::filename::{DeltaFileName, ImageFileName};
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::segment_tree_layer_map::STLM;
use pageserver::tenant::storage_layer::Layer;
use pageserver::tenant::storage_layer::ValueReconstructResult;
use pageserver::tenant::storage_layer::ValueReconstructState;
@@ -243,22 +246,69 @@ fn bench_from_captest_env(c: &mut Criterion) {
// too long processing layer map queries.
fn bench_from_real_project(c: &mut Criterion) {
// TODO consider compressing this file
// Init layer map
let now = Instant::now();
let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt"));
println!("Finished layer map init in {:?}", now.elapsed());
// Init bst layer map with the same layers
let now = Instant::now();
let mut bstlm = BSTLM::new();
let mut sorted_layers: Vec<_> = layer_map.iter_historic_layers().collect();
sorted_layers.sort_by_key(|layer| layer.get_lsn_range().start);
for layer in sorted_layers {
// Incremental and image layers are inserted identically here.
// TODO check if they're sorted
let kr = layer.get_key_range();
let lr = layer.get_lsn_range();
bstlm.insert(
kr.start.to_i128(),
kr.end.to_i128(),
lr.start.0,
format!("Layer {}", lr.start.0),
);
}
println!("Finished bst init in {:?}", now.elapsed());
// Choose uniformly distributed queries
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
// Test with uniform query pattern
c.bench_function("real_map_uniform_queries", |b| {
// Define and name the benchmark function
let mut group = c.benchmark_group("real_map_uniform_queries");
group.bench_function("current_code", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
layer_map.search(q.0, q.1).unwrap();
}
});
});
group.bench_function("persistent_bst", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
bstlm.query(q.0.to_i128(), q.1.0);
}
});
});
group.finish();
}
// Benchmark using synthetic data. Arrange image layers on stacked diagonal lines.
fn bench_sequential(c: &mut Criterion) {
let mut layer_map = LayerMap::default();
// Init layer map. Create 100_000 layers arranged in 1000 diagonal lines.
//
@@ -267,42 +317,65 @@ fn bench_sequential(c: &mut Criterion) {
// Putting it inside the `bench_function` closure is not a solution
// because then it runs multiple times during warmup.
let now = Instant::now();
let mut layer_map = LayerMap::default();
for i in 0..100_000 {
// TODO try inserting a super-wide layer in between every 10 to reflect
// what often happens with L1 layers that include non-rel changes.
// Maybe do that as a separate test.
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer = DummyImage {
key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1),
lsn: Lsn(10 * i),
lsn: Lsn(i),
};
layer_map.insert_historic(Arc::new(layer));
}
println!("Finished layer map init in {:?}", now.elapsed());
// Manually measure runtime without criterion because criterion
// has a minimum sample size of 10 and I don't want to run it 10 times.
println!("Finished init in {:?}", now.elapsed());
// Init bst layer map with the same layers
let now = Instant::now();
let mut bstlm = BSTLM::new();
for layer in layer_map.iter_historic_layers() {
if layer.is_incremental() {
panic!("AAA");
} else {
let kr = layer.get_key_range();
let lr = layer.get_lsn_range();
bstlm.insert(
kr.start.to_i128(),
kr.end.to_i128(),
lr.start.0,
format!("Layer {}", lr.start.0),
);
}
}
println!("Finished bst init in {:?}", now.elapsed());
// Choose 100 uniformly random queries
let rng = &mut StdRng::seed_from_u64(1);
let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map)
.choose_multiple(rng, 1)
.choose_multiple(rng, 100)
.copied()
.collect();
// Define and name the benchmark function
c.bench_function("sequential_uniform_queries", |b| {
// Run the search queries
let mut group = c.benchmark_group("sequential_uniform_queries");
group.bench_function("current_code", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
layer_map.search(q.0, q.1).unwrap();
}
});
});
group.bench_function("persistent_bst", |b| {
b.iter(|| {
for q in queries.clone().into_iter() {
bstlm.query(q.0.to_i128(), q.1.0);
}
});
});
group.finish();
}
criterion_group!(group_1, bench_from_captest_env);
criterion_group!(group_2, bench_from_real_project);
criterion_group!(group_3, bench_sequential);
criterion_main!(group_1, group_2, group_3);
// HACK TODO bring back all the bench functions. I removed
// them here to avoid their initialization cost.
criterion_group!(group, bench_from_real_project);
criterion_main!(group);

View File

@@ -431,7 +431,7 @@ fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
struct Request {
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
base_img: Option<Bytes>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
}

View File

@@ -12,6 +12,7 @@
//!
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{BufMut, BytesMut};
use fail::fail_point;
use itertools::Itertools;
use std::fmt::Write as FmtWrite;
use std::io;
@@ -21,7 +22,6 @@ use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tracing::*;
use crate::fail_point;
use crate::tenant::Timeline;
use pageserver_api::reltag::{RelTag, SlruKind};

View File

@@ -11,8 +11,8 @@
//!
//! Example use:
//! ```
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE
//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg
//! $ firefox out.svg
//! ```
//!
@@ -25,8 +25,6 @@ use anyhow::Result;
use pageserver::repository::Key;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;
use std::str::FromStr;
use std::{
collections::{BTreeMap, BTreeSet},
ops::Range,
@@ -67,11 +65,7 @@ fn main() -> Result<()> {
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
let stdin = io::stdin();
for line in stdin.lock().lines() {
let line = line.unwrap();
let line = PathBuf::from_str(&line).unwrap();
let filename = line.file_name().unwrap();
let filename = filename.to_str().unwrap();
let range = parse_filename(filename);
let range = parse_filename(&line.unwrap());
ranges.push(range);
}

View File

@@ -7,6 +7,7 @@ use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use nix::unistd::Pid;
use tracing::*;
use metrics::set_build_info_metric;
@@ -22,10 +23,9 @@ use pageserver::{
use remote_storage::GenericRemoteStorage;
use utils::{
auth::JwtAuth,
logging,
lock_file, logging,
postgres_backend::AuthType,
project_git_version,
sentry_init::{init_sentry, release_name},
signals::{self, Signal},
tcp_listener,
};
@@ -35,6 +35,10 @@ project_git_version!(GIT_VERSION);
const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
#[cfg(feature = "fail/failpoints")]
"fail/failpoints",
#[cfg(feature = "profiling")]
"profiling",
];
@@ -81,9 +85,6 @@ fn main() -> anyhow::Result<()> {
}
};
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]);
let tenants_path = conf.tenants_path();
if !tenants_path.exists() {
utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
@@ -174,10 +175,6 @@ fn initialize_config(
let conf = PageServerConf::parse_and_validate(&toml, workdir)
.context("Failed to parse pageserver configuration")?;
if pageserver::TESTING_MODE.set(conf.testing_mode).is_err() {
anyhow::bail!("testing_mode was already initialized");
}
if update_config {
info!("Writing pageserver config to '{}'", cfg_file_path.display());
@@ -206,32 +203,41 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
if *pageserver::TESTING_MODE.get().unwrap() {
let failpoints = fail::list();
if !failpoints.is_empty() {
info!(
"started with testing mode enabled, failpoints: {}",
failpoints
.iter()
.map(|(name, actions)| format!("{name}={actions}"))
.collect::<Vec<String>>()
.join(";")
)
} else {
info!("started with testing mode enabled");
}
} else {
info!("started with testing mode disabled");
let failpoints = fail::list();
if !failpoints.is_empty() {
info!(
"started with failpoints: {}",
failpoints
.iter()
.map(|(name, actions)| format!("{name}={actions}"))
.collect::<Vec<String>>()
.join(";")
)
}
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file =
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
info!("Claimed pid file at {lock_file_path:?}");
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {
new_lock_contents,
file,
} => {
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
file
}
lock_file::LockCreationResult::AlreadyLocked {
existing_lock_contents,
} => anyhow::bail!(
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
conf.workdir,
existing_lock_contents
),
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
}
};
// ensure that the lock file is held even if the main thread of the process panics
// we need to release the lock file only when the current process is gone
std::mem::forget(lock_file);
let _ = Box::leak(Box::new(lock_file));
// TODO: Check that it looks like a valid repository before going further
@@ -286,23 +292,15 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
let remote_storage = conf
.remote_storage_config
.as_ref()
.map(GenericRemoteStorage::from_config)
.map(|storage_config| {
GenericRemoteStorage::from_config(conf.workdir.clone(), storage_config)
})
.transpose()
.context("Failed to init generic remote storage")?;
let (init_result_sender, init_result_receiver) =
std::sync::mpsc::channel::<anyhow::Result<()>>();
let storage_for_spawn = remote_storage.clone();
let _handler = BACKGROUND_RUNTIME.spawn(async move {
let result = tenant_mgr::init_tenant_mgr(conf, storage_for_spawn).await;
init_result_sender.send(result)
});
match init_result_receiver.recv() {
Ok(init_result) => init_result.context("Failed to init tenant_mgr")?,
Err(_sender_dropped_err) => {
anyhow::bail!("Failed to init tenant_mgr: no init status was returned");
}
}
{
let _rt_guard = BACKGROUND_RUNTIME.enter();
tenant_mgr::init_tenant_mgr(conf, remote_storage.clone())?
};
// Spawn all HTTP related tasks in the MGMT_REQUEST_RUNTIME.
// bind before launching separate thread so the error reported before startup exits

View File

@@ -5,7 +5,7 @@
//! See also `settings.md` for better description on every parameter.
use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::{RemotePath, RemoteStorageConfig};
use remote_storage::RemoteStorageConfig;
use std::env;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
@@ -27,9 +27,7 @@ use utils::{
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
use crate::tenant_config::{TenantConf, TenantConfOpt};
use crate::{
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
};
use crate::{METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX};
pub mod defaults {
use crate::tenant_config::defaults::*;
@@ -53,8 +51,6 @@ pub mod defaults {
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_TESTING_MODE: bool = false;
///
/// Default built-in configuration file.
///
@@ -77,8 +73,6 @@ pub mod defaults {
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
testing_mode = false
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -147,9 +141,6 @@ pub struct PageServerConf {
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Enables failpoint support and extra mgmt APIs useful for testing.
pub testing_mode: bool,
}
/// We do not want to store this in a PageServerConf because the latter may be logged
@@ -229,8 +220,6 @@ struct PageServerConfigBuilder {
log_format: BuilderValue<LogFormat>,
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
testing_mode: BuilderValue<bool>,
}
impl Default for PageServerConfigBuilder {
@@ -261,8 +250,6 @@ impl Default for PageServerConfigBuilder {
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
testing_mode: Set(DEFAULT_TESTING_MODE),
}
}
}
@@ -343,11 +330,11 @@ impl PageServerConfigBuilder {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
pub fn testing_mode(&mut self, testing_mode: bool) {
self.testing_mode = BuilderValue::Set(testing_mode);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let broker_endpoints = self
.broker_endpoints
.ok_or(anyhow!("No broker endpoints provided"))?;
Ok(PageServerConf {
listen_pg_addr: self
.listen_pg_addr
@@ -383,9 +370,7 @@ impl PageServerConfigBuilder {
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
// TenantConf is handled separately
default_tenant_conf: TenantConf::default(),
broker_endpoints: self
.broker_endpoints
.ok_or(anyhow!("No broker endpoints provided"))?,
broker_endpoints,
broker_etcd_prefix: self
.broker_etcd_prefix
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
@@ -395,7 +380,6 @@ impl PageServerConfigBuilder {
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?,
testing_mode: self.testing_mode.ok_or(anyhow!("missing testing_mode"))?,
})
}
}
@@ -418,10 +402,6 @@ impl PageServerConf {
.join(TENANT_ATTACHING_MARKER_FILENAME)
}
pub fn tenant_ignore_mark_file_path(&self, tenant_id: TenantId) -> PathBuf {
self.tenant_path(&tenant_id).join(IGNORED_TENANT_FILE_NAME)
}
/// Points to a place in pageserver's local directory,
/// where certain tenant's tenantconf file should be located.
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
@@ -470,28 +450,6 @@ impl PageServerConf {
.join(METADATA_FILE_NAME)
}
/// Files on the remote storage are stored with paths relative to the workdir.
/// That path includes both tenant and timeline ids, which makes the remote storage path unique.
///
/// Errors if the provided path does not start with pageserver's workdir.
pub fn remote_path(&self, local_path: &Path) -> anyhow::Result<RemotePath> {
local_path
.strip_prefix(&self.workdir)
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {:?} for base {:?}",
local_path, self.workdir
)
})
}
/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
remote_path.with_base(&self.workdir)
}
//
// Postgres distribution paths
//
@@ -528,7 +486,7 @@ impl PageServerConf {
let mut builder = PageServerConfigBuilder::default();
builder.workdir(workdir.to_owned());
let mut t_conf = TenantConfOpt::default();
let mut t_conf: TenantConfOpt = Default::default();
for (key, item) in toml.iter() {
match key {
@@ -576,7 +534,6 @@ impl PageServerConf {
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
ConfigurableSemaphore::new(permits)
}),
"testing_mode" => builder.testing_mode(parse_toml_bool(key, item)?),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -660,10 +617,6 @@ impl PageServerConf {
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
t_conf.max_lsn_wal_lag = Some(parse_toml_from_str("max_lsn_wal_lag", max_lsn_wal_lag)?);
}
if let Some(trace_read_requests) = item.get("trace_read_requests") {
t_conf.trace_read_requests =
Some(parse_toml_bool("trace_read_requests", trace_read_requests)?);
}
Ok(t_conf)
}
@@ -696,7 +649,6 @@ impl PageServerConf {
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
testing_mode: true,
}
}
}
@@ -710,11 +662,6 @@ fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
Ok(s.to_string())
}
fn parse_toml_bool(name: &str, item: &Item) -> Result<bool> {
item.as_bool()
.with_context(|| format!("configure option {name} is not a boolean"))
}
fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
// A toml integer is signed, so it cannot represent the full range of an u64. That's OK
// for our use, though.
@@ -891,7 +838,6 @@ log_format = 'json'
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
testing_mode: defaults::DEFAULT_TESTING_MODE,
},
"Correct defaults should be used when no config values are provided"
);
@@ -938,7 +884,6 @@ log_format = 'json'
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::Json,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
testing_mode: defaults::DEFAULT_TESTING_MODE,
},
"Should be able to parse all basic config values correctly"
);
@@ -1071,35 +1016,6 @@ broker_endpoints = ['{broker_endpoint}']
Ok(())
}
#[test]
fn parse_tenant_config() -> anyhow::Result<()> {
let tempdir = tempdir()?;
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = "http://127.0.0.1:7777";
let trace_read_requests = true;
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{}'
broker_endpoints = ['{broker_endpoint}']
[tenant_config]
trace_read_requests = {trace_read_requests}"#,
pg_distrib_dir.display(),
);
let toml = config_string.parse()?;
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
assert_eq!(
conf.default_tenant_conf.trace_read_requests, trace_read_requests,
"Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
);
Ok(())
}
fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
let tempdir_path = tempdir.path();

View File

@@ -274,7 +274,6 @@ paths:
schema:
type: string
format: hex
post:
description: Schedules attach operation to happen in the background for given tenant
responses:
@@ -326,9 +325,7 @@ paths:
type: string
format: hex
post:
description: |
Remove tenant data (including all corresponding timelines) from pageserver's memory and file system.
Files on the remote storage are not affected.
description: Detach local tenant
responses:
"200":
description: Tenant detached
@@ -357,92 +354,6 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/ignore:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
post:
description: |
Remove tenant data (including all corresponding timelines) from pageserver's memory.
Files on local disk and remote storage are not affected.
Future pageserver restarts won't load the data back until `load` is called on such tenant.
responses:
"200":
description: Tenant ignored
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/load:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
post:
description: |
Schedules an operation that attempts to load a tenant from the local disk and
synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
Errors if the tenant is absent on disk, already present in memory, or fails to schedule its load.
Scheduling a load does not mean that the tenant will load successfully; check the tenant status to verify the load.
responses:
"202":
description: Tenant scheduled to load successfully
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/size:
parameters:
- name: tenant_id
@@ -748,6 +659,7 @@ components:
- tenant_id
- last_record_lsn
- disk_consistent_lsn
- awaits_download
- state
- latest_gc_cutoff_lsn
properties:
@@ -790,6 +702,8 @@ components:
format: hex
last_received_msg_ts:
type: integer
awaits_download:
type: boolean
state:
type: string
latest_gc_cutoff_lsn:

View File

@@ -3,18 +3,18 @@ use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use pageserver_api::models::TenantState;
use remote_storage::GenericRemoteStorage;
use tokio::task::JoinError;
use tracing::*;
use super::models::{
ConfigureFailpointsRequest, LocalTimelineInfo, RemoteTimelineInfo, StatusResponse,
TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
LocalTimelineInfo, RemoteTimelineInfo, StatusResponse, TenantConfigRequest,
TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::tenant::Timeline;
use crate::tenant_config::TenantConfOpt;
use crate::CheckpointConfig;
use crate::{config::PageServerConf, tenant_mgr};
use utils::{
auth::JwtAuth,
@@ -29,6 +29,12 @@ use utils::{
lsn::Lsn,
};
// Imports only used for testing APIs
#[cfg(feature = "testing")]
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
#[cfg(feature = "testing")]
use crate::CheckpointConfig;
struct State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
@@ -76,11 +82,12 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
// Helper function to construct a TimelineInfo struct for a timeline
fn build_timeline_info(
tenant_state: TenantState,
timeline: &Arc<Timeline>,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> anyhow::Result<TimelineInfo> {
let mut info = build_timeline_info_common(timeline)?;
let mut info = build_timeline_info_common(tenant_state, timeline)?;
if include_non_incremental_logical_size {
info.current_logical_size_non_incremental =
Some(timeline.get_current_logical_size_non_incremental(info.last_record_lsn)?);
@@ -92,7 +99,10 @@ fn build_timeline_info(
Ok(info)
}
fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<TimelineInfo> {
fn build_timeline_info_common(
tenant_state: TenantState,
timeline: &Arc<Timeline>,
) -> anyhow::Result<TimelineInfo> {
let last_record_lsn = timeline.get_last_record_lsn();
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
let guard = timeline.last_received_wal.lock().unwrap();
@@ -144,6 +154,10 @@ fn build_timeline_info_common(timeline: &Arc<Timeline>) -> anyhow::Result<Timeli
state,
// XXX bring back tracking of downloads per timeline, or, introduce
// an 'Attaching' state for the timeline and get rid of this field.
awaits_download: tenant_state == TenantState::Attaching,
// Duplicate some fields in 'local' and 'remote' fields, for backwards-compatibility
// with the control plane.
local: LocalTimelineInfo {
@@ -175,9 +189,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
.new_timeline_id
.unwrap_or_else(TimelineId::generate);
let tenant = tenant_mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
match tenant.create_timeline(
new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -188,7 +200,7 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
.await {
Ok(Some(new_timeline)) => {
// Created. Construct a TimelineInfo for it.
let timeline_info = build_timeline_info_common(&new_timeline)
let timeline_info = build_timeline_info_common(tenant.current_state(), &new_timeline)
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::CREATED, timeline_info)
}
@@ -205,29 +217,26 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let response_data = async {
let tenant = tenant_mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let timelines = tenant.list_timelines();
let _entered = info_span!("timeline_list", tenant = %tenant_id).entered();
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
let timeline_info = build_timeline_info(
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
.map_err(ApiError::InternalServerError)?;
let (tenant_state, timelines) = {
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
(tenant.current_state(), tenant.list_timelines())
};
response_data.push(timeline_info);
}
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
let timeline_info = build_timeline_info(
tenant_state,
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
)
.context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
.map_err(ApiError::InternalServerError)?;
Ok(response_data)
response_data.push(timeline_info);
}
.instrument(info_span!("timeline_list", tenant = %tenant_id))
.await?;
json_response(StatusCode::OK, response_data)
}
@@ -272,15 +281,20 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
check_permission(&request, Some(tenant_id))?;
let timeline_info = async {
let tenant = tenant_mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let (tenant_state, timeline) = tokio::task::spawn_blocking(move || {
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
Ok((
tenant.current_state(),
tenant.get_timeline(timeline_id, false),
))
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
let timeline = tenant
.get_timeline(timeline_id, false)
.map_err(ApiError::NotFound)?;
let timeline = timeline.map_err(ApiError::NotFound)?;
let timeline_info = build_timeline_info(
tenant_state,
&timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
@@ -308,7 +322,6 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
let timeline = tenant_mgr::get_tenant(tenant_id, true)
.await
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
.map_err(ApiError::NotFound)?;
let result = match timeline
@@ -334,13 +347,13 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
if let Some(remote_storage) = &state.remote_storage {
// FIXME: distinguish between "Tenant already exists" and other errors
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone())
tenant_mgr::attach_tenant(state.conf, tenant_id, remote_storage)
.instrument(info_span!("tenant_attach", tenant = %tenant_id))
.await
.map_err(ApiError::InternalServerError)?;
} else {
return Err(ApiError::BadRequest(anyhow!(
"attach_tenant is not possible because pageserver was configured without remote storage"
"attach_tenant is possible because pageserver was configured without remote storage"
)));
}
@@ -379,49 +392,23 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, ())
}
async fn tenant_load_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
tenant_mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone())
.instrument(info_span!("load", tenant = %tenant_id))
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::ACCEPTED, ())
}
async fn tenant_ignore_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
let conf = state.conf;
tenant_mgr::ignore_tenant(conf, tenant_id)
.instrument(info_span!("ignore_tenant", tenant = %tenant_id))
.await
// FIXME: Errors from `ignore_tenant` can be caused by both user and internal errors.
// Replace this with better handling once the error type permits it.
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permission(&request, None)?;
let response_data = tenant_mgr::list_tenants()
.instrument(info_span!("tenant_list"))
.await
.iter()
.map(|(id, state)| TenantInfo {
id: *id,
state: *state,
current_physical_size: None,
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
})
.collect::<Vec<TenantInfo>>();
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_list").entered();
tenant_mgr::list_tenants()
.iter()
.map(|(id, state)| TenantInfo {
id: *id,
state: *state,
current_physical_size: None,
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
})
.collect::<Vec<TenantInfo>>()
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, response_data)
}
@@ -430,8 +417,9 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant_info = async {
let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
let tenant_info = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_status_handler", tenant = %tenant_id).entered();
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
@@ -440,15 +428,17 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
}
let state = tenant.current_state();
Ok(TenantInfo {
let tenant_info = TenantInfo {
id: tenant_id,
state,
current_physical_size: Some(current_physical_size),
has_in_progress_downloads: Some(state.has_in_progress_downloads()),
})
}
.instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
};
Ok::<_, anyhow::Error>(tenant_info)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, tenant_info)
@@ -458,9 +448,7 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::InternalServerError)?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::InternalServerError)?;
// this can be a long operation; it is currently not backed by any request coalescing or similar
let inputs = tenant
@@ -577,19 +565,22 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
.map(TenantId::from)
.unwrap_or_else(TenantId::generate);
let state = get_state(&request);
let new_tenant = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
let state = get_state(&request);
let new_tenant = tenant_mgr::create_tenant(
state.conf,
tenant_conf,
target_tenant_id,
state.remote_storage.clone(),
)
.instrument(info_span!("tenant_create", tenant = ?target_tenant_id))
tenant_mgr::create_tenant(
state.conf,
tenant_conf,
target_tenant_id,
state.remote_storage.clone(),
)
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
// with better error handling once the type permits it
.map_err(ApiError::InternalServerError)
})
.await
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
// with better error handling once the type permits it
.map_err(ApiError::InternalServerError)?;
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
Ok(match new_tenant {
Some(tenant) => {
@@ -680,17 +671,22 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
);
}
let state = get_state(&request);
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
.instrument(info_span!("tenant_config", tenant = ?tenant_id))
.await
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
// Replace this `map_err` with better error handling once the type permits it
.map_err(ApiError::InternalServerError)?;
tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
let state = get_state(&request);
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
// Replace this `map_err` with better error handling once the type permits it
.map_err(ApiError::InternalServerError)
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
json_response(StatusCode::OK, ())
}
#[cfg(feature = "testing")]
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
if !fail::has_failpoints() {
return Err(ApiError::BadRequest(anyhow!(
@@ -724,6 +720,7 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
}
// Run GC immediately on given timeline.
#[cfg(feature = "testing")]
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -731,7 +728,7 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req).await?;
let wait_task_done = tenant_mgr::immediate_gc(tenant_id, timeline_id, gc_req)?;
let gc_result = wait_task_done
.await
.context("wait for gc task")
@@ -742,14 +739,13 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
}
// Run compaction immediately on given timeline.
#[cfg(feature = "testing")]
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
@@ -762,14 +758,13 @@ async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Bod
}
// Run checkpoint immediately on given timeline.
#[cfg(feature = "testing")]
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, true)
.await
.map_err(ApiError::NotFound)?;
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(ApiError::NotFound)?;
@@ -806,26 +801,22 @@ pub fn make_router(
}))
}
// A wrapper around a handler function that returns an error if the server
// was not configured with testing_mode enabled. This is used to gate API
// functions that should only be used in tests, never in production.
macro_rules! testing_api {
($handler_desc:literal, $handler:path $(,)?) => {{
use futures::FutureExt;
|req: Request<Body>| {
if conf.testing_mode {
$handler(req).left_future()
} else {
async {
Err(ApiError::BadRequest(anyhow!(concat!(
"Cannot ",
$handler_desc,
" because pageserver was configured without testing APIs",
))))
}
.right_future()
}
#[cfg(not(feature = "testing"))]
async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
Err(ApiError::BadRequest(anyhow!(concat!(
"Cannot ",
$handler_desc,
" because pageserver was compiled without testing APIs",
))))
}
#[cfg(feature = "testing")]
let handler = $handler;
#[cfg(not(feature = "testing"))]
let handler = cfg_disabled;
handler
}};
}
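For context, a call site of this macro in the router setup would look roughly like the following (a sketch; the actual wiring is elided from this hunk, and the route path shown is an assumption):

// Assumed call-site sketch: with the "testing" feature compiled in, this
// resolves to `failpoints_handler`; without it, to the `cfg_disabled` stub.
router = router.put(
    "/v1/failpoints",
    testing_api!("manage failpoints", failpoints_handler),
);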
@@ -847,8 +838,6 @@ pub fn make_router(
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
.post("/v1/tenant/:tenant_id/load", tenant_load_handler)
.post("/v1/tenant/:tenant_id/ignore", tenant_ignore_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_detail_handler,

View File

@@ -125,13 +125,6 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
/// A marker file to prevent pageserver from loading a certain tenant on restart.
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
/// `ignore` management API command, which expects the ignored tenant to be properly loaded
/// into pageserver's memory before being ignored.
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
pub fn is_temporary(path: &Path) -> bool {
match path.file_name() {
Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
@@ -148,28 +141,6 @@ pub fn is_uninit_mark(path: &Path) -> bool {
}
}
///
/// Wrapper around fail::fail_point! macro that returns quickly if testing_mode was
/// disabled in the pageserver config. Also enabled in unit tests.
///
/// fail::fail_point! is fairly quick, but it does acquire an RwLock and perform a HashMap
/// lookup. This macro is hopefully cheap enough that we don't need to worry about the
/// overhead even in production, and even if the macro is used in hot spots. (This check
/// compiles to two cmp instructions; get_unchecked() would shrink it to one.)
///
#[macro_export]
macro_rules! fail_point {
($($name:expr),*) => {{
if cfg!(test) || *$crate::TESTING_MODE.get().expect("testing_mode not initialized") {
fail::fail_point!($($name), *)
}
}};
}
/// This is set early in the pageserver startup, from the "testing_mode" setting in the
/// config file.
pub static TESTING_MODE: once_cell::sync::OnceCell<bool> = once_cell::sync::OnceCell::new();
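For reference, the wrapper being removed here was wired up in two places; a minimal sketch (the startup line is an assumption based on the doc comment above, the call site matches usage elsewhere in this diff):

// Startup (assumed, per the doc comment): record the config flag once.
pageserver::TESTING_MODE
    .set(conf.testing_mode)
    .expect("TESTING_MODE initialized twice");

// Call site: expands to fail::fail_point! only when testing is enabled.
crate::fail_point!("before-delete-layer", |_| {
    anyhow::bail!("failpoint before-delete-layer")
});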
#[cfg(test)]
mod backoff_defaults_tests {
use super::*;

View File

@@ -941,7 +941,7 @@ impl postgres_backend_async::Handler for PageServerHandler {
/// ensures that queries don't fail immediately after pageserver startup, because
/// all tenants are still loading.
async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> Result<Arc<Tenant>> {
let tenant = tenant_mgr::get_tenant(tenant_id, false).await?;
let tenant = tenant_mgr::get_tenant(tenant_id, false)?;
match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await {
Ok(wait_result) => wait_result
// no .context(), the error message is good enough and some tests depend on it

View File

@@ -202,9 +202,9 @@ use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Mutex};
use anyhow::ensure;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use remote_storage::{DownloadError, GenericRemoteStorage};
use tokio::runtime::Runtime;
use tracing::{info, warn};
use tracing::{error, info, warn};
use tracing::{info_span, Instrument};
use utils::lsn::Lsn;
@@ -217,7 +217,7 @@ use crate::metrics::RemoteOpKind;
use crate::metrics::REMOTE_UPLOAD_QUEUE_UNFINISHED_TASKS;
use crate::{
config::PageServerConf,
storage_sync::index::LayerFileMetadata,
storage_sync::index::{LayerFileMetadata, RelativePath},
task_mgr,
task_mgr::TaskKind,
task_mgr::BACKGROUND_RUNTIME,
@@ -287,7 +287,7 @@ struct UploadQueueInitialized {
/// All layer files stored in the remote storage, taking into account all
/// in-progress and queued operations
latest_files: HashMap<RemotePath, LayerFileMetadata>,
latest_files: HashMap<RelativePath, LayerFileMetadata>,
/// Metadata stored in the remote storage, taking into account all
/// in-progress and queued operations.
@@ -337,18 +337,18 @@ impl UploadQueue {
let state = UploadQueueInitialized {
// As described in the doc comment, it's ok for `latest_files` and `latest_metadata` to be ahead.
latest_files: HashMap::new(),
latest_files: Default::default(),
latest_metadata: metadata.clone(),
// We haven't uploaded anything yet, so, `last_uploaded_consistent_lsn` must be 0 to prevent
// safekeepers from garbage-collecting anything.
last_uploaded_consistent_lsn: Lsn(0),
// what follows are boring default initializations
task_counter: 0,
task_counter: Default::default(),
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
inprogress_tasks: Default::default(),
queued_operations: Default::default(),
};
*self = UploadQueue::Initialized(state);
@@ -357,10 +357,6 @@ impl UploadQueue {
fn initialize_with_current_remote_index_part(
&mut self,
conf: &'static PageServerConf,
tenant_id: TenantId,
timeline_id: TimelineId,
index_part: &IndexPart,
) -> anyhow::Result<&mut UploadQueueInitialized> {
match self {
@@ -370,19 +366,14 @@ impl UploadQueue {
}
}
let mut files = HashMap::with_capacity(index_part.timeline_layers.len());
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
for timeline_name in &index_part.timeline_layers {
let local_path = timeline_path.join(timeline_name);
let remote_timeline_path = conf.remote_path(&local_path).expect(
"Remote timeline path and local timeline path were constructed form the same conf",
);
let mut files = HashMap::new();
for path in &index_part.timeline_layers {
let layer_metadata = index_part
.layer_metadata
.get(timeline_name)
.get(path)
.map(LayerFileMetadata::from)
.unwrap_or(LayerFileMetadata::MISSING);
files.insert(remote_timeline_path, layer_metadata);
files.insert(path.clone(), layer_metadata);
}
let index_part_metadata = index_part.parse_metadata()?;
@@ -400,8 +391,8 @@ impl UploadQueue {
num_inprogress_layer_uploads: 0,
num_inprogress_metadata_uploads: 0,
num_inprogress_deletions: 0,
inprogress_tasks: HashMap::new(),
queued_operations: VecDeque::new(),
inprogress_tasks: Default::default(),
queued_operations: Default::default(),
};
*self = UploadQueue::Initialized(state);
@@ -465,12 +456,7 @@ impl RemoteTimelineClient {
/// The given `index_part` must be the one on the remote.
pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
let mut upload_queue = self.upload_queue.lock().unwrap();
upload_queue.initialize_with_current_remote_index_part(
self.conf,
self.tenant_id,
self.timeline_id,
index_part,
)?;
upload_queue.initialize_with_current_remote_index_part(index_part)?;
Ok(())
}
@@ -524,13 +510,15 @@ impl RemoteTimelineClient {
/// On success, returns the size of the downloaded file.
pub async fn download_layer_file(
&self,
remote_path: &RemotePath,
path: &RelativePath,
layer_metadata: &LayerFileMetadata,
) -> anyhow::Result<u64> {
let downloaded_size = download::download_layer_file(
self.conf,
&self.storage_impl,
remote_path,
self.tenant_id,
self.timeline_id,
path,
layer_metadata,
)
.measure_remote_op(
@@ -548,13 +536,13 @@ impl RemoteTimelineClient {
let new_metadata = LayerFileMetadata::new(downloaded_size);
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
if let Some(upgraded) = upload_queue.latest_files.get_mut(remote_path) {
if let Some(upgraded) = upload_queue.latest_files.get_mut(path) {
upgraded.merge(&new_metadata);
} else {
// The file should exist, since we just downloaded it.
warn!(
"downloaded file {:?} not found in local copy of the index file",
remote_path
path
);
}
}
@@ -624,9 +612,14 @@ impl RemoteTimelineClient {
"file size not initialized in metadata"
);
let relative_path = RelativePath::from_local_path(
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
path,
)?;
upload_queue
.latest_files
.insert(self.conf.remote_path(path)?, layer_metadata.clone());
.insert(relative_path, layer_metadata.clone());
let op = UploadOp::UploadLayer(PathBuf::from(path), layer_metadata.clone());
self.update_upload_queue_unfinished_metric(1, &op);
@@ -648,10 +641,13 @@ impl RemoteTimelineClient {
let mut guard = self.upload_queue.lock().unwrap();
let upload_queue = guard.initialized_mut()?;
// Convert the paths into RemotePaths, and gather other information we need.
let mut remote_paths = Vec::with_capacity(paths.len());
// Convert the paths into RelativePaths, and gather other information we need.
let mut relative_paths = Vec::with_capacity(paths.len());
for path in paths {
remote_paths.push(self.conf.remote_path(path)?);
relative_paths.push(RelativePath::from_local_path(
&self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
path,
)?);
}
// Deleting layers doesn't affect the values stored in TimelineMetadata,
@@ -667,8 +663,8 @@ impl RemoteTimelineClient {
// from latest_files, but not yet scheduled for deletion. Use a closure
// to syntactically forbid ? or bail! calls here.
let no_bail_here = || {
for remote_path in remote_paths {
upload_queue.latest_files.remove(&remote_path);
for relative_path in relative_paths {
upload_queue.latest_files.remove(&relative_path);
}
let index_part = IndexPart::new(
@@ -842,19 +838,14 @@ impl RemoteTimelineClient {
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref path, ref layer_metadata) => {
upload::upload_timeline_layer(
self.conf,
&self.storage_impl,
path,
layer_metadata,
)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
)
.await
upload::upload_timeline_layer(&self.storage_impl, path, layer_metadata)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
)
.await
}
UploadOp::UploadMetadata(ref index_part, _lsn) => {
upload::upload_index_part(
@@ -873,7 +864,7 @@ impl RemoteTimelineClient {
.await
}
UploadOp::Delete(metric_file_kind, ref path) => {
delete::delete_layer(self.conf, &self.storage_impl, path)
delete::delete_layer(&self.storage_impl, path)
.measure_remote_op(
self.tenant_id,
self.timeline_id,
@@ -897,20 +888,10 @@ impl RemoteTimelineClient {
Err(e) => {
let retries = task.retries.fetch_add(1, Ordering::SeqCst);
// uploads may fail due to rate limits (IAM, S3) or spurious network and external errors.
// Such issues are relatively regular, so don't use WARN or ERROR, to avoid alerting
// people and tests until the retries are definitely causing delays.
if retries < 3 {
info!(
"failed to perform remote task {}, will retry (attempt {}): {:?}",
task.op, retries, e
);
} else {
warn!(
"failed to perform remote task {}, will retry (attempt {}): {:?}",
task.op, retries, e
);
}
error!(
"failed to perform remote task {}, will retry (attempt {}): {:?}",
task.op, retries, e
);
// sleep until it's time to retry, or we're cancelled
tokio::select! {
@@ -1102,11 +1083,15 @@ mod tests {
TimelineMetadata::from_bytes(&metadata.to_bytes().unwrap()).unwrap()
}
fn assert_file_list(a: &HashSet<String>, b: &[&str]) {
let mut avec: Vec<&str> = a.iter().map(|a| a.as_str()).collect();
fn assert_file_list(a: &HashSet<RelativePath>, b: &[&str]) {
let empty_base = PathBuf::from("");
let mut avec: Vec<String> = a
.iter()
.map(|path| path.to_local_path(&empty_base).to_string_lossy().into())
.collect();
avec.sort();
let mut bvec = b.to_vec();
let mut bvec = b.to_owned();
bvec.sort_unstable();
assert_eq!(avec, bvec);
@@ -1174,7 +1159,8 @@ mod tests {
println!("workdir: {}", harness.conf.workdir.display());
let storage_impl = GenericRemoteStorage::from_config(&storage_config)?;
let storage_impl =
GenericRemoteStorage::from_config(harness.conf.workdir.clone(), &storage_config)?;
let client = Arc::new(RemoteTimelineClient {
conf: harness.conf,
runtime,

View File

@@ -5,24 +5,34 @@ use tracing::debug;
use remote_storage::GenericRemoteStorage;
use crate::config::PageServerConf;
pub(super) async fn delete_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
local_layer_path: &'a Path,
pub(super) async fn delete_layer(
storage: &GenericRemoteStorage,
local_layer_path: &Path,
) -> anyhow::Result<()> {
crate::fail_point!("before-delete-layer", |_| {
fail::fail_point!("before-delete-layer", |_| {
anyhow::bail!("failpoint before-delete-layer")
});
debug!("Deleting layer from remote storage: {local_layer_path:?}",);
debug!(
"Deleting layer from remote storage: '{}'",
local_layer_path.display()
);
let path_to_delete = conf.remote_path(local_layer_path)?;
let storage_path = storage
.remote_object_id(local_layer_path)
.with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
local_layer_path.display()
)
})?;
// XXX: If the deletion fails because the object already didn't exist,
// it would be good to just issue a warning but consider it success.
// https://github.com/neondatabase/neon/issues/2934
storage.delete(&path_to_delete).await.with_context(|| {
format!("Failed to delete remote layer from storage at {path_to_delete:?}")
storage.delete(&storage_path).await.with_context(|| {
format!(
"Failed to delete remote layer from storage at '{:?}'",
storage_path
)
})
}
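A hedged sketch of what the XXX above asks for; `is_not_found` is hypothetical, since the current error type does not expose this distinction (which is what the linked issue tracks):

// Hypothetical handling per the XXX above; `is_not_found` does not exist yet.
match storage.delete(&storage_path).await {
    Ok(()) => Ok(()),
    Err(e) if is_not_found(&e) => {
        tracing::warn!("remote layer at {storage_path:?} was already gone");
        Ok(())
    }
    Err(e) => Err(e).with_context(|| {
        format!("Failed to delete remote layer from storage at '{storage_path:?}'")
    }),
}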

View File

@@ -10,11 +10,12 @@ use tracing::debug;
use crate::config::PageServerConf;
use crate::storage_sync::index::LayerFileMetadata;
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::{TenantId, TimelineId};
use super::index::IndexPart;
use super::RelativePath;
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
fs::File::open(path).await?.sync_all().await
@@ -28,10 +29,21 @@ async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Er
pub async fn download_layer_file<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
remote_path: &'a RemotePath,
tenant_id: TenantId,
timeline_id: TimelineId,
path: &'a RelativePath,
layer_metadata: &'a LayerFileMetadata,
) -> anyhow::Result<u64> {
let local_path = conf.local_path(remote_path);
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
let local_path = path.to_local_path(&timeline_path);
let layer_storage_path = storage.remote_object_id(&local_path).with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
local_path.display()
)
})?;
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
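The enumerated steps are cut off here; as a rough sketch, the standard durable-rename pattern the comment refers to looks like this (assumed steps, using the `fsync_path` helper defined above, not the literal body of this function):

// Assumed durable-rename sketch:
// 1. download into a temporary file,
// 2. fsync the temporary file,
// 3. rename it over the final path,
// 4. fsync the parent directory, so the rename itself survives a crash.
fsync_path(&temp_file_path).await?;
fs::rename(&temp_file_path, &local_path).await?;
fsync_path(local_path.parent().expect("layer path has a parent")).await?;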
@@ -52,13 +64,18 @@ pub async fn download_layer_file<'a>(
temp_file_path.display()
)
})?;
let mut download = storage.download(remote_path).await.with_context(|| {
format!(
"Failed to open a download stream for layer with remote storage path '{remote_path:?}'"
)
})?;
let mut download = storage
.download(&layer_storage_path)
.await
.with_context(|| {
format!(
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
)
})?;
let bytes_amount = tokio::io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
format!("Failed to download layer with remote storage path '{remote_path:?}' into file {temp_file_path:?}")
format!(
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
@@ -97,7 +114,7 @@ pub async fn download_layer_file<'a>(
})?;
drop(destination_file);
crate::fail_point!("remote-storage-download-pre-rename", |_| {
fail::fail_point!("remote-storage-download-pre-rename", |_| {
bail!("remote-storage-download-pre-rename failpoint triggered")
});
@@ -134,7 +151,12 @@ pub async fn list_remote_timelines<'a>(
tenant_id: TenantId,
) -> anyhow::Result<Vec<(TimelineId, IndexPart)>> {
let tenant_path = conf.timelines_path(&tenant_id);
let tenant_storage_path = conf.remote_path(&tenant_path)?;
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
format!(
"Failed to get tenant storage path for local path '{}'",
tenant_path.display()
)
})?;
let timelines = storage
.list_prefixes(Some(&tenant_storage_path))
@@ -196,8 +218,14 @@ pub async fn download_index_part(
let index_part_path = conf
.metadata_path(timeline_id, tenant_id)
.with_file_name(IndexPart::FILE_NAME);
let part_storage_path = conf
.remote_path(&index_part_path)
let part_storage_path = storage
.remote_object_id(&index_part_path)
.with_context(|| {
format!(
"Failed to get the index part storage path for local path '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::BadInput)?;
let mut index_part_download = storage.download(&part_storage_path).await?;
@@ -208,12 +236,20 @@ pub async fn download_index_part(
&mut index_part_bytes,
)
.await
.with_context(|| format!("Failed to download an index part into file {index_part_path:?}"))
.with_context(|| {
format!(
"Failed to download an index part into file '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::Other)?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
.with_context(|| {
format!("Failed to deserialize index part file into file {index_part_path:?}")
format!(
"Failed to deserialize index part file into file '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::Other)?;

View File

@@ -2,9 +2,12 @@
//! Able to restore itself from the storage index parts, which are located in every timeline's remote directory and contain all data about
//! remote timeline layers and their metadata.
use std::collections::{HashMap, HashSet};
use std::{
collections::{HashMap, HashSet},
path::{Path, PathBuf},
};
use remote_storage::RemotePath;
use anyhow::{Context, Ok};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
@@ -12,6 +15,33 @@ use crate::tenant::metadata::TimelineMetadata;
use utils::lsn::Lsn;
/// A part of the filesystem path, that needs a root to become a path again.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct RelativePath(String);
impl RelativePath {
/// Attempts to strip off the base from path, producing a relative path or an error.
pub fn from_local_path(timeline_path: &Path, path: &Path) -> anyhow::Result<RelativePath> {
let relative = path.strip_prefix(timeline_path).with_context(|| {
format!(
"path '{}' is not relative to base '{}'",
path.display(),
timeline_path.display()
)
})?;
Ok(Self::from_filename(relative))
}
pub fn from_filename(path: &Path) -> RelativePath {
RelativePath(path.to_string_lossy().to_string())
}
pub fn to_local_path(&self, timeline_path: &Path) -> PathBuf {
timeline_path.join(&self.0)
}
}
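A quick round-trip sketch of how these helpers compose (illustrative paths, not from this patch):

let timeline_path = PathBuf::from("tenants/some_tenant/timelines/some_timeline");
let local = timeline_path.join("some_layer_file");
let rel = RelativePath::from_local_path(&timeline_path, &local).unwrap();
assert_eq!(rel, RelativePath::from_filename(Path::new("some_layer_file")));
// `to_local_path` restores the original path given the same base.
assert_eq!(rel.to_local_path(&timeline_path), local);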
/// Metadata gathered for each of the layer files.
///
/// Fields have to be `Option`s because remote [`IndexPart`]s can be from a different version, which
@@ -67,22 +97,21 @@ pub struct IndexPart {
#[serde(default)]
version: usize,
/// Layer names, which are stored on the remote storage.
/// Each of the layers present on remote storage.
///
/// Additional metadata might exist in `layer_metadata`.
pub timeline_layers: HashSet<String>,
pub timeline_layers: HashSet<RelativePath>,
/// FIXME: unused field. This should be removed, but that changes the on-disk format,
/// so we need to make sure we're backwards-` (and maybe forwards-) compatible
/// First pass is to move it to Optional and the next would be its removal
missing_layers: Option<HashSet<String>>,
/// so we need to make sure we're backwards- (and maybe forwards-) compatible
missing_layers: HashSet<RelativePath>,
/// Per layer file name metadata, which can be present for a present or missing layer file.
/// Per layer file metadata, which can be present for a present or missing layer file.
///
/// Older versions of `IndexPart` will not have this property or have only a part of metadata
/// that latest version stores.
#[serde(default)]
pub layer_metadata: HashMap<String, IndexLayerMetadata>,
pub layer_metadata: HashMap<RelativePath, IndexLayerMetadata>,
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated here for convenience.
@@ -100,29 +129,23 @@ impl IndexPart {
pub const FILE_NAME: &'static str = "index_part.json";
pub fn new(
layers_and_metadata: HashMap<RemotePath, LayerFileMetadata>,
layers_and_metadata: HashMap<RelativePath, LayerFileMetadata>,
disk_consistent_lsn: Lsn,
metadata_bytes: Vec<u8>,
) -> Self {
let mut timeline_layers = HashSet::with_capacity(layers_and_metadata.len());
let mut layer_metadata = HashMap::with_capacity(layers_and_metadata.len());
let mut timeline_layers = HashSet::new();
let mut layer_metadata = HashMap::new();
for (remote_path, metadata) in &layers_and_metadata {
let metadata = IndexLayerMetadata::from(metadata);
match remote_path.object_name() {
Some(layer_name) => {
timeline_layers.insert(layer_name.to_owned());
layer_metadata.insert(layer_name.to_owned(), metadata);
}
// TODO move this to the type level: we know that every layer entry has a name
None => panic!("Layer {remote_path:?} has no file name, skipping"),
}
}
separate_paths_and_metadata(
&layers_and_metadata,
&mut timeline_layers,
&mut layer_metadata,
);
Self {
version: Self::LATEST_VERSION,
timeline_layers,
missing_layers: Some(HashSet::new()),
missing_layers: HashSet::new(),
layer_metadata,
disk_consistent_lsn,
metadata_bytes,
@@ -148,6 +171,18 @@ impl From<&'_ LayerFileMetadata> for IndexLayerMetadata {
}
}
fn separate_paths_and_metadata(
input: &HashMap<RelativePath, LayerFileMetadata>,
output: &mut HashSet<RelativePath>,
layer_metadata: &mut HashMap<RelativePath, IndexLayerMetadata>,
) {
for (path, metadata) in input {
let metadata = IndexLayerMetadata::from(metadata);
layer_metadata.insert(path.clone(), metadata);
output.insert(path.clone());
}
}
#[cfg(test)]
mod tests {
use super::*;
@@ -163,8 +198,8 @@ mod tests {
let expected = IndexPart {
version: 0,
timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]),
missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])),
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
layer_metadata: HashMap::default(),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
@@ -191,13 +226,13 @@ mod tests {
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::from([String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9")]),
missing_layers: Some(HashSet::from([String::from("not_a_real_layer_but_adding_coverage")])),
timeline_layers: [RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned())].into_iter().collect(),
missing_layers: [RelativePath("not_a_real_layer_but_adding_coverage".to_owned())].into_iter().collect(),
layer_metadata: HashMap::from([
(String::from("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"), IndexLayerMetadata {
(RelativePath("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_owned()), IndexLayerMetadata {
file_size: Some(25600000),
}),
(String::from("not_a_real_layer_but_adding_coverage"), IndexLayerMetadata {
(RelativePath("not_a_real_layer_but_adding_coverage".to_owned()), IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: Some(9007199254741001),
@@ -210,46 +245,4 @@ mod tests {
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
#[test]
fn v1_indexpart_is_parsed_with_optional_missing_layers() {
let example = r#"{
"version":1,
"timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"],
"layer_metadata":{
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 },
"not_a_real_layer_but_adding_coverage": { "file_size": 9007199254741001 }
},
"disk_consistent_lsn":"0/16960E8",
"metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
}"#;
let expected = IndexPart {
// note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead?
version: 1,
timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string()]),
layer_metadata: HashMap::from([
(
"000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".to_string(),
IndexLayerMetadata {
file_size: Some(25600000),
}
),
(
"not_a_real_layer_but_adding_coverage".to_string(),
IndexLayerMetadata {
// serde_json should always parse this but this might be a double with jq for
// example.
file_size: Some(9007199254741001),
}
)
]),
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
missing_layers: None,
};
let part = serde_json::from_str::<IndexPart>(example).unwrap();
assert_eq!(part, expected);
}
}

View File

@@ -1,12 +1,12 @@
//! Helper functions to upload files to remote storage with a RemoteStorage
use anyhow::{bail, Context};
use fail::fail_point;
use std::path::Path;
use tokio::fs;
use super::index::IndexPart;
use crate::config::PageServerConf;
use crate::fail_point;
use crate::storage_sync::LayerFileMetadata;
use remote_storage::GenericRemoteStorage;
use utils::id::{TenantId, TimelineId};
@@ -30,9 +30,12 @@ pub(super) async fn upload_index_part<'a>(
let index_part_path = conf
.metadata_path(timeline_id, tenant_id)
.with_file_name(IndexPart::FILE_NAME);
let storage_path = conf.remote_path(&index_part_path)?;
storage
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
.upload_storage_object(
Box::new(index_part_bytes),
index_part_size,
&index_part_path,
)
.await
.with_context(|| format!("Failed to upload index part for '{tenant_id} / {timeline_id}'"))
}
@@ -41,26 +44,36 @@ pub(super) async fn upload_index_part<'a>(
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
///
/// On an error, bumps the retries count and reschedules the entire task.
pub(super) async fn upload_timeline_layer<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
source_path: &'a Path,
known_metadata: &'a LayerFileMetadata,
pub(super) async fn upload_timeline_layer(
storage: &GenericRemoteStorage,
source_path: &Path,
known_metadata: &LayerFileMetadata,
) -> anyhow::Result<()> {
fail_point!("before-upload-layer", |_| {
bail!("failpoint before-upload-layer")
});
let storage_path = conf.remote_path(source_path)?;
let storage_path = storage.remote_object_id(source_path).with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
source_path.display()
)
})?;
let source_file = fs::File::open(&source_path)
.await
.with_context(|| format!("Failed to open a source file for layer {source_path:?}"))?;
let source_file = fs::File::open(&source_path).await.with_context(|| {
format!(
"Failed to open a source file for layer '{}'",
source_path.display()
)
})?;
let fs_size = source_file
.metadata()
.await
.with_context(|| {
format!("Failed to get the source file metadata for layer {source_path:?}")
format!(
"Failed to get the source file metadata for layer '{}'",
source_path.display()
)
})?
.len();

View File

@@ -46,7 +46,6 @@ use std::time::{Duration, Instant};
use self::metadata::TimelineMetadata;
use crate::config::PageServerConf;
use crate::fail_point;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
@@ -81,6 +80,8 @@ pub mod filename;
mod image_layer;
mod inmemory_layer;
pub mod layer_map;
pub mod bst_layer_map;
pub mod segment_tree_layer_map;
pub mod metadata;
mod par_fsync;
@@ -256,7 +257,7 @@ impl UninitializedTimeline<'_> {
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
raw_timeline.maybe_spawn_flush_loop();
fail_point!("before-checkpoint-new-timeline", |_| {
fail::fail_point!("before-checkpoint-new-timeline", |_| {
bail!("failpoint before-checkpoint-new-timeline");
});
@@ -572,7 +573,7 @@ impl Tenant {
pub fn spawn_attach(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: GenericRemoteStorage,
remote_storage: &GenericRemoteStorage,
) -> Arc<Tenant> {
// XXX: Attach should provide the config, especially during tenant migration.
// See https://github.com/neondatabase/neon/issues/1555
@@ -585,7 +586,7 @@ impl Tenant {
tenant_conf,
wal_redo_manager,
tenant_id,
Some(remote_storage),
Some(remote_storage.clone()),
));
// Do all the hard work in the background
@@ -783,7 +784,7 @@ impl Tenant {
let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
Ok(conf) => conf,
Err(e) => {
error!("load tenant config failed: {:?}", e);
error!("load tenant config failed: {}", e);
return Tenant::create_broken_tenant(conf, tenant_id);
}
};
@@ -2099,7 +2100,7 @@ impl Tenant {
// Thus spawn flush loop manually and skip flush_loop setup in initialize_with_lock
unfinished_timeline.maybe_spawn_flush_loop();
fail_point!("before-checkpoint-new-timeline", |_| {
fail::fail_point!("before-checkpoint-new-timeline", |_| {
anyhow::bail!("failpoint before-checkpoint-new-timeline");
});
@@ -2193,7 +2194,7 @@ impl Tenant {
.context("Failed to create timeline data structure")?;
crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
fail_point!("after-timeline-uninit-mark-creation", |_| {
fail::fail_point!("after-timeline-uninit-mark-creation", |_| {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});
@@ -2382,7 +2383,7 @@ fn try_create_target_tenant_dir(
temporary_tenant_timelines_dir.display()
)
})?;
fail_point!("tenant-creation-before-tmp-rename", |_| {
fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
});
@@ -2670,7 +2671,7 @@ pub mod harness {
&self,
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
base_img: Option<Bytes>,
records: Vec<(Lsn, NeonWalRecord)>,
_pg_version: u32,
) -> Result<Bytes, WalRedoError> {

View File

@@ -0,0 +1,111 @@
use std::collections::BTreeMap;
// TODO the `im` crate has 20x more downloads and also has a persistent
// BTree. See if it's better. (What's "fully persistent"?)
use rpds::RedBlackTreeMap;
/// Layer map implemented using a persistent binary search tree.
/// This implementation is only good enough to run benchmarks,
/// so it omits details that benchmarks don't need. Values are String for now.
pub struct BSTLM {
/// Mapping key to the latest layer (if any) until the next key
head: RedBlackTreeMap<i128, Option<String>>,
/// All previous states of `self.head`
historic: BTreeMap<u64, RedBlackTreeMap<i128, Option<String>>>,
}
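To make the `head` encoding concrete, here is a small worked example derived from the `insert` logic below:

// Worked example of the `head` encoding (derived from `insert` below):
//   after insert(0, 5, lsn1, "A"): head = {0: Some("A"), 5: None}
//   after insert(3, 9, lsn2, "B"): head = {0: Some("A"), 3: Some("B"), 9: None}
// A query for key k at some lsn picks the snapshot in `historic` and reads
// the value at the greatest entry whose key is <= k.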
impl std::fmt::Debug for BSTLM {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let head_vec: Vec<_> = self.head.iter().collect();
write!(f, "BSTLM: head: {:?}", head_vec)
}
}
impl BSTLM {
pub fn new() -> Self {
BSTLM {
head: RedBlackTreeMap::default(),
historic: BTreeMap::default(),
}
}
pub fn insert(self: &mut Self, key_begin: i128, key_end: i128, lsn: u64, value: String) {
// TODO check for off-by-one errors
// It's a persistent map, not a retroactive one
if let Some(last_entry) = self.historic.iter().rev().next() {
let last_lsn = last_entry.0;
if lsn == *last_lsn {
// TODO there are edge cases to take care of
}
if lsn < *last_lsn {
todo!("smaller lsn not implemented yet")
}
}
// NOTE The order of the following lines is important!!
// Preserve information after right endpoint
let value_at_end = match self.head.range(0..key_end).last() {
Some((_, Some(v))) => Some(v.clone()),
Some((_, None)) => None,
None => None,
};
self.head.insert_mut(key_end, value_at_end);
// Insert the left endpoint
self.head.insert_mut(key_begin, Some(value.clone()));
// Cover the inside of the interval
let to_remove: Vec<_> = self.head.range((key_begin + 1)..key_end)
.map(|(k, _)| k.clone())
.collect();
for key in to_remove {
self.head.remove_mut(&key);
}
// Remember history. Clone is O(1)
self.historic.insert(lsn, self.head.clone());
}
pub fn query(self: &Self, key: i128, lsn: u64) -> Option<&String> {
// TODO check for off-by-one errors
let version = self.historic.range(0..=lsn).rev().next()?.1;
version.range(0..=key).rev().next()?.1.as_ref()
}
// TODO Add API for delta layers with lsn range.
// The easy solution is to only store images, and then from every
// image point to deltas on top of it. There might be something
// nicer but we have this solution as backup.
}
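A hedged sketch of the backup plan from the TODO above (hypothetical types, not part of this patch): keep only images in the persistent map and hang the newer deltas off each image entry.

// Hypothetical sketch of the TODO's backup plan; none of this exists yet.
struct ImageEntry {
    /// The image layer covering this key range.
    image: String,
    /// Delta layers written on top of the image, in LSN order.
    deltas_above: Vec<String>,
}
// `head` would then map key -> Option<ImageEntry>, and a query walks
// `deltas_above` after locating the image instead of doing a second lookup.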
#[test]
fn test_bstlm() {
let mut bstlm = BSTLM::new();
bstlm.insert(0, 5, 100, "Layer 1".to_string());
dbg!(&bstlm);
bstlm.insert(3, 9, 110, "Layer 2".to_string());
dbg!(&bstlm);
bstlm.insert(5, 6, 120, "Layer 3".to_string());
dbg!(&bstlm);
// After Layer 1 insertion
assert_eq!(bstlm.query(1, 105), Some(&"Layer 1".to_string()));
assert_eq!(bstlm.query(4, 105), Some(&"Layer 1".to_string()));
// After Layer 2 insertion
assert_eq!(bstlm.query(4, 115), Some(&"Layer 2".to_string()));
assert_eq!(bstlm.query(8, 115), Some(&"Layer 2".to_string()));
assert_eq!(bstlm.query(11, 115), None);
// After Layer 3 insertion
assert_eq!(bstlm.query(4, 125), Some(&"Layer 2".to_string()));
assert_eq!(bstlm.query(5, 125), Some(&"Layer 3".to_string()));
assert_eq!(bstlm.query(7, 125), Some(&"Layer 2".to_string()));
}

View File

@@ -0,0 +1,346 @@
use persistent_range_query::naive::{IndexableKey, NaiveVecStorage};
use persistent_range_query::ops::SameElementsInitializer;
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree, PersistentSegmentTreeVersion};
use persistent_range_query::{
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
VecReadableVersion,
};
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::ops::Range;
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
struct PageIndex(u32);
type LayerId = String;
impl IndexableKey for PageIndex {
fn index(all_keys: &Range<Self>, key: &Self) -> usize {
(key.0 as usize) - (all_keys.start.0 as usize)
}
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
PageIndex(all_keys.start.0 + index as u32)..PageIndex(all_keys.start.0 + index as u32 + 1)
}
}
impl MidpointableKey for PageIndex {
fn midpoint(range: &Range<Self>) -> Self {
PageIndex(range.start.0 + (range.end.0 - range.start.0) / 2)
}
}
#[derive(Clone, Debug, Eq, PartialEq)]
struct LayerMapInformation {
// These two only make sense for a range of length 1.
last_layer: Option<LayerId>,
last_image_layer: Option<LayerId>,
// Works for all ranges.
max_delta_layers: (usize, Range<PageIndex>),
}
impl LayerMapInformation {
fn last_layers(&self) -> (&Option<LayerId>, &Option<LayerId>) {
(&self.last_layer, &self.last_image_layer)
}
fn max_delta_layers(&self) -> &(usize, Range<PageIndex>) {
&self.max_delta_layers
}
}
/// Merge two page ranges when they are adjacent; if they are not mergeable,
/// prefer the non-empty one (falling back to the left one).
fn merge_ranges(left: &Range<PageIndex>, right: &Range<PageIndex>) -> Range<PageIndex> {
if left.is_empty() {
right.clone()
} else if right.is_empty() {
left.clone()
} else if left.end == right.start {
left.start..right.end
} else {
left.clone()
}
}
impl RangeQueryResult<PageIndex> for LayerMapInformation {
fn new_for_empty_range() -> Self {
LayerMapInformation {
last_layer: None,
last_image_layer: None,
max_delta_layers: (0, PageIndex(0)..PageIndex(0)),
}
}
fn combine(
left: &Self,
_left_range: &Range<PageIndex>,
right: &Self,
_right_range: &Range<PageIndex>,
) -> Self {
// Note that either range may be empty.
LayerMapInformation {
last_layer: left
.last_layer
.as_ref()
.or_else(|| right.last_layer.as_ref())
.cloned(),
last_image_layer: left
.last_image_layer
.as_ref()
.or_else(|| right.last_image_layer.as_ref())
.cloned(),
max_delta_layers: match left.max_delta_layers.0.cmp(&right.max_delta_layers.0) {
Ordering::Less => right.max_delta_layers.clone(),
Ordering::Greater => left.max_delta_layers.clone(),
Ordering::Equal => (
left.max_delta_layers.0,
merge_ranges(&left.max_delta_layers.1, &right.max_delta_layers.1),
),
},
}
}
fn add(
left: &mut Self,
left_range: &Range<PageIndex>,
right: &Self,
right_range: &Range<PageIndex>,
) {
*left = Self::combine(&left, left_range, right, right_range);
}
}
#[derive(Clone, Debug)]
struct AddDeltaLayers {
last_layer: LayerId,
count: usize,
}
#[derive(Clone, Debug)]
struct LayerMapModification {
add_image_layer: Option<LayerId>,
add_delta_layers: Option<AddDeltaLayers>,
}
impl LayerMapModification {
fn add_image_layer(layer: impl Into<LayerId>) -> Self {
LayerMapModification {
add_image_layer: Some(layer.into()),
add_delta_layers: None,
}
}
fn add_delta_layer(layer: impl Into<LayerId>) -> Self {
LayerMapModification {
add_image_layer: None,
add_delta_layers: Some(AddDeltaLayers {
last_layer: layer.into(),
count: 1,
}),
}
}
}
impl RangeModification<PageIndex> for LayerMapModification {
type Result = LayerMapInformation;
fn no_op() -> Self {
LayerMapModification {
add_image_layer: None,
add_delta_layers: None,
}
}
fn is_no_op(&self) -> bool {
self.add_image_layer.is_none() && self.add_delta_layers.is_none()
}
fn is_reinitialization(&self) -> bool {
self.add_image_layer.is_some()
}
fn apply(&self, result: &mut Self::Result, range: &Range<PageIndex>) {
if let Some(layer) = &self.add_image_layer {
result.last_layer = Some(layer.clone());
result.last_image_layer = Some(layer.clone());
result.max_delta_layers = (0, range.clone());
}
if let Some(AddDeltaLayers { last_layer, count }) = &self.add_delta_layers {
result.last_layer = Some(last_layer.clone());
result.max_delta_layers.0 += count;
}
}
fn compose(later: &Self, earlier: &mut Self) {
if later.add_image_layer.is_some() {
*earlier = later.clone();
return;
}
if let Some(AddDeltaLayers { last_layer, count }) = &later.add_delta_layers {
let res = earlier.add_delta_layers.get_or_insert(AddDeltaLayers {
last_layer: LayerId::default(),
count: 0,
});
res.last_layer = last_layer.clone();
res.count += count;
}
}
}
impl LazyRangeInitializer<LayerMapInformation, PageIndex> for SameElementsInitializer<()> {
fn get(&self, range: &Range<PageIndex>) -> LayerMapInformation {
LayerMapInformation {
last_layer: None,
last_image_layer: None,
max_delta_layers: (0, range.clone()),
}
}
}
type Head = PersistentSegmentTree<LayerMapModification, SameElementsInitializer<()>, PageIndex>;
type Version = PersistentSegmentTreeVersion<LayerMapModification, SameElementsInitializer<()>, PageIndex>;
pub struct STLM {
head: Head,
historic: BTreeMap<u32, Version>,
}
/// Layer map (good enough for benchmarks) implemented using a persistent segment tree.
impl STLM {
pub fn new() -> Self {
STLM {
head: PersistentSegmentTree::new(
PageIndex(0)..PageIndex(100),
SameElementsInitializer::new(()),
),
historic: BTreeMap::default(),
}
}
pub fn insert(self: &mut Self, key_begin: u32, key_end: u32, lsn: u32, value: String) {
self.head.modify(
&(PageIndex(key_begin)..PageIndex(key_end)),
&LayerMapModification::add_image_layer(value),
);
self.historic.insert(lsn, self.head.freeze());
}
pub fn query(self: &Self, key: u32, lsn: u32) -> Option<String> {
let version = self.historic.range(0..=lsn).rev().next()?.1;
let info = version.get(&(PageIndex(key)..PageIndex(key + 1)));
info.last_image_layer.map(|s| s.clone())
}
}
#[test]
fn test_stlm() {
let mut stlm = STLM::new();
stlm.insert(0, 5, 100, "layer 1".to_string());
stlm.insert(3, 9, 110, "layer 2".to_string());
dbg!(stlm.query(1, 105));
dbg!(stlm.query(4, 105));
dbg!(stlm.query(4, 115));
}
fn test_layer_map<
S: PersistentVecStorage<LayerMapModification, SameElementsInitializer<()>, PageIndex>,
>() {
let mut s = S::new(
PageIndex(0)..PageIndex(100),
SameElementsInitializer::new(()),
);
s.modify(
&(PageIndex(0)..PageIndex(70)),
&LayerMapModification::add_image_layer("Img0..70"),
);
s.modify(
&(PageIndex(50)..PageIndex(100)),
&LayerMapModification::add_image_layer("Img50..100"),
);
s.modify(
&(PageIndex(10)..PageIndex(60)),
&LayerMapModification::add_delta_layer("Delta10..60"),
);
let s_before_last_delta = s.freeze();
s.modify(
&(PageIndex(20)..PageIndex(80)),
&LayerMapModification::add_delta_layer("Delta20..80"),
);
assert_eq!(
s.get(&(PageIndex(5)..PageIndex(6))).last_layers(),
(&Some("Img0..70".to_owned()), &Some("Img0..70".to_owned()))
);
assert_eq!(
s.get(&(PageIndex(15)..PageIndex(16))).last_layers(),
(
&Some("Delta10..60".to_owned()),
&Some("Img0..70".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(25)..PageIndex(26))).last_layers(),
(
&Some("Delta20..80".to_owned()),
&Some("Img0..70".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(65)..PageIndex(66))).last_layers(),
(
&Some("Delta20..80".to_owned()),
&Some("Img50..100".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(95)..PageIndex(96))).last_layers(),
(
&Some("Img50..100".to_owned()),
&Some("Img50..100".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(0)..PageIndex(100))).max_delta_layers(),
&(2, PageIndex(20)..PageIndex(60)),
);
assert_eq!(
*s_before_last_delta
.get(&(PageIndex(0)..PageIndex(100)))
.max_delta_layers(),
(1, PageIndex(10)..PageIndex(60)),
);
assert_eq!(
*s.get(&(PageIndex(10)..PageIndex(30))).max_delta_layers(),
(2, PageIndex(20)..PageIndex(30))
);
assert_eq!(
*s.get(&(PageIndex(10)..PageIndex(20))).max_delta_layers(),
(1, PageIndex(10)..PageIndex(20))
);
assert_eq!(
*s.get(&(PageIndex(70)..PageIndex(80))).max_delta_layers(),
(1, PageIndex(70)..PageIndex(80))
);
assert_eq!(
*s_before_last_delta
.get(&(PageIndex(70)..PageIndex(80)))
.max_delta_layers(),
(0, PageIndex(70)..PageIndex(80))
);
}
#[test]
fn test_naive() {
test_layer_map::<NaiveVecStorage<_, _, _>>();
}
#[test]
fn test_segment_tree() {
test_layer_map::<PersistentSegmentTree<_, _, _>>();
}

View File

@@ -2,6 +2,7 @@
use anyhow::{anyhow, bail, ensure, Context};
use bytes::Bytes;
use fail::fail_point;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pageserver_api::models::TimelineState;
@@ -18,8 +19,7 @@ use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::{Duration, Instant, SystemTime};
use crate::fail_point;
use crate::storage_sync::index::IndexPart;
use crate::storage_sync::index::{IndexPart, RelativePath};
use crate::storage_sync::RemoteTimelineClient;
use crate::tenant::{
delta_layer::{DeltaLayer, DeltaLayerWriter},
@@ -999,9 +999,55 @@ impl Timeline {
&self,
index_part: &IndexPart,
remote_client: &RemoteTimelineClient,
local_layers: HashSet<PathBuf>,
mut local_filenames: HashSet<PathBuf>,
up_to_date_disk_consistent_lsn: Lsn,
) -> anyhow::Result<HashSet<PathBuf>> {
let mut remote_filenames: HashSet<PathBuf> = HashSet::new();
for fname in index_part.timeline_layers.iter() {
remote_filenames.insert(fname.to_local_path(&PathBuf::from("")));
}
// Are there any local files that exist, with a size that doesn't match
// with the size stored in the remote index file?
// If so, rename_to_backup those files so that we re-download them later.
local_filenames.retain(|path| {
let layer_metadata = index_part
.layer_metadata
.get(&RelativePath::from_filename(path))
.map(LayerFileMetadata::from)
.unwrap_or(LayerFileMetadata::MISSING);
if let Some(remote_size) = layer_metadata.file_size() {
let local_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id).join(&path);
match local_path.metadata() {
Ok(metadata) => {
let local_size = metadata.len();
if local_size != remote_size {
warn!("removing local file \"{}\" because it has unexpected length {}; length in remote index is {}",
path.display(),
local_size,
remote_size);
if let Err(err) = rename_to_backup(&local_path) {
error!("could not rename file \"{}\": {:?}",
local_path.display(), err);
}
self.metrics.current_physical_size_gauge.sub(local_size);
false
} else {
true
}
}
Err(err) => {
error!("could not get size of local file \"{}\": {:?}", path.display(), err);
true
}
}
} else {
true
}
});
// Are we missing some files that are present in remote storage?
// Download them now.
// TODO Downloading many files this way is not efficient.
@@ -1010,63 +1056,17 @@ impl Timeline {
// b) typical case now is that there is nothing to sync, this downloads a lot
// 1) if there was another pageserver that came and generated new files
// 2) during attach of a timeline with big history which we currently do not do
let mut local_only_layers = local_layers;
let timeline_dir = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
for remote_layer_name in &index_part.timeline_layers {
let local_layer_path = timeline_dir.join(remote_layer_name);
local_only_layers.remove(&local_layer_path);
for path in remote_filenames.difference(&local_filenames) {
let fname = path.to_str().unwrap();
info!("remote layer file {fname} does not exist locally");
let remote_layer_metadata = index_part
let layer_metadata = index_part
.layer_metadata
.get(remote_layer_name)
.get(&RelativePath::from_filename(path))
.map(LayerFileMetadata::from)
.unwrap_or(LayerFileMetadata::MISSING);
let remote_layer_path = self
.conf
.remote_path(&local_layer_path)
.expect("local_layer_path received from the same conf that provided a workdir");
if local_layer_path.exists() {
let mut already_downloaded = true;
// Are there any local files that exist, with a size that doesn't match
// with the size stored in the remote index file?
// If so, rename_to_backup those files so that we re-download them later.
if let Some(remote_size) = remote_layer_metadata.file_size() {
match local_layer_path.metadata() {
Ok(metadata) => {
let local_size = metadata.len();
if local_size != remote_size {
warn!("removing local file {local_layer_path:?} because it has unexpected length {local_size}; length in remote index is {remote_size}");
if let Err(err) = rename_to_backup(&local_layer_path) {
error!("could not rename file {local_layer_path:?}: {err:?}");
} else {
self.metrics.current_physical_size_gauge.sub(local_size);
already_downloaded = false;
}
}
}
Err(err) => {
error!("could not get size of local file {local_layer_path:?}: {err:?}")
}
}
}
if already_downloaded {
continue;
}
} else {
info!("remote layer {remote_layer_path:?} does not exist locally");
}
let layer_name = local_layer_path
.file_name()
.and_then(|os_str| os_str.to_str())
.with_context(|| {
format!("Layer file {local_layer_path:?} has no name in unicode")
})?;
if let Some(imgfilename) = ImageFileName::parse_str(layer_name) {
if let Some(imgfilename) = ImageFileName::parse_str(fname) {
if imgfilename.lsn > up_to_date_disk_consistent_lsn {
warn!(
"found future image layer {} on timeline {} remote_consistent_lsn is {}",
@@ -1075,13 +1075,11 @@ impl Timeline {
continue;
}
trace!("downloading image file: {remote_layer_path:?}");
let downloaded_size = remote_client
.download_layer_file(&remote_layer_path, &remote_layer_metadata)
trace!("downloading image file: {}", file = path.display());
let sz = remote_client
.download_layer_file(&RelativePath::from_filename(path), &layer_metadata)
.await
.with_context(|| {
format!("failed to download image layer from path {remote_layer_path:?}")
})?;
.context("download image layer")?;
trace!("done");
let image_layer =
@@ -1091,10 +1089,8 @@ impl Timeline {
.write()
.unwrap()
.insert_historic(Arc::new(image_layer));
self.metrics
.current_physical_size_gauge
.add(downloaded_size);
} else if let Some(deltafilename) = DeltaFileName::parse_str(layer_name) {
self.metrics.current_physical_size_gauge.add(sz);
} else if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
// Create a DeltaLayer struct for each delta file.
// The end-LSN is exclusive, while disk_consistent_lsn is
// inclusive. For example, if disk_consistent_lsn is 100, it is
@@ -1109,13 +1105,11 @@ impl Timeline {
continue;
}
trace!("downloading delta file: {remote_layer_path:?}");
trace!("downloading image file: {}", file = path.display());
let sz = remote_client
.download_layer_file(&remote_layer_path, &remote_layer_metadata)
.download_layer_file(&RelativePath::from_filename(path), &layer_metadata)
.await
.with_context(|| {
format!("failed to download delta layer from path {remote_layer_path:?}")
})?;
.context("download delta layer")?;
trace!("done");
let delta_layer =
@@ -1127,11 +1121,16 @@ impl Timeline {
.insert_historic(Arc::new(delta_layer));
self.metrics.current_physical_size_gauge.add(sz);
} else {
bail!("unexpected layer filename {layer_name} in remote storage path: {remote_layer_path:?}");
bail!("unexpected layer filename in remote storage: {}", fname);
}
}
Ok(local_only_layers)
// now these are local-only filenames
let local_only_filenames = local_filenames
.difference(&remote_filenames)
.cloned()
.collect();
Ok(local_only_filenames)
}
///
@@ -1165,46 +1164,47 @@ impl Timeline {
let disk_consistent_lsn = up_to_date_metadata.disk_consistent_lsn();
// Build a map of local layers for quick lookups
let local_layers = self
.layers
.read()
.unwrap()
.iter_historic_layers()
.map(|historic_layer| {
historic_layer
.local_path()
.expect("Historic layers should have a path")
})
.collect::<HashSet<_>>();
let mut local_filenames: HashSet<PathBuf> = HashSet::new();
for layer in self.layers.read().unwrap().iter_historic_layers() {
local_filenames.insert(layer.filename());
}
let local_only_layers = match index_part {
let local_only_filenames = match index_part {
Some(index_part) => {
info!(
"initializing upload queue from remote index with {} layer files",
index_part.timeline_layers.len()
);
remote_client.init_upload_queue(index_part)?;
self.download_missing(index_part, remote_client, local_layers, disk_consistent_lsn)
.await?
let local_only_filenames = self
.download_missing(
index_part,
remote_client,
local_filenames,
disk_consistent_lsn,
)
.await?;
local_only_filenames
}
None => {
info!("initializing upload queue as empty");
remote_client.init_upload_queue_for_empty_remote(up_to_date_metadata)?;
local_layers
local_filenames
}
};
// Are there local files that don't exist remotely? Schedule uploads for them
for layer_path in &local_only_layers {
let layer_size = layer_path
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
for fname in &local_only_filenames {
let absolute = timeline_path.join(fname);
let sz = absolute
.metadata()
.with_context(|| format!("failed to get file {layer_path:?} metadata"))?
.with_context(|| format!("failed to get file {} metadata", fname.display()))?
.len();
info!("scheduling {layer_path:?} for upload");
remote_client
.schedule_layer_file_upload(layer_path, &LayerFileMetadata::new(layer_size))?;
info!("scheduling {} for upload", fname.display());
remote_client.schedule_layer_file_upload(&absolute, &LayerFileMetadata::new(sz))?;
}
if !local_only_layers.is_empty() {
if !local_only_filenames.is_empty() {
remote_client.schedule_index_upload(up_to_date_metadata)?;
}
@@ -2642,22 +2642,24 @@ impl Timeline {
data.records.len()
);
} else {
if data.img.is_some() {
let base_img = if let Some((_lsn, img)) = data.img {
trace!(
"found {} WAL records and a base image for {} at {}, performing WAL redo",
data.records.len(),
key,
request_lsn
);
Some(img)
} else {
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
None
};
let last_rec_lsn = data.records.last().unwrap().0;
let img = self
.walredo_mgr
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
.request_redo(key, request_lsn, base_img, data.records, self.pg_version)
.context("Failed to reconstruct a page image:")?;
if img.len() == page_cache::PAGE_SZ {

View File

@@ -185,9 +185,6 @@ impl TenantConfOpt {
if let Some(max_lsn_wal_lag) = other.max_lsn_wal_lag {
self.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = other.trace_read_requests {
self.trace_read_requests = Some(trace_read_requests);
}
}
}

View File

@@ -1,62 +1,75 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server.
use std::collections::{hash_map, HashMap};
use std::collections::hash_map;
use std::ffi::OsStr;
use std::fs;
use std::path::Path;
use std::sync::Arc;
use tokio::fs;
use anyhow::Context;
use once_cell::sync::Lazy;
use tokio::sync::RwLock;
use tracing::*;
use pageserver_api::models::TimelineGcRequest;
use remote_storage::GenericRemoteStorage;
use utils::crashsafe;
use crate::config::PageServerConf;
use crate::repository::GcResult;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::{Tenant, TenantState};
use crate::tenant_config::TenantConfOpt;
use crate::IGNORED_TENANT_FILE_NAME;
use utils::fs_ext::PathExt;
use utils::http::error::ApiError;
use utils::id::{TenantId, TimelineId};
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
mod tenants_state {
use once_cell::sync::Lazy;
use std::{
collections::HashMap,
sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard},
};
use utils::id::TenantId;
use crate::tenant::Tenant;
static TENANTS: Lazy<RwLock<HashMap<TenantId, Arc<Tenant>>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
TENANTS
.read()
.expect("Failed to read() tenants lock, it got poisoned")
}
pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap<TenantId, Arc<Tenant>>> {
TENANTS
.write()
.expect("Failed to write() tenants lock, it got poisoned")
}
}
/// Initialize repositories with locally available timelines.
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
/// are scheduled for download and added to the tenant once download is completed.
#[instrument(skip(conf, remote_storage))]
pub async fn init_tenant_mgr(
pub fn init_tenant_mgr(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<()> {
let _entered = info_span!("init_tenant_mgr").entered();
// Scan local filesystem for attached tenants
let mut number_of_tenants = 0;
let tenants_dir = conf.tenants_path();
let mut dir_entries = fs::read_dir(&tenants_dir)
.await
.with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?;
loop {
match dir_entries.next_entry().await {
Ok(None) => break,
Ok(Some(dir_entry)) => {
for dir_entry in std::fs::read_dir(&tenants_dir)
.with_context(|| format!("Failed to list tenants dir {}", tenants_dir.display()))?
{
match &dir_entry {
Ok(dir_entry) => {
let tenant_dir_path = dir_entry.path();
if crate::is_temporary(&tenant_dir_path) {
info!(
"Found temporary tenant directory, removing: {}",
tenant_dir_path.display()
);
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) {
error!(
"Failed to remove temporary directory '{}': {:?}",
tenant_dir_path.display(),
@@ -64,38 +77,27 @@ pub async fn init_tenant_mgr(
);
}
} else {
// This case happens if we crash during attach before creating the attach marker file
let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
})?;
if is_empty {
info!("removing empty tenant directory {tenant_dir_path:?}");
if let Err(e) = fs::remove_dir(&tenant_dir_path).await {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path.display()
)
}
continue;
}
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
continue;
}
match schedule_local_tenant_processing(
conf,
&tenant_dir_path,
remote_storage.clone(),
) {
Ok(tenant) => {
TENANTS.write().await.insert(tenant.tenant_id(), tenant);
match load_local_tenant(conf, &tenant_dir_path, remote_storage.clone()) {
Ok(Some(tenant)) => {
tenants_state::write_tenants().insert(tenant.tenant_id(), tenant);
number_of_tenants += 1;
}
Ok(None) => {
// This case happens if we crash during attach before creating the attach marker file
if let Err(e) = std::fs::remove_dir(&tenant_dir_path) {
error!(
"Failed to remove empty tenant directory '{}': {e:#}",
tenant_dir_path.display()
)
}
}
Err(e) => {
error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}");
error!(
"Failed to collect tenant files from dir '{}' for entry {:?}, reason: {:#}",
tenants_dir.display(),
dir_entry,
e
);
}
}
}
@@ -105,7 +107,10 @@ pub async fn init_tenant_mgr(
// here, the pageserver startup fails altogether, causing outage for *all*
// tenants. That seems worse.
error!(
"Failed to list tenants dir entry in directory {tenants_dir:?}, reason: {e:?}"
"Failed to list tenants dir entry {:?} in directory {}, reason: {:?}",
dir_entry,
tenants_dir.display(),
e,
);
}
}
@@ -115,45 +120,34 @@ pub async fn init_tenant_mgr(
Ok(())
}
pub fn schedule_local_tenant_processing(
fn load_local_tenant(
conf: &'static PageServerConf,
tenant_path: &Path,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<Arc<Tenant>> {
anyhow::ensure!(
tenant_path.is_dir(),
"Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory"
);
anyhow::ensure!(
!crate::is_temporary(tenant_path),
"Cannot load tenant from temporary path {tenant_path:?}"
);
anyhow::ensure!(
!tenant_path.is_empty_dir().with_context(|| {
format!("Failed to check whether {tenant_path:?} is an empty dir")
})?,
"Cannot load tenant from empty directory {tenant_path:?}"
);
) -> anyhow::Result<Option<Arc<Tenant>>> {
if !tenant_path.is_dir() {
anyhow::bail!("tenant_path is not a directory: {tenant_path:?}")
}
let is_empty = tenant_path
.is_empty_dir()
.context("check whether tenant_path is an empty dir")?;
if is_empty {
info!("skipping empty tenant directory {tenant_path:?}");
return Ok(None);
}
let tenant_id = tenant_path
.file_name()
.and_then(OsStr::to_str)
.unwrap_or_default()
.parse::<TenantId>()
.with_context(|| {
format!("Could not parse tenant id out of the tenant dir name in path {tenant_path:?}")
})?;
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
anyhow::ensure!(
!conf.tenant_ignore_mark_file_path(tenant_id).exists(),
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
);
.context("Could not parse tenant id out of the tenant dir name")?;
let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
if let Some(remote_storage) = remote_storage {
Tenant::spawn_attach(conf, tenant_id, remote_storage)
Tenant::spawn_attach(conf, tenant_id, &remote_storage)
} else {
warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
Tenant::create_broken_tenant(conf, tenant_id)
@@ -163,7 +157,7 @@ pub fn schedule_local_tenant_processing(
// Start loading the tenant into memory. It will initially be in Loading state.
Tenant::spawn_load(conf, tenant_id, remote_storage)
};
Ok(tenant)
Ok(Some(tenant))
}
///
@@ -171,7 +165,7 @@ pub fn schedule_local_tenant_processing(
///
pub async fn shutdown_all_tenants() {
let tenants_to_shut_down = {
let mut m = TENANTS.write().await;
let mut m = tenants_state::write_tenants();
let mut tenants_to_shut_down = Vec::with_capacity(m.len());
for (_, tenant) in m.drain() {
if tenant.is_active() {
@@ -205,13 +199,13 @@ pub async fn shutdown_all_tenants() {
}
}
pub async fn create_tenant(
pub fn create_tenant(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<Option<Arc<Tenant>>> {
match TENANTS.write().await.entry(tenant_id) {
match tenants_state::write_tenants().entry(tenant_id) {
hash_map::Entry::Occupied(_) => {
debug!("tenant {tenant_id} already exists");
Ok(None)
@@ -221,36 +215,44 @@ pub async fn create_tenant(
// If this section ever becomes contentious, introduce a new `TenantState::Creating`.
let tenant_directory =
super::tenant::create_tenant_files(conf, tenant_conf, tenant_id)?;
let created_tenant =
schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?;
let crated_tenant_id = created_tenant.tenant_id();
anyhow::ensure!(
tenant_id == crated_tenant_id,
"loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})",
);
v.insert(Arc::clone(&created_tenant));
Ok(Some(created_tenant))
let created_tenant = load_local_tenant(conf, &tenant_directory, remote_storage)?;
match created_tenant {
None => {
// We get None in case the directory is empty.
// This shouldn't happen here, because we just created the directory.
// So, skip any cleanup work for now; we don't know how we reached this state.
anyhow::bail!("we just created the tenant directory, it can't be empty");
}
Some(tenant) => {
anyhow::ensure!(
tenant_id == tenant.tenant_id(),
"loaded created tenant has unexpected tenant id (expect {} != actual {})",
tenant_id,
tenant.tenant_id()
);
v.insert(Arc::clone(&tenant));
Ok(Some(tenant))
}
}
}
}
}
pub async fn update_tenant_config(
pub fn update_tenant_config(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: TenantId,
) -> anyhow::Result<()> {
info!("configuring tenant {tenant_id}");
get_tenant(tenant_id, true)
.await?
.update_tenant_config(tenant_conf);
get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf);
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
Ok(())
}
/// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query.
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
let m = TENANTS.read().await;
pub fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Result<Arc<Tenant>> {
let m = tenants_state::read_tenants();
let tenant = m
.get(&tenant_id)
.with_context(|| format!("Tenant {tenant_id} not found in the local state"))?;
@@ -286,7 +288,7 @@ pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> an
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(None, Some(tenant_id), Some(timeline_id)).await;
info!("timeline task shutdown completed");
match get_tenant(tenant_id, true).await {
match get_tenant(tenant_id, true) {
Ok(tenant) => {
tenant.delete_timeline(timeline_id).await?;
}
@@ -300,67 +302,40 @@ pub async fn detach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<()> {
remove_tenant_from_memory(tenant_id, async {
let local_tenant_directory = conf.tenant_path(&tenant_id);
fs::remove_dir_all(&local_tenant_directory)
.await
.with_context(|| {
format!("Failed to remove local tenant directory {local_tenant_directory:?}")
})?;
Ok(())
})
.await
}
let tenant = match {
let mut tenants_accessor = tenants_state::write_tenants();
tenants_accessor.remove(&tenant_id)
} {
Some(tenant) => tenant,
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
};
pub async fn load_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: Option<GenericRemoteStorage>,
) -> anyhow::Result<()> {
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
let tenant_path = conf.tenant_path(&tenant_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id);
if tenant_ignore_mark.exists() {
std::fs::remove_file(&tenant_ignore_mark)
.with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
}
tenant.set_stopping();
// shutdown all tenant and timeline tasks (gc, compaction, page service)
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage)
.with_context(|| {
format!("Failed to schedule tenant processing in path {tenant_path:?}")
})?;
// If removal fails there will be no way to successfully retry detach,
// because the tenant no longer exists in the in-memory map. And it needs to be removed from it
// before we remove files, because it contains references to tenant
// which references ephemeral files which are deleted on drop. So if we keep these references,
// we will attempt to remove files which no longer exist. This can be fixed by having shutdown
// mechanism for tenant that will clean temporary data to avoid any references to ephemeral files
let local_tenant_directory = conf.tenant_path(&tenant_id);
fs::remove_dir_all(&local_tenant_directory).with_context(|| {
format!(
"Failed to remove local tenant directory '{}'",
local_tenant_directory.display()
)
})?;
vacant_entry.insert(new_tenant);
Ok(())
}).await
}
pub async fn ignore_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> anyhow::Result<()> {
remove_tenant_from_memory(tenant_id, async {
let ignore_mark_file = conf.tenant_ignore_mark_file_path(tenant_id);
fs::File::create(&ignore_mark_file)
.await
.context("Failed to create ignore mark file")
.and_then(|_| {
crashsafe::fsync_file_and_parent(&ignore_mark_file)
.context("Failed to fsync ignore mark file")
})
.with_context(|| format!("Failed to crate ignore mark for tenant {tenant_id}"))?;
Ok(())
})
.await
Ok(())
}
///
/// Get list of tenants, for the mgmt API
///
pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
TENANTS
.read()
.await
pub fn list_tenants() -> Vec<(TenantId, TenantState)> {
tenants_state::read_tenants()
.iter()
.map(|(id, tenant)| (*id, tenant.current_state()))
.collect()
@@ -373,102 +348,42 @@ pub async fn list_tenants() -> Vec<(TenantId, TenantState)> {
pub async fn attach_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
remote_storage: GenericRemoteStorage,
remote_storage: &GenericRemoteStorage,
) -> anyhow::Result<()> {
run_if_no_tenant_in_memory(tenant_id, |vacant_entry| {
let tenant_path = conf.tenant_path(&tenant_id);
anyhow::ensure!(
!tenant_path.exists(),
"Cannot attach tenant {tenant_id}, local tenant directory already exists"
);
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
vacant_entry.insert(tenant);
Ok(())
})
.await
}
async fn run_if_no_tenant_in_memory<F, V>(tenant_id: TenantId, run: F) -> anyhow::Result<V>
where
F: FnOnce(hash_map::VacantEntry<TenantId, Arc<Tenant>>) -> anyhow::Result<V>,
{
match TENANTS.write().await.entry(tenant_id) {
match tenants_state::write_tenants().entry(tenant_id) {
hash_map::Entry::Occupied(e) => {
anyhow::bail!(
"tenant {tenant_id} already exists, state: {:?}",
e.get().current_state()
)
}
hash_map::Entry::Vacant(v) => run(v),
}
}
/// Stops and removes the tenant from memory, if it's not [`TenantState::Stopping`] already, bails otherwise.
/// Allows to remove other tenant resources manually, via `tenant_cleanup`.
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
/// operation would be needed to remove it.
async fn remove_tenant_from_memory<V, F>(
tenant_id: TenantId,
tenant_cleanup: F,
) -> anyhow::Result<V>
where
F: std::future::Future<Output = anyhow::Result<V>>,
{
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
// tenant-wide cleanup operations may take some time (removing the entire tenant directory), so we want to
// avoid holding the lock for the entire process.
{
let tenants_accessor = TENANTS.write().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => match tenant.current_state() {
TenantState::Attaching
| TenantState::Loading
| TenantState::Broken
| TenantState::Active => tenant.set_stopping(),
TenantState::Stopping => {
anyhow::bail!("Tenant {tenant_id} is stopping already")
// Cannot attach a tenant that already exists. The error message depends on
// the state it's in.
match e.get().current_state() {
TenantState::Attaching => {
anyhow::bail!("tenant {tenant_id} attach is already in progress")
}
current_state => {
anyhow::bail!("tenant already exists, current state: {current_state:?}")
}
},
None => anyhow::bail!("Tenant not found for id {tenant_id}"),
}
}
// shutdown all tenant and timeline tasks (gc, compaction, page service)
// No new tasks will be started for this tenant because it's in `Stopping` state.
// Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely.
task_mgr::shutdown_tasks(None, Some(tenant_id), None).await;
match tenant_cleanup
.await
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
{
Ok(hook_value) => {
let mut tenants_accessor = TENANTS.write().await;
if tenants_accessor.remove(&tenant_id).is_none() {
warn!("Tenant {tenant_id} got removed from memory before operation finished");
}
Ok(hook_value)
}
Err(e) => {
let tenants_accessor = TENANTS.read().await;
match tenants_accessor.get(&tenant_id) {
Some(tenant) => tenant.set_broken(),
None => warn!("Tenant {tenant_id} got removed from memory"),
}
Err(e)
hash_map::Entry::Vacant(v) => {
let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage);
v.insert(tenant);
Ok(())
}
}
}
pub async fn immediate_gc(
#[cfg(feature = "testing")]
use {
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
utils::http::error::ApiError,
};
#[cfg(feature = "testing")]
pub fn immediate_gc(
tenant_id: TenantId,
timeline_id: TimelineId,
gc_req: TimelineGcRequest,
) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
let guard = TENANTS.read().await;
let guard = tenants_state::read_tenants();
let tenant = guard
.get(&tenant_id)
@@ -490,7 +405,7 @@ pub async fn immediate_gc(
&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
false,
async move {
crate::fail_point!("immediate_gc_task_pre");
fail::fail_point!("immediate_gc_task_pre");
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))

View File

@@ -155,7 +155,7 @@ async fn wait_for_active_tenant(
wait: Duration,
) -> ControlFlow<(), Arc<Tenant>> {
let tenant = loop {
match tenant_mgr::get_tenant(tenant_id, false).await {
match tenant_mgr::get_tenant(tenant_id, false) {
Ok(tenant) => break tenant,
Err(e) => {
error!("Failed to get a tenant {tenant_id}: {e:#}");

View File

@@ -9,6 +9,7 @@ use std::{
use anyhow::{bail, ensure, Context};
use bytes::BytesMut;
use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres::{SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::v14::xlog_utils::normalize_lsn;
@@ -19,7 +20,6 @@ use tokio::{pin, select, sync::watch, time};
use tokio_postgres::{replication::ReplicationStream, Client};
use tracing::{debug, error, info, trace, warn};
use crate::fail_point;
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
use crate::{
task_mgr,

View File

@@ -84,7 +84,7 @@ pub trait WalRedoManager: Send + Sync {
&self,
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
base_img: Option<Bytes>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
) -> Result<Bytes, WalRedoError>;
@@ -147,7 +147,7 @@ impl WalRedoManager for PostgresRedoManager {
&self,
key: Key,
lsn: Lsn,
base_img: Option<(Lsn, Bytes)>,
base_img: Option<Bytes>,
records: Vec<(Lsn, NeonWalRecord)>,
pg_version: u32,
) -> Result<Bytes, WalRedoError> {
@@ -156,8 +156,7 @@ impl WalRedoManager for PostgresRedoManager {
return Err(WalRedoError::InvalidRequest);
}
let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
let mut img = base_img.map(|p| p.1);
let mut img: Option<Bytes> = base_img;
let mut batch_neon = can_apply_in_neon(&records[0].1);
let mut batch_start = 0;
for i in 1..records.len() {
@@ -171,7 +170,6 @@ impl WalRedoManager for PostgresRedoManager {
key,
lsn,
img,
base_img_lsn,
&records[batch_start..i],
self.conf.wal_redo_timeout,
pg_version,
@@ -191,7 +189,6 @@ impl WalRedoManager for PostgresRedoManager {
key,
lsn,
img,
base_img_lsn,
&records[batch_start..],
self.conf.wal_redo_timeout,
pg_version,
@@ -226,13 +223,11 @@ impl PostgresRedoManager {
///
/// Process one request for WAL redo using wal-redo postgres
///
#[allow(clippy::too_many_arguments)]
fn apply_batch_postgres(
&self,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
base_img_lsn: Lsn,
records: &[(Lsn, NeonWalRecord)],
wal_redo_timeout: Duration,
pg_version: u32,
@@ -287,12 +282,9 @@ impl PostgresRedoManager {
// next request will launch a new one.
if result.is_err() {
error!(
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
"error applying {} WAL records ({} bytes) to reconstruct page image at LSN {}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
nbytes,
base_img_lsn,
lsn
);
let process = process_guard.take().unwrap();
@@ -930,7 +922,8 @@ impl NoLeakChild {
match child.wait() {
Ok(exit_status) => {
info!(exit_status = %exit_status, "wait successful");
// log at error level since .kill() is something we only do on errors ATM
error!(exit_status = %exit_status, "wait successful");
}
Err(e) => {
error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");

View File

@@ -464,12 +464,12 @@ pg_init_libpagestore(void)
NULL, NULL, NULL);
DefineCustomIntVariable("neon.readahead_buffer_size",
"number of prefetches to buffer",
"This buffer is used to hold and manage prefetched "
"data; so it is important that this buffer is at "
"least as large as the configured value of all "
"tablespaces' effective_io_concurrency and "
"maintenance_io_concurrency, and your sessions' "
"values for these settings.",
"This buffer is used to store prefetched data; so "
"it is important that this buffer is at least as "
"large as the configured value of all tablespaces' "
"effective_io_concurrency and maintenance_io_concurrency, "
"your sessions' values of these, and the value for "
"seqscan_prefetch_buffers.",
&readahead_buffer_size,
128, 16, 1024,
PGC_USERSET,

View File

@@ -242,14 +242,6 @@ PrefetchState *MyPState;
) \
)
#define ReceiveBufferNeedsCompaction() (\
(MyPState->n_responses_buffered / 8) < ( \
MyPState->ring_receive - \
MyPState->ring_last - \
MyPState->n_responses_buffered \
) \
)
int n_prefetch_hits = 0;
int n_prefetch_misses = 0;
int n_prefetch_missed_caches = 0;
@@ -257,99 +249,17 @@ int n_prefetch_dupes = 0;
XLogRecPtr prefetch_lsn = 0;
static bool compact_prefetch_buffers(void);
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
static bool prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
static bool prefetch_wait_for(uint64 ring_index);
static void prefetch_cleanup_trailing_unused(void);
static void prefetch_cleanup(void);
static inline void prefetch_set_unused(uint64 ring_index);
static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
ForkNumber forknum, BlockNumber blkno);
static bool
compact_prefetch_buffers(void)
{
uint64 empty_ring_index = MyPState->ring_last;
uint64 search_ring_index = MyPState->ring_receive;
int n_moved = 0;
if (MyPState->ring_receive == MyPState->ring_last)
return false;
while (search_ring_index > MyPState->ring_last)
{
search_ring_index--;
if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED)
{
empty_ring_index = search_ring_index;
break;
}
}
/*
* Here we have established:
* slots < search_ring_index may be unused (not scanned)
* slots >= search_ring_index and <= empty_ring_index are unused
* slots > empty_ring_index are in use, or outside our buffer's range.
*
* Therefore, there is a gap of at least one unused item between
* search_ring_index and empty_ring_index, which grows as we hit
* more unused items while moving backwards through the array.
*/
while (search_ring_index > MyPState->ring_last)
{
PrefetchRequest *source_slot;
PrefetchRequest *target_slot;
bool found;
search_ring_index--;
source_slot = GetPrfSlot(search_ring_index);
if (source_slot->status == PRFS_UNUSED)
continue;
target_slot = GetPrfSlot(empty_ring_index);
Assert(source_slot->status == PRFS_RECEIVED);
Assert(target_slot->status == PRFS_UNUSED);
target_slot->buftag = source_slot->buftag;
target_slot->status = source_slot->status;
target_slot->response = source_slot->response;
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
target_slot->my_ring_index = empty_ring_index;
prfh_delete(MyPState->prf_hash, source_slot);
prfh_insert(MyPState->prf_hash, target_slot, &found);
Assert(!found);
/* Adjust the location of our known-empty slot */
empty_ring_index--;
source_slot->status = PRFS_UNUSED;
source_slot->buftag = (BufferTag) {0};
source_slot->response = NULL;
source_slot->my_ring_index = 0;
source_slot->effective_request_lsn = 0;
n_moved++;
}
if (MyPState->ring_last != empty_ring_index)
{
MyPState->ring_last = empty_ring_index;
return true;
}
return false;
}
void
readahead_buffer_resize(int newsize, void *extra)
{
@@ -413,7 +323,7 @@ readahead_buffer_resize(int newsize, void *extra)
prfh_insert(newPState->prf_hash, newslot, &found);
Assert(!found);
switch (newslot->status)
{
case PRFS_UNUSED:
@@ -460,7 +370,7 @@ consume_prefetch_responses(void)
}
static void
prefetch_cleanup_trailing_unused(void)
prefetch_cleanup(void)
{
uint64 ring_index;
PrefetchRequest *slot;
@@ -621,10 +531,7 @@ prefetch_set_unused(uint64 ring_index)
/* run cleanup if we're holding back ring_last */
if (MyPState->ring_last == ring_index)
prefetch_cleanup_trailing_unused();
/* ... and try to store the buffered responses more compactly if > 12.5% of the buffer is gaps */
else if (ReceiveBufferNeedsCompaction())
compact_prefetch_buffers();
prefetch_cleanup();
}
static void
@@ -795,31 +702,20 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
Assert(slot->status != PRFS_UNUSED);
/*
* If there is good reason to run compaction on the prefetch buffers,
* try to do that.
*/
if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
/* We have the slot for ring_last, so that must still be in progress */
switch (slot->status)
{
Assert(slot->status == PRFS_UNUSED);
}
else
{
/* We have the slot for ring_last, so that must still be in progress */
switch (slot->status)
{
case PRFS_REQUESTED:
Assert(MyPState->ring_receive == cleanup_index);
prefetch_wait_for(cleanup_index);
prefetch_set_unused(cleanup_index);
break;
case PRFS_RECEIVED:
case PRFS_TAG_REMAINS:
prefetch_set_unused(cleanup_index);
break;
default:
pg_unreachable();
}
case PRFS_REQUESTED:
Assert(MyPState->ring_receive == cleanup_index);
prefetch_wait_for(cleanup_index);
prefetch_set_unused(cleanup_index);
break;
case PRFS_RECEIVED:
case PRFS_TAG_REMAINS:
prefetch_set_unused(cleanup_index);
break;
default:
pg_unreachable();
}
}
@@ -1206,7 +1102,7 @@ PageIsEmptyHeapPage(char *buffer)
}
static void
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer)
{
XLogRecPtr lsn = PageGetLSN(buffer);
@@ -1220,7 +1116,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
* correctness, the non-logged updates are not critical. But we want to
* have a reasonably up-to-date VM and FSM in the page server.
*/
if ((force || forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM) && !RecoveryInProgress())
if (forknum == FSM_FORKNUM && !RecoveryInProgress())
{
/* FSM is never WAL-logged and we don't care. */
XLogRecPtr recptr;
@@ -1229,7 +1125,30 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
XLogFlush(recptr);
lsn = recptr;
ereport(SmgrTrace,
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
(errmsg("FSM page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
blocknum,
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode,
forknum, LSN_FORMAT_ARGS(lsn))));
}
else if (forknum == VISIBILITYMAP_FORKNUM && !RecoveryInProgress())
{
/*
* Always WAL-log vm. We should never miss clearing visibility map
* bits.
*
* TODO Is it too bad for performance? Hopefully we do not evict
* actively used vm too often.
*/
XLogRecPtr recptr;
recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
XLogFlush(recptr);
lsn = recptr;
ereport(SmgrTrace,
(errmsg("Visibilitymap page %u of relation %u/%u/%u.%u was force logged at lsn=%X/%X",
blocknum,
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
@@ -1624,7 +1543,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
char *buffer, bool skipFsync)
{
XLogRecPtr lsn;
BlockNumber n_blocks = 0;
switch (reln->smgr_relpersistence)
{
@@ -1664,16 +1582,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
errhint("This limit is defined by neon.max_cluster_size GUC")));
}
/*
* Usually Postgres doesn't extend a relation by more than one page
* (leaving holes). But this rule is violated in PG-15, where
* CreateAndCopyRelationData calls smgrextend for the destination relation using the size of the source relation
*/
get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks);
while (n_blocks < blkno)
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
neon_wallog_page(reln, forkNum, blkno, buffer, false);
neon_wallog_page(reln, forkNum, blkno, buffer);
set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);
lsn = PageGetLSN(buffer);
@@ -1920,7 +1829,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
/* buffer was used, clean up for later reuse */
prefetch_set_unused(ring_index);
prefetch_cleanup_trailing_unused();
prefetch_cleanup();
}
/*
@@ -2101,7 +2010,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
neon_wallog_page(reln, forknum, blocknum, buffer, false);
neon_wallog_page(reln, forknum, blocknum, buffer);
lsn = PageGetLSN(buffer);
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",

View File

@@ -28,7 +28,6 @@ use std::{borrow::Cow, future::Future, net::SocketAddr};
use tokio::{net::TcpListener, task::JoinError};
use tracing::info;
use utils::project_git_version;
use utils::sentry_init::{init_sentry, release_name};
project_git_version!(GIT_VERSION);
@@ -46,9 +45,6 @@ async fn main() -> anyhow::Result<()> {
.with_target(false)
.init();
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(release_name!(), &[]);
let arg_matches = cli().get_matches();
let tls_config = match (

View File

@@ -13,7 +13,7 @@
# avoid running regular linting script that checks every feature.
if [[ "$OSTYPE" == "darwin"* ]]; then
# no extra features to test currently, add more here when needed
cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings
cargo clippy --locked --all --all-targets --features testing -- -A unknown_lints -D warnings
else
# * `-A unknown_lints` do not warn about unknown lint suppressions
# that people with newer toolchains might use

View File

@@ -4,6 +4,7 @@
use anyhow::{bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, Command};
use const_format::formatcp;
use nix::unistd::Pid;
use remote_storage::RemoteStorageConfig;
use std::fs::{self, File};
use std::io::{ErrorKind, Write};
@@ -14,7 +15,7 @@ use tokio::sync::mpsc;
use toml_edit::Document;
use tracing::*;
use url::{ParseError, Url};
use utils::pid_file;
use utils::lock_file;
use metrics::set_build_info_metric;
use safekeeper::broker;
@@ -34,14 +35,11 @@ use utils::{
http::endpoint,
id::NodeId,
logging::{self, LogFormat},
project_git_version,
sentry_init::{init_sentry, release_name},
signals, tcp_listener,
project_git_version, signals, tcp_listener,
};
const PID_FILE_NAME: &str = "safekeeper.pid";
const ID_FILE_NAME: &str = "safekeeper.id";
project_git_version!(GIT_VERSION);
fn main() -> anyhow::Result<()> {
@@ -135,8 +133,6 @@ fn main() -> anyhow::Result<()> {
conf.log_format = LogFormat::from_config(log_format)?;
}
// initialize sentry if SENTRY_DSN is provided
let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]);
start_safekeeper(conf, given_id, arg_matches.get_flag("init"))
}
@@ -146,13 +142,28 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
// Prevent running multiple safekeepers on the same directory
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file =
pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
info!("Claimed pid file at {lock_file_path:?}");
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {
new_lock_contents,
file,
} => {
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
file
}
lock_file::LockCreationResult::AlreadyLocked {
existing_lock_contents,
} => anyhow::bail!(
"Could not lock pid file; safekeeper is already running in {:?} with PID {}",
conf.workdir,
existing_lock_contents
),
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
}
};
// ensure that the lock file is held even if the main thread of the process panics
// we need to release the lock file only when the current process is gone
std::mem::forget(lock_file);
let _ = Box::leak(Box::new(lock_file));
// Set or read our ID.
set_id(&mut conf, given_id)?;

View File

@@ -226,7 +226,6 @@ impl ReplicationConn {
let mut end_pos = stop_pos.unwrap_or(inmem_state.commit_lsn);
let mut wal_reader = WalReader::new(
spg.conf.workdir.clone(),
spg.conf.timeline_dir(&tli.ttid),
&persisted_state,
start_pos,

View File

@@ -13,7 +13,7 @@ use std::time::Duration;
use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr;
use postgres_ffi::XLogFileName;
use postgres_ffi::{XLogSegNo, PG_TLI};
use remote_storage::{GenericRemoteStorage, RemotePath};
use remote_storage::GenericRemoteStorage;
use tokio::fs::File;
use tokio::runtime::Builder;
@@ -151,7 +151,7 @@ async fn update_task(
let timeline_dir = conf.timeline_dir(&ttid);
let handle = tokio::spawn(
backup_task_main(ttid, timeline_dir, conf.workdir.clone(), shutdown_rx)
backup_task_main(ttid, timeline_dir, shutdown_rx)
.instrument(info_span!("WAL backup task", ttid = %ttid)),
);
@@ -182,10 +182,10 @@ async fn wal_backup_launcher_main_loop(
let conf_ = conf.clone();
REMOTE_STORAGE.get_or_init(|| {
conf_
.remote_storage
.as_ref()
.map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage"))
conf_.remote_storage.as_ref().map(|c| {
GenericRemoteStorage::from_config(conf_.workdir, c)
.expect("failed to create remote storage")
})
});
// Presence in this map means the launcher is aware s3 offloading is needed for
@@ -234,7 +234,6 @@ async fn wal_backup_launcher_main_loop(
struct WalBackupTask {
timeline: Arc<Timeline>,
timeline_dir: PathBuf,
workspace_dir: PathBuf,
wal_seg_size: usize,
commit_lsn_watch_rx: watch::Receiver<Lsn>,
}
@@ -243,7 +242,6 @@ struct WalBackupTask {
async fn backup_task_main(
ttid: TenantTimelineId,
timeline_dir: PathBuf,
workspace_dir: PathBuf,
mut shutdown_rx: Receiver<()>,
) {
info!("started");
@@ -259,7 +257,6 @@ async fn backup_task_main(
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
timeline: tli,
timeline_dir,
workspace_dir,
};
// task is spun up only when wal_seg_size is already initialized
@@ -324,7 +321,6 @@ impl WalBackupTask {
commit_lsn,
self.wal_seg_size,
&self.timeline_dir,
&self.workspace_dir,
)
.await
{
@@ -357,12 +353,11 @@ pub async fn backup_lsn_range(
end_lsn: Lsn,
wal_seg_size: usize,
timeline_dir: &Path,
workspace_dir: &Path,
) -> Result<Lsn> {
let mut res = start_lsn;
let segments = get_segments(start_lsn, end_lsn, wal_seg_size);
for s in &segments {
backup_single_segment(s, timeline_dir, workspace_dir)
backup_single_segment(s, timeline_dir)
.await
.with_context(|| format!("offloading segno {}", s.seg_no))?;
@@ -377,24 +372,11 @@ pub async fn backup_lsn_range(
Ok(res)
}
async fn backup_single_segment(
seg: &Segment,
timeline_dir: &Path,
workspace_dir: &Path,
) -> Result<()> {
let segment_file_path = seg.file_path(timeline_dir)?;
let remote_segment_path = segment_file_path
.strip_prefix(&workspace_dir)
.context("Failed to strip workspace dir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {segment_file_path:?} for base {workspace_dir:?}",
)
})?;
async fn backup_single_segment(seg: &Segment, timeline_dir: &Path) -> Result<()> {
let segment_file_name = seg.file_path(timeline_dir)?;
backup_object(&segment_file_path, &remote_segment_path, seg.size()).await?;
debug!("Backup of {} done", segment_file_path.display());
backup_object(&segment_file_name, seg.size()).await?;
debug!("Backup of {} done", segment_file_name.display());
Ok(())
}
@@ -444,7 +426,7 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec<Segment> {
static REMOTE_STORAGE: OnceCell<Option<GenericRemoteStorage>> = OnceCell::new();
async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize) -> Result<()> {
async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
let storage = REMOTE_STORAGE
.get()
.expect("failed to get remote storage")
@@ -459,12 +441,12 @@ async fn backup_object(source_file: &Path, target_file: &RemotePath, size: usize
})?);
storage
.upload_storage_object(Box::new(file), size, target_file)
.upload_storage_object(Box::new(file), size, source_file)
.await
}
pub async fn read_object(
file_path: &RemotePath,
file_path: PathBuf,
offset: u64,
) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead>>> {
let storage = REMOTE_STORAGE
@@ -473,13 +455,19 @@ pub async fn read_object(
.as_ref()
.context("No remote storage configured")?;
info!("segment download about to start from remote path {file_path:?} at offset {offset}");
info!(
"segment download about to start for local path {} at offset {}",
file_path.display(),
offset
);
let download = storage
.download_storage_object(Some((offset, None)), file_path)
.download_storage_object(Some((offset, None)), &file_path)
.await
.with_context(|| {
format!("Failed to open WAL segment download stream for remote path {file_path:?}")
format!(
"Failed to open WAL segment download stream for local path {}",
file_path.display()
)
})?;
Ok(download.download_stream)

View File

@@ -8,7 +8,6 @@
//! Note that the last file has a `.partial` suffix; that's different from postgres.
use anyhow::{bail, Context, Result};
use remote_storage::RemotePath;
use std::io::{self, Seek, SeekFrom};
use std::pin::Pin;
@@ -446,7 +445,6 @@ fn remove_segments_from_disk(
}
pub struct WalReader {
workdir: PathBuf,
timeline_dir: PathBuf,
wal_seg_size: usize,
pos: Lsn,
@@ -461,7 +459,6 @@ pub struct WalReader {
impl WalReader {
pub fn new(
workdir: PathBuf,
timeline_dir: PathBuf,
state: &SafeKeeperState,
start_pos: Lsn,
@@ -481,7 +478,6 @@ impl WalReader {
}
Ok(Self {
workdir,
timeline_dir,
wal_seg_size: state.server.wal_seg_size as usize,
pos: start_pos,
@@ -549,17 +545,7 @@ impl WalReader {
// Try to open remote file, if remote reads are enabled
if self.enable_remote_read {
let remote_wal_file_path = wal_file_path
.strip_prefix(&self.workdir)
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
.with_context(|| {
format!(
"Failed to resolve remote part of path {:?} for base {:?}",
wal_file_path, self.workdir,
)
})?;
return read_object(&remote_wal_file_path, xlogoff as u64).await;
return read_object(wal_file_path, xlogoff as u64).await;
}
bail!("WAL segment is not found")

View File

@@ -6,6 +6,9 @@ Prerequisites:
- Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
- Neon and Postgres binaries
- See the root [README.md](/README.md) for build directions
If you want to run tests that use test-only APIs, you need to add `--features testing` to the Rust build commands.
For convenience, the repository's cargo config contains a `build_testing` alias that serves as a subcommand and adds the required feature flag.
Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release`.
- Tests can be run from the git tree; or see the environment variables
below to run from other directories.
- The neon git repo, including the postgres submodule

View File

@@ -33,7 +33,7 @@ from _pytest.config import Config
from _pytest.fixtures import FixtureRequest
from fixtures.log_helper import log
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture
from fixtures.utils import Fn, allure_attach_from_dir, etcd_path, get_self_dir, subprocess_capture
# Type-related stuff
from psycopg2.extensions import connection as PgConnection
@@ -587,7 +587,6 @@ class NeonEnvBuilder:
auth_enabled: bool = False,
rust_log_override: Optional[str] = None,
default_branch_name: str = DEFAULT_BRANCH_NAME,
testing_mode: bool = True,
):
self.repo_dir = repo_dir
self.rust_log_override = rust_log_override
@@ -609,7 +608,6 @@ class NeonEnvBuilder:
self.neon_binpath = neon_binpath
self.pg_distrib_dir = pg_distrib_dir
self.pg_version = pg_version
self.testing_mode = testing_mode
def init(self) -> NeonEnv:
# Cannot create more than one environment from one builder
@@ -860,7 +858,6 @@ class NeonEnv:
http=self.port_distributor.get_port(),
)
pageserver_auth_type = "NeonJWT" if config.auth_enabled else "Trust"
pageserver_testing_mode = "true" if config.testing_mode else "false"
toml += textwrap.dedent(
f"""
@@ -869,7 +866,6 @@ class NeonEnv:
listen_pg_addr = 'localhost:{pageserver_port.pg}'
listen_http_addr = 'localhost:{pageserver_port.http}'
auth_type = '{pageserver_auth_type}'
testing_mode = {pageserver_testing_mode}
"""
)
@@ -982,10 +978,6 @@ def _shared_simple_env(
pg_distrib_dir=pg_distrib_dir,
pg_version=pg_version,
run_id=run_id,
# Disable failpoint support. Failpoints could have unexpected consequences
# when the pageserver is shared by concurrent tests. Also, it might affect
# performance, and we use the shared simple env in performance tests.
testing_mode=False,
) as builder:
env = builder.init_start()
@@ -1056,10 +1048,11 @@ class PageserverApiException(Exception):
class PageserverHttpClient(requests.Session):
def __init__(self, port: int, auth_token: Optional[str] = None):
def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None):
super().__init__()
self.port = port
self.auth_token = auth_token
self.is_testing_enabled_or_skip = is_testing_enabled_or_skip
if auth_token is not None:
self.headers["Authorization"] = f"Bearer {auth_token}"
@@ -1078,6 +1071,8 @@ class PageserverHttpClient(requests.Session):
self.get(f"http://localhost:{self.port}/v1/status").raise_for_status()
def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]):
self.is_testing_enabled_or_skip()
if isinstance(config_strings, tuple):
pairs = [config_strings]
else:
@@ -1124,14 +1119,6 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach")
self.verbose_error(res)
def tenant_load(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
self.verbose_error(res)
def tenant_ignore(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
self.verbose_error(res)
def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]:
res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
@@ -1217,6 +1204,8 @@ class PageserverHttpClient(requests.Session):
def timeline_gc(
self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int]
) -> dict[str, Any]:
self.is_testing_enabled_or_skip()
log.info(
f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}"
)
@@ -1232,6 +1221,8 @@ class PageserverHttpClient(requests.Session):
return res_json
def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()
log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact"
@@ -1255,6 +1246,8 @@ class PageserverHttpClient(requests.Session):
return res_json
def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId):
self.is_testing_enabled_or_skip()
log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}")
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint"
@@ -1814,6 +1807,10 @@ class NeonPageserver(PgProtocol):
):
self.stop(immediate=True)
def is_testing_enabled_or_skip(self):
if '"testing"' not in self.version:
pytest.skip("pageserver was built without 'testing' feature")
def is_profiling_enabled_or_skip(self):
if '"profiling"' not in self.version:
pytest.skip("pageserver was built without 'profiling' feature")
@@ -1822,6 +1819,7 @@ class NeonPageserver(PgProtocol):
return PageserverHttpClient(
port=self.service_port.http,
auth_token=auth_token,
is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
)
def assert_no_errors(self):

View File

@@ -3,7 +3,7 @@
First make a release build. The profiling flag is optional, used only for tests that
generate flame graphs. The `-s` flag just silences a lot of output, and makes it
easier to see if you have compile errors without scrolling up.
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=profiling" make -s -j8`
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
NOTE: the `profiling` flag only works on Linux because we use Linux-specific
libc APIs like `libc::timer_t`.

View File

@@ -42,8 +42,7 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor)
cur.execute("drop table t")
cur.execute("set enable_seqscan_prefetch=on")
cur.execute("set effective_io_concurrency=32")
cur.execute("set maintenance_io_concurrency=32")
cur.execute("set seqscan_prefetch_buffers=100")
cur.execute(f"create table t2(x integer) WITH (fillfactor={fillfactor})")

View File

@@ -1,14 +1,10 @@
from contextlib import closing
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar
from fixtures.neon_fixtures import NeonEnvBuilder
# This test demonstrates how to collect a read trace. It's useful until
# it gets replaced by a test that actually consumes the trace.
#
# Additionally, tests that pageserver is able to create tenants with custom configs.
def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
env = neon_env_builder.init_start()
@@ -27,12 +23,6 @@ def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
cur.execute("create table t (i integer);")
cur.execute(f"insert into t values (generate_series(1,{10000}));")
cur.execute("select count(*) from t;")
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
# wait until pageserver receives that data
pageserver_http = env.pageserver.http_client()
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
# Stop pg so we drop the connection and flush the traces
pg.stop()

View File

@@ -327,6 +327,7 @@ def check_neon_works(
auth_token = snapshot_config["pageserver"]["auth_token"]
pageserver_http = PageserverHttpClient(
port=pageserver_port,
is_testing_enabled_or_skip=lambda: True, # TODO: check if testing is really enabled
auth_token=auth_token,
)

View File

@@ -13,6 +13,7 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
neon_env_builder.pageserver_config_override = "tenant_config={checkpoint_distance = 1048576}"
env = neon_env_builder.init()
env.pageserver.is_testing_enabled_or_skip()
neon_env_builder.start()

View File

@@ -71,10 +71,8 @@ def test_remote_storage_backup_and_restore(
# FIXME retry downloads without throwing errors
env.pageserver.allowed_errors.append(".*failed to load remote timeline.*")
# we have a bunch of pytest.raises for these below
env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
env.pageserver.allowed_errors.append(
".*Cannot attach tenant .*?, local tenant directory already exists.*"
)
env.pageserver.allowed_errors.append(".*tenant already exists.*")
env.pageserver.allowed_errors.append(".*attach is already in progress.*")
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
@@ -138,7 +136,7 @@ def test_remote_storage_backup_and_restore(
# assert cannot attach a tenant that is scheduled for download
# FIXME implement layer download retries
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
with pytest.raises(Exception, match="tenant already exists, current state: Broken"):
client.tenant_attach(tenant_id)
tenant_status = client.tenant_status(tenant_id)
@@ -151,7 +149,9 @@ def test_remote_storage_backup_and_restore(
env.pageserver.start()
# ensure that an initiated attach operation survives pageserver restart
with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state:"):
with pytest.raises(
Exception, match=r".*(tenant already exists|attach is already in progress).*"
):
client.tenant_attach(tenant_id)
log.info("waiting for timeline redownload")
wait_until(
@@ -165,6 +165,7 @@ def test_remote_storage_backup_and_restore(
assert (
Lsn(detail["last_record_lsn"]) >= current_lsn
), "current db Lsn should should not be less than the one stored on remote storage"
assert not detail["awaits_download"]
pg = env.postgres.create_start("main")
with pg.cursor() as cur:
@@ -190,7 +191,7 @@ def test_remote_storage_upload_queue_retries(
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_storage_upload_queue_retries",
test_name="test_remote_storage_backup_and_restore",
)
env = neon_env_builder.init_start()
@@ -352,7 +353,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_timeline_deletion_with_files_stuck_in_upload_queue",
test_name="test_remote_storage_backup_and_restore",
)
env = neon_env_builder.init_start()

View File

@@ -7,7 +7,6 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
PageserverApiException,
PageserverHttpClient,
Postgres,
RemoteStorageKind,
available_remote_storages,
wait_for_last_record_lsn,
@@ -168,337 +167,3 @@ def test_detach_while_attaching(
with pg.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM foo")
# Tests that the combination of the `ignore` and `load` operations can remove and restore the tenant in pageserver's memory.
# * writes some data into tenant's timeline
# * ensures it's synced with the remote storage
# * `ignore` the tenant
# * verify that the ignored tenant's files are unchanged on disk, except that an ignore mark has appeared
# * verify the ignored tenant is gone from pageserver's memory
# * restart the pageserver and verify that the ignored tenant is still not loaded
# * `load` the same tenant
# * ensure that its status is `Active` and it's present in pageserver's memory with all timelines
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3])
def test_ignored_tenant_reattach(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_storage_backup_and_restore",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
ignored_tenant_id, _ = env.neon_cli.create_tenant()
tenant_dir = env.repo_dir / "tenants" / str(ignored_tenant_id)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_ignore.sort()
timelines_before_ignore = [
timeline["timeline_id"]
for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
]
files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
# ignore the tenant and verify it's not present in pageserver replies, with its files still on disk
pageserver_http.tenant_ignore(ignored_tenant_id)
files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
new_files = set(files_after_ignore_with_retain) - set(files_before_ignore)
disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain)
assert (
len(disappeared_files) == 0
), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
assert (
len(new_files) == 1
), f"Only tenant ignore file should appear on disk but got: {new_files}"
tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
assert len(tenants_after_ignore) + 1 == len(
tenants_before_ignore
), "Only ignored tenant should be missing"
# restart the pageserver to ensure we don't load the ignored tenant
env.pageserver.stop()
env.pageserver.start()
tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_after_restart.sort()
assert (
tenants_after_restart == tenants_after_ignore
), "Ignored tenant should not be reloaded after pageserver restart"
# now, load it from the local files and expect it to work
pageserver_http.tenant_load(tenant_id=ignored_tenant_id)
wait_until_tenant_status(pageserver_http, ignored_tenant_id, "Active", 5)
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_after_attach.sort()
assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
timelines_after_ignore = [
timeline["timeline_id"]
for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
]
assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
# Tests that it's possible to `load` tenants with missing layers and get them restored:
# * writes some data into tenant's timeline
# * ensures it's synced with the remote storage
# * `ignore` the tenant
# * removes all timeline's local layers
# * `load` the same tenant
# * ensure that its status is `Active`
# * check that timeline data is restored
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignored_tenant_download_missing_layers(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ignored_tenant_download_and_attach",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
data_id = 1
data_secret = "very secret secret"
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_ignore.sort()
timelines_before_ignore = [
timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
]
# ignore the tenant and remove its layers
pageserver_http.tenant_ignore(tenant_id)
tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
layers_removed = False
for dir_entry in tenant_timeline_dir.iterdir():
if dir_entry.name.startswith("00000"):
# Looks like a layer file. Remove it
dir_entry.unlink()
layers_removed = True
assert layers_removed, f"Found no layers in {tenant_timeline_dir}"
# now, load it from the local files and expect it to work due to remote storage restoration
pageserver_http.tenant_load(tenant_id=tenant_id)
wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5)
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_after_attach.sort()
assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
timelines_after_ignore = [
timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
]
assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
pg.stop()
pg.start()
ensure_test_data(data_id, data_secret, pg)
# Tests that it's possible to `load` broken tenants:
# * `ignore` a tenant
# * removes its `metadata` file locally
# * `load` the same tenant
# * ensure that its status is `Broken`
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignored_tenant_stays_broken_without_metadata(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ignored_tenant_stays_broken_without_metadata",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
# ignore the tenant and remove its metadata
pageserver_http.tenant_ignore(tenant_id)
tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
metadata_removed = False
for dir_entry in tenant_timeline_dir.iterdir():
if dir_entry.name == "metadata":
# Found the metadata file. Remove it
dir_entry.unlink()
metadata_removed = True
assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}"
env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*")
# now, load it from the local files and expect it to be broken due to inability to load tenant files into memory
pageserver_http.tenant_load(tenant_id=tenant_id)
wait_until_tenant_status(pageserver_http, tenant_id, "Broken", 5)
# Tests that attach never works on a tenant, ignored or not, as long as its files are present locally.
# Similarly, tests that it's not possible to schedule a `load` for a tenant that's not ignored.
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_load_attach_negatives(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_load_attach_negatives",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
with pytest.raises(
expected_exception=PageserverApiException,
match=f"tenant {tenant_id} already exists, state: Active",
):
pageserver_http.tenant_load(tenant_id)
with pytest.raises(
expected_exception=PageserverApiException,
match=f"tenant {tenant_id} already exists, state: Active",
):
pageserver_http.tenant_attach(tenant_id)
pageserver_http.tenant_ignore(tenant_id)
env.pageserver.allowed_errors.append(
".*Cannot attach tenant .*?, local tenant directory already exists.*"
)
with pytest.raises(
expected_exception=PageserverApiException,
match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
):
pageserver_http.tenant_attach(tenant_id)
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
def test_ignore_while_attaching(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ignore_while_attaching",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
pg = env.postgres.create_start("main")
tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
data_id = 1
data_secret = "very secret secret"
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, pg)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
# Detach it
pageserver_http.tenant_detach(tenant_id)
# And re-attach, but stop attach task_mgr task from completing
pageserver_http.configure_failpoints([("attach-before-activate", "return(5000)")])
pageserver_http.tenant_attach(tenant_id)
# Run ignore on the tenant, thereby cancelling the attach.
# XXX This should take priority over attach, i.e., it should cancel the attach task.
# But neither the failpoint nor the proper storage_sync2 download functions
# are sensitive to task_mgr::shutdown.
# This problem is tracked in https://github.com/neondatabase/neon/issues/2996 .
# So, for now, effectively, this ignore here will block until attach task completes.
pageserver_http.tenant_ignore(tenant_id)
# Cannot attach it, because its local files still exist
env.pageserver.allowed_errors.append(
".*Cannot attach tenant .*?, local tenant directory already exists.*"
)
with pytest.raises(
expected_exception=PageserverApiException,
match=f"Cannot attach tenant {tenant_id}, local tenant directory already exists",
):
pageserver_http.tenant_attach(tenant_id)
tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
assert tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
assert len(tenants_after_ignore) + 1 == len(
tenants_before_ignore
), "Only ignored tenant should be missing"
# But we can load it from the local files, which will restore the attach.
pageserver_http.tenant_load(tenant_id)
wait_until_tenant_status(pageserver_http, tenant_id, "Active", 5)
pg.stop()
pg.start()
ensure_test_data(data_id, data_secret, pg)
def insert_test_data(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
timeline_id: TimelineId,
data_id: int,
data: str,
pg: Postgres,
):
with pg.cursor() as cur:
cur.execute(
f"""
CREATE TABLE test(id int primary key, secret text);
INSERT INTO test VALUES ({data_id}, '{data}');
"""
)
current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
# wait until pageserver receives that data
wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn)
# run checkpoint manually to be sure that data landed in remote storage
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
# wait until pageserver successfully uploaded a checkpoint to remote storage
log.info("waiting for to be ignored tenant data checkpoint upload")
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
def ensure_test_data(data_id: int, data: str, pg: Postgres):
with pg.cursor() as cur:
assert (
query_scalar(cur, f"SELECT secret FROM test WHERE id = {data_id};") == data
), "Should have timeline data back"
# Intentionally does not use `wait_until`, to make debugging easier
def wait_until_tenant_status(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
expected_status: str,
iterations: int,
) -> bool:
for _ in range(iterations):
try:
tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
log.debug(f"Tenant {tenant_id} status: {tenant}")
if tenant["state"] == expected_status:
return True
except Exception as e:
log.debug(f"Tenant {tenant_id} status retrieval failure: {e}")
time.sleep(1)
raise Exception(f"Tenant {tenant_id} did not become {expected_status} in {iterations} seconds")

View File

@@ -58,6 +58,7 @@ def new_pageserver_service(
pageserver_client = PageserverHttpClient(
port=http_port,
auth_token=None,
is_testing_enabled_or_skip=lambda: True, # TODO: check if testing is really enabled
)
try:
pageserver_process = start_in_background(
@@ -359,6 +360,7 @@ def test_tenant_relocation(
new_pageserver_http = PageserverHttpClient(
port=new_pageserver_http_port,
auth_token=None,
is_testing_enabled_or_skip=env.pageserver.is_testing_enabled_or_skip,
)
with new_pageserver_service(

View File

@@ -19,6 +19,7 @@ bytes = { version = "1", features = ["serde", "std"] }
clap = { version = "4", features = ["color", "derive", "error-context", "help", "std", "string", "suggestions", "usage"] }
crossbeam-utils = { version = "0.8", features = ["once_cell", "std"] }
either = { version = "1", features = ["use_std"] }
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
futures-channel = { version = "0.3", features = ["alloc", "futures-sink", "sink", "std"] }
futures-task = { version = "0.3", default-features = false, features = ["alloc", "std"] }
futures-util = { version = "0.3", features = ["alloc", "async-await", "async-await-macro", "channel", "futures-channel", "futures-io", "futures-macro", "futures-sink", "io", "memchr", "sink", "slab", "std"] }
@@ -36,16 +37,16 @@ prost-a6292c17cd707f01 = { package = "prost", version = "0.11", features = ["pro
rand = { version = "0.8", features = ["alloc", "getrandom", "libc", "rand_chacha", "rand_hc", "small_rng", "std", "std_rng"] }
regex = { version = "1", features = ["aho-corasick", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
regex-syntax = { version = "0.6", features = ["unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "default-tls", "hyper-rustls", "hyper-tls", "json", "native-tls-crate", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-native-tls", "tokio-rustls", "webpki-roots"] }
reqwest = { version = "0.11", default-features = false, features = ["__rustls", "__tls", "blocking", "hyper-rustls", "json", "rustls", "rustls-pemfile", "rustls-tls", "rustls-tls-webpki-roots", "serde_json", "tokio-rustls", "webpki-roots"] }
scopeguard = { version = "1", features = ["use_std"] }
serde = { version = "1", features = ["alloc", "derive", "serde_derive", "std"] }
stable_deref_trait = { version = "1", features = ["alloc", "std"] }
time = { version = "0.3", features = ["alloc", "formatting", "itoa", "macros", "parsing", "std", "time-macros"] }
tokio = { version = "1", features = ["bytes", "fs", "io-std", "io-util", "libc", "macros", "memchr", "mio", "net", "num_cpus", "once_cell", "process", "rt", "rt-multi-thread", "signal-hook-registry", "socket2", "sync", "time", "tokio-macros"] }
tokio-util = { version = "0.7", features = ["codec", "io", "io-util", "tracing"] }
tower = { version = "0.4", features = ["__common", "balance", "buffer", "discover", "futures-core", "futures-util", "indexmap", "limit", "load", "log", "make", "pin-project", "pin-project-lite", "rand", "ready-cache", "retry", "slab", "timeout", "tokio", "tokio-util", "tracing", "util"] }
tracing = { version = "0.1", features = ["attributes", "log", "std", "tracing-attributes"] }
tracing-core = { version = "0.1", features = ["once_cell", "std"] }
url = { version = "2", features = ["serde"] }
[build-dependencies]
ahash = { version = "0.7", features = ["std"] }