mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-01 01:30:38 +00:00
Compare commits
5 Commits
conn_pools
...
proxy-skip
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6062cd180c | ||
|
|
d2a6cd1182 | ||
|
|
75bfd22084 | ||
|
|
3cb3dce34e | ||
|
|
c7e43f40ba |
@@ -23,30 +23,10 @@ platforms = [
|
||||
]
|
||||
|
||||
[final-excludes]
|
||||
workspace-members = [
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
"vm_monitor",
|
||||
# All of these exist in libs and are not usually built independently.
|
||||
# Putting workspace hack there adds a bottleneck for cargo builds.
|
||||
"compute_api",
|
||||
"consumption_metrics",
|
||||
"desim",
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"postgres_ffi",
|
||||
"pq_proto",
|
||||
"remote_storage",
|
||||
"safekeeper_api",
|
||||
"tenant_size_model",
|
||||
"tracing-utils",
|
||||
"utils",
|
||||
"wal_craft",
|
||||
"walproposer",
|
||||
]
|
||||
# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
|
||||
# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
|
||||
# from depending on workspace-hack because most of the dependencies are not used.
|
||||
workspace-members = ["vm_monitor"]
|
||||
|
||||
# Write out exact versions rather than a semver range. (Defaults to false.)
|
||||
# exact-versions = true
|
||||
|
||||
@@ -5,22 +5,26 @@
|
||||
!Cargo.toml
|
||||
!Makefile
|
||||
!rust-toolchain.toml
|
||||
!scripts/combine_control_files.py
|
||||
!scripts/ninstall.sh
|
||||
!vm-cgconfig.conf
|
||||
!docker-compose/run-tests.sh
|
||||
|
||||
# Directories
|
||||
!.cargo/
|
||||
!.config/
|
||||
!compute/
|
||||
!compute_tools/
|
||||
!control_plane/
|
||||
!libs/
|
||||
!neon_local/
|
||||
!pageserver/
|
||||
!patches/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!storage_scrubber/
|
||||
!safekeeper/
|
||||
!storage_broker/
|
||||
!storage_controller/
|
||||
!trace/
|
||||
!vendor/postgres-*/
|
||||
!workspace_hack/
|
||||
|
||||
2
.gitattributes
vendored
2
.gitattributes
vendored
@@ -1,2 +0,0 @@
|
||||
# allows for nicer hunk headers with git show
|
||||
*.rs diff=rust
|
||||
6
.github/ISSUE_TEMPLATE/config.yml
vendored
6
.github/ISSUE_TEMPLATE/config.yml
vendored
@@ -1,6 +0,0 @@
|
||||
|
||||
blank_issues_enabled: true
|
||||
contact_links:
|
||||
- name: Feature request
|
||||
url: https://console.neon.tech/app/projects?modal=feedback
|
||||
about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`
|
||||
11
.github/actionlint.yml
vendored
11
.github/actionlint.yml
vendored
@@ -1,22 +1,13 @@
|
||||
self-hosted-runner:
|
||||
labels:
|
||||
- arm64
|
||||
- gen3
|
||||
- large
|
||||
- large-arm64
|
||||
- small
|
||||
- small-arm64
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- AZURE_DEV_CLIENT_ID
|
||||
- AZURE_DEV_REGISTRY_NAME
|
||||
- AZURE_DEV_SUBSCRIPTION_ID
|
||||
- AZURE_PROD_CLIENT_ID
|
||||
- AZURE_PROD_REGISTRY_NAME
|
||||
- AZURE_PROD_SUBSCRIPTION_ID
|
||||
- AZURE_TENANT_ID
|
||||
- BENCHMARK_PROJECT_ID_PUB
|
||||
- BENCHMARK_PROJECT_ID_SUB
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
- DEV_AWS_OIDC_ROLE_ARN
|
||||
|
||||
@@ -183,7 +183,7 @@ runs:
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Store Allure test stat in the DB (new)
|
||||
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
|
||||
|
||||
2
.github/actions/download/action.yml
vendored
2
.github/actions/download/action.yml
vendored
@@ -26,7 +26,7 @@ runs:
|
||||
TARGET: ${{ inputs.path }}
|
||||
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
|
||||
SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id, github.run_attempt) }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
|
||||
run: |
|
||||
BUCKET=neon-github-public-dev
|
||||
FILENAME=$(basename $ARCHIVE)
|
||||
|
||||
16
.github/actions/neon-project-create/action.yml
vendored
16
.github/actions/neon-project-create/action.yml
vendored
@@ -9,13 +9,16 @@ inputs:
|
||||
description: 'Region ID, if not set the project will be created in the default region'
|
||||
default: aws-us-east-2
|
||||
postgres_version:
|
||||
description: 'Postgres version; default is 16'
|
||||
default: '16'
|
||||
description: 'Postgres version; default is 15'
|
||||
default: '15'
|
||||
api_host:
|
||||
description: 'Neon API host'
|
||||
default: console-stage.neon.build
|
||||
provisioner:
|
||||
description: 'k8s-pod or k8s-neonvm'
|
||||
default: 'k8s-pod'
|
||||
compute_units:
|
||||
description: '[Min, Max] compute units'
|
||||
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
|
||||
default: '[1, 1]'
|
||||
|
||||
outputs:
|
||||
@@ -34,6 +37,10 @@ runs:
|
||||
# A shell without `set -x` to not to expose password/dsn in logs
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
|
||||
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
|
||||
fi
|
||||
|
||||
project=$(curl \
|
||||
"https://${API_HOST}/api/v2/projects" \
|
||||
--fail \
|
||||
@@ -45,7 +52,7 @@ runs:
|
||||
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
|
||||
\"pg_version\": ${POSTGRES_VERSION},
|
||||
\"region_id\": \"${REGION_ID}\",
|
||||
\"provisioner\": \"k8s-neonvm\",
|
||||
\"provisioner\": \"${PROVISIONER}\",
|
||||
\"autoscaling_limit_min_cu\": ${MIN_CU},
|
||||
\"autoscaling_limit_max_cu\": ${MAX_CU},
|
||||
\"settings\": { }
|
||||
@@ -68,5 +75,6 @@ runs:
|
||||
API_KEY: ${{ inputs.api_key }}
|
||||
REGION_ID: ${{ inputs.region_id }}
|
||||
POSTGRES_VERSION: ${{ inputs.postgres_version }}
|
||||
PROVISIONER: ${{ inputs.provisioner }}
|
||||
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
|
||||
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
|
||||
|
||||
41
.github/actions/run-python-test-set/action.yml
vendored
41
.github/actions/run-python-test-set/action.yml
vendored
@@ -43,7 +43,7 @@ inputs:
|
||||
pg_version:
|
||||
description: 'Postgres version to use for tests'
|
||||
required: false
|
||||
default: 'v16'
|
||||
default: 'v14'
|
||||
benchmark_durations:
|
||||
description: 'benchmark durations JSON'
|
||||
required: false
|
||||
@@ -56,14 +56,14 @@ runs:
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
- name: Download Neon binaries for the previous release
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
|
||||
path: /tmp/neon-previous
|
||||
prefix: latest
|
||||
|
||||
@@ -71,7 +71,7 @@ runs:
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||
path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
|
||||
prefix: latest
|
||||
# The lack of compatibility snapshot (for example, for the new Postgres version)
|
||||
@@ -83,12 +83,13 @@ runs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
shell: bash -euxo pipefail {0}
|
||||
@@ -113,8 +114,6 @@ runs:
|
||||
export PLATFORM=${PLATFORM:-github-actions-selfhosted}
|
||||
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
|
||||
export DEFAULT_PG_VERSION=${PG_VERSION#v}
|
||||
export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib
|
||||
export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-}
|
||||
|
||||
if [ "${BUILD_TYPE}" = "remote" ]; then
|
||||
export REMOTE_ENV=1
|
||||
@@ -130,8 +129,8 @@ runs:
|
||||
exit 1
|
||||
fi
|
||||
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
|
||||
# -n sets the number of parallel processes that pytest-xdist will run
|
||||
EXTRA_PARAMS="-n12 $EXTRA_PARAMS"
|
||||
# -n16 uses sixteen processes to run tests via pytest-xdist
|
||||
EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
|
||||
|
||||
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
|
||||
# to the same worker to make @pytest.mark.order work with xdist
|
||||
@@ -169,28 +168,23 @@ runs:
|
||||
EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
|
||||
fi
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
cov_prefix=()
|
||||
else
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
# Wake up the cluster if we use remote neon instance
|
||||
if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}"
|
||||
done
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
|
||||
fi
|
||||
|
||||
# Run the tests.
|
||||
#
|
||||
# --alluredir saves test results in Allure format (in a specified directory)
|
||||
# The junit.xml file allows CI tools to display more fine-grained test information
|
||||
# in its "Tests" tab in the results page.
|
||||
# --verbose prints name of each test (helpful when there are
|
||||
# multiple tests in one file)
|
||||
# -rA prints summary in the end
|
||||
@@ -199,6 +193,7 @@ runs:
|
||||
#
|
||||
mkdir -p $TEST_OUTPUT/allure/results
|
||||
"${cov_prefix[@]}" ./scripts/pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--alluredir=$TEST_OUTPUT/allure/results \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
@@ -211,13 +206,13 @@ runs:
|
||||
fi
|
||||
|
||||
- name: Upload compatibility snapshot
|
||||
# Note, that we use `github.base_ref` which is a target branch for a PR
|
||||
if: github.event_name == 'pull_request' && github.base_ref == 'release'
|
||||
if: github.ref_name == 'release'
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }}
|
||||
name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }}
|
||||
# Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
|
||||
path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/
|
||||
prefix: latest
|
||||
|
||||
- name: Upload test results
|
||||
if: ${{ !cancelled() }}
|
||||
|
||||
36
.github/actions/set-docker-config-dir/action.yml
vendored
36
.github/actions/set-docker-config-dir/action.yml
vendored
@@ -1,36 +0,0 @@
|
||||
name: "Set custom docker config directory"
|
||||
description: "Create a directory for docker config and set DOCKER_CONFIG"
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Show warning on GitHub-hosted runners
|
||||
if: runner.environment == 'github-hosted'
|
||||
shell: bash -euo pipefail {0}
|
||||
run: |
|
||||
# Using the following environment variables to find a path to the workflow file
|
||||
# ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
|
||||
# ${GITHUB_REPOSITORY} - octocat/hello-world
|
||||
# ${GITHUB_REF} - refs/heads/my_branch
|
||||
# From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables
|
||||
|
||||
filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
|
||||
filename=${filename_with_ref%"@$GITHUB_REF"}
|
||||
|
||||
# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
|
||||
title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
|
||||
message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
|
||||
echo "::warning file=${filename},title=${title}::${message}"
|
||||
|
||||
- uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
|
||||
env:
|
||||
DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
|
||||
with:
|
||||
main: |
|
||||
mkdir -p "${DOCKER_CONFIG}"
|
||||
echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
|
||||
post: |
|
||||
if [ -d "${DOCKER_CONFIG}" ]; then
|
||||
rm -r "${DOCKER_CONFIG}"
|
||||
fi
|
||||
4
.github/actions/upload/action.yml
vendored
4
.github/actions/upload/action.yml
vendored
@@ -8,7 +8,7 @@ inputs:
|
||||
description: "A directory or file to upload"
|
||||
required: true
|
||||
prefix:
|
||||
description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
|
||||
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
|
||||
required: false
|
||||
|
||||
runs:
|
||||
@@ -45,7 +45,7 @@ runs:
|
||||
env:
|
||||
SOURCE: ${{ inputs.path }}
|
||||
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}/{2}', github.event.pull_request.head.sha || github.sha, github.run_id , github.run_attempt) }}
|
||||
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
|
||||
run: |
|
||||
BUCKET=neon-github-public-dev
|
||||
FILENAME=$(basename $ARCHIVE)
|
||||
|
||||
168
.github/workflows/_benchmarking_preparation.yml
vendored
168
.github/workflows/_benchmarking_preparation.yml
vendored
@@ -1,168 +0,0 @@
|
||||
name: Prepare benchmarking databases by restoring dumps
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
# no inputs needed
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
jobs:
|
||||
setup-databases:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
platform: [ aws-rds-postgres, aws-aurora-serverless-v2-postgres, neon ]
|
||||
database: [ clickbench, tpch, userexample ]
|
||||
|
||||
env:
|
||||
LD_LIBRARY_PATH: /tmp/neon/pg_install/v16/lib
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
PG_BINARIES: /tmp/neon/pg_install/v16/bin
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Set up Connection String
|
||||
id: set-up-prep-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
aws-rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
|
||||
;;
|
||||
aws-aurora-serverless-v2-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_AURORA_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
# we create a table that has one row for each database that we want to restore with the status whether the restore is done
|
||||
- name: Create benchmark_restore_status table if it does not exist
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
# to avoid a race condition of multiple jobs trying to create the table at the same time,
|
||||
# we use an advisory lock
|
||||
run: |
|
||||
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
|
||||
SELECT pg_advisory_lock(4711);
|
||||
CREATE TABLE IF NOT EXISTS benchmark_restore_status (
|
||||
databasename text primary key,
|
||||
restore_done boolean
|
||||
);
|
||||
SELECT pg_advisory_unlock(4711);
|
||||
"
|
||||
|
||||
- name: Check if restore is already done
|
||||
id: check-restore-done
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
run: |
|
||||
skip=false
|
||||
if ${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM benchmark_restore_status WHERE databasename='${{ env.DATABASE_NAME }}' AND restore_done=true;" | grep -q 1; then
|
||||
echo "Restore already done for database ${{ env.DATABASE_NAME }} on platform ${{ env.PLATFORM }}. Skipping this database."
|
||||
skip=true
|
||||
fi
|
||||
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- name: Check and create database if it does not exist
|
||||
if: steps.check-restore-done.outputs.skip != 'true'
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
run: |
|
||||
DB_EXISTS=$(${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -tAc "SELECT 1 FROM pg_database WHERE datname='${{ env.DATABASE_NAME }}'")
|
||||
if [ "$DB_EXISTS" != "1" ]; then
|
||||
echo "Database ${{ env.DATABASE_NAME }} does not exist. Creating it..."
|
||||
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "CREATE DATABASE \"${{ env.DATABASE_NAME }}\";"
|
||||
else
|
||||
echo "Database ${{ env.DATABASE_NAME }} already exists."
|
||||
fi
|
||||
|
||||
- name: Download dump from S3 to /tmp/dumps
|
||||
if: steps.check-restore-done.outputs.skip != 'true'
|
||||
env:
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
run: |
|
||||
mkdir -p /tmp/dumps
|
||||
aws s3 cp s3://neon-github-dev/performance/pgdumps/$DATABASE_NAME/$DATABASE_NAME.pg_dump /tmp/dumps/
|
||||
|
||||
- name: Replace database name in connection string
|
||||
if: steps.check-restore-done.outputs.skip != 'true'
|
||||
id: replace-dbname
|
||||
env:
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
||||
run: |
|
||||
# Extract the part before the database name
|
||||
base_connstr="${BENCHMARK_CONNSTR%/*}"
|
||||
# Extract the query parameters (if any) after the database name
|
||||
query_params="${BENCHMARK_CONNSTR#*\?}"
|
||||
# Reconstruct the new connection string
|
||||
if [ "$query_params" != "$BENCHMARK_CONNSTR" ]; then
|
||||
new_connstr="${base_connstr}/${DATABASE_NAME}?${query_params}"
|
||||
else
|
||||
new_connstr="${base_connstr}/${DATABASE_NAME}"
|
||||
fi
|
||||
echo "database_connstr=${new_connstr}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Restore dump
|
||||
if: steps.check-restore-done.outputs.skip != 'true'
|
||||
env:
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
DATABASE_CONNSTR: ${{ steps.replace-dbname.outputs.database_connstr }}
|
||||
# the following works only with larger computes:
|
||||
# PGOPTIONS: "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
|
||||
# we add the || true because:
|
||||
# the dumps were created with Neon and contain neon extensions that are not
|
||||
# available in RDS, so we will always report an error, but we can ignore it
|
||||
run: |
|
||||
${PG_BINARIES}/pg_restore --clean --if-exists --no-owner --jobs=4 \
|
||||
-d "${DATABASE_CONNSTR}" /tmp/dumps/${DATABASE_NAME}.pg_dump || true
|
||||
|
||||
- name: Update benchmark_restore_status table
|
||||
if: steps.check-restore-done.outputs.skip != 'true'
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-prep-connstr.outputs.connstr }}
|
||||
DATABASE_NAME: ${{ matrix.database }}
|
||||
run: |
|
||||
${PG_BINARIES}/psql "${{ env.BENCHMARK_CONNSTR }}" -c "
|
||||
INSERT INTO benchmark_restore_status (databasename, restore_done) VALUES ('${{ env.DATABASE_NAME }}', true)
|
||||
ON CONFLICT (databasename) DO UPDATE SET restore_done = true;
|
||||
"
|
||||
324
.github/workflows/_build-and-test-locally.yml
vendored
324
.github/workflows/_build-and-test-locally.yml
vendored
@@ -1,324 +0,0 @@
|
||||
name: Build and Test Locally
|
||||
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
arch:
|
||||
description: 'x64 or arm64'
|
||||
required: true
|
||||
type: string
|
||||
build-tag:
|
||||
description: 'build tag'
|
||||
required: true
|
||||
type: string
|
||||
build-tools-image:
|
||||
description: 'build-tools image'
|
||||
required: true
|
||||
type: string
|
||||
build-type:
|
||||
description: 'debug or release'
|
||||
required: true
|
||||
type: string
|
||||
pg-versions:
|
||||
description: 'a json array of postgres versions to run regression tests on'
|
||||
required: true
|
||||
type: string
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
jobs:
|
||||
build-neon:
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
container:
|
||||
image: ${{ inputs.build-tools-image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# Raise locked memory limit for tokio-epoll-uring.
|
||||
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
|
||||
# io_uring will account the memory of the CQ and SQ as locked.
|
||||
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
env:
|
||||
BUILD_TYPE: ${{ inputs.build-type }}
|
||||
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16 17; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 16 revision for caching
|
||||
id: pg_v16_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 17 revision for caching
|
||||
id: pg_v17_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
|
||||
|
||||
# Set some environment variables used by all the steps.
|
||||
#
|
||||
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
|
||||
# It also includes --features, if any
|
||||
#
|
||||
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
|
||||
# because "cargo metadata" doesn't accept --release or --debug options
|
||||
#
|
||||
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
|
||||
# corresponding Cargo.toml files for their descriptions.
|
||||
- name: Set env variables
|
||||
env:
|
||||
ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked --release"
|
||||
fi
|
||||
{
|
||||
echo "cov_prefix=${cov_prefix}"
|
||||
echo "CARGO_FEATURES=${CARGO_FEATURES}"
|
||||
echo "CARGO_FLAGS=${CARGO_FLAGS}"
|
||||
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
|
||||
} >> $GITHUB_ENV
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Cache postgres v17 build
|
||||
id: cache_pg_17
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v17
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v14 -j$(nproc)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v15 -j$(nproc)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v16 -j$(nproc)
|
||||
|
||||
- name: Build postgres v17
|
||||
if: steps.cache_pg_17.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v17 -j$(nproc)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
# Do install *before* running rust tests because they might recompile the
|
||||
# binaries with different features/flags.
|
||||
- name: Install rust binaries
|
||||
env:
|
||||
ARCH: ${{ inputs.arch }}
|
||||
run: |
|
||||
# Install target binaries
|
||||
mkdir -p /tmp/neon/bin/
|
||||
binaries=$(
|
||||
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
|
||||
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||
)
|
||||
for bin in $binaries; do
|
||||
SRC=target/$BUILD_TYPE/$bin
|
||||
DST=/tmp/neon/bin/$bin
|
||||
cp "$SRC" "$DST"
|
||||
done
|
||||
|
||||
# Install test executables and write list of all binaries (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" && $ARCH == 'x64' ]]; then
|
||||
# Keep bloated coverage data files away from the rest of the artifact
|
||||
mkdir -p /tmp/coverage/
|
||||
|
||||
mkdir -p /tmp/neon/test_bin/
|
||||
|
||||
test_exe_paths=$(
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
for bin in $test_exe_paths; do
|
||||
SRC=$bin
|
||||
DST=/tmp/neon/test_bin/$(basename $bin)
|
||||
|
||||
# We don't need debug symbols for code coverage, so strip them out to make
|
||||
# the artifact smaller.
|
||||
strip "$SRC" -o "$DST"
|
||||
echo "$DST" >> /tmp/coverage/binaries.list
|
||||
done
|
||||
|
||||
for bin in $binaries; do
|
||||
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
|
||||
done
|
||||
fi
|
||||
|
||||
- name: Run rust tests
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
|
||||
export PQ_LIB_DIR
|
||||
LD_LIBRARY_PATH=$(pwd)/pg_install/v17/lib
|
||||
export LD_LIBRARY_PATH
|
||||
|
||||
#nextest does not yet support running doctests
|
||||
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# run all non-pageserver tests
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)'
|
||||
|
||||
# run pageserver tests with different settings
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
|
||||
|
||||
- name: Install postgres binaries
|
||||
run: |
|
||||
# Use tar to copy files matching the pattern, preserving the paths in the destionation
|
||||
tar c \
|
||||
pg_install/v* \
|
||||
pg_install/build/*/src/test/regress/*.so \
|
||||
pg_install/build/*/src/test/regress/pg_regress \
|
||||
pg_install/build/*/src/test/isolation/isolationtester \
|
||||
pg_install/build/*/src/test/isolation/pg_isolation_regress \
|
||||
| tar x -C /tmp/neon
|
||||
|
||||
- name: Upload Neon artifact
|
||||
uses: ./.github/actions/upload
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
|
||||
path: /tmp/neon
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
- name: Merge and upload coverage data
|
||||
if: inputs.build-type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
# Don't run regression tests on debug arm64 builds
|
||||
if: inputs.build-type != 'debug' || inputs.arch != 'arm64'
|
||||
needs: [ build-neon ]
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
container:
|
||||
image: ${{ inputs.build-tools-image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
# for changed limits, see comments on `options:` earlier in this file
|
||||
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pg_version: ${{ fromJson(inputs.pg-versions) }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
timeout-minutes: 60
|
||||
with:
|
||||
build_type: ${{ inputs.build-type }}
|
||||
test_selection: regress
|
||||
needs_postgres_source: true
|
||||
run_with_real_s3: true
|
||||
real_s3_bucket: neon-github-ci-tests
|
||||
real_s3_region: eu-central-1
|
||||
rerun_flaky: true
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
BUILD_TAG: ${{ inputs.build-tag }}
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
|
||||
# Temporary disable this step until we figure out why it's so flaky
|
||||
# Ref https://github.com/neondatabase/neon/issues/4540
|
||||
- name: Merge and upload coverage data
|
||||
if: |
|
||||
false &&
|
||||
inputs.build-type == 'debug' && matrix.pg_version == 'v16'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
56
.github/workflows/_push-to-acr.yml
vendored
56
.github/workflows/_push-to-acr.yml
vendored
@@ -1,56 +0,0 @@
|
||||
name: Push images to ACR
|
||||
on:
|
||||
workflow_call:
|
||||
inputs:
|
||||
client_id:
|
||||
description: Client ID of Azure managed identity or Entra app
|
||||
required: true
|
||||
type: string
|
||||
image_tag:
|
||||
description: Tag for the container image
|
||||
required: true
|
||||
type: string
|
||||
images:
|
||||
description: Images to push
|
||||
required: true
|
||||
type: string
|
||||
registry_name:
|
||||
description: Name of the container registry
|
||||
required: true
|
||||
type: string
|
||||
subscription_id:
|
||||
description: Azure subscription ID
|
||||
required: true
|
||||
type: string
|
||||
tenant_id:
|
||||
description: Azure tenant ID
|
||||
required: true
|
||||
type: string
|
||||
|
||||
jobs:
|
||||
push-to-acr:
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
contents: read # This is required for actions/checkout
|
||||
id-token: write # This is required for Azure Login to work.
|
||||
|
||||
steps:
|
||||
- name: Azure login
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ inputs.client_id }}
|
||||
subscription-id: ${{ inputs.subscription_id }}
|
||||
tenant-id: ${{ inputs.tenant_id }}
|
||||
|
||||
- name: Login to ACR
|
||||
run: |
|
||||
az acr login --name=${{ inputs.registry_name }}
|
||||
|
||||
- name: Copy docker images to ACR ${{ inputs.registry_name }}
|
||||
run: |
|
||||
images='${{ inputs.images }}'
|
||||
for image in ${images}; do
|
||||
docker buildx imagetools create \
|
||||
-t ${{ inputs.registry_name }}.azurecr.io/neondatabase/${image}:${{ inputs.image_tag }} \
|
||||
neondatabase/${image}:${{ inputs.image_tag }}
|
||||
done
|
||||
11
.github/workflows/actionlint.yml
vendored
11
.github/workflows/actionlint.yml
vendored
@@ -36,16 +36,15 @@ jobs:
|
||||
fail_on_error: true
|
||||
filter_mode: nofilter
|
||||
level: error
|
||||
|
||||
- name: Disallow 'ubuntu-latest' runners
|
||||
run: |
|
||||
- run: |
|
||||
PAT='^\s*runs-on:.*-latest'
|
||||
if grep -ERq $PAT .github/workflows; then
|
||||
if grep -ERq $PAT .github/workflows
|
||||
then
|
||||
grep -ERl $PAT .github/workflows |\
|
||||
while read -r f
|
||||
do
|
||||
l=$(grep -nE $PAT $f | awk -F: '{print $1}' | head -1)
|
||||
echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
|
||||
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
|
||||
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
|
||||
498
.github/workflows/benchmarking.yml
vendored
498
.github/workflows/benchmarking.yml
vendored
@@ -12,6 +12,7 @@ on:
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 3 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
inputs:
|
||||
region_id:
|
||||
@@ -55,54 +56,28 @@ concurrency:
|
||||
jobs:
|
||||
bench:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "neon-staging"
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
RUNNER: [ self-hosted, us-east-2, x64 ]
|
||||
- DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: "azure-staging"
|
||||
region_id: 'azure-eastus2'
|
||||
RUNNER: [ self-hosted, eastus2, x64 ]
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
PLATFORM: "neon-staging"
|
||||
|
||||
runs-on: ${{ matrix.RUNNER }}
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials # necessary on Azure runners
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
@@ -110,7 +85,7 @@ jobs:
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ matrix.region_id }}
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
@@ -121,18 +96,10 @@ jobs:
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
# Set --sparse-ordering option of pytest-order plugin
|
||||
# to ensure tests are running in order of appears in the file.
|
||||
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
|
||||
extra_params:
|
||||
-m remote_cluster
|
||||
--sparse-ordering
|
||||
--timeout 14400
|
||||
--ignore test_runner/performance/test_perf_olap.py
|
||||
--ignore test_runner/performance/test_perf_pgvector_queries.py
|
||||
--ignore test_runner/performance/test_logical_replication.py
|
||||
--ignore test_runner/performance/test_physical_replication.py
|
||||
extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -146,7 +113,6 @@ jobs:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -155,100 +121,7 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: |
|
||||
Periodic perf testing: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
replication-tests:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: "neon-staging"
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Run Logical Replication benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_logical_replication.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
BENCHMARK_PROJECT_ID_PUB: ${{ vars.BENCHMARK_PROJECT_ID_PUB }}
|
||||
BENCHMARK_PROJECT_ID_SUB: ${{ vars.BENCHMARK_PROJECT_ID_SUB }}
|
||||
|
||||
- name: Run Physical Replication benchmarks
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_physical_replication.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 5400
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream
|
||||
slack-message: |
|
||||
Periodic replication testing: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -257,16 +130,13 @@ jobs:
|
||||
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
|
||||
#
|
||||
# Available platforms:
|
||||
# - neonvm-captest-new: Freshly created project (1 CU)
|
||||
# - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
|
||||
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
|
||||
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
|
||||
# - neonvm-captest-reuse: Reusing existing project
|
||||
# - neon-captest-new: Freshly created project (1 CU)
|
||||
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
|
||||
# - neon-captest-reuse: Reusing existing project
|
||||
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
env:
|
||||
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
|
||||
DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
|
||||
@@ -277,37 +147,23 @@ jobs:
|
||||
- name: Generate matrix for pgbench benchmark
|
||||
id: pgbench-compare-matrix
|
||||
run: |
|
||||
region_id_default=${{ env.DEFAULT_REGION_ID }}
|
||||
runner_default='["self-hosted", "us-east-2", "x64"]'
|
||||
runner_azure='["self-hosted", "eastus2", "x64"]'
|
||||
image_default="neondatabase/build-tools:pinned"
|
||||
matrix='{
|
||||
"pg_version" : [
|
||||
16
|
||||
],
|
||||
"region_id" : [
|
||||
"'"$region_id_default"'"
|
||||
],
|
||||
"platform": [
|
||||
"neonvm-captest-new",
|
||||
"neonvm-captest-reuse",
|
||||
"neon-captest-new",
|
||||
"neon-captest-reuse",
|
||||
"neonvm-captest-new"
|
||||
],
|
||||
"db_size": [ "10gb" ],
|
||||
"runner": ['"$runner_default"'],
|
||||
"image": [ "'"$image_default"'" ],
|
||||
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
|
||||
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]
|
||||
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neon-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
|
||||
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
|
||||
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
|
||||
}'
|
||||
|
||||
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" },
|
||||
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }]')
|
||||
if [ "$(date +%A)" = "Saturday" ]; then
|
||||
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
|
||||
{ "platform": "rds-aurora", "db_size": "50gb"}]')
|
||||
fi
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
@@ -317,7 +173,7 @@ jobs:
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neonvm-captest-reuse"
|
||||
"neon-captest-reuse"
|
||||
]
|
||||
}'
|
||||
|
||||
@@ -333,7 +189,7 @@ jobs:
|
||||
run: |
|
||||
matrix='{
|
||||
"platform": [
|
||||
"neonvm-captest-reuse"
|
||||
"neon-captest-reuse"
|
||||
],
|
||||
"scale": [
|
||||
"10"
|
||||
@@ -347,17 +203,9 @@ jobs:
|
||||
|
||||
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
|
||||
|
||||
prepare_AWS_RDS_databases:
|
||||
uses: ./.github/workflows/_benchmarking_preparation.yml
|
||||
secrets: inherit
|
||||
|
||||
pgbench-compare:
|
||||
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
|
||||
needs: [ generate-matrices, prepare_AWS_RDS_databases ]
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -367,15 +215,15 @@ jobs:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: ${{ matrix.runner }}
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: ${{ matrix.image }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
# Increase timeout to 8h, default timeout is 6h
|
||||
@@ -384,41 +232,40 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Create Neon Project
|
||||
if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
|
||||
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: ${{ matrix.region_id }}
|
||||
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
|
||||
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-reuse)
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
neonvm-captest-sharding-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
|
||||
;;
|
||||
neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
|
||||
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
|
||||
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -435,6 +282,16 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -443,7 +300,6 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -457,7 +313,6 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -471,7 +326,6 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
@@ -485,7 +339,6 @@ jobs:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -494,27 +347,11 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: |
|
||||
Periodic perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
slack-message: "Periodic perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
pgbench-pgvector:
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- PLATFORM: "neonvm-captest-pgvector"
|
||||
RUNNER: [ self-hosted, us-east-2, x64 ]
|
||||
- PLATFORM: "azure-captest-pgvector"
|
||||
RUNNER: [ self-hosted, eastus2, x64 ]
|
||||
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "1"
|
||||
@@ -522,72 +359,43 @@ jobs:
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
PLATFORM: ${{ matrix.PLATFORM }}
|
||||
PLATFORM: "neon-captest-pgvector"
|
||||
|
||||
runs-on: ${{ matrix.RUNNER }}
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
# until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
|
||||
# instead of using Neon artifacts containing pgbench
|
||||
- name: Install postgresql-16 where pytest expects it
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
# Just to make it easier to test things locally on macOS (with arm64)
|
||||
arch=$(uname -m | sed 's/x86_64/amd64/g' | sed 's/aarch64/arm64/g')
|
||||
|
||||
cd /home/nonroot
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-17/libpq5_17.0-1.pgdg110+1_${arch}.deb"
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb"
|
||||
wget -q "https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.4-1.pgdg110+2_${arch}.deb"
|
||||
dpkg -x libpq5_17.0-1.pgdg110+1_${arch}.deb pg
|
||||
dpkg -x postgresql-16_16.4-1.pgdg110+2_${arch}.deb pg
|
||||
dpkg -x postgresql-client-16_16.4-1.pgdg110+2_${arch}.deb pg
|
||||
|
||||
mkdir -p /tmp/neon/pg_install/v16/bin
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
|
||||
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
|
||||
ln -s /home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu /tmp/neon/pg_install/v16/lib
|
||||
|
||||
LD_LIBRARY_PATH="/home/nonroot/pg/usr/lib/$(uname -m)-linux-gnu:${LD_LIBRARY_PATH:-}"
|
||||
export LD_LIBRARY_PATH
|
||||
echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> ${GITHUB_ENV}
|
||||
|
||||
/tmp/neon/pg_install/v16/bin/pgbench --version
|
||||
/tmp/neon/pg_install/v16/bin/psql --version
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-pgvector)
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
;;
|
||||
azure-captest-pgvector)
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
QUERIES=("SELECT version()")
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Benchmark pgvector hnsw indexing
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
@@ -597,7 +405,6 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -610,15 +417,13 @@ jobs:
|
||||
test_selection: performance/test_perf_pgvector_queries.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
extra_params: -m remote_cluster --timeout 21600
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -627,13 +432,11 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: |
|
||||
Periodic perf testing on ${{ env.PLATFORM }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
|
||||
clickbench-compare:
|
||||
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
|
||||
# we use for performance testing in pgbench-compare.
|
||||
@@ -643,11 +446,7 @@ jobs:
|
||||
# *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows
|
||||
# *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, pgbench-compare, prepare_AWS_RDS_databases ]
|
||||
needs: [ generate-matrices, pgbench-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -655,7 +454,7 @@ jobs:
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
TEST_OLAP_COLLECT_EXPLAIN: ${{ github.event.inputs.collect_olap_explain }}
|
||||
TEST_OLAP_COLLECT_PG_STAT_STATEMENTS: ${{ github.event.inputs.collect_pg_stat_statements }}
|
||||
@@ -665,34 +464,29 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-reuse)
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -702,13 +496,23 @@ jobs:
|
||||
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: ClickBench benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -717,7 +521,6 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -727,7 +530,6 @@ jobs:
|
||||
TEST_OLAP_SCALE: 10
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -736,10 +538,7 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: |
|
||||
Periodic OLAP perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
slack-message: "Periodic OLAP perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
@@ -751,11 +550,7 @@ jobs:
|
||||
#
|
||||
# *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB)
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, clickbench-compare, prepare_AWS_RDS_databases ]
|
||||
needs: [ generate-matrices, clickbench-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -763,7 +558,7 @@ jobs:
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
@@ -772,43 +567,38 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Get Connstring Secret Name
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-reuse)
|
||||
neon-captest-reuse)
|
||||
ENV_PLATFORM=CAPTEST_TPCH
|
||||
;;
|
||||
rds-aurora)
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
;;
|
||||
rds-postgres)
|
||||
ENV_PLATFORM=RDS_POSTGRES_TPCH
|
||||
ENV_PLATFORM=RDS_AURORA_TPCH
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -823,6 +613,16 @@ jobs:
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Run TPC-H benchmark
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -831,7 +631,6 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_tpch
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
@@ -839,7 +638,6 @@ jobs:
|
||||
TEST_OLAP_SCALE: ${{ matrix.scale }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -848,20 +646,13 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: |
|
||||
Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
user-examples-compare:
|
||||
if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }}
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
needs: [ generate-matrices, tpch-compare, prepare_AWS_RDS_databases ]
|
||||
needs: [ generate-matrices, tpch-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -869,7 +660,7 @@ jobs:
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
|
||||
@@ -877,34 +668,29 @@ jobs:
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neonvm-captest-reuse)
|
||||
neon-captest-reuse)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
@@ -914,13 +700,23 @@ jobs:
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
QUERIES=("SELECT version()")
|
||||
if [[ "${PLATFORM}" = "neon"* ]]; then
|
||||
QUERIES+=("SHOW neon.tenant_id")
|
||||
QUERIES+=("SHOW neon.timeline_id")
|
||||
fi
|
||||
|
||||
for q in "${QUERIES[@]}"; do
|
||||
psql ${CONNSTR} -c "${q}"
|
||||
done
|
||||
|
||||
- name: Run user examples
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
@@ -929,14 +725,12 @@ jobs:
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
@@ -945,10 +739,6 @@ jobs:
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: |
|
||||
Periodic TPC-H perf testing on ${{ matrix.platform }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
|
||||
slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
33
.github/workflows/build-build-tools-image.yml
vendored
33
.github/workflows/build-build-tools-image.yml
vendored
@@ -30,6 +30,7 @@ jobs:
|
||||
check-image:
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
# This job uses older version of GitHub Actions because it's run on gen2 runners, which don't support node 20 (for newer versions)
|
||||
build-image:
|
||||
needs: [ check-image ]
|
||||
if: needs.check-image.outputs.found == 'false'
|
||||
@@ -38,7 +39,7 @@ jobs:
|
||||
matrix:
|
||||
arch: [ x64, arm64 ]
|
||||
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
runs-on: ${{ fromJson(format('["self-hosted", "gen3", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
|
||||
|
||||
env:
|
||||
IMAGE_TAG: ${{ inputs.image-tag }}
|
||||
@@ -56,33 +57,35 @@ jobs:
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: ./.github/actions/set-docker-config-dir
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
with:
|
||||
cache-binary: false
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p /tmp/.docker-custom
|
||||
echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
|
||||
- uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
registry: cache.neon.build
|
||||
username: ${{ secrets.NEON_CI_DOCKERCACHE_USERNAME }}
|
||||
password: ${{ secrets.NEON_CI_DOCKERCACHE_PASSWORD }}
|
||||
|
||||
- uses: docker/build-push-action@v6
|
||||
- uses: docker/build-push-action@v4
|
||||
with:
|
||||
context: .
|
||||
provenance: false
|
||||
push: true
|
||||
pull: true
|
||||
file: Dockerfile.build-tools
|
||||
cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }}
|
||||
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }}
|
||||
cache-from: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }}
|
||||
cache-to: type=registry,ref=neondatabase/build-tools:cache-${{ matrix.arch }},mode=max
|
||||
tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }}
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
run: |
|
||||
rm -rf /tmp/.docker-custom
|
||||
|
||||
merge-images:
|
||||
needs: [ build-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
784
.github/workflows/build_and_test.yml
vendored
784
.github/workflows/build_and_test.yml
vendored
File diff suppressed because it is too large
Load Diff
23
.github/workflows/check-build-tools-image.yml
vendored
23
.github/workflows/check-build-tools-image.yml
vendored
@@ -25,17 +25,26 @@ jobs:
|
||||
found: ${{ steps.check-image.outputs.found }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Get build-tools image tag for the current commit
|
||||
id: get-build-tools-tag
|
||||
env:
|
||||
IMAGE_TAG: |
|
||||
${{ hashFiles('Dockerfile.build-tools',
|
||||
'.github/workflows/check-build-tools-image.yml',
|
||||
'.github/workflows/build-build-tools-image.yml') }}
|
||||
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
|
||||
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
echo "image-tag=${IMAGE_TAG}" | tee -a $GITHUB_OUTPUT
|
||||
LAST_BUILD_TOOLS_SHA=$(
|
||||
gh api \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" \
|
||||
--method GET \
|
||||
--field path=Dockerfile.build-tools \
|
||||
--field sha=${COMMIT_SHA} \
|
||||
--field per_page=1 \
|
||||
--jq ".[0].sha" \
|
||||
"/repos/${GITHUB_REPOSITORY}/commits"
|
||||
)
|
||||
echo "image-tag=${LAST_BUILD_TOOLS_SHA}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
- name: Check if such tag found in the registry
|
||||
id: check-image
|
||||
|
||||
102
.github/workflows/cloud-regress.yml
vendored
102
.github/workflows/cloud-regress.yml
vendored
@@ -1,102 +0,0 @@
|
||||
name: Cloud Regression Test
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '45 1 * * *' # run once a day, timezone is utc
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
regress:
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
runs-on: us-east-2
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
|
||||
- name: Patch the test
|
||||
run: |
|
||||
cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
|
||||
patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
|
||||
|
||||
- name: Generate a random password
|
||||
id: pwgen
|
||||
run: |
|
||||
set +x
|
||||
DBPASS=$(dd if=/dev/random bs=48 count=1 2>/dev/null | base64)
|
||||
echo "::add-mask::${DBPASS//\//}"
|
||||
echo DBPASS="${DBPASS//\//}" >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Change tests according to the generated password
|
||||
env:
|
||||
DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
|
||||
run: |
|
||||
cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
|
||||
for fname in sql/*.sql expected/*.out; do
|
||||
sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
|
||||
done
|
||||
for ph in $(grep NEON_MD5_PLACEHOLDER expected/password.out | awk '{print $3;}' | sort | uniq); do
|
||||
USER=$(echo "${ph}" | cut -c 22-)
|
||||
MD5=md5$(echo -n "${DBPASS}${USER}" | md5sum | awk '{print $1;}')
|
||||
sed -i.bak "s/${ph}/${MD5}/" expected/password.out
|
||||
done
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Run the regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: cloud_regress
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
extra_params: -m remote_cluster
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # on-call-staging-stream
|
||||
slack-message: |
|
||||
Periodic pg_regress on staging: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
<${{ steps.create-allure-report.outputs.report-url }}|Allure report>
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
78
.github/workflows/label-for-external-users.yml
vendored
78
.github/workflows/label-for-external-users.yml
vendored
@@ -1,78 +0,0 @@
|
||||
name: Add `external` label to issues and PRs created by external users
|
||||
|
||||
on:
|
||||
issues:
|
||||
types:
|
||||
- opened
|
||||
pull_request_target:
|
||||
types:
|
||||
- opened
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
github-actor:
|
||||
description: 'GitHub username. If empty, the username of the current user will be used'
|
||||
required: false
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
env:
|
||||
LABEL: external
|
||||
|
||||
jobs:
|
||||
check-user:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
outputs:
|
||||
is-member: ${{ steps.check-user.outputs.is-member }}
|
||||
|
||||
steps:
|
||||
- name: Check whether `${{ github.actor }}` is a member of `${{ github.repository_owner }}`
|
||||
id: check-user
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
ACTOR: ${{ inputs.github-actor || github.actor }}
|
||||
run: |
|
||||
expected_error="User does not exist or is not a member of the organization"
|
||||
output_file=output.txt
|
||||
|
||||
for i in $(seq 1 10); do
|
||||
if gh api "/orgs/${GITHUB_REPOSITORY_OWNER}/members/${ACTOR}" \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "X-GitHub-Api-Version: 2022-11-28" > ${output_file}; then
|
||||
|
||||
is_member=true
|
||||
break
|
||||
elif grep -q "${expected_error}" ${output_file}; then
|
||||
is_member=false
|
||||
break
|
||||
elif [ $i -eq 10 ]; then
|
||||
title="Failed to get memmbership status for ${ACTOR}"
|
||||
message="The latest GitHub API error message: '$(cat ${output_file})'"
|
||||
echo "::error file=.github/workflows/label-for-external-users.yml,title=${title}::${message}"
|
||||
|
||||
exit 1
|
||||
fi
|
||||
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo "is-member=${is_member}" | tee -a ${GITHUB_OUTPUT}
|
||||
|
||||
add-label:
|
||||
if: needs.check-user.outputs.is-member == 'false'
|
||||
needs: [ check-user ]
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
pull-requests: write # for `gh pr edit`
|
||||
issues: write # for `gh issue edit`
|
||||
|
||||
steps:
|
||||
- name: Add `${{ env.LABEL }}` label
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
ITEM_NUMBER: ${{ github.event[github.event_name == 'pull_request_target' && 'pull_request' || 'issue'].number }}
|
||||
GH_CLI_COMMAND: ${{ github.event_name == 'pull_request_target' && 'pr' || 'issue' }}
|
||||
run: |
|
||||
gh ${GH_CLI_COMMAND} --repo ${GITHUB_REPOSITORY} edit --add-label=${LABEL} ${ITEM_NUMBER}
|
||||
229
.github/workflows/neon_extra_builds.yml
vendored
229
.github/workflows/neon_extra_builds.yml
vendored
@@ -56,6 +56,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Install macOS postgres dependencies
|
||||
run: brew install flex bison openssl protobuf icu4c pkg-config
|
||||
@@ -72,10 +73,6 @@ jobs:
|
||||
id: pg_v16_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 17 revision for caching
|
||||
id: pg_v17_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v17) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
@@ -97,13 +94,6 @@ jobs:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v17 build
|
||||
id: cache_pg_17
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v17
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
run: |
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
@@ -131,10 +121,6 @@ jobs:
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build postgres v17
|
||||
if: steps.cache_pg_17.outputs.cache-hit != 'true'
|
||||
run: make postgres-v17 -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
|
||||
|
||||
@@ -147,6 +133,214 @@ jobs:
|
||||
- name: Check that no warnings are produced
|
||||
run: ./run_clippy.sh
|
||||
|
||||
check-linux-arm-build:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
runs-on: [ self-hosted, small-arm64 ]
|
||||
|
||||
env:
|
||||
# Use release build only, to have less debug info around
|
||||
# Hence keeping target/ (and general cache size) smaller
|
||||
BUILD_TYPE: release
|
||||
CARGO_FEATURES: --features testing
|
||||
CARGO_FLAGS: --release
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set pg 14 revision for caching
|
||||
id: pg_v14_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 15 revision for caching
|
||||
id: pg_v15_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set pg 16 revision for caching
|
||||
id: pg_v16_rev
|
||||
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set env variables
|
||||
run: |
|
||||
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
|
||||
|
||||
- name: Cache postgres v14 build
|
||||
id: cache_pg_14
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v16 build
|
||||
id: cache_pg_16
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: pg_install/v16
|
||||
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v14 -j$(nproc)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v15 -j$(nproc)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres-v16 -j$(nproc)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
|
||||
|
||||
- name: Run cargo test
|
||||
env:
|
||||
NEXTEST_RETRIES: 3
|
||||
run: |
|
||||
cargo nextest run $CARGO_FEATURES -j$(nproc)
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
|
||||
|
||||
check-codestyle-rust-arm:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
timeout-minutes: 90
|
||||
runs-on: [ self-hosted, small-arm64 ]
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
for r in 14 15 16; do
|
||||
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
|
||||
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
|
||||
done
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
|
||||
# This will catch compiler & clippy warnings in all feature combinations.
|
||||
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
|
||||
# NB: keep clippy args in sync with ./run_clippy.sh
|
||||
- run: |
|
||||
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
|
||||
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
|
||||
echo "No clippy args found in .neon_clippy_args"
|
||||
exit 1
|
||||
fi
|
||||
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
|
||||
|
||||
- name: Run cargo clippy (debug)
|
||||
if: matrix.build_type == 'debug'
|
||||
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
|
||||
- name: Run cargo clippy (release)
|
||||
if: matrix.build_type == 'release'
|
||||
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
|
||||
|
||||
- name: Check documentation generation
|
||||
if: matrix.build_type == 'release'
|
||||
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
|
||||
env:
|
||||
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
|
||||
|
||||
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
|
||||
- name: Check formatting
|
||||
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||
run: cargo fmt --all -- --check
|
||||
|
||||
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
|
||||
- name: Check rust dependencies
|
||||
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||
run: |
|
||||
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
|
||||
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
|
||||
|
||||
# https://github.com/EmbarkStudios/cargo-deny
|
||||
- name: Check rust licenses/bans/advisories/sources
|
||||
if: ${{ !cancelled() && matrix.build_type == 'release' }}
|
||||
run: cargo deny check
|
||||
|
||||
gather-rust-build-stats:
|
||||
needs: [ check-permissions, build-build-tools-image ]
|
||||
if: |
|
||||
@@ -163,6 +357,8 @@ jobs:
|
||||
|
||||
env:
|
||||
BUILD_TYPE: release
|
||||
# remove the cachepot wrapper and build without crate caches
|
||||
RUSTC_WRAPPER: ""
|
||||
# build with incremental compilation produce partial results
|
||||
# so do not attempt to cache this build, also disable the incremental compilation
|
||||
CARGO_INCREMENTAL: 0
|
||||
@@ -172,6 +368,7 @@ jobs:
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
# Some of our rust modules use FFI and need those to be checked
|
||||
- name: Get postgres headers
|
||||
@@ -181,7 +378,7 @@ jobs:
|
||||
run: make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Produce the build stats
|
||||
run: PQ_LIB_DIR=$(pwd)/pg_install/v17/lib cargo build --all --release --timings -j$(nproc)
|
||||
run: cargo build --all --release --timings -j$(nproc)
|
||||
|
||||
- name: Upload the build stats
|
||||
id: upload-stats
|
||||
|
||||
155
.github/workflows/periodic_pagebench.yml
vendored
155
.github/workflows/periodic_pagebench.yml
vendored
@@ -1,155 +0,0 @@
|
||||
name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
|
||||
workflow_dispatch: # Allows manual triggering of the workflow
|
||||
inputs:
|
||||
commit_hash:
|
||||
type: string
|
||||
description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.'
|
||||
required: false
|
||||
default: ''
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
trigger_bench_on_ec2_machine_in_eu_central_1:
|
||||
runs-on: [ self-hosted, small ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
timeout-minutes: 360 # Set the timeout to 6 hours
|
||||
env:
|
||||
API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }}
|
||||
RUN_ID: ${{ github.run_id }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }}
|
||||
AWS_DEFAULT_REGION : "eu-central-1"
|
||||
AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74"
|
||||
steps:
|
||||
# we don't need the neon source code because we run everything remotely
|
||||
# however we still need the local github actions to run the allure step below
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Show my own (github runner) external IP address - usefull for IP allowlisting
|
||||
run: curl https://ifconfig.me
|
||||
|
||||
- name: Start EC2 instance and wait for the instance to boot up
|
||||
run: |
|
||||
aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID
|
||||
aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID
|
||||
sleep 60 # sleep some time to allow cloudinit and our API server to start up
|
||||
|
||||
- name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US
|
||||
run: |
|
||||
public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text)
|
||||
echo "Public IP of the EC2 instance: $public_ip"
|
||||
echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV
|
||||
|
||||
- name: Determine commit hash
|
||||
env:
|
||||
INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
|
||||
run: |
|
||||
if [ -z "$INPUT_COMMIT_HASH" ]; then
|
||||
echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
|
||||
else
|
||||
echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
- name: Start Bench with run_id
|
||||
run: |
|
||||
curl -k -X 'POST' \
|
||||
"${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H 'Content-Type: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
-d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
|
||||
|
||||
- name: Poll Test Status
|
||||
id: poll_step
|
||||
run: |
|
||||
status=""
|
||||
while [[ "$status" != "failure" && "$status" != "success" ]]; do
|
||||
response=$(curl -k -X 'GET' \
|
||||
"${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY")
|
||||
echo "Response: $response"
|
||||
set +x
|
||||
status=$(echo $response | jq -r '.status')
|
||||
echo "Test status: $status"
|
||||
if [[ "$status" == "failure" ]]; then
|
||||
echo "Test failed"
|
||||
exit 1 # Fail the job step if status is failure
|
||||
elif [[ "$status" == "success" || "$status" == "null" ]]; then
|
||||
break
|
||||
elif [[ "$status" == "too_many_runs" ]]; then
|
||||
echo "Too many runs already running"
|
||||
echo "too_many_runs=true" >> "$GITHUB_OUTPUT"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
sleep 60 # Poll every 60 seconds
|
||||
done
|
||||
|
||||
- name: Retrieve Test Logs
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
curl -k -X 'GET' \
|
||||
"${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/gzip' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
--output "test_log_${GITHUB_RUN_ID}.gz"
|
||||
|
||||
- name: Unzip Test Log and Print it into this job's log
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
gzip -d "test_log_${GITHUB_RUN_ID}.gz"
|
||||
cat "test_log_${GITHUB_RUN_ID}"
|
||||
|
||||
- name: Create Allure report
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
if: ${{ !cancelled() }}
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
- name: Cleanup Test Resources
|
||||
if: always()
|
||||
run: |
|
||||
curl -k -X 'POST' \
|
||||
"${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \
|
||||
-H 'accept: application/json' \
|
||||
-H "Authorization: Bearer $API_KEY" \
|
||||
-d ''
|
||||
|
||||
- name: Stop EC2 instance and wait for the instance to be stopped
|
||||
if: always() && steps.poll_step.outputs.too_many_runs != 'true'
|
||||
run: |
|
||||
aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID
|
||||
aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID
|
||||
211
.github/workflows/pg-clients.yml
vendored
211
.github/workflows/pg-clients.yml
vendored
@@ -1,211 +0,0 @@
|
||||
name: Test Postgres client libraries
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '23 02 * * *' # run once a day, timezone is utc
|
||||
pull_request:
|
||||
paths:
|
||||
- '.github/workflows/pg-clients.yml'
|
||||
- 'test_runner/pg_clients/**'
|
||||
- 'test_runner/logical_repl/**'
|
||||
- 'poetry.lock'
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}
|
||||
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
env:
|
||||
DEFAULT_PG_VERSION: 16
|
||||
PLATFORM: neon-captest-new
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
AWS_DEFAULT_REGION: eu-central-1
|
||||
|
||||
jobs:
|
||||
check-permissions:
|
||||
if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }}
|
||||
uses: ./.github/workflows/check-permissions.yml
|
||||
with:
|
||||
github-event-name: ${{ github.event_name }}
|
||||
|
||||
check-build-tools-image:
|
||||
needs: [ check-permissions ]
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-build-tools-image ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
with:
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
test-logical-replication:
|
||||
needs: [ build-build-tools-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init --user root
|
||||
services:
|
||||
clickhouse:
|
||||
image: clickhouse/clickhouse-server:24.6.3.64
|
||||
ports:
|
||||
- 9000:9000
|
||||
- 8123:8123
|
||||
zookeeper:
|
||||
image: quay.io/debezium/zookeeper:2.7
|
||||
ports:
|
||||
- 2181:2181
|
||||
kafka:
|
||||
image: quay.io/debezium/kafka:2.7
|
||||
env:
|
||||
ZOOKEEPER_CONNECT: "zookeeper:2181"
|
||||
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
|
||||
KAFKA_BROKER_ID: 1
|
||||
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
|
||||
KAFKA_JMX_PORT: 9991
|
||||
ports:
|
||||
- 9092:9092
|
||||
debezium:
|
||||
image: quay.io/debezium/connect:2.7
|
||||
env:
|
||||
BOOTSTRAP_SERVERS: kafka:9092
|
||||
GROUP_ID: 1
|
||||
CONFIG_STORAGE_TOPIC: debezium-config
|
||||
OFFSET_STORAGE_TOPIC: debezium-offset
|
||||
STATUS_STORAGE_TOPIC: debezium-status
|
||||
DEBEZIUM_CONFIG_CONNECTOR_CLASS: io.debezium.connector.postgresql.PostgresConnector
|
||||
ports:
|
||||
- 8083:8083
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
|
||||
- name: Run tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: remote
|
||||
test_selection: logical_repl
|
||||
run_in_parallel: false
|
||||
extra_params: -m remote_cluster
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: always()
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: github.event.schedule && failure()
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
|
||||
slack-message: |
|
||||
Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
test-postgres-client-libs:
|
||||
needs: [ build-build-tools-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
container:
|
||||
image: ${{ needs.build-build-tools-image.outputs.image }}
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init --user root
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
|
||||
- name: Run tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: remote
|
||||
test_selection: pg_clients
|
||||
run_in_parallel: false
|
||||
extra_params: -m remote_cluster
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: always()
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: github.event.schedule && failure()
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
|
||||
slack-message: |
|
||||
Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
98
.github/workflows/pg_clients.yml
vendored
Normal file
98
.github/workflows/pg_clients.yml
vendored
Normal file
@@ -0,0 +1,98 @@
|
||||
name: Test Postgres client libraries
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '23 02 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow per any non-`main` branch.
|
||||
group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test-postgres-client-libs:
|
||||
# TODO: switch to gen2 runner, requires docker
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
env:
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.9
|
||||
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Cache poetry deps
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Create Neon Project
|
||||
id: create-neon-project
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
|
||||
- name: Run pytest
|
||||
env:
|
||||
REMOTE_ENV: 1
|
||||
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Test framework expects we have psql binary;
|
||||
# but since we don't really need it in this test, let's mock it
|
||||
mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql";
|
||||
./scripts/pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
-m "remote_cluster" \
|
||||
-rA "test_runner/pg_clients"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
# We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI.
|
||||
# It will be fixed after switching to gen2 runner
|
||||
- name: Upload python test logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
retention-days: 7
|
||||
name: python-test-pg_clients-${{ runner.os }}-stage-logs
|
||||
path: ${{ env.TEST_OUTPUT }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
58
.github/workflows/pin-build-tools-image.yml
vendored
58
.github/workflows/pin-build-tools-image.yml
vendored
@@ -7,20 +7,12 @@ on:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
force:
|
||||
description: 'Force the image to be pinned'
|
||||
default: false
|
||||
type: boolean
|
||||
workflow_call:
|
||||
inputs:
|
||||
from-tag:
|
||||
description: 'Source tag'
|
||||
required: true
|
||||
type: string
|
||||
force:
|
||||
description: 'Force the image to be pinned'
|
||||
default: false
|
||||
type: boolean
|
||||
|
||||
defaults:
|
||||
run:
|
||||
@@ -30,18 +22,15 @@ concurrency:
|
||||
group: pin-build-tools-image-${{ inputs.from-tag }}
|
||||
cancel-in-progress: false
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
env:
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: pinned
|
||||
|
||||
jobs:
|
||||
check-manifests:
|
||||
tag-image:
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
skip: ${{ steps.check-manifests.outputs.skip }}
|
||||
|
||||
env:
|
||||
FROM_TAG: ${{ inputs.from-tag }}
|
||||
TO_TAG: pinned
|
||||
|
||||
steps:
|
||||
- name: Check if we really need to pin the image
|
||||
@@ -58,44 +47,27 @@ jobs:
|
||||
|
||||
echo "skip=${skip}" | tee -a $GITHUB_OUTPUT
|
||||
|
||||
tag-image:
|
||||
needs: check-manifests
|
||||
|
||||
# use format(..) to catch both inputs.force = true AND inputs.force = 'true'
|
||||
if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'
|
||||
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
permissions:
|
||||
id-token: write # for `azure/login`
|
||||
|
||||
steps:
|
||||
- uses: docker/login-action@v3
|
||||
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
run: |
|
||||
docker buildx imagetools create -t neondatabase/build-tools:${TO_TAG} \
|
||||
neondatabase/build-tools:${FROM_TAG}
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
with:
|
||||
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
password: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
- name: Azure login
|
||||
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
|
||||
with:
|
||||
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
|
||||
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
|
||||
subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
|
||||
|
||||
- name: Login to ACR
|
||||
run: |
|
||||
az acr login --name=neoneastus2
|
||||
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
|
||||
- name: Tag build-tools with `${{ env.TO_TAG }}` in ECR
|
||||
if: steps.check-manifests.outputs.skip == 'false'
|
||||
run: |
|
||||
docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \
|
||||
-t neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \
|
||||
-t neondatabase/build-tools:${TO_TAG} \
|
||||
neondatabase/build-tools:${FROM_TAG}
|
||||
|
||||
12
.github/workflows/release.yml
vendored
12
.github/workflows/release.yml
vendored
@@ -52,15 +52,13 @@ jobs:
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
TITLE="Storage & Compute release ${RELEASE_DATE}"
|
||||
|
||||
cat << EOF > body.md
|
||||
## ${TITLE}
|
||||
## Storage & Compute release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "${TITLE}" \
|
||||
gh pr create --title "Release ${RELEASE_DATE}" \
|
||||
--body-file "body.md" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release"
|
||||
@@ -93,15 +91,13 @@ jobs:
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
TITLE="Proxy release ${RELEASE_DATE}"
|
||||
|
||||
cat << EOF > body.md
|
||||
## ${TITLE}
|
||||
## Proxy release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "${TITLE}" \
|
||||
gh pr create --title "Proxy release ${RELEASE_DATE}" \
|
||||
--body-file "body.md" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release-proxy"
|
||||
|
||||
41
.github/workflows/report-workflow-stats.yml
vendored
41
.github/workflows/report-workflow-stats.yml
vendored
@@ -1,41 +0,0 @@
|
||||
name: Report Workflow Stats
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows:
|
||||
- Add `external` label to issues and PRs created by external users
|
||||
- Benchmarking
|
||||
- Build and Test
|
||||
- Build and Test Locally
|
||||
- Build build-tools image
|
||||
- Check Permissions
|
||||
- Check build-tools image
|
||||
- Check neon with extra platform builds
|
||||
- Cloud Regression Test
|
||||
- Create Release Branch
|
||||
- Handle `approved-for-ci-run` label
|
||||
- Lint GitHub Workflows
|
||||
- Notify Slack channel about upcoming release
|
||||
- Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region
|
||||
- Pin build-tools image
|
||||
- Prepare benchmarking databases by restoring dumps
|
||||
- Push images to ACR
|
||||
- Test Postgres client libraries
|
||||
- Trigger E2E Tests
|
||||
- cleanup caches by a branch
|
||||
types: [completed]
|
||||
|
||||
jobs:
|
||||
gh-workflow-stats:
|
||||
name: Github Workflow Stats
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
actions: read
|
||||
steps:
|
||||
- name: Export GH Workflow Stats
|
||||
uses: fedordikarev/gh-workflow-stats-action@v0.1.2
|
||||
with:
|
||||
DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
|
||||
DB_TABLE: "gh_workflow_stats_neon"
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
GH_RUN_ID: ${{ github.event.workflow_run.id }}
|
||||
51
.github/workflows/trigger-e2e-tests.yml
vendored
51
.github/workflows/trigger-e2e-tests.yml
vendored
@@ -13,6 +13,8 @@ defaults:
|
||||
env:
|
||||
# A concurrency group that we use for e2e-tests runs, matches `concurrency.group` above with `github.repository` as a prefix
|
||||
E2E_CONCURRENCY_GROUP: ${{ github.repository }}-e2e-tests-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
|
||||
jobs:
|
||||
cancel-previous-e2e-tests:
|
||||
@@ -34,8 +36,8 @@ jobs:
|
||||
build-tag: ${{ steps.build-tag.outputs.tag }}
|
||||
|
||||
steps:
|
||||
# Need `fetch-depth: 0` to count the number of commits in the branch
|
||||
- uses: actions/checkout@v4
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -62,35 +64,19 @@ jobs:
|
||||
needs: [ tag ]
|
||||
runs-on: ubuntu-22.04
|
||||
env:
|
||||
EVENT_ACTION: ${{ github.event.action }}
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
TAG: ${{ needs.tag.outputs.build-tag }}
|
||||
steps:
|
||||
- name: Wait for `promote-images` job to finish
|
||||
# It's important to have a timeout here, the script in the step can run infinitely
|
||||
timeout-minutes: 60
|
||||
- name: check if ecr image are present
|
||||
env:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
run: |
|
||||
if [ "${GITHUB_EVENT_NAME}" != "pull_request" ] || [ "${EVENT_ACTION}" != "ready_for_review" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# For PRs we use the run id as the tag
|
||||
BUILD_AND_TEST_RUN_ID=${TAG}
|
||||
while true; do
|
||||
conclusion=$(gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '.jobs[] | select(.name == "promote-images") | .conclusion')
|
||||
case "$conclusion" in
|
||||
success)
|
||||
break
|
||||
;;
|
||||
failure | cancelled | skipped)
|
||||
echo "The 'promote-images' job didn't succeed: '${conclusion}'. Exiting..."
|
||||
exit 1
|
||||
;;
|
||||
*)
|
||||
echo "The 'promote-images' hasn't succeed yet. Waiting..."
|
||||
sleep 60
|
||||
;;
|
||||
esac
|
||||
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
|
||||
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
|
||||
if [ "$OUTPUT" == "" ]; then
|
||||
echo "$REPO with image tag $TAG not found" >> $GITHUB_OUTPUT
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Set e2e-platforms
|
||||
@@ -102,17 +88,12 @@ jobs:
|
||||
# Default set of platforms to run e2e tests on
|
||||
platforms='["docker", "k8s"]'
|
||||
|
||||
# If a PR changes anything that affects computes, add k8s-neonvm to the list of platforms.
|
||||
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
|
||||
# If the workflow run is not a pull request, add k8s-neonvm to the list.
|
||||
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
|
||||
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
|
||||
case "$f" in
|
||||
# List of directories that contain code which affect compute images.
|
||||
#
|
||||
# This isn't exhaustive, just the paths that are most directly compute-related.
|
||||
# For example, compute_ctl also depends on libs/utils, but we don't trigger
|
||||
# an e2e run on that.
|
||||
vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node)
|
||||
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
|
||||
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
|
||||
;;
|
||||
*)
|
||||
|
||||
4
.gitmodules
vendored
4
.gitmodules
vendored
@@ -10,7 +10,3 @@
|
||||
path = vendor/postgres-v16
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
branch = REL_16_STABLE_neon
|
||||
[submodule "vendor/postgres-v17"]
|
||||
path = vendor/postgres-v17
|
||||
url = https://github.com/neondatabase/postgres.git
|
||||
branch = REL_17_STABLE_neon
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
# * `-A unknown_lints` – do not warn about unknown lint suppressions
|
||||
# that people with newer toolchains might use
|
||||
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
|
||||
# * `-D clippy::todo` - don't let `todo!()` slip into `main`
|
||||
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings -D clippy::todo"
|
||||
export CLIPPY_COMMON_ARGS="--locked --workspace --all-targets -- -A unknown_lints -D warnings"
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/storage_controller @neondatabase/storage
|
||||
/storage_scrubber @neondatabase/storage
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/safekeepers
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
/pgxn/ @neondatabase/compute
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
|
||||
/proxy/ @neondatabase/proxy
|
||||
/safekeeper/ @neondatabase/storage
|
||||
/safekeeper/ @neondatabase/safekeepers
|
||||
/vendor/ @neondatabase/compute
|
||||
|
||||
1789
Cargo.lock
generated
1789
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
115
Cargo.toml
115
Cargo.toml
@@ -13,9 +13,9 @@ members = [
|
||||
"safekeeper",
|
||||
"storage_broker",
|
||||
"storage_controller",
|
||||
"storage_controller/client",
|
||||
"storage_scrubber",
|
||||
"workspace_hack",
|
||||
"trace",
|
||||
"libs/compute_api",
|
||||
"libs/pageserver_api",
|
||||
"libs/postgres_ffi",
|
||||
@@ -53,19 +53,18 @@ azure_storage_blobs = { version = "0.19", default-features = false, features = [
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] }
|
||||
aws-sdk-s3 = "1.52"
|
||||
aws-sdk-iam = "1.46.0"
|
||||
aws-config = { version = "1.3", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "1.26"
|
||||
aws-sdk-iam = "1.15.0"
|
||||
aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] }
|
||||
aws-smithy-types = "1.2"
|
||||
aws-smithy-types = "1.1.9"
|
||||
aws-credential-types = "1.2.0"
|
||||
aws-sigv4 = { version = "1.2", features = ["sign-http"] }
|
||||
aws-types = "1.3"
|
||||
axum = { version = "0.7.5", features = ["ws"] }
|
||||
aws-sigv4 = { version = "1.2.1", features = ["sign-http"] }
|
||||
aws-types = "1.2.0"
|
||||
axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.70"
|
||||
bit_field = "0.10.2"
|
||||
bindgen = "0.65"
|
||||
bstr = "1.0"
|
||||
byteorder = "1.4"
|
||||
bytes = "1.0"
|
||||
@@ -73,9 +72,11 @@ camino = "1.1.6"
|
||||
cfg-if = "1.0.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
comfy-table = "7.1"
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-deque = "0.8.5"
|
||||
crossbeam-utils = "0.8.5"
|
||||
dashmap = { version = "5.5.0", features = ["raw-api"] }
|
||||
either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
@@ -83,6 +84,7 @@ enumset = "1.0.12"
|
||||
fail = "0.5.0"
|
||||
fallible-iterator = "0.2"
|
||||
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
@@ -93,49 +95,46 @@ hdrhistogram = "7.5.2"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
hostname = "0.4"
|
||||
hostname = "0.3.1"
|
||||
http = {version = "1.1.0", features = ["std"]}
|
||||
http-types = { version = "2", default-features = false }
|
||||
http-body-util = "0.1.2"
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper0 = { package = "hyper", version = "0.14" }
|
||||
hyper = "1.4"
|
||||
hyper-util = "0.1"
|
||||
tokio-tungstenite = "0.21.0"
|
||||
hyper = "0.14"
|
||||
tokio-tungstenite = "0.20.0"
|
||||
indexmap = "2"
|
||||
indoc = "2"
|
||||
inotify = "0.10.2"
|
||||
ipnet = "2.9.0"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "9"
|
||||
lasso = "0.7"
|
||||
leaky-bucket = "1.0.1"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.9"
|
||||
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
||||
measured = { version = "0.0.21", features=["lasso"] }
|
||||
measured-process = { version = "0.0.21" }
|
||||
memoffset = "0.8"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
opentelemetry = "0.24"
|
||||
opentelemetry_sdk = "0.24"
|
||||
opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.16"
|
||||
opentelemetry = "0.20.0"
|
||||
opentelemetry-otlp = { version = "0.13.0", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.12.0"
|
||||
parking_lot = "0.12"
|
||||
parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
parquet = { version = "51.0.0", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "51.0.0"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
procfs = "0.16"
|
||||
procfs = "0.14"
|
||||
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.13"
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
|
||||
regex = "1.10.2"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
|
||||
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] }
|
||||
reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
|
||||
reqwest-middleware = "0.3.0"
|
||||
reqwest-retry = "0.5"
|
||||
routerify = "3"
|
||||
@@ -143,10 +142,10 @@ rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
rustls = "0.22"
|
||||
rustls-pemfile = "2"
|
||||
rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sd-notify = "0.4.1"
|
||||
send-future = "0.1.0"
|
||||
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
@@ -158,12 +157,14 @@ signal-hook = "0.3"
|
||||
smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
strum = "0.26"
|
||||
strum_macros = "0.26"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
"subtle" = "2.5.0"
|
||||
svg_fmt = "0.4.3"
|
||||
# Our PR https://github.com/nical/rust_debug/pull/4 has been merged but no new version released yet
|
||||
svg_fmt = { git = "https://github.com/nical/rust_debug", rev = "28a7d96eecff2f28e75b1ea09f2d499a60d0e3b4" }
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
test-context = "0.3"
|
||||
thiserror = "1.0"
|
||||
tikv-jemallocator = "0.5"
|
||||
@@ -176,45 +177,34 @@ tokio-rustls = "0.25"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
toml = "0.8"
|
||||
toml_edit = "0.22"
|
||||
tonic = {version = "0.12.3", features = ["tls", "tls-roots"]}
|
||||
toml = "0.7"
|
||||
toml_edit = "0.19"
|
||||
tonic = {version = "0.9", features = ["tls", "tls-roots"]}
|
||||
tower-service = "0.3.2"
|
||||
tracing = "0.1"
|
||||
tracing-error = "0.2"
|
||||
tracing-opentelemetry = "0.25"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
try-lock = "0.2.5"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.21.0"
|
||||
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
typed-json = "0.1"
|
||||
url = "2.2"
|
||||
urlencoding = "2.1"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
rustls-native-certs = "0.7"
|
||||
x509-parser = "0.15"
|
||||
whoami = "1.5.1"
|
||||
|
||||
## TODO replace this with tracing
|
||||
env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="skip-auth-1rtt" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="skip-auth-1rtt" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="skip-auth-1rtt" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="skip-auth-1rtt" }
|
||||
|
||||
# We want to use the 'neon' branch for these, but there's currently one
|
||||
# incompatible change on the branch. See:
|
||||
#
|
||||
# - PR #8076 which contained changes that depended on the new changes in
|
||||
# the rust-postgres crate, and
|
||||
# - PR #8654 which reverted those changes and made the code in proxy incompatible
|
||||
# with the tip of the 'neon' branch again.
|
||||
#
|
||||
# When those proxy changes are re-applied (see PR #8747), we can switch using
|
||||
# the tip of the 'neon' branch again.
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
@@ -231,7 +221,6 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||
desim = { version = "0.1", path = "./libs/desim" }
|
||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
storage_controller_client = { path = "./storage_controller/client" }
|
||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
@@ -246,12 +235,16 @@ criterion = "0.5.1"
|
||||
rcgen = "0.12"
|
||||
rstest = "0.18"
|
||||
camino-tempfile = "1.0.2"
|
||||
tonic-build = "0.12"
|
||||
tonic-build = "0.9"
|
||||
|
||||
[patch.crates-io]
|
||||
|
||||
# Needed to get `tokio-postgres-rustls` to depend on our fork.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="skip-auth-1rtt" }
|
||||
|
||||
# bug fixes for UUID
|
||||
parquet = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
parquet_derive = { git = "https://github.com/apache/arrow-rs", branch = "master" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
54
Dockerfile
54
Dockerfile
@@ -5,8 +5,6 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=build-tools
|
||||
ARG TAG=pinned
|
||||
ARG DEFAULT_PG_VERSION=17
|
||||
ARG STABLE_PG_VERSION=16
|
||||
|
||||
# Build Postgres
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
|
||||
@@ -15,12 +13,11 @@ WORKDIR /home/nonroot
|
||||
COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
|
||||
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
|
||||
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
|
||||
COPY --chown=nonroot vendor/postgres-v17 vendor/postgres-v17
|
||||
COPY --chown=nonroot pgxn pgxn
|
||||
COPY --chown=nonroot Makefile Makefile
|
||||
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh
|
||||
|
||||
ENV BUILD_TYPE=release
|
||||
ENV BUILD_TYPE release
|
||||
RUN set -e \
|
||||
&& mold -run make -j $(nproc) -s neon-pg-ext \
|
||||
&& rm -rf pg_install/build \
|
||||
@@ -31,19 +28,26 @@ FROM $REPOSITORY/$IMAGE:$TAG AS build
|
||||
WORKDIR /home/nonroot
|
||||
ARG GIT_VERSION=local
|
||||
ARG BUILD_TAG
|
||||
ARG STABLE_PG_VERSION
|
||||
|
||||
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
|
||||
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
|
||||
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build
|
||||
ARG RUSTC_WRAPPER=cachepot
|
||||
ENV AWS_REGION=eu-central-1
|
||||
ENV CACHEPOT_S3_KEY_PREFIX=cachepot
|
||||
ARG CACHEPOT_BUCKET=neon-github-dev
|
||||
#ARG AWS_ACCESS_KEY_ID
|
||||
#ARG AWS_SECRET_ACCESS_KEY
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v17/lib pg_install/v17/lib
|
||||
COPY --chown=nonroot . .
|
||||
|
||||
ARG ADDITIONAL_RUSTFLAGS
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& PQ_LIB_DIR=$(pwd)/pg_install/v${STABLE_PG_VERSION}/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment ${ADDITIONAL_RUSTFLAGS}" cargo build \
|
||||
&& RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \
|
||||
--bin pg_sni_router \
|
||||
--bin pageserver \
|
||||
--bin pagectl \
|
||||
@@ -52,13 +56,12 @@ RUN set -e \
|
||||
--bin storage_controller \
|
||||
--bin proxy \
|
||||
--bin neon_local \
|
||||
--bin storage_scrubber \
|
||||
--locked --release
|
||||
--locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Build final image
|
||||
#
|
||||
FROM debian:bullseye-slim
|
||||
ARG DEFAULT_PG_VERSION
|
||||
WORKDIR /data
|
||||
|
||||
RUN set -e \
|
||||
@@ -66,6 +69,8 @@ RUN set -e \
|
||||
&& apt install -y \
|
||||
libreadline-dev \
|
||||
libseccomp-dev \
|
||||
libicu67 \
|
||||
openssl \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
|
||||
&& useradd -d /data neon \
|
||||
@@ -79,35 +84,28 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v17 /usr/local/v17/
|
||||
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||
|
||||
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
||||
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
|
||||
RUN mkdir -p /data/.neon/ && \
|
||||
echo "id=1234" > "/data/.neon/identity.toml" && \
|
||||
echo "broker_endpoint='http://storage_broker:50051'\n" \
|
||||
"pg_distrib_dir='/usr/local/'\n" \
|
||||
"listen_pg_addr='0.0.0.0:6400'\n" \
|
||||
"listen_http_addr='0.0.0.0:9898'\n" \
|
||||
"availability_zone='local'\n" \
|
||||
> /data/.neon/pageserver.toml && \
|
||||
chown -R neon:neon /data/.neon
|
||||
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
|
||||
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
|
||||
-c "id=1234" \
|
||||
-c "broker_endpoint='http://storage_broker:50051'" \
|
||||
-c "pg_distrib_dir='/usr/local/'" \
|
||||
-c "listen_pg_addr='0.0.0.0:6400'" \
|
||||
-c "listen_http_addr='0.0.0.0:9898'"
|
||||
|
||||
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
|
||||
# that want a particular postgres version will select it explicitly: this is just a default.
|
||||
ENV LD_LIBRARY_PATH=/usr/local/v${DEFAULT_PG_VERSION}/lib
|
||||
ENV LD_LIBRARY_PATH /usr/local/v16/lib
|
||||
|
||||
|
||||
VOLUME ["/data"]
|
||||
USER neon
|
||||
EXPOSE 6400
|
||||
EXPOSE 9898
|
||||
|
||||
CMD ["/usr/local/bin/pageserver", "-D", "/data/.neon"]
|
||||
|
||||
|
||||
@@ -1,21 +1,10 @@
|
||||
FROM debian:bullseye-slim
|
||||
|
||||
# Use ARG as a build-time environment variable here to allow.
|
||||
# It's not supposed to be set outside.
|
||||
# Alternatively it can be obtained using the following command
|
||||
# ```
|
||||
# . /etc/os-release && echo "${VERSION_CODENAME}"
|
||||
# ```
|
||||
ARG DEBIAN_VERSION_CODENAME=bullseye
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
# System deps
|
||||
#
|
||||
# 'gdb' is included so that we get backtraces of core dumps produced in
|
||||
# regression tests
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install -y \
|
||||
@@ -27,7 +16,6 @@ RUN set -e \
|
||||
cmake \
|
||||
curl \
|
||||
flex \
|
||||
gdb \
|
||||
git \
|
||||
gnupg \
|
||||
gzip \
|
||||
@@ -38,6 +26,7 @@ RUN set -e \
|
||||
liblzma-dev \
|
||||
libncurses5-dev \
|
||||
libncursesw5-dev \
|
||||
libpq-dev \
|
||||
libreadline-dev \
|
||||
libseccomp-dev \
|
||||
libsqlite3-dev \
|
||||
@@ -62,7 +51,7 @@ RUN set -e \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
ENV PROTOC_VERSION=25.1
|
||||
ENV PROTOC_VERSION 25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
&& unzip -q protoc.zip -d protoc \
|
||||
&& mv protoc/bin/protoc /usr/local/bin/protoc \
|
||||
@@ -78,24 +67,19 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/
|
||||
# LLVM
|
||||
ENV LLVM_VERSION=18
|
||||
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
|
||||
&& echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
|
||||
&& apt update \
|
||||
&& apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \
|
||||
&& bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Install docker
|
||||
RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
|
||||
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \
|
||||
# PostgreSQL 14
|
||||
RUN curl -fsSL 'https://www.postgresql.org/media/keys/ACCC4CF8.asc' | apt-key add - \
|
||||
&& echo 'deb http://apt.postgresql.org/pub/repos/apt bullseye-pgdg main' > /etc/apt/sources.list.d/pgdg.list \
|
||||
&& apt update \
|
||||
&& apt install -y docker-ce docker-ce-cli \
|
||||
&& apt install -y postgresql-client-14 \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Configure sudo & docker
|
||||
RUN usermod -aG sudo nonroot && \
|
||||
echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \
|
||||
usermod -aG docker nonroot
|
||||
|
||||
# AWS CLI
|
||||
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \
|
||||
&& unzip -q awscliv2.zip \
|
||||
@@ -103,7 +87,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
|
||||
&& rm awscliv2.zip
|
||||
|
||||
# Mold: A Modern Linker
|
||||
ENV MOLD_VERSION=v2.33.0
|
||||
ENV MOLD_VERSION v2.31.0
|
||||
RUN set -e \
|
||||
&& git clone https://github.com/rui314/mold.git \
|
||||
&& mkdir mold/build \
|
||||
@@ -128,51 +112,12 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS
|
||||
&& make install \
|
||||
&& rm -rf ../lcov.tar.gz
|
||||
|
||||
# Compile and install the static OpenSSL library
|
||||
ENV OPENSSL_VERSION=1.1.1w
|
||||
ENV OPENSSL_PREFIX=/usr/local/openssl
|
||||
RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
echo "cf3098950cb4d853ad95c0841f1f9c6d3dc102dccfcacd521d93925208b76ac8 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \
|
||||
cd /tmp && \
|
||||
tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \
|
||||
cd /tmp/openssl-${OPENSSL_VERSION} && \
|
||||
./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \
|
||||
make -j "$(nproc)" && \
|
||||
make install && \
|
||||
cd /tmp && \
|
||||
rm -rf /tmp/openssl-${OPENSSL_VERSION}
|
||||
|
||||
# Use the same version of libicu as the compute nodes so that
|
||||
# clusters created using inidb on pageserver can be used by computes.
|
||||
#
|
||||
# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu
|
||||
# package, which is 67.1. We're duplicating that knowledge here, and also, technically,
|
||||
# Debian has a few patches on top of 67.1 that we're not adding here.
|
||||
ENV ICU_VERSION=67.1
|
||||
ENV ICU_PREFIX=/usr/local/icu
|
||||
|
||||
# Download and build static ICU
|
||||
RUN wget -O /tmp/libicu-${ICU_VERSION}.tgz https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \
|
||||
echo "94a80cd6f251a53bd2a997f6f1b5ac6653fe791dfab66e1eb0227740fb86d5dc /tmp/libicu-${ICU_VERSION}.tgz" | sha256sum --check && \
|
||||
mkdir /tmp/icu && \
|
||||
pushd /tmp/icu && \
|
||||
tar -xzf /tmp/libicu-${ICU_VERSION}.tgz && \
|
||||
pushd icu/source && \
|
||||
./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \
|
||||
make -j "$(nproc)" && \
|
||||
make install && \
|
||||
popd && \
|
||||
rm -rf icu && \
|
||||
rm -f /tmp/libicu-${ICU_VERSION}.tgz && \
|
||||
popd
|
||||
|
||||
# Switch to nonroot user
|
||||
USER nonroot:nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Python
|
||||
ENV PYTHON_VERSION=3.9.19 \
|
||||
ENV PYTHON_VERSION=3.9.18 \
|
||||
PYENV_ROOT=/home/nonroot/.pyenv \
|
||||
PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
|
||||
RUN set -e \
|
||||
@@ -196,14 +141,9 @@ WORKDIR /home/nonroot
|
||||
|
||||
# Rust
|
||||
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
|
||||
ENV RUSTC_VERSION=1.81.0
|
||||
ENV RUSTC_VERSION=1.79.0
|
||||
ENV RUSTUP_HOME="/home/nonroot/.rustup"
|
||||
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
|
||||
ARG RUSTFILT_VERSION=0.2.1
|
||||
ARG CARGO_HAKARI_VERSION=0.9.30
|
||||
ARG CARGO_DENY_VERSION=0.16.1
|
||||
ARG CARGO_HACK_VERSION=0.6.31
|
||||
ARG CARGO_NEXTEST_VERSION=0.9.72
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
|
||||
@@ -211,14 +151,16 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
export PATH="$HOME/.cargo/bin:$PATH" && \
|
||||
. "$HOME/.cargo/env" && \
|
||||
cargo --version && rustup --version && \
|
||||
rustup component add llvm-tools rustfmt clippy && \
|
||||
cargo install rustfilt --version ${RUSTFILT_VERSION} && \
|
||||
cargo install cargo-hakari --version ${CARGO_HAKARI_VERSION} && \
|
||||
cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
|
||||
cargo install cargo-hack --version ${CARGO_HACK_VERSION} && \
|
||||
cargo install cargo-nextest --version ${CARGO_NEXTEST_VERSION} && \
|
||||
rustup component add llvm-tools-preview rustfmt clippy && \
|
||||
cargo install --git https://github.com/paritytech/cachepot && \
|
||||
cargo install rustfilt && \
|
||||
cargo install cargo-hakari && \
|
||||
cargo install cargo-deny --locked && \
|
||||
cargo install cargo-hack && \
|
||||
cargo install cargo-nextest && \
|
||||
rm -rf /home/nonroot/.cargo/registry && \
|
||||
rm -rf /home/nonroot/.cargo/git
|
||||
ENV RUSTC_WRAPPER=cachepot
|
||||
|
||||
# Show versions
|
||||
RUN whoami \
|
||||
@@ -228,6 +170,3 @@ RUN whoami \
|
||||
&& rustup --version --verbose \
|
||||
&& rustc --version --verbose \
|
||||
&& clang --version
|
||||
|
||||
# Set following flag to check in Makefile if its running in Docker
|
||||
RUN touch /home/nonroot/.docker_build
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
94
Makefile
94
Makefile
@@ -3,9 +3,6 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||
# Where to install Postgres, default is ./pg_install, maybe useful for package managers
|
||||
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
|
||||
|
||||
OPENSSL_PREFIX_DIR := /usr/local/openssl
|
||||
ICU_PREFIX_DIR := /usr/local/icu
|
||||
|
||||
#
|
||||
# We differentiate between release / debug build types using the BUILD_TYPE
|
||||
# environment variable.
|
||||
@@ -23,16 +20,6 @@ else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
ifeq ($(shell test -e /home/nonroot/.docker_build && echo -n yes),yes)
|
||||
# Exclude static build openssl, icu for local build (MacOS, Linux)
|
||||
# Only keep for build type release and debug
|
||||
PG_CFLAGS += -I$(OPENSSL_PREFIX_DIR)/include
|
||||
PG_CONFIGURE_OPTS += --with-icu
|
||||
PG_CONFIGURE_OPTS += ICU_CFLAGS='-I/$(ICU_PREFIX_DIR)/include -DU_STATIC_IMPLEMENTATION'
|
||||
PG_CONFIGURE_OPTS += ICU_LIBS='-L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -licui18n -licuuc -licudata -lstdc++ -Wl,-Bdynamic -lm'
|
||||
PG_CONFIGURE_OPTS += LDFLAGS='-L$(OPENSSL_PREFIX_DIR)/lib -L$(OPENSSL_PREFIX_DIR)/lib64 -L$(ICU_PREFIX_DIR)/lib -L$(ICU_PREFIX_DIR)/lib64 -Wl,-Bstatic -lssl -lcrypto -Wl,-Bdynamic -lrt -lm -ldl -lpthread'
|
||||
endif
|
||||
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
# Seccomp BPF is only available for Linux
|
||||
@@ -41,7 +28,7 @@ else ifeq ($(UNAME_S),Darwin)
|
||||
ifndef DISABLE_HOMEBREW
|
||||
# macOS with brew-installed openssl requires explicit paths
|
||||
# It can be configured with OPENSSL_PREFIX variable
|
||||
OPENSSL_PREFIX := $(shell brew --prefix openssl@3)
|
||||
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
|
||||
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
|
||||
PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
|
||||
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
|
||||
@@ -69,8 +56,6 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
|
||||
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
|
||||
|
||||
CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
|
||||
|
||||
#
|
||||
# Top level Makefile to build Neon and PostgreSQL
|
||||
#
|
||||
@@ -81,24 +66,15 @@ all: neon postgres neon-pg-ext
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: neon
|
||||
neon: postgres-headers walproposer-lib cargo-target-dir
|
||||
neon: postgres-headers walproposer-lib
|
||||
+@echo "Compiling Neon"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||
.PHONY: cargo-target-dir
|
||||
cargo-target-dir:
|
||||
# https://github.com/rust-lang/cargo/issues/14281
|
||||
mkdir -p target
|
||||
test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG
|
||||
|
||||
### PostgreSQL parts
|
||||
# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
|
||||
# to avoid the duplication in the future, but it's tolerable for now.
|
||||
#
|
||||
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
||||
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)
|
||||
test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG
|
||||
|
||||
+@echo "Configuring Postgres $* build"
|
||||
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
|
||||
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
|
||||
@@ -119,8 +95,6 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
|
||||
# I'm not sure why it wouldn't work, but this is the only place (apart from
|
||||
# the "build-all-versions" entry points) where direct mention of PostgreSQL
|
||||
# versions is used.
|
||||
.PHONY: postgres-configure-v17
|
||||
postgres-configure-v17: $(POSTGRES_INSTALL_DIR)/build/v17/config.status
|
||||
.PHONY: postgres-configure-v16
|
||||
postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
|
||||
.PHONY: postgres-configure-v15
|
||||
@@ -168,27 +142,27 @@ postgres-check-%: postgres-%
|
||||
neon-pg-ext-%: postgres-%
|
||||
+@echo "Compiling neon $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
|
||||
+@echo "Compiling neon_walredo $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install
|
||||
+@echo "Compiling neon_rmgr $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-rmgr-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_rmgr/Makefile install
|
||||
+@echo "Compiling neon_test_utils $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile install
|
||||
+@echo "Compiling neon_utils $*"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-utils-$*
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install
|
||||
|
||||
@@ -217,31 +191,29 @@ neon-pg-clean-ext-%:
|
||||
# they depend on openssl and other libraries that are not included in our
|
||||
# Rust build.
|
||||
.PHONY: walproposer-lib
|
||||
walproposer-lib: neon-pg-ext-v17
|
||||
walproposer-lib: neon-pg-ext-v16
|
||||
+@echo "Compiling walproposer-lib"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v17/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
|
||||
pg_strong_random.o
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
|
||||
checksum_helper.o \
|
||||
cryptohash_openssl.o \
|
||||
pg_crc32c.o \
|
||||
hmac_openssl.o \
|
||||
cryptohash_openssl.o \
|
||||
scram-common.o \
|
||||
md5_common.o \
|
||||
parse_manifest.o \
|
||||
scram-common.o
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
|
||||
pg_crc32c.o
|
||||
checksum_helper.o
|
||||
endif
|
||||
|
||||
.PHONY: walproposer-lib-clean
|
||||
walproposer-lib-clean:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config \
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
|
||||
@@ -249,44 +221,38 @@ walproposer-lib-clean:
|
||||
neon-pg-ext: \
|
||||
neon-pg-ext-v14 \
|
||||
neon-pg-ext-v15 \
|
||||
neon-pg-ext-v16 \
|
||||
neon-pg-ext-v17
|
||||
neon-pg-ext-v16
|
||||
|
||||
.PHONY: neon-pg-clean-ext
|
||||
neon-pg-clean-ext: \
|
||||
neon-pg-clean-ext-v14 \
|
||||
neon-pg-clean-ext-v15 \
|
||||
neon-pg-clean-ext-v16 \
|
||||
neon-pg-clean-ext-v17
|
||||
neon-pg-clean-ext-v16
|
||||
|
||||
# shorthand to build all Postgres versions
|
||||
.PHONY: postgres
|
||||
postgres: \
|
||||
postgres-v14 \
|
||||
postgres-v15 \
|
||||
postgres-v16 \
|
||||
postgres-v17
|
||||
postgres-v16
|
||||
|
||||
.PHONY: postgres-headers
|
||||
postgres-headers: \
|
||||
postgres-headers-v14 \
|
||||
postgres-headers-v15 \
|
||||
postgres-headers-v16 \
|
||||
postgres-headers-v17
|
||||
postgres-headers-v16
|
||||
|
||||
.PHONY: postgres-clean
|
||||
postgres-clean: \
|
||||
postgres-clean-v14 \
|
||||
postgres-clean-v15 \
|
||||
postgres-clean-v16 \
|
||||
postgres-clean-v17
|
||||
postgres-clean-v16
|
||||
|
||||
.PHONY: postgres-check
|
||||
postgres-check: \
|
||||
postgres-check-v14 \
|
||||
postgres-check-v15 \
|
||||
postgres-check-v16 \
|
||||
postgres-check-v17
|
||||
postgres-check-v16
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
@@ -331,13 +297,13 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
|
||||
rm -f pg*.BAK
|
||||
|
||||
# Indent pxgn/neon.
|
||||
.PHONY: neon-pgindent
|
||||
neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v17/bin/pg_config COPT='$(COPT)' \
|
||||
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
|
||||
INDENT=$(POSTGRES_INSTALL_DIR)/build/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
|
||||
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-v17 \
|
||||
.PHONY: pgindent
|
||||
neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
|
||||
INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
|
||||
PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
|
||||
|
||||
|
||||
|
||||
12
README.md
12
README.md
@@ -58,18 +58,12 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
1. Install XCode and dependencies
|
||||
```
|
||||
xcode-select --install
|
||||
brew install protobuf openssl flex bison icu4c pkg-config m4
|
||||
brew install protobuf openssl flex bison icu4c pkg-config
|
||||
|
||||
# add openssl to PATH, required for ed25519 keys generation in neon_local
|
||||
echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc
|
||||
```
|
||||
|
||||
If you get errors about missing `m4` you may have to install it manually:
|
||||
```
|
||||
brew install m4
|
||||
brew link --force m4
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
@@ -132,7 +126,7 @@ make -j`sysctl -n hw.logicalcpu` -s
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `pg_install/bin` and `pg_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.8](https://python-poetry.org/)) in the project directory.
|
||||
Python (3.9 or higher), and install the python3 packages using `./scripts/pysync` (requires [poetry>=1.3](https://python-poetry.org/)) in the project directory.
|
||||
|
||||
|
||||
#### Running neon database
|
||||
@@ -268,7 +262,7 @@ By default, this runs both debug and release modes, and all supported postgres v
|
||||
testing locally, it is convenient to run just one set of permutations, like this:
|
||||
|
||||
```sh
|
||||
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
|
||||
DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
|
||||
```
|
||||
|
||||
## Flamegraphs
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
This directory contains files that are needed to build the compute
|
||||
images, or included in the compute images.
|
||||
|
||||
Dockerfile.compute-node
|
||||
To build the compute image
|
||||
|
||||
vm-image-spec.yaml
|
||||
Instructions for vm-builder, to turn the compute-node image into
|
||||
corresponding vm-compute-node image.
|
||||
|
||||
etc/
|
||||
Configuration files included in /etc in the compute image
|
||||
|
||||
patches/
|
||||
Some extensions need to be patched to work with Neon. This
|
||||
directory contains such patches. They are applied to the extension
|
||||
sources in Dockerfile.compute-node
|
||||
|
||||
In addition to these, postgres itself, the neon postgres extension,
|
||||
and compute_ctl are built and copied into the compute image by
|
||||
Dockerfile.compute-node.
|
||||
@@ -1,331 +0,0 @@
|
||||
collector_name: neon_collector
|
||||
metrics:
|
||||
- metric_name: lfc_misses
|
||||
type: gauge
|
||||
help: 'lfc_misses'
|
||||
key_labels:
|
||||
values: [lfc_misses]
|
||||
query: |
|
||||
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
|
||||
|
||||
- metric_name: lfc_used
|
||||
type: gauge
|
||||
help: 'LFC chunks used (chunk = 1MB)'
|
||||
key_labels:
|
||||
values: [lfc_used]
|
||||
query: |
|
||||
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
|
||||
|
||||
- metric_name: lfc_hits
|
||||
type: gauge
|
||||
help: 'lfc_hits'
|
||||
key_labels:
|
||||
values: [lfc_hits]
|
||||
query: |
|
||||
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
|
||||
|
||||
- metric_name: lfc_writes
|
||||
type: gauge
|
||||
help: 'lfc_writes'
|
||||
key_labels:
|
||||
values: [lfc_writes]
|
||||
query: |
|
||||
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
|
||||
|
||||
- metric_name: lfc_cache_size_limit
|
||||
type: gauge
|
||||
help: 'LFC cache size limit in bytes'
|
||||
key_labels:
|
||||
values: [lfc_cache_size_limit]
|
||||
query: |
|
||||
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
|
||||
|
||||
- metric_name: connection_counts
|
||||
type: gauge
|
||||
help: 'Connection counts'
|
||||
key_labels:
|
||||
- datname
|
||||
- state
|
||||
values: [count]
|
||||
query: |
|
||||
select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state;
|
||||
|
||||
- metric_name: pg_stats_userdb
|
||||
type: gauge
|
||||
help: 'Stats for several oldest non-system dbs'
|
||||
key_labels:
|
||||
- datname
|
||||
value_label: kind
|
||||
values:
|
||||
- db_size
|
||||
- deadlocks
|
||||
# Rows
|
||||
- inserted
|
||||
- updated
|
||||
- deleted
|
||||
# We export stats for 10 non-system database. Without this limit
|
||||
# it is too easy to abuse the system by creating lots of databases.
|
||||
query: |
|
||||
select pg_database_size(datname) as db_size, deadlocks,
|
||||
tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted,
|
||||
datname
|
||||
from pg_stat_database
|
||||
where datname IN (
|
||||
select datname
|
||||
from pg_database
|
||||
where datname <> 'postgres' and not datistemplate
|
||||
order by oid
|
||||
limit 10
|
||||
);
|
||||
|
||||
- metric_name: max_cluster_size
|
||||
type: gauge
|
||||
help: 'neon.max_cluster_size setting'
|
||||
key_labels:
|
||||
values: [max_cluster_size]
|
||||
query: |
|
||||
select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size';
|
||||
|
||||
- metric_name: db_total_size
|
||||
type: gauge
|
||||
help: 'Size of all databases'
|
||||
key_labels:
|
||||
values: [total]
|
||||
query: |
|
||||
select sum(pg_database_size(datname)) as total from pg_database;
|
||||
|
||||
- metric_name: getpage_wait_seconds_count
|
||||
type: counter
|
||||
help: 'Number of getpage requests'
|
||||
values: [getpage_wait_seconds_count]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_wait_seconds_sum
|
||||
type: counter
|
||||
help: 'Time spent in getpage requests'
|
||||
values: [getpage_wait_seconds_sum]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_prefetch_requests_total
|
||||
type: counter
|
||||
help: 'Number of getpage issued for prefetching'
|
||||
values: [getpage_prefetch_requests_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_sync_requests_total
|
||||
type: counter
|
||||
help: 'Number of synchronous getpage issued'
|
||||
values: [getpage_sync_requests_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_prefetch_misses_total
|
||||
type: counter
|
||||
help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read'
|
||||
values: [getpage_prefetch_misses_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_prefetch_discards_total
|
||||
type: counter
|
||||
help: 'Number of prefetch responses issued but not used'
|
||||
values: [getpage_prefetch_discards_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: pageserver_requests_sent_total
|
||||
type: counter
|
||||
help: 'Number of all requests sent to the pageserver (not just GetPage requests)'
|
||||
values: [pageserver_requests_sent_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: pageserver_disconnects_total
|
||||
type: counter
|
||||
help: 'Number of times that the connection to the pageserver was lost'
|
||||
values: [pageserver_disconnects_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: pageserver_send_flushes_total
|
||||
type: counter
|
||||
help: 'Number of flushes to the pageserver connection'
|
||||
values: [pageserver_send_flushes_total]
|
||||
query_ref: neon_perf_counters
|
||||
|
||||
- metric_name: getpage_wait_seconds_bucket
|
||||
type: counter
|
||||
help: 'Histogram buckets of getpage request latency'
|
||||
key_labels:
|
||||
- bucket_le
|
||||
values: [value]
|
||||
query_ref: getpage_wait_seconds_buckets
|
||||
|
||||
# DEPRECATED
|
||||
- metric_name: lfc_approximate_working_set_size
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels:
|
||||
values: [approximate_working_set_size]
|
||||
query: |
|
||||
select neon.approximate_working_set_size(false) as approximate_working_set_size;
|
||||
|
||||
- metric_name: lfc_approximate_working_set_size_windows
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels: [duration]
|
||||
values: [size]
|
||||
# NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection
|
||||
# of durations in a pretty-printed form.
|
||||
query: |
|
||||
select
|
||||
x as duration,
|
||||
neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size
|
||||
from
|
||||
(values ('5m'),('15m'),('1h')) as t (x);
|
||||
|
||||
- metric_name: compute_current_lsn
|
||||
type: gauge
|
||||
help: 'Current LSN of the database'
|
||||
key_labels:
|
||||
values: [lsn]
|
||||
query: |
|
||||
select
|
||||
case
|
||||
when pg_catalog.pg_is_in_recovery()
|
||||
then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
|
||||
else (pg_current_wal_lsn() - '0/0')::FLOAT8
|
||||
end as lsn;
|
||||
|
||||
- metric_name: compute_receive_lsn
|
||||
type: gauge
|
||||
help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication'
|
||||
key_labels:
|
||||
values: [lsn]
|
||||
query: |
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_catalog.pg_is_in_recovery()
|
||||
THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
|
||||
ELSE 0
|
||||
END AS lsn;
|
||||
|
||||
- metric_name: replication_delay_bytes
|
||||
type: gauge
|
||||
help: 'Bytes between received and replayed LSN'
|
||||
key_labels:
|
||||
values: [replication_delay_bytes]
|
||||
# We use a GREATEST call here because this calculation can be negative.
|
||||
# The calculation is not atomic, meaning after we've gotten the receive
|
||||
# LSN, the replay LSN may have advanced past the receive LSN we
|
||||
# are using for the calculation.
|
||||
query: |
|
||||
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;
|
||||
|
||||
- metric_name: replication_delay_seconds
|
||||
type: gauge
|
||||
help: 'Time since last LSN was replayed'
|
||||
key_labels:
|
||||
values: [replication_delay_seconds]
|
||||
query: |
|
||||
SELECT
|
||||
CASE
|
||||
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
|
||||
ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
|
||||
END AS replication_delay_seconds;
|
||||
|
||||
- metric_name: checkpoints_req
|
||||
type: gauge
|
||||
help: 'Number of requested checkpoints'
|
||||
key_labels:
|
||||
values: [checkpoints_req]
|
||||
query: |
|
||||
SELECT checkpoints_req FROM pg_stat_bgwriter;
|
||||
|
||||
- metric_name: checkpoints_timed
|
||||
type: gauge
|
||||
help: 'Number of scheduled checkpoints'
|
||||
key_labels:
|
||||
values: [checkpoints_timed]
|
||||
query: |
|
||||
SELECT checkpoints_timed FROM pg_stat_bgwriter;
|
||||
|
||||
- metric_name: compute_logical_snapshot_files
|
||||
type: gauge
|
||||
help: 'Number of snapshot files in pg_logical/snapshot'
|
||||
key_labels:
|
||||
- timeline_id
|
||||
values: [num_logical_snapshot_files]
|
||||
query: |
|
||||
SELECT
|
||||
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
|
||||
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These
|
||||
-- temporary snapshot files are renamed to the actual snapshot files after they are
|
||||
-- completely built. We only WAL-log the completely built snapshot files.
|
||||
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
|
||||
|
||||
# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats.
|
||||
# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly.
|
||||
|
||||
# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad.
|
||||
- metric_name: logical_slot_restart_lsn
|
||||
type: gauge
|
||||
help: 'restart_lsn of logical slots'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [restart_lsn]
|
||||
query: |
|
||||
select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
|
||||
from pg_replication_slots
|
||||
where slot_type = 'logical';
|
||||
|
||||
- metric_name: compute_subscriptions_count
|
||||
type: gauge
|
||||
help: 'Number of logical replication subscriptions grouped by enabled/disabled'
|
||||
key_labels:
|
||||
- enabled
|
||||
values: [subscriptions_count]
|
||||
query: |
|
||||
select subenabled::text as enabled, count(*) as subscriptions_count
|
||||
from pg_subscription
|
||||
group by subenabled;
|
||||
|
||||
- metric_name: retained_wal
|
||||
type: gauge
|
||||
help: 'Retained WAL in inactive replication slots'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [retained_wal]
|
||||
query: |
|
||||
SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal
|
||||
FROM pg_replication_slots
|
||||
WHERE active = false;
|
||||
|
||||
- metric_name: wal_is_lost
|
||||
type: gauge
|
||||
help: 'Whether or not the replication slot wal_status is lost'
|
||||
key_labels:
|
||||
- slot_name
|
||||
values: [wal_is_lost]
|
||||
query: |
|
||||
SELECT slot_name,
|
||||
CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost
|
||||
FROM pg_replication_slots;
|
||||
|
||||
queries:
|
||||
- query_name: neon_perf_counters
|
||||
query: |
|
||||
WITH c AS (
|
||||
SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters
|
||||
)
|
||||
SELECT d.*
|
||||
FROM pg_catalog.jsonb_to_record((select jb from c)) as d(
|
||||
getpage_wait_seconds_count numeric,
|
||||
getpage_wait_seconds_sum numeric,
|
||||
getpage_prefetch_requests_total numeric,
|
||||
getpage_sync_requests_total numeric,
|
||||
getpage_prefetch_misses_total numeric,
|
||||
getpage_prefetch_discards_total numeric,
|
||||
pageserver_requests_sent_total numeric,
|
||||
pageserver_disconnects_total numeric,
|
||||
pageserver_send_flushes_total numeric
|
||||
);
|
||||
|
||||
- query_name: getpage_wait_seconds_buckets
|
||||
query: |
|
||||
SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket';
|
||||
@@ -1,55 +0,0 @@
|
||||
collector_name: neon_collector_autoscaling
|
||||
metrics:
|
||||
- metric_name: lfc_misses
|
||||
type: gauge
|
||||
help: 'lfc_misses'
|
||||
key_labels:
|
||||
values: [lfc_misses]
|
||||
query: |
|
||||
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
|
||||
|
||||
- metric_name: lfc_used
|
||||
type: gauge
|
||||
help: 'LFC chunks used (chunk = 1MB)'
|
||||
key_labels:
|
||||
values: [lfc_used]
|
||||
query: |
|
||||
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
|
||||
|
||||
- metric_name: lfc_hits
|
||||
type: gauge
|
||||
help: 'lfc_hits'
|
||||
key_labels:
|
||||
values: [lfc_hits]
|
||||
query: |
|
||||
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
|
||||
|
||||
- metric_name: lfc_writes
|
||||
type: gauge
|
||||
help: 'lfc_writes'
|
||||
key_labels:
|
||||
values: [lfc_writes]
|
||||
query: |
|
||||
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
|
||||
|
||||
- metric_name: lfc_cache_size_limit
|
||||
type: gauge
|
||||
help: 'LFC cache size limit in bytes'
|
||||
key_labels:
|
||||
values: [lfc_cache_size_limit]
|
||||
query: |
|
||||
select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit;
|
||||
|
||||
- metric_name: lfc_approximate_working_set_size_windows
|
||||
type: gauge
|
||||
help: 'Approximate working set size in pages of 8192 bytes'
|
||||
key_labels: [duration_seconds]
|
||||
values: [size]
|
||||
# NOTE: This is the "internal" / "machine-readable" version. This outputs the working set
|
||||
# size looking back 1..60 minutes, labeled with the number of minutes.
|
||||
query: |
|
||||
select
|
||||
x::text as duration_seconds,
|
||||
neon.approximate_working_set_size_seconds(x) as size
|
||||
from
|
||||
(select generate_series * 60 as x from generate_series(1, 60)) as t (x);
|
||||
@@ -1,17 +0,0 @@
|
||||
[databases]
|
||||
*=host=localhost port=5432 auth_user=cloud_admin
|
||||
[pgbouncer]
|
||||
listen_port=6432
|
||||
listen_addr=0.0.0.0
|
||||
auth_type=scram-sha-256
|
||||
auth_user=cloud_admin
|
||||
auth_dbname=postgres
|
||||
client_tls_sslmode=disable
|
||||
server_tls_sslmode=disable
|
||||
pool_mode=transaction
|
||||
max_client_conn=10000
|
||||
default_pool_size=64
|
||||
max_prepared_statements=0
|
||||
admin_users=postgres
|
||||
unix_socket_dir=/tmp/
|
||||
unix_socket_mode=0777
|
||||
@@ -1,33 +0,0 @@
|
||||
# Configuration for sql_exporter
|
||||
# Global defaults.
|
||||
global:
|
||||
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
|
||||
scrape_timeout: 10s
|
||||
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
|
||||
scrape_timeout_offset: 500ms
|
||||
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
|
||||
min_interval: 0s
|
||||
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
|
||||
# as will concurrent scrapes.
|
||||
max_connections: 1
|
||||
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
|
||||
# always be the same as max_connections.
|
||||
max_idle_connections: 1
|
||||
# Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
|
||||
# If 0, connections are not closed due to a connection's age.
|
||||
max_connection_lifetime: 5m
|
||||
|
||||
# The target to monitor and the collectors to execute on it.
|
||||
target:
|
||||
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
# the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter'
|
||||
|
||||
# Collectors (referenced by name) to execute on the target.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collectors: [neon_collector]
|
||||
|
||||
# Collector files specifies a list of globs. One collector definition is read from each matching file.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collector_files:
|
||||
- "neon_collector.yml"
|
||||
@@ -1,33 +0,0 @@
|
||||
# Configuration for sql_exporter for autoscaling-agent
|
||||
# Global defaults.
|
||||
global:
|
||||
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
|
||||
scrape_timeout: 10s
|
||||
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
|
||||
scrape_timeout_offset: 500ms
|
||||
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
|
||||
min_interval: 0s
|
||||
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
|
||||
# as will concurrent scrapes.
|
||||
max_connections: 1
|
||||
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
|
||||
# always be the same as max_connections.
|
||||
max_idle_connections: 1
|
||||
# Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
|
||||
# If 0, connections are not closed due to a connection's age.
|
||||
max_connection_lifetime: 5m
|
||||
|
||||
# The target to monitor and the collectors to execute on it.
|
||||
target:
|
||||
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
# the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling'
|
||||
|
||||
# Collectors (referenced by name) to execute on the target.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collectors: [neon_collector_autoscaling]
|
||||
|
||||
# Collector files specifies a list of globs. One collector definition is read from each matching file.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collector_files:
|
||||
- "neon_collector_autoscaling.yml"
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,54 +0,0 @@
|
||||
commit 68f3b3b0d594f08aacc4a082ee210749ed5677eb
|
||||
Author: Anastasia Lubennikova <anastasia@neon.tech>
|
||||
Date: Mon Jul 15 12:31:56 2024 +0100
|
||||
|
||||
Neon: fix unlogged index build patch
|
||||
|
||||
diff --git a/src/ruminsert.c b/src/ruminsert.c
|
||||
index e8b209d..e89bf2a 100644
|
||||
--- a/src/ruminsert.c
|
||||
+++ b/src/ruminsert.c
|
||||
@@ -628,6 +628,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
elog(ERROR, "index \"%s\" already contains data",
|
||||
RelationGetRelationName(index));
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(index->rd_smgr);
|
||||
+#endif
|
||||
+
|
||||
initRumState(&buildstate.rumstate, index);
|
||||
buildstate.rumstate.isBuild = true;
|
||||
buildstate.indtuples = 0;
|
||||
@@ -693,6 +697,10 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
|
||||
rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Write index to xlog
|
||||
*/
|
||||
@@ -713,6 +721,21 @@ rumbuild(Relation heap, Relation index, struct IndexInfo *indexInfo)
|
||||
UnlockReleaseBuffer(buffer);
|
||||
}
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ {
|
||||
+#if PG_VERSION_NUM >= 160000
|
||||
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
|
||||
+#else
|
||||
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
|
||||
+#endif
|
||||
+
|
||||
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
|
||||
+
|
||||
+ smgr_end_unlogged_build(index->rd_smgr);
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Return statistics
|
||||
*/
|
||||
@@ -1,121 +0,0 @@
|
||||
# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image.
|
||||
---
|
||||
commands:
|
||||
- name: cgconfigparser
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664'
|
||||
# restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for
|
||||
# running it as root.
|
||||
- name: chmod-resize-swap
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'chmod 711 /neonvm/bin/resize-swap'
|
||||
- name: chmod-set-disk-quota
|
||||
user: root
|
||||
sysvInitAction: sysinit
|
||||
shell: 'chmod 711 /neonvm/bin/set-disk-quota'
|
||||
- name: pgbouncer
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini'
|
||||
- name: local_proxy
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
|
||||
- name: sql-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399'
|
||||
- name: sql-exporter-autoscaling
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
- filename: compute_ctl-sudoers
|
||||
content: |
|
||||
# Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
|
||||
# and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
|
||||
# regardless of hostname (ALL)
|
||||
postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
|
||||
- filename: cgconfig.conf
|
||||
content: |
|
||||
# Configuration for cgroups in VM compute nodes
|
||||
group neon-postgres {
|
||||
perm {
|
||||
admin {
|
||||
uid = postgres;
|
||||
}
|
||||
task {
|
||||
gid = users;
|
||||
}
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
# At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically
|
||||
# libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor
|
||||
# requires cgroup v2, so we'll build cgroup-tools ourselves.
|
||||
FROM debian:bullseye-slim as libcgroup-builder
|
||||
ENV LIBCGROUP_VERSION=v2.0.3
|
||||
|
||||
RUN set -exu \
|
||||
&& apt update \
|
||||
&& apt install --no-install-recommends -y \
|
||||
git \
|
||||
ca-certificates \
|
||||
automake \
|
||||
cmake \
|
||||
make \
|
||||
gcc \
|
||||
byacc \
|
||||
flex \
|
||||
libtool \
|
||||
libpam0g-dev \
|
||||
&& git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \
|
||||
&& INSTALL_DIR="/libcgroup-install" \
|
||||
&& mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \
|
||||
&& cd libcgroup \
|
||||
# extracted from bootstrap.sh, with modified flags:
|
||||
&& (test -d m4 || mkdir m4) \
|
||||
&& autoreconf -fi \
|
||||
&& rm -rf autom4te.cache \
|
||||
&& CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \
|
||||
# actually build the thing...
|
||||
&& make install
|
||||
merge: |
|
||||
# tweak nofile limits
|
||||
RUN set -e \
|
||||
&& echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \
|
||||
&& test ! -e /etc/security || ( \
|
||||
echo '* - nofile 1048576' >>/etc/security/limits.conf \
|
||||
&& echo 'root - nofile 1048576' >>/etc/security/limits.conf \
|
||||
)
|
||||
|
||||
# Allow postgres user (compute_ctl) to run swap resizer.
|
||||
# Need to install sudo in order to allow this.
|
||||
#
|
||||
# Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe.
|
||||
RUN set -e \
|
||||
&& apt update \
|
||||
&& apt install --no-install-recommends -y \
|
||||
sudo \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers
|
||||
|
||||
COPY cgconfig.conf /etc/cgconfig.conf
|
||||
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
@@ -4,27 +4,22 @@ version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
# Enables test specific features.
|
||||
testing = []
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
camino.workspace = true
|
||||
async-compression.workspace = true
|
||||
chrono.workspace = true
|
||||
cfg-if.workspace = true
|
||||
clap.workspace = true
|
||||
flate2.workspace = true
|
||||
futures.workspace = true
|
||||
hyper0 = { workspace = true, features = ["full"] }
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
nix.workspace = true
|
||||
notify.workspace = true
|
||||
num_cpus.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
opentelemetry_sdk.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
signal-hook.workspace = true
|
||||
tar.workspace = true
|
||||
@@ -43,9 +38,9 @@ url.workspace = true
|
||||
compute_api.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
|
||||
zstd = "0.13"
|
||||
bytes = "1.0"
|
||||
rust-ini = "0.20.0"
|
||||
rlimit = "0.10.1"
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - If remote_extension_config is provided, it will be used to fetch extensions list
|
||||
//! and download `shared_preload_libraries` from the remote storage.
|
||||
//! and download `shared_preload_libraries` from the remote storage.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
@@ -33,6 +33,7 @@
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r http://pg-ext-s3-gateway \
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
@@ -44,8 +45,6 @@ use std::{thread, time::Duration};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use compute_tools::disk_quota::set_disk_quota;
|
||||
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info, warn};
|
||||
@@ -65,7 +64,6 @@ use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::swap::resize_swap;
|
||||
use rlimit::{setrlimit, Resource};
|
||||
|
||||
// this is an arbitrary build tag. Fine as a default / for testing purposes
|
||||
// in-case of not-set environment var
|
||||
@@ -74,9 +72,6 @@ const BUILD_TAG_DEFAULT: &str = "latest";
|
||||
fn main() -> Result<()> {
|
||||
let (build_tag, clap_args) = init()?;
|
||||
|
||||
// enable core dumping for all child processes
|
||||
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
|
||||
|
||||
let (pg_handle, start_pg_result) = {
|
||||
// Enter startup tracing context
|
||||
let _startup_context_guard = startup_context_from_env();
|
||||
@@ -152,7 +147,6 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||
let spec_json = matches.get_one::<String>("spec");
|
||||
let spec_path = matches.get_one::<String>("spec-path");
|
||||
let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");
|
||||
let set_disk_quota_for_fs = matches.get_one::<String>("set-disk-quota-for-fs");
|
||||
|
||||
Ok(ProcessCliResult {
|
||||
connstr,
|
||||
@@ -163,7 +157,6 @@ fn process_cli(matches: &clap::ArgMatches) -> Result<ProcessCliResult> {
|
||||
spec_json,
|
||||
spec_path,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -176,7 +169,6 @@ struct ProcessCliResult<'clap> {
|
||||
spec_json: Option<&'clap String>,
|
||||
spec_path: Option<&'clap String>,
|
||||
resize_swap_on_bind: bool,
|
||||
set_disk_quota_for_fs: Option<&'clap String>,
|
||||
}
|
||||
|
||||
fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
@@ -218,7 +210,7 @@ fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
|
||||
}
|
||||
if !startup_tracing_carrier.is_empty() {
|
||||
use opentelemetry::propagation::TextMapPropagator;
|
||||
use opentelemetry_sdk::propagation::TraceContextPropagator;
|
||||
use opentelemetry::sdk::propagation::TraceContextPropagator;
|
||||
let guard = TraceContextPropagator::new()
|
||||
.extract(&startup_tracing_carrier)
|
||||
.attach();
|
||||
@@ -297,7 +289,6 @@ fn wait_spec(
|
||||
pgbin,
|
||||
ext_remote_storage,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs,
|
||||
http_port,
|
||||
..
|
||||
}: ProcessCliResult,
|
||||
@@ -372,13 +363,10 @@ fn wait_spec(
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
launch_lsn_lease_bg_task_for_static(&compute);
|
||||
|
||||
Ok(WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -387,7 +375,6 @@ struct WaitSpecResult {
|
||||
// passed through from ProcessCliResult
|
||||
http_port: u16,
|
||||
resize_swap_on_bind: bool,
|
||||
set_disk_quota_for_fs: Option<String>,
|
||||
}
|
||||
|
||||
fn start_postgres(
|
||||
@@ -397,12 +384,12 @@ fn start_postgres(
|
||||
compute,
|
||||
http_port,
|
||||
resize_swap_on_bind,
|
||||
set_disk_quota_for_fs,
|
||||
}: WaitSpecResult,
|
||||
) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
|
||||
// We got all we need, update the state.
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.set_status(ComputeStatus::Init, &compute.state_changed);
|
||||
state.status = ComputeStatus::Init;
|
||||
compute.state_changed.notify_all();
|
||||
|
||||
info!(
|
||||
"running compute with features: {:?}",
|
||||
@@ -410,7 +397,6 @@ fn start_postgres(
|
||||
);
|
||||
// before we release the mutex, fetch the swap size (if any) for later.
|
||||
let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
|
||||
let disk_quota_bytes = state.pspec.as_ref().unwrap().spec.disk_quota_bytes;
|
||||
drop(state);
|
||||
|
||||
// Launch remaining service threads
|
||||
@@ -430,8 +416,8 @@ fn start_postgres(
|
||||
// OOM-killed during startup because swap wasn't available yet.
|
||||
match resize_swap(size_bytes) {
|
||||
Ok(()) => {
|
||||
let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%size_bytes, %size_mib, "resized swap");
|
||||
let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%size_bytes, %size_gib, "resized swap");
|
||||
}
|
||||
Err(err) => {
|
||||
let err = err.context("failed to resize swap");
|
||||
@@ -440,29 +426,10 @@ fn start_postgres(
|
||||
// Mark compute startup as failed; don't try to start postgres, and report this
|
||||
// error to the control plane when it next asks.
|
||||
prestartup_failed = true;
|
||||
compute.set_failed_status(err);
|
||||
delay_exit = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Set disk quota if the compute spec says so
|
||||
if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
|
||||
(disk_quota_bytes, set_disk_quota_for_fs)
|
||||
{
|
||||
match set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint) {
|
||||
Ok(()) => {
|
||||
let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
|
||||
info!(%disk_quota_bytes, %size_mib, "set disk quota");
|
||||
}
|
||||
Err(err) => {
|
||||
let err = err.context("failed to set disk quota");
|
||||
error!("{err:#}");
|
||||
|
||||
// Mark compute startup as failed; don't try to start postgres, and report this
|
||||
// error to the control plane when it next asks.
|
||||
prestartup_failed = true;
|
||||
compute.set_failed_status(err);
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{err:?}"));
|
||||
state.status = ComputeStatus::Failed;
|
||||
compute.state_changed.notify_all();
|
||||
delay_exit = true;
|
||||
}
|
||||
}
|
||||
@@ -477,7 +444,16 @@ fn start_postgres(
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:#}", err);
|
||||
compute.set_failed_status(err);
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{:?}", err));
|
||||
state.status = ComputeStatus::Failed;
|
||||
// Notify others that Postgres failed to start. In case of configuring the
|
||||
// empty compute, it's likely that API handler is still waiting for compute
|
||||
// state change. With this we will notify it that compute is in Failed state,
|
||||
// so control plane will know about it earlier and record proper error instead
|
||||
// of timeout.
|
||||
compute.state_changed.notify_all();
|
||||
drop(state); // unlock
|
||||
delay_exit = true;
|
||||
None
|
||||
}
|
||||
@@ -768,11 +744,6 @@ fn cli() -> clap::Command {
|
||||
.long("resize-swap-on-bind")
|
||||
.action(clap::ArgAction::SetTrue),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("set-disk-quota-for-fs")
|
||||
.long("set-disk-quota-for-fs")
|
||||
.value_name("SET_DISK_QUOTA_FOR_FS")
|
||||
)
|
||||
}
|
||||
|
||||
/// When compute_ctl is killed, send also termination signal to sync-safekeepers
|
||||
|
||||
@@ -10,7 +10,6 @@ use std::sync::atomic::AtomicU32;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -34,7 +33,6 @@ use nix::sys::signal::{kill, Signal};
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
|
||||
use crate::checker::create_availability_check_data;
|
||||
use crate::local_proxy;
|
||||
use crate::logger::inlinify;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
@@ -58,7 +56,6 @@ pub struct ComputeNode {
|
||||
/// - we push new spec and it does reconfiguration
|
||||
/// - but then something happens and compute pod / VM is destroyed,
|
||||
/// so k8s controller starts it again with the **old** spec
|
||||
///
|
||||
/// and the same for empty computes:
|
||||
/// - we started compute without any spec
|
||||
/// - we push spec and it does configuration
|
||||
@@ -109,18 +106,6 @@ impl ComputeState {
|
||||
metrics: ComputeMetrics::default(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_status(&mut self, status: ComputeStatus, state_changed: &Condvar) {
|
||||
let prev = self.status;
|
||||
info!("Changing compute status from {} to {}", prev, status);
|
||||
self.status = status;
|
||||
state_changed.notify_all();
|
||||
}
|
||||
|
||||
pub fn set_failed_status(&mut self, err: anyhow::Error, state_changed: &Condvar) {
|
||||
self.error = Some(format!("{err:?}"));
|
||||
self.set_status(ComputeStatus::Failed, state_changed);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ComputeState {
|
||||
@@ -315,12 +300,8 @@ impl ComputeNode {
|
||||
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.set_status(status, &self.state_changed);
|
||||
}
|
||||
|
||||
pub fn set_failed_status(&self, err: anyhow::Error) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.set_failed_status(err, &self.state_changed);
|
||||
state.status = status;
|
||||
self.state_changed.notify_all();
|
||||
}
|
||||
|
||||
pub fn get_status(&self) -> ComputeStatus {
|
||||
@@ -418,15 +399,7 @@ impl ComputeNode {
|
||||
pub fn get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
|
||||
let mut retry_period_ms = 500.0;
|
||||
let mut attempts = 0;
|
||||
const DEFAULT_ATTEMPTS: u16 = 10;
|
||||
#[cfg(feature = "testing")]
|
||||
let max_attempts = if let Ok(v) = env::var("NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES") {
|
||||
u16::from_str(&v).unwrap()
|
||||
} else {
|
||||
DEFAULT_ATTEMPTS
|
||||
};
|
||||
#[cfg(not(feature = "testing"))]
|
||||
let max_attempts = DEFAULT_ATTEMPTS;
|
||||
let max_attempts = 10;
|
||||
loop {
|
||||
let result = self.try_get_basebackup(compute_state, lsn);
|
||||
match result {
|
||||
@@ -728,7 +701,7 @@ impl ComputeNode {
|
||||
info!("running initdb");
|
||||
let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
|
||||
Command::new(initdb_bin)
|
||||
.args(["--pgdata", pgdata])
|
||||
.args(["-D", pgdata])
|
||||
.output()
|
||||
.expect("cannot start initdb process");
|
||||
|
||||
@@ -825,11 +798,7 @@ impl ComputeNode {
|
||||
// In this case we need to connect with old `zenith_admin` name
|
||||
// and create new user. We cannot simply rename connected user,
|
||||
// but we can create a new one and grant it all privileges.
|
||||
let mut connstr = self.connstr.clone();
|
||||
connstr
|
||||
.query_pairs_mut()
|
||||
.append_pair("application_name", "apply_config");
|
||||
|
||||
let connstr = self.connstr.clone();
|
||||
let mut client = match Client::connect(connstr.as_str(), NoTls) {
|
||||
Err(e) => match e.code() {
|
||||
Some(&SqlState::INVALID_PASSWORD)
|
||||
@@ -896,26 +865,17 @@ impl ComputeNode {
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
if let Some(ref local_proxy) = spec.local_proxy_config {
|
||||
info!("configuring local_proxy");
|
||||
local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
|
||||
}
|
||||
|
||||
// Run migrations separately to not hold up cold starts
|
||||
thread::spawn(move || {
|
||||
let mut connstr = connstr.clone();
|
||||
connstr
|
||||
.query_pairs_mut()
|
||||
.append_pair("application_name", "migrations");
|
||||
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
handle_migrations(&mut client).context("apply_config handle_migrations")
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Wrapped this around `pg_ctl reload`, but right now we don't use
|
||||
// `pg_ctl` for start / stop.
|
||||
// We could've wrapped this around `pg_ctl reload`, but right now we don't use
|
||||
// `pg_ctl` for start / stop, so this just seems much easier to do as we already
|
||||
// have opened connection to Postgres and superuser access.
|
||||
#[instrument(skip_all)]
|
||||
fn pg_reload_conf(&self) -> Result<()> {
|
||||
let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
|
||||
@@ -951,19 +911,6 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(ref local_proxy) = spec.local_proxy_config {
|
||||
info!("configuring local_proxy");
|
||||
|
||||
// Spawn a thread to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let local_proxy = local_proxy.clone();
|
||||
let _handle = Some(thread::spawn(move || {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
@@ -971,39 +918,38 @@ impl ComputeNode {
|
||||
// temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are reconfiguring:
|
||||
// creating new extensions, roles, etc...
|
||||
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
|
||||
self.pg_reload_conf()?;
|
||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
if spec.mode == ComputeMode::Primary {
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(&spec, &mut client)?;
|
||||
handle_databases(&spec, &mut client)?;
|
||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(
|
||||
&spec,
|
||||
&mut client,
|
||||
self.connstr.as_str(),
|
||||
self.has_feature(ComputeFeature::AnonExtension),
|
||||
)?;
|
||||
handle_extensions(&spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
// We can skip handle_migrations here because a new migration can only appear
|
||||
// if we have a new version of the compute_ctl binary, which can only happen
|
||||
// if compute got restarted, in which case we'll end up inside of apply_config
|
||||
// instead of reconfigure.
|
||||
}
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
if spec.mode == ComputeMode::Primary {
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(&spec, &mut client)?;
|
||||
handle_databases(&spec, &mut client)?;
|
||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(
|
||||
&spec,
|
||||
&mut client,
|
||||
self.connstr.as_str(),
|
||||
self.has_feature(ComputeFeature::AnonExtension),
|
||||
)?;
|
||||
handle_extensions(&spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
// We can skip handle_migrations here because a new migration can only appear
|
||||
// if we have a new version of the compute_ctl binary, which can only happen
|
||||
// if compute got restarted, in which case we'll end up inside of apply_config
|
||||
// instead of reconfigure.
|
||||
}
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
Ok(())
|
||||
})?;
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
// reset max_cluster_size in config back to original value and reload config
|
||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let unknown_op = "unknown".to_string();
|
||||
@@ -1051,19 +997,6 @@ impl ComputeNode {
|
||||
});
|
||||
}
|
||||
|
||||
if let Some(local_proxy) = &pspec.spec.local_proxy_config {
|
||||
info!("configuring local_proxy");
|
||||
|
||||
// Spawn a thread to do the configuration,
|
||||
// so that we don't block the main thread that starts Postgres.
|
||||
let local_proxy = local_proxy.clone();
|
||||
let _handle = thread::spawn(move || {
|
||||
if let Err(err) = local_proxy::configure(&local_proxy) {
|
||||
error!("error while configuring local_proxy: {err:?}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
info!(
|
||||
"start_compute spec.remote_extensions {:?}",
|
||||
pspec.spec.remote_extensions
|
||||
@@ -1107,17 +1040,12 @@ impl ComputeNode {
|
||||
// temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are applying config:
|
||||
// creating new extensions, roles, etc...
|
||||
config::with_compute_ctl_tmp_override(
|
||||
pgdata_path,
|
||||
"neon.max_cluster_size=-1",
|
||||
|| {
|
||||
self.pg_reload_conf()?;
|
||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
self.apply_config(&compute_state)?;
|
||||
self.apply_config(&compute_state)?;
|
||||
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
||||
self.pg_reload_conf()?;
|
||||
}
|
||||
self.post_apply_config()?;
|
||||
@@ -1174,14 +1102,11 @@ impl ComputeNode {
|
||||
// EKS worker nodes have following core dump settings:
|
||||
// /proc/sys/kernel/core_pattern -> core
|
||||
// /proc/sys/kernel/core_uses_pid -> 1
|
||||
// ulimit -c -> unlimited
|
||||
// ulimint -c -> unlimited
|
||||
// which results in core dumps being written to postgres data directory as core.<pid>.
|
||||
//
|
||||
// Use that as a default location and pattern, except macos where core dumps are written
|
||||
// to /cores/ directory by default.
|
||||
//
|
||||
// With default Linux settings, the core dump file is called just "core", so check for
|
||||
// that too.
|
||||
pub fn check_for_core_dumps(&self) -> Result<()> {
|
||||
let core_dump_dir = match std::env::consts::OS {
|
||||
"macos" => Path::new("/cores/"),
|
||||
@@ -1193,17 +1118,8 @@ impl ComputeNode {
|
||||
let files = fs::read_dir(core_dump_dir)?;
|
||||
let cores = files.filter_map(|entry| {
|
||||
let entry = entry.ok()?;
|
||||
|
||||
let is_core_dump = match entry.file_name().to_str()? {
|
||||
n if n.starts_with("core.") => true,
|
||||
"core" => true,
|
||||
_ => false,
|
||||
};
|
||||
if is_core_dump {
|
||||
Some(entry.path())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
let _ = entry.file_name().to_str()?.strip_prefix("core.")?;
|
||||
Some(entry.path())
|
||||
});
|
||||
|
||||
// Print backtrace for each core dump
|
||||
@@ -1454,58 +1370,6 @@ LIMIT 100",
|
||||
}
|
||||
Ok(remote_ext_metrics)
|
||||
}
|
||||
|
||||
/// Waits until current thread receives a state changed notification and
|
||||
/// the pageserver connection strings has changed.
|
||||
///
|
||||
/// The operation will time out after a specified duration.
|
||||
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
|
||||
let state = self.state.lock().unwrap();
|
||||
let old_pageserver_connstr = state
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr
|
||||
.clone();
|
||||
let mut unchanged = true;
|
||||
let _ = self
|
||||
.state_changed
|
||||
.wait_timeout_while(state, duration, |s| {
|
||||
let pageserver_connstr = &s
|
||||
.pspec
|
||||
.as_ref()
|
||||
.expect("spec must be set")
|
||||
.pageserver_connstr;
|
||||
unchanged = pageserver_connstr == &old_pageserver_connstr;
|
||||
unchanged
|
||||
})
|
||||
.unwrap();
|
||||
if !unchanged {
|
||||
info!("Pageserver config changed");
|
||||
}
|
||||
}
|
||||
|
||||
// Gather info about installed extensions
|
||||
pub fn get_installed_extensions(&self) -> Result<()> {
|
||||
let connstr = self.connstr.clone();
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create runtime");
|
||||
let result = rt
|
||||
.block_on(crate::installed_extensions::get_installed_extensions(
|
||||
connstr,
|
||||
))
|
||||
.expect("failed to get installed extensions");
|
||||
|
||||
info!(
|
||||
"{}",
|
||||
serde_json::to_string(&result).expect("failed to serialize extensions list")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forward_termination_signal() {
|
||||
@@ -1517,9 +1381,7 @@ pub fn forward_termination_signal() {
|
||||
let pg_pid = PG_PID.load(Ordering::SeqCst);
|
||||
if pg_pid != 0 {
|
||||
let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
|
||||
// Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for
|
||||
// ROs to get a list of running xacts faster instead of going through the CLOG.
|
||||
// See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals.
|
||||
kill(pg_pid, Signal::SIGINT).ok();
|
||||
// use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
|
||||
kill(pg_pid, Signal::SIGQUIT).ok();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -83,6 +83,12 @@ pub fn write_postgres_conf(
|
||||
ComputeMode::Replica => {
|
||||
// hot_standby is 'on' by default, but let's be explicit
|
||||
writeln!(file, "hot_standby=on")?;
|
||||
|
||||
// Inform the replica about the primary state
|
||||
// Default is 'false'
|
||||
if let Some(primary_is_running) = spec.primary_is_running {
|
||||
writeln!(file, "neon.primary_is_running={}", primary_is_running)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,17 +131,18 @@ pub fn write_postgres_conf(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn with_compute_ctl_tmp_override<F>(pgdata_path: &Path, options: &str, exec: F) -> Result<()>
|
||||
where
|
||||
F: FnOnce() -> Result<()>,
|
||||
{
|
||||
/// create file compute_ctl_temp_override.conf in pgdata_dir
|
||||
/// add provided options to this file
|
||||
pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
|
||||
let path = pgdata_path.join("compute_ctl_temp_override.conf");
|
||||
let mut file = File::create(path)?;
|
||||
write!(file, "{}", options)?;
|
||||
|
||||
let res = exec();
|
||||
|
||||
file.set_len(0)?;
|
||||
|
||||
res
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// remove file compute_ctl_temp_override.conf in pgdata_dir
|
||||
pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
|
||||
let path = pgdata_path.join("compute_ctl_temp_override.conf");
|
||||
std::fs::remove_file(path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -11,20 +11,13 @@ use crate::compute::ComputeNode;
|
||||
fn configurator_main_loop(compute: &Arc<ComputeNode>) {
|
||||
info!("waiting for reconfiguration requests");
|
||||
loop {
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
let state = compute.state.lock().unwrap();
|
||||
let mut state = compute.state_changed.wait(state).unwrap();
|
||||
|
||||
// We have to re-check the status after re-acquiring the lock because it could be that
|
||||
// the status has changed while we were waiting for the lock, and we might not need to
|
||||
// wait on the condition variable. Otherwise, we might end up in some soft-/deadlock, i.e.
|
||||
// we are waiting for a condition variable that will never be signaled.
|
||||
if state.status != ComputeStatus::ConfigurationPending {
|
||||
state = compute.state_changed.wait(state).unwrap();
|
||||
}
|
||||
|
||||
// Re-check the status after waking up
|
||||
if state.status == ComputeStatus::ConfigurationPending {
|
||||
info!("got configuration request");
|
||||
state.set_status(ComputeStatus::Configuration, &compute.state_changed);
|
||||
state.status = ComputeStatus::Configuration;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
|
||||
let mut new_status = ComputeStatus::Failed;
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
use anyhow::Context;
|
||||
|
||||
pub const DISK_QUOTA_BIN: &str = "/neonvm/bin/set-disk-quota";
|
||||
|
||||
/// If size_bytes is 0, it disables the quota. Otherwise, it sets filesystem quota to size_bytes.
|
||||
/// `fs_mountpoint` should point to the mountpoint of the filesystem where the quota should be set.
|
||||
pub fn set_disk_quota(size_bytes: u64, fs_mountpoint: &str) -> anyhow::Result<()> {
|
||||
let size_kb = size_bytes / 1024;
|
||||
// run `/neonvm/bin/set-disk-quota {size_kb} {mountpoint}`
|
||||
let child_result = std::process::Command::new("/usr/bin/sudo")
|
||||
.arg(DISK_QUOTA_BIN)
|
||||
.arg(size_kb.to_string())
|
||||
.arg(fs_mountpoint)
|
||||
.spawn();
|
||||
|
||||
child_result
|
||||
.context("spawn() failed")
|
||||
.and_then(|mut child| child.wait().context("wait() failed"))
|
||||
.and_then(|status| match status.success() {
|
||||
true => Ok(()),
|
||||
false => Err(anyhow::anyhow!("process exited with {status}")),
|
||||
})
|
||||
// wrap any prior error with the overall context that we couldn't run the command
|
||||
.with_context(|| format!("could not run `/usr/bin/sudo {DISK_QUOTA_BIN}`"))
|
||||
}
|
||||
@@ -124,7 +124,6 @@ fn parse_pg_version(human_version: &str) -> &str {
|
||||
"14" => return "v14",
|
||||
"15" => return "v15",
|
||||
"16" => return "v16",
|
||||
"17" => return "v17",
|
||||
_ => {}
|
||||
},
|
||||
_ => {}
|
||||
|
||||
@@ -17,7 +17,7 @@ use hyper::header::CONTENT_TYPE;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use tokio::task;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing::{error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
use utils::http::request::must_get_query_param;
|
||||
|
||||
@@ -48,7 +48,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Serialized compute state.
|
||||
(&Method::GET, "/status") => {
|
||||
debug!("serving /status GET request");
|
||||
info!("serving /status GET request");
|
||||
let state = compute.state.lock().unwrap();
|
||||
let status_response = status_response_from_state(&state);
|
||||
Response::new(Body::from(serde_json::to_string(&status_response).unwrap()))
|
||||
@@ -165,32 +165,6 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
// get the list of installed extensions
|
||||
// currently only used in python tests
|
||||
// TODO: call it from cplane
|
||||
(&Method::GET, "/installed_extensions") => {
|
||||
info!("serving /installed_extensions GET request");
|
||||
let status = compute.get_status();
|
||||
if status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for extensions request: {:?}",
|
||||
status
|
||||
);
|
||||
error!(msg);
|
||||
return Response::new(Body::from(msg));
|
||||
}
|
||||
|
||||
let connstr = compute.connstr.clone();
|
||||
let res = crate::installed_extensions::get_installed_extensions(connstr).await;
|
||||
match res {
|
||||
Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())),
|
||||
Err(e) => render_json_error(
|
||||
&format!("could not get list of installed extensions: {}", e),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from remote extension storage on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
@@ -314,7 +288,8 @@ async fn handle_configure_request(
|
||||
return Err((msg, StatusCode::PRECONDITION_FAILED));
|
||||
}
|
||||
state.pspec = Some(parsed_spec);
|
||||
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
|
||||
state.status = ComputeStatus::ConfigurationPending;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
info!("set new spec and notified waiters");
|
||||
}
|
||||
@@ -387,15 +362,15 @@ async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (Str
|
||||
}
|
||||
if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
|
||||
let msg = format!(
|
||||
"invalid compute status for termination request: {}",
|
||||
state.status
|
||||
"invalid compute status for termination request: {:?}",
|
||||
state.status.clone()
|
||||
);
|
||||
return Err((msg, StatusCode::PRECONDITION_FAILED));
|
||||
}
|
||||
state.set_status(ComputeStatus::TerminationPending, &compute.state_changed);
|
||||
state.status = ComputeStatus::TerminationPending;
|
||||
compute.state_changed.notify_all();
|
||||
drop(state);
|
||||
}
|
||||
|
||||
forward_termination_signal();
|
||||
info!("sent signal and notified waiters");
|
||||
|
||||
@@ -409,8 +384,7 @@ async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (Str
|
||||
while state.status != ComputeStatus::Terminated {
|
||||
state = c.state_changed.wait(state).unwrap();
|
||||
info!(
|
||||
"waiting for compute to become {}, current status: {:?}",
|
||||
ComputeStatus::Terminated,
|
||||
"waiting for compute to become Terminated, current status: {:?}",
|
||||
state.status
|
||||
);
|
||||
}
|
||||
|
||||
@@ -53,20 +53,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeInsights"
|
||||
|
||||
/installed_extensions:
|
||||
get:
|
||||
tags:
|
||||
- Info
|
||||
summary: Get installed extensions.
|
||||
description: ""
|
||||
operationId: getInstalledExtensions
|
||||
responses:
|
||||
200:
|
||||
description: List of installed extensions
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/InstalledExtensions"
|
||||
/info:
|
||||
get:
|
||||
tags:
|
||||
@@ -409,24 +395,6 @@ components:
|
||||
- configuration
|
||||
example: running
|
||||
|
||||
InstalledExtensions:
|
||||
type: object
|
||||
properties:
|
||||
extensions:
|
||||
description: Contains list of installed extensions.
|
||||
type: array
|
||||
items:
|
||||
type: object
|
||||
properties:
|
||||
extname:
|
||||
type: string
|
||||
versions:
|
||||
type: array
|
||||
items:
|
||||
type: string
|
||||
n_databases:
|
||||
type: integer
|
||||
|
||||
#
|
||||
# Errors
|
||||
#
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
use compute_api::responses::{InstalledExtension, InstalledExtensions};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::HashSet;
|
||||
use url::Url;
|
||||
|
||||
use anyhow::Result;
|
||||
use postgres::{Client, NoTls};
|
||||
use tokio::task;
|
||||
|
||||
/// We don't reuse get_existing_dbs() just for code clarity
|
||||
/// and to make database listing query here more explicit.
|
||||
///
|
||||
/// Limit the number of databases to 500 to avoid excessive load.
|
||||
fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
|
||||
// `pg_database.datconnlimit = -2` means that the database is in the
|
||||
// invalid state
|
||||
let databases = client
|
||||
.query(
|
||||
"SELECT datname FROM pg_catalog.pg_database
|
||||
WHERE datallowconn
|
||||
AND datconnlimit <> - 2
|
||||
LIMIT 500",
|
||||
&[],
|
||||
)?
|
||||
.iter()
|
||||
.map(|row| {
|
||||
let db: String = row.get("datname");
|
||||
db
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(databases)
|
||||
}
|
||||
|
||||
/// Connect to every database (see list_dbs above) and get the list of installed extensions.
|
||||
/// Same extension can be installed in multiple databases with different versions,
|
||||
/// we only keep the highest and lowest version across all databases.
|
||||
pub async fn get_installed_extensions(connstr: Url) -> Result<InstalledExtensions> {
|
||||
let mut connstr = connstr.clone();
|
||||
|
||||
task::spawn_blocking(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
let databases: Vec<String> = list_dbs(&mut client)?;
|
||||
|
||||
let mut extensions_map: HashMap<String, InstalledExtension> = HashMap::new();
|
||||
for db in databases.iter() {
|
||||
connstr.set_path(db);
|
||||
let mut db_client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
let extensions: Vec<(String, String)> = db_client
|
||||
.query(
|
||||
"SELECT extname, extversion FROM pg_catalog.pg_extension;",
|
||||
&[],
|
||||
)?
|
||||
.iter()
|
||||
.map(|row| (row.get("extname"), row.get("extversion")))
|
||||
.collect();
|
||||
|
||||
for (extname, v) in extensions.iter() {
|
||||
let version = v.to_string();
|
||||
extensions_map
|
||||
.entry(extname.to_string())
|
||||
.and_modify(|e| {
|
||||
e.versions.insert(version.clone());
|
||||
// count the number of databases where the extension is installed
|
||||
e.n_databases += 1;
|
||||
})
|
||||
.or_insert(InstalledExtension {
|
||||
extname: extname.to_string(),
|
||||
versions: HashSet::from([version.clone()]),
|
||||
n_databases: 1,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(InstalledExtensions {
|
||||
extensions: extensions_map.values().cloned().collect(),
|
||||
})
|
||||
})
|
||||
.await?
|
||||
}
|
||||
@@ -2,9 +2,6 @@
|
||||
//! configuration.
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
extern crate hyper0 as hyper;
|
||||
|
||||
pub mod checker;
|
||||
pub mod config;
|
||||
pub mod configurator;
|
||||
@@ -13,12 +10,7 @@ pub mod http;
|
||||
pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod disk_quota;
|
||||
pub mod extension_server;
|
||||
pub mod installed_extensions;
|
||||
pub mod local_proxy;
|
||||
pub mod lsn_lease;
|
||||
mod migration;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
//! Local Proxy is a feature of our BaaS Neon Authorize project.
|
||||
//!
|
||||
//! Local Proxy validates JWTs and manages the pg_session_jwt extension.
|
||||
//! It also maintains a connection pool to postgres.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use camino::Utf8Path;
|
||||
use compute_api::spec::LocalProxySpec;
|
||||
use nix::sys::signal::Signal;
|
||||
use utils::pid_file::{self, PidFileRead};
|
||||
|
||||
pub fn configure(local_proxy: &LocalProxySpec) -> Result<()> {
|
||||
write_local_proxy_conf("/etc/local_proxy/config.json".as_ref(), local_proxy)?;
|
||||
notify_local_proxy("/etc/local_proxy/pid".as_ref())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
fn write_local_proxy_conf(path: &Utf8Path, local_proxy: &LocalProxySpec) -> Result<()> {
|
||||
let config =
|
||||
serde_json::to_string_pretty(local_proxy).context("serializing LocalProxySpec to json")?;
|
||||
std::fs::write(path, config).with_context(|| format!("writing {path}"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Notify local proxy about a new config file.
|
||||
fn notify_local_proxy(path: &Utf8Path) -> Result<()> {
|
||||
match pid_file::read(path)? {
|
||||
// if the file doesn't exist, or isn't locked, local_proxy isn't running
|
||||
// and will naturally pick up our config later
|
||||
PidFileRead::NotExist | PidFileRead::NotHeldByAnyProcess(_) => {}
|
||||
PidFileRead::LockedByOtherProcess(pid) => {
|
||||
// From the pid_file docs:
|
||||
//
|
||||
// > 1. The other process might exit at any time, turning the given PID stale.
|
||||
// > 2. There is a small window in which `claim_for_current_process` has already
|
||||
// > locked the file but not yet updates its contents. [`read`] will return
|
||||
// > this variant here, but with the old file contents, i.e., a stale PID.
|
||||
// >
|
||||
// > The kernel is free to recycle PID once it has been `wait(2)`ed upon by
|
||||
// > its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
|
||||
// > system call on it, bears the risk of killing an unrelated process.
|
||||
// > This is an inherent limitation of using pidfiles.
|
||||
// > The only race-free solution is to have a supervisor-process with a lifetime
|
||||
// > that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
|
||||
//
|
||||
// This is an ok risk as we only send a SIGHUP which likely won't actually
|
||||
// kill the process, only reload config.
|
||||
nix::sys::signal::kill(pid, Signal::SIGHUP).context("sending signal to local_proxy")?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,3 +1,4 @@
|
||||
use tracing_opentelemetry::OpenTelemetryLayer;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::prelude::*;
|
||||
|
||||
@@ -22,7 +23,8 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {
|
||||
.with_writer(std::io::stderr);
|
||||
|
||||
// Initialize OpenTelemetry
|
||||
let otlp_layer = tracing_utils::init_tracing_without_runtime("compute_ctl");
|
||||
let otlp_layer =
|
||||
tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new);
|
||||
|
||||
// Put it all together
|
||||
tracing_subscriber::registry()
|
||||
|
||||
@@ -1,185 +0,0 @@
|
||||
use anyhow::bail;
|
||||
use anyhow::Result;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use std::time::SystemTime;
|
||||
use std::{str::FromStr, sync::Arc, thread, time::Duration};
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use compute_api::spec::ComputeMode;
|
||||
use tracing::{info, warn};
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||
};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
/// Spawns a background thread to periodically renew LSN leases for static compute.
|
||||
/// Do nothing if the compute is not in static mode.
|
||||
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
|
||||
let (tenant_id, timeline_id, lsn) = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let spec = state.pspec.as_ref().expect("Spec must be set");
|
||||
match spec.spec.mode {
|
||||
ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
|
||||
_ => return,
|
||||
}
|
||||
};
|
||||
let compute = compute.clone();
|
||||
|
||||
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
|
||||
thread::spawn(move || {
|
||||
let _entered = span.entered();
|
||||
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
|
||||
// TODO: might need stronger error feedback than logging an warning.
|
||||
warn!("Exited with error: {e}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Renews lsn lease periodically so static compute are not affected by GC.
|
||||
fn lsn_lease_bg_task(
|
||||
compute: Arc<ComputeNode>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
loop {
|
||||
let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
|
||||
let valid_duration = valid_until
|
||||
.duration_since(SystemTime::now())
|
||||
.unwrap_or(Duration::ZERO);
|
||||
|
||||
// Sleep for 60 seconds less than the valid duration but no more than half of the valid duration.
|
||||
let sleep_duration = valid_duration
|
||||
.saturating_sub(Duration::from_secs(60))
|
||||
.max(valid_duration / 2);
|
||||
|
||||
info!(
|
||||
"Request succeeded, sleeping for {} seconds",
|
||||
sleep_duration.as_secs()
|
||||
);
|
||||
compute.wait_timeout_while_pageserver_connstr_unchanged(sleep_duration);
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
|
||||
/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
|
||||
fn acquire_lsn_lease_with_retry(
|
||||
compute: &Arc<ComputeNode>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<SystemTime> {
|
||||
let mut attempts = 0usize;
|
||||
let mut retry_period_ms: f64 = 500.0;
|
||||
const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
|
||||
|
||||
loop {
|
||||
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
||||
let configs = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
|
||||
let spec = state.pspec.as_ref().expect("spec must be set");
|
||||
|
||||
let conn_strings = spec.pageserver_connstr.split(',');
|
||||
|
||||
conn_strings
|
||||
.map(|connstr| {
|
||||
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
||||
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
||||
config.password(storage_auth_token.clone());
|
||||
}
|
||||
config
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
|
||||
match result {
|
||||
Ok(Some(res)) => {
|
||||
return Ok(res);
|
||||
}
|
||||
Ok(None) => {
|
||||
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts})");
|
||||
|
||||
compute.wait_timeout_while_pageserver_connstr_unchanged(Duration::from_millis(
|
||||
retry_period_ms as u64,
|
||||
));
|
||||
retry_period_ms *= 1.5;
|
||||
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
|
||||
}
|
||||
}
|
||||
attempts += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Tries to acquire an LSN lease through PS page_service API.
|
||||
fn try_acquire_lsn_lease(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
configs: &[postgres::Config],
|
||||
) -> Result<Option<SystemTime>> {
|
||||
fn get_valid_until(
|
||||
config: &postgres::Config,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
|
||||
let res = client.simple_query(&cmd)?;
|
||||
let msg = match res.first() {
|
||||
Some(msg) => msg,
|
||||
None => bail!("empty response"),
|
||||
};
|
||||
let row = match msg {
|
||||
SimpleQueryMessage::Row(row) => row,
|
||||
_ => bail!("error parsing lsn lease response"),
|
||||
};
|
||||
|
||||
// Note: this will be None if a lease is explicitly not granted.
|
||||
let valid_until_str = row.get("valid_until");
|
||||
|
||||
let valid_until = valid_until_str.map(|s| {
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
|
||||
.expect("Time larger than max SystemTime could handle")
|
||||
});
|
||||
Ok(valid_until)
|
||||
}
|
||||
|
||||
let shard_count = configs.len();
|
||||
|
||||
let valid_until = if shard_count > 1 {
|
||||
configs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(shard_number, config)| {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: ShardCount::new(shard_count as u8),
|
||||
shard_number: ShardNumber(shard_number as u8),
|
||||
};
|
||||
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
|
||||
})
|
||||
.collect::<Result<Vec<Option<SystemTime>>>>()?
|
||||
.into_iter()
|
||||
.min()
|
||||
.unwrap()
|
||||
} else {
|
||||
get_valid_until(
|
||||
&configs[0],
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?
|
||||
};
|
||||
|
||||
Ok(valid_until)
|
||||
}
|
||||
@@ -1,105 +0,0 @@
|
||||
use anyhow::{Context, Result};
|
||||
use postgres::Client;
|
||||
use tracing::info;
|
||||
|
||||
pub(crate) struct MigrationRunner<'m> {
|
||||
client: &'m mut Client,
|
||||
migrations: &'m [&'m str],
|
||||
}
|
||||
|
||||
impl<'m> MigrationRunner<'m> {
|
||||
pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
|
||||
// The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
|
||||
assert!(migrations.len() + 1 < i64::MAX as usize);
|
||||
|
||||
Self { client, migrations }
|
||||
}
|
||||
|
||||
fn get_migration_id(&mut self) -> Result<i64> {
|
||||
let query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = self
|
||||
.client
|
||||
.query_one(query, &[])
|
||||
.context("run_migrations get migration_id")?;
|
||||
|
||||
Ok(row.get::<&str, i64>("id"))
|
||||
}
|
||||
|
||||
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
|
||||
let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
|
||||
|
||||
self.client
|
||||
.simple_query(&setval)
|
||||
.context("run_migrations update id")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn prepare_migrations(&mut self) -> Result<()> {
|
||||
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
self.client.simple_query(query)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn run_migrations(mut self) -> Result<()> {
|
||||
self.prepare_migrations()?;
|
||||
|
||||
let mut current_migration = self.get_migration_id()? as usize;
|
||||
while current_migration < self.migrations.len() {
|
||||
macro_rules! migration_id {
|
||||
($cm:expr) => {
|
||||
($cm + 1) as i64
|
||||
};
|
||||
}
|
||||
|
||||
let migration = self.migrations[current_migration];
|
||||
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", migration_id!(current_migration));
|
||||
} else {
|
||||
info!(
|
||||
"Running migration id={}:\n{}\n",
|
||||
migration_id!(current_migration),
|
||||
migration
|
||||
);
|
||||
|
||||
self.client
|
||||
.simple_query("BEGIN")
|
||||
.context("begin migration")?;
|
||||
|
||||
self.client.simple_query(migration).with_context(|| {
|
||||
format!(
|
||||
"run_migrations migration id={}",
|
||||
migration_id!(current_migration)
|
||||
)
|
||||
})?;
|
||||
|
||||
// Migration IDs start at 1
|
||||
self.update_migration_id(migration_id!(current_migration))?;
|
||||
|
||||
self.client
|
||||
.simple_query("COMMIT")
|
||||
.context("commit migration")?;
|
||||
|
||||
info!("Finished migration id={}", migration_id!(current_migration));
|
||||
}
|
||||
|
||||
current_migration += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
|
||||
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
|
||||
END IF;
|
||||
END $$;
|
||||
@@ -1 +0,0 @@
|
||||
GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
|
||||
@@ -17,11 +17,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
|
||||
// should be handled gracefully.
|
||||
fn watch_compute_activity(compute: &ComputeNode) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let mut connstr = compute.connstr.clone();
|
||||
connstr
|
||||
.query_pairs_mut()
|
||||
.append_pair("application_name", "compute_activity_monitor");
|
||||
let connstr = connstr.as_str();
|
||||
let connstr = compute.connstr.as_str();
|
||||
|
||||
// During startup and configuration we connect to every Postgres database,
|
||||
// but we don't want to count this as some user activity. So wait until
|
||||
|
||||
@@ -22,10 +22,9 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Escape a string for including it in a SQL literal.
|
||||
///
|
||||
/// Wrapping the result with `E'{}'` or `'{}'` is not required,
|
||||
/// as it returns a ready-to-use SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// Escape a string for including it in a SQL literal. Wrapping the result
|
||||
/// with `E'{}'` or `'{}'` is not required, as it returns a ready-to-use
|
||||
/// SQL string literal, e.g. `'db'''` or `E'db\\'`.
|
||||
/// See <https://github.com/postgres/postgres/blob/da98d005cdbcd45af563d0c4ac86d0e9772cd15f/src/backend/utils/adt/quote.c#L47>
|
||||
/// for the original implementation.
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
@@ -490,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()>
|
||||
/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
|
||||
/// - next line starts with timestamp
|
||||
/// - EOF
|
||||
/// - no new lines were written for the last 100 milliseconds
|
||||
/// - no new lines were written for the last second
|
||||
async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
|
||||
let mut lines = tokio::io::BufReader::new(stderr).lines();
|
||||
let timeout_duration = Duration::from_millis(100);
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
@@ -11,7 +10,6 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};
|
||||
|
||||
use crate::config;
|
||||
use crate::logger::inlinify;
|
||||
use crate::migration::MigrationRunner;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
@@ -190,15 +188,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let mut xact = client.transaction()?;
|
||||
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
||||
|
||||
let mut jwks_roles = HashSet::new();
|
||||
if let Some(local_proxy) = &spec.local_proxy_config {
|
||||
for jwks_setting in local_proxy.jwks.iter().flatten() {
|
||||
for role_name in &jwks_setting.role_names {
|
||||
jwks_roles.insert(role_name.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Print a list of existing Postgres roles (only in debug mode)
|
||||
if span_enabled!(Level::INFO) {
|
||||
let mut vec = Vec::new();
|
||||
@@ -318,9 +307,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
if jwks_roles.contains(name.as_str()) {
|
||||
query = format!("CREATE ROLE {}", name.pg_quote());
|
||||
}
|
||||
info!("running role create query: '{}'", &query);
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
@@ -790,28 +776,84 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
|
||||
|
||||
// Add new migrations in numerical order.
|
||||
let migrations = [
|
||||
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
|
||||
include_str!("./migrations/0002-alter_roles.sql"),
|
||||
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
|
||||
include_str!("./migrations/0001-alter_roles.sql"),
|
||||
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
|
||||
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
|
||||
include_str!(
|
||||
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
|
||||
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!(
|
||||
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
|
||||
include_str!(
|
||||
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
|
||||
),
|
||||
include_str!(
|
||||
"./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
|
||||
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
|
||||
),
|
||||
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
|
||||
];
|
||||
|
||||
MigrationRunner::new(client, &migrations).run_migrations()?;
|
||||
let mut func = || {
|
||||
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
|
||||
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
func().context("handle_migrations prepare")?;
|
||||
|
||||
let query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client
|
||||
.query_one(query, &[])
|
||||
.context("handle_migrations get migration_id")?;
|
||||
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
||||
let starting_migration_id = current_migration;
|
||||
|
||||
let query = "BEGIN";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations begin")?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
let migration = &migrations[current_migration];
|
||||
if migration.starts_with("-- SKIP") {
|
||||
info!("Skipping migration id={}", current_migration);
|
||||
} else {
|
||||
info!(
|
||||
"Running migration id={}:\n{}\n",
|
||||
current_migration, migration
|
||||
);
|
||||
client.simple_query(migration).with_context(|| {
|
||||
format!("handle_migrations current_migration={}", current_migration)
|
||||
})?;
|
||||
}
|
||||
current_migration += 1;
|
||||
}
|
||||
let setval = format!(
|
||||
"UPDATE neon_migration.migration_id SET id={}",
|
||||
migrations.len()
|
||||
);
|
||||
client
|
||||
.simple_query(&setval)
|
||||
.context("handle_migrations update id")?;
|
||||
|
||||
let query = "COMMIT";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations commit")?;
|
||||
|
||||
info!(
|
||||
"Ran {} migrations",
|
||||
(migrations.len() - starting_migration_id)
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -6,20 +6,26 @@ license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
camino.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
hex.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper0.workspace = true
|
||||
hyper.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
toml.workspace = true
|
||||
toml_edit.workspace = true
|
||||
@@ -34,7 +40,6 @@ safekeeper_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
storage_broker.workspace = true
|
||||
utils.workspace = true
|
||||
whoami.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead};
|
||||
// it's waiting. If the process hasn't started/stopped after 5 seconds,
|
||||
// it prints a notice that it's taking long, but keeps waiting.
|
||||
//
|
||||
const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis();
|
||||
const RETRY_INTERVAL: Duration = Duration::from_millis(100);
|
||||
const DOT_EVERY_RETRIES: u128 = 10;
|
||||
const NOTICE_AFTER_RETRIES: u128 = 50;
|
||||
const RETRY_UNTIL_SECS: u64 = 10;
|
||||
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
|
||||
const RETRY_INTERVAL_MILLIS: u64 = 100;
|
||||
const DOT_EVERY_RETRIES: u64 = 10;
|
||||
const NOTICE_AFTER_RETRIES: u64 = 50;
|
||||
|
||||
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
|
||||
/// it itself.
|
||||
@@ -52,7 +52,6 @@ pub enum InitialPidFile {
|
||||
}
|
||||
|
||||
/// Start a background child process using the parameters given.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
@@ -60,7 +59,6 @@ pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
args: AI,
|
||||
envs: EI,
|
||||
initial_pid_file: InitialPidFile,
|
||||
retry_timeout: &Duration,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
@@ -71,10 +69,6 @@ where
|
||||
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
|
||||
EI: IntoIterator<Item = (String, String)>,
|
||||
{
|
||||
let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
|
||||
if !datadir.metadata().context("stat datadir")?.is_dir() {
|
||||
anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
|
||||
}
|
||||
let log_path = datadir.join(format!("{process_name}.log"));
|
||||
let process_log_file = fs::OpenOptions::new()
|
||||
.create(true)
|
||||
@@ -91,13 +85,7 @@ where
|
||||
let background_command = command
|
||||
.stdout(process_log_file)
|
||||
.stderr(same_file_for_stderr)
|
||||
.args(args)
|
||||
// spawn all child processes in their datadir, useful for all kinds of things,
|
||||
// not least cleaning up child processes e.g. after an unclean exit from the test suite:
|
||||
// ```
|
||||
// lsof -d cwd -a +D Users/cs/src/neon/test_output
|
||||
// ```
|
||||
.current_dir(datadir);
|
||||
.args(args);
|
||||
|
||||
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
|
||||
fill_rust_env_vars(background_command),
|
||||
@@ -133,7 +121,7 @@ where
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
for retries in 0..retries {
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check).await {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started and passed status check, pid: {pid}");
|
||||
@@ -151,7 +139,7 @@ where
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
tokio::time::sleep(RETRY_INTERVAL).await;
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error starting process {process_name:?}: {e:#}");
|
||||
@@ -160,10 +148,9 @@ where
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!(format!(
|
||||
"{} did not start+pass status checks within {:?} seconds",
|
||||
process_name, retry_timeout
|
||||
));
|
||||
anyhow::bail!(
|
||||
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
|
||||
);
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
@@ -219,7 +206,7 @@ pub fn stop_process(
|
||||
}
|
||||
|
||||
pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
for retries in 0..STOP_RETRIES {
|
||||
for retries in 0..RETRIES {
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} stopped");
|
||||
@@ -235,7 +222,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(RETRY_INTERVAL);
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} with pid {pid} failed to stop: {e:#}");
|
||||
@@ -244,10 +231,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!(format!(
|
||||
"{} with pid {} did not stop in {:?} seconds",
|
||||
process_name, pid, STOP_RETRY_TIMEOUT
|
||||
));
|
||||
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
@@ -289,7 +273,7 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
|
||||
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
|
||||
for (var, val) in std::env::vars() {
|
||||
if var.starts_with("NEON_") {
|
||||
if var.starts_with("NEON_PAGESERVER_") {
|
||||
cmd = cmd.env(var, val);
|
||||
}
|
||||
}
|
||||
@@ -379,7 +363,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
|
||||
match kill(pid, None) {
|
||||
// Process exists, keep waiting
|
||||
Ok(_) => Ok(false),
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,94 +0,0 @@
|
||||
//! Branch mappings for convenience
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
/// Keep human-readable aliases in memory (and persist them to config XXX), to hide tenant/timeline hex strings from the user.
|
||||
#[derive(PartialEq, Eq, Clone, Debug, Default, Serialize, Deserialize)]
|
||||
#[serde(default, deny_unknown_fields)]
|
||||
pub struct BranchMappings {
|
||||
/// Default tenant ID to use with the 'neon_local' command line utility, when
|
||||
/// --tenant_id is not explicitly specified. This comes from the branches.
|
||||
pub default_tenant_id: Option<TenantId>,
|
||||
|
||||
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
|
||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||
pub mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
}
|
||||
|
||||
impl BranchMappings {
|
||||
pub fn register_branch_mapping(
|
||||
&mut self,
|
||||
branch_name: String,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
let existing_values = self.mappings.entry(branch_name.clone()).or_default();
|
||||
|
||||
let existing_ids = existing_values
|
||||
.iter()
|
||||
.find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id);
|
||||
|
||||
if let Some((_, old_timeline_id)) = existing_ids {
|
||||
if old_timeline_id == &timeline_id {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
|
||||
}
|
||||
} else {
|
||||
existing_values.push((tenant_id, timeline_id));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_branch_timeline_id(
|
||||
&self,
|
||||
branch_name: &str,
|
||||
tenant_id: TenantId,
|
||||
) -> Option<TimelineId> {
|
||||
// If it looks like a timeline ID, return it as it is
|
||||
if let Ok(timeline_id) = branch_name.parse::<TimelineId>() {
|
||||
return Some(timeline_id);
|
||||
}
|
||||
|
||||
self.mappings
|
||||
.get(branch_name)?
|
||||
.iter()
|
||||
.find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id)
|
||||
.map(|&(_, timeline_id)| timeline_id)
|
||||
.map(TimelineId::from)
|
||||
}
|
||||
|
||||
pub fn timeline_name_mappings(&self) -> HashMap<TenantTimelineId, String> {
|
||||
self.mappings
|
||||
.iter()
|
||||
.flat_map(|(name, tenant_timelines)| {
|
||||
tenant_timelines.iter().map(|&(tenant_id, timeline_id)| {
|
||||
(TenantTimelineId::new(tenant_id, timeline_id), name.clone())
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn persist(&self, path: &Path) -> anyhow::Result<()> {
|
||||
let content = &toml::to_string_pretty(self)?;
|
||||
fs::write(path, content).with_context(|| {
|
||||
format!(
|
||||
"Failed to write branch information into path '{}'",
|
||||
path.display()
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load(path: &Path) -> anyhow::Result<BranchMappings> {
|
||||
let branches_file_contents = fs::read_to_string(path)?;
|
||||
Ok(toml::from_str(branches_file_contents.as_str())?)
|
||||
}
|
||||
}
|
||||
@@ -1,22 +1,17 @@
|
||||
//! Code to manage the storage broker
|
||||
//!
|
||||
//! In the local test environment, the storage broker stores its data directly in
|
||||
//! In the local test environment, the data for each safekeeper is stored in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
pub async fn start_broker_process(
|
||||
env: &local_env::LocalEnv,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let broker = &env.broker;
|
||||
let listen_addr = &broker.listen_addr;
|
||||
|
||||
@@ -32,7 +27,6 @@ pub async fn start_broker_process(
|
||||
args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
let url = broker.client_url();
|
||||
let status_url = url.join("status").with_context(|| {
|
||||
|
||||
@@ -499,23 +499,6 @@ impl Endpoint {
|
||||
.join(",")
|
||||
}
|
||||
|
||||
/// Map safekeepers ids to the actual connection strings.
|
||||
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
if self.mode == ComputeMode::Primary {
|
||||
for sk_id in sk_ids {
|
||||
let sk = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
}
|
||||
}
|
||||
Ok(safekeeper_connstrings)
|
||||
}
|
||||
|
||||
pub async fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
@@ -540,7 +523,18 @@ impl Endpoint {
|
||||
let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
|
||||
assert!(!pageserver_connstring.is_empty());
|
||||
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
let mut safekeeper_connstrings = Vec::new();
|
||||
if self.mode == ComputeMode::Primary {
|
||||
for sk_id in safekeepers {
|
||||
let sk = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.find(|node| node.id == sk_id)
|
||||
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
|
||||
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
|
||||
}
|
||||
}
|
||||
|
||||
// check for file remote_extensions_spec.json
|
||||
// if it is present, read it and pass to compute_ctl
|
||||
@@ -561,7 +555,6 @@ impl Endpoint {
|
||||
operation_uuid: None,
|
||||
features: self.features.clone(),
|
||||
swap_size_bytes: None,
|
||||
disk_quota_bytes: None,
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
@@ -599,7 +592,7 @@ impl Endpoint {
|
||||
remote_extensions,
|
||||
pgbouncer_settings: None,
|
||||
shard_stripe_size: Some(shard_stripe_size),
|
||||
local_proxy_config: None,
|
||||
primary_is_running: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -704,7 +697,7 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
}
|
||||
tokio::time::sleep(ATTEMPT_INTERVAL).await;
|
||||
std::thread::sleep(ATTEMPT_INTERVAL);
|
||||
}
|
||||
|
||||
// disarm the scopeguard, let the child outlive this function (and neon_local invoction)
|
||||
@@ -748,7 +741,6 @@ impl Endpoint {
|
||||
&self,
|
||||
mut pageservers: Vec<(Host, u16)>,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
safekeepers: Option<Vec<NodeId>>,
|
||||
) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
@@ -783,12 +775,6 @@ impl Endpoint {
|
||||
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
|
||||
}
|
||||
|
||||
// If safekeepers are not specified, don't change them.
|
||||
if let Some(safekeepers) = safekeepers {
|
||||
let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?;
|
||||
spec.safekeeper_connstrings = safekeeper_connstrings;
|
||||
}
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.build()
|
||||
@@ -826,12 +812,11 @@ impl Endpoint {
|
||||
// cleanup work to do after postgres stops, like syncing safekeepers,
|
||||
// etc.
|
||||
//
|
||||
// If destroying or stop mode is immediate, send it SIGTERM before
|
||||
// waiting. Sometimes we do *not* want this cleanup: tests intentionally
|
||||
// do stop when majority of safekeepers is down, so sync-safekeepers
|
||||
// would hang otherwise. This could be a separate flag though.
|
||||
let send_sigterm = destroy || mode == "immediate";
|
||||
self.wait_for_compute_ctl_to_exit(send_sigterm)?;
|
||||
// If destroying, send it SIGTERM before waiting. Sometimes we do *not*
|
||||
// want this cleanup: tests intentionally do stop when majority of
|
||||
// safekeepers is down, so sync-safekeepers would hang otherwise. This
|
||||
// could be a separate flag though.
|
||||
self.wait_for_compute_ctl_to_exit(destroy)?;
|
||||
if destroy {
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::pageserver::PageServerNode;
|
||||
use crate::pageserver::PAGESERVER_REMOTE_STORAGE_DIR;
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 16;
|
||||
pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
|
||||
//
|
||||
// This data structures represents neon_local CLI config
|
||||
@@ -42,8 +42,8 @@ pub struct LocalEnv {
|
||||
// compute endpoints).
|
||||
//
|
||||
// This is not stored in the config file. Rather, this is the path where the
|
||||
// config file itself is. It is read from the NEON_REPO_DIR env variable which
|
||||
// must be an absolute path. If the env var is not set, $PWD/.neon is used.
|
||||
// config file itself is. It is read from the NEON_REPO_DIR env variable or
|
||||
// '.neon' if not given.
|
||||
pub base_data_dir: PathBuf,
|
||||
|
||||
// Path to postgres distribution. It's expected that "bin", "include",
|
||||
@@ -151,49 +151,23 @@ pub struct NeonBroker {
|
||||
pub struct NeonStorageControllerConf {
|
||||
/// Heartbeat timeout before marking a node offline
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub max_offline: Duration,
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub max_warming_up: Duration,
|
||||
|
||||
pub start_as_candidate: bool,
|
||||
|
||||
/// Database url used when running multiple storage controller instances
|
||||
pub database_url: Option<SocketAddr>,
|
||||
pub max_unavailable: Duration,
|
||||
|
||||
/// Threshold for auto-splitting a tenant into shards
|
||||
pub split_threshold: Option<u64>,
|
||||
|
||||
pub max_secondary_lag_bytes: Option<u64>,
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub heartbeat_interval: Duration,
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub long_reconcile_threshold: Option<Duration>,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
// Use a shorter pageserver unavailability interval than the default to speed up tests.
|
||||
const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
|
||||
|
||||
const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30);
|
||||
|
||||
// Very tight heartbeat interval to speed up tests
|
||||
const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);
|
||||
const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
|
||||
std::time::Duration::from_secs(10);
|
||||
}
|
||||
|
||||
impl Default for NeonStorageControllerConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
max_offline: Self::DEFAULT_MAX_OFFLINE_INTERVAL,
|
||||
max_warming_up: Self::DEFAULT_MAX_WARMING_UP_INTERVAL,
|
||||
start_as_candidate: false,
|
||||
database_url: None,
|
||||
max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
|
||||
split_threshold: None,
|
||||
max_secondary_lag_bytes: None,
|
||||
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
|
||||
long_reconcile_threshold: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -346,21 +320,16 @@ impl LocalEnv {
|
||||
|
||||
#[allow(clippy::manual_range_patterns)]
|
||||
match pg_version {
|
||||
14 | 15 | 16 | 17 => Ok(path.join(format!("v{pg_version}"))),
|
||||
14 | 15 | 16 => Ok(path.join(format!("v{pg_version}"))),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result<PathBuf> {
|
||||
Ok(self.pg_distrib_dir(pg_version)?.join(dir_name))
|
||||
}
|
||||
|
||||
pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
||||
self.pg_dir(pg_version, "bin")
|
||||
Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
|
||||
}
|
||||
|
||||
pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
|
||||
self.pg_dir(pg_version, "lib")
|
||||
Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> PathBuf {
|
||||
@@ -410,36 +379,6 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
/// Inspect the base data directory and extract the instance id and instance directory path
|
||||
/// for all storage controller instances
|
||||
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
|
||||
let mut instances = Vec::default();
|
||||
|
||||
let dir = std::fs::read_dir(self.base_data_dir.clone())?;
|
||||
for dentry in dir {
|
||||
let dentry = dentry?;
|
||||
let is_dir = dentry.metadata()?.is_dir();
|
||||
let filename = dentry.file_name().into_string().unwrap();
|
||||
let parsed_instance_id = match filename.strip_prefix("storage_controller_") {
|
||||
Some(suffix) => suffix.parse::<u8>().ok(),
|
||||
None => None,
|
||||
};
|
||||
|
||||
let is_instance_dir = is_dir && parsed_instance_id.is_some();
|
||||
|
||||
if !is_instance_dir {
|
||||
continue;
|
||||
}
|
||||
|
||||
instances.push((
|
||||
parsed_instance_id.expect("Checked previously"),
|
||||
dentry.path(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(instances)
|
||||
}
|
||||
|
||||
pub fn register_branch_mapping(
|
||||
&mut self,
|
||||
branch_name: String,
|
||||
@@ -492,7 +431,9 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
/// Construct `Self` from on-disk state.
|
||||
pub fn load_config(repopath: &Path) -> anyhow::Result<Self> {
|
||||
pub fn load_config() -> anyhow::Result<Self> {
|
||||
let repopath = base_path();
|
||||
|
||||
if !repopath.exists() {
|
||||
bail!(
|
||||
"Neon config is not found in {}. You need to run 'neon_local init' first",
|
||||
@@ -520,7 +461,7 @@ impl LocalEnv {
|
||||
branch_name_mappings,
|
||||
} = on_disk_config;
|
||||
LocalEnv {
|
||||
base_data_dir: repopath.to_owned(),
|
||||
base_data_dir: repopath.clone(),
|
||||
pg_distrib_dir,
|
||||
neon_distrib_dir,
|
||||
default_tenant_id,
|
||||
@@ -541,7 +482,7 @@ impl LocalEnv {
|
||||
"we ensure this during deserialization"
|
||||
);
|
||||
env.pageservers = {
|
||||
let iter = std::fs::read_dir(repopath).context("open dir")?;
|
||||
let iter = std::fs::read_dir(&repopath).context("open dir")?;
|
||||
let mut pageservers = Vec::new();
|
||||
for res in iter {
|
||||
let dentry = res?;
|
||||
@@ -565,6 +506,7 @@ impl LocalEnv {
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
// (allow unknown fields, unlike PageServerConf)
|
||||
struct PageserverConfigTomlSubset {
|
||||
id: NodeId,
|
||||
listen_pg_addr: String,
|
||||
listen_http_addr: String,
|
||||
pg_auth_type: AuthType,
|
||||
@@ -576,30 +518,18 @@ impl LocalEnv {
|
||||
.with_context(|| format!("read {:?}", config_toml_path))?,
|
||||
)
|
||||
.context("parse pageserver.toml")?;
|
||||
let identity_toml_path = dentry.path().join("identity.toml");
|
||||
#[derive(serde::Serialize, serde::Deserialize)]
|
||||
struct IdentityTomlSubset {
|
||||
id: NodeId,
|
||||
}
|
||||
let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
|
||||
&std::fs::read_to_string(&identity_toml_path)
|
||||
.with_context(|| format!("read {:?}", identity_toml_path))?,
|
||||
)
|
||||
.context("parse identity.toml")?;
|
||||
let PageserverConfigTomlSubset {
|
||||
id: config_toml_id,
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
} = config_toml;
|
||||
let IdentityTomlSubset {
|
||||
id: identity_toml_id,
|
||||
} = identity_toml;
|
||||
let conf = PageServerConf {
|
||||
id: {
|
||||
anyhow::ensure!(
|
||||
identity_toml_id == id,
|
||||
"id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
|
||||
config_toml_id == id,
|
||||
"id mismatch: config_toml.id={config_toml_id} id={id}",
|
||||
);
|
||||
id
|
||||
},
|
||||
@@ -789,25 +719,10 @@ impl LocalEnv {
|
||||
}
|
||||
|
||||
pub fn base_path() -> PathBuf {
|
||||
let path = match std::env::var_os("NEON_REPO_DIR") {
|
||||
Some(val) => {
|
||||
let path = PathBuf::from(val);
|
||||
if !path.is_absolute() {
|
||||
// repeat the env var in the error because our default is always absolute
|
||||
panic!("NEON_REPO_DIR must be an absolute path, got {path:?}");
|
||||
}
|
||||
path
|
||||
}
|
||||
None => {
|
||||
let pwd = std::env::current_dir()
|
||||
// technically this can fail but it's quite unlikeley
|
||||
.expect("determine current directory");
|
||||
let pwd_abs = pwd.canonicalize().expect("canonicalize current directory");
|
||||
pwd_abs.join(".neon")
|
||||
}
|
||||
};
|
||||
assert!(path.is_absolute());
|
||||
path
|
||||
match std::env::var_os("NEON_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val),
|
||||
None => PathBuf::from(".neon"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Generate a public/private key pair for JWT authentication
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
//! Code to manage pageservers
|
||||
//!
|
||||
//! In the local test environment, the data for each pageserver is stored in
|
||||
//! In the local test environment, the pageserver stores its data directly in
|
||||
//!
|
||||
//! ```text
|
||||
//! .neon/pageserver_<pageserver_id>
|
||||
//! ```
|
||||
//! .neon/
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -17,13 +15,16 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
|
||||
use futures::SinkExt;
|
||||
use pageserver_api::models::{
|
||||
self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo,
|
||||
TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::NodeId;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -73,14 +74,10 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::DocumentMut {
|
||||
toml_edit::DocumentMut::from_str(&format!("id={node_id}")).unwrap()
|
||||
}
|
||||
|
||||
fn pageserver_init_make_toml(
|
||||
&self,
|
||||
conf: NeonLocalInitPageserverConf,
|
||||
) -> anyhow::Result<toml_edit::DocumentMut> {
|
||||
) -> anyhow::Result<toml_edit::Document> {
|
||||
assert_eq!(&PageServerConf::from(&conf), &self.conf, "during neon_local init, we derive the runtime state of ps conf (self.conf) from the --config flag fully");
|
||||
|
||||
// TODO(christian): instead of what we do here, create a pageserver_api::config::ConfigToml (PR #7656)
|
||||
@@ -125,19 +122,16 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
// Apply the user-provided overrides
|
||||
overrides.push({
|
||||
let mut doc =
|
||||
toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
|
||||
// `id` is written out to `identity.toml` instead of `pageserver.toml`
|
||||
doc.remove("id").expect("it's part of the struct");
|
||||
doc.to_string()
|
||||
});
|
||||
overrides.push(
|
||||
toml_edit::ser::to_string_pretty(&conf)
|
||||
.expect("we deserialized this from toml earlier"),
|
||||
);
|
||||
|
||||
// Turn `overrides` into a toml document.
|
||||
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.
|
||||
let mut config_toml = toml_edit::DocumentMut::new();
|
||||
let mut config_toml = toml_edit::Document::new();
|
||||
for fragment_str in overrides {
|
||||
let fragment = toml_edit::DocumentMut::from_str(&fragment_str)
|
||||
let fragment = toml_edit::Document::from_str(&fragment_str)
|
||||
.expect("all fragments in `overrides` are valid toml documents, this function controls that");
|
||||
for (key, item) in fragment.iter() {
|
||||
config_toml.insert(key, item.clone());
|
||||
@@ -164,8 +158,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
self.start_node(retry_timeout).await
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
self.start_node().await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
|
||||
@@ -179,23 +173,6 @@ impl PageServerNode {
|
||||
);
|
||||
io::stdout().flush()?;
|
||||
|
||||
// If the config file we got as a CLI argument includes the `availability_zone`
|
||||
// config, then use that to populate the `metadata.json` file for the pageserver.
|
||||
// In production the deployment orchestrator does this for us.
|
||||
let az_id = conf
|
||||
.other
|
||||
.get("availability_zone")
|
||||
.map(|toml| {
|
||||
let az_str = toml.to_string();
|
||||
// Trim the (") chars from the toml representation
|
||||
if az_str.starts_with('"') && az_str.ends_with('"') {
|
||||
az_str[1..az_str.len() - 1].to_string()
|
||||
} else {
|
||||
az_str
|
||||
}
|
||||
})
|
||||
.unwrap_or("local".to_string());
|
||||
|
||||
let config = self
|
||||
.pageserver_init_make_toml(conf)
|
||||
.context("make pageserver toml")?;
|
||||
@@ -209,19 +186,6 @@ impl PageServerNode {
|
||||
.write_all(config.to_string().as_bytes())
|
||||
.context("write pageserver toml")?;
|
||||
drop(config_file);
|
||||
|
||||
let identity_file_path = datadir.join("identity.toml");
|
||||
let mut identity_file = std::fs::OpenOptions::new()
|
||||
.create_new(true)
|
||||
.write(true)
|
||||
.open(identity_file_path)
|
||||
.with_context(|| format!("open identity toml for write: {config_file_path:?}"))?;
|
||||
let identity_toml = self.pageserver_make_identity_toml(node_id);
|
||||
identity_file
|
||||
.write_all(identity_toml.to_string().as_bytes())
|
||||
.context("write identity toml")?;
|
||||
drop(identity_toml);
|
||||
|
||||
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
|
||||
|
||||
// Write metadata file, used by pageserver on startup to register itself with
|
||||
@@ -231,7 +195,6 @@ impl PageServerNode {
|
||||
let (_http_host, http_port) =
|
||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||
let http_port = http_port.unwrap_or(9898);
|
||||
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
@@ -242,10 +205,7 @@ impl PageServerNode {
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
http_host: "localhost".to_string(),
|
||||
http_port,
|
||||
other: HashMap::from([(
|
||||
"availability_zone_id".to_string(),
|
||||
serde_json::json!(az_id),
|
||||
)]),
|
||||
other: HashMap::new(),
|
||||
})
|
||||
.unwrap(),
|
||||
)
|
||||
@@ -254,15 +214,14 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
async fn start_node(&self) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
|
||||
"Starting pageserver node {} at '{}' in {:?}",
|
||||
self.conf.id,
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir,
|
||||
retry_timeout
|
||||
datadir
|
||||
);
|
||||
io::stdout().flush().context("flush stdout")?;
|
||||
|
||||
@@ -280,7 +239,6 @@ impl PageServerNode {
|
||||
args,
|
||||
self.pageserver_env_variables()?,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
let st = self.check_status().await;
|
||||
match st {
|
||||
@@ -322,6 +280,22 @@ impl PageServerNode {
|
||||
background_process::stop_process(immediate, "pageserver", &self.pid_file())
|
||||
}
|
||||
|
||||
pub async fn page_server_psql_client(
|
||||
&self,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
let mut config = self.pg_connection_config.clone();
|
||||
if self.conf.pg_auth_type == AuthType::NeonJWT {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
config = config.set_password(Some(token));
|
||||
}
|
||||
Ok(config.connect_no_tls().await?)
|
||||
}
|
||||
|
||||
pub async fn check_status(&self) -> mgmt_api::Result<()> {
|
||||
self.http_client.status().await
|
||||
}
|
||||
@@ -375,6 +349,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
trace_read_requests: settings
|
||||
.remove("trace_read_requests")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
eviction_policy: settings
|
||||
.remove("eviction_policy")
|
||||
.map(serde_json::from_str)
|
||||
@@ -404,10 +383,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
|
||||
lsn_lease_length_for_ts: settings
|
||||
.remove("lsn_lease_length_for_ts")
|
||||
.map(|x| x.to_string()),
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -416,6 +391,28 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: TenantId,
|
||||
generation: Option<u32>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<TenantId> {
|
||||
let config = Self::parse_config(settings.clone())?;
|
||||
|
||||
let request = models::TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(new_tenant_id),
|
||||
generation,
|
||||
config,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
// Placement policy is not meaningful for creations not done via storage controller
|
||||
placement_policy: None,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
Ok(self.http_client.tenant_create(&request).await?)
|
||||
}
|
||||
|
||||
pub async fn tenant_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
@@ -475,6 +472,11 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
trace_read_requests: settings
|
||||
.remove("trace_read_requests")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'trace_read_requests' as bool")?,
|
||||
eviction_policy: settings
|
||||
.remove("eviction_policy")
|
||||
.map(serde_json::from_str)
|
||||
@@ -504,10 +506,6 @@ impl PageServerNode {
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
|
||||
lsn_lease_length_for_ts: settings
|
||||
.remove("lsn_lease_length_for_ts")
|
||||
.map(|x| x.to_string()),
|
||||
}
|
||||
};
|
||||
|
||||
@@ -522,6 +520,19 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn location_config(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
config: LocationConfig,
|
||||
flush_ms: Option<Duration>,
|
||||
lazy: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.location_config(tenant_shard_id, config, flush_ms, lazy)
|
||||
.await?)
|
||||
}
|
||||
|
||||
pub async fn timeline_list(
|
||||
&self,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
@@ -568,41 +579,72 @@ impl PageServerNode {
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let (client, conn) = self.page_server_psql_client().await?;
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = conn.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
let client = std::pin::pin!(client);
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?;
|
||||
let base_tarfile =
|
||||
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile));
|
||||
let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile);
|
||||
|
||||
// Init wal reader if necessary
|
||||
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||
let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?;
|
||||
let wal_reader =
|
||||
mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile));
|
||||
let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile);
|
||||
(end_lsn, Some(wal_reader))
|
||||
} else {
|
||||
(start_lsn, None)
|
||||
};
|
||||
|
||||
// Import base
|
||||
self.http_client
|
||||
.import_basebackup(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
pg_version,
|
||||
base_tarfile,
|
||||
)
|
||||
.await?;
|
||||
let copy_in = |reader, cmd| {
|
||||
let client = &client;
|
||||
async move {
|
||||
let writer = client.copy_in(&cmd).await?;
|
||||
let writer = std::pin::pin!(writer);
|
||||
let mut writer = writer.sink_map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::Other, format!("{e}"))
|
||||
});
|
||||
let mut reader = std::pin::pin!(reader);
|
||||
writer.send_all(&mut reader).await?;
|
||||
writer.into_inner().finish().await?;
|
||||
anyhow::Ok(())
|
||||
}
|
||||
};
|
||||
|
||||
// Import base
|
||||
copy_in(
|
||||
base_tarfile,
|
||||
format!(
|
||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
||||
),
|
||||
)
|
||||
.await?;
|
||||
// Import wal if necessary
|
||||
if let Some(wal_reader) = wal_reader {
|
||||
self.http_client
|
||||
.import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader)
|
||||
.await?;
|
||||
copy_in(
|
||||
wal_reader,
|
||||
format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn tenant_synthetic_size(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> anyhow::Result<TenantHistorySize> {
|
||||
Ok(self
|
||||
.http_client
|
||||
.tenant_synthetic_size(tenant_shard_id)
|
||||
.await?)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,10 +4,13 @@
|
||||
/// NOTE: This doesn't implement the full, correct postgresql.conf syntax. Just
|
||||
/// enough to extract a few settings we need in Neon, assuming you don't do
|
||||
/// funny stuff like include-directives or funny escaping.
|
||||
use anyhow::{bail, Context, Result};
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::io::BufRead;
|
||||
use std::str::FromStr;
|
||||
|
||||
/// In-memory representation of a postgresql.conf file
|
||||
#[derive(Default, Debug)]
|
||||
@@ -16,16 +19,84 @@ pub struct PostgresConf {
|
||||
hash: HashMap<String, String>,
|
||||
}
|
||||
|
||||
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
|
||||
|
||||
impl PostgresConf {
|
||||
pub fn new() -> PostgresConf {
|
||||
PostgresConf::default()
|
||||
}
|
||||
|
||||
/// Read file into memory
|
||||
pub fn read(read: impl std::io::Read) -> Result<PostgresConf> {
|
||||
let mut result = Self::new();
|
||||
|
||||
for line in std::io::BufReader::new(read).lines() {
|
||||
let line = line?;
|
||||
|
||||
// Store each line in a vector, in original format
|
||||
result.lines.push(line.clone());
|
||||
|
||||
// Also parse each line and insert key=value lines into a hash map.
|
||||
//
|
||||
// FIXME: This doesn't match exactly the flex/bison grammar in PostgreSQL.
|
||||
// But it's close enough for our usage.
|
||||
let line = line.trim();
|
||||
if line.starts_with('#') {
|
||||
// comment, ignore
|
||||
continue;
|
||||
} else if let Some(caps) = CONF_LINE_RE.captures(line) {
|
||||
let name = caps.get(1).unwrap().as_str();
|
||||
let raw_val = caps.get(2).unwrap().as_str();
|
||||
|
||||
if let Ok(val) = deescape_str(raw_val) {
|
||||
// Note: if there's already an entry in the hash map for
|
||||
// this key, this will replace it. That's the behavior what
|
||||
// we want; when PostgreSQL reads the file, each line
|
||||
// overrides any previous value for the same setting.
|
||||
result.hash.insert(name.to_string(), val.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Return the current value of 'option'
|
||||
pub fn get(&self, option: &str) -> Option<&str> {
|
||||
self.hash.get(option).map(|x| x.as_ref())
|
||||
}
|
||||
|
||||
/// Return the current value of a field, parsed to the right datatype.
|
||||
///
|
||||
/// This calls the FromStr::parse() function on the value of the field. If
|
||||
/// the field does not exist, or parsing fails, returns an error.
|
||||
///
|
||||
pub fn parse_field<T>(&self, field_name: &str, context: &str) -> Result<T>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
self.get(field_name)
|
||||
.with_context(|| format!("could not find '{}' option {}", field_name, context))?
|
||||
.parse::<T>()
|
||||
.with_context(|| format!("could not parse '{}' option {}", field_name, context))
|
||||
}
|
||||
|
||||
pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
if let Some(val) = self.get(field_name) {
|
||||
let result = val
|
||||
.parse::<T>()
|
||||
.with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
|
||||
|
||||
Ok(Some(result))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Note: if you call this multiple times for the same option, the config
|
||||
/// file will a line for each call. It would be nice to have a function
|
||||
@@ -83,8 +154,48 @@ fn escape_str(s: &str) -> String {
|
||||
}
|
||||
}
|
||||
|
||||
/// De-escape a possibly-quoted value.
|
||||
///
|
||||
/// See `DeescapeQuotedString` function in PostgreSQL sources for how PostgreSQL
|
||||
/// does this.
|
||||
fn deescape_str(s: &str) -> Result<String> {
|
||||
// If the string has a quote at the beginning and end, strip them out.
|
||||
if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
|
||||
let mut result = String::new();
|
||||
|
||||
let mut iter = s[1..(s.len() - 1)].chars().peekable();
|
||||
while let Some(c) = iter.next() {
|
||||
let newc = if c == '\\' {
|
||||
match iter.next() {
|
||||
Some('b') => '\x08',
|
||||
Some('f') => '\x0c',
|
||||
Some('n') => '\n',
|
||||
Some('r') => '\r',
|
||||
Some('t') => '\t',
|
||||
Some('0'..='7') => {
|
||||
// TODO
|
||||
bail!("octal escapes not supported");
|
||||
}
|
||||
Some(n) => n,
|
||||
None => break,
|
||||
}
|
||||
} else if c == '\'' && iter.peek() == Some(&'\'') {
|
||||
// doubled quote becomes just one quote
|
||||
iter.next().unwrap()
|
||||
} else {
|
||||
c
|
||||
};
|
||||
|
||||
result.push(newc);
|
||||
}
|
||||
Ok(result)
|
||||
} else {
|
||||
Ok(s.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
|
||||
fn test_postgresql_conf_escapes() -> Result<()> {
|
||||
assert_eq!(escape_str("foo bar"), "'foo bar'");
|
||||
// these don't need to be quoted
|
||||
assert_eq!(escape_str("foo"), "foo");
|
||||
@@ -103,5 +214,13 @@ fn test_postgresql_conf_escapes() -> anyhow::Result<()> {
|
||||
assert_eq!(escape_str("fo\\o"), "'fo\\\\o'");
|
||||
assert_eq!(escape_str("10 cats"), "'10 cats'");
|
||||
|
||||
// Test de-escaping
|
||||
assert_eq!(deescape_str(&escape_str("foo"))?, "foo");
|
||||
assert_eq!(deescape_str(&escape_str("fo'o\nba\\r"))?, "fo'o\nba\\r");
|
||||
assert_eq!(deescape_str("'\\b\\f\\n\\r\\t'")?, "\x08\x0c\n\r\t");
|
||||
|
||||
// octal-escapes are currently not supported
|
||||
assert!(deescape_str("'foo\\7\\07\\007'").is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -5,10 +5,8 @@
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
use std::future::Future;
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -16,7 +14,6 @@ use camino::Utf8PathBuf;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::{http::error::HttpErrorBody, id::NodeId};
|
||||
|
||||
use crate::{
|
||||
@@ -35,10 +32,12 @@ pub enum SafekeeperHttpError {
|
||||
|
||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||
|
||||
pub(crate) trait ResponseErrorMessageExt: Sized {
|
||||
fn error_from_body(self) -> impl Future<Output = Result<Self>> + Send;
|
||||
#[async_trait::async_trait]
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
async fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ResponseErrorMessageExt for reqwest::Response {
|
||||
async fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
@@ -111,16 +110,11 @@ impl SafekeeperNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(
|
||||
&self,
|
||||
extra_opts: &[String],
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}', retrying for {:?}",
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
self.datadir_path().display(),
|
||||
retry_timeout,
|
||||
self.datadir_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
@@ -196,16 +190,15 @@ impl SafekeeperNode {
|
||||
]);
|
||||
}
|
||||
|
||||
args.extend_from_slice(extra_opts);
|
||||
args.extend(extra_opts);
|
||||
|
||||
background_process::start_process(
|
||||
&format!("safekeeper-{id}"),
|
||||
&datadir,
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
self.safekeeper_env_variables()?,
|
||||
[],
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
match self.check_status().await {
|
||||
Ok(()) => Ok(true),
|
||||
@@ -217,18 +210,6 @@ impl SafekeeperNode {
|
||||
.await
|
||||
}
|
||||
|
||||
fn safekeeper_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
|
||||
// Generate a token to connect from safekeeper to peers
|
||||
if self.conf.auth_enabled {
|
||||
let token = self
|
||||
.env
|
||||
.generate_auth_token(&Claims::new(None, Scope::SafekeeperData))?;
|
||||
Ok(vec![("SAFEKEEPER_AUTH_TOKEN".to_owned(), token)])
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Stop the server.
|
||||
///
|
||||
|
||||
@@ -3,16 +3,14 @@ use crate::{
|
||||
local_env::{LocalEnv, NeonStorageControllerConf},
|
||||
};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use hyper0::Uri;
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse,
|
||||
NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
},
|
||||
models::{
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
@@ -20,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock};
|
||||
use std::{fs, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
@@ -28,61 +26,26 @@ use utils::{
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
id::{NodeId, TenantId},
|
||||
};
|
||||
use whoami::username;
|
||||
|
||||
pub struct StorageController {
|
||||
env: LocalEnv,
|
||||
listen: String,
|
||||
path: Utf8PathBuf,
|
||||
private_key: Option<Vec<u8>>,
|
||||
public_key: Option<String>,
|
||||
postgres_port: u16,
|
||||
client: reqwest::Client,
|
||||
config: NeonStorageControllerConf,
|
||||
|
||||
// The listen addresses is learned when starting the storage controller,
|
||||
// hence the use of OnceLock to init it at the right time.
|
||||
listen: OnceLock<SocketAddr>,
|
||||
}
|
||||
|
||||
const COMMAND: &str = "storage_controller";
|
||||
|
||||
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
|
||||
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
|
||||
pub struct NeonStorageControllerStartArgs {
|
||||
pub instance_id: u8,
|
||||
pub base_port: Option<u16>,
|
||||
pub start_timeout: humantime::Duration,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerStartArgs {
|
||||
pub fn with_default_instance_id(start_timeout: humantime::Duration) -> Self {
|
||||
Self {
|
||||
instance_id: 1,
|
||||
base_port: None,
|
||||
start_timeout,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NeonStorageControllerStopArgs {
|
||||
pub instance_id: u8,
|
||||
pub immediate: bool,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerStopArgs {
|
||||
pub fn with_default_instance_id(immediate: bool) -> Self {
|
||||
Self {
|
||||
instance_id: 1,
|
||||
immediate,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
pub tenant_shard_id: TenantShardId,
|
||||
pub node_id: Option<NodeId>,
|
||||
pub generation_override: Option<i32>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -102,6 +65,27 @@ pub struct InspectResponse {
|
||||
|
||||
impl StorageController {
|
||||
pub fn from_env(env: &LocalEnv) -> Self {
|
||||
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("attachments.json");
|
||||
|
||||
// Makes no sense to construct this if pageservers aren't going to use it: assume
|
||||
// pageservers have control plane API set
|
||||
let listen_url = env.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
// Convention: NeonEnv in python tests reserves the next port after the control_plane_api
|
||||
// port, for use by our captive postgres.
|
||||
let postgres_port = listen_url
|
||||
.port()
|
||||
.expect("Control plane API setting should always have a port")
|
||||
+ 1;
|
||||
|
||||
// Assume all pageservers have symmetric auth configuration: this service
|
||||
// expects to use one JWT token to talk to all of them.
|
||||
let ps_conf = env
|
||||
@@ -144,28 +128,21 @@ impl StorageController {
|
||||
|
||||
Self {
|
||||
env: env.clone(),
|
||||
path,
|
||||
listen,
|
||||
private_key,
|
||||
public_key,
|
||||
postgres_port,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
config: env.storage_controller.clone(),
|
||||
listen: OnceLock::default(),
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_controller_instance_dir(&self, instance_id: u8) -> PathBuf {
|
||||
self.env
|
||||
.base_data_dir
|
||||
.join(format!("storage_controller_{}", instance_id))
|
||||
}
|
||||
|
||||
fn pid_file(&self, instance_id: u8) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(
|
||||
self.storage_controller_instance_dir(instance_id)
|
||||
.join("storage_controller.pid"),
|
||||
)
|
||||
.expect("non-Unicode path")
|
||||
fn pid_file(&self) -> Utf8PathBuf {
|
||||
Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_controller.pid"))
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// PIDFile for the postgres instance used to store storage controller state
|
||||
@@ -178,16 +155,16 @@ impl StorageController {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
/// Find the directory containing postgres subdirectories, such `bin` and `lib`
|
||||
/// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl`
|
||||
///
|
||||
/// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back
|
||||
/// to other versions if that one isn't found. Some automated tests create circumstances
|
||||
/// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`.
|
||||
async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result<Utf8PathBuf> {
|
||||
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 16, 15, 14];
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14];
|
||||
|
||||
for v in prefer_versions {
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap();
|
||||
let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap();
|
||||
if tokio::fs::try_exists(&path).await? {
|
||||
return Ok(path);
|
||||
}
|
||||
@@ -195,51 +172,30 @@ impl StorageController {
|
||||
|
||||
// Fall through
|
||||
anyhow::bail!(
|
||||
"Postgres directory '{}' not found in {}",
|
||||
dir_name,
|
||||
self.env.pg_distrib_dir.display(),
|
||||
"Postgres binaries not found in {}",
|
||||
self.env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
pub async fn get_pg_bin_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
self.get_pg_dir("bin").await
|
||||
}
|
||||
|
||||
pub async fn get_pg_lib_dir(&self) -> anyhow::Result<Utf8PathBuf> {
|
||||
self.get_pg_dir("lib").await
|
||||
}
|
||||
|
||||
/// Readiness check for our postgres process
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path, postgres_port: u16) -> anyhow::Result<bool> {
|
||||
async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result<bool> {
|
||||
let bin_path = pg_bin_dir.join("pg_isready");
|
||||
let args = [
|
||||
"-h",
|
||||
"localhost",
|
||||
"-U",
|
||||
&username(),
|
||||
"-d",
|
||||
DB_NAME,
|
||||
"-p",
|
||||
&format!("{}", postgres_port),
|
||||
];
|
||||
let args = ["-h", "localhost", "-p", &format!("{}", self.postgres_port)];
|
||||
let exitcode = Command::new(bin_path).args(args).spawn()?.wait().await?;
|
||||
|
||||
Ok(exitcode.success())
|
||||
}
|
||||
|
||||
/// Create our database if it doesn't exist
|
||||
/// Create our database if it doesn't exist, and run migrations.
|
||||
///
|
||||
/// This function is equivalent to the `diesel setup` command in the diesel CLI. We implement
|
||||
/// the same steps by hand to avoid imposing a dependency on installing diesel-cli for developers
|
||||
/// who just want to run `cargo neon_local` without knowing about diesel.
|
||||
///
|
||||
/// Returns the database url
|
||||
pub async fn setup_database(&self, postgres_port: u16) -> anyhow::Result<String> {
|
||||
let database_url = format!(
|
||||
"postgresql://{}@localhost:{}/{DB_NAME}",
|
||||
&username(),
|
||||
postgres_port
|
||||
);
|
||||
pub async fn setup_database(&self) -> anyhow::Result<String> {
|
||||
const DB_NAME: &str = "storage_controller";
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
|
||||
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let createdb_path = pg_bin_dir.join("createdb");
|
||||
@@ -248,11 +204,7 @@ impl StorageController {
|
||||
"-h",
|
||||
"localhost",
|
||||
"-p",
|
||||
&format!("{}", postgres_port),
|
||||
"-U",
|
||||
&username(),
|
||||
"-O",
|
||||
&username(),
|
||||
&format!("{}", self.postgres_port),
|
||||
DB_NAME,
|
||||
])
|
||||
.output()
|
||||
@@ -271,232 +223,80 @@ impl StorageController {
|
||||
Ok(database_url)
|
||||
}
|
||||
|
||||
pub async fn connect_to_database(
|
||||
&self,
|
||||
postgres_port: u16,
|
||||
) -> anyhow::Result<(
|
||||
tokio_postgres::Client,
|
||||
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
|
||||
)> {
|
||||
tokio_postgres::Config::new()
|
||||
.host("localhost")
|
||||
.port(postgres_port)
|
||||
// The user is the ambient operating system user name.
|
||||
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
|
||||
//
|
||||
// Until we get there, use the ambient operating system user name.
|
||||
// Recent tokio-postgres versions default to this if the user isn't specified.
|
||||
// But tokio-postgres fork doesn't have this upstream commit:
|
||||
// https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
|
||||
// => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
|
||||
.user(&username())
|
||||
.dbname(DB_NAME)
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.await
|
||||
.map_err(anyhow::Error::new)
|
||||
}
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> {
|
||||
let instance_dir = self.storage_controller_instance_dir(start_args.instance_id);
|
||||
if let Err(err) = tokio::fs::create_dir(&instance_dir).await {
|
||||
if err.kind() != std::io::ErrorKind::AlreadyExists {
|
||||
panic!("Failed to create instance dir {instance_dir:?}");
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.args(["-D", pg_data_path.as_ref()])
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
}
|
||||
|
||||
let (listen, postgres_port) = {
|
||||
if let Some(base_port) = start_args.base_port {
|
||||
(
|
||||
format!("127.0.0.1:{base_port}"),
|
||||
self.config
|
||||
.database_url
|
||||
.expect("--base-port requires NeonStorageControllerConf::database_url")
|
||||
.port(),
|
||||
)
|
||||
} else {
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
|
||||
let listen = format!(
|
||||
"{}:{}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
);
|
||||
|
||||
(listen, listen_url.port().unwrap() + 1)
|
||||
}
|
||||
};
|
||||
|
||||
let socket_addr = listen
|
||||
.parse()
|
||||
.expect("listen address is a valid socket address");
|
||||
self.listen
|
||||
.set(socket_addr)
|
||||
.expect("StorageController::listen is only set here");
|
||||
|
||||
// Do we remove the pid file on stop?
|
||||
let pg_started = self.is_postgres_running().await?;
|
||||
let pg_lib_dir = self.get_pg_lib_dir().await?;
|
||||
|
||||
if !pg_started {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
let pg_log_path = pg_data_path.join("postgres.log");
|
||||
|
||||
if !tokio::fs::try_exists(&pg_data_path).await? {
|
||||
let initdb_args = [
|
||||
"--pgdata",
|
||||
pg_data_path.as_ref(),
|
||||
"--username",
|
||||
&username(),
|
||||
"--no-sync",
|
||||
"--no-instructions",
|
||||
];
|
||||
tracing::info!(
|
||||
"Initializing storage controller database with args: {:?}",
|
||||
initdb_args
|
||||
);
|
||||
|
||||
// Initialize empty database
|
||||
let initdb_path = pg_bin_dir.join("initdb");
|
||||
let mut child = Command::new(&initdb_path)
|
||||
.envs(vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
])
|
||||
.args(initdb_args)
|
||||
.spawn()
|
||||
.expect("Failed to spawn initdb");
|
||||
let status = child.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("initdb failed with status {status}");
|
||||
}
|
||||
};
|
||||
|
||||
// Write a minimal config file:
|
||||
// - Specify the port, since this is chosen dynamically
|
||||
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
|
||||
// the storage controller we don't want a slow local disk to interfere with that.
|
||||
//
|
||||
// NB: it's important that we rewrite this file on each start command so we propagate changes
|
||||
// from `LocalEnv`'s config file (`.neon/config`).
|
||||
tokio::fs::write(
|
||||
&pg_data_path.join("postgresql.conf"),
|
||||
format!("port = {}\nfsync=off\n", postgres_port),
|
||||
format!("port = {}\nfsync=off\n", self.postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"-U",
|
||||
&username(),
|
||||
"start",
|
||||
];
|
||||
tracing::info!(
|
||||
"Starting storage controller database with args: {:?}",
|
||||
db_start_args
|
||||
);
|
||||
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
&start_args.start_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir, postgres_port),
|
||||
)
|
||||
.await?;
|
||||
|
||||
self.setup_database(postgres_port).await?;
|
||||
}
|
||||
|
||||
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", postgres_port);
|
||||
|
||||
// We support running a startup SQL script to fiddle with the database before we launch storcon.
|
||||
// This is used by the test suite.
|
||||
let startup_script_path = self
|
||||
.env
|
||||
.base_data_dir
|
||||
.join("storage_controller_db.startup.sql");
|
||||
let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
|
||||
Ok(script) => {
|
||||
tokio::fs::remove_file(startup_script_path).await?;
|
||||
script
|
||||
}
|
||||
Err(e) => {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
// always run some startup script so that this code path doesn't bit rot
|
||||
"BEGIN; COMMIT;".to_string()
|
||||
} else {
|
||||
anyhow::bail!("Failed to read startup script: {e}")
|
||||
}
|
||||
}
|
||||
};
|
||||
let (mut client, conn) = self.connect_to_database(postgres_port).await?;
|
||||
let conn = tokio::spawn(conn);
|
||||
let tx = client.build_transaction();
|
||||
let tx = tx.start().await?;
|
||||
tx.batch_execute(&startup_script).await?;
|
||||
tx.commit().await?;
|
||||
drop(client);
|
||||
conn.await??;
|
||||
|
||||
let listen = self
|
||||
.listen
|
||||
.get()
|
||||
.expect("cell is set earlier in this function");
|
||||
let address_for_peers = Uri::builder()
|
||||
.scheme("http")
|
||||
.authority(format!("{}:{}", listen.ip(), listen.port()))
|
||||
.path_and_query("")
|
||||
.build()
|
||||
.unwrap();
|
||||
println!("Starting storage controller database...");
|
||||
let db_start_args = [
|
||||
"-w",
|
||||
"-D",
|
||||
pg_data_path.as_ref(),
|
||||
"-l",
|
||||
pg_log_path.as_ref(),
|
||||
"start",
|
||||
];
|
||||
|
||||
background_process::start_process(
|
||||
"storage_controller_db",
|
||||
&self.env.base_data_dir,
|
||||
pg_bin_dir.join("pg_ctl").as_std_path(),
|
||||
db_start_args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Run migrations on every startup, in case something changed.
|
||||
let database_url = self.setup_database().await?;
|
||||
|
||||
let mut args = vec![
|
||||
"-l",
|
||||
&listen.to_string(),
|
||||
&self.listen,
|
||||
"-p",
|
||||
self.path.as_ref(),
|
||||
"--dev",
|
||||
"--database-url",
|
||||
&database_url,
|
||||
"--max-offline-interval",
|
||||
&humantime::Duration::from(self.config.max_offline).to_string(),
|
||||
"--max-warming-up-interval",
|
||||
&humantime::Duration::from(self.config.max_warming_up).to_string(),
|
||||
"--heartbeat-interval",
|
||||
&humantime::Duration::from(self.config.heartbeat_interval).to_string(),
|
||||
"--address-for-peers",
|
||||
&address_for_peers.to_string(),
|
||||
"--max-unavailable-interval",
|
||||
&humantime::Duration::from(self.config.max_unavailable).to_string(),
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if self.config.start_as_candidate {
|
||||
args.push("--start-as-candidate".to_string());
|
||||
}
|
||||
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
encode_from_key_file(&claims, private_key).expect("failed to generate jwt token");
|
||||
args.push(format!("--jwt-token={jwt_token}"));
|
||||
|
||||
let peer_claims = Claims::new(None, Scope::Admin);
|
||||
let peer_jwt_token = encode_from_key_file(&peer_claims, private_key)
|
||||
.expect("failed to generate jwt token");
|
||||
args.push(format!("--peer-jwt-token={peer_jwt_token}"));
|
||||
}
|
||||
|
||||
if let Some(public_key) = &self.public_key {
|
||||
@@ -513,33 +313,16 @@ impl StorageController {
|
||||
args.push(format!("--split-threshold={split_threshold}"))
|
||||
}
|
||||
|
||||
if let Some(lag) = self.config.max_secondary_lag_bytes.as_ref() {
|
||||
args.push(format!("--max-secondary-lag-bytes={lag}"))
|
||||
}
|
||||
|
||||
if let Some(threshold) = self.config.long_reconcile_threshold {
|
||||
args.push(format!(
|
||||
"--long-reconcile-threshold={}",
|
||||
humantime::Duration::from(threshold)
|
||||
))
|
||||
}
|
||||
|
||||
args.push(format!(
|
||||
"--neon-local-repo-dir={}",
|
||||
self.env.base_data_dir.display()
|
||||
));
|
||||
|
||||
background_process::start_process(
|
||||
COMMAND,
|
||||
&instance_dir,
|
||||
&self.env.base_data_dir,
|
||||
&self.env.storage_controller_bin(),
|
||||
args,
|
||||
vec![
|
||||
("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
|
||||
],
|
||||
background_process::InitialPidFile::Create(self.pid_file(start_args.instance_id)),
|
||||
&start_args.start_timeout,
|
||||
[(
|
||||
"NEON_REPO_DIR".to_string(),
|
||||
self.env.base_data_dir.to_string_lossy().to_string(),
|
||||
)],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
|| async {
|
||||
match self.ready().await {
|
||||
Ok(_) => Ok(true),
|
||||
@@ -552,35 +335,8 @@ impl StorageController {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn stop(&self, stop_args: NeonStorageControllerStopArgs) -> anyhow::Result<()> {
|
||||
background_process::stop_process(
|
||||
stop_args.immediate,
|
||||
COMMAND,
|
||||
&self.pid_file(stop_args.instance_id),
|
||||
)?;
|
||||
|
||||
let storcon_instances = self.env.storage_controller_instances().await?;
|
||||
for (instance_id, instanced_dir_path) in storcon_instances {
|
||||
if instance_id == stop_args.instance_id {
|
||||
continue;
|
||||
}
|
||||
|
||||
let pid_file = instanced_dir_path.join("storage_controller.pid");
|
||||
let pid = tokio::fs::read_to_string(&pid_file)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
anyhow::anyhow!("Failed to read storcon pid file at {pid_file:?}: {err}")
|
||||
})?
|
||||
.parse::<i32>()
|
||||
.expect("pid is valid i32");
|
||||
|
||||
let other_proc_alive = !background_process::process_has_stopped(Pid::from_raw(pid))?;
|
||||
if other_proc_alive {
|
||||
// There is another storage controller instance running, so we return
|
||||
// and leave the database running.
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
pub async fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
background_process::stop_process(immediate, COMMAND, &self.pid_file())?;
|
||||
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
@@ -593,51 +349,27 @@ impl StorageController {
|
||||
.wait()
|
||||
.await?;
|
||||
if !stop_status.success() {
|
||||
match self.is_postgres_running().await {
|
||||
Ok(false) => {
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
}
|
||||
Ok(true) => {
|
||||
anyhow::bail!("Failed to stop storage controller database");
|
||||
}
|
||||
Err(err) => {
|
||||
anyhow::bail!("Failed to stop storage controller database: {err}");
|
||||
}
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
if Some(PG_STATUS_NOT_RUNNING) == status_exitcode.code() {
|
||||
println!("Storage controller database is already stopped");
|
||||
return Ok(());
|
||||
} else {
|
||||
anyhow::bail!("Failed to stop storage controller database: {stop_status}")
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn is_postgres_running(&self) -> anyhow::Result<bool> {
|
||||
let pg_data_path = self.env.base_data_dir.join("storage_controller_db");
|
||||
let pg_bin_dir = self.get_pg_bin_dir().await?;
|
||||
|
||||
let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"];
|
||||
let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl"))
|
||||
.args(pg_status_args)
|
||||
.spawn()?
|
||||
.wait()
|
||||
.await?;
|
||||
|
||||
// pg_ctl status returns this exit code if postgres is not running: in this case it is
|
||||
// fine that stop failed. Otherwise it is an error that stop failed.
|
||||
const PG_STATUS_NOT_RUNNING: i32 = 3;
|
||||
const PG_NO_DATA_DIR: i32 = 4;
|
||||
const PG_STATUS_RUNNING: i32 = 0;
|
||||
match status_exitcode.code() {
|
||||
Some(PG_STATUS_NOT_RUNNING) => Ok(false),
|
||||
Some(PG_NO_DATA_DIR) => Ok(false),
|
||||
Some(PG_STATUS_RUNNING) => Ok(true),
|
||||
Some(code) => Err(anyhow::anyhow!(
|
||||
"pg_ctl status returned unexpected status code: {:?}",
|
||||
code
|
||||
)),
|
||||
None => Err(anyhow::anyhow!("pg_ctl status returned no status code")),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_claims_for_path(path: &str) -> anyhow::Result<Option<Claims>> {
|
||||
let category = match path.find('/') {
|
||||
Some(idx) => &path[..idx],
|
||||
@@ -663,31 +395,15 @@ impl StorageController {
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// In the special case of the `storage_controller start` subcommand, we wish
|
||||
// to use the API endpoint of the newly started storage controller in order
|
||||
// to pass the readiness check. In this scenario [`Self::listen`] will be set
|
||||
// (see [`Self::start`]).
|
||||
//
|
||||
// Otherwise, we infer the storage controller api endpoint from the configured
|
||||
// control plane API.
|
||||
let url = if let Some(socket_addr) = self.listen.get() {
|
||||
Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
socket_addr.ip().to_canonical(),
|
||||
socket_addr.port()
|
||||
))
|
||||
.unwrap()
|
||||
} else {
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap()
|
||||
};
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
@@ -724,7 +440,6 @@ impl StorageController {
|
||||
let request = AttachHookRequest {
|
||||
tenant_shard_id,
|
||||
node_id: Some(pageserver_id),
|
||||
generation_override: None,
|
||||
};
|
||||
|
||||
let response = self
|
||||
@@ -836,15 +551,6 @@ impl StorageController {
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn node_list(&self) -> anyhow::Result<Vec<NodeDescribeResponse>> {
|
||||
self.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn ready(&self) -> anyhow::Result<()> {
|
||||
self.dispatch::<(), ()>(Method::GET, "ready".to_string(), None)
|
||||
|
||||
@@ -11,11 +11,13 @@ clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
futures.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
reqwest.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
storage_controller_client.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
@@ -1,28 +1,28 @@
|
||||
use futures::StreamExt;
|
||||
use std::{str::FromStr, time::Duration};
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse,
|
||||
ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest,
|
||||
TenantShardSplitResponse,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
},
|
||||
shard::{ShardStripeSize, TenantShardId},
|
||||
};
|
||||
use pageserver_client::mgmt_api::{self};
|
||||
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
|
||||
use reqwest::{Method, StatusCode, Url};
|
||||
use serde::{de::DeserializeOwned, Serialize};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
};
|
||||
use storage_controller_client::control_api::Client;
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum Command {
|
||||
@@ -41,8 +41,6 @@ enum Command {
|
||||
listen_http_addr: String,
|
||||
#[arg(long)]
|
||||
listen_http_port: u16,
|
||||
#[arg(long)]
|
||||
availability_zone_id: String,
|
||||
},
|
||||
|
||||
/// Modify a node's configuration in the storage controller
|
||||
@@ -58,10 +56,6 @@ enum Command {
|
||||
#[arg(long)]
|
||||
scheduling: Option<NodeSchedulingPolicy>,
|
||||
},
|
||||
NodeDelete {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Modify a tenant's policies in the storage controller
|
||||
TenantPolicy {
|
||||
#[arg(long)]
|
||||
@@ -80,10 +74,7 @@ enum Command {
|
||||
/// List nodes known to the storage controller
|
||||
Nodes {},
|
||||
/// List tenants known to the storage controller
|
||||
Tenants {
|
||||
/// If this field is set, it will list the tenants on a specific node
|
||||
node_id: Option<NodeId>,
|
||||
},
|
||||
Tenants {},
|
||||
/// Create a new tenant in the storage controller, and by extension on pageservers.
|
||||
TenantCreate {
|
||||
#[arg(long)]
|
||||
@@ -119,6 +110,12 @@ enum Command {
|
||||
#[arg(long)]
|
||||
config: String,
|
||||
},
|
||||
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
|
||||
/// alternative to the storage controller's scheduling optimization behavior.
|
||||
TenantScatter {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Print details about a particular tenant, including all its shards' states.
|
||||
TenantDescribe {
|
||||
#[arg(long)]
|
||||
@@ -152,9 +149,9 @@ enum Command {
|
||||
#[arg(long)]
|
||||
threshold: humantime::Duration,
|
||||
},
|
||||
// Migrate away from a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// Drain a set of specified pageservers by moving the primary attachments to pageservers
|
||||
// outside of the specified set.
|
||||
BulkMigrate {
|
||||
Drain {
|
||||
// Set of pageserver node ids to drain.
|
||||
#[arg(long)]
|
||||
nodes: Vec<NodeId>,
|
||||
@@ -168,34 +165,6 @@ enum Command {
|
||||
#[arg(long)]
|
||||
dry_run: Option<bool>,
|
||||
},
|
||||
/// Start draining the specified pageserver.
|
||||
/// The drain is complete when the schedulling policy returns to active.
|
||||
StartDrain {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel draining the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
CancelDrain {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
/// Start filling the specified pageserver.
|
||||
/// The drain is complete when the schedulling policy returns to active.
|
||||
StartFill {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
},
|
||||
/// Cancel filling the specified pageserver and wait for `timeout`
|
||||
/// for the operation to be canceled. May be retried.
|
||||
CancelFill {
|
||||
#[arg(long)]
|
||||
node_id: NodeId,
|
||||
#[arg(long)]
|
||||
timeout: humantime::Duration,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -282,32 +251,62 @@ impl FromStr for NodeAvailabilityArg {
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_scheduling_policy<F>(
|
||||
client: Client,
|
||||
node_id: NodeId,
|
||||
timeout: Duration,
|
||||
f: F,
|
||||
) -> anyhow::Result<NodeSchedulingPolicy>
|
||||
where
|
||||
F: Fn(NodeSchedulingPolicy) -> bool,
|
||||
{
|
||||
let waiter = tokio::time::timeout(timeout, async move {
|
||||
loop {
|
||||
let node = client
|
||||
.dispatch::<(), NodeDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/node/{node_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
struct Client {
|
||||
base_url: Url,
|
||||
jwt_token: Option<String>,
|
||||
client: reqwest::Client,
|
||||
}
|
||||
|
||||
if f(node.scheduling) {
|
||||
return Ok::<NodeSchedulingPolicy, mgmt_api::Error>(node.scheduling);
|
||||
}
|
||||
impl Client {
|
||||
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
|
||||
Self {
|
||||
base_url,
|
||||
jwt_token,
|
||||
client: reqwest::ClientBuilder::new()
|
||||
.build()
|
||||
.expect("Failed to construct http client"),
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Ok(waiter.await??)
|
||||
/// Simple HTTP request wrapper for calling into storage controller
|
||||
async fn dispatch<RQ, RS>(
|
||||
&self,
|
||||
method: Method,
|
||||
path: String,
|
||||
body: Option<RQ>,
|
||||
) -> mgmt_api::Result<RS>
|
||||
where
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
self.base_url.host_str().unwrap(),
|
||||
self.base_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
builder = builder.json(&body)
|
||||
}
|
||||
if let Some(jwt_token) = &self.jwt_token {
|
||||
builder = builder.header(
|
||||
reqwest::header::AUTHORIZATION,
|
||||
format!("Bearer {jwt_token}"),
|
||||
);
|
||||
}
|
||||
|
||||
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
|
||||
let response = response.error_from_body().await?;
|
||||
|
||||
response
|
||||
.json()
|
||||
.await
|
||||
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -327,7 +326,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id,
|
||||
} => {
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
@@ -339,24 +337,19 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id: AvailabilityZone(availability_zone_id),
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantCreate { tenant_id } => {
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
Method::POST,
|
||||
"v1/tenant".to_string(),
|
||||
Some(TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||
config: TenantConfig::default(),
|
||||
}),
|
||||
)
|
||||
vps_client
|
||||
.tenant_create(&TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: None,
|
||||
shard_parameters: ShardParameters::default(),
|
||||
placement_policy: Some(PlacementPolicy::Attached(1)),
|
||||
config: TenantConfig::default(),
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantDelete { tenant_id } => {
|
||||
@@ -366,16 +359,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
tracing::info!("Delete status: {}", status);
|
||||
}
|
||||
Command::Nodes {} => {
|
||||
let mut resp = storcon_client
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
|
||||
for node in resp {
|
||||
@@ -406,51 +396,14 @@ async fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::Tenants {
|
||||
node_id: Some(node_id),
|
||||
} => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), NodeShardResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/node/{node_id}/shards"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = describe_response.shards;
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"Shard",
|
||||
"Intended Primary/Secondary",
|
||||
"Observed Primary/Secondary",
|
||||
]);
|
||||
for shard in shards {
|
||||
table.add_row([
|
||||
format!("{}", shard.tenant_shard_id),
|
||||
match shard.is_intended_secondary {
|
||||
None => "".to_string(),
|
||||
Some(true) => "Secondary".to_string(),
|
||||
Some(false) => "Primary".to_string(),
|
||||
},
|
||||
match shard.is_observed_secondary {
|
||||
None => "".to_string(),
|
||||
Some(true) => "Secondary".to_string(),
|
||||
Some(false) => "Primary".to_string(),
|
||||
},
|
||||
]);
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::Tenants { node_id: None } => {
|
||||
let mut resp = storcon_client
|
||||
Command::Tenants {} => {
|
||||
let resp = storcon_client
|
||||
.dispatch::<(), Vec<TenantDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/tenant".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
|
||||
|
||||
let mut table = comfy_table::Table::new();
|
||||
table.set_header([
|
||||
"TenantId",
|
||||
@@ -545,6 +498,88 @@ async fn main() -> anyhow::Result<()> {
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantScatter { tenant_id } => {
|
||||
// Find the shards
|
||||
let locate_response = storcon_client
|
||||
.dispatch::<(), TenantLocateResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
let shards = locate_response.shards;
|
||||
|
||||
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
|
||||
let shard_count = shards.len();
|
||||
for s in shards {
|
||||
let entry = node_to_shards.entry(s.node_id).or_default();
|
||||
entry.push(s.shard_id);
|
||||
}
|
||||
|
||||
// Load list of available nodes
|
||||
let nodes_resp = storcon_client
|
||||
.dispatch::<(), Vec<NodeDescribeResponse>>(
|
||||
Method::GET,
|
||||
"control/v1/node".to_string(),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
for node in nodes_resp {
|
||||
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
|
||||
node_to_shards.entry(node.id).or_default();
|
||||
}
|
||||
}
|
||||
|
||||
let max_shard_per_node = shard_count / node_to_shards.len();
|
||||
|
||||
loop {
|
||||
let mut migrate_shard = None;
|
||||
for shards in node_to_shards.values_mut() {
|
||||
if shards.len() > max_shard_per_node {
|
||||
// Pick the emptiest
|
||||
migrate_shard = Some(shards.pop().unwrap());
|
||||
}
|
||||
}
|
||||
let Some(migrate_shard) = migrate_shard else {
|
||||
break;
|
||||
};
|
||||
|
||||
// Pick the emptiest node to migrate to
|
||||
let mut destinations = node_to_shards
|
||||
.iter()
|
||||
.map(|(k, v)| (k, v.len()))
|
||||
.collect::<Vec<_>>();
|
||||
destinations.sort_by_key(|i| i.1);
|
||||
let (destination_node, destination_count) = *destinations.first().unwrap();
|
||||
if destination_count + 1 > max_shard_per_node {
|
||||
// Even the emptiest destination doesn't have space: we're done
|
||||
break;
|
||||
}
|
||||
let destination_node = *destination_node;
|
||||
|
||||
node_to_shards
|
||||
.get_mut(&destination_node)
|
||||
.unwrap()
|
||||
.push(migrate_shard);
|
||||
|
||||
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
|
||||
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{migrate_shard}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
tenant_shard_id: migrate_shard,
|
||||
node_id: destination_node,
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
|
||||
}
|
||||
|
||||
// Spread the shards across the nodes
|
||||
}
|
||||
Command::TenantDescribe { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
@@ -699,11 +734,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::NodeDelete { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
|
||||
.await?;
|
||||
}
|
||||
Command::TenantSetTimeBasedEviction {
|
||||
tenant_id,
|
||||
period,
|
||||
@@ -719,13 +749,12 @@ async fn main() -> anyhow::Result<()> {
|
||||
threshold: threshold.into(),
|
||||
},
|
||||
)),
|
||||
heatmap_period: Some("300s".to_string()),
|
||||
..Default::default()
|
||||
},
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::BulkMigrate {
|
||||
Command::Drain {
|
||||
nodes,
|
||||
concurrency,
|
||||
max_shards,
|
||||
@@ -754,7 +783,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
|
||||
if nodes.len() != node_to_drain_descs.len() {
|
||||
anyhow::bail!("Bulk migration requested away from node which doesn't exist.")
|
||||
anyhow::bail!("Drain requested for node which doesn't exist.")
|
||||
}
|
||||
|
||||
node_to_fill_descs.retain(|desc| {
|
||||
@@ -766,7 +795,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
});
|
||||
|
||||
if node_to_fill_descs.is_empty() {
|
||||
anyhow::bail!("There are no nodes to migrate to")
|
||||
anyhow::bail!("There are no nodes to drain to")
|
||||
}
|
||||
|
||||
// Set the node scheduling policy to draining for the nodes which
|
||||
@@ -787,7 +816,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Perform the migration: move each tenant shard scheduled on a node to
|
||||
// Perform the drain: move each tenant shard scheduled on a node to
|
||||
// be drained to a node which is being filled. A simple round robin
|
||||
// strategy is used to pick the new node.
|
||||
let tenants = storcon_client
|
||||
@@ -800,13 +829,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let mut selected_node_idx = 0;
|
||||
|
||||
struct MigrationMove {
|
||||
struct DrainMove {
|
||||
tenant_shard_id: TenantShardId,
|
||||
from: NodeId,
|
||||
to: NodeId,
|
||||
}
|
||||
|
||||
let mut moves: Vec<MigrationMove> = Vec::new();
|
||||
let mut moves: Vec<DrainMove> = Vec::new();
|
||||
|
||||
let shards = tenants
|
||||
.into_iter()
|
||||
@@ -836,7 +865,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
continue;
|
||||
}
|
||||
|
||||
moves.push(MigrationMove {
|
||||
moves.push(DrainMove {
|
||||
tenant_shard_id: shard.tenant_shard_id,
|
||||
from: shard
|
||||
.node_attached
|
||||
@@ -913,67 +942,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
failure
|
||||
);
|
||||
}
|
||||
Command::StartDrain { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::PUT,
|
||||
format!("control/v1/node/{node_id}/drain"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
println!("Drain started for {node_id}");
|
||||
}
|
||||
Command::CancelDrain { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/drain"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
use NodeSchedulingPolicy::*;
|
||||
matches!(sched, Active | PauseForRestart)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Drain was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
Command::StartFill { node_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(Method::PUT, format!("control/v1/node/{node_id}/fill"), None)
|
||||
.await?;
|
||||
|
||||
println!("Fill started for {node_id}");
|
||||
}
|
||||
Command::CancelFill { node_id, timeout } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::DELETE,
|
||||
format!("control/v1/node/{node_id}/fill"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Waiting for node {node_id} to quiesce on scheduling policy ...");
|
||||
|
||||
let final_policy =
|
||||
wait_for_scheduling_policy(storcon_client, node_id, *timeout, |sched| {
|
||||
use NodeSchedulingPolicy::*;
|
||||
matches!(sched, Active)
|
||||
})
|
||||
.await?;
|
||||
|
||||
println!(
|
||||
"Fill was cancelled for node {node_id}. Schedulling policy is now {final_policy:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
15
deny.toml
15
deny.toml
@@ -4,7 +4,6 @@
|
||||
# to your expectations and requirements.
|
||||
|
||||
# Root options
|
||||
[graph]
|
||||
targets = [
|
||||
{ triple = "x86_64-unknown-linux-gnu" },
|
||||
{ triple = "aarch64-unknown-linux-gnu" },
|
||||
@@ -13,7 +12,6 @@ targets = [
|
||||
]
|
||||
all-features = false
|
||||
no-default-features = false
|
||||
[output]
|
||||
feature-depth = 1
|
||||
|
||||
# This section is considered when running `cargo deny check advisories`
|
||||
@@ -21,16 +19,17 @@ feature-depth = 1
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/advisories/cfg.html
|
||||
[advisories]
|
||||
db-urls = ["https://github.com/rustsec/advisory-db"]
|
||||
vulnerability = "deny"
|
||||
unmaintained = "warn"
|
||||
yanked = "warn"
|
||||
|
||||
[[advisories.ignore]]
|
||||
id = "RUSTSEC-2023-0071"
|
||||
reason = "the marvin attack only affects private key decryption, not public key signature verification"
|
||||
notice = "warn"
|
||||
ignore = []
|
||||
|
||||
# This section is considered when running `cargo deny check licenses`
|
||||
# More documentation for the licenses section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
|
||||
[licenses]
|
||||
unlicensed = "deny"
|
||||
allow = [
|
||||
"Apache-2.0",
|
||||
"Artistic-2.0",
|
||||
@@ -43,6 +42,10 @@ allow = [
|
||||
"OpenSSL",
|
||||
"Unicode-DFS-2016",
|
||||
]
|
||||
deny = []
|
||||
copyleft = "warn"
|
||||
allow-osi-fsf-free = "neither"
|
||||
default = "deny"
|
||||
confidence-threshold = 0.8
|
||||
exceptions = [
|
||||
# Zlib license has some restrictions if we decide to change sth
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
|
||||
# Example docker compose configuration
|
||||
|
||||
The configuration in this directory is used for testing Neon docker images: it is
|
||||
not intended for deploying a usable system. To run a development environment where
|
||||
you can experiment with a miniature Neon system, use `cargo neon` rather than container images.
|
||||
|
||||
This configuration does not start the storage controller, because the controller
|
||||
needs a way to reconfigure running computes, and no such thing exists in this setup.
|
||||
|
||||
@@ -23,17 +23,18 @@ echo "Page server is ready."
|
||||
echo "Create a tenant and timeline"
|
||||
generate_id tenant_id
|
||||
PARAMS=(
|
||||
-X PUT
|
||||
-sb
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"mode\": \"AttachedSingle\", \"generation\": 1, \"tenant_conf\": {}}"
|
||||
"http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
|
||||
-d "{\"new_tenant_id\": \"${tenant_id}\"}"
|
||||
http://pageserver:9898/v1/tenant/
|
||||
)
|
||||
result=$(curl "${PARAMS[@]}")
|
||||
echo $result | jq .
|
||||
|
||||
generate_id timeline_id
|
||||
PARAMS=(
|
||||
-sbf
|
||||
-sb
|
||||
-X POST
|
||||
-H "Content-Type: application/json"
|
||||
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"
|
||||
|
||||
@@ -31,14 +31,25 @@ services:
|
||||
restart: always
|
||||
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
|
||||
environment:
|
||||
- BROKER_ENDPOINT='http://storage_broker:50051'
|
||||
- AWS_ACCESS_KEY_ID=minio
|
||||
- AWS_SECRET_ACCESS_KEY=password
|
||||
#- RUST_BACKTRACE=1
|
||||
ports:
|
||||
#- 6400:6400 # pg protocol handler
|
||||
- 9898:9898 # http endpoints
|
||||
volumes:
|
||||
- ./pageserver_config:/data/.neon/
|
||||
entrypoint:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
command:
|
||||
- "/usr/local/bin/pageserver -D /data/.neon/
|
||||
-c \"broker_endpoint=$$BROKER_ENDPOINT\"
|
||||
-c \"listen_pg_addr='0.0.0.0:6400'\"
|
||||
-c \"listen_http_addr='0.0.0.0:9898'\"
|
||||
-c \"remote_storage={endpoint='http://minio:9000',
|
||||
bucket_name='neon',
|
||||
bucket_region='eu-north-1',
|
||||
prefix_in_bucket='/pageserver/'}\""
|
||||
depends_on:
|
||||
- storage_broker
|
||||
- minio_create_buckets
|
||||
|
||||
@@ -78,7 +78,7 @@ for pg_version in 14 15 16; do
|
||||
docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
|
||||
rm -rf $TMPDIR
|
||||
# We are running tests now
|
||||
if docker exec -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \
|
||||
$TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt
|
||||
then
|
||||
cleanup
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
id=1234
|
||||
@@ -1,5 +0,0 @@
|
||||
broker_endpoint='http://storage_broker:50051'
|
||||
pg_distrib_dir='/usr/local/'
|
||||
listen_pg_addr='0.0.0.0:6400'
|
||||
listen_http_addr='0.0.0.0:9898'
|
||||
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
|
||||
@@ -1,15 +1,15 @@
|
||||
#!/bin/bash
|
||||
set -x
|
||||
|
||||
cd /ext-src || exit 2
|
||||
cd /ext-src
|
||||
FAILED=
|
||||
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d "${d}" ] || continue
|
||||
[ -d ${d} ] || continue
|
||||
psql -c "select 1" >/dev/null || break
|
||||
USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
|
||||
make -C ${d} installcheck || FAILED="${d} ${FAILED}"
|
||||
done
|
||||
[ -z "${FAILED}" ] && exit 0
|
||||
echo "${FAILED}"
|
||||
echo ${FAILED}
|
||||
exit 1
|
||||
@@ -1,18 +1,13 @@
|
||||
# Summary
|
||||
|
||||
# Looking for `neon.tech` docs?
|
||||
|
||||
This page linkes to a selection of technical content about the open source code in this repository.
|
||||
|
||||
Please visit https://neon.tech/docs for documentation about using the Neon service, which is based on the code
|
||||
in this repository.
|
||||
|
||||
# Architecture
|
||||
|
||||
[Introduction]()
|
||||
- [Separation of Compute and Storage](./separation-compute-storage.md)
|
||||
|
||||
# Architecture
|
||||
|
||||
- [Compute]()
|
||||
- [WAL proposer]()
|
||||
- [WAL Backpressure]()
|
||||
- [Postgres changes](./core_changes.md)
|
||||
|
||||
- [Pageserver](./pageserver.md)
|
||||
@@ -21,15 +16,33 @@ in this repository.
|
||||
- [WAL Redo](./pageserver-walredo.md)
|
||||
- [Page cache](./pageserver-pagecache.md)
|
||||
- [Storage](./pageserver-storage.md)
|
||||
- [Datadir mapping]()
|
||||
- [Layer files]()
|
||||
- [Branching]()
|
||||
- [Garbage collection]()
|
||||
- [Cloud Storage]()
|
||||
- [Processing a GetPage request](./pageserver-processing-getpage.md)
|
||||
- [Processing WAL](./pageserver-processing-wal.md)
|
||||
- [Management API]()
|
||||
- [Tenant Rebalancing]()
|
||||
|
||||
- [WAL Service](walservice.md)
|
||||
- [Consensus protocol](safekeeper-protocol.md)
|
||||
- [Management API]()
|
||||
- [Rebalancing]()
|
||||
|
||||
- [Control Plane]()
|
||||
|
||||
- [Proxy]()
|
||||
|
||||
- [Source view](./sourcetree.md)
|
||||
- [docker.md](./docker.md) — Docker images and building pipeline.
|
||||
- [Error handling and logging](./error-handling.md)
|
||||
- [Testing]()
|
||||
- [Unit testing]()
|
||||
- [Integration testing]()
|
||||
- [Benchmarks]()
|
||||
|
||||
|
||||
- [Glossary](./glossary.md)
|
||||
|
||||
@@ -45,6 +58,28 @@ in this repository.
|
||||
|
||||
# RFCs
|
||||
|
||||
Major changes are documented in RFCS:
|
||||
- See [RFCs](./rfcs/README.md) for more information
|
||||
- view the RFCs at https://github.com/neondatabase/neon/tree/main/docs/rfcs
|
||||
- [RFCs](./rfcs/README.md)
|
||||
|
||||
- [002-storage](rfcs/002-storage.md)
|
||||
- [003-laptop-cli](rfcs/003-laptop-cli.md)
|
||||
- [004-durability](rfcs/004-durability.md)
|
||||
- [005-zenith_local](rfcs/005-zenith_local.md)
|
||||
- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
|
||||
- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
|
||||
- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
|
||||
- [008-push-pull](rfcs/008-push-pull.md)
|
||||
- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
|
||||
- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
|
||||
- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
|
||||
- [010-storage_details](rfcs/010-storage_details.md)
|
||||
- [011-retention-policy](rfcs/011-retention-policy.md)
|
||||
- [012-background-tasks](rfcs/012-background-tasks.md)
|
||||
- [013-term-history](rfcs/013-term-history.md)
|
||||
- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
|
||||
- [014-storage-lsm](rfcs/014-storage-lsm.md)
|
||||
- [015-storage-messaging](rfcs/015-storage-messaging.md)
|
||||
- [016-connection-routing](rfcs/016-connection-routing.md)
|
||||
- [017-timeline-data-management](rfcs/017-timeline-data-management.md)
|
||||
- [018-storage-messaging-2](rfcs/018-storage-messaging-2.md)
|
||||
- [019-tenant-timeline-lifecycles](rfcs/019-tenant-timeline-lifecycles.md)
|
||||
- [cluster-size-limits](rfcs/cluster-size-limits.md)
|
||||
|
||||
@@ -11,28 +11,15 @@ page server. We currently use the same binary for both, with --wal-redo runtime
|
||||
the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for
|
||||
the WAL redo process.
|
||||
|
||||
In addition to core PostgreSQL changes, there is a Neon extension in the pgxn/neon directory that
|
||||
hooks into the smgr interface, and rmgr extension in pgxn/neon_rmgr. The extensions are loaded into
|
||||
the Postgres processes with shared_preload_libraries. Most of the Neon-specific code is in the
|
||||
extensions, and for any new features, that is preferred over modifying core PostgreSQL code.
|
||||
In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the
|
||||
smgr interface. Once all the core changes have been submitted to upstream or eliminated some other
|
||||
way, the extension could live outside the postgres repository and build against vanilla PostgreSQL.
|
||||
|
||||
Below is a list of all the PostgreSQL source code changes, categorized into changes needed for
|
||||
compute, and changes needed for the WAL redo process:
|
||||
|
||||
# Changes for Compute node
|
||||
|
||||
## Prefetching
|
||||
|
||||
There are changes in many places to perform prefetching, for example for sequential scans. Neon
|
||||
doesn't benefit from OS readahead, and the latency to pageservers is quite high compared to local
|
||||
disk, so prefetching is critical for performance, also for sequential scans.
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Upcoming "streaming read" work in v17 might simplify this. And async I/O work in v18 will hopefully
|
||||
do more.
|
||||
|
||||
|
||||
## Add t_cid to heap WAL records
|
||||
|
||||
```
|
||||
@@ -50,11 +37,54 @@ The problem is that the XLOG_HEAP_INSERT record does not include the command id
|
||||
|
||||
Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information.
|
||||
|
||||
Update from Heikki (2024-04-17): I tried to write an upstream patch for that, to use the t_cid field for logical decoding, but it was not as straightforward as it first sounded.
|
||||
|
||||
### Alternatives
|
||||
Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated.
|
||||
|
||||
## ginfast.c
|
||||
|
||||
```
|
||||
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
|
||||
index e0d9940946..2d964c02e9 100644
|
||||
--- a/src/backend/access/gin/ginfast.c
|
||||
+++ b/src/backend/access/gin/ginfast.c
|
||||
@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
|
||||
memset(&sublist, 0, sizeof(GinMetaPageData));
|
||||
makeSublist(index, collector->tuples, collector->ntuples, &sublist);
|
||||
|
||||
+ if (metadata->head != InvalidBlockNumber)
|
||||
+ {
|
||||
+ /*
|
||||
+ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call
|
||||
+ * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from
|
||||
+ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write()
|
||||
+ * will try to WAL-log an image of the page.
|
||||
+ */
|
||||
+ buffer = ReadBuffer(index, metadata->tail);
|
||||
+ }
|
||||
+
|
||||
if (needWal)
|
||||
XLogBeginInsert();
|
||||
|
||||
@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
|
||||
data.prevTail = metadata->tail;
|
||||
data.newRightlink = sublist.head;
|
||||
|
||||
- buffer = ReadBuffer(index, metadata->tail);
|
||||
LockBuffer(buffer, GIN_EXCLUSIVE);
|
||||
page = BufferGetPage(buffer);
|
||||
```
|
||||
|
||||
The problem is explained in the comment above
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical
|
||||
section or something.
|
||||
|
||||
Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images?
|
||||
|
||||
|
||||
## Mark index builds that use buffer manager without logging explicitly
|
||||
|
||||
```
|
||||
@@ -65,8 +95,6 @@ Perhaps we could write an extra WAL record with the t_cid information, when a pa
|
||||
also some changes in src/backend/storage/smgr/smgr.c
|
||||
```
|
||||
|
||||
pgvector 0.6.0 also needs a similar change, which would be very nice to get rid of too.
|
||||
|
||||
When a GIN index is built, for example, it is built by inserting the entries into the index more or
|
||||
less normally, but without WAL-logging anything. After the index has been built, we iterate through
|
||||
all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged
|
||||
@@ -81,10 +109,6 @@ an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1`
|
||||
I think it would make sense to be more explicit about that in PostgreSQL too. So extract these
|
||||
changes to a patch and post to pgsql-hackers.
|
||||
|
||||
Perhaps we could deduce that an unlogged index build has started when we see a page being evicted
|
||||
with zero LSN. How to be sure it's an unlogged index build rather than a bug? Currently we have a
|
||||
check for that and PANIC if we see page with zero LSN being evicted. And how do we detect when the
|
||||
index build has finished? See https://github.com/neondatabase/neon/pull/7440 for an attempt at that.
|
||||
|
||||
## Track last-written page LSN
|
||||
|
||||
@@ -116,6 +140,57 @@ The old method is still available, though.
|
||||
Wait until v15?
|
||||
|
||||
|
||||
## Cache relation sizes
|
||||
|
||||
The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going
|
||||
to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the
|
||||
relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for
|
||||
Neon)
|
||||
|
||||
|
||||
## Use buffer manager when extending VM or FSM
|
||||
|
||||
```
|
||||
src/backend/storage/freespace/freespace.c | 14 +-
|
||||
src/backend/access/heap/visibilitymap.c | 15 +-
|
||||
|
||||
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
|
||||
index e198df65d8..addfe93eac 100644
|
||||
--- a/src/backend/access/heap/visibilitymap.c
|
||||
+++ b/src/backend/access/heap/visibilitymap.c
|
||||
@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
|
||||
/* Now extend the file */
|
||||
while (vm_nblocks_now < vm_nblocks)
|
||||
{
|
||||
- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
|
||||
+ /*
|
||||
+ * ZENITH: Initialize VM pages through buffer cache to prevent loading
|
||||
+ * them from pageserver.
|
||||
+ */
|
||||
+ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW,
|
||||
+ RBM_ZERO_AND_LOCK, NULL);
|
||||
+ Page page = BufferGetPage(buffer);
|
||||
+
|
||||
+ PageInit((Page) page, BLCKSZ, 0);
|
||||
+ PageSetChecksumInplace(page, vm_nblocks_now);
|
||||
+ MarkBufferDirty(buffer);
|
||||
+ UnlockReleaseBuffer(buffer);
|
||||
|
||||
- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
|
||||
- pg.data, false);
|
||||
vm_nblocks_now++;
|
||||
}
|
||||
```
|
||||
|
||||
### Problem we're trying to solve
|
||||
|
||||
???
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Maybe this would be a reasonable change in PostgreSQL too?
|
||||
|
||||
|
||||
## Allow startup without reading checkpoint record
|
||||
|
||||
In Neon, the compute node is stateless. So when we are launching compute node, we need to provide
|
||||
@@ -156,7 +231,7 @@ index 0415df9ccb..9f9db3c8bc 100644
|
||||
* crash we can lose (skip over) as many values as we pre-logged.
|
||||
*/
|
||||
-#define SEQ_LOG_VALS 32
|
||||
+/* Neon XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
|
||||
+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
|
||||
+/* #define SEQ_LOG_VALS 32 */
|
||||
+#define SEQ_LOG_VALS 0
|
||||
```
|
||||
@@ -175,6 +250,66 @@ would be weird if the sequence moved backwards though, think of PITR.
|
||||
Or add a GUC for the amount to prefix to PostgreSQL, and force it to 1 in Neon.
|
||||
|
||||
|
||||
## Walproposer
|
||||
|
||||
```
|
||||
src/Makefile | 1 +
|
||||
src/backend/replication/libpqwalproposer/Makefile | 37 +
|
||||
src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++
|
||||
src/backend/postmaster/bgworker.c | 4 +
|
||||
src/backend/postmaster/postmaster.c | 6 +
|
||||
src/backend/replication/Makefile | 4 +-
|
||||
src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
src/backend/replication/walproposer_utils.c | 402 +++++++++++
|
||||
src/backend/replication/walreceiver.c | 7 +
|
||||
src/backend/replication/walsender.c | 320 ++++++---
|
||||
src/backend/storage/ipc/ipci.c | 6 +
|
||||
src/include/replication/walproposer.h | 565 ++++++++++++++++
|
||||
```
|
||||
|
||||
WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. It is
|
||||
currently implemented as patch to standard WAL sender.
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Refactor into an extension. Submit hooks or APIs into upstream if necessary.
|
||||
|
||||
@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96
|
||||
|
||||
## Ignore unexpected data beyond EOF in bufmgr.c
|
||||
|
||||
```
|
||||
@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
|
||||
*/
|
||||
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
|
||||
if (!PageIsNew((Page) bufBlock))
|
||||
- ereport(ERROR,
|
||||
+ {
|
||||
+ // XXX-ZENITH
|
||||
+ MemSet((char *) bufBlock, 0, BLCKSZ);
|
||||
+ ereport(DEBUG1,
|
||||
(errmsg("unexpected data beyond EOF in block %u of relation %s",
|
||||
blockNum, relpath(smgr->smgr_rnode, forkNum)),
|
||||
errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
|
||||
-
|
||||
+ }
|
||||
/*
|
||||
* We *must* do smgrextend before succeeding, else the page will not
|
||||
* be reserved by the kernel, and the next P_NEW call will decide to
|
||||
```
|
||||
|
||||
PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros
|
||||
first, then the page is filled, and finally the new page WAL-logged. But if multiple backends extend
|
||||
a relation at the same time, the pages can be WAL-logged in different order.
|
||||
|
||||
I'm not sure what scenario exactly required this change in Neon, though.
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit
|
||||
confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation,
|
||||
and finally WAL-log that the extension succeeded.
|
||||
|
||||
## Make smgr interface available to extensions
|
||||
|
||||
```
|
||||
@@ -186,8 +321,6 @@ Or add a GUC for the amount to prefix to PostgreSQL, and force it to 1 in Neon.
|
||||
|
||||
Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression.
|
||||
|
||||
We have submitted this to upstream, but it's moving at glacial a speed.
|
||||
https://commitfest.postgresql.org/47/4428/
|
||||
|
||||
## Added relpersistence argument to smgropen()
|
||||
|
||||
@@ -311,148 +444,6 @@ Ignore it. This is only needed for disaster recovery, so once we've eliminated a
|
||||
patches, we can just keep it around as a patch or as separate branch in a repo.
|
||||
|
||||
|
||||
## pg_waldump flags to ignore errors
|
||||
|
||||
After creating a new project or branch in Neon, the first timeline can begin in the middle of a WAL segment. pg_waldump chokes on that, so we added some flags to make it possible to ignore errors.
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Like previous one, ignore it.
|
||||
|
||||
|
||||
|
||||
## Backpressure if pageserver doesn't ingest WAL fast enough
|
||||
|
||||
```
|
||||
@@ -3200,6 +3202,7 @@ ProcessInterrupts(void)
|
||||
return;
|
||||
InterruptPending = false;
|
||||
|
||||
+retry:
|
||||
if (ProcDiePending)
|
||||
{
|
||||
ProcDiePending = false;
|
||||
@@ -3447,6 +3450,13 @@ ProcessInterrupts(void)
|
||||
|
||||
if (ParallelApplyMessagePending)
|
||||
HandleParallelApplyMessages();
|
||||
+
|
||||
+ /* Call registered callback if any */
|
||||
+ if (ProcessInterruptsCallback)
|
||||
+ {
|
||||
+ if (ProcessInterruptsCallback())
|
||||
+ goto retry;
|
||||
+ }
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Submit a patch to upstream, for a hook in ProcessInterrupts. Could be useful for other extensions
|
||||
too.
|
||||
|
||||
|
||||
## SLRU on-demand download
|
||||
|
||||
```
|
||||
src/backend/access/transam/slru.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
|
||||
1 file changed, 92 insertions(+), 13 deletions(-)
|
||||
```
|
||||
|
||||
### Problem we're trying to solve
|
||||
|
||||
Previously, SLRU files were included in the basebackup, but the total size of them can be large,
|
||||
several GB, and downloading them all made the startup time too long.
|
||||
|
||||
### Alternatives
|
||||
|
||||
FUSE hook or LD_PRELOAD trick to intercept the reads on SLRU files
|
||||
|
||||
|
||||
## WAL-log an all-zeros page as one large hole
|
||||
|
||||
- In XLogRecordAssemble()
|
||||
|
||||
### Problem we're trying to solve
|
||||
|
||||
This change was made in v16. Starting with v16, when PostgreSQL extends a relation, it first extends
|
||||
it with zeros, and it can extend the relation more than one block at a time. The all-zeros page is WAL-ogged, but it's very wasteful to include 8 kB of zeros in the WAL for that. This hack was made so that we WAL logged a compact record with a whole-page "hole". However, PostgreSQL has assertions that prevent that such WAL records from being replayed, so this breaks compatibility such that unmodified PostreSQL cannot process Neon-generated WAL.
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Find another compact representation for a full-page image of an all-zeros page. A compressed image perhaps.
|
||||
|
||||
|
||||
## Shut down walproposer after checkpointer
|
||||
|
||||
```
|
||||
+ /* Neon: Also allow walproposer background worker to be treated like a WAL sender, so that it's shut down last */
|
||||
+ if ((bp->bkend_type == BACKEND_TYPE_NORMAL || bp->bkend_type == BACKEND_TYPE_BGWORKER) &&
|
||||
```
|
||||
|
||||
This changes was needed so that postmaster shuts down the walproposer process only after the shutdown checkpoint record is written. Otherwise, the shutdown record will never make it to the safekeepers.
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Do a bigger refactoring of the postmaster state machine, such that a background worker can specify
|
||||
the shutdown ordering by itself. The postmaster state machine has grown pretty complicated, and
|
||||
would benefit from a refactoring for the sake of readability anyway.
|
||||
|
||||
|
||||
## EXPLAIN changes for prefetch and LFC
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Konstantin submitted a patch to -hackers already: https://commitfest.postgresql.org/47/4643/. Get that into a committable state.
|
||||
|
||||
|
||||
## On-demand download of extensions
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
FUSE or LD_PRELOAD trickery to intercept reads?
|
||||
|
||||
|
||||
## Publication superuser checks
|
||||
|
||||
We have hacked CreatePublication so that also neon_superuser can create them.
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Create an upstream patch with more fine-grained privileges for publications CREATE/DROP that can be GRANTed to users.
|
||||
|
||||
|
||||
## WAL log replication slots
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
Utilize the upcoming v17 "slot sync worker", or a similar neon-specific background worker process, to periodically WAL-log the slots, or to export them somewhere else.
|
||||
|
||||
|
||||
## WAL-log replication snapshots
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
WAL-log them periodically, from a backgound worker.
|
||||
|
||||
|
||||
## WAL-log relmapper files
|
||||
|
||||
Similarly to replications snapshot files, the CID mapping files generated during VACUUM FULL of a catalog table are WAL-logged
|
||||
|
||||
### How to get rid of the patch
|
||||
|
||||
WAL-log them periodically, from a backgound worker.
|
||||
|
||||
|
||||
## XLogWaitForReplayOf()
|
||||
|
||||
??
|
||||
|
||||
|
||||
|
||||
|
||||
# Not currently committed but proposed
|
||||
|
||||
## Disable ring buffer buffer manager strategies
|
||||
@@ -481,10 +472,23 @@ hint bits are set. Wal logging hint bits updates requires FPI which significantl
|
||||
|
||||
Add special WAL record for setting page hints.
|
||||
|
||||
## Prefetching
|
||||
|
||||
### Why?
|
||||
|
||||
As far as pages in Neon are loaded on demand, to reduce node startup time
|
||||
and also speedup some massive queries we need some mechanism for bulk loading to
|
||||
reduce page request round-trip overhead.
|
||||
|
||||
Currently Postgres is supporting prefetching only for bitmap scan.
|
||||
In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us.
|
||||
For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages
|
||||
of heap relation addressed by TIDs.
|
||||
|
||||
## Prewarming
|
||||
|
||||
### Why?
|
||||
|
||||
Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Neon.
|
||||
Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith.
|
||||
But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
|
||||
We can capture state of compute node buffer cache and send bulk request for this pages at startup.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user