From 105b8bb9d36297e0c5bc119c87ed1e94ce18cbd5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 4 Apr 2023 12:21:54 +0100 Subject: [PATCH 01/31] test_runner: automatically rerun flaky tests (#3880) This PR adds a plugin that automatically reruns (up to 3 times) flaky tests. Internally, it uses data from `TEST_RESULT_CONNSTR` database and `pytest-rerunfailures` plugin. As the first approximation we consider the test flaky if it has failed on the main branch in the last 10 days. Flaky tests are fetched by `scripts/flaky_tests.py` script (it's possible to use it in a standalone mode to learn which tests are flaky), stored to a JSON file, and then the file is passed to the pytest plugin. --- .github/actions/allure-report/action.yml | 4 +- .../actions/run-python-test-set/action.yml | 12 +++ .github/workflows/build_and_test.yml | 3 + poetry.lock | 38 +++++--- pyproject.toml | 6 +- scripts/flaky_tests.py | 87 +++++++++++++++++++ test_runner/conftest.py | 1 + test_runner/fixtures/flaky.py | 58 +++++++++++++ test_runner/fixtures/utils.py | 2 +- 9 files changed, 195 insertions(+), 16 deletions(-) create mode 100755 scripts/flaky_tests.py create mode 100644 test_runner/fixtures/flaky.py diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index 2d4cabdde5..e685006245 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -76,8 +76,8 @@ runs: rm -f ${ALLURE_ZIP} fi env: - ALLURE_VERSION: 2.19.0 - ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464 + ALLURE_VERSION: 2.21.0 + ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c - name: Upload Allure results if: ${{ inputs.action == 'store' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 29b04a3478..11f5c78f19 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -44,6 +44,10 @@ inputs: description: 'Secret access key' required: false default: '' + rerun_flaky: + description: 'Whether to rerun flaky tests' + required: false + default: 'false' runs: using: "composite" @@ -101,6 +105,7 @@ runs: COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') + RERUN_FLAKY: ${{ inputs.rerun_flaky }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report @@ -143,6 +148,13 @@ runs: EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi + if [ "${RERUN_FLAKY}" == "true" ]; then + mkdir -p $TEST_OUTPUT + poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/flaky.json" + + EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" + fi + if [[ "${{ inputs.build_type }}" == "debug" ]]; then cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run) elif [[ "${{ inputs.build_type }}" == "release" ]]; then diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8482341b0c..8c108e7f50 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -335,6 +335,9 @@ jobs: real_s3_region: us-west-2 real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ 
secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" + rerun_flaky: true + env: + TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} - name: Merge and upload coverage data if: matrix.build_type == 'debug' diff --git a/poetry.lock b/poetry.lock index 011d5d7817..7b368cd3b4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -79,37 +79,35 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" -version = "2.10.0" +version = "2.13.1" description = "Allure pytest integration" category = "main" optional = false python-versions = "*" files = [ - {file = "allure-pytest-2.10.0.tar.gz", hash = "sha256:3b2ab67629f4cbd8617abd817d2b22292c6eb7efd5584f992d1af8143aea6ee7"}, - {file = "allure_pytest-2.10.0-py3-none-any.whl", hash = "sha256:08274096594758447db54c3b2c382526ee04f1fe12119cdaee92d2d93c84b530"}, + {file = "allure-pytest-2.13.1.tar.gz", hash = "sha256:68d69456eeb65af4061ec06a80bc941163b0616e8216554d36b070a6bf070e08"}, + {file = "allure_pytest-2.13.1-py3-none-any.whl", hash = "sha256:a8de2fc3b3effe2d8f98801646920de3f055b779710f4c806dbee7c613c24633"}, ] [package.dependencies] -allure-python-commons = "2.10.0" +allure-python-commons = "2.13.1" pytest = ">=4.5.0" -six = ">=1.9.0" [[package]] name = "allure-python-commons" -version = "2.10.0" +version = "2.13.1" description = "Common module for integrate allure with python-based frameworks" category = "main" optional = false -python-versions = ">=3.5" +python-versions = ">=3.6" files = [ - {file = "allure-python-commons-2.10.0.tar.gz", hash = "sha256:d4d31344b0f0037a4a11e16b91b28cf0eeb23ffa0e50c27fcfc6aabe72212d3c"}, - {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, + {file = "allure-python-commons-2.13.1.tar.gz", hash = "sha256:3fc13e1da8ebb23f9ab5c9c72ad04595023cdd5078dbb8604939997faebed5cb"}, + {file = "allure_python_commons-2.13.1-py3-none-any.whl", hash = "sha256:d08e04867bddf44fef55def3d67f4bc25af58a1bf9fcffcf4ec3331f7f2ef0d0"}, ] [package.dependencies] attrs = ">=16.0.0" pluggy = ">=0.4.0" -six = ">=1.9.0" [[package]] name = "async-timeout" @@ -1932,6 +1930,22 @@ pytest = [ {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, ] +[[package]] +name = "pytest-rerunfailures" +version = "11.1.2" +description = "pytest plugin to re-run tests to eliminate flaky failures" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"}, + {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"}, +] + +[package.dependencies] +packaging = ">=17.1" +pytest = ">=5.3" + [[package]] name = "pytest-timeout" version = "2.1.0" @@ -2597,4 +2611,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "2515a9320c2960076012fbc036fb33c4f6a23515c8d143785931dc18c6722d91" +content-hash = "b689ffd6eae32b966f1744b5ac3343fe0dd26b31ee1f50e13daf5045ee0623e1" diff --git a/pyproject.toml b/pyproject.toml index f21c12b2e3..a51e91782e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 
+26,7 @@ prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" Werkzeug = "^2.2.3" pytest-order = "^1.0.1" -allure-pytest = "^2.10.0" +allure-pytest = "^2.13.1" pytest-asyncio = "^0.19.0" toml = "^0.10.2" psutil = "^5.9.4" @@ -34,6 +34,7 @@ types-psutil = "^5.9.5.4" types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" aiohttp = "3.7.4" +pytest-rerunfailures = "^11.1.2" [tool.poetry.group.dev.dependencies] black = "^23.1.0" @@ -69,6 +70,9 @@ strict = true module = [ "asyncpg.*", "pg8000.*", + "allure.*", + "allure_commons.*", + "allure_pytest.*", ] ignore_missing_imports = true diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py new file mode 100755 index 0000000000..829cc814e8 --- /dev/null +++ b/scripts/flaky_tests.py @@ -0,0 +1,87 @@ +#! /usr/bin/env python3 + +import argparse +import json +import logging +from collections import defaultdict +from typing import DefaultDict, Dict + +import psycopg2 +import psycopg2.extras + +# We call the test "flaky" if it failed at least once on the main branch in the last N=10 days. +FLAKY_TESTS_QUERY = """ + SELECT + DISTINCT parent_suite, suite, test + FROM + ( + SELECT + revision, + jsonb_array_elements(data -> 'children') -> 'name' as parent_suite, + jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite, + jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test, + jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status, + to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp + FROM + regress_test_results + WHERE + reference = 'refs/heads/main' + ) data + WHERE + timestamp > CURRENT_DATE - INTERVAL '%s' day + AND status::text IN ('"failed"', '"broken"') + ; +""" + + +def main(args: argparse.Namespace): + connstr = args.connstr + interval_days = args.days + output = args.output + + res: DefaultDict[str, DefaultDict[str, Dict[str, bool]]] + res = defaultdict(lambda: defaultdict(dict)) + + logging.info("connecting to the database...") + with psycopg2.connect(connstr, connect_timeout=10) as conn: + with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: + logging.info("fetching flaky tests...") + cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) + rows = cur.fetchall() + + for row in rows: + logging.info(f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{row['test']}") + res[row["parent_suite"]][row["suite"]][row["test"]] = True + + logging.info(f"saving results to {output.name}") + json.dump(res, output, indent=2) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days") + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="flaky.json", + help="path to output json file (default: flaky.json)", + ) + parser.add_argument( + "--days", + required=False, + default=10, + type=int, + help="how many days to look back for flaky tests (default: 10)", + ) + parser.add_argument( + "connstr", + help="connection string to the test results database", + ) + args = parser.parse_args() + + level = logging.INFO + logging.basicConfig( + format="%(message)s", + level=level, + ) + + main(args) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 8b7f6a2eea..75242b84ce 100644 --- a/test_runner/conftest.py +++ 
b/test_runner/conftest.py @@ -4,4 +4,5 @@ pytest_plugins = ( "fixtures.pg_stats", "fixtures.compare_fixtures", "fixtures.slow", + "fixtures.flaky", ) diff --git a/test_runner/fixtures/flaky.py b/test_runner/fixtures/flaky.py new file mode 100644 index 0000000000..9d7f8ead9a --- /dev/null +++ b/test_runner/fixtures/flaky.py @@ -0,0 +1,58 @@ +import json +from pathlib import Path +from typing import List + +import pytest +from _pytest.config import Config +from _pytest.config.argparsing import Parser +from allure_commons.types import LabelType +from allure_pytest.utils import allure_name, allure_suite_labels + +from fixtures.log_helper import log + +""" +The plugin reruns flaky tests. +It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py` + +Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers +""" + + +def pytest_addoption(parser: Parser): + parser.addoption( + "--flaky-tests-json", + action="store", + type=Path, + help="Path to json file with flaky tests generated by scripts/flaky_tests.py", + ) + + +def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]): + if not config.getoption("--flaky-tests-json"): + return + + # Any error with getting flaky tests aren't critical, so just do not rerun any tests + flaky_json = config.getoption("--flaky-tests-json") + if not flaky_json.exists(): + return + + content = flaky_json.read_text() + try: + flaky_tests = json.loads(content) + except ValueError: + log.error(f"Can't parse {content} as json") + return + + for item in items: + # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB) + # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100 + allure_labels = dict(allure_suite_labels(item)) + parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE)) + suite = str(allure_labels.get(LabelType.SUITE)) + params = item.callspec.params if hasattr(item, "callspec") else {} + name = allure_name(item, params) + + if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False): + # Rerun 3 times = 1 original run + 2 reruns + log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times") + item.add_marker(pytest.mark.flaky(reruns=2)) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index ce03658e8f..1e15fea3c2 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -7,7 +7,7 @@ import time from pathlib import Path from typing import Any, Callable, Dict, List, Tuple, TypeVar -import allure # type: ignore +import allure from psycopg2.extensions import cursor from fixtures.log_helper import log From 1d23b5d1de0b1b373f507d2e9407e104a171bd48 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 4 Apr 2023 12:22:47 +0100 Subject: [PATCH 02/31] Comment PR with test results (#3907) This PR adds posting a comment with test results. Each workflow run updates the comment with new results. The layout and the information that we post can be changed to our needs, right now, it contains failed tests and test which changes status after rerun (i.e. 
flaky tests) --- .github/actions/allure-report/action.yml | 44 ++++++-- .github/workflows/build_and_test.yml | 82 +++++++++++---- scripts/pr-comment-test-report.js | 125 +++++++++++++++++++++++ 3 files changed, 222 insertions(+), 29 deletions(-) create mode 100644 scripts/pr-comment-test-report.js diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml index e685006245..e35cbb20fd 100644 --- a/.github/actions/allure-report/action.yml +++ b/.github/actions/allure-report/action.yml @@ -15,10 +15,32 @@ outputs: report-url: description: 'Allure report URL' value: ${{ steps.generate-report.outputs.report-url }} + report-json-url: + description: 'Allure report JSON URL' + value: ${{ steps.generate-report.outputs.report-json-url }} runs: using: "composite" + steps: + # We're using some of env variables quite offen, so let's set them once. + # + # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1] + # + # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv + # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456 + # + - name: Set common environment variables + shell: bash -euxo pipefail {0} + run: | + echo "BUILD_TYPE=${BUILD_TYPE}" >> $GITHUB_ENV + echo "BUCKET=${BUCKET}" >> $GITHUB_ENV + echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV + env: + BUILD_TYPE: ${{ inputs.build_type }} + BUCKET: neon-github-public-dev + TEST_OUTPUT: /tmp/test_output + - name: Validate input parameters shell: bash -euxo pipefail {0} run: | @@ -84,8 +106,6 @@ runs: env: REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - TEST_OUTPUT: /tmp/test_output - BUCKET: neon-github-public-dev TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} shell: bash -euxo pipefail {0} run: | @@ -104,7 +124,7 @@ runs: EOF cat < $TEST_OUTPUT/allure/results/environment.properties TEST_SELECTION=${{ inputs.test_selection }} - BUILD_TYPE=${{ inputs.build_type }} + BUILD_TYPE=${BUILD_TYPE} EOF ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst" @@ -113,13 +133,12 @@ runs: tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd . 
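          # The per-run raw-results archive uploaded below is what the report-generation
          # steps further down fetch back and merge into the final Allure report.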
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}" - # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this + # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this - name: Acquire Allure lock if: ${{ inputs.action == 'generate' }} shell: bash -euxo pipefail {0} env: LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt - BUCKET: neon-github-public-dev TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} run: | LOCK_TIMEOUT=300 # seconds @@ -149,8 +168,6 @@ runs: env: REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }} - TEST_OUTPUT: /tmp/test_output - BUCKET: neon-github-public-dev shell: bash -euxo pipefail {0} run: | # Get previously uploaded data for this run @@ -186,24 +203,24 @@ runs: REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html # Generate redirect - cat < ./index.html + cat < ${TEST_OUTPUT}/allure/index.html Redirecting to ${REPORT_URL} EOF - aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" + aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html" echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY} echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT + echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT - name: Release Allure lock if: ${{ inputs.action == 'generate' && always() }} shell: bash -euxo pipefail {0} env: LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt - BUCKET: neon-github-public-dev TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }} run: | aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0 @@ -212,11 +229,16 @@ runs: aws s3 rm "s3://${BUCKET}/${LOCK_FILE}" fi + - name: Cleanup + if: always() + shell: bash -euxo pipefail {0} + run: | + rm -rf ${TEST_OUTPUT}/allure + - uses: actions/github-script@v6 if: ${{ inputs.action == 'generate' && always() }} env: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} - BUILD_TYPE: ${{ inputs.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: script: | diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8c108e7f50..68102bce84 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -374,42 +374,88 @@ jobs: # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones - merge-allure-report: + create-test-report: runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init needs: [ regress-tests, benchmarks ] if: ${{ !cancelled() }} - strategy: - fail-fast: false - matrix: - build_type: [ debug, release ] - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: Create Allure report - id: create-allure-report + steps: + - uses: actions/checkout@v3 + + - name: Create Allure report (debug) + if: ${{ !cancelled() }} + id: create-allure-report-debug uses: ./.github/actions/allure-report with: action: generate - build_type: ${{ matrix.build_type }} 
+ build_type: debug + + - name: Create Allure report (release) + if: ${{ !cancelled() }} + id: create-allure-report-release + uses: ./.github/actions/allure-report + with: + action: generate + build_type: release + + - uses: actions/github-script@v6 + if: > + !cancelled() && + github.event_name == 'pull_request' && ( + steps.create-allure-report-debug.outputs.report-url || + steps.create-allure-report-release.outputs.report-url + ) + with: + script: | + const reports = [{ + buildType: "debug", + reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}", + jsonUrl: "${{ steps.create-allure-report-debug.outputs.report-json-url }}", + }, { + buildType: "release", + reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}", + jsonUrl: "${{ steps.create-allure-report-release.outputs.report-json-url }}", + }] + + const script = require("./scripts/pr-comment-test-report.js") + await script({ + github, + context, + fetch, + reports, + }) - name: Store Allure test stat in the DB - if: ${{ steps.create-allure-report.outputs.report-url }} + if: > + !cancelled() && ( + steps.create-allure-report-debug.outputs.report-url || + steps.create-allure-report-release.outputs.report-url + ) env: - BUILD_TYPE: ${{ matrix.build_type }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} - REPORT_URL: ${{ steps.create-allure-report.outputs.report-url }} + REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }} + REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} run: | - curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json ./scripts/pysync - DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json + for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do + if [ -z "$report_url" ]; then + continue + fi + + if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then + BUILD_TYPE=debug + else + BUILD_TYPE=release + fi + + curl --fail --output suites.json "${report_url}" + DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json + done coverage-report: runs-on: [ self-hosted, gen3, small ] diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js new file mode 100644 index 0000000000..d2c5aebc4f --- /dev/null +++ b/scripts/pr-comment-test-report.js @@ -0,0 +1,125 @@ +// +// The script parses Allure reports and posts a comment with a summary of the test results to the PR. +// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated). +// +// The comment is updated on each run with the latest results. 
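+// An existing comment is located by the marker string at the start of its body and by the
+// github-actions[bot] user id; if found it is updated in place, otherwise a new comment is created.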
+// +// It is designed to be used with actions/github-script from GitHub Workflows: +// - uses: actions/github-script@v6 +// with: +// script: | +// const script = require("./scripts/pr-comment-test-report.js") +// await script({ +// github, +// context, +// fetch, +// reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties +// }) +// + +module.exports = async ({ github, context, fetch, reports }) => { + // Marker to find the comment in the subsequent runs + const startMarker = `` + // GitHub bot id taken from (https://api.github.com/users/github-actions[bot]) + const githubActionsBotId = 41898282 + // The latest commit in the PR URL + const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.sha}` + // Commend body itself + let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n` + + // Common parameters for GitHub API requests + const ownerRepoParams = { + owner: context.repo.owner, + repo: context.repo.repo, + } + + for (const report of reports) { + const {buildType, reportUrl, jsonUrl} = report + + if (!reportUrl || !jsonUrl) { + console.warn(`"reportUrl" or "jsonUrl" aren't set for ${buildType} build`) + continue + } + + const suites = await (await fetch(jsonUrl)).json() + + // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests. + // For this report it's ok to treat them in the same way (as failed). + failedTests = [] + passedTests = [] + skippedTests = [] + + retriedTests = [] + retriedStatusChangedTests = [] + + for (const parentSuite of suites.children) { + for (const suite of parentSuite.children) { + for (const test of suite.children) { + pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}` + test.pytestName = pytestName + + if (test.status === "passed") { + passedTests.push(test); + } else if (test.status === "failed" || test.status === "broken") { + failedTests.push(test); + } else if (test.status === "skipped") { + skippedTests.push(test); + } + + if (test.retriesCount > 0) { + retriedTests.push(test); + + if (test.retriedStatusChangedTests) { + retriedStatusChangedTests.push(test); + } + } + } + } + } + + const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length + commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} ([full report](${reportUrl}))\n` + if (failedTests.length > 0) { + commentBody += `Failed tests:\n` + for (const test of failedTests) { + const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}` + + commentBody += `- [\`${test.pytestName}\`](${allureLink})` + if (test.retriesCount > 0) { + commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))` + } + commentBody += "\n" + } + commentBody += "\n" + } + if (retriedStatusChangedTests > 0) { + commentBody += `Flaky tests:\n` + for (const test of retriedStatusChangedTests) { + const status = test.status === "passed" ? 
":white_check_mark:" : ":x:" + commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n` + } + commentBody += "\n" + } + commentBody += "___\n" + } + + const { data: comments } = await github.rest.issues.listComments({ + issue_number: context.payload.number, + ...ownerRepoParams, + }) + + const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker)) + if (comment) { + await github.rest.issues.updateComment({ + comment_id: comment.id, + body: commentBody, + ...ownerRepoParams, + }) + } else { + await github.rest.issues.createComment({ + issue_number: context.payload.number, + body: commentBody, + ...ownerRepoParams, + }) + } +} From 957acb51b555b467551e43349847b9079f349cf2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 4 Apr 2023 17:06:10 +0100 Subject: [PATCH 03/31] GitHub Autocomment: Fix the link to the latest commit (#3952) --- scripts/pr-comment-test-report.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js index d2c5aebc4f..8df7248c4e 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/pr-comment-test-report.js @@ -23,7 +23,7 @@ module.exports = async ({ github, context, fetch, reports }) => { // GitHub bot id taken from (https://api.github.com/users/github-actions[bot]) const githubActionsBotId = 41898282 // The latest commit in the PR URL - const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.sha}` + const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}` // Commend body itself let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n` From c3ca48c62bad8ed070a041433e36d3d5913ebcb8 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 5 Apr 2023 02:29:05 +0300 Subject: [PATCH 04/31] Support extra domain names for proxy. Make it possible to specify directory where proxy will look up for extra certificates. Proxy will iterate through subdirs of that directory and load `key.pem` and `cert.pem` files from each subdir. Certs directory structure may look like that: certs |--example.com | |--key.pem | |--cert.pem |--foo.bar |--key.pem |--cert.pem Actual domain names are taken from certs and key, subdir names are ignored. 
--- proxy/src/auth/backend/hacks.rs | 2 +- proxy/src/auth/credentials.rs | 103 ++++++++++++-------- proxy/src/config.rs | 161 +++++++++++++++++++++++++------- proxy/src/main.rs | 12 ++- proxy/src/proxy.rs | 8 +- proxy/src/proxy/tests.rs | 4 +- 6 files changed, 206 insertions(+), 84 deletions(-) diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index f710581cb2..d45806461e 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -53,7 +53,7 @@ pub async fn password_hack( .await?; info!(project = &payload.project, "received missing parameter"); - creds.project = Some(payload.project.into()); + creds.project = Some(payload.project); let mut node = api.wake_compute(extra, creds).await?; node.config.password(payload.password); diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index c556c33197..b21cd79ddf 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -2,7 +2,7 @@ use crate::error::UserFacingError; use pq_proto::StartupMessageParams; -use std::borrow::Cow; +use std::collections::HashSet; use thiserror::Error; use tracing::info; @@ -19,11 +19,10 @@ pub enum ClientCredsParseError { InconsistentProjectNames { domain: String, option: String }, #[error( - "SNI ('{}') inconsistently formatted with respect to common name ('{}'). \ - SNI should be formatted as '.{}'.", - .sni, .cn, .cn, + "Common name inferred from SNI ('{}') is not known", + .cn, )] - InconsistentSni { sni: String, cn: String }, + UnknownCommonName { cn: String }, #[error("Project name ('{0}') must contain only alphanumeric characters and hyphen.")] MalformedProjectName(String), @@ -37,7 +36,7 @@ impl UserFacingError for ClientCredsParseError {} pub struct ClientCredentials<'a> { pub user: &'a str, // TODO: this is a severe misnomer! We should think of a new name ASAP. - pub project: Option>, + pub project: Option, } impl ClientCredentials<'_> { @@ -51,7 +50,7 @@ impl<'a> ClientCredentials<'a> { pub fn parse( params: &'a StartupMessageParams, sni: Option<&str>, - common_name: Option<&str>, + common_names: Option>, ) -> Result { use ClientCredsParseError::*; @@ -60,37 +59,43 @@ impl<'a> ClientCredentials<'a> { let user = get_param("user")?; // Project name might be passed via PG's command-line options. - let project_option = params.options_raw().and_then(|mut options| { - options - .find_map(|opt| opt.strip_prefix("project=")) - .map(Cow::Borrowed) - }); + let project_option = params + .options_raw() + .and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project="))) + .map(|name| name.to_string()); - // Alternative project name is in fact a subdomain from SNI. - // NOTE: we do not consider SNI if `common_name` is missing. 
- let project_domain = sni - .zip(common_name) - .map(|(sni, cn)| { - subdomain_from_sni(sni, cn) - .ok_or_else(|| InconsistentSni { - sni: sni.into(), - cn: cn.into(), + let project_from_domain = if let Some(sni_str) = sni { + if let Some(cn) = common_names { + let common_name_from_sni = sni_str.split_once('.').map(|(_, domain)| domain); + + let project = common_name_from_sni + .and_then(|domain| { + if cn.contains(domain) { + subdomain_from_sni(sni_str, domain) + } else { + None + } }) - .map(Cow::<'static, str>::Owned) - }) - .transpose()?; + .ok_or_else(|| UnknownCommonName { + cn: common_name_from_sni.unwrap_or("").into(), + })?; - let project = match (project_option, project_domain) { + Some(project) + } else { + None + } + } else { + None + }; + + let project = match (project_option, project_from_domain) { // Invariant: if we have both project name variants, they should match. (Some(option), Some(domain)) if option != domain => { - Some(Err(InconsistentProjectNames { - domain: domain.into(), - option: option.into(), - })) + Some(Err(InconsistentProjectNames { domain, option })) } // Invariant: project name may not contain certain characters. (a, b) => a.or(b).map(|name| match project_name_valid(&name) { - false => Err(MalformedProjectName(name.into())), + false => Err(MalformedProjectName(name)), true => Ok(name), }), } @@ -149,9 +154,9 @@ mod tests { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = Some("foo.localhost"); - let common_name = Some("localhost"); + let common_names = Some(["localhost".into()].into()); - let creds = ClientCredentials::parse(&options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_names)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("foo")); @@ -177,24 +182,41 @@ mod tests { let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]); let sni = Some("baz.localhost"); - let common_name = Some("localhost"); + let common_names = Some(["localhost".into()].into()); - let creds = ClientCredentials::parse(&options, sni, common_name)?; + let creds = ClientCredentials::parse(&options, sni, common_names)?; assert_eq!(creds.user, "john_doe"); assert_eq!(creds.project.as_deref(), Some("baz")); Ok(()) } + #[test] + fn parse_multi_common_names() -> anyhow::Result<()> { + let options = StartupMessageParams::new([("user", "john_doe")]); + + let common_names = Some(["a.com".into(), "b.com".into()].into()); + let sni = Some("p1.a.com"); + let creds = ClientCredentials::parse(&options, sni, common_names)?; + assert_eq!(creds.project.as_deref(), Some("p1")); + + let common_names = Some(["a.com".into(), "b.com".into()].into()); + let sni = Some("p1.b.com"); + let creds = ClientCredentials::parse(&options, sni, common_names)?; + assert_eq!(creds.project.as_deref(), Some("p1")); + + Ok(()) + } + #[test] fn parse_projects_different() { let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=first")]); let sni = Some("second.localhost"); - let common_name = Some("localhost"); + let common_names = Some(["localhost".into()].into()); - let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail"); match err { InconsistentProjectNames { domain, option } => { assert_eq!(option, "first"); @@ -209,13 +231,12 @@ mod tests { let options = StartupMessageParams::new([("user", "john_doe")]); let sni = 
Some("project.localhost"); - let common_name = Some("example.com"); + let common_names = Some(["example.com".into()].into()); - let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"); + let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail"); match err { - InconsistentSni { sni, cn } => { - assert_eq!(sni, "project.localhost"); - assert_eq!(cn, "example.com"); + UnknownCommonName { cn } => { + assert_eq!(cn, "localhost"); } _ => panic!("bad error: {err:?}"), } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 600db7f8ec..9f6241d733 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,6 +1,12 @@ use crate::auth; -use anyhow::{bail, ensure, Context}; -use std::{str::FromStr, sync::Arc, time::Duration}; +use anyhow::{bail, ensure, Context, Ok}; +use rustls::sign; +use std::{ + collections::{HashMap, HashSet}, + str::FromStr, + sync::Arc, + time::Duration, +}; pub struct ProxyConfig { pub tls_config: Option, @@ -16,7 +22,7 @@ pub struct MetricCollectionConfig { pub struct TlsConfig { pub config: Arc, - pub common_name: Option, + pub common_names: Option>, } impl TlsConfig { @@ -26,28 +32,33 @@ impl TlsConfig { } /// Configure TLS for the main endpoint. -pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) - .context(format!("Failed to read TLS keys at '{key_path}'"))?; +pub fn configure_tls( + key_path: &str, + cert_path: &str, + certs_dir: Option<&String>, +) -> anyhow::Result { + let mut cert_resolver = CertResolver::new(); - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - keys.pop().map(rustls::PrivateKey).unwrap() - }; + // add default certificate + cert_resolver.add_cert(key_path, cert_path)?; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + // add extra certificates + if let Some(certs_dir) = certs_dir { + for entry in std::fs::read_dir(certs_dir)? { + let entry = entry?; + let path = entry.path(); + if path.is_dir() { + let key_path = path.join("key.pem"); + let cert_path = path.join("cert.pem"); + if key_path.exists() && cert_path.exists() { + cert_resolver + .add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; + } + } + } + } - let cert_chain = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .context(format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." - ))? 
- .into_iter() - .map(rustls::Certificate) - .collect() - }; + let common_names = cert_resolver.get_common_names(); let config = rustls::ServerConfig::builder() .with_safe_default_cipher_suites() @@ -55,27 +66,105 @@ pub fn configure_tls(key_path: &str, cert_path: &str) -> anyhow::Result>, +} + +impl CertResolver { + fn new() -> Self { + Self { + certs: HashMap::new(), + } + } + + fn add_cert(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + let priv_key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to read TLS keys at '{key_path}'"))?; + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + keys.pop().map(rustls::PrivateKey).unwrap() + }; + + let key = sign::any_supported_type(&priv_key).context("invalid private key")?; + + let cert_chain_bytes = std::fs::read(cert_path) + .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + + let cert_chain = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .context(format!( + "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + ))? + .into_iter() + .map(rustls::Certificate) + .collect() + }; + + let common_name = { + let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes) + .context(format!( + "Failed to parse PEM object from bytes from file at '{cert_path}'." + ))? + .1; + let common_name = pem.parse_x509()?.subject().to_string(); + common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + } + .context(format!( + "Failed to parse common name from certificate at '{cert_path}'." + ))?; + + self.certs.insert( + common_name, + Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key)), + ); + + Ok(()) + } + + fn get_common_names(&self) -> HashSet { + self.certs.keys().map(|s| s.to_string()).collect() + } +} + +impl rustls::server::ResolvesServerCert for CertResolver { + fn resolve( + &self, + _client_hello: rustls::server::ClientHello, + ) -> Option> { + // loop here and cut off more and more subdomains until we find + // a match to get a proper wildcard support. OTOH, we now do not + // use nested domains, so keep this simple for now. + // + // With the current coding foo.com will match *.foo.com and that + // repeats behavior of the old code. + if let Some(mut sni_name) = _client_hello.server_name() { + loop { + if let Some(cert) = self.certs.get(sni_name) { + return Some(cert.clone()); + } + if let Some((_, rest)) = sni_name.split_once('.') { + sni_name = rest; + } else { + return None; + } + } + } else { + None + } + } +} + /// Helper for cmdline cache options parsing. pub struct CacheOptions { /// Max number of entries. 
diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 85478da3bc..c6526e9aff 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -132,7 +132,11 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> args.get_one::("tls-key"), args.get_one::("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => Some(config::configure_tls(key_path, cert_path)?), + (Some(key_path), Some(cert_path)) => Some(config::configure_tls( + key_path, + cert_path, + args.get_one::("certs-dir"), + )?), (None, None) => None, _ => bail!("either both or neither tls-key and tls-cert must be specified"), }; @@ -254,6 +258,12 @@ fn cli() -> clap::Command { .alias("ssl-cert") // backwards compatibility .help("path to TLS cert for client postgres connections"), ) + // tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir + .arg( + Arg::new("certs-dir") + .long("certs-dir") + .help("path to directory with TLS certificates for client postgres connections"), + ) .arg( Arg::new("metric-collection-endpoint") .long("metric-collection-endpoint") diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 03c9c72f30..70fb25474e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -124,11 +124,11 @@ pub async fn handle_ws_client( // Extract credentials which we're going to use for auth. let creds = { - let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let common_names = tls.and_then(|tls| tls.common_names.clone()); let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_name)) + .map(|_| auth::ClientCredentials::parse(¶ms, hostname, common_names)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? @@ -163,11 +163,11 @@ async fn handle_client( // Extract credentials which we're going to use for auth. let creds = { let sni = stream.get_ref().sni_hostname(); - let common_name = tls.and_then(|tls| tls.common_name.as_deref()); + let common_names = tls.and_then(|tls| tls.common_names.clone()); let result = config .auth_backend .as_ref() - .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name)) + .map(|_| auth::ClientCredentials::parse(¶ms, sni, common_names)) .transpose(); async { result }.or_else(|e| stream.throw_error(e)).await? diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index ed429df421..60acb588dc 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -54,9 +54,11 @@ fn generate_tls_config<'a>( .with_single_cert(vec![cert], key)? 
.into(); + let common_names = Some([common_name.to_owned()].iter().cloned().collect()); + TlsConfig { config, - common_name: Some(common_name.to_string()), + common_names, } }; From d8df5237fa3cb629c2afc6d65e960b10cd0fad8a Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 5 Apr 2023 20:56:52 +0300 Subject: [PATCH 05/31] Aligne extra certificate name with default cert-manager names --- proxy/src/config.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9f6241d733..5f9585149e 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -48,8 +48,9 @@ pub fn configure_tls( let entry = entry?; let path = entry.path(); if path.is_dir() { - let key_path = path.join("key.pem"); - let cert_path = path.join("cert.pem"); + // file names aligned with default cert-manager names + let key_path = path.join("tls.key"); + let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { cert_resolver .add_cert(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; From 9310949b44b7c9d5f89e6efa2322e6b841c9dafb Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 5 Apr 2023 20:08:06 +0100 Subject: [PATCH 06/31] GitHub Autocomment: Retry on server errors (#3958) Retry posting/updating a comment in case of 5XX errors from GitHub API --- .github/workflows/build_and_test.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 68102bce84..56c0aa8f9a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -409,6 +409,8 @@ jobs: steps.create-allure-report-release.outputs.report-url ) with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const reports = [{ buildType: "debug", From b17c24fa38c3985e8a03b1f839de2e5d831d1e3d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 6 Apr 2023 12:47:21 +0300 Subject: [PATCH 07/31] fix: settle down to configured percent (#3947) in real env testing we noted that the disk-usage based eviction sails 1 percentage point above the configured value, which might be a source of confusion, so it might be better to get rid of that confusion now. confusion: "I configured 85% but pageserver sails at 86%". 
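Informally, the new rule can be sketched as a tiny standalone program (it
mirrors the max_usage_pct_pressure unit test added below and is not the
actual pageserver code):

    fn has_pressure(total: u64, avail: u64, max_usage_pct: u64) -> bool {
        // integer percentage of used space; pressure now starts at exactly
        // the configured percentage (">=" instead of ">")
        (total - avail) * 100 / total >= max_usage_pct
    }

    fn main() {
        // with an 85% limit and 100_000 bytes total:
        assert!(has_pressure(100_000, 15_000, 85));  // exactly 85% used -> pressure
        assert!(!has_pressure(100_000, 15_001, 85)); // 84.999% used -> no pressure
    }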
Co-authored-by: Christian Schwarz --- libs/utils/src/serde_percent.rs | 8 +++++ pageserver/src/disk_usage_eviction_task.rs | 41 +++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs index 63b62b5f1e..36e874a161 100644 --- a/libs/utils/src/serde_percent.rs +++ b/libs/utils/src/serde_percent.rs @@ -11,6 +11,14 @@ use serde::{Deserialize, Serialize}; pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); impl Percent { + pub const fn new(pct: u8) -> Option { + if pct <= 100 { + Some(Percent(pct)) + } else { + None + } + } + pub fn get(&self) -> u8 { self.0 } diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index eeeb6fda89..f4a0f3f18e 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -639,7 +639,7 @@ mod filesystem_level_usage { ), ( "max_usage_pct", - usage_pct > self.config.max_usage_pct.get() as u64, + usage_pct >= self.config.max_usage_pct.get() as u64, ), ]; @@ -686,4 +686,43 @@ mod filesystem_level_usage { avail_bytes, }) } + + #[test] + fn max_usage_pct_pressure() { + use super::Usage as _; + use std::time::Duration; + use utils::serde_percent::Percent; + + let mut usage = Usage { + config: &DiskUsageEvictionTaskConfig { + max_usage_pct: Percent::new(85).unwrap(), + min_avail_bytes: 0, + period: Duration::MAX, + #[cfg(feature = "testing")] + mock_statvfs: None, + }, + total_bytes: 100_000, + avail_bytes: 0, + }; + + assert!(usage.has_pressure(), "expected pressure at 100%"); + + usage.add_available_bytes(14_000); + assert!(usage.has_pressure(), "expected pressure at 86%"); + + usage.add_available_bytes(999); + assert!(usage.has_pressure(), "expected pressure at 85.001%"); + + usage.add_available_bytes(1); + assert!(usage.has_pressure(), "expected pressure at precisely 85%"); + + usage.add_available_bytes(1); + assert!(!usage.has_pressure(), "no pressure at 84.999%"); + + usage.add_available_bytes(999); + assert!(!usage.has_pressure(), "no pressure at 84%"); + + usage.add_available_bytes(16_000); + assert!(!usage.has_pressure()); + } } From 9db70f6232aa78823ebda8552b89b6bb1a61ee12 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Thu, 6 Apr 2023 14:02:56 +0400 Subject: [PATCH 08/31] Added disk_size and instance_type to payload (#3918) ## Describe your changes In https://github.com/neondatabase/cloud/issues/4354 we are making scheduling of projects based on available disk space and overcommit, so we need to know disk size and just in case instance type of the pageserver ## Issue ticket number and link https://github.com/neondatabase/cloud/issues/4354 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] ~If it is a core feature, I have added thorough tests.~ - [ ] ~Do we need to implement analytics? 
if so did you add the relevant metrics to the dashboard?~ - [ ] ~If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.~ --- .github/ansible/scripts/init_pageserver.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/ansible/scripts/init_pageserver.sh b/.github/ansible/scripts/init_pageserver.sh index e7d6efadae..d88f754a86 100644 --- a/.github/ansible/scripts/init_pageserver.sh +++ b/.github/ansible/scripts/init_pageserver.sh @@ -3,6 +3,8 @@ # fetch params from meta-data service INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) AZ_ID=$(curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone) +INSTANCE_TYPE=$(curl -s http://169.254.169.254/latest/meta-data/instance-type) +DISK_SIZE=$(df -B1 /storage | tail -1 | awk '{print $2}') # store fqdn hostname in var HOST=$(hostname -f) @@ -18,7 +20,9 @@ cat < Date: Thu, 6 Apr 2023 12:53:58 +0200 Subject: [PATCH 09/31] Allow installation of `pg_stat_statements` --- Dockerfile.compute-node | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index ef861b15be..92a1bb69e5 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -38,6 +38,7 @@ RUN cd postgres && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/moddatetime.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_stat_statements.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \ From 887cee64e26754c47be8d5a641f0923f95a616ee Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 6 Apr 2023 13:52:41 +0100 Subject: [PATCH 10/31] test_runner: add links to grafana for remote tests (#3961) Add Grafana links to allure reports to make it easier to debug perf test failures --- test_runner/fixtures/neon_fixtures.py | 10 ++++++- test_runner/fixtures/utils.py | 42 +++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a232bf8b6d..c24158e9ec 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -17,6 +17,7 @@ import uuid from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass, field +from datetime import datetime from enum import Flag, auto from functools import cached_property from itertools import chain, product @@ -48,6 +49,7 @@ from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, Fn, + allure_add_grafana_links, allure_attach_from_dir, get_self_dir, subprocess_capture, @@ -2436,10 +2438,16 @@ def remote_pg( connstr = os.getenv("BENCHMARK_CONNSTR") if connstr is None: raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable") - + start_ms = int(datetime.utcnow().timestamp() * 1000) with RemotePostgres(pg_bin, connstr) as remote_pg: yield remote_pg + end_ms = int(datetime.utcnow().timestamp() * 1000) + host = parse_dsn(connstr).get("host", "") + if host.endswith(".neon.build"): + # Add 10s margin to the start and 
end times + allure_add_grafana_links(host, start_ms - 10_000, end_ms + 10_000) + class PSQL: """ diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 1e15fea3c2..b58539ca86 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,4 +1,5 @@ import contextlib +import json import os import re import subprocess @@ -6,6 +7,7 @@ import tarfile import time from pathlib import Path from typing import Any, Callable, Dict, List, Tuple, TypeVar +from urllib.parse import urlencode import allure from psycopg2.extensions import cursor @@ -184,6 +186,46 @@ def allure_attach_from_dir(dir: Path): allure.attach.file(source, name, attachment_type, extension) +DATASOURCE_ID = "xHHYY0dVz" + + +def allure_add_grafana_links(host: str, start_ms: int, end_ms: int): + """Add links to server logs in Grafana to Allure report""" + # We expect host to be in format like ep-divine-night-159320.us-east-2.aws.neon.build + endpoint_id, region_id, _ = host.split(".", 2) + + expressions = { + "compute logs": f'{{app="compute-node-{endpoint_id}", neon_region="{region_id}"}}', + "k8s events": f'{{job="integrations/kubernetes/eventhandler"}} |~ "name=compute-node-{endpoint_id}-"', + "console logs": f'{{neon_service="console", neon_region="{region_id}"}} | json | endpoint_id = "{endpoint_id}"', + "proxy logs": f'{{neon_service="proxy-scram", neon_region="{region_id}"}}', + } + + params: Dict[str, Any] = { + "datasource": DATASOURCE_ID, + "queries": [ + { + "expr": "", + "refId": "A", + "datasource": {"type": "loki", "uid": DATASOURCE_ID}, + "editorMode": "code", + "queryType": "range", + } + ], + "range": { + "from": str(start_ms), + "to": str(end_ms), + }, + } + for name, expr in expressions.items(): + params["queries"][0]["expr"] = expr + query_string = urlencode({"orgId": 1, "left": json.dumps(params)}) + link = f"https://neonprod.grafana.net/explore?{query_string}" + + allure.dynamic.link(link, name=name) + log.info(f"{name}: {link}") + + def start_in_background( command: list[str], cwd: Path, log_file_name: str, is_started: Fn ) -> subprocess.Popen[bytes]: From 102746bc8f0e24c6ba1ddfb88dffcea5624e442b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 6 Apr 2023 18:57:48 +0300 Subject: [PATCH 11/31] Apply clippy rule exclusion locally instead of a global approach (#3974) --- libs/pq_proto/src/lib.rs | 3 +++ run_clippy.sh | 8 +------- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index a976e19029..ed0239072a 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -293,6 +293,9 @@ impl FeStartupPacket { // We shouldn't advance `buf` as probably full message is not there yet, // so can't directly use Bytes::get_u32 etc. let len = (&buf[0..4]).read_u32::().unwrap() as usize; + // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)` + // which is less readable + #[allow(clippy::manual_range_contains)] if len < 4 || len > MAX_STARTUP_PACKET_LENGTH { return Err(ProtocolError::Protocol(format!( "invalid startup packet message length {}", diff --git a/run_clippy.sh b/run_clippy.sh index ae9482ee96..9adfddedc2 100755 --- a/run_clippy.sh +++ b/run_clippy.sh @@ -8,13 +8,7 @@ # warnings and errors right in the editor. # In vscode, this setting is Rust-analyzer>Check On Save:Command -# manual-range-contains wants -# !(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len) -# instead of -# len < 4 || len > MAX_STARTUP_PACKET_LENGTH -# , let's disagree. 
- # * `-A unknown_lints` – do not warn about unknown lint suppressions # that people with newer toolchains might use # * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status) -cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -A clippy::manual-range-contains -D warnings +cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings From 4d64edf8a552a9123590bc5a121bb79093741b68 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 6 Apr 2023 17:18:24 +0100 Subject: [PATCH 12/31] Nightly Benchmarks: Add free tier sized compute (#3969) - Add support for VMs and CU - Add free tier limited benchmark (0.25 CU) - Ensure we use 1 CU by default for pgbench workload --- .github/actions/neon-project-create/action.yml | 16 ++++++++++++++++ .github/workflows/benchmarking.yml | 11 ++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index 0480bfbc84..ae6464990e 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -14,6 +14,12 @@ inputs: api_host: desctiption: 'Neon API host' default: console.stage.neon.tech + provisioner: + desctiption: 'k8s-pod or k8s-neonvm' + default: 'k8s-pod' + compute_units: + desctiption: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal' + default: '[1, 1]' outputs: dsn: @@ -31,6 +37,10 @@ runs: # A shell without `set -x` to not to expose password/dsn in logs shell: bash -euo pipefail {0} run: | + if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then + echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU" + fi + project=$(curl \ "https://${API_HOST}/api/v2/projects" \ --fail \ @@ -42,6 +52,9 @@ runs: \"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\", \"pg_version\": ${POSTGRES_VERSION}, \"region_id\": \"${REGION_ID}\", + \"provisioner\": \"${PROVISIONER}\", + \"autoscaling_limit_min_cu\": ${MIN_CU}, + \"autoscaling_limit_max_cu\": ${MAX_CU}, \"settings\": { } } }") @@ -62,3 +75,6 @@ runs: API_KEY: ${{ inputs.api_key }} REGION_ID: ${{ inputs.region_id }} POSTGRES_VERSION: ${{ inputs.postgres_version }} + PROVISIONER: ${{ inputs.provisioner }} + MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }} + MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 16be60b1a1..425d4d76c9 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -111,6 +111,7 @@ jobs: strategy: fail-fast: false matrix: + # neon-captest-freetier: Run pgbench with freetier-limited compute # neon-captest-new: Run pgbench in a freshly created project # neon-captest-reuse: Same, but reusing existing project # neon-captest-prefetch: Same, with prefetching enabled (new project) @@ -120,6 +121,9 @@ jobs: db_size: [ 10gb ] runner: [ us-east-2 ] include: + - platform: neon-captest-freetier + db_size: 3gb + runner: us-east-2 - platform: neon-captest-prefetch db_size: 50gb runner: us-east-2 @@ -160,13 +164,14 @@ jobs: echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch", 
"neon-captest-freetier"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ env.DEFAULT_PG_VERSION }} api_key: ${{ secrets.NEON_STAGING_API_KEY }} + compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }} - name: Set up Connection String id: set-up-connstr @@ -175,7 +180,7 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new | neon-captest-prefetch) + neon-captest-new | neon-captest-prefetch | neon-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -185,7 +190,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', neon-captest-freetier, 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac From ba4a96fdb1cc08098c57e5b4f75492c5ea30345b Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 6 Apr 2023 20:57:06 +0300 Subject: [PATCH 13/31] Eagerly update wal_backup_lsn after each segment offload (#3976) Otherwise it can lag a lot, preventing WAL segments cleanup. Also max wal_backup_lsn on update, pulling it down is pointless. Should help with https://github.com/neondatabase/neon/issues/3957, but will not fix it completely. --- safekeeper/src/timeline.rs | 3 ++- safekeeper/src/wal_backup.rs | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 9dd8a63cf0..8097c863fa 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -674,7 +674,8 @@ impl Timeline { bail!(TimelineError::Cancelled(self.ttid)); } - self.write_shared_state().sk.inmem.backup_lsn = backup_lsn; + let mut state = self.write_shared_state(); + state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn); // we should check whether to shut down offloader, but this will be done // soon by peer communication anyway. 
Ok(()) diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 798b9abaf3..163ac99be8 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -323,7 +323,8 @@ impl WalBackupTask { } match backup_lsn_range( - backup_lsn, + &self.timeline, + &mut backup_lsn, commit_lsn, self.wal_seg_size, &self.timeline_dir, @@ -331,13 +332,7 @@ impl WalBackupTask { ) .await { - Ok(backup_lsn_result) => { - backup_lsn = backup_lsn_result; - let res = self.timeline.set_wal_backup_lsn(backup_lsn_result); - if let Err(e) = res { - error!("failed to set wal_backup_lsn: {}", e); - return; - } + Ok(()) => { retry_attempt = 0; } Err(e) => { @@ -354,20 +349,25 @@ impl WalBackupTask { } pub async fn backup_lsn_range( - start_lsn: Lsn, + timeline: &Arc, + backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, timeline_dir: &Path, workspace_dir: &Path, -) -> Result { - let mut res = start_lsn; +) -> Result<()> { + let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); for s in &segments { backup_single_segment(s, timeline_dir, workspace_dir) .await .with_context(|| format!("offloading segno {}", s.seg_no))?; - res = s.end_lsn; + let new_backup_lsn = s.end_lsn; + timeline + .set_wal_backup_lsn(new_backup_lsn) + .context("setting wal_backup_lsn")?; + *backup_lsn = new_backup_lsn; } info!( "offloaded segnos {:?} up to {}, previous backup_lsn {}", @@ -375,7 +375,7 @@ pub async fn backup_lsn_range( end_lsn, start_lsn, ); - Ok(res) + Ok(()) } async fn backup_single_segment( From b45c92e533d322cc693e23b6c5a19e662237c12e Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 6 Apr 2023 21:21:39 +0300 Subject: [PATCH 14/31] tests: exclude compatibility tests by default (#3975) This allows to skip compatibility tests based on `CHECK_ONDISK_DATA_COMPATIBILITY` environment variable. When the variable is missing (default) compatibility tests wont be run. --- .github/workflows/build_and_test.yml | 1 + test_runner/regress/test_compatibility.py | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 56c0aa8f9a..c096aef4a9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -338,6 +338,7 @@ jobs: rerun_flaky: true env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} + CHECK_ONDISK_DATA_COMPATIBILITY: nonempty - name: Merge and upload coverage data if: matrix.build_type == 'debug' diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index e9dadb5348..be6e1a69b2 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -34,9 +34,15 @@ from pytest import FixtureRequest # - check_neon_works performs the test itself, feel free to add more checks there. 
# +check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( + os.environ.get("CHECK_ONDISK_DATA_COMPATIBILITY") is None, + reason="CHECK_ONDISK_DATA_COMPATIBILITY env is not set", +) + # Note: if renaming this test, don't forget to update a reference to it in a workflow file: # "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml +@check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(before="test_forward_compatibility") def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): @@ -81,6 +87,7 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it +@check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_backward_compatibility( @@ -134,6 +141,7 @@ def test_backward_compatibility( ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" +@check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_forward_compatibility( From e42982fb1e0178f5efe338a20e5fd8e593aa4ffb Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 6 Apr 2023 21:21:58 +0200 Subject: [PATCH 15/31] [compute_ctl] Empty computes and /configure API (#3963) This commit adds an option to start compute without spec and then pass it a valid spec via `POST /configure` API endpoint. This is a main prerequisite for maintaining the pool of compute nodes in the control-plane. For example: 1. Start compute with ```shell cargo run --bin compute_ctl -- -i no-compute \ -p http://localhost:9095 \ -D compute_pgdata \ -C "postgresql://cloud_admin@127.0.0.1:5434/postgres" \ -b ./pg_install/v15/bin/postgres ``` 2. Configure it with ```shell curl -d "{\"spec\": $(cat ./compute-spec.json)}" http://localhost:3080/configure ``` Internally, it's implemented using a `Condvar` + `Mutex`. Compute spec is moved under Mutex, as it's now could be updated in the http handler. Also `RwLock` was replaced with `Mutex` because the latter works well with `Condvar`. 
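For illustration only, here is a minimal, self-contained sketch of the `Condvar` + `Mutex` wait pattern this commit describes. The `Status`, `Node`, and the spawned "handler" thread below are simplified stand-ins invented for the example, not the actual `ComputeStatus` / `ComputeNode` types or the real `/configure` handler added in this patch:

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::thread;

// Simplified stand-ins for ComputeStatus / ComputeNode; illustration only.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Status {
    Empty,
    ConfigurationPending,
}

struct Node {
    status: Mutex<Status>,
    state_changed: Condvar,
}

fn main() {
    let node = Arc::new(Node {
        status: Mutex::new(Status::Empty),
        state_changed: Condvar::new(),
    });

    // Stand-in for the `/configure` handler: update the state under the
    // mutex, then wake up anyone sleeping on the condition variable.
    let handler = {
        let node = Arc::clone(&node);
        thread::spawn(move || {
            let mut status = node.status.lock().unwrap();
            *status = Status::ConfigurationPending;
            node.state_changed.notify_all();
        })
    };

    // Stand-in for the main thread started without a spec: hold the lock
    // only while checking the status and sleep on the condvar otherwise.
    let mut status = node.status.lock().unwrap();
    while *status != Status::ConfigurationPending {
        status = node.state_changed.wait(status).unwrap();
    }
    println!("got spec, continue configuration: {:?}", *status);

    drop(status);
    handler.join().unwrap();
}
```

Keeping the lock held only for short status reads and writes, and re-checking the status in a loop after each wakeup, is what allows the HTTP handler to install a new spec while the startup path is blocked waiting for it.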
First part of the neondatabase/cloud#4433 --- compute_tools/src/bin/compute_ctl.rs | 153 +++++++++++++--------- compute_tools/src/compute.rs | 157 ++++++++++++++--------- compute_tools/src/http/api.rs | 128 +++++++++++++++++- compute_tools/src/http/mod.rs | 2 + compute_tools/src/http/openapi_spec.yaml | 105 +++++++++++++-- compute_tools/src/http/requests.rs | 11 ++ compute_tools/src/http/responses.rs | 40 ++++++ compute_tools/src/monitor.rs | 4 +- compute_tools/src/pg_helpers.rs | 6 +- compute_tools/src/spec.rs | 50 +++++--- 10 files changed, 498 insertions(+), 158 deletions(-) create mode 100644 compute_tools/src/http/requests.rs create mode 100644 compute_tools/src/http/responses.rs diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index f29a576413..1a3ac77af4 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,13 +34,14 @@ use std::fs::File; use std::panic; use std::path::Path; use std::process::exit; -use std::sync::{Arc, RwLock}; +use std::sync::{Arc, Condvar, Mutex}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; use tracing::{error, info}; +use url::Url; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; @@ -49,7 +50,6 @@ use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::pg_helpers::*; use compute_tools::spec::*; -use url::Url; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; @@ -62,7 +62,7 @@ fn main() -> Result<()> { let connstr = matches .get_one::("connstr") .expect("Postgres connection string is required"); - let spec = matches.get_one::("spec"); + let spec_json = matches.get_one::("spec"); let spec_path = matches.get_one::("spec-path"); let compute_id = matches.get_one::("compute-id"); @@ -71,40 +71,107 @@ fn main() -> Result<()> { // Try to use just 'postgres' if no path is provided let pgbin = matches.get_one::("pgbin").unwrap(); - let spec: ComputeSpec = match spec { + let mut spec = Default::default(); + let mut spec_set = false; + let mut live_config_allowed = false; + match spec_json { // First, try to get cluster spec from the cli argument - Some(json) => serde_json::from_str(json)?, + Some(json) => { + spec = serde_json::from_str(json)?; + spec_set = true; + } None => { // Second, try to read it from the file if path is provided if let Some(sp) = spec_path { let path = Path::new(sp); let file = File::open(path)?; - serde_json::from_reader(file)? + spec = serde_json::from_reader(file)?; + spec_set = true; } else if let Some(id) = compute_id { if let Some(cp_base) = control_plane_uri { - let cp_uri = format!("{cp_base}/management/api/v1/{id}/spec"); - let jwt: String = match std::env::var("NEON_CONSOLE_JWT") { - Ok(v) => v, - Err(_) => "".to_string(), - }; - - reqwest::blocking::Client::new() - .get(cp_uri) - .header("Authorization", jwt) - .send()? - .json()? 
+ live_config_allowed = true; + if let Ok(s) = get_spec_from_control_plane(cp_base, id) { + spec = s; + spec_set = true; + } } else { - panic!( - "must specify --control-plane-uri \"{:#?}\" and --compute-id \"{:#?}\"", - control_plane_uri, compute_id - ); + panic!("must specify both --control-plane-uri and --compute-id or none"); } } else { - panic!("compute spec should be provided via --spec or --spec-path argument"); + panic!( + "compute spec should be provided by one of the following ways: \ + --spec OR --spec-path OR --control-plane-uri and --compute-id" + ); } } }; + let mut new_state = ComputeState::new(); + if spec_set { + new_state.spec = spec; + } + let compute_node = ComputeNode { + start_time: Utc::now(), + connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, + pgdata: pgdata.to_string(), + pgbin: pgbin.to_string(), + live_config_allowed, + metrics: ComputeMetrics::default(), + state: Mutex::new(new_state), + state_changed: Condvar::new(), + }; + let compute = Arc::new(compute_node); + + // Launch http service first, so we were able to serve control-plane + // requests, while configuration is still in progress. + let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + + if !spec_set { + // No spec provided, hang waiting for it. + info!("no compute spec provided, waiting"); + let mut state = compute.state.lock().unwrap(); + while state.status != ComputeStatus::ConfigurationPending { + state = compute.state_changed.wait(state).unwrap(); + + if state.status == ComputeStatus::ConfigurationPending { + info!("got spec, continue configuration"); + // Spec is already set by the http server handler. + break; + } + } + } + + // We got all we need, fill in the state. + let mut state = compute.state.lock().unwrap(); + let pageserver_connstr = state + .spec + .cluster + .settings + .find("neon.pageserver_connstring") + .expect("pageserver connstr should be provided"); + let storage_auth_token = state.spec.storage_auth_token.clone(); + let tenant = state + .spec + .cluster + .settings + .find("neon.tenant_id") + .expect("tenant id should be provided"); + let timeline = state + .spec + .cluster + .settings + .find("neon.timeline_id") + .expect("tenant id should be provided"); + let startup_tracing_context = state.spec.startup_tracing_context.clone(); + + state.pageserver_connstr = pageserver_connstr; + state.storage_auth_token = storage_auth_token; + state.tenant = tenant; + state.timeline = timeline; + state.status = ComputeStatus::Init; + compute.state_changed.notify_all(); + drop(state); + // Extract OpenTelemetry context for the startup actions from the spec, and // attach it to the current tracing context. // @@ -120,7 +187,7 @@ fn main() -> Result<()> { // postgres is configured and up-and-running, we exit this span. Any other // actions that are performed on incoming HTTP requests, for example, are // performed in separate spans. 
- let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + let startup_context_guard = if let Some(ref carrier) = startup_tracing_context { use opentelemetry::propagation::TextMapPropagator; use opentelemetry::sdk::propagation::TraceContextPropagator; Some(TraceContextPropagator::new().extract(carrier).attach()) @@ -128,41 +195,7 @@ fn main() -> Result<()> { None }; - let pageserver_connstr = spec - .cluster - .settings - .find("neon.pageserver_connstring") - .expect("pageserver connstr should be provided"); - let storage_auth_token = spec.storage_auth_token.clone(); - let tenant = spec - .cluster - .settings - .find("neon.tenant_id") - .expect("tenant id should be provided"); - let timeline = spec - .cluster - .settings - .find("neon.timeline_id") - .expect("tenant id should be provided"); - - let compute_state = ComputeNode { - start_time: Utc::now(), - connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, - pgdata: pgdata.to_string(), - pgbin: pgbin.to_string(), - spec, - tenant, - timeline, - pageserver_connstr, - storage_auth_token, - metrics: ComputeMetrics::default(), - state: RwLock::new(ComputeState::new()), - }; - let compute = Arc::new(compute_state); - - // Launch service threads first, so we were able to serve availability - // requests, while configuration is still in progress. - let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + // Launch remaining service threads let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread"); // Start Postgres @@ -172,7 +205,7 @@ fn main() -> Result<()> { Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:?}", err); - let mut state = compute.state.write().unwrap(); + let mut state = compute.state.lock().unwrap(); state.error = Some(format!("{:?}", err)); state.status = ComputeStatus::Failed; drop(state); @@ -262,7 +295,7 @@ fn cli() -> clap::Command { Arg::new("control-plane-uri") .short('p') .long("control-plane-uri") - .value_name("CONTROL_PLANE"), + .value_name("CONTROL_PLANE_API_BASE_URI"), ) } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 00d1e234ab..3e92ec57dc 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -20,12 +20,12 @@ use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::RwLock; +use std::sync::{Condvar, Mutex}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use serde::{Serialize, Serializer}; +use serde::Serialize; use tokio_postgres; use tracing::{info, instrument, warn}; @@ -41,41 +41,52 @@ pub struct ComputeNode { pub connstr: url::Url, pub pgdata: String, pub pgbin: String, + pub metrics: ComputeMetrics, + /// We should only allow live re- / configuration of the compute node if + /// it uses 'pull model', i.e. it can go to control-plane and fetch + /// the latest configuration. 
Otherwise, there could be a case: + /// - we start compute with some spec provided as argument + /// - we push new spec and it does reconfiguration + /// - but then something happens and compute pod / VM is destroyed, + /// so k8s controller starts it again with the **old** spec + /// and the same for empty computes: + /// - we started compute without any spec + /// - we push spec and it does configuration + /// - but then it is restarted without any spec again + pub live_config_allowed: bool, + /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. + /// To allow HTTP API server to serving status requests, while configuration + /// is in progress, lock should be held only for short periods of time to do + /// read/write, not the whole configuration process. + pub state: Mutex, + /// `Condvar` to allow notifying waiters about state changes. + pub state_changed: Condvar, +} + +#[derive(Clone, Debug)] +pub struct ComputeState { + pub status: ComputeStatus, + /// Timestamp of the last Postgres activity + pub last_active: DateTime, + pub error: Option, pub spec: ComputeSpec, pub tenant: String, pub timeline: String, pub pageserver_connstr: String, pub storage_auth_token: Option, - pub metrics: ComputeMetrics, - /// Volatile part of the `ComputeNode` so should be used under `RwLock` - /// to allow HTTP API server to serve status requests, while configuration - /// is in progress. - pub state: RwLock, -} - -fn rfc3339_serialize(x: &DateTime, s: S) -> Result -where - S: Serializer, -{ - x.to_rfc3339().serialize(s) -} - -#[derive(Serialize)] -#[serde(rename_all = "snake_case")] -pub struct ComputeState { - pub status: ComputeStatus, - /// Timestamp of the last Postgres activity - #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, - pub error: Option, } impl ComputeState { pub fn new() -> Self { Self { - status: ComputeStatus::Init, + status: ComputeStatus::Empty, last_active: Utc::now(), error: None, + spec: ComputeSpec::default(), + tenant: String::new(), + timeline: String::new(), + pageserver_connstr: String::new(), + storage_auth_token: None, } } } @@ -86,11 +97,22 @@ impl Default for ComputeState { } } -#[derive(Serialize, Clone, Copy, PartialEq, Eq)] +#[derive(Serialize, Clone, Copy, PartialEq, Eq, Debug)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { + // Spec wasn't provided at start, waiting for it to be + // provided by control-plane. + Empty, + // Compute configuration was requested. + ConfigurationPending, + // Compute node has spec and initial startup and + // configuration is in progress. Init, + // Compute is configured and running. Running, + // Either startup or configuration failed, + // compute will exit soon or is waiting for + // control-plane to terminate it. Failed, } @@ -104,11 +126,13 @@ pub struct ComputeMetrics { impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { - self.state.write().unwrap().status = status; + let mut state = self.state.lock().unwrap(); + state.status = status; + self.state_changed.notify_all(); } pub fn get_status(&self) -> ComputeStatus { - self.state.read().unwrap().status + self.state.lock().unwrap().status } // Remove `pgdata` directory and create it again with right permissions. @@ -124,15 +148,15 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. 
- #[instrument(skip(self))] - fn get_basebackup(&self, lsn: &str) -> Result<()> { + #[instrument(skip(self, compute_state))] + fn get_basebackup(&self, compute_state: &ComputeState, lsn: &str) -> Result<()> { let start_time = Utc::now(); - let mut config = postgres::Config::from_str(&self.pageserver_connstr)?; + let mut config = postgres::Config::from_str(&compute_state.pageserver_connstr)?; // Use the storage auth token from the config file, if given. // Note: this overrides any password set in the connection string. - if let Some(storage_auth_token) = &self.storage_auth_token { + if let Some(storage_auth_token) = &compute_state.storage_auth_token { info!("Got storage auth token from spec file"); config.password(storage_auth_token); } else { @@ -141,8 +165,14 @@ impl ComputeNode { let mut client = config.connect(NoTls)?; let basebackup_cmd = match lsn { - "0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute - _ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn), + "0/0" => format!( + "basebackup {} {}", + &compute_state.tenant, &compute_state.timeline + ), // First start of the compute + _ => format!( + "basebackup {} {} {}", + &compute_state.tenant, &compute_state.timeline, lsn + ), }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; @@ -169,14 +199,14 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. - #[instrument(skip(self))] - fn sync_safekeepers(&self) -> Result { + #[instrument(skip(self, storage_auth_token))] + fn sync_safekeepers(&self, storage_auth_token: Option) -> Result { let start_time = Utc::now(); let sync_handle = Command::new(&self.pgbin) .args(["--sync-safekeepers"]) .env("PGDATA", &self.pgdata) // we cannot use -D in this mode - .envs(if let Some(storage_auth_token) = &self.storage_auth_token { + .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { vec![] @@ -217,9 +247,9 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. - #[instrument(skip(self))] - pub fn prepare_pgdata(&self) -> Result<()> { - let spec = &self.spec; + #[instrument(skip(self, compute_state))] + pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { + let spec = &compute_state.spec; let pgdata_path = Path::new(&self.pgdata); // Remove/create an empty pgdata directory and put configuration there. @@ -228,18 +258,18 @@ impl ComputeNode { info!("starting safekeepers syncing"); let lsn = self - .sync_safekeepers() + .sync_safekeepers(compute_state.storage_auth_token.clone()) .with_context(|| "failed to sync safekeepers")?; info!("safekeepers synced at LSN {}", lsn); info!( "getting basebackup@{} from pageserver {}", - lsn, &self.pageserver_connstr + lsn, &compute_state.pageserver_connstr ); - self.get_basebackup(&lsn).with_context(|| { + self.get_basebackup(compute_state, &lsn).with_context(|| { format!( "failed to get basebackup@{} from pageserver {}", - lsn, &self.pageserver_connstr + lsn, &compute_state.pageserver_connstr ) })?; @@ -252,13 +282,16 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. 
#[instrument(skip(self))] - pub fn start_postgres(&self) -> Result { + pub fn start_postgres( + &self, + storage_auth_token: Option, + ) -> Result { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. let mut pg = Command::new(&self.pgbin) .args(["-D", &self.pgdata]) - .envs(if let Some(storage_auth_token) = &self.storage_auth_token { + .envs(if let Some(storage_auth_token) = &storage_auth_token { vec![("NEON_AUTH_TOKEN", storage_auth_token)] } else { vec![] @@ -271,8 +304,9 @@ impl ComputeNode { Ok(pg) } - #[instrument(skip(self))] - pub fn apply_config(&self) -> Result<()> { + /// Do initial configuration of the already started Postgres. + #[instrument(skip(self, compute_state))] + pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { // If connection fails, // it may be the old node with `zenith_admin` superuser. // @@ -303,19 +337,19 @@ impl ComputeNode { }; // Proceed with post-startup configuration. Note, that order of operations is important. - handle_roles(&self.spec, &mut client)?; - handle_databases(&self.spec, &mut client)?; - handle_role_deletions(self, &mut client)?; - handle_grants(self, &mut client)?; + handle_roles(&compute_state.spec, &mut client)?; + handle_databases(&compute_state.spec, &mut client)?; + handle_role_deletions(&compute_state.spec, self.connstr.as_str(), &mut client)?; + handle_grants(&compute_state.spec, self.connstr.as_str(), &mut client)?; create_writability_check_data(&mut client)?; - handle_extensions(&self.spec, &mut client)?; + handle_extensions(&compute_state.spec, &mut client)?; // 'Close' connection drop(client); info!( "finished configuration of compute for project {}", - self.spec.cluster.cluster_id + compute_state.spec.cluster.cluster_id ); Ok(()) @@ -323,21 +357,22 @@ impl ComputeNode { #[instrument(skip(self))] pub fn start_compute(&self) -> Result { + let compute_state = self.state.lock().unwrap().clone(); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - self.spec.cluster.cluster_id, - self.spec.operation_uuid.as_ref().unwrap(), - self.tenant, - self.timeline, + compute_state.spec.cluster.cluster_id, + compute_state.spec.operation_uuid.as_ref().unwrap(), + compute_state.tenant, + compute_state.timeline, ); - self.prepare_pgdata()?; + self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres()?; + let pg = self.start_postgres(compute_state.storage_auth_token.clone())?; - self.apply_config()?; + self.apply_config(&compute_state)?; let startup_end_time = Utc::now(); self.metrics.config_ms.store( diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 199e0f3bd0..8620b10636 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,12 +3,16 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; -use crate::compute::ComputeNode; +use crate::compute::{ComputeNode, ComputeStatus}; +use crate::http::requests::ConfigurationRequest; +use crate::http::responses::{ComputeStatusResponse, GenericAPIError}; + use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use num_cpus; use serde_json; +use tokio::task; use tracing::{error, info}; use tracing_utils::http::OtelName; @@ -23,8 +27,10 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /status GET request"); - let state = compute.state.read().unwrap(); - Response::new(Body::from(serde_json::to_string(&*state).unwrap())) 
+ let state = compute.state.lock().unwrap(); + let status_response = ComputeStatusResponse::from(state.clone()); + + Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) } // Startup metrics in JSON format. Keep /metrics reserved for a possible @@ -37,12 +43,29 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /insights GET request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!("compute is not running, current status: {:?}", status); + error!(msg); + return Response::new(Body::from(msg)); + } + let insights = compute.collect_insights().await; Response::new(Body::from(insights)) } (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for check_writability request: {:?}", + status + ); + error!(msg); + return Response::new(Body::from(msg)); + } + let res = crate::checker::check_writability(compute).await; match res { Ok(_) => Response::new(Body::from("true")), @@ -61,6 +84,23 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /configure POST request"); + match handle_configure_request(req, compute).await { + Ok(msg) => Response::new(Body::from(msg)), + Err((msg, code)) => { + error!("error handling /configure request: {msg}"); + render_json_error(&msg, code) + } + } + } + // Return the `404 Not Found` for any other routes. _ => { let mut not_found = Response::new(Body::from("404 Not Found")); @@ -70,6 +110,88 @@ async fn routes(req: Request, compute: &Arc) -> Response, + compute: &Arc, +) -> Result { + if !compute.live_config_allowed { + return Err(( + "live configuration is not allowed for this compute node".to_string(), + StatusCode::PRECONDITION_FAILED, + )); + } + + let body_bytes = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let spec_raw = String::from_utf8(body_bytes.to_vec()).unwrap(); + if let Ok(request) = serde_json::from_str::(&spec_raw) { + let spec = request.spec; + // XXX: wrap state update under lock in code blocks. Otherwise, + // we will try to `Send` `mut state` into the spawned thread + // bellow, which will cause error: + // ``` + // error: future cannot be sent between threads safely + // ``` + { + let mut state = compute.state.lock().unwrap(); + if state.status != ComputeStatus::Empty { + let msg = format!( + "invalid compute status for configuration request: {:?}", + state.status.clone() + ); + return Err((msg, StatusCode::PRECONDITION_FAILED)); + } + state.spec = spec; + state.status = ComputeStatus::ConfigurationPending; + compute.state_changed.notify_all(); + drop(state); + info!("set new spec and notified waiters"); + } + + // Spawn a blocking thread to wait for compute to become Running. + // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. 
+ let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Running { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become Running, current status: {:?}", + state.status + ); + + if state.status == ComputeStatus::Failed { + let err = state.error.clone().unwrap_or("unknown error".to_string()); + let msg = format!("compute configuration failed: {:?}", err); + return Err((msg, StatusCode::INTERNAL_SERVER_ERROR)); + } + } + + Ok(()) + }) + .await + .unwrap()?; + + // Return current compute state if everything went well. + let state = compute.state.lock().unwrap().clone(); + let status_response = ComputeStatusResponse::from(state); + Ok(serde_json::to_string(&status_response).unwrap()) + } else { + Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST)) + } +} + +fn render_json_error(e: &str, status: StatusCode) -> Response { + let error = GenericAPIError { + error: e.to_string(), + }; + Response::builder() + .status(status) + .body(Body::from(serde_json::to_string(&error).unwrap())) + .unwrap() +} + // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] async fn serve(state: Arc) { diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index e5fdf85eed..e54b4e3341 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1 +1,3 @@ pub mod api; +pub mod requests; +pub mod responses; diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 5c74dfd2d2..bdb09d4a6b 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -11,7 +11,7 @@ paths: get: tags: - Info - summary: Get compute node internal status + summary: Get compute node internal status. description: "" operationId: getComputeStatus responses: @@ -26,7 +26,7 @@ paths: get: tags: - Info - summary: Get compute node startup metrics in JSON format + summary: Get compute node startup metrics in JSON format. description: "" operationId: getComputeMetricsJSON responses: @@ -41,9 +41,9 @@ paths: get: tags: - Info - summary: Get current compute insights in JSON format + summary: Get current compute insights in JSON format. description: | - Note, that this doesn't include any historical data + Note, that this doesn't include any historical data. operationId: getComputeInsights responses: 200: @@ -56,12 +56,12 @@ paths: /info: get: tags: - - "info" - summary: Get info about the compute Pod/VM + - Info + summary: Get info about the compute pod / VM. description: "" operationId: getInfo responses: - "200": + 200: description: Info content: application/json: @@ -72,7 +72,7 @@ paths: post: tags: - Check - summary: Check that we can write new data on this compute + summary: Check that we can write new data on this compute. description: "" operationId: checkComputeWritability responses: @@ -82,9 +82,64 @@ paths: text/plain: schema: type: string - description: Error text or 'true' if check passed + description: Error text or 'true' if check passed. example: "true" + /configure: + post: + tags: + - Configure + summary: Perform compute node configuration. + description: | + This is a blocking API endpoint, i.e. it blocks waiting until + compute is finished configuration and is in `Running` state. + Optional non-blocking mode could be added later. + operationId: configureCompute + requestBody: + description: Configuration request. 
+ required: true + content: + application/json: + schema: + type: object + required: + - spec + properties: + spec: + # XXX: I don't want to explain current spec in the OpenAPI format, + # as it could be changed really soon. Consider doing it later. + type: object + responses: + 200: + description: Compute configuration finished. + content: + application/json: + schema: + $ref: "#/components/schemas/ComputeState" + 400: + description: Provided spec is invalid. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 412: + description: | + It's not possible to do live-configuration of the compute. + It's either in the wrong state, or compute doesn't use pull + mode of configuration. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: | + Compute configuration request was processed, but error + occurred. Compute will likely shutdown soon. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: @@ -95,7 +150,7 @@ components: schemas: ComputeMetrics: type: object - description: Compute startup metrics + description: Compute startup metrics. required: - sync_safekeepers_ms - basebackup_ms @@ -113,7 +168,7 @@ components: Info: type: object - description: Information about VM/Pod + description: Information about VM/Pod. required: - num_cpus properties: @@ -130,17 +185,26 @@ components: $ref: '#/components/schemas/ComputeStatus' last_active: type: string - description: The last detected compute activity timestamp in UTC and RFC3339 format + description: The last detected compute activity timestamp in UTC and RFC3339 format. example: "2022-10-12T07:20:50.52Z" error: type: string - description: Text of the error during compute startup, if any + description: Text of the error during compute startup, if any. + example: "" + tenant: + type: string + description: Identifier of the current tenant served by compute node, if any. + example: c9269c359e9a199fad1ea0981246a78f + timeline: + type: string + description: Identifier of the current timeline served by compute node, if any. + example: ece7de74d4b8cbe5433a68ce4d1b97b4 ComputeInsights: type: object properties: pg_stat_statements: - description: Contains raw output from pg_stat_statements in JSON format + description: Contains raw output from pg_stat_statements in JSON format. type: array items: type: object @@ -151,6 +215,19 @@ components: - init - failed - running + example: running + + # + # Errors + # + + GenericError: + type: object + required: + - error + properties: + error: + type: string security: - JWT: [] diff --git a/compute_tools/src/http/requests.rs b/compute_tools/src/http/requests.rs new file mode 100644 index 0000000000..2e41c7aea4 --- /dev/null +++ b/compute_tools/src/http/requests.rs @@ -0,0 +1,11 @@ +use serde::Deserialize; + +use crate::spec::ComputeSpec; + +/// We now pass only `spec` in the configuration request, but later we can +/// extend it and something like `restart: bool` or something else. So put +/// `spec` into a struct initially to be more flexible in the future. 
+#[derive(Deserialize, Debug)] +pub struct ConfigurationRequest { + pub spec: ComputeSpec, +} diff --git a/compute_tools/src/http/responses.rs b/compute_tools/src/http/responses.rs new file mode 100644 index 0000000000..1ef4b380a9 --- /dev/null +++ b/compute_tools/src/http/responses.rs @@ -0,0 +1,40 @@ +use serde::{Serialize, Serializer}; + +use chrono::{DateTime, Utc}; + +use crate::compute::{ComputeState, ComputeStatus}; + +#[derive(Serialize, Debug)] +pub struct GenericAPIError { + pub error: String, +} + +#[derive(Serialize, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ComputeStatusResponse { + pub tenant: String, + pub timeline: String, + pub status: ComputeStatus, + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +impl From for ComputeStatusResponse { + fn from(state: ComputeState) -> Self { + ComputeStatusResponse { + tenant: state.tenant, + timeline: state.timeline, + status: state.status, + last_active: state.last_active, + error: state.error, + } + } +} + +fn rfc3339_serialize(x: &DateTime, s: S) -> Result +where + S: Serializer, +{ + x.to_rfc3339().serialize(s) +} diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 7c9878ffcf..a30b52aed4 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &ComputeNode) { AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors? &[], ); - let mut last_active = compute.state.read().unwrap().last_active; + let mut last_active = compute.state.lock().unwrap().last_active; if let Ok(backs) = backends { let mut idle_backs: Vec> = vec![]; @@ -87,7 +87,7 @@ fn watch_compute_activity(compute: &ComputeNode) { } // Update the last activity in the shared state if we got a more recent one. - let mut state = compute.state.write().unwrap(); + let mut state = compute.state.lock().unwrap(); if last_active > state.last_active { state.last_active = last_active; debug!("set the last compute activity time to: {}", last_active); diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 01b192b2de..38d1a6d777 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -17,7 +17,7 @@ const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // mil /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -26,7 +26,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -36,7 +36,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. 
-#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 9694ba9a88..b7f15a99d1 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -8,14 +8,13 @@ use postgres::{Client, NoTls}; use serde::Deserialize; use tracing::{info, info_span, instrument, span_enabled, warn, Level}; -use crate::compute::ComputeNode; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug, Default)] pub struct ComputeSpec { pub format_version: f32, pub timestamp: String, @@ -31,7 +30,7 @@ pub struct ComputeSpec { /// Cluster state seen from the perspective of the external tools /// like Rails web console. -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug, Default)] pub struct Cluster { pub cluster_id: String, pub name: String, @@ -47,13 +46,36 @@ pub struct Cluster { /// - DROP ROLE /// - ALTER ROLE name RENAME TO new_name /// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Deserialize)] +#[derive(Clone, Deserialize, Debug)] pub struct DeltaOp { pub action: String, pub name: PgIdent, pub new_name: Option, } +/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` +/// env variable is set, it will be used for authorization. +pub fn get_spec_from_control_plane(base_uri: &str, compute_id: &str) -> Result { + let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec"); + let jwt: String = match std::env::var("NEON_CONSOLE_JWT") { + Ok(v) => v, + Err(_) => "".to_string(), + }; + info!("getting spec from control plane: {}", cp_uri); + + // TODO: check the response. We should distinguish cases when it's + // - network error, then retry + // - no spec for compute yet, then wait + // - compute id is unknown or any other error, then bail out + let spec = reqwest::blocking::Client::new() + .get(cp_uri) + .header("Authorization", jwt) + .send()? + .json()?; + + Ok(spec) +} + /// It takes cluster specification and does the following: /// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file. /// - Update `pg_hba.conf` to allow external connections. @@ -226,8 +248,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Reassign all dependent objects and delete requested roles. #[instrument(skip_all)] -pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { - if let Some(ops) = &node.spec.delta_operations { +pub fn handle_role_deletions(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { + if let Some(ops) = &spec.delta_operations { // First, reassign all dependent objects to db owners. info!("reassigning dependent objects of to-be-deleted roles"); @@ -244,7 +266,7 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< // Check that role is still present in Postgres, as this could be a // restart with the same spec after role deletion. 
if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) { - reassign_owned_objects(node, &op.name)?; + reassign_owned_objects(spec, connstr, &op.name)?; } } @@ -268,10 +290,10 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< } // Reassign all owned objects in all databases to the owner of the database. -fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> { - for db in &node.spec.cluster.databases { +fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent) -> Result<()> { + for db in &spec.cluster.databases { if db.owner != *role_name { - let mut conf = Config::from_str(node.connstr.as_str())?; + let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); let mut client = conf.connect(NoTls)?; @@ -416,9 +438,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. #[instrument(skip_all)] -pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { - let spec = &node.spec; - +pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> { info!("cluster spec grants:"); // We now have a separate `web_access` role to connect to the database @@ -450,8 +470,8 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { // Do some per-database access adjustments. We'd better do this at db creation time, // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants // atomically. - for db in &node.spec.cluster.databases { - let mut conf = Config::from_str(node.connstr.as_str())?; + for db in &spec.cluster.databases { + let mut conf = Config::from_str(connstr)?; conf.dbname(&db.name); let mut db_client = conf.connect(NoTls)?; From 6d01d835a89d99d2314a49eb00bd4afaceee224a Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 4 Apr 2023 16:52:40 +0300 Subject: [PATCH 16/31] [proxy] Report error if proxy_io_bytes_per_client metric has decreased --- proxy/src/metrics.rs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index be22c45836..445c2e930c 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -5,7 +5,7 @@ use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use serde::Serialize; use std::collections::HashMap; -use tracing::{debug, error, info, instrument, trace}; +use tracing::{error, info, instrument, trace, warn}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -84,10 +84,14 @@ fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime))> { let value = ms.get_counter().get_value() as u64; - debug!( - "branch_id {} endpoint_id {} val: {}", - branch_id, endpoint_id, value - ); + // Report if the metric value is suspiciously large + if value > (1u64 << 40) { + warn!( + "potentially abnormal counter value: branch_id {} endpoint_id {} val: {}", + branch_id, endpoint_id, value + ); + } + current_metrics.push(( Ids { endpoint_id: endpoint_id.to_string(), @@ -124,11 +128,15 @@ async fn collect_metrics_iteration( let mut value = *curr_val; if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) { - // Only send metrics updates if the metric has changed - if curr_val - prev_val > 0 { + 
// Only send metrics updates if the metric has increased + if curr_val > prev_val { value = curr_val - prev_val; start_time = *prev_time; } else { + if curr_val < prev_val { + error!("proxy_io_bytes_per_client metric value decreased from {} to {} for key {:?}", + prev_val, curr_val, curr_key); + } return None; } }; @@ -189,7 +197,7 @@ async fn collect_metrics_iteration( }) // update cached value (add delta) and time .and_modify(|e| { - e.0 += send_metric.value; + e.0 = e.0.saturating_add(send_metric.value); e.1 = stop_time }) // cache new metric From b1c2a6384ae8fd9f1da4b3d186eb48774e0da5d7 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 6 Apr 2023 13:59:45 +0300 Subject: [PATCH 17/31] Set non-wildcard common names in link auth proxy Old coding here ignored non-wildcard common names and passed None instead. With my recent changes I started throwing an error in that case. Old logic doesn't seem to be a great choice, so instead of passing None I actually set non-wildcard common names too. That way it is possible to avoid handling cases with None in downstream code. --- proxy/src/config.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5f9585149e..ad51502b49 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -119,7 +119,18 @@ impl CertResolver { ))? .1; let common_name = pem.parse_x509()?.subject().to_string(); - common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + + // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as + // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so + // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names + // and passed None instead, which blows up number of cases downstream code should handle. Proper coding + // here should better avoid Option for common_names, and do wildcard-based certificate selection instead + // of cutting off '*.' parts. + if common_name.starts_with("CN=*.") { + common_name.strip_prefix("CN=*.").map(|s| s.to_string()) + } else { + common_name.strip_prefix("CN=").map(|s| s.to_string()) + } } .context(format!( "Failed to parse common name from certificate at '{cert_path}'." From bfeb428d1b367351df11b7b17a75cff1c3c64ff6 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 7 Apr 2023 13:47:28 +0300 Subject: [PATCH 18/31] tests: make neon_fixtures a bit thinner by splitting out some pageserver related helpers (#3977) neon_fixture is quite big and messy, lets clean it up a bit. 
--- test_runner/fixtures/neon_fixtures.py | 702 +----------------- test_runner/fixtures/pageserver/__init__.py | 0 test_runner/fixtures/pageserver/http.py | 545 ++++++++++++++ test_runner/fixtures/pageserver/utils.py | 145 ++++ test_runner/fixtures/utils.py | 16 + .../performance/test_branch_creation.py | 2 +- test_runner/regress/test_auth.py | 3 +- test_runner/regress/test_compatibility.py | 5 +- .../regress/test_disk_usage_eviction.py | 10 +- test_runner/regress/test_import.py | 3 +- test_runner/regress/test_layer_eviction.py | 3 +- test_runner/regress/test_neon_cli.py | 2 +- test_runner/regress/test_normal_work.py | 3 +- test_runner/regress/test_ondemand_download.py | 12 +- test_runner/regress/test_pageserver_api.py | 2 +- test_runner/regress/test_read_trace.py | 3 +- test_runner/regress/test_readonly_node.py | 3 +- test_runner/regress/test_remote_storage.py | 33 +- test_runner/regress/test_tenant_conf.py | 3 +- test_runner/regress/test_tenant_detach.py | 8 +- test_runner/regress/test_tenant_relocation.py | 12 +- test_runner/regress/test_tenant_size.py | 2 +- .../test_tenants_with_remote_storage.py | 6 +- test_runner/regress/test_timeline_delete.py | 3 +- test_runner/regress/test_timeline_size.py | 26 +- test_runner/regress/test_wal_acceptor.py | 3 +- .../test_walredo_not_left_behind_on_detach.py | 3 +- 27 files changed, 779 insertions(+), 779 deletions(-) create mode 100644 test_runner/fixtures/pageserver/__init__.py create mode 100644 test_runner/fixtures/pageserver/http.py create mode 100644 test_runner/fixtures/pageserver/utils.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c24158e9ec..5b6f2e5c96 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,7 +14,6 @@ import tempfile import textwrap import time import uuid -from collections import defaultdict from contextlib import closing, contextmanager from dataclasses import dataclass, field from datetime import datetime @@ -44,11 +43,11 @@ from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal from fixtures.log_helper import log -from fixtures.metrics import Metrics, parse_metrics +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, - Fn, allure_add_grafana_links, allure_attach_from_dir, get_self_dir, @@ -1120,538 +1119,6 @@ def neon_env_builder( yield builder -class PageserverApiException(Exception): - def __init__(self, message, status_code: int): - super().__init__(message) - self.status_code = status_code - - -class PageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): - super().__init__() - self.port = port - self.auth_token = auth_token - self.is_testing_enabled_or_skip = is_testing_enabled_or_skip - - if auth_token is not None: - self.headers["Authorization"] = f"Bearer {auth_token}" - - def verbose_error(self, res: requests.Response): - try: - res.raise_for_status() - except requests.RequestException as e: - try: - msg = res.json()["msg"] - except: # noqa: E722 - msg = "" - raise PageserverApiException(msg, res.status_code) from e - - def check_status(self): - self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() - - def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): - 
self.is_testing_enabled_or_skip() - - if isinstance(config_strings, tuple): - pairs = [config_strings] - else: - pairs = config_strings - - log.info(f"Requesting config failpoints: {repr(pairs)}") - - res = self.put( - f"http://localhost:{self.port}/v1/failpoints", - json=[{"name": name, "actions": actions} for name, actions in pairs], - ) - log.info(f"Got failpoints request response code {res.status_code}") - self.verbose_error(res) - res_json = res.json() - assert res_json is None - return res_json - - def tenant_list(self) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/tenant") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json={ - "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, - }, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return TenantId(new_tenant_id) - - def tenant_attach(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") - self.verbose_error(res) - - def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): - params = {} - if detach_ignored: - params["detach_ignored"] = "true" - - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) - self.verbose_error(res) - - def tenant_load(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") - self.verbose_error(res) - - def tenant_ignore(self, tenant_id: TenantId): - res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") - self.verbose_error(res) - - def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def tenant_config(self, tenant_id: TenantId) -> TenantConfig: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") - self.verbose_error(res) - return TenantConfig.from_json(res.json()) - - def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): - assert "tenant_id" not in config.keys() - res = self.put( - f"http://localhost:{self.port}/v1/tenant/config", - json={**config, "tenant_id": str(tenant_id)}, - ) - self.verbose_error(res) - - def patch_tenant_config_client_side( - self, - tenant_id: TenantId, - inserts: Optional[Dict[str, Any]] = None, - removes: Optional[List[str]] = None, - ): - current = self.tenant_config(tenant_id).tenant_specific_overrides - if inserts is not None: - current.update(inserts) - if removes is not None: - for key in removes: - del current[key] - self.set_tenant_config(tenant_id, current) - - def tenant_size(self, tenant_id: TenantId) -> int: - return self.tenant_size_and_modelinputs(tenant_id)[0] - - def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]: - """ - Returns the tenant size, together with the model inputs as the second tuple item. 
- """ - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size") - self.verbose_error(res) - res = res.json() - assert isinstance(res, dict) - assert TenantId(res["id"]) == tenant_id - size = res["size"] - assert type(size) == int - inputs = res["inputs"] - assert type(inputs) is dict - return (size, inputs) - - def tenant_size_debug(self, tenant_id: TenantId) -> str: - """ - Returns the tenant size debug info, as an HTML string - """ - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size", - headers={"Accept": "text/html"}, - ) - return res.text - - def timeline_list( - self, - tenant_id: TenantId, - include_non_incremental_logical_size: bool = False, - include_timeline_dir_layer_file_size_sum: bool = False, - ) -> List[Dict[str, Any]]: - params = {} - if include_non_incremental_logical_size: - params["include-non-incremental-logical-size"] = "true" - if include_timeline_dir_layer_file_size_sum: - params["include-timeline-dir-layer-file-size-sum"] = "true" - - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def timeline_create( - self, - tenant_id: TenantId, - new_timeline_id: Optional[TimelineId] = None, - ancestor_timeline_id: Optional[TimelineId] = None, - ancestor_start_lsn: Optional[Lsn] = None, - ) -> Dict[Any, Any]: - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", - json={ - "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, - "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, - "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, - }, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") - - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def timeline_detail( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - include_non_incremental_logical_size: bool = False, - include_timeline_dir_layer_file_size_sum: bool = False, - **kwargs, - ) -> Dict[Any, Any]: - params = {} - if include_non_incremental_logical_size: - params["include-non-incremental-logical-size"] = "true" - if include_timeline_dir_layer_file_size_sum: - params["include-timeline-dir-layer-file-size-sum"] = "true" - - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", - params=params, - **kwargs, - ) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): - res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" - ) - self.verbose_error(res) - res_json = res.json() - assert res_json is None - - def timeline_gc( - self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] - ) -> dict[str, Any]: - self.is_testing_enabled_or_skip() - - log.info( - f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" - ) - res = self.put( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", - json={"gc_horizon": gc_horizon}, - ) - log.info(f"Got GC request response code: {res.status_code}") - self.verbose_error(res) - res_json = res.json() - assert res_json is not None - assert 
isinstance(res_json, dict) - return res_json - - def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): - self.is_testing_enabled_or_skip() - - log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") - res = self.put( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" - ) - log.info(f"Got compact request response code: {res.status_code}") - self.verbose_error(res) - res_json = res.json() - assert res_json is None - - def timeline_get_lsn_by_timestamp( - self, tenant_id: TenantId, timeline_id: TimelineId, timestamp - ): - log.info( - f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" - ) - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", - ) - self.verbose_error(res) - res_json = res.json() - return res_json - - def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): - self.is_testing_enabled_or_skip() - - log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") - res = self.put( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" - ) - log.info(f"Got checkpoint request response code: {res.status_code}") - self.verbose_error(res) - res_json = res.json() - assert res_json is None - - def timeline_spawn_download_remote_layers( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - max_concurrent_downloads: int, - ) -> dict[str, Any]: - body = { - "max_concurrent_downloads": max_concurrent_downloads, - } - res = self.post( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", - json=body, - ) - self.verbose_error(res) - res_json = res.json() - assert res_json is not None - assert isinstance(res_json, dict) - return res_json - - def timeline_poll_download_remote_layers_status( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - spawn_response: dict[str, Any], - poll_state=None, - ) -> None | dict[str, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", - ) - self.verbose_error(res) - res_json = res.json() - assert res_json is not None - assert isinstance(res_json, dict) - - # assumption in this API client here is that nobody else spawns the task - assert res_json["task_id"] == spawn_response["task_id"] - - if poll_state is None or res_json["state"] == poll_state: - return res_json - return None - - def timeline_download_remote_layers( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - max_concurrent_downloads: int, - errors_ok=False, - at_least_one_download=True, - ): - res = self.timeline_spawn_download_remote_layers( - tenant_id, timeline_id, max_concurrent_downloads - ) - while True: - completed = self.timeline_poll_download_remote_layers_status( - tenant_id, timeline_id, res, poll_state="Completed" - ) - if not completed: - time.sleep(0.1) - continue - if not errors_ok: - assert completed["failed_download_count"] == 0 - if at_least_one_download: - assert completed["successful_download_count"] > 0 - return completed - - def get_metrics_str(self) -> str: - """You probably want to use get_metrics() instead.""" - res = self.get(f"http://localhost:{self.port}/metrics") - self.verbose_error(res) - return res.text - - def get_metrics(self) -> Metrics: - res = self.get_metrics_str() - return parse_metrics(res) - - def get_timeline_metric( - self, tenant_id: TenantId, 
timeline_id: TimelineId, metric_name: str - ) -> float: - metrics = self.get_metrics() - return metrics.query_one( - metric_name, - filter={ - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - }, - ).value - - def get_remote_timeline_client_metric( - self, - metric_name: str, - tenant_id: TenantId, - timeline_id: TimelineId, - file_kind: str, - op_kind: str, - ) -> Optional[float]: - metrics = self.get_metrics() - matches = metrics.query_all( - name=metric_name, - filter={ - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "file_kind": str(file_kind), - "op_kind": str(op_kind), - }, - ) - if len(matches) == 0: - value = None - elif len(matches) == 1: - value = matches[0].value - assert value is not None - else: - assert len(matches) < 2, "above filter should uniquely identify metric" - return value - - def get_metric_value( - self, name: str, filter: Optional[Dict[str, str]] = None - ) -> Optional[float]: - metrics = self.get_metrics() - results = metrics.query_all(name, filter=filter) - if not results: - log.info(f'could not find metric "{name}"') - return None - assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" - return results[0].value - - def layer_map_info( - self, - tenant_id: TenantId, - timeline_id: TimelineId, - ) -> LayerMapInfo: - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/", - ) - self.verbose_error(res) - return LayerMapInfo.from_json(res.json()) - - def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): - res = self.get( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", - ) - self.verbose_error(res) - - assert res.status_code == 200 - - def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): - res = self.delete( - f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", - ) - self.verbose_error(res) - - assert res.status_code == 200 - - def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): - info = self.layer_map_info(tenant_id, timeline_id) - for layer in info.historic_layers: - self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) - - def disk_usage_eviction_run(self, request: dict[str, Any]): - res = self.put( - f"http://localhost:{self.port}/v1/disk_usage_eviction/run", - json=request, - ) - self.verbose_error(res) - return res.json() - - def tenant_break(self, tenant_id: TenantId): - res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") - self.verbose_error(res) - - -@dataclass -class TenantConfig: - tenant_specific_overrides: Dict[str, Any] - effective_config: Dict[str, Any] - - @classmethod - def from_json(cls, d: Dict[str, Any]) -> TenantConfig: - return TenantConfig( - tenant_specific_overrides=d["tenant_specific_overrides"], - effective_config=d["effective_config"], - ) - - -@dataclass -class LayerMapInfo: - in_memory_layers: List[InMemoryLayerInfo] - historic_layers: List[HistoricLayerInfo] - - @classmethod - def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo: - info = LayerMapInfo(in_memory_layers=[], historic_layers=[]) - - json_in_memory_layers = d["in_memory_layers"] - assert isinstance(json_in_memory_layers, List) - for json_in_memory_layer in json_in_memory_layers: - info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer)) - - json_historic_layers = d["historic_layers"] - assert 
isinstance(json_historic_layers, List) - for json_historic_layer in json_historic_layers: - info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer)) - - return info - - def kind_count(self) -> Dict[str, int]: - counts: Dict[str, int] = defaultdict(int) - for inmem_layer in self.in_memory_layers: - counts[inmem_layer.kind] += 1 - for hist_layer in self.historic_layers: - counts[hist_layer.kind] += 1 - return counts - - -@dataclass -class InMemoryLayerInfo: - kind: str - lsn_start: str - lsn_end: Optional[str] - - @classmethod - def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo: - return InMemoryLayerInfo( - kind=d["kind"], - lsn_start=d["lsn_start"], - lsn_end=d.get("lsn_end"), - ) - - -@dataclass(frozen=True) -class HistoricLayerInfo: - kind: str - layer_file_name: str - layer_file_size: Optional[int] - lsn_start: str - lsn_end: Optional[str] - remote: bool - - @classmethod - def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: - return HistoricLayerInfo( - kind=d["kind"], - layer_file_name=d["layer_file_name"], - layer_file_size=d.get("layer_file_size"), - lsn_start=d["lsn_start"], - lsn_end=d.get("lsn_end"), - remote=d["remote"], - ) - - @dataclass class PageserverPort: pg: int @@ -3386,151 +2853,6 @@ def check_restored_datadir_content( assert (mismatch, error) == ([], []) -def wait_until(number_of_iterations: int, interval: float, func): - """ - Wait until 'func' returns successfully, without exception. Returns the - last return value from the function. - """ - last_exception = None - for i in range(number_of_iterations): - try: - res = func() - except Exception as e: - log.info("waiting for %s iteration %s failed", func, i + 1) - last_exception = e - time.sleep(interval) - continue - return res - raise Exception("timed out while waiting for %s" % func) from last_exception - - -def wait_while(number_of_iterations: int, interval: float, func): - """ - Wait until 'func' returns false, or throws an exception. - """ - for i in range(number_of_iterations): - try: - if not func(): - return - log.info("waiting for %s iteration %s failed", func, i + 1) - time.sleep(interval) - continue - except Exception: - return - raise Exception("timed out while waiting for %s" % func) - - -def assert_tenant_status( - pageserver_http_client: PageserverHttpClient, tenant: TenantId, expected_status: str -): - tenant_status = pageserver_http_client.tenant_status(tenant) - log.info(f"tenant_status: {tenant_status}") - assert tenant_status["state"] == expected_status, tenant_status - - -def tenant_exists(ps_http: PageserverHttpClient, tenant_id: TenantId): - tenants = ps_http.tenant_list() - matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] - assert len(matching) < 2 - if len(matching) == 0: - return None - return matching[0] - - -def remote_consistent_lsn( - pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId -) -> Lsn: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - if detail["remote_consistent_lsn"] is None: - # No remote information at all. This happens right after creating - # a timeline, before any part of it has been uploaded to remote - # storage yet. 
- return Lsn(0) - else: - lsn_str = detail["remote_consistent_lsn"] - assert isinstance(lsn_str, str) - return Lsn(lsn_str) - - -def wait_for_upload( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, - timeline: TimelineId, - lsn: Lsn, -): - """waits for local timeline upload up to specified lsn""" - for i in range(20): - current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - log.info("wait finished") - return - log.info( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 - ) - ) - time.sleep(1) - raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn, current_lsn - ) - ) - - -# Does not use `wait_until` for debugging purposes -def wait_until_tenant_state( - pageserver_http: PageserverHttpClient, - tenant_id: TenantId, - expected_state: str, - iterations: int, -) -> bool: - for _ in range(iterations): - try: - tenant = pageserver_http.tenant_status(tenant_id=tenant_id) - log.debug(f"Tenant {tenant_id} data: {tenant}") - if tenant["state"] == expected_state: - return True - except Exception as e: - log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") - - time.sleep(1) - - raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") - - -def last_record_lsn( - pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId -) -> Lsn: - detail = pageserver_http_client.timeline_detail(tenant, timeline) - - lsn_str = detail["last_record_lsn"] - assert isinstance(lsn_str, str) - return Lsn(lsn_str) - - -def wait_for_last_record_lsn( - pageserver_http_client: PageserverHttpClient, - tenant: TenantId, - timeline: TimelineId, - lsn: Lsn, -) -> Lsn: - """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" - for i in range(10): - current_lsn = last_record_lsn(pageserver_http_client, tenant, timeline) - if current_lsn >= lsn: - return current_lsn - log.info( - "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - lsn, current_lsn, i + 1 - ) - ) - time.sleep(1) - raise Exception( - "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) - ) - - def wait_for_last_flush_lsn( env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId ) -> Lsn: @@ -3592,23 +2914,3 @@ def wait_for_sk_commit_lsn_to_reach_remote_storage( ps_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(ps_http, tenant_id, timeline_id, lsn) return lsn - - -def wait_for_upload_queue_empty( - pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId -): - ps_http = pageserver.http_client() - while True: - all_metrics = ps_http.get_metrics() - tl = all_metrics.query_all( - "pageserver_remote_timeline_client_calls_unfinished", - { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - }, - ) - assert len(tl) > 0 - log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}") - if all(m.value == 0 for m in tl): - return - time.sleep(0.2) diff --git a/test_runner/fixtures/pageserver/__init__.py b/test_runner/fixtures/pageserver/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py new file mode 100644 index 0000000000..1e1effe295 --- /dev/null +++ b/test_runner/fixtures/pageserver/http.py @@ -0,0 +1,545 @@ +from __future__ import annotations + +import time +from collections 
import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +import requests + +from fixtures.log_helper import log +from fixtures.metrics import Metrics, parse_metrics +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import Fn + + +class PageserverApiException(Exception): + def __init__(self, message, status_code: int): + super().__init__(message) + self.status_code = status_code + + +@dataclass +class InMemoryLayerInfo: + kind: str + lsn_start: str + lsn_end: Optional[str] + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> InMemoryLayerInfo: + return InMemoryLayerInfo( + kind=d["kind"], + lsn_start=d["lsn_start"], + lsn_end=d.get("lsn_end"), + ) + + +@dataclass(frozen=True) +class HistoricLayerInfo: + kind: str + layer_file_name: str + layer_file_size: Optional[int] + lsn_start: str + lsn_end: Optional[str] + remote: bool + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> HistoricLayerInfo: + return HistoricLayerInfo( + kind=d["kind"], + layer_file_name=d["layer_file_name"], + layer_file_size=d.get("layer_file_size"), + lsn_start=d["lsn_start"], + lsn_end=d.get("lsn_end"), + remote=d["remote"], + ) + + +@dataclass +class LayerMapInfo: + in_memory_layers: List[InMemoryLayerInfo] + historic_layers: List[HistoricLayerInfo] + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> LayerMapInfo: + info = LayerMapInfo(in_memory_layers=[], historic_layers=[]) + + json_in_memory_layers = d["in_memory_layers"] + assert isinstance(json_in_memory_layers, List) + for json_in_memory_layer in json_in_memory_layers: + info.in_memory_layers.append(InMemoryLayerInfo.from_json(json_in_memory_layer)) + + json_historic_layers = d["historic_layers"] + assert isinstance(json_historic_layers, List) + for json_historic_layer in json_historic_layers: + info.historic_layers.append(HistoricLayerInfo.from_json(json_historic_layer)) + + return info + + def kind_count(self) -> Dict[str, int]: + counts: Dict[str, int] = defaultdict(int) + for inmem_layer in self.in_memory_layers: + counts[inmem_layer.kind] += 1 + for hist_layer in self.historic_layers: + counts[hist_layer.kind] += 1 + return counts + + +@dataclass +class TenantConfig: + tenant_specific_overrides: Dict[str, Any] + effective_config: Dict[str, Any] + + @classmethod + def from_json(cls, d: Dict[str, Any]) -> TenantConfig: + return TenantConfig( + tenant_specific_overrides=d["tenant_specific_overrides"], + effective_config=d["effective_config"], + ) + + +class PageserverHttpClient(requests.Session): + def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): + super().__init__() + self.port = port + self.auth_token = auth_token + self.is_testing_enabled_or_skip = is_testing_enabled_or_skip + + if auth_token is not None: + self.headers["Authorization"] = f"Bearer {auth_token}" + + def verbose_error(self, res: requests.Response): + try: + res.raise_for_status() + except requests.RequestException as e: + try: + msg = res.json()["msg"] + except: # noqa: E722 + msg = "" + raise PageserverApiException(msg, res.status_code) from e + + def check_status(self): + self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = 
self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + return res_json + + def tenant_list(self) -> List[Dict[Any, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def tenant_create(self, new_tenant_id: Optional[TenantId] = None) -> TenantId: + res = self.post( + f"http://localhost:{self.port}/v1/tenant", + json={ + "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, + }, + ) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") + new_tenant_id = res.json() + assert isinstance(new_tenant_id, str) + return TenantId(new_tenant_id) + + def tenant_attach(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/attach") + self.verbose_error(res) + + def tenant_detach(self, tenant_id: TenantId, detach_ignored=False): + params = {} + if detach_ignored: + params["detach_ignored"] = "true" + + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params) + self.verbose_error(res) + + def tenant_load(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load") + self.verbose_error(res) + + def tenant_ignore(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") + self.verbose_error(res) + + def tenant_status(self, tenant_id: TenantId) -> Dict[Any, Any]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def tenant_config(self, tenant_id: TenantId) -> TenantConfig: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/config") + self.verbose_error(res) + return TenantConfig.from_json(res.json()) + + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): + assert "tenant_id" not in config.keys() + res = self.put( + f"http://localhost:{self.port}/v1/tenant/config", + json={**config, "tenant_id": str(tenant_id)}, + ) + self.verbose_error(res) + + def patch_tenant_config_client_side( + self, + tenant_id: TenantId, + inserts: Optional[Dict[str, Any]] = None, + removes: Optional[List[str]] = None, + ): + current = self.tenant_config(tenant_id).tenant_specific_overrides + if inserts is not None: + current.update(inserts) + if removes is not None: + for key in removes: + del current[key] + self.set_tenant_config(tenant_id, current) + + def tenant_size(self, tenant_id: TenantId) -> int: + return self.tenant_size_and_modelinputs(tenant_id)[0] + + def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]: + """ + Returns the tenant size, together with the model inputs as the second tuple item. 
+ """ + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size") + self.verbose_error(res) + res = res.json() + assert isinstance(res, dict) + assert TenantId(res["id"]) == tenant_id + size = res["size"] + assert type(size) == int + inputs = res["inputs"] + assert type(inputs) is dict + return (size, inputs) + + def tenant_size_debug(self, tenant_id: TenantId) -> str: + """ + Returns the tenant size debug info, as an HTML string + """ + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size", + headers={"Accept": "text/html"}, + ) + return res.text + + def timeline_list( + self, + tenant_id: TenantId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + ) -> List[Dict[str, Any]]: + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "true" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "true" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", params=params + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, list) + return res_json + + def timeline_create( + self, + tenant_id: TenantId, + new_timeline_id: Optional[TimelineId] = None, + ancestor_timeline_id: Optional[TimelineId] = None, + ancestor_start_lsn: Optional[Lsn] = None, + ) -> Dict[Any, Any]: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", + json={ + "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, + "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, + }, + ) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f"could not create timeline: already exists for id {new_timeline_id}") + + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_detail( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + include_non_incremental_logical_size: bool = False, + include_timeline_dir_layer_file_size_sum: bool = False, + **kwargs, + ) -> Dict[Any, Any]: + params = {} + if include_non_incremental_logical_size: + params["include-non-incremental-logical-size"] = "true" + if include_timeline_dir_layer_file_size_sum: + params["include-timeline-dir-layer-file-size-sum"] = "true" + + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + params=params, + **kwargs, + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + + def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}" + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def timeline_gc( + self, tenant_id: TenantId, timeline_id: TimelineId, gc_horizon: Optional[int] + ) -> dict[str, Any]: + self.is_testing_enabled_or_skip() + + log.info( + f"Requesting GC: tenant {tenant_id}, timeline {timeline_id}, gc_horizon {repr(gc_horizon)}" + ) + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", + json={"gc_horizon": gc_horizon}, + ) + log.info(f"Got GC request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert 
isinstance(res_json, dict) + return res_json + + def timeline_compact(self, tenant_id: TenantId, timeline_id: TimelineId): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting compact: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/compact" + ) + log.info(f"Got compact request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def timeline_get_lsn_by_timestamp( + self, tenant_id: TenantId, timeline_id: TimelineId, timestamp + ): + log.info( + f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}" + ) + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}", + ) + self.verbose_error(res) + res_json = res.json() + return res_json + + def timeline_checkpoint(self, tenant_id: TenantId, timeline_id: TimelineId): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint" + ) + log.info(f"Got checkpoint request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + + def timeline_spawn_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + max_concurrent_downloads: int, + ) -> dict[str, Any]: + body = { + "max_concurrent_downloads": max_concurrent_downloads, + } + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + json=body, + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + return res_json + + def timeline_poll_download_remote_layers_status( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + spawn_response: dict[str, Any], + poll_state=None, + ) -> None | dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + self.verbose_error(res) + res_json = res.json() + assert res_json is not None + assert isinstance(res_json, dict) + + # assumption in this API client here is that nobody else spawns the task + assert res_json["task_id"] == spawn_response["task_id"] + + if poll_state is None or res_json["state"] == poll_state: + return res_json + return None + + def timeline_download_remote_layers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + max_concurrent_downloads: int, + errors_ok=False, + at_least_one_download=True, + ): + res = self.timeline_spawn_download_remote_layers( + tenant_id, timeline_id, max_concurrent_downloads + ) + while True: + completed = self.timeline_poll_download_remote_layers_status( + tenant_id, timeline_id, res, poll_state="Completed" + ) + if not completed: + time.sleep(0.1) + continue + if not errors_ok: + assert completed["failed_download_count"] == 0 + if at_least_one_download: + assert completed["successful_download_count"] > 0 + return completed + + def get_metrics_str(self) -> str: + """You probably want to use get_metrics() instead.""" + res = self.get(f"http://localhost:{self.port}/metrics") + self.verbose_error(res) + return res.text + + def get_metrics(self) -> Metrics: + res = self.get_metrics_str() + return parse_metrics(res) + + def get_timeline_metric( + self, tenant_id: TenantId, 
timeline_id: TimelineId, metric_name: str + ) -> float: + metrics = self.get_metrics() + return metrics.query_one( + metric_name, + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ).value + + def get_remote_timeline_client_metric( + self, + metric_name: str, + tenant_id: TenantId, + timeline_id: TimelineId, + file_kind: str, + op_kind: str, + ) -> Optional[float]: + metrics = self.get_metrics() + matches = metrics.query_all( + name=metric_name, + filter={ + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + "file_kind": str(file_kind), + "op_kind": str(op_kind), + }, + ) + if len(matches) == 0: + value = None + elif len(matches) == 1: + value = matches[0].value + assert value is not None + else: + assert len(matches) < 2, "above filter should uniquely identify metric" + return value + + def get_metric_value( + self, name: str, filter: Optional[Dict[str, str]] = None + ) -> Optional[float]: + metrics = self.get_metrics() + results = metrics.query_all(name, filter=filter) + if not results: + log.info(f'could not find metric "{name}"') + return None + assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}" + return results[0].value + + def layer_map_info( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> LayerMapInfo: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/", + ) + self.verbose_error(res) + return LayerMapInfo.from_json(res.json()) + + def download_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", + ) + self.verbose_error(res) + + assert res.status_code == 200 + + def evict_layer(self, tenant_id: TenantId, timeline_id: TimelineId, layer_name: str): + res = self.delete( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}", + ) + self.verbose_error(res) + + assert res.status_code == 200 + + def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): + info = self.layer_map_info(tenant_id, timeline_id) + for layer in info.historic_layers: + self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + + def disk_usage_eviction_run(self, request: dict[str, Any]): + res = self.put( + f"http://localhost:{self.port}/v1/disk_usage_eviction/run", + json=request, + ) + self.verbose_error(res) + return res.json() + + def tenant_break(self, tenant_id: TenantId): + res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") + self.verbose_error(res) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py new file mode 100644 index 0000000000..65eda5b636 --- /dev/null +++ b/test_runner/fixtures/pageserver/utils.py @@ -0,0 +1,145 @@ +import time + +from fixtures.log_helper import log +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.types import Lsn, TenantId, TimelineId + + +def assert_tenant_status( + pageserver_http: PageserverHttpClient, tenant: TenantId, expected_status: str +): + tenant_status = pageserver_http.tenant_status(tenant) + log.info(f"tenant_status: {tenant_status}") + assert tenant_status["state"] == expected_status, tenant_status + + +def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId): + tenants = pageserver_http.tenant_list() + matching = [t for t in tenants if TenantId(t["id"]) == tenant_id] + assert len(matching) < 2 
+ if len(matching) == 0: + return None + return matching[0] + + +def remote_consistent_lsn( + pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId +) -> Lsn: + detail = pageserver_http.timeline_detail(tenant, timeline) + + if detail["remote_consistent_lsn"] is None: + # No remote information at all. This happens right after creating + # a timeline, before any part of it has been uploaded to remote + # storage yet. + return Lsn(0) + else: + lsn_str = detail["remote_consistent_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) + + +def wait_for_upload( + pageserver_http: PageserverHttpClient, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, +): + """waits for local timeline upload up to specified lsn""" + for i in range(20): + current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) + if current_lsn >= lsn: + log.info("wait finished") + return + log.info( + "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( + lsn, current_lsn, i + 1 + ) + ) + time.sleep(1) + raise Exception( + "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( + lsn, current_lsn + ) + ) + + +def wait_until_tenant_state( + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + expected_state: str, + iterations: int, +) -> bool: + """ + Does not use `wait_until` for debugging purposes + """ + for _ in range(iterations): + try: + tenant = pageserver_http.tenant_status(tenant_id=tenant_id) + log.debug(f"Tenant {tenant_id} data: {tenant}") + if tenant["state"] == expected_state: + return True + except Exception as e: + log.debug(f"Tenant {tenant_id} state retrieval failure: {e}") + + time.sleep(1) + + raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds") + + +def wait_until_tenant_active( + pageserver_http: PageserverHttpClient, tenant_id: TenantId, iterations: int = 30 +): + wait_until_tenant_state( + pageserver_http, tenant_id, expected_state="Active", iterations=iterations + ) + + +def last_record_lsn( + pageserver_http_client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId +) -> Lsn: + detail = pageserver_http_client.timeline_detail(tenant, timeline) + + lsn_str = detail["last_record_lsn"] + assert isinstance(lsn_str, str) + return Lsn(lsn_str) + + +def wait_for_last_record_lsn( + pageserver_http: PageserverHttpClient, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, +) -> Lsn: + """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" + for i in range(10): + current_lsn = last_record_lsn(pageserver_http, tenant, timeline) + if current_lsn >= lsn: + return current_lsn + log.info( + "waiting for last_record_lsn to reach {}, now {}, iteration {}".format( + lsn, current_lsn, i + 1 + ) + ) + time.sleep(1) + raise Exception( + "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) + ) + + +def wait_for_upload_queue_empty( + pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId +): + while True: + all_metrics = pageserver_http.get_metrics() + tl = all_metrics.query_all( + "pageserver_remote_timeline_client_calls_unfinished", + { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + }, + ) + assert len(tl) > 0 + log.info(f"upload queue for {tenant_id}/{timeline_id}: {tl}") + if all(m.value == 0 for m in tl): + return + time.sleep(0.2) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index b58539ca86..71df74dfba 100644 
--- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -278,3 +278,19 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn): continue return res raise Exception("timed out while waiting for %s" % func) from last_exception + + +def wait_while(number_of_iterations: int, interval: float, func): + """ + Wait until 'func' returns false, or throws an exception. + """ + for i in range(number_of_iterations): + try: + if not func(): + return + log.info("waiting for %s iteration %s failed", func, i + 1) + time.sleep(interval) + continue + except Exception: + return + raise Exception("timed out while waiting for %s" % func) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 4b109c150f..16c5438b8f 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -10,7 +10,7 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.compare_fixtures import NeonCompare from fixtures.log_helper import log -from fixtures.neon_fixtures import wait_for_last_record_lsn +from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index f3d153d934..f7c4736e04 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -1,7 +1,8 @@ from contextlib import closing import pytest -from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException, PgProtocol +from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol +from fixtures.pageserver.http import PageserverApiException from fixtures.types import TenantId diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index be6e1a69b2..0cc111bd8c 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -10,12 +10,11 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonCli, NeonEnvBuilder, - PageserverHttpClient, PgBin, PortDistributor, - wait_for_last_record_lsn, - wait_for_upload, ) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn from pytest import FixtureRequest diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 6ed09734fe..413d6c9d5a 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -11,14 +11,14 @@ from fixtures.neon_fixtures import ( LocalFsStorage, NeonEnv, NeonEnvBuilder, - PageserverHttpClient, PgBin, RemoteStorageKind, wait_for_last_flush_lsn, - wait_for_upload_queue_empty, - wait_until, ) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -138,7 +138,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev # remove the initial tenant ## why wait for upload queue? 
=> https://github.com/neondatabase/neon/issues/3865 assert env.initial_timeline - wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, env.initial_timeline) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, env.initial_timeline) pageserver_http.tenant_detach(env.initial_tenant) assert isinstance(env.remote_storage, LocalFsStorage) tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant) @@ -182,7 +182,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev # after stopping the safekeepers, we know that no new WAL will be coming in for tenant_id, timeline_id in timelines: pageserver_http.timeline_checkpoint(tenant_id, timeline_id) - wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id) + wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id) tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 1dc10fbf4f..774ed98563 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -13,9 +13,8 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, Postgres, - wait_for_last_record_lsn, - wait_for_upload, ) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import subprocess_capture diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 80e7ae8d7e..2d07d02ce7 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -6,10 +6,9 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, RemoteStorageKind, wait_for_last_flush_lsn, - wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, - wait_for_upload, ) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index d146f78c3a..cd481e69eb 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -5,8 +5,8 @@ from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, - PageserverHttpClient, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.types import TenantId, TimelineId diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index 73933021a4..aa37a2411c 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PageserverHttpClient +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.pageserver.http import PageserverHttpClient def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index fd13651427..90ab8e68d8 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -10,20 +10,20 @@ import pytest from fixtures.log_helper import log from 
fixtures.neon_fixtures import ( NeonEnvBuilder, - PageserverApiException, - PageserverHttpClient, RemoteStorageKind, - assert_tenant_status, available_remote_storages, wait_for_last_flush_lsn, - wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import ( + assert_tenant_status, + wait_for_last_record_lsn, wait_for_upload, - wait_until, wait_until_tenant_state, ) from fixtures.types import Lsn -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until def get_num_downloaded_layers(client: PageserverHttpClient, tenant_id, timeline_id): diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index eb22ac5f99..5b05989ae4 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -6,8 +6,8 @@ from fixtures.neon_fixtures import ( DEFAULT_BRANCH_NAME, NeonEnv, NeonEnvBuilder, - PageserverHttpClient, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_read_trace.py b/test_runner/regress/test_read_trace.py index 1b00b272c2..be0eb76ccd 100644 --- a/test_runner/regress/test_read_trace.py +++ b/test_runner/regress/test_read_trace.py @@ -1,6 +1,7 @@ from contextlib import closing -from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_record_lsn +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 7487757071..69d6e427ce 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_for_last_record_lsn +from fixtures.neon_fixtures import NeonEnv +from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.types import Lsn from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f6600e8974..222305f006 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -13,13 +13,15 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( LocalFsStorage, NeonEnvBuilder, - PageserverApiException, - PageserverHttpClient, RemoteStorageKind, available_remote_storages, wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, + wait_until_tenant_active, wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId @@ -172,15 +174,10 @@ def test_remote_storage_backup_and_restore( client.tenant_attach(tenant_id) log.info("waiting for tenant to become active. 
this should be quick with on-demand download") - def tenant_active(): - all_states = client.tenant_list() - [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["state"] == "Active" - - wait_until( - number_of_iterations=5, - interval=1, - func=tenant_active, + wait_until_tenant_active( + pageserver_http=client, + tenant_id=tenant_id, + iterations=5, ) detail = client.timeline_detail(tenant_id, timeline_id) @@ -357,12 +354,7 @@ def test_remote_storage_upload_queue_retries( client.tenant_attach(tenant_id) - def tenant_active(): - all_states = client.tenant_list() - [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["state"] == "Active" - - wait_until(30, 1, tenant_active) + wait_until_tenant_active(client, tenant_id) log.info("restarting postgres to validate") pg = env.postgres.create_start("main", tenant_id=tenant_id) @@ -497,12 +489,7 @@ def test_remote_timeline_client_calls_started_metric( client.tenant_attach(tenant_id) - def tenant_active(): - all_states = client.tenant_list() - [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["state"] == "Active" - - wait_until(30, 1, tenant_active) + wait_until_tenant_active(client, tenant_id) log.info("restarting postgres to validate") pg = env.postgres.create_start("main", tenant_id=tenant_id) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index c5f9a3d157..67aba227e5 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -6,9 +6,8 @@ from fixtures.neon_fixtures import ( LocalFsStorage, NeonEnvBuilder, RemoteStorageKind, - assert_tenant_status, - wait_for_upload, ) +from fixtures.pageserver.utils import assert_tenant_status, wait_for_upload from fixtures.types import Lsn from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 5db79eef4a..58a010951e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -9,18 +9,18 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - PageserverApiException, - PageserverHttpClient, Postgres, RemoteStorageKind, available_remote_storages, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import ( wait_for_last_record_lsn, wait_for_upload, - wait_until, wait_until_tenant_state, ) from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until def do_gc_target( diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index aaf33c0d59..8ad4bd1c11 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -10,18 +10,24 @@ from fixtures.neon_fixtures import ( NeonBroker, NeonEnv, NeonEnvBuilder, - PageserverHttpClient, PortDistributor, Postgres, +) +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import ( assert_tenant_status, tenant_exists, wait_for_last_record_lsn, wait_for_upload, +) +from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import ( + query_scalar, + start_in_background, + subprocess_capture, wait_until, wait_while, ) -from fixtures.types import Lsn, TenantId, TimelineId -from fixtures.utils import query_scalar, 
start_in_background, subprocess_capture def assert_abs_margin_ratio(a: float, b: float, margin_ratio: float): diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index a4b5f7739a..9037fe0045 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -6,11 +6,11 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - PageserverHttpClient, Postgres, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.types import Lsn, TenantId, TimelineId diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index c786f8a8e1..ec1c12a0d8 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -20,10 +20,12 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, Postgres, RemoteStorageKind, - assert_tenant_status, available_remote_storages, - wait_for_last_record_lsn, wait_for_sk_commit_lsn_to_reach_remote_storage, +) +from fixtures.pageserver.utils import ( + assert_tenant_status, + wait_for_last_record_lsn, wait_for_upload, ) from fixtures.types import Lsn, TenantId, TimelineId diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 93fafff934..cf607f4f7b 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,5 +1,6 @@ import pytest -from fixtures.neon_fixtures import NeonEnv, PageserverApiException +from fixtures.neon_fixtures import NeonEnv +from fixtures.pageserver.http import PageserverApiException from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index c4e8e7aa07..7c77e1fe59 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -14,20 +14,21 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - PageserverApiException, - PageserverHttpClient, PgBin, PortDistributor, Postgres, RemoteStorageKind, VanillaPostgres, - assert_tenant_status, wait_for_last_flush_lsn, +) +from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient +from fixtures.pageserver.utils import ( + assert_tenant_status, wait_for_upload_queue_empty, - wait_until, + wait_until_tenant_active, ) from fixtures.types import TenantId, TimelineId -from fixtures.utils import get_timeline_dir_size +from fixtures.utils import get_timeline_dir_size, wait_until def test_timeline_size(neon_simple_env: NeonEnv): @@ -246,12 +247,7 @@ def test_timeline_initial_logical_size_calculation_cancellation( extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} ) - def tenant_active(): - all_states = client.tenant_list() - [tenant] = [t for t in all_states if TenantId(t["id"]) == tenant_id] - assert tenant["state"] == "Active" - - wait_until(30, 1, tenant_active) + wait_until_tenant_active(client, tenant_id) # kick off initial size calculation task (the response we get here is the estimated size) def assert_size_calculation_not_done(): @@ -425,7 +421,7 @@ def test_timeline_physical_size_post_compaction( pageserver_http.timeline_compact(env.initial_tenant, new_timeline_id) if remote_storage_kind is not None: - 
wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id) assert_physical_size_invariants( get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), @@ -478,7 +474,7 @@ def test_timeline_physical_size_post_gc( pageserver_http.timeline_gc(env.initial_tenant, new_timeline_id, gc_horizon=None) if remote_storage_kind is not None: - wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, new_timeline_id) + wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, new_timeline_id) assert_physical_size_invariants( get_physical_size_values(env, env.initial_tenant, new_timeline_id, remote_storage_kind), @@ -584,7 +580,7 @@ def test_tenant_physical_size( tenant, timeline = env.neon_cli.create_tenant() if remote_storage_kind is not None: - wait_for_upload_queue_empty(env.pageserver, tenant, timeline) + wait_for_upload_queue_empty(pageserver_http, tenant, timeline) def get_timeline_resident_physical_size(timeline: TimelineId): sizes = get_physical_size_values(env, tenant, timeline, remote_storage_kind) @@ -609,7 +605,7 @@ def test_tenant_physical_size( pageserver_http.timeline_checkpoint(tenant, timeline) if remote_storage_kind is not None: - wait_for_upload_queue_empty(env.pageserver, tenant, timeline) + wait_for_upload_queue_empty(pageserver_http, tenant, timeline) timeline_total_resident_physical_size += get_timeline_resident_physical_size(timeline) diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 407085a01a..306c492e8f 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -30,9 +30,8 @@ from fixtures.neon_fixtures import ( SafekeeperHttpClient, SafekeeperPort, available_remote_storages, - wait_for_last_record_lsn, - wait_for_upload, ) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 395d54b8c3..d6302f8632 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -3,7 +3,8 @@ import time import psutil import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PageserverApiException +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.pageserver.http import PageserverApiException from fixtures.types import TenantId From bfee4127014022a43bd85bccb562ed4bc62dc075 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 7 Apr 2023 14:26:21 +0300 Subject: [PATCH 19/31] Trigger tests for index scan implementation (#3968) ## Describe your changes ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 757df1dab8..3e70693c91 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 757df1dab82f69bdf69469119420a0bbb307f992 +Subproject commit 3e70693c9178878404d14a61c96b15b74eb02688 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index f8a650e49b..4ad87b0f36 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit f8a650e49b06d39ad131b860117504044b01f312 +Subproject commit 4ad87b0f364a2313600c1d9774ca33df00e606f4 From 979fa8b1ba8cf5d33f3fe33f9b448c62ea755777 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 14 Feb 2023 09:58:06 +0100 Subject: [PATCH 20/31] Compile timescaledb --- Dockerfile.compute-node | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 92a1bb69e5..48d49a60f3 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -301,6 +301,27 @@ RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.3.2.tar.gz make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plpgsql_check.control +######################################################################################### +# +# Layer "timescaledb-pg-build" +# compile timescaledb extension +# +######################################################################################### +FROM build-deps AS timescaledb-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin:$PATH" + +RUN apt-get update && \ + apt-get install -y cmake && \ + wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ + mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . 
&& \ + ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOLL=OFF -DAPACHE_ONLY:BOOL=ON && \ + cd build && \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make install -j $(getconf _NPROCESSORS_ONLN) && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/timescaledb.control + ######################################################################################### # # Layer "rust extensions" @@ -405,6 +426,7 @@ COPY --from=pgtap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=prefix-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 31f2cdeb1ec6fb66d26dc40a7c71f4b47dd8b31f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Fri, 31 Mar 2023 14:46:38 +0200 Subject: [PATCH 21/31] Update Dockerfile.compute-node Co-authored-by: MMeent --- Dockerfile.compute-node | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 48d49a60f3..3473487444 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -316,7 +316,7 @@ RUN apt-get update && \ apt-get install -y cmake && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ - ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOLL=OFF -DAPACHE_ONLY:BOOL=ON && \ + ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \ cd build && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make install -j $(getconf _NPROCESSORS_ONLN) && \ From 0bf70e113f284a553fa5b3dbaf1162ebd7b4b8e9 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 7 Apr 2023 15:03:13 +0300 Subject: [PATCH 22/31] Add extra cnames to staging proxy --- .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index 40814e55c9..2a8f028f3b 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -23,6 +23,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2" domain: "*.us-east-2.aws.neon.build" + extraDomains: ["*.us-east-2.postgres.zenith.tech", "*.us-east-2.retooldb-staging.com"] sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events" From dec58092e8b7c20e743a584f3e6fa8aaca73c988 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 7 Apr 2023 21:39:49 +0300 Subject: [PATCH 23/31] Replace Box with impl in RemoteStorage upload (#3984) Replaces `Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>` with `impl io::AsyncRead + Unpin + Send + Sync + 'static` usages in the `RemoteStorage` interface, to make it closer to [`#![feature(async_fn_in_trait)]`](https://blog.rust-lang.org/inside-rust/2022/11/17/async-fn-in-trait-nightly.html) For `GenericRemoteStorage`, replaces `type Target = dyn RemoteStorage` with another impl with `RemoteStorage` methods inside it. 
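For illustration, a minimal self-contained sketch (not the actual `RemoteStorage` code, and using free functions instead of the trait) of what the parameter change means for callers; it assumes only the `tokio` crate with its I/O and macro features enabled:

```rust
use tokio::io::{self, AsyncRead};

// Before: every caller had to box its reader behind a trait object.
async fn upload_boxed(
    mut data: Box<dyn AsyncRead + Unpin + Send + Sync + 'static>,
) -> io::Result<u64> {
    let mut sink = io::sink();
    io::copy(&mut data, &mut sink).await
}

// After: any concrete reader is accepted directly, no allocation needed.
async fn upload_impl(
    mut data: impl AsyncRead + Unpin + Send + Sync + 'static,
) -> io::Result<u64> {
    let mut sink = io::sink();
    io::copy(&mut data, &mut sink).await
}

#[tokio::main]
async fn main() -> io::Result<()> {
    let bytes: &[u8] = b"some payload";
    upload_boxed(Box::new(bytes)).await?; // extra allocation + dynamic dispatch
    upload_impl(bytes).await?;            // monomorphized call, reader passed as-is
    Ok(())
}
```

The `impl Trait` form lets each call site hand over its concrete reader without boxing or dynamic dispatch, at the cost of one monomorphized copy of the function per reader type.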
We can reuse the trait, that would require importing the trait in every file where it's used and makes us farther from the unstable feature. After this PR, I've manged to create a patch with the changes: https://github.com/neondatabase/neon/compare/kb/less-dyn-storage...kb/nightly-async-trait?expand=1 Current rust implementation does not like recursive async trait calls, so `UnreliableWrapper` was removed: it contained a `GenericRemoteStorage` that implemented the `RemoteStorage` trait, and itself implemented the trait, which nightly rustc did not like and proposed to box the future. Similarly, `GenericRemoteStorage` cannot implement `RemoteStorage` for nightly rustc to work, since calls various remote storages' methods from inside. I've compiled current `main` and the nightly branch both with `time env RUSTC_WRAPPER="" cargo +nightly build --all --timings` command, and got ``` Finished dev [optimized + debuginfo] target(s) in 2m 04s env RUSTC_WRAPPER="" cargo +nightly build --all --timings 1283.19s user 50.40s system 1074% cpu 2:04.15 total for the new feature tried and Finished dev [optimized + debuginfo] target(s) in 2m 40s env RUSTC_WRAPPER="" cargo +nightly build --all --timings 1288.59s user 52.06s system 834% cpu 2:40.71 total for the old async_trait approach. ``` On my machine, the `remote_storage` lib compilation takes ~10 less time with the nightly feature (left) than the regular main (right). ![image](https://user-images.githubusercontent.com/2690773/230620797-163d8b89-dac8-4366-bcf6-cd1cdddcd22c.png) Full cargo reports are available at [timings.zip](https://github.com/neondatabase/neon/files/11179369/timings.zip) --- libs/remote_storage/src/lib.rs | 72 +++++++++++++++++--- libs/remote_storage/src/local_fs.rs | 2 +- libs/remote_storage/src/s3_bucket.rs | 2 +- libs/remote_storage/src/simulate_failures.rs | 2 +- 4 files changed, 65 insertions(+), 13 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 5b74308514..e0cc3ca543 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -13,7 +13,6 @@ use std::{ collections::HashMap, fmt::Debug, num::{NonZeroU32, NonZeroUsize}, - ops::Deref, path::{Path, PathBuf}, pin::Pin, sync::Arc, @@ -90,7 +89,7 @@ pub trait RemoteStorage: Send + Sync + 'static { /// Streams the local file contents into remote into the remote storage entry. async fn upload( &self, - data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. 
data_size_bytes: usize, @@ -161,14 +160,67 @@ pub enum GenericRemoteStorage { Unreliable(Arc), } -impl Deref for GenericRemoteStorage { - type Target = dyn RemoteStorage; - - fn deref(&self) -> &Self::Target { +impl GenericRemoteStorage { + pub async fn list_prefixes( + &self, + prefix: Option<&RemotePath>, + ) -> Result, DownloadError> { match self { - GenericRemoteStorage::LocalFs(local_fs) => local_fs, - GenericRemoteStorage::AwsS3(s3_bucket) => s3_bucket.as_ref(), - GenericRemoteStorage::Unreliable(s) => s.as_ref(), + Self::LocalFs(s) => s.list_prefixes(prefix).await, + Self::AwsS3(s) => s.list_prefixes(prefix).await, + Self::Unreliable(s) => s.list_prefixes(prefix).await, + } + } + + pub async fn upload( + &self, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, + data_size_bytes: usize, + to: &RemotePath, + metadata: Option, + ) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await, + Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await, + } + } + + pub async fn download(&self, from: &RemotePath) -> Result { + match self { + Self::LocalFs(s) => s.download(from).await, + Self::AwsS3(s) => s.download(from).await, + Self::Unreliable(s) => s.download(from).await, + } + } + + pub async fn download_byte_range( + &self, + from: &RemotePath, + start_inclusive: u64, + end_exclusive: Option, + ) -> Result { + match self { + Self::LocalFs(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } + Self::AwsS3(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } + Self::Unreliable(s) => { + s.download_byte_range(from, start_inclusive, end_exclusive) + .await + } + } + } + + pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete(path).await, + Self::AwsS3(s) => s.delete(path).await, + Self::Unreliable(s) => s.delete(path).await, } } } @@ -199,7 +251,7 @@ impl GenericRemoteStorage { /// this path is used for the remote object id conversion only. 
pub async fn upload_storage_object( &self, - from: Box, + from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, ) -> anyhow::Result<()> { diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 21a4156ad3..d7b46731cd 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -118,7 +118,7 @@ impl RemoteStorage for LocalFs { async fn upload( &self, - data: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + data: impl io::AsyncRead + Unpin + Send + Sync + 'static, data_size_bytes: usize, to: &RemotePath, metadata: Option, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index fdf3ae02d3..e6c1e19ad5 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -343,7 +343,7 @@ impl RemoteStorage for S3Bucket { async fn upload( &self, - from: Box<(dyn io::AsyncRead + Unpin + Send + Sync + 'static)>, + from: impl io::AsyncRead + Unpin + Send + Sync + 'static, from_size_bytes: usize, to: &RemotePath, metadata: Option, diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index d1d062f8e7..cb40859831 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -84,7 +84,7 @@ impl RemoteStorage for UnreliableWrapper { async fn upload( &self, - data: Box<(dyn tokio::io::AsyncRead + Unpin + Send + Sync + 'static)>, + data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static, // S3 PUT request requires the content length to be specified, // otherwise it starts to fail with the concurrent connection count increasing. data_size_bytes: usize, From 818e341af0918d5ab78313a9af7fb408fa6a9e55 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sun, 9 Apr 2023 12:52:49 +0100 Subject: [PATCH 24/31] Nightly Benchmarks: replace neon-captest-prefetch with -new/-reuse (#3970) We have enabled prefetch by default, let's use this in Nightly Benchmarks: - effective_io_concurrency=100 by default (instead of 32) - maintenance_io_concurrency=100 by default (instead of 32) Rename `neon-captest-prefetch` to `neon-captest-new` (for pgbench with initialisation) and `neon-captest-reuse` (for OLAP scenarios) --- .github/workflows/benchmarking.yml | 79 +++++++----------------------- 1 file changed, 17 insertions(+), 62 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 425d4d76c9..2aeea6eca4 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -114,17 +114,16 @@ jobs: # neon-captest-freetier: Run pgbench with freetier-limited compute # neon-captest-new: Run pgbench in a freshly created project # neon-captest-reuse: Same, but reusing existing project - # neon-captest-prefetch: Same, with prefetching enabled (new project) # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, neon-captest-prefetch, rds-postgres ] + platform: [ neon-captest-reuse, neon-captest-new, rds-postgres ] db_size: [ 10gb ] runner: [ us-east-2 ] include: - platform: neon-captest-freetier db_size: 3gb runner: us-east-2 - - platform: neon-captest-prefetch + - platform: neon-captest-new db_size: 50gb runner: us-east-2 - platform: rds-aurora @@ -164,7 +163,7 @@ jobs: echo 
"${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - name: Create Neon Project - if: contains(fromJson('["neon-captest-new", "neon-captest-prefetch", "neon-captest-freetier"]'), matrix.platform) + if: contains(fromJson('["neon-captest-new", "neon-captest-freetier"]'), matrix.platform) id: create-neon-project uses: ./.github/actions/neon-project-create with: @@ -180,7 +179,7 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; - neon-captest-new | neon-captest-prefetch | neon-captest-freetier) + neon-captest-new | neon-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; rds-aurora) @@ -190,7 +189,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', neon-captest-freetier, 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-freetier', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -199,17 +198,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -286,10 +274,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled + # neon-captest-reuse: We have pre-created projects 1 CU # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -325,7 +313,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }} ;; rds-aurora) @@ -335,7 +323,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -344,17 +332,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -397,10 +374,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled + # neon-captest-reuse: We have pre-created projects 1 CU # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -436,7 +413,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_TPCH_S10_CONNSTR }} ;; rds-aurora) @@ -446,7 +423,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_TPCH_S10_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -455,17 +432,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -502,10 +468,10 @@ jobs: strategy: fail-fast: false matrix: - # neon-captest-prefetch: We have pre-created projects with prefetch enabled + # neon-captest-reuse: We have pre-created projects with 1 CU # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ] + platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -541,7 +507,7 @@ jobs: id: set-up-connstr run: | case "${PLATFORM}" in - neon-captest-prefetch) + neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }} ;; rds-aurora) @@ -551,7 +517,7 @@ jobs: CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }} ;; *) - echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'" + echo 2>&1 "Unknown PLATFORM=${PLATFORM}. 
Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'" exit 1 ;; esac @@ -560,17 +526,6 @@ jobs: psql ${CONNSTR} -c "SELECT version();" - - name: Set database options - if: matrix.platform == 'neon-captest-prefetch' - run: | - DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()") - - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32" - psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32" - env: - BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} - - name: Run user examples uses: ./.github/actions/run-python-test-set with: From f0b2e076d9beb3601049388dc2d7d94ef7b68f23 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 9 Apr 2023 21:52:28 +0300 Subject: [PATCH 25/31] Move compute_ctl structs used in HTTP API and spec file to separate crate. This is in preparation of using compute_ctl to launch postgres nodes in the neon_local control plane. And seems like a good idea to separate the public interfaces anyway. One non-mechanical change here is that the 'metrics' field is moved under the Mutex, instead of using atomics. We were not using atomics for performance but for convenience here, and it seems more clear to not use atomics in the model for the HTTP response type. --- Cargo.lock | 13 +++ Cargo.toml | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/bin/compute_ctl.rs | 5 +- compute_tools/src/compute.rs | 81 +++++----------- compute_tools/src/config.rs | 2 +- compute_tools/src/http/api.rs | 26 +++-- compute_tools/src/http/mod.rs | 2 - compute_tools/src/http/responses.rs | 40 -------- compute_tools/src/pg_helpers.rs | 78 +++++---------- compute_tools/src/spec.rs | 43 +-------- compute_tools/tests/pg_helpers_tests.rs | 7 +- libs/compute_api/Cargo.toml | 14 +++ libs/compute_api/src/lib.rs | 3 + .../http => libs/compute_api/src}/requests.rs | 5 +- libs/compute_api/src/responses.rs | 66 +++++++++++++ libs/compute_api/src/spec.rs | 94 +++++++++++++++++++ .../compute_api}/tests/cluster_spec.json | 0 18 files changed, 271 insertions(+), 210 deletions(-) delete mode 100644 compute_tools/src/http/responses.rs create mode 100644 libs/compute_api/Cargo.toml create mode 100644 libs/compute_api/src/lib.rs rename {compute_tools/src/http => libs/compute_api/src}/requests.rs (76%) create mode 100644 libs/compute_api/src/responses.rs create mode 100644 libs/compute_api/src/spec.rs rename {compute_tools => libs/compute_api}/tests/cluster_spec.json (100%) diff --git a/Cargo.lock b/Cargo.lock index 4590e76014..c5b64b235a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -841,6 +841,18 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "compute_api" +version = "0.1.0" +dependencies = [ + "anyhow", + "chrono", + "serde", + "serde_json", + "serde_with", + "workspace_hack", +] + [[package]] name = "compute_tools" version = "0.1.0" @@ -848,6 +860,7 @@ dependencies = [ "anyhow", "chrono", "clap 4.1.4", + "compute_api", "futures", "hyper", "notify", diff --git a/Cargo.toml b/Cargo.toml index 09cc150606..d563324c86 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -132,6 +132,7 @@ tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df6 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending ## Local libraries +compute_api = { version = 
"0.1", path = "./libs/compute_api/" } consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" } metrics = { version = "0.1", path = "./libs/metrics/" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 59433535f1..f315d2b7d9 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,4 +27,5 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true url.workspace = true +compute_api.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 1a3ac77af4..d61eae5f7a 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -43,7 +43,9 @@ use clap::Arg; use tracing::{error, info}; use url::Url; -use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; +use compute_api::responses::ComputeStatus; + +use compute_tools::compute::{ComputeNode, ComputeState}; use compute_tools::http::api::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; @@ -116,7 +118,6 @@ fn main() -> Result<()> { pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), live_config_allowed, - metrics: ComputeMetrics::default(), state: Mutex::new(new_state), state_changed: Condvar::new(), }; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 3e92ec57dc..689aa6ef43 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -19,16 +19,17 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Condvar, Mutex}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use serde::Serialize; use tokio_postgres; use tracing::{info, instrument, warn}; +use compute_api::responses::{ComputeMetrics, ComputeStatus}; +use compute_api::spec::ComputeSpec; + use crate::checker::create_writability_check_data; use crate::config; use crate::pg_helpers::*; @@ -41,7 +42,6 @@ pub struct ComputeNode { pub connstr: url::Url, pub pgdata: String, pub pgbin: String, - pub metrics: ComputeMetrics, /// We should only allow live re- / configuration of the compute node if /// it uses 'pull model', i.e. it can go to control-plane and fetch /// the latest configuration. Otherwise, there could be a case: @@ -74,6 +74,8 @@ pub struct ComputeState { pub timeline: String, pub pageserver_connstr: String, pub storage_auth_token: Option, + + pub metrics: ComputeMetrics, } impl ComputeState { @@ -87,6 +89,7 @@ impl ComputeState { timeline: String::new(), pageserver_connstr: String::new(), storage_auth_token: None, + metrics: ComputeMetrics::default(), } } } @@ -97,33 +100,6 @@ impl Default for ComputeState { } } -#[derive(Serialize, Clone, Copy, PartialEq, Eq, Debug)] -#[serde(rename_all = "snake_case")] -pub enum ComputeStatus { - // Spec wasn't provided at start, waiting for it to be - // provided by control-plane. - Empty, - // Compute configuration was requested. - ConfigurationPending, - // Compute node has spec and initial startup and - // configuration is in progress. - Init, - // Compute is configured and running. - Running, - // Either startup or configuration failed, - // compute will exit soon or is waiting for - // control-plane to terminate it. 
- Failed, -} - -#[derive(Default, Serialize)] -pub struct ComputeMetrics { - pub sync_safekeepers_ms: AtomicU64, - pub basebackup_ms: AtomicU64, - pub config_ms: AtomicU64, - pub total_startup_ms: AtomicU64, -} - impl ComputeNode { pub fn set_status(&self, status: ComputeStatus) { let mut state = self.state.lock().unwrap(); @@ -185,15 +161,11 @@ impl ComputeNode { ar.set_ignore_zeros(true); ar.unpack(&self.pgdata)?; - self.metrics.basebackup_ms.store( - Utc::now() - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - + self.state.lock().unwrap().metrics.basebackup_ms = Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; Ok(()) } @@ -231,14 +203,11 @@ impl ComputeNode { ); } - self.metrics.sync_safekeepers_ms.store( - Utc::now() - .signed_duration_since(start_time) - .to_std() - .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); + self.state.lock().unwrap().metrics.sync_safekeepers_ms = Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim()); @@ -375,23 +344,19 @@ impl ComputeNode { self.apply_config(&compute_state)?; let startup_end_time = Utc::now(); - self.metrics.config_ms.store( - startup_end_time + { + let mut state = self.state.lock().unwrap(); + state.metrics.config_ms = startup_end_time .signed_duration_since(start_time) .to_std() .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - self.metrics.total_startup_ms.store( - startup_end_time + .as_millis() as u64; + state.metrics.total_startup_ms = startup_end_time .signed_duration_since(self.start_time) .to_std() .unwrap() - .as_millis() as u64, - Ordering::Relaxed, - ); - + .as_millis() as u64; + } self.set_status(ComputeStatus::Running); Ok(pg) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 6cbd0e3d4c..d25eb9b2fc 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -6,7 +6,7 @@ use std::path::Path; use anyhow::Result; use crate::pg_helpers::PgOptionsSerialize; -use crate::spec::ComputeSpec; +use compute_api::spec::ComputeSpec; /// Check that `line` is inside a text file and put it there if it is not. /// Create file if it doesn't exist. diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 8620b10636..cea45dc596 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,9 +3,9 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; -use crate::compute::{ComputeNode, ComputeStatus}; -use crate::http::requests::ConfigurationRequest; -use crate::http::responses::{ComputeStatusResponse, GenericAPIError}; +use crate::compute::{ComputeNode, ComputeState}; +use compute_api::requests::ConfigurationRequest; +use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; @@ -16,6 +16,16 @@ use tokio::task; use tracing::{error, info}; use tracing_utils::http::OtelName; +fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse { + ComputeStatusResponse { + tenant: state.tenant.clone(), + timeline: state.timeline.clone(), + status: state.status, + last_active: state.last_active, + error: state.error.clone(), + } +} + // Service function to handle all available routes. 
async fn routes(req: Request, compute: &Arc) -> Response { // @@ -28,8 +38,7 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /status GET request"); let state = compute.state.lock().unwrap(); - let status_response = ComputeStatusResponse::from(state.clone()); - + let status_response = status_response_from_state(&state); Response::new(Body::from(serde_json::to_string(&status_response).unwrap())) } @@ -37,7 +46,8 @@ async fn routes(req: Request, compute: &Arc) -> Response { info!("serving /metrics.json GET request"); - Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap())) + let metrics = compute.state.lock().unwrap().metrics.clone(); + Response::new(Body::from(serde_json::to_string(&metrics).unwrap())) } // Collect Postgres current usage insights @@ -162,7 +172,7 @@ async fn handle_configure_request( ); if state.status == ComputeStatus::Failed { - let err = state.error.clone().unwrap_or("unknown error".to_string()); + let err = state.error.as_ref().map_or("unknown error", |x| x); let msg = format!("compute configuration failed: {:?}", err); return Err((msg, StatusCode::INTERNAL_SERVER_ERROR)); } @@ -175,7 +185,7 @@ async fn handle_configure_request( // Return current compute state if everything went well. let state = compute.state.lock().unwrap().clone(); - let status_response = ComputeStatusResponse::from(state); + let status_response = status_response_from_state(&state); Ok(serde_json::to_string(&status_response).unwrap()) } else { Err(("invalid spec".to_string(), StatusCode::BAD_REQUEST)) diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs index e54b4e3341..e5fdf85eed 100644 --- a/compute_tools/src/http/mod.rs +++ b/compute_tools/src/http/mod.rs @@ -1,3 +1 @@ pub mod api; -pub mod requests; -pub mod responses; diff --git a/compute_tools/src/http/responses.rs b/compute_tools/src/http/responses.rs deleted file mode 100644 index 1ef4b380a9..0000000000 --- a/compute_tools/src/http/responses.rs +++ /dev/null @@ -1,40 +0,0 @@ -use serde::{Serialize, Serializer}; - -use chrono::{DateTime, Utc}; - -use crate::compute::{ComputeState, ComputeStatus}; - -#[derive(Serialize, Debug)] -pub struct GenericAPIError { - pub error: String, -} - -#[derive(Serialize, Debug)] -#[serde(rename_all = "snake_case")] -pub struct ComputeStatusResponse { - pub tenant: String, - pub timeline: String, - pub status: ComputeStatus, - #[serde(serialize_with = "rfc3339_serialize")] - pub last_active: DateTime, - pub error: Option, -} - -impl From for ComputeStatusResponse { - fn from(state: ComputeState) -> Self { - ComputeStatusResponse { - tenant: state.tenant, - timeline: state.timeline, - status: state.status, - last_active: state.last_active, - error: state.error, - } - } -} - -fn rfc3339_serialize(x: &DateTime, s: S) -> Result -where - S: Serializer, -{ - x.to_rfc3339().serialize(s) -} diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 38d1a6d777..bb787d0506 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -10,43 +10,12 @@ use std::time::{Duration, Instant}; use anyhow::{bail, Result}; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; -use serde::Deserialize; use tracing::{debug, instrument}; +use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role}; + const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds -/// Rust representation of Postgres role info with only those fields -/// that matter 
for us. -#[derive(Clone, Deserialize, Debug)] -pub struct Role { - pub name: PgIdent, - pub encrypted_password: Option, - pub options: GenericOptions, -} - -/// Rust representation of Postgres database info with only those fields -/// that matter for us. -#[derive(Clone, Deserialize, Debug)] -pub struct Database { - pub name: PgIdent, - pub owner: PgIdent, - pub options: GenericOptions, -} - -/// Common type representing both SQL statement params with or without value, -/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config -/// options like `wal_level = logical`. -#[derive(Clone, Deserialize, Debug)] -pub struct GenericOption { - pub name: String, - pub value: Option, - pub vartype: String, -} - -/// Optional collection of `GenericOption`'s. Type alias allows us to -/// declare a `trait` on it. -pub type GenericOptions = Option>; - /// Escape a string for including it in a SQL literal fn escape_literal(s: &str) -> String { s.replace('\'', "''").replace('\\', "\\\\") @@ -58,9 +27,14 @@ fn escape_conf_value(s: &str) -> String { s.replace('\'', "''").replace('\\', "\\\\") } -impl GenericOption { +trait GenericOptionExt { + fn to_pg_option(&self) -> String; + fn to_pg_setting(&self) -> String; +} + +impl GenericOptionExt for GenericOption { /// Represent `GenericOption` as SQL statement parameter. - pub fn to_pg_option(&self) -> String { + fn to_pg_option(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { "string" => format!("{} '{}'", self.name, escape_literal(val)), @@ -72,7 +46,7 @@ impl GenericOption { } /// Represent `GenericOption` as configuration option. - pub fn to_pg_setting(&self) -> String { + fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { match self.vartype.as_ref() { "string" => format!("{} = '{}'", self.name, escape_conf_value(val)), @@ -131,10 +105,14 @@ impl GenericOptionsSearch for GenericOptions { } } -impl Role { +pub trait RoleExt { + fn to_pg_options(&self) -> String; +} + +impl RoleExt for Role { /// Serialize a list of role parameters into a Postgres-acceptable /// string of arguments. - pub fn to_pg_options(&self) -> String { + fn to_pg_options(&self) -> String { // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. // For now, we do not use generic `options` for roles. Once used, add // `self.options.as_pg_options()` somewhere here. @@ -159,21 +137,17 @@ impl Role { } } -impl Database { - pub fn new(name: PgIdent, owner: PgIdent) -> Self { - Self { - name, - owner, - options: None, - } - } +pub trait DatabaseExt { + fn to_pg_options(&self) -> String; +} +impl DatabaseExt for Database { /// Serialize a list of database parameters into a Postgres-acceptable /// string of arguments. /// NB: `TEMPLATE` is actually also an identifier, but so far we only need /// to use `template0` and `template1`, so it is not a problem. Yet in the future /// it may require a proper quoting too. - pub fn to_pg_options(&self) -> String { + fn to_pg_options(&self) -> String { let mut params: String = self.options.as_pg_options(); write!(params, " OWNER {}", &self.owner.pg_quote()) .expect("String is documented to not to error during write operations"); @@ -182,10 +156,6 @@ impl Database { } } -/// String type alias representing Postgres identifier and -/// intended to be used for DB / role names. -pub type PgIdent = String; - /// Generic trait used to provide quoting / encoding for strings used in the /// Postgres SQL queries and DATABASE_URL. 
pub trait Escaping { @@ -226,7 +196,11 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { &[], )? .iter() - .map(|row| Database::new(row.get("datname"), row.get("owner"))) + .map(|row| Database { + name: row.get("datname"), + owner: row.get("owner"), + options: None, + }) .collect(); Ok(postgres_dbs) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index b7f15a99d1..2350113c39 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,57 +1,16 @@ -use std::collections::HashMap; use std::path::Path; use std::str::FromStr; use anyhow::Result; use postgres::config::Config; use postgres::{Client, NoTls}; -use serde::Deserialize; use tracing::{info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; -/// Cluster spec or configuration represented as an optional number of -/// delta operations + final cluster state description. -#[derive(Clone, Deserialize, Debug, Default)] -pub struct ComputeSpec { - pub format_version: f32, - pub timestamp: String, - pub operation_uuid: Option, - /// Expected cluster state at the end of transition process. - pub cluster: Cluster, - pub delta_operations: Option>, - - pub storage_auth_token: Option, - - pub startup_tracing_context: Option>, -} - -/// Cluster state seen from the perspective of the external tools -/// like Rails web console. -#[derive(Clone, Deserialize, Debug, Default)] -pub struct Cluster { - pub cluster_id: String, - pub name: String, - pub state: Option, - pub roles: Vec, - pub databases: Vec, - pub settings: GenericOptions, -} - -/// Single cluster state changing operation that could not be represented as -/// a static `Cluster` structure. For example: -/// - DROP DATABASE -/// - DROP ROLE -/// - ALTER ROLE name RENAME TO new_name -/// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Deserialize, Debug)] -pub struct DeltaOp { - pub action: String, - pub name: PgIdent, - pub new_name: Option, -} +use compute_api::spec::{ComputeSpec, Database, PgIdent, Role}; /// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` /// env variable is set, it will be used for authorization. 
diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index f48211f7ed..a63ee038c7 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -1,14 +1,13 @@ #[cfg(test)] mod pg_helpers_tests { - use std::fs::File; + use compute_api::spec::{ComputeSpec, GenericOption, GenericOptions, PgIdent}; use compute_tools::pg_helpers::*; - use compute_tools::spec::ComputeSpec; #[test] fn params_serialize() { - let file = File::open("tests/cluster_spec.json").unwrap(); + let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( @@ -23,7 +22,7 @@ mod pg_helpers_tests { #[test] fn settings_serialize() { - let file = File::open("tests/cluster_spec.json").unwrap(); + let file = File::open("../libs/compute_api/tests/cluster_spec.json").unwrap(); let spec: ComputeSpec = serde_json::from_reader(file).unwrap(); assert_eq!( diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml new file mode 100644 index 0000000000..533a091207 --- /dev/null +++ b/libs/compute_api/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "compute_api" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +anyhow.workspace = true +chrono.workspace = true +serde.workspace = true +serde_with.workspace = true +serde_json.workspace = true + +workspace_hack.workspace = true diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs new file mode 100644 index 0000000000..b660799ec0 --- /dev/null +++ b/libs/compute_api/src/lib.rs @@ -0,0 +1,3 @@ +pub mod requests; +pub mod responses; +pub mod spec; diff --git a/compute_tools/src/http/requests.rs b/libs/compute_api/src/requests.rs similarity index 76% rename from compute_tools/src/http/requests.rs rename to libs/compute_api/src/requests.rs index 2e41c7aea4..5896c7dc65 100644 --- a/compute_tools/src/http/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,7 +1,10 @@ -use serde::Deserialize; +//! Structs representing the JSON formats used in the compute_ctl's HTTP API. use crate::spec::ComputeSpec; +use serde::Deserialize; +/// Request of the /configure API +/// /// We now pass only `spec` in the configuration request, but later we can /// extend it and something like `restart: bool` or something else. So put /// `spec` into a struct initially to be more flexible in the future. diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs new file mode 100644 index 0000000000..43289a5e3e --- /dev/null +++ b/libs/compute_api/src/responses.rs @@ -0,0 +1,66 @@ +//! Structs representing the JSON formats used in the compute_ctl's HTTP API. 
+ +use chrono::{DateTime, Utc}; +use serde::{Serialize, Serializer}; + +#[derive(Serialize, Debug)] +pub struct GenericAPIError { + pub error: String, +} + +/// Response of the /status API +#[derive(Serialize, Debug)] +#[serde(rename_all = "snake_case")] +pub struct ComputeStatusResponse { + pub tenant: String, + pub timeline: String, + pub status: ComputeStatus, + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +#[derive(Serialize)] +#[serde(rename_all = "snake_case")] +pub struct ComputeState { + pub status: ComputeStatus, + /// Timestamp of the last Postgres activity + #[serde(serialize_with = "rfc3339_serialize")] + pub last_active: DateTime, + pub error: Option, +} + +#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ComputeStatus { + // Spec wasn't provided at start, waiting for it to be + // provided by control-plane. + Empty, + // Compute configuration was requested. + ConfigurationPending, + // Compute node has spec and initial startup and + // configuration is in progress. + Init, + // Compute is configured and running. + Running, + // Either startup or configuration failed, + // compute will exit soon or is waiting for + // control-plane to terminate it. + Failed, +} + +fn rfc3339_serialize(x: &DateTime, s: S) -> Result +where + S: Serializer, +{ + x.to_rfc3339().serialize(s) +} + +/// Response of the /metrics.json API +#[derive(Clone, Debug, Default, Serialize)] +pub struct ComputeMetrics { + pub sync_safekeepers_ms: u64, + pub basebackup_ms: u64, + pub config_ms: u64, + pub total_startup_ms: u64, +} diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs new file mode 100644 index 0000000000..37fe133b68 --- /dev/null +++ b/libs/compute_api/src/spec.rs @@ -0,0 +1,94 @@ +//! `ComputeSpec` represents the contents of the spec.json file. +//! +//! The spec.json file is used to pass information to 'compute_ctl'. It contains +//! all the information needed to start up the right version of PostgreSQL, +//! and connect it to the storage nodes. +use serde::Deserialize; +use std::collections::HashMap; + +/// String type alias representing Postgres identifier and +/// intended to be used for DB / role names. +pub type PgIdent = String; + +/// Cluster spec or configuration represented as an optional number of +/// delta operations + final cluster state description. +#[derive(Clone, Debug, Default, Deserialize)] +pub struct ComputeSpec { + pub format_version: f32, + pub timestamp: String, + pub operation_uuid: Option, + /// Expected cluster state at the end of transition process. + pub cluster: Cluster, + pub delta_operations: Option>, + + pub storage_auth_token: Option, + + pub startup_tracing_context: Option>, +} + +#[derive(Clone, Debug, Default, Deserialize)] +pub struct Cluster { + pub cluster_id: String, + pub name: String, + pub state: Option, + pub roles: Vec, + pub databases: Vec, + pub settings: GenericOptions, +} + +/// Single cluster state changing operation that could not be represented as +/// a static `Cluster` structure. For example: +/// - DROP DATABASE +/// - DROP ROLE +/// - ALTER ROLE name RENAME TO new_name +/// - ALTER DATABASE name RENAME TO new_name +#[derive(Clone, Debug, Deserialize)] +pub struct DeltaOp { + pub action: String, + pub name: PgIdent, + pub new_name: Option, +} + +/// Rust representation of Postgres role info with only those fields +/// that matter for us. 
+#[derive(Clone, Debug, Deserialize)] +pub struct Role { + pub name: PgIdent, + pub encrypted_password: Option, + pub options: GenericOptions, +} + +/// Rust representation of Postgres database info with only those fields +/// that matter for us. +#[derive(Clone, Debug, Deserialize)] +pub struct Database { + pub name: PgIdent, + pub owner: PgIdent, + pub options: GenericOptions, +} + +/// Common type representing both SQL statement params with or without value, +/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config +/// options like `wal_level = logical`. +#[derive(Clone, Debug, Deserialize)] +pub struct GenericOption { + pub name: String, + pub value: Option, + pub vartype: String, +} + +/// Optional collection of `GenericOption`'s. Type alias allows us to +/// declare a `trait` on it. +pub type GenericOptions = Option>; + +#[cfg(test)] +mod tests { + use super::*; + use std::fs::File; + + #[test] + fn parse_spec_file() { + let file = File::open("tests/cluster_spec.json").unwrap(); + let _spec: ComputeSpec = serde_json::from_reader(file).unwrap(); + } +} diff --git a/compute_tools/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json similarity index 100% rename from compute_tools/tests/cluster_spec.json rename to libs/compute_api/tests/cluster_spec.json From 98df7db09483d954792644756d804a300210d41e Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 10 Apr 2023 23:41:16 +0300 Subject: [PATCH 26/31] Support aarch64 in walredo seccomp code Aarch64 doesn't implement some old syscalls like open and select. Use openat instead of open to check if seccomp is supported. Leave both select and pselect6 in the allowlist since we don't call select syscall directly and may hope that libc will call pselect6 on aarch64. To check whether some syscall is supported it is possible to use `scmp_sys_resolver` from seccopm package: ``` > apt install seccopm > scmp_sys_resolver -a x86_64 select 23 > scmp_sys_resolver -a aarch64 select -10101 > scmp_sys_resolver -a aarch64 pselect6 72 ``` Negative value means that syscall is not supported. Another cross-check is to look up for the actuall syscall table in `unistd.h`. To resolve all the macroses one can use `gcc -E` as it is done in `dump_sys_aarch64()` function in libseccomp/src/arch-syscall-validate. --- pgxn/neon_walredo/seccomp.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c index 5d5ba549ef..1a9121734e 100644 --- a/pgxn/neon_walredo/seccomp.c +++ b/pgxn/neon_walredo/seccomp.c @@ -9,6 +9,13 @@ * To prevent this, it has been decided to limit possible interactions * with the outside world using the Secure Computing BPF mode. * + * This code is intended to support both x86_64 and aarch64. The latter + * doesn't implement some syscalls like open and select. So instead of + * open we use openat. We add both select (absend on aarch64) and + * pselect6 (present on both architectures) to the allowlist. Since we + * don't call select syscall directly we may expect that libc will call + * pselect6 on aarch64. + * * We use this mode to disable all syscalls not in the allowlist. This * approach has its pros & cons: * @@ -122,9 +129,10 @@ seccomp_load_rules(PgSeccompRule *rules, int count) /* * First, check that open of a well-known file works. - * XXX: We use raw syscall() to call the very open(). + * XXX: We use raw syscall() to call the very openat() which is + * present both on x86_64 and on aarch64. 
*/ - fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + fd = syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), @@ -142,8 +150,8 @@ seccomp_load_rules(PgSeccompRule *rules, int count) (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not load test trap"))); - /* Finally, check that open() now raises SIGSYS */ - (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); + /* Finally, check that openat() now raises SIGSYS */ + (void) syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), From 83549a8d409a35a8b6f0233c867c2088bb5e665c Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 11 Apr 2023 00:08:01 +0300 Subject: [PATCH 27/31] Revert "Support aarch64 in walredo seccomp code" This reverts commit 98df7db09483d954792644756d804a300210d41e. --- pgxn/neon_walredo/seccomp.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/pgxn/neon_walredo/seccomp.c b/pgxn/neon_walredo/seccomp.c index 1a9121734e..5d5ba549ef 100644 --- a/pgxn/neon_walredo/seccomp.c +++ b/pgxn/neon_walredo/seccomp.c @@ -9,13 +9,6 @@ * To prevent this, it has been decided to limit possible interactions * with the outside world using the Secure Computing BPF mode. * - * This code is intended to support both x86_64 and aarch64. The latter - * doesn't implement some syscalls like open and select. So instead of - * open we use openat. We add both select (absend on aarch64) and - * pselect6 (present on both architectures) to the allowlist. Since we - * don't call select syscall directly we may expect that libc will call - * pselect6 on aarch64. - * * We use this mode to disable all syscalls not in the allowlist. This * approach has its pros & cons: * @@ -129,10 +122,9 @@ seccomp_load_rules(PgSeccompRule *rules, int count) /* * First, check that open of a well-known file works. - * XXX: We use raw syscall() to call the very openat() which is - * present both on x86_64 and on aarch64. + * XXX: We use raw syscall() to call the very open(). 
*/ - fd = syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); + fd = syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); if (seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), @@ -150,8 +142,8 @@ seccomp_load_rules(PgSeccompRule *rules, int count) (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not load test trap"))); - /* Finally, check that openat() now raises SIGSYS */ - (void) syscall(SCMP_SYS(openat), AT_FDCWD, "/dev/null", O_RDONLY, 0); + /* Finally, check that open() now raises SIGSYS */ + (void) syscall(SCMP_SYS(open), "/dev/null", O_RDONLY, 0); if (!seccomp_test_sighandler_done) ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), From 22c890b71c2a0547a48f1056f7d17acf432dd9a1 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Mon, 10 Apr 2023 20:16:12 +0300 Subject: [PATCH 28/31] Add more cnames to proxies --- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 1 + .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 1 + .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 1 + .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 1 + 4 files changed, 4 insertions(+) diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index aa5be89101..36dac8309d 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" + extraDomains: ["*.ap-southeast-1.retooldb.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 083af6aa2d..f5b2f31cb9 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" + extraDomains: ["*.eu-central-1.retooldb.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 40fbc52b39..0be78d868a 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -24,6 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" + extraDomains: ["*.us-east-2.retooldb.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 810a6a5f78..79115be0e2 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -24,6 +24,7 @@ settings: authBackend: 
"console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" + extraDomains: ["*.us-west-2.retooldb.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" From 7ad5a5e847ff9f007fe3dcaffe9f8a403297a995 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 10 Apr 2023 22:13:53 +0400 Subject: [PATCH 29/31] Enable timeout on reading from socket in safekeeper WAL service. TCP_KEEPALIVE is not enabled by default, so this prevents hanged up connections in case of abrupt client termination. Add 'closed' flag to PostgresBackendReader and pass it during handles join to prevent attempts to read from socket if we errored out previously -- now with timeouts this is a common situation. It looks like 2023-04-10T18:08:37.493448Z INFO {cid=68}:WAL receiver{ttid=59f91ad4e821ab374f9ccdf918da3a85/16438f99d61572c72f0c7b0ed772785d}: terminated: timed out Presumably fixes https://github.com/neondatabase/neon/issues/3971 --- Cargo.lock | 1 + Cargo.toml | 1 + libs/postgres_backend/src/lib.rs | 47 ++++++++++++++++----- safekeeper/Cargo.toml | 1 + safekeeper/src/wal_service.rs | 70 +++++++++++++++++++------------- 5 files changed, 80 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c5b64b235a..5b99e93e76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3367,6 +3367,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", + "tokio-io-timeout", "tokio-postgres", "toml_edit", "tracing", diff --git a/Cargo.toml b/Cargo.toml index d563324c86..679605dc1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,6 +101,7 @@ test-context = "0.1" thiserror = "1.0" tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } +tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.9.0" tokio-rustls = "0.23" tokio-stream = "0.1" diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 60932a5950..f6bf7c6fc2 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -54,7 +54,7 @@ pub fn is_expected_io_error(e: &io::Error) -> bool { use io::ErrorKind::*; matches!( e.kind(), - ConnectionRefused | ConnectionAborted | ConnectionReset + ConnectionRefused | ConnectionAborted | ConnectionReset | TimedOut ) } @@ -320,9 +320,17 @@ impl PostgresBackend { if let ProtoState::Closed = self.state { Ok(None) } else { - let m = self.framed.read_message().await?; - trace!("read msg {:?}", m); - Ok(m) + match self.framed.read_message().await { + Ok(m) => { + trace!("read msg {:?}", m); + Ok(m) + } + Err(e) => { + // remember not to try to read anymore + self.state = ProtoState::Closed; + Err(e) + } + } } } @@ -493,7 +501,10 @@ impl PostgresBackend { MaybeWriteOnly::Full(framed) => { let (reader, writer) = framed.split(); self.framed = MaybeWriteOnly::WriteOnly(writer); - Ok(PostgresBackendReader(reader)) + Ok(PostgresBackendReader { + reader, + closed: false, + }) } MaybeWriteOnly::WriteOnly(_) => { anyhow::bail!("PostgresBackend is already split") @@ -510,8 +521,12 @@ impl PostgresBackend { anyhow::bail!("PostgresBackend is not split") } MaybeWriteOnly::WriteOnly(writer) => { - let joined = Framed::unsplit(reader.0, writer); + let joined = Framed::unsplit(reader.reader, writer); self.framed = MaybeWriteOnly::Full(joined); + // if reader encountered connection error, do not attempt reading anymore + if reader.closed { + self.state = ProtoState::Closed; + } Ok(()) } 
MaybeWriteOnly::Broken => panic!("unsplit on framed in invalid state"), @@ -797,15 +812,25 @@ impl PostgresBackend { } } -pub struct PostgresBackendReader(FramedReader>); +pub struct PostgresBackendReader { + reader: FramedReader>, + closed: bool, // true if received error closing the connection +} impl PostgresBackendReader { /// Read full message or return None if connection is cleanly closed with no /// unprocessed data. pub async fn read_message(&mut self) -> Result, ConnectionError> { - let m = self.0.read_message().await?; - trace!("read msg {:?}", m); - Ok(m) + match self.reader.read_message().await { + Ok(m) => { + trace!("read msg {:?}", m); + Ok(m) + } + Err(e) => { + self.closed = true; + Err(e) + } + } } /// Get CopyData contents of the next message in COPY stream or error @@ -923,7 +948,7 @@ pub enum CopyStreamHandlerEnd { #[error("EOF on COPY stream")] EOF, /// The connection was lost - #[error(transparent)] + #[error("connection error: {0}")] Disconnected(#[from] ConnectionError), /// Some other error #[error(transparent)] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 8b0733832a..00cd111da5 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -30,6 +30,7 @@ serde_with.workspace = true signal-hook.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["fs"] } +tokio-io-timeout.workspace = true tokio-postgres.workspace = true toml_edit.workspace = true tracing.workspace = true diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 22f50c3428..fb0d77a9f2 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,8 +4,9 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, thread}; +use std::{future, thread, time::Duration}; use tokio::net::TcpStream; +use tokio_io_timeout::TimeoutReader; use tracing::*; use utils::measured_stream::MeasuredStream; @@ -67,41 +68,52 @@ fn handle_socket( let runtime = tokio::runtime::Builder::new_current_thread() .enable_all() .build()?; - let local = tokio::task::LocalSet::new(); socket.set_nodelay(true)?; let peer_addr = socket.peer_addr()?; - let traffic_metrics = TrafficMetrics::new(); - if let Some(current_az) = conf.availability_zone.as_deref() { - traffic_metrics.set_sk_az(current_az); - } + // TimeoutReader wants async runtime during creation. + runtime.block_on(async move { + // Set timeout on reading from the socket. It prevents hanged up connection + // if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by + // default, and tokio doesn't provide ability to set it out of the box. + let mut socket = TimeoutReader::new(socket); + let wal_service_timeout = Duration::from_secs(60 * 10); + socket.set_timeout(Some(wal_service_timeout)); + // pin! is here because TimeoutReader (due to storing sleep future inside) + // is not Unpin, and all pgbackend/framed/tokio dependencies require stream + // to be Unpin. Which is reasonable, as indeed something like TimeoutReader + // shouldn't be moved. 
+ tokio::pin!(socket); - let socket = MeasuredStream::new( - socket, - |cnt| { - traffic_metrics.observe_read(cnt); - }, - |cnt| { - traffic_metrics.observe_write(cnt); - }, - ); + let traffic_metrics = TrafficMetrics::new(); + if let Some(current_az) = conf.availability_zone.as_deref() { + traffic_metrics.set_sk_az(current_az); + } - let auth_type = match conf.auth { - None => AuthType::Trust, - Some(_) => AuthType::NeonJWT, - }; - let mut conn_handler = - SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); - let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; - // libpq protocol between safekeeper and walproposer / pageserver - // We don't use shutdown. - local.block_on( - &runtime, - pgbackend.run(&mut conn_handler, future::pending::<()>), - )?; + let socket = MeasuredStream::new( + socket, + |cnt| { + traffic_metrics.observe_read(cnt); + }, + |cnt| { + traffic_metrics.observe_write(cnt); + }, + ); - Ok(()) + let auth_type = match conf.auth { + None => AuthType::Trust, + Some(_) => AuthType::NeonJWT, + }; + let mut conn_handler = + SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone())); + let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; + // libpq protocol between safekeeper and walproposer / pageserver + // We don't use shutdown. + pgbackend + .run(&mut conn_handler, future::pending::<()>) + .await + }) } /// Unique WAL service connection ids are logged in spans for observability. From c79d5a947cde6f0d6e0781dca6cd20480a3d7762 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 11 Apr 2023 10:58:04 +0100 Subject: [PATCH 30/31] Nightly Benchmarks: run third-party benchmarks once a week (#3987) --- .github/workflows/benchmarking.yml | 101 ++++++++++++++++++----------- 1 file changed, 63 insertions(+), 38 deletions(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2aeea6eca4..028fe8d8ad 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -107,28 +107,65 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + generate-matrices: + # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) + # + # Available platforms: + # - neon-captest-new: Freshly created project (1 CU) + # - neon-captest-freetier: Use freetier-sized compute (0.25 CU) + # - neon-captest-reuse: Reusing existing project + # - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs + # - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage + runs-on: ubuntu-latest + outputs: + pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }} + olap-compare-matrix: ${{ steps.olap-compare-matrix.outputs.matrix }} + + steps: + - name: Generate matrix for pgbench benchmark + id: pgbench-compare-matrix + run: | + matrix='{ + "platform": [ + "neon-captest-new", + "neon-captest-reuse" + ], + "db_size": [ "10gb" ], + "include": [ + { "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "platform": "neon-captest-new", "db_size": "50gb" } + ] + }' + + if [ "$(date +%A)" = "Saturday" ]; then + matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, + { "platform": "rds-aurora", "db_size": "50gb"}]') + fi + + echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT + + - name: Generate matrix for OLAP benchmarks + id: olap-compare-matrix + run: | + matrix='{ + 
"platform": [ + "neon-captest-reuse" + ] + }' + + if [ "$(date +%A)" = "Saturday" ]; then + matrix=$(echo $matrix | jq '.include += [{ "platform": "rds-postgres" }, + { "platform": "rds-aurora" }]') + fi + + echo "matrix=$(echo $matrix | jq --compact-output '.')" >> $GITHUB_OUTPUT + pgbench-compare: + needs: [ generate-matrices ] + strategy: fail-fast: false - matrix: - # neon-captest-freetier: Run pgbench with freetier-limited compute - # neon-captest-new: Run pgbench in a freshly created project - # neon-captest-reuse: Same, but reusing existing project - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, neon-captest-new, rds-postgres ] - db_size: [ 10gb ] - runner: [ us-east-2 ] - include: - - platform: neon-captest-freetier - db_size: 3gb - runner: us-east-2 - - platform: neon-captest-new - db_size: 50gb - runner: us-east-2 - - platform: rds-aurora - db_size: 50gb - runner: us-east-2 + matrix: ${{fromJson(needs.generate-matrices.outputs.pgbench-compare-matrix)}} env: TEST_PG_BENCH_DURATIONS_MATRIX: "60m" @@ -140,7 +177,7 @@ jobs: SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }} PLATFORM: ${{ matrix.platform }} - runs-on: [ self-hosted, "${{ matrix.runner }}", x64 ] + runs-on: [ self-hosted, us-east-2, x64 ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -269,15 +306,11 @@ jobs: # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB if: success() || failure() - needs: [ pgbench-compare ] + needs: [ generate-matrices, pgbench-compare ] strategy: fail-fast: false - matrix: - # neon-captest-reuse: We have pre-created projects 1 CU - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -369,15 +402,11 @@ jobs: # # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) if: success() || failure() - needs: [ clickbench-compare ] + needs: [ generate-matrices, clickbench-compare ] strategy: fail-fast: false - matrix: - # neon-captest-reuse: We have pre-created projects 1 CU - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install @@ -463,15 +492,11 @@ jobs: user-examples-compare: if: success() || failure() - needs: [ tpch-compare ] + needs: [ generate-matrices, tpch-compare ] strategy: fail-fast: false - matrix: - # neon-captest-reuse: We have pre-created projects with 1 CU - # rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs - # rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage - platform: [ neon-captest-reuse, rds-postgres, rds-aurora ] + matrix: ${{ fromJson(needs.generate-matrices.outputs.olap-compare-matrix) }} env: POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install From 
4ed51ad33b12edfec2e03b47f8fcfb7e7b8173cb Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 11 Apr 2023 12:50:10 +0300 Subject: [PATCH 31/31] Add more proxy cnames --- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 2 +- .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 36dac8309d..5a98217bae 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" - extraDomains: ["*.ap-southeast-1.retooldb.com"] + extraDomains: ["*.ap-southeast-1.retooldb.com", "*.ap-southeast-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index f5b2f31cb9..a9ee49d82f 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" - extraDomains: ["*.eu-central-1.retooldb.com"] + extraDomains: ["*.eu-central-1.retooldb.com", "*.eu-central-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 0be78d868a..239a9911c7 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" - extraDomains: ["*.us-east-2.retooldb.com"] + extraDomains: ["*.us-east-2.retooldb.com", "*.us-east-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 79115be0e2..c987ae236a 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" - extraDomains: ["*.us-west-2.retooldb.com"] + extraDomains: ["*.us-west-2.retooldb.com", "*.us-west-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
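
Editor's illustration for PATCH 29/31 ("Enable timeout on reading from socket in safekeeper WAL service"): the essential mechanic of that patch is wrapping the accepted TcpStream in tokio_io_timeout::TimeoutReader (added to Cargo.toml as tokio-io-timeout = "1.2.0"), setting a read timeout, and pinning the wrapper because it is !Unpin. The following is a minimal standalone sketch of that pattern, not the safekeeper code itself; the function name, the ten-minute timeout value and the buffer size are illustrative assumptions, and only the tokio and tokio-io-timeout crates are assumed.

// Minimal sketch of the read-timeout pattern from safekeeper/src/wal_service.rs:
// wrap the socket in tokio_io_timeout::TimeoutReader so a read that stalls
// longer than the timeout fails with io::ErrorKind::TimedOut, which
// libs/postgres_backend's is_expected_io_error() now treats as an expected
// disconnect.
use std::time::Duration;

use tokio::io::AsyncReadExt;
use tokio::net::TcpStream;
use tokio_io_timeout::TimeoutReader;

async fn read_with_timeout(socket: TcpStream) -> std::io::Result<Vec<u8>> {
    // TCP keepalive is not enabled by default, so without a read timeout a
    // peer that disappears abruptly can leave the connection hanging forever.
    let mut reader = TimeoutReader::new(socket);
    reader.set_timeout(Some(Duration::from_secs(60 * 10)));

    // TimeoutReader stores a Sleep future internally and is therefore !Unpin;
    // pin it before using it as an AsyncRead stream, just as the patch does
    // with tokio::pin!(socket).
    tokio::pin!(reader);

    // A single read: if the timer elapses first, this returns an io::Error
    // of kind TimedOut instead of blocking indefinitely.
    let mut buf = vec![0u8; 8192];
    let n = reader.read(&mut buf).await?;
    buf.truncate(n);
    Ok(buf)
}

In the actual patch the TimeoutReader-wrapped socket is additionally wrapped in MeasuredStream for traffic metrics and handed to PostgresBackend::new_from_io inside runtime.block_on; the timeout and pinning mechanics shown above are the same.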