From 268bc890ea309db615d0be7e6ef22f5d938dfa4f Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 28 Nov 2024 08:32:22 +0200 Subject: [PATCH 01/65] proxy: spawn cancellation checks in the background (#9918) ## Problem For cancellation, a connection is open during all the cancel checks. ## Summary of changes Spawn cancellation checks in the background, and close connection immediately. Use task_tracker for cancellation checks. --- proxy/src/cancellation.rs | 15 ++++++++----- proxy/src/console_redirect_proxy.rs | 35 +++++++++++++++++++++-------- proxy/src/proxy/mod.rs | 35 +++++++++++++++++++++-------- proxy/src/redis/notifications.rs | 2 +- proxy/src/serverless/mod.rs | 9 ++++++++ proxy/src/serverless/websocket.rs | 3 +++ 6 files changed, 75 insertions(+), 24 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 74415f1ffe..91e198bf88 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -99,16 +99,17 @@ impl CancellationHandler

{ /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query + /// return Result primarily for tests pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, - peer_addr: &IpAddr, + peer_addr: IpAddr, check_allowed: bool, ) -> Result<(), CancelError> { // TODO: check for unspecified address is only for backward compatibility, should be removed if !peer_addr.is_unspecified() { - let subnet_key = match *peer_addr { + let subnet_key = match peer_addr { IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), }; @@ -141,9 +142,11 @@ impl CancellationHandler

{ return Ok(()); } - match self.client.try_publish(key, session_id, *peer_addr).await { + match self.client.try_publish(key, session_id, peer_addr).await { Ok(()) => {} // do nothing Err(e) => { + // log it here since cancel_session could be spawned in a task + tracing::error!("failed to publish cancellation key: {key}, error: {e}"); return Err(CancelError::IO(std::io::Error::new( std::io::ErrorKind::Other, e.to_string(), @@ -154,8 +157,10 @@ impl CancellationHandler

{ }; if check_allowed - && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice()) + && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice()) { + // log it here since cancel_session could be spawned in a task + tracing::warn!("IP is not allowed to cancel the query: {key}"); return Err(CancelError::IpNotAllowed); } @@ -306,7 +311,7 @@ mod tests { cancel_key: 0, }, Uuid::new_v4(), - &("127.0.0.1".parse().unwrap()), + "127.0.0.1".parse().unwrap(), true, ) .await diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index b910b524b1..8f78df1964 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -35,6 +35,7 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -48,6 +49,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); @@ -96,6 +98,7 @@ pub async fn task_main( cancellation_handler, socket, conn_gauge, + cancellations, ) .instrument(ctx.span()) .boxed() @@ -127,10 +130,12 @@ pub async fn task_main( } connections.close(); + cancellations.close(); drop(listener); // Drain connections connections.wait().await; + cancellations.wait().await; Ok(()) } @@ -142,6 +147,7 @@ pub(crate) async fn handle_client( cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), @@ -161,15 +167,26 @@ pub(crate) async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancellation_handler - .cancel_session( - cancel_key_data, - ctx.session_id(), - &ctx.peer_addr(), - config.authentication_config.ip_allowlist_check_enabled, - ) - .await - .map(|()| None)?) + // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session_id = ctx.session_id(); + let peer_ip = ctx.peer_addr(); + async move { + drop( + cancellation_handler_clone + .cancel_session( + cancel_key_data, + session_id, + peer_ip, + config.authentication_config.ip_allowlist_check_enabled, + ) + .await, + ); + } + }); + + return Ok(None); } }; drop(pause); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 7fe67e43de..956036d29d 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -69,6 +69,7 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -82,6 +83,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); @@ -133,6 +135,7 @@ pub async fn task_main( ClientMode::Tcp, endpoint_rate_limiter2, conn_gauge, + cancellations, ) .instrument(ctx.span()) .boxed() @@ -164,10 +167,12 @@ pub async fn task_main( } connections.close(); + cancellations.close(); drop(listener); // Drain connections connections.wait().await; + cancellations.wait().await; Ok(()) } @@ -250,6 +255,7 @@ pub(crate) async fn handle_client( mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), @@ -270,15 +276,26 @@ pub(crate) async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancellation_handler - .cancel_session( - cancel_key_data, - ctx.session_id(), - &ctx.peer_addr(), - config.authentication_config.ip_allowlist_check_enabled, - ) - .await - .map(|()| None)?) + // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session_id = ctx.session_id(); + let peer_ip = ctx.peer_addr(); + async move { + drop( + cancellation_handler_clone + .cancel_session( + cancel_key_data, + session_id, + peer_ip, + config.authentication_config.ip_allowlist_check_enabled, + ) + .await, + ); + } + }); + + return Ok(None); } }; drop(pause); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 65008ae943..9ac07b7e90 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -149,7 +149,7 @@ impl MessageHandler { .cancel_session( cancel_session.cancel_key_data, uuid::Uuid::nil(), - &peer_addr, + peer_addr, cancel_session.peer_addr.is_some(), ) .await diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 77025f419d..80b42f9e55 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -132,6 +132,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { let (conn, peer_addr) = res.context("could not accept TCP stream")?; if let Err(e) = conn.set_nodelay(true) { @@ -160,6 +161,7 @@ pub async fn task_main( let connections2 = connections.clone(); let cancellation_handler = cancellation_handler.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellations = cancellations.clone(); connections.spawn( async move { let conn_token2 = conn_token.clone(); @@ -188,6 +190,7 @@ pub async fn task_main( config, backend, connections2, + cancellations, cancellation_handler, endpoint_rate_limiter, conn_token, @@ -313,6 +316,7 @@ async fn connection_handler( config: &'static ProxyConfig, backend: Arc, connections: TaskTracker, + cancellations: TaskTracker, cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, @@ -353,6 +357,7 @@ async fn connection_handler( // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. // By spawning the future, we ensure it never gets cancelled until it decides to. + let cancellations = cancellations.clone(); let handler = connections.spawn( request_handler( req, @@ -364,6 +369,7 @@ async fn connection_handler( conn_info2.clone(), http_request_token, endpoint_rate_limiter.clone(), + cancellations, ) .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), @@ -411,6 +417,7 @@ async fn request_handler( // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellations: TaskTracker, ) -> Result>, ApiError> { let host = request .headers() @@ -436,6 +443,7 @@ async fn request_handler( let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; + let cancellations = cancellations.clone(); ws_connections.spawn( async move { if let Err(e) = websocket::serve_websocket( @@ -446,6 +454,7 @@ async fn request_handler( cancellation_handler, endpoint_rate_limiter, host, + cancellations, ) .await { diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 4088fea835..bdb83fe6be 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -123,6 +123,7 @@ impl AsyncBufRead for WebSocketRw { } } +#[allow(clippy::too_many_arguments)] pub(crate) async fn serve_websocket( config: &'static ProxyConfig, auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -131,6 +132,7 @@ pub(crate) async fn serve_websocket( cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> anyhow::Result<()> { let websocket = websocket.await?; let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); @@ -149,6 +151,7 @@ pub(crate) async fn serve_websocket( ClientMode::Websockets { hostname }, endpoint_rate_limiter, conn_gauge, + cancellations, )) .await; From 03d90bc0b353dcbdaa1adc226e47369f4d680276 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 28 Nov 2024 10:11:08 +0000 Subject: [PATCH 02/65] remote_storage/abs: count 404 and 304 for get as ok for metrics (#9912) ## Problem We currently see elevated levels of errors for GetBlob requests. This is because 404 and 304 are counted as errors for metric reporting. ## Summary of Changes Bring the implementation in line with the S3 client and treat 404 and 304 responses as ok for metric purposes. Related: https://github.com/neondatabase/cloud/issues/20666 --- libs/remote_storage/src/azure_blob.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index ae0a94295c..840917ef68 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -220,6 +220,11 @@ impl AzureBlobStorage { let started_at = ScopeGuard::into_inner(started_at); let outcome = match &download { Ok(_) => AttemptOutcome::Ok, + // At this level in the stack 404 and 304 responses do not indicate an error. + // There's expected cases when a blob may not exist or hasn't been modified since + // the last get (e.g. probing for timeline indices and heatmap downloads). + // Callers should handle errors if they are unexpected. + Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok, Err(_) => AttemptOutcome::Err, }; crate::metrics::BUCKET_METRICS From 2fa461b66887323889a3ee1cf538aa57cb17c49e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 28 Nov 2024 16:48:18 +0100 Subject: [PATCH 03/65] Makefile: build pg_visibility (#9922) Build the `pg_visibility` extension for use with `neon_local`. This is useful to inspect the visibility map for debugging. Touches #9914. --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index dc67b87239..9cffc74508 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install +@echo "Compiling pg_buffercache $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + +@echo "Compiling pg_visibility $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install +@echo "Compiling amcheck $*" From e401f666983e59e4763f97b921cbc1a614af6c44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 28 Nov 2024 16:49:30 +0100 Subject: [PATCH 04/65] Update rust to 1.83.0, also update cargo adjacent tools (#9926) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://releases.rs/docs/1.83.0/). Also update `cargo-hakari`, `cargo-deny`, `cargo-hack` and `cargo-nextest` to their latest versions. Prior update was in #9445. --- build-tools.Dockerfile | 18 +++++++++--------- rust-toolchain.toml | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 4f491afec5..2671702697 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -57,9 +57,9 @@ RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ chmod -R 755 /pgcopydb && \ chown -R nonroot:nonroot /pgcopydb - -COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb -COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 + +COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb +COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 # System deps # @@ -258,14 +258,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.82.0 +ENV RUSTC_VERSION=1.83.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 -ARG CARGO_HAKARI_VERSION=0.9.30 -ARG CARGO_DENY_VERSION=0.16.1 -ARG CARGO_HACK_VERSION=0.6.31 -ARG CARGO_NEXTEST_VERSION=0.9.72 +ARG CARGO_HAKARI_VERSION=0.9.33 +ARG CARGO_DENY_VERSION=0.16.2 +ARG CARGO_HACK_VERSION=0.6.33 +ARG CARGO_NEXTEST_VERSION=0.9.85 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -289,7 +289,7 @@ RUN whoami \ && cargo --version --verbose \ && rustup --version --verbose \ && rustc --version --verbose \ - && clang --version + && clang --version RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \ diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 92b7929c7f..f0661a32e0 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.82.0" +channel = "1.83.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From e16439400dc07612883b603c4f51b3357220f6f5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 28 Nov 2024 17:38:47 +0000 Subject: [PATCH 05/65] pageserver: return correct LSN for interpreted proto keep alive responses (#9928) ## Problem For the interpreted proto the pageserver is not returning the correct LSN in replies to keep alive requests. This is because the interpreted protocol arm was not updating `last_rec_lsn`. ## Summary of changes * Return correct LSN in keep-alive responses * Fix shard field in wal sender traces --- .../tenant/timeline/walreceiver/walreceiver_connection.rs | 4 ++++ safekeeper/src/handler.rs | 5 +++-- safekeeper/src/wal_service.rs | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 31cf1b6307..d90ffbfa2c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -454,6 +454,10 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); + if let Some(lsn) = next_record_lsn { + last_rec_lsn = lsn; + } + Some(streaming_lsn) } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 22f33b17e0..8dd2929a03 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -212,8 +212,9 @@ impl postgres_backend::Handler ); if let Some(shard) = self.shard.as_ref() { - tracing::Span::current() - .record("shard", tracing::field::display(shard.shard_slug())); + if let Some(slug) = shard.shard_slug().strip_prefix("-") { + tracing::Span::current().record("shard", tracing::field::display(slug)); + } } Ok(()) diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ab54d4cce..5248d545db 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -44,7 +44,7 @@ pub async fn task_main( error!("connection handler exited: {}", err); } } - .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)), + .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty, shard = field::Empty)), ); } } From 1fb6ab59e82fd7f603b1a406474cb8092589c659 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 28 Nov 2024 19:02:57 +0000 Subject: [PATCH 06/65] test_runner: rerun all failed tests (#9917) ## Problem Currently, we rerun only known flaky tests. This approach was chosen to reduce the number of tests that go unnoticed (by forcing people to take a look at failed tests and rerun the job manually), but it has some drawbacks: - In PRs, people tend to push new changes without checking failed tests (that's ok) - In the main, tests are just restarted without checking (understandable) - Parametrised tests become flaky one by one, i.e. if `test[1]` is flaky `, test[2]` is not marked as flaky automatically (which may or may not be the case). I suggest rerunning all failed tests to increase the stability of GitHub jobs and using the Grafana Dashboard with flaky tests for deeper analysis. ## Summary of changes - Rerun all failed tests twice at max --- .../actions/run-python-test-set/action.yml | 17 +- .github/workflows/_build-and-test-locally.yml | 2 +- poetry.lock | 12 +- pyproject.toml | 2 +- scripts/flaky_tests.py | 147 ------------------ test_runner/conftest.py | 2 +- test_runner/fixtures/flaky.py | 78 ---------- test_runner/fixtures/paths.py | 2 +- test_runner/fixtures/reruns.py | 31 ++++ 9 files changed, 46 insertions(+), 247 deletions(-) delete mode 100755 scripts/flaky_tests.py delete mode 100644 test_runner/fixtures/flaky.py create mode 100644 test_runner/fixtures/reruns.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 275f161019..1159627302 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -36,8 +36,8 @@ inputs: description: 'Region name for real s3 tests' required: false default: '' - rerun_flaky: - description: 'Whether to rerun flaky tests' + rerun_failed: + description: 'Whether to rerun failed tests' required: false default: 'false' pg_version: @@ -108,7 +108,7 @@ runs: COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') - RERUN_FLAKY: ${{ inputs.rerun_flaky }} + RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | @@ -154,15 +154,8 @@ runs: EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi - if [ "${RERUN_FLAKY}" == "true" ]; then - mkdir -p $TEST_OUTPUT - poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \ - --days 7 \ - --output "$TEST_OUTPUT/flaky.json" \ - --pg-version "${DEFAULT_PG_VERSION}" \ - --build-type "${BUILD_TYPE}" - - EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" + if [ "${RERUN_FAILED}" == "true" ]; then + EXTRA_PARAMS="--reruns 2 $EXTRA_PARAMS" fi # We use pytest-split plugin to run benchmarks in parallel on different CI runners diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index bdf7c07c6a..42c32a23e3 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -293,7 +293,7 @@ jobs: run_with_real_s3: true real_s3_bucket: neon-github-ci-tests real_s3_region: eu-central-1 - rerun_flaky: true + rerun_failed: true pg_version: ${{ matrix.pg_version }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/poetry.lock b/poetry.lock index e2fca7be47..59ae5cf1ca 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2563,18 +2563,18 @@ pytest = "*" [[package]] name = "pytest-rerunfailures" -version = "13.0" +version = "15.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"}, - {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"}, + {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"}, + {file = "pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"}, ] [package.dependencies] packaging = ">=17.1" -pytest = ">=7" +pytest = ">=7.4,<8.2.2 || >8.2.2" [[package]] name = "pytest-split" @@ -3524,4 +3524,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486" +content-hash = "426c385df93f578ba3537c40a269535e27fbcca1978b3cf266096ecbc298c6a9" diff --git a/pyproject.toml b/pyproject.toml index ccd3ab1864..01d15ee6bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" aiohttp = "3.10.11" -pytest-rerunfailures = "^13.0" +pytest-rerunfailures = "^15.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py deleted file mode 100755 index 3fb668ed2d..0000000000 --- a/scripts/flaky_tests.py +++ /dev/null @@ -1,147 +0,0 @@ -#! /usr/bin/env python3 - -from __future__ import annotations - -import argparse -import json -import logging -import os -from collections import defaultdict -from typing import TYPE_CHECKING - -import psycopg2 -import psycopg2.extras -import toml - -if TYPE_CHECKING: - from typing import Any - -FLAKY_TESTS_QUERY = """ - SELECT - DISTINCT parent_suite, suite, name - FROM results - WHERE - started_at > CURRENT_DATE - INTERVAL '%s' day - AND ( - (status IN ('failed', 'broken') AND reference = 'refs/heads/main') - OR flaky - ) - ; -""" - - -def main(args: argparse.Namespace): - connstr = args.connstr - interval_days = args.days - output = args.output - - build_type = args.build_type - pg_version = args.pg_version - - res: defaultdict[str, defaultdict[str, dict[str, bool]]] - res = defaultdict(lambda: defaultdict(dict)) - - try: - logging.info("connecting to the database...") - with psycopg2.connect(connstr, connect_timeout=30) as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - logging.info("fetching flaky tests...") - cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) - rows = cur.fetchall() - except psycopg2.OperationalError as exc: - logging.error("cannot fetch flaky tests from the DB due to an error", exc) - rows = [] - - # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. not empty, not tokio-epoll-uring), - # use it to parametrize test name along with build_type and pg_version - # - # See test_runner/fixtures/parametrize.py for details - if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( - "", - "tokio-epoll-uring", - ): - pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" - else: - pageserver_virtual_file_io_engine_parameter = "" - - # re-use existing records of flaky tests from before parametrization by compaction_algorithm - def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: - """Duplicated from parametrize.py""" - toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") - if toml_table is None: - return None - v = toml.loads(toml_table) - assert isinstance(v, dict) - return v - - pageserver_default_tenant_config_compaction_algorithm_parameter = "" - if ( - explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() - ) is not None: - pageserver_default_tenant_config_compaction_algorithm_parameter = ( - f"-{explicit_default['kind']}" - ) - - for row in rows: - # We don't want to automatically rerun tests in a performance suite - if row["parent_suite"] != "test_runner.regress": - continue - - if row["name"].endswith("]"): - parametrized_test = row["name"].replace( - "[", - f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", - ) - else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" - - res[row["parent_suite"]][row["suite"]][parametrized_test] = True - - logging.info( - f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}" - ) - - logging.info(f"saving results to {output.name}") - json.dump(res, output, indent=2) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days") - parser.add_argument( - "--output", - type=argparse.FileType("w"), - default="flaky.json", - help="path to output json file (default: flaky.json)", - ) - parser.add_argument( - "--days", - required=False, - default=10, - type=int, - help="how many days to look back for flaky tests (default: 10)", - ) - parser.add_argument( - "--build-type", - required=True, - type=str, - help="for which build type to create list of flaky tests (debug or release)", - ) - parser.add_argument( - "--pg-version", - required=True, - type=int, - help="for which Postgres version to create list of flaky tests (14, 15, etc.)", - ) - parser.add_argument( - "connstr", - help="connection string to the test results database", - ) - args = parser.parse_args() - - level = logging.INFO - logging.basicConfig( - format="%(message)s", - level=level, - ) - - main(args) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 84eda52d33..887bfef478 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -13,5 +13,5 @@ pytest_plugins = ( "fixtures.pg_stats", "fixtures.compare_fixtures", "fixtures.slow", - "fixtures.flaky", + "fixtures.reruns", ) diff --git a/test_runner/fixtures/flaky.py b/test_runner/fixtures/flaky.py deleted file mode 100644 index 01634a29c5..0000000000 --- a/test_runner/fixtures/flaky.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import annotations - -import json -from collections.abc import MutableMapping -from pathlib import Path -from typing import TYPE_CHECKING, cast - -import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser -from allure_commons.types import LabelType -from allure_pytest.utils import allure_name, allure_suite_labels - -from fixtures.log_helper import log - -if TYPE_CHECKING: - from collections.abc import MutableMapping - from typing import Any - - -""" -The plugin reruns flaky tests. -It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py` - -Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers -""" - - -def pytest_addoption(parser: Parser): - parser.addoption( - "--flaky-tests-json", - action="store", - type=Path, - help="Path to json file with flaky tests generated by scripts/flaky_tests.py", - ) - - -def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]): - if not config.getoption("--flaky-tests-json"): - return - - # Any error with getting flaky tests aren't critical, so just do not rerun any tests - flaky_json = config.getoption("--flaky-tests-json") - if not flaky_json.exists(): - return - - content = flaky_json.read_text() - try: - flaky_tests = json.loads(content) - except ValueError: - log.error(f"Can't parse {content} as json") - return - - for item in items: - # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB) - # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100 - allure_labels = dict(allure_suite_labels(item)) - parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE)) - suite = str(allure_labels.get(LabelType.SUITE)) - params = item.callspec.params if hasattr(item, "callspec") else {} - name = allure_name(item, params) - - if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False): - # Rerun 3 times = 1 original run + 2 reruns - log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times") - item.add_marker(pytest.mark.flaky(reruns=2)) - - # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns), - # we can workaround it by setting `timeout_func_only` to True[1]. - # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2], - # but we still can do it using pytest marker. - # - # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99 - # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142 - timeout_marker = item.get_closest_marker("timeout") - if timeout_marker is not None: - kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs) - kwargs["func_only"] = True diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 1c71abea19..80777d65e9 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -30,7 +30,7 @@ def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | No test_name = request.node.name test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" - # We rerun flaky tests multiple times, use a separate directory for each run. + # We rerun failed tests multiple times, use a separate directory for each run. if (suffix := getattr(request.node, "execution_count", None)) is not None: test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" diff --git a/test_runner/fixtures/reruns.py b/test_runner/fixtures/reruns.py new file mode 100644 index 0000000000..f2a25ae8f6 --- /dev/null +++ b/test_runner/fixtures/reruns.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from collections.abc import MutableMapping +from typing import TYPE_CHECKING, cast + +import pytest + +if TYPE_CHECKING: + from collections.abc import MutableMapping + from typing import Any + + from _pytest.config import Config + + +def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]): + # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns), + # we can workaround it by setting `timeout_func_only` to True[1]. + # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2], + # but we still can do it using pytest marker. + # + # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99 + # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142 + + if not config.getoption("--reruns"): + return + + for item in items: + timeout_marker = item.get_closest_marker("timeout") + if timeout_marker is not None: + kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs) + kwargs["func_only"] = True From b0822a54991825e450bcbf450599707d476ddc4b Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 28 Nov 2024 22:38:30 +0100 Subject: [PATCH 07/65] fix(compute_ctl): Allow usage of DB names with whitespaces (#9919) ## Problem We used `set_path()` to replace the database name in the connection string. It automatically does url-safe encoding if the path is not already encoded, but it does it as per the URL standard, which assumes that tabs can be safely removed from the path without changing the meaning of the URL. See, e.g., https://url.spec.whatwg.org/#concept-basic-url-parser. It also breaks for DBs with properly %-encoded names, like with `%20`, as they are kept intact, but actually should be escaped. Yet, this is not true for Postgres, where it's completely valid to have trailing tabs in the database name. I think this is the PR that caused this regression https://github.com/neondatabase/neon/pull/9717, as it switched from `postgres::config::Config` back to `set_path()`. This was fixed a while ago already [1], btw, I just haven't added a test to catch this regression back then :( ## Summary of changes This commit changes the code back to use `postgres/tokio_postgres::Config` everywhere. While on it, also do some changes around, as I had to touch this code: 1. Bump some logging from `debug` to `info` in the spec apply path. We do not use `debug` in prod, and it was tricky to understand what was going on with this bug in prod. 2. Refactor configuration concurrency calculation code so it was reusable. Yet, still keep `1` in the case of reconfiguration. The database can be actively used at this moment, so we cannot guarantee that there will be enough spare connection slots, and the underlying code won't handle connection errors properly. 3. Simplify the installed extensions code. It was spawning a blocking task inside async function, which doesn't make much sense. Instead, just have a main sync function and call it with `spawn_blocking` in the API code -- the only place we need it to be async. 4. Add regression python test to cover this and related problems in the future. Also, add more extensive testing of schema dump and DBs and roles listing API. [1]: https://github.com/neondatabase/neon/commit/4d1e48f3b9a4b7064787513fd2c455f0001f6e18 [2]: https://www.postgresql.org/message-id/flat/20151023003445.931.91267%40wrigleys.postgresql.org Resolves neondatabase/cloud#20869 --- compute_tools/src/catalog.rs | 39 ++++- compute_tools/src/compute.rs | 153 +++++++++++--------- compute_tools/src/http/api.rs | 7 +- compute_tools/src/installed_extensions.rs | 105 +++++--------- compute_tools/src/pg_helpers.rs | 11 ++ test_runner/fixtures/endpoint/http.py | 6 +- test_runner/fixtures/neon_fixtures.py | 29 ++++ test_runner/regress/test_compute_catalog.py | 111 +++++++++++++- 8 files changed, 318 insertions(+), 143 deletions(-) diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 2f6f82dd39..08ae8bf44d 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -1,4 +1,3 @@ -use compute_api::responses::CatalogObjects; use futures::Stream; use postgres::NoTls; use std::{path::Path, process::Stdio, result::Result, sync::Arc}; @@ -13,7 +12,8 @@ use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; use crate::compute::ComputeNode; -use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async}; +use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db}; +use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { let connstr = compute.connstr.clone(); @@ -43,6 +43,8 @@ pub enum SchemaDumpError { DatabaseDoesNotExist, #[error("Failed to execute pg_dump.")] IO(#[from] std::io::Error), + #[error("Unexpected error.")] + Unexpected, } // It uses the pg_dump utility to dump the schema of the specified database. @@ -60,11 +62,38 @@ pub async fn get_database_schema( let pgbin = &compute.pgbin; let basepath = Path::new(pgbin).parent().unwrap(); let pgdump = basepath.join("pg_dump"); - let mut connstr = compute.connstr.clone(); - connstr.set_path(dbname); + + // Replace the DB in the connection string and disable it to parts. + // This is the only option to handle DBs with special characters. + let conf = + postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?; + let host = conf + .get_hosts() + .first() + .ok_or(SchemaDumpError::Unexpected)?; + let host = match host { + tokio_postgres::config::Host::Tcp(ip) => ip.to_string(), + #[cfg(unix)] + tokio_postgres::config::Host::Unix(path) => path.to_string_lossy().to_string(), + }; + let port = conf + .get_ports() + .first() + .ok_or(SchemaDumpError::Unexpected)?; + let user = conf.get_user().ok_or(SchemaDumpError::Unexpected)?; + let dbname = conf.get_dbname().ok_or(SchemaDumpError::Unexpected)?; + let mut cmd = Command::new(pgdump) + // XXX: this seems to be the only option to deal with DBs with `=` in the name + // See + .env("PGDATABASE", dbname) + .arg("--host") + .arg(host) + .arg("--port") + .arg(port.to_string()) + .arg("--username") + .arg(user) .arg("--schema-only") - .arg(connstr.as_str()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .kill_on_drop(true) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 4f67425ba8..1a026a4014 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -34,9 +34,8 @@ use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; use remote_storage::{DownloadError, RemotePath}; use tokio::spawn; -use url::Url; -use crate::installed_extensions::get_installed_extensions_sync; +use crate::installed_extensions::get_installed_extensions; use crate::local_proxy; use crate::pg_helpers::*; use crate::spec::*; @@ -816,30 +815,32 @@ impl ComputeNode { Ok(()) } - async fn get_maintenance_client(url: &Url) -> Result { - let mut connstr = url.clone(); + async fn get_maintenance_client( + conf: &tokio_postgres::Config, + ) -> Result { + let mut conf = conf.clone(); - connstr - .query_pairs_mut() - .append_pair("application_name", "apply_config"); + conf.application_name("apply_config"); - let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await { + let (client, conn) = match conf.connect(NoTls).await { + // If connection fails, it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin` name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { - // connect with zenith_admin if cloud_admin could not authenticate + // Connect with zenith_admin if cloud_admin could not authenticate info!( "cannot connect to postgres: {}, retrying with `zenith_admin` username", e ); - let mut zenith_admin_connstr = connstr.clone(); - - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); + zenith_admin_conf.user("zenith_admin"); let mut client = - Client::connect(zenith_admin_connstr.as_str(), NoTls) + zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role @@ -853,8 +854,8 @@ impl ComputeNode { drop(client); - // reconnect with connstring with expected name - tokio_postgres::connect(connstr.as_str(), NoTls).await? + // Reconnect with connstring with expected name + conf.connect(NoTls).await? } _ => return Err(e.into()), }, @@ -885,7 +886,7 @@ impl ComputeNode { pub fn apply_spec_sql( &self, spec: Arc, - url: Arc, + conf: Arc, concurrency: usize, ) -> Result<()> { let rt = tokio::runtime::Builder::new_multi_thread() @@ -897,7 +898,7 @@ impl ComputeNode { rt.block_on(async { // Proceed with post-startup configuration. Note, that order of operations is important. - let client = Self::get_maintenance_client(&url).await?; + let client = Self::get_maintenance_client(&conf).await?; let spec = spec.clone(); let databases = get_existing_dbs_async(&client).await?; @@ -931,7 +932,7 @@ impl ComputeNode { RenameAndDeleteDatabases, CreateAndAlterDatabases, ] { - debug!("Applying phase {:?}", &phase); + info!("Applying phase {:?}", &phase); apply_operations( spec.clone(), ctx.clone(), @@ -942,6 +943,7 @@ impl ComputeNode { .await?; } + info!("Applying RunInEachDatabase phase"); let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); let db_processes = spec @@ -955,7 +957,7 @@ impl ComputeNode { let spec = spec.clone(); let ctx = ctx.clone(); let jwks_roles = jwks_roles.clone(); - let mut url = url.as_ref().clone(); + let mut conf = conf.as_ref().clone(); let concurrency_token = concurrency_token.clone(); let db = db.clone(); @@ -964,14 +966,14 @@ impl ComputeNode { match &db { DB::SystemDB => {} DB::UserDB(db) => { - url.set_path(db.name.as_str()); + conf.dbname(db.name.as_str()); } } - let url = Arc::new(url); + let conf = Arc::new(conf); let fut = Self::apply_spec_sql_db( spec.clone(), - url, + conf, ctx.clone(), jwks_roles.clone(), concurrency_token.clone(), @@ -1017,7 +1019,7 @@ impl ComputeNode { /// semaphore. The caller has to make sure the semaphore isn't exhausted. async fn apply_spec_sql_db( spec: Arc, - url: Arc, + conf: Arc, ctx: Arc>, jwks_roles: Arc>, concurrency_token: Arc, @@ -1046,7 +1048,7 @@ impl ComputeNode { // that database. || async { if client_conn.is_none() { - let db_client = Self::get_maintenance_client(&url).await?; + let db_client = Self::get_maintenance_client(&conf).await?; client_conn.replace(db_client); } let client = client_conn.as_ref().unwrap(); @@ -1061,34 +1063,16 @@ impl ComputeNode { Ok::<(), anyhow::Error>(()) } - /// Do initial configuration of the already started Postgres. - #[instrument(skip_all)] - pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { - // If connection fails, - // it may be the old node with `zenith_admin` superuser. - // - // In this case we need to connect with old `zenith_admin` name - // and create new user. We cannot simply rename connected user, - // but we can create a new one and grant it all privileges. - let mut url = self.connstr.clone(); - url.query_pairs_mut() - .append_pair("application_name", "apply_config"); - - let url = Arc::new(url); - let spec = Arc::new( - compute_state - .pspec - .as_ref() - .expect("spec must be set") - .spec - .clone(), - ); - - // Choose how many concurrent connections to use for applying the spec changes. - // If the cluster is not currently Running we don't have to deal with user connections, + /// Choose how many concurrent connections to use for applying the spec changes. + pub fn max_service_connections( + &self, + compute_state: &ComputeState, + spec: &ComputeSpec, + ) -> usize { + // If the cluster is in Init state we don't have to deal with user connections, // and can thus use all `max_connections` connection slots. However, that's generally not // very efficient, so we generally still limit it to a smaller number. - let max_concurrent_connections = if compute_state.status != ComputeStatus::Running { + if compute_state.status == ComputeStatus::Init { // If the settings contain 'max_connections', use that as template if let Some(config) = spec.cluster.settings.find("max_connections") { config.parse::().ok() @@ -1144,10 +1128,29 @@ impl ComputeNode { .map(|val| if val > 1 { val - 1 } else { 1 }) .last() .unwrap_or(3) - }; + } + } + + /// Do initial configuration of the already started Postgres. + #[instrument(skip_all)] + pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + + let conf = Arc::new(conf); + let spec = Arc::new( + compute_state + .pspec + .as_ref() + .expect("spec must be set") + .spec + .clone(), + ); + + let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. - self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?; + self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; if let Some(ref local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); @@ -1156,12 +1159,11 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { - let mut connstr = url.as_ref().clone(); - connstr - .query_pairs_mut() - .append_pair("application_name", "migrations"); + let conf = conf.as_ref().clone(); + let mut conf = postgres::config::Config::from(conf); + conf.application_name("migrations"); - let mut client = Client::connect(connstr.as_str(), NoTls)?; + let mut client = conf.connect(NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); @@ -1222,21 +1224,28 @@ impl ComputeNode { let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); config::write_postgres_conf(&postgresql_conf_path, &spec, None)?; - // temporarily reset max_cluster_size in config + + // TODO(ololobus): We need a concurrency during reconfiguration as well, + // but DB is already running and used by user. We can easily get out of + // `max_connections` limit, and the current code won't handle that. + // let compute_state = self.state.lock().unwrap().clone(); + // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec); + let max_concurrent_connections = 1; + + // Temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: - // creating new extensions, roles, etc... + // creating new extensions, roles, etc. config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut url = self.connstr.clone(); - url.query_pairs_mut() - .append_pair("application_name", "apply_config"); - let url = Arc::new(url); + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); - self.apply_spec_sql(spec, url, 1)?; + self.apply_spec_sql(spec, conf, max_concurrent_connections)?; } Ok(()) @@ -1362,7 +1371,17 @@ impl ComputeNode { let connstr = self.connstr.clone(); thread::spawn(move || { - get_installed_extensions_sync(connstr).context("get_installed_extensions") + let res = get_installed_extensions(&connstr); + match res { + Ok(extensions) => { + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&extensions) + .expect("failed to serialize extensions list") + ); + } + Err(err) => error!("could not get installed extensions: {err:?}"), + } }); } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 8a047634df..a6c6cff20a 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -296,7 +296,12 @@ async fn routes(req: Request, compute: &Arc) -> Response render_json(Body::from(serde_json::to_string(&res).unwrap())), Err(e) => render_json_error( diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 79d8b2ca04..f473c29a55 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -2,17 +2,16 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; use metrics::proto::MetricFamily; use std::collections::HashMap; use std::collections::HashSet; -use tracing::info; -use url::Url; use anyhow::Result; use postgres::{Client, NoTls}; -use tokio::task; use metrics::core::Collector; use metrics::{register_uint_gauge_vec, UIntGaugeVec}; use once_cell::sync::Lazy; +use crate::pg_helpers::postgres_conf_for_db; + /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. /// @@ -42,75 +41,51 @@ fn list_dbs(client: &mut Client) -> Result> { /// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. -pub async fn get_installed_extensions(connstr: Url) -> Result { - let mut connstr = connstr.clone(); +pub fn get_installed_extensions(connstr: &url::Url) -> Result { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + let databases: Vec = list_dbs(&mut client)?; - task::spawn_blocking(move || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; - let databases: Vec = list_dbs(&mut client)?; + let mut extensions_map: HashMap = HashMap::new(); + for db in databases.iter() { + let config = postgres_conf_for_db(connstr, db)?; + let mut db_client = config.connect(NoTls)?; + let extensions: Vec<(String, String)> = db_client + .query( + "SELECT extname, extversion FROM pg_catalog.pg_extension;", + &[], + )? + .iter() + .map(|row| (row.get("extname"), row.get("extversion"))) + .collect(); - let mut extensions_map: HashMap = HashMap::new(); - for db in databases.iter() { - connstr.set_path(db); - let mut db_client = Client::connect(connstr.as_str(), NoTls)?; - let extensions: Vec<(String, String)> = db_client - .query( - "SELECT extname, extversion FROM pg_catalog.pg_extension;", - &[], - )? - .iter() - .map(|row| (row.get("extname"), row.get("extversion"))) - .collect(); + for (extname, v) in extensions.iter() { + let version = v.to_string(); - for (extname, v) in extensions.iter() { - let version = v.to_string(); + // increment the number of databases where the version of extension is installed + INSTALLED_EXTENSIONS + .with_label_values(&[extname, &version]) + .inc(); - // increment the number of databases where the version of extension is installed - INSTALLED_EXTENSIONS - .with_label_values(&[extname, &version]) - .inc(); - - extensions_map - .entry(extname.to_string()) - .and_modify(|e| { - e.versions.insert(version.clone()); - // count the number of databases where the extension is installed - e.n_databases += 1; - }) - .or_insert(InstalledExtension { - extname: extname.to_string(), - versions: HashSet::from([version.clone()]), - n_databases: 1, - }); - } + extensions_map + .entry(extname.to_string()) + .and_modify(|e| { + e.versions.insert(version.clone()); + // count the number of databases where the extension is installed + e.n_databases += 1; + }) + .or_insert(InstalledExtension { + extname: extname.to_string(), + versions: HashSet::from([version.clone()]), + n_databases: 1, + }); } + } - let res = InstalledExtensions { - extensions: extensions_map.values().cloned().collect(), - }; + let res = InstalledExtensions { + extensions: extensions_map.values().cloned().collect(), + }; - Ok(res) - }) - .await? -} - -// Gather info about installed extensions -pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create runtime"); - let result = rt - .block_on(crate::installed_extensions::get_installed_extensions( - connstr, - )) - .expect("failed to get installed extensions"); - - info!( - "[NEON_EXT_STAT] {}", - serde_json::to_string(&result).expect("failed to serialize extensions list") - ); - Ok(()) + Ok(res) } static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 4a1e5ee0e8..e03b410699 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -6,6 +6,7 @@ use std::io::{BufRead, BufReader}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; +use std::str::FromStr; use std::thread::JoinHandle; use std::time::{Duration, Instant}; @@ -13,8 +14,10 @@ use anyhow::{bail, Result}; use futures::StreamExt; use ini::Ini; use notify::{RecursiveMode, Watcher}; +use postgres::config::Config; use tokio::io::AsyncBufReadExt; use tokio::time::timeout; +use tokio_postgres; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; @@ -542,3 +545,11 @@ async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Resu Ok(()) } + +/// `Postgres::config::Config` handles database names with whitespaces +/// and special characters properly. +pub fn postgres_conf_for_db(connstr: &url::Url, dbname: &str) -> Result { + let mut conf = Config::from_str(connstr.as_str())?; + conf.dbname(dbname); + Ok(conf) +} diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index db3723b7cc..1cd9158c68 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,5 +1,7 @@ from __future__ import annotations +import urllib.parse + import requests from requests.adapters import HTTPAdapter @@ -20,7 +22,9 @@ class EndpointHttpClient(requests.Session): return res.json() def database_schema(self, database: str): - res = self.get(f"http://localhost:{self.port}/database_schema?database={database}") + res = self.get( + f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}" + ) res.raise_for_status() return res.text diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a45a311dc2..1f4d2aa5ec 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3934,6 +3934,35 @@ class Endpoint(PgProtocol, LogUtils): log.info(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) + def respec_deep(self, **kwargs: Any) -> None: + """ + Update the endpoint.json file taking into account nested keys. + It does one level deep update. Should enough for most cases. + Distinct method from respec() to do not break existing functionality. + NOTE: This method also updates the spec.json file, not endpoint.json. + We need it because neon_local also writes to spec.json, so intended + use-case is i) start endpoint with some config, ii) respec_deep(), + iii) call reconfigure() to apply the changes. + """ + config_path = os.path.join(self.endpoint_path(), "spec.json") + with open(config_path) as f: + data_dict: dict[str, Any] = json.load(f) + + log.info("Current compute spec: %s", json.dumps(data_dict, indent=4)) + + for key, value in kwargs.items(): + if isinstance(value, dict): + if key not in data_dict: + data_dict[key] = value + else: + data_dict[key] = {**data_dict[key], **value} + else: + data_dict[key] = value + + with open(config_path, "w") as file: + log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) + json.dump(data_dict, file, indent=4) + # Please note: Migrations only run if pg_skip_catalog_updates is false def wait_for_migrations(self, num_migrations: int = 11): with self.cursor() as cur: diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index d43c71ceac..b3719a45ed 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -3,13 +3,60 @@ from __future__ import annotations import requests from fixtures.neon_fixtures import NeonEnv +TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "db with spaces", + "owner": "cloud_admin", + }, + { + "name": "db with%20spaces ", + "owner": "cloud_admin", + }, + { + "name": "db with whitespaces ", + "owner": "cloud_admin", + }, + { + "name": "injective db with spaces'; SELECT pg_sleep(10);", + "owner": "cloud_admin", + }, + { + "name": "db with #pound-sign and &ersands=true", + "owner": "cloud_admin", + }, + { + "name": "db with emoji 🌍", + "owner": "cloud_admin", + }, +] + def test_compute_catalog(neon_simple_env: NeonEnv): + """ + Create a bunch of databases with tricky names and test that we can list them + and dump via API. + """ env = neon_simple_env - endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) - client = endpoint.http_client() + endpoint = env.endpoints.create_start("main") + # Update the spec.json file to include new databases + # and reconfigure the endpoint to create some test databases. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + client = endpoint.http_client() objects = client.dbs_and_roles() # Assert that 'cloud_admin' role exists in the 'roles' list @@ -22,9 +69,24 @@ def test_compute_catalog(neon_simple_env: NeonEnv): db["name"] == "postgres" for db in objects["databases"] ), "The 'postgres' database is missing" - ddl = client.database_schema(database="postgres") + # Check other databases + for test_db in TEST_DB_NAMES: + db = next((db for db in objects["databases"] if db["name"] == test_db["name"]), None) + assert db is not None, f"The '{test_db['name']}' database is missing" + assert ( + db["owner"] == test_db["owner"] + ), f"The '{test_db['name']}' database has incorrect owner" - assert "-- PostgreSQL database dump" in ddl + ddl = client.database_schema(database=test_db["name"]) + + # Check that it looks like a valid PostgreSQL dump + assert "-- PostgreSQL database dump" in ddl + + # Check that it doesn't contain health_check and migration traces. + # They are only created in system `postgres` database, so by checking + # that we ensure that we dump right databases. + assert "health_check" not in ddl, f"The '{test_db['name']}' database contains health_check" + assert "migration" not in ddl, f"The '{test_db['name']}' database contains migrations data" try: client.database_schema(database="nonexistentdb") @@ -33,3 +95,44 @@ def test_compute_catalog(neon_simple_env: NeonEnv): assert ( e.response.status_code == 404 ), f"Expected 404 status code, but got {e.response.status_code}" + + +def test_compute_create_databases(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can create and work with databases with special + characters (whitespaces, %, tabs, etc.) in the name. + """ + env = neon_simple_env + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start("main") + + # Update the spec.json file to include new databases + # and reconfigure the endpoint to apply the changes. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + for db in TEST_DB_NAMES: + # Check that database has a correct name in the system catalog + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],)) + catalog_db = cursor.fetchone() + assert catalog_db is not None + assert len(catalog_db) == 1 + assert catalog_db[0] == db["name"] + + # Check that we can connect to this database without any issues + with endpoint.cursor(dbname=db["name"]) as cursor: + cursor.execute("SELECT * FROM current_database()") + curr_db = cursor.fetchone() + assert curr_db is not None + assert len(curr_db) == 1 + assert curr_db[0] == db["name"] From e5152551ad6de0ae8516d2416cd90244d97b3088 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 29 Nov 2024 10:40:08 +0100 Subject: [PATCH 08/65] test_runner/performance: add logical message ingest benchmark (#9749) Adds a benchmark for logical message WAL ingestion throughput end-to-end. Logical messages are essentially noops, and thus ignored by the Pageserver. Example results from my MacBook, with fsync enabled: ``` postgres_ingest: 14.445 s safekeeper_ingest: 29.948 s pageserver_ingest: 30.013 s pageserver_recover_ingest: 8.633 s wal_written: 10,340 MB message_count: 1310720 messages postgres_throughput: 715 MB/s safekeeper_throughput: 345 MB/s pageserver_throughput: 344 MB/s pageserver_recover_throughput: 1197 MB/s ``` See https://github.com/neondatabase/neon/issues/9642#issuecomment-2475995205 for running analysis. Touches #9642. --- test_runner/fixtures/neon_fixtures.py | 31 ++++++ .../test_ingest_logical_message.py | 101 ++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 test_runner/performance/test_ingest_logical_message.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1f4d2aa5ec..e3c88e9965 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4404,6 +4404,10 @@ class Safekeeper(LogUtils): log.info(f"sk {self.id} flush LSN: {flush_lsn}") return flush_lsn + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + timeline_status = self.http_client().timeline_status(tenant_id, timeline_id) + return timeline_status.commit_lsn + def pull_timeline( self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId ) -> dict[str, Any]: @@ -4949,6 +4953,33 @@ def wait_for_last_flush_lsn( return min(results) +def wait_for_commit_lsn( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, +) -> Lsn: + # TODO: it would be better to poll this in the compute, but there's no API for it. See: + # https://github.com/neondatabase/neon/issues/9758 + "Wait for the given LSN to be committed on any Safekeeper" + + max_commit_lsn = Lsn(0) + for i in range(1000): + for sk in env.safekeepers: + commit_lsn = sk.get_commit_lsn(tenant, timeline) + if commit_lsn >= lsn: + log.info(f"{tenant}/{timeline} at commit_lsn {commit_lsn}") + return commit_lsn + max_commit_lsn = max(max_commit_lsn, commit_lsn) + + if i % 10 == 0: + log.info( + f"{tenant}/{timeline} waiting for commit_lsn to reach {lsn}, now {max_commit_lsn}" + ) + time.sleep(0.1) + raise Exception(f"timed out while waiting for commit_lsn to reach {lsn}, was {max_commit_lsn}") + + def flush_ep_to_pageserver( env: NeonEnv, ep: Endpoint, diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py new file mode 100644 index 0000000000..d3118eb15a --- /dev/null +++ b/test_runner/performance/test_ingest_logical_message.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_commit_lsn, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import wait_for_last_record_lsn + + +@pytest.mark.timeout(600) +@pytest.mark.parametrize("size", [1024, 8192, 131072]) +@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"]) +def test_ingest_logical_message( + request: pytest.FixtureRequest, + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + fsync: bool, + size: int, +): + """ + Benchmarks ingestion of 10 GB of logical message WAL. These are essentially noops, and don't + incur any pageserver writes. + """ + + VOLUME = 10 * 1024**3 + count = VOLUME // size + + neon_env_builder.safekeepers_enable_fsync = fsync + + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + f"fsync = {fsync}", + # Disable backpressure. We don't want to block on pageserver. + "max_replication_apply_lag = 0", + "max_replication_flush_lag = 0", + "max_replication_write_lag = 0", + ], + ) + client = env.pageserver.http_client() + + # Wait for the timeline to be propagated to the pageserver. + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + # Ingest data and measure durations. + start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + with endpoint.cursor() as cur: + cur.execute("set statement_timeout = 0") + + # Postgres will return once the logical messages have been written to its local WAL, without + # waiting for Safekeeper commit. We measure ingestion time both for Postgres, Safekeeper, + # and Pageserver to detect bottlenecks. + log.info("Ingesting data") + with zenbenchmark.record_duration("pageserver_ingest"): + with zenbenchmark.record_duration("safekeeper_ingest"): + with zenbenchmark.record_duration("postgres_ingest"): + cur.execute(f""" + select pg_logical_emit_message(false, '', repeat('x', {size})) + from generate_series(1, {count}) + """) + + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Wait for Safekeeper. + log.info("Waiting for Safekeeper to catch up") + wait_for_commit_lsn(env, env.initial_tenant, env.initial_timeline, end_lsn) + + # Wait for Pageserver. + log.info("Waiting for Pageserver to catch up") + wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will + # reingest all the WAL from the safekeeper without any other constraints. This gives us a + # baseline of how fast the pageserver can ingest this WAL in isolation. + status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant) + assert status is not None + + client.tenant_delete(env.initial_tenant) + env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0]) + + with zenbenchmark.record_duration("pageserver_recover_ingest"): + log.info("Recovering WAL into pageserver") + client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + # Emit metrics. + wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) + zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + zenbenchmark.record("message_count", count, "messages", MetricReport.TEST_PARAM) + + props = {p["name"]: p["value"] for _, p in request.node.user_properties} + for name in ("postgres", "safekeeper", "pageserver", "pageserver_recover"): + throughput = int(wal_written_mb / props[f"{name}_ingest"]) + zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER) From e61ec94fbc25da4368cd2c6242f66067f5cd85fe Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 29 Nov 2024 11:08:01 +0000 Subject: [PATCH 09/65] chore(proxy): vendor a subset of rust-postgres (#9930) Our rust-postgres fork is getting messy. Mostly because proxy wants more control over the raw protocol than tokio-postgres provides. As such, it's diverging more and more. Storage and compute also make use of rust-postgres, but in more normal usage, thus they don't need our crazy changes. Idea: * proxy maintains their subset * other teams use a minimal patch set against upstream rust-postgres Reviewing this code will be difficult. To implement it, I 1. Copied tokio-postgres, postgres-protocol and postgres-types from https://github.com/neondatabase/rust-postgres/tree/00940fcdb57a8e99e805297b75839e7c4c7b1796 2. Updated their package names with the `2` suffix to make them compile in the workspace. 3. Updated proxy to use those packages 4. Copied in the code from tokio-postgres-rustls 0.13 (with some patches applied https://github.com/jbg/tokio-postgres-rustls/pull/32 https://github.com/jbg/tokio-postgres-rustls/pull/33) 5. Removed as much dead code as I could find in the vendored libraries 6. Updated the tokio-postgres-rustls code to use our existing channel binding implementation --- .config/hakari.toml | 3 + Cargo.lock | 56 +- Cargo.toml | 3 + libs/proxy/README.md | 6 + libs/proxy/postgres-protocol2/Cargo.toml | 21 + .../src/authentication/mod.rs | 37 + .../src/authentication/sasl.rs | 516 +++++ .../postgres-protocol2/src/escape/mod.rs | 93 + .../postgres-protocol2/src/escape/test.rs | 17 + libs/proxy/postgres-protocol2/src/lib.rs | 78 + .../postgres-protocol2/src/message/backend.rs | 766 ++++++++ .../src/message/frontend.rs | 297 +++ .../postgres-protocol2/src/message/mod.rs | 8 + .../postgres-protocol2/src/password/mod.rs | 107 ++ .../postgres-protocol2/src/password/test.rs | 19 + .../proxy/postgres-protocol2/src/types/mod.rs | 294 +++ .../postgres-protocol2/src/types/test.rs | 87 + libs/proxy/postgres-types2/Cargo.toml | 10 + libs/proxy/postgres-types2/src/lib.rs | 477 +++++ libs/proxy/postgres-types2/src/private.rs | 34 + libs/proxy/postgres-types2/src/type_gen.rs | 1524 +++++++++++++++ libs/proxy/tokio-postgres2/Cargo.toml | 21 + .../proxy/tokio-postgres2/src/cancel_query.rs | 40 + .../tokio-postgres2/src/cancel_query_raw.rs | 29 + .../proxy/tokio-postgres2/src/cancel_token.rs | 62 + libs/proxy/tokio-postgres2/src/client.rs | 439 +++++ libs/proxy/tokio-postgres2/src/codec.rs | 109 ++ libs/proxy/tokio-postgres2/src/config.rs | 897 +++++++++ libs/proxy/tokio-postgres2/src/connect.rs | 112 ++ libs/proxy/tokio-postgres2/src/connect_raw.rs | 359 ++++ .../tokio-postgres2/src/connect_socket.rs | 65 + libs/proxy/tokio-postgres2/src/connect_tls.rs | 48 + libs/proxy/tokio-postgres2/src/connection.rs | 323 ++++ libs/proxy/tokio-postgres2/src/error/mod.rs | 501 +++++ .../tokio-postgres2/src/error/sqlstate.rs | 1670 +++++++++++++++++ .../tokio-postgres2/src/generic_client.rs | 64 + libs/proxy/tokio-postgres2/src/lib.rs | 148 ++ .../tokio-postgres2/src/maybe_tls_stream.rs | 77 + libs/proxy/tokio-postgres2/src/prepare.rs | 262 +++ libs/proxy/tokio-postgres2/src/query.rs | 340 ++++ libs/proxy/tokio-postgres2/src/row.rs | 300 +++ .../proxy/tokio-postgres2/src/simple_query.rs | 142 ++ libs/proxy/tokio-postgres2/src/statement.rs | 157 ++ libs/proxy/tokio-postgres2/src/tls.rs | 162 ++ .../proxy/tokio-postgres2/src/to_statement.rs | 57 + libs/proxy/tokio-postgres2/src/transaction.rs | 74 + .../src/transaction_builder.rs | 113 ++ libs/proxy/tokio-postgres2/src/types.rs | 6 + proxy/Cargo.toml | 6 +- proxy/src/compute.rs | 5 +- proxy/src/context/mod.rs | 1 + proxy/src/lib.rs | 1 + proxy/src/postgres_rustls/mod.rs | 158 ++ proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/serverless/backend.rs | 2 +- proxy/src/serverless/conn_pool.rs | 5 +- proxy/src/serverless/local_conn_pool.rs | 11 +- workspace_hack/Cargo.toml | 4 +- 58 files changed, 11199 insertions(+), 26 deletions(-) create mode 100644 libs/proxy/README.md create mode 100644 libs/proxy/postgres-protocol2/Cargo.toml create mode 100644 libs/proxy/postgres-protocol2/src/authentication/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/authentication/sasl.rs create mode 100644 libs/proxy/postgres-protocol2/src/escape/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/escape/test.rs create mode 100644 libs/proxy/postgres-protocol2/src/lib.rs create mode 100644 libs/proxy/postgres-protocol2/src/message/backend.rs create mode 100644 libs/proxy/postgres-protocol2/src/message/frontend.rs create mode 100644 libs/proxy/postgres-protocol2/src/message/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/password/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/password/test.rs create mode 100644 libs/proxy/postgres-protocol2/src/types/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/types/test.rs create mode 100644 libs/proxy/postgres-types2/Cargo.toml create mode 100644 libs/proxy/postgres-types2/src/lib.rs create mode 100644 libs/proxy/postgres-types2/src/private.rs create mode 100644 libs/proxy/postgres-types2/src/type_gen.rs create mode 100644 libs/proxy/tokio-postgres2/Cargo.toml create mode 100644 libs/proxy/tokio-postgres2/src/cancel_query.rs create mode 100644 libs/proxy/tokio-postgres2/src/cancel_query_raw.rs create mode 100644 libs/proxy/tokio-postgres2/src/cancel_token.rs create mode 100644 libs/proxy/tokio-postgres2/src/client.rs create mode 100644 libs/proxy/tokio-postgres2/src/codec.rs create mode 100644 libs/proxy/tokio-postgres2/src/config.rs create mode 100644 libs/proxy/tokio-postgres2/src/connect.rs create mode 100644 libs/proxy/tokio-postgres2/src/connect_raw.rs create mode 100644 libs/proxy/tokio-postgres2/src/connect_socket.rs create mode 100644 libs/proxy/tokio-postgres2/src/connect_tls.rs create mode 100644 libs/proxy/tokio-postgres2/src/connection.rs create mode 100644 libs/proxy/tokio-postgres2/src/error/mod.rs create mode 100644 libs/proxy/tokio-postgres2/src/error/sqlstate.rs create mode 100644 libs/proxy/tokio-postgres2/src/generic_client.rs create mode 100644 libs/proxy/tokio-postgres2/src/lib.rs create mode 100644 libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs create mode 100644 libs/proxy/tokio-postgres2/src/prepare.rs create mode 100644 libs/proxy/tokio-postgres2/src/query.rs create mode 100644 libs/proxy/tokio-postgres2/src/row.rs create mode 100644 libs/proxy/tokio-postgres2/src/simple_query.rs create mode 100644 libs/proxy/tokio-postgres2/src/statement.rs create mode 100644 libs/proxy/tokio-postgres2/src/tls.rs create mode 100644 libs/proxy/tokio-postgres2/src/to_statement.rs create mode 100644 libs/proxy/tokio-postgres2/src/transaction.rs create mode 100644 libs/proxy/tokio-postgres2/src/transaction_builder.rs create mode 100644 libs/proxy/tokio-postgres2/src/types.rs create mode 100644 proxy/src/postgres_rustls/mod.rs diff --git a/.config/hakari.toml b/.config/hakari.toml index b5990d090e..3b6d9d8822 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -46,6 +46,9 @@ workspace-members = [ "utils", "wal_craft", "walproposer", + "postgres-protocol2", + "postgres-types2", + "tokio-postgres2", ] # Write out exact versions rather than a semver range. (Defaults to false.) diff --git a/Cargo.lock b/Cargo.lock index 43a46fb1eb..f05c6311dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4162,6 +4162,23 @@ dependencies = [ "tokio", ] +[[package]] +name = "postgres-protocol2" +version = "0.1.0" +dependencies = [ + "base64 0.20.0", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "md-5", + "memchr", + "rand 0.8.5", + "sha2", + "stringprep", + "tokio", +] + [[package]] name = "postgres-types" version = "0.2.4" @@ -4170,8 +4187,15 @@ dependencies = [ "bytes", "fallible-iterator", "postgres-protocol", - "serde", - "serde_json", +] + +[[package]] +name = "postgres-types2" +version = "0.1.0" +dependencies = [ + "bytes", + "fallible-iterator", + "postgres-protocol2", ] [[package]] @@ -4501,7 +4525,7 @@ dependencies = [ "parquet_derive", "pbkdf2", "pin-project-lite", - "postgres-protocol", + "postgres-protocol2", "postgres_backend", "pq_proto", "prometheus", @@ -4536,8 +4560,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres", - "tokio-postgres-rustls", + "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite", "tokio-util", @@ -6421,6 +6444,7 @@ dependencies = [ "libc", "mio", "num_cpus", + "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", @@ -6502,6 +6526,26 @@ dependencies = [ "x509-certificate", ] +[[package]] +name = "tokio-postgres2" +version = "0.1.0" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures-util", + "log", + "parking_lot 0.12.1", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol2", + "postgres-types2", + "tokio", + "tokio-util", +] + [[package]] name = "tokio-rustls" version = "0.24.0" @@ -7597,7 +7641,6 @@ dependencies = [ "num-traits", "once_cell", "parquet", - "postgres-types", "prettyplease", "proc-macro2", "prost", @@ -7622,7 +7665,6 @@ dependencies = [ "time", "time-macros", "tokio", - "tokio-postgres", "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", diff --git a/Cargo.toml b/Cargo.toml index e3dc5b97f8..742201d0f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,9 @@ members = [ "libs/walproposer", "libs/wal_decoder", "libs/postgres_initdb", + "libs/proxy/postgres-protocol2", + "libs/proxy/postgres-types2", + "libs/proxy/tokio-postgres2", ] [workspace.package] diff --git a/libs/proxy/README.md b/libs/proxy/README.md new file mode 100644 index 0000000000..2ae6210e46 --- /dev/null +++ b/libs/proxy/README.md @@ -0,0 +1,6 @@ +This directory contains libraries that are specific for proxy. + +Currently, it contains a signficant fork/refactoring of rust-postgres that no longer reflects the API +of the original library. Since it was so significant, it made sense to upgrade it to it's own set of libraries. + +Proxy needs unique access to the protocol, which explains why such heavy modifications were necessary. diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml new file mode 100644 index 0000000000..284a632954 --- /dev/null +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "postgres-protocol2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +base64 = "0.20" +byteorder.workspace = true +bytes.workspace = true +fallible-iterator.workspace = true +hmac.workspace = true +md-5 = "0.10" +memchr = "2.0" +rand.workspace = true +sha2.workspace = true +stringprep = "0.1" +tokio = { workspace = true, features = ["rt"] } + +[dev-dependencies] +tokio = { workspace = true, features = ["full"] } diff --git a/libs/proxy/postgres-protocol2/src/authentication/mod.rs b/libs/proxy/postgres-protocol2/src/authentication/mod.rs new file mode 100644 index 0000000000..71afa4b9b6 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/authentication/mod.rs @@ -0,0 +1,37 @@ +//! Authentication protocol support. +use md5::{Digest, Md5}; + +pub mod sasl; + +/// Hashes authentication information in a way suitable for use in response +/// to an `AuthenticationMd5Password` message. +/// +/// The resulting string should be sent back to the database in a +/// `PasswordMessage` message. +#[inline] +pub fn md5_hash(username: &[u8], password: &[u8], salt: [u8; 4]) -> String { + let mut md5 = Md5::new(); + md5.update(password); + md5.update(username); + let output = md5.finalize_reset(); + md5.update(format!("{:x}", output)); + md5.update(salt); + format!("md5{:x}", md5.finalize()) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn md5() { + let username = b"md5_user"; + let password = b"password"; + let salt = [0x2a, 0x3d, 0x8f, 0xe0]; + + assert_eq!( + md5_hash(username, password, salt), + "md562af4dd09bbb41884907a838a3233294" + ); + } +} diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs new file mode 100644 index 0000000000..19aa3c1e9a --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -0,0 +1,516 @@ +//! SASL-based authentication support. + +use hmac::{Hmac, Mac}; +use rand::{self, Rng}; +use sha2::digest::FixedOutput; +use sha2::{Digest, Sha256}; +use std::fmt::Write; +use std::io; +use std::iter; +use std::mem; +use std::str; +use tokio::task::yield_now; + +const NONCE_LENGTH: usize = 24; + +/// The identifier of the SCRAM-SHA-256 SASL authentication mechanism. +pub const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; +/// The identifier of the SCRAM-SHA-256-PLUS SASL authentication mechanism. +pub const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; + +// since postgres passwords are not required to exclude saslprep-prohibited +// characters or even be valid UTF8, we run saslprep if possible and otherwise +// return the raw password. +fn normalize(pass: &[u8]) -> Vec { + let pass = match str::from_utf8(pass) { + Ok(pass) => pass, + Err(_) => return pass.to_vec(), + }; + + match stringprep::saslprep(pass) { + Ok(pass) => pass.into_owned().into_bytes(), + Err(_) => pass.as_bytes().to_vec(), + } +} + +pub(crate) async fn hi(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { + let mut hmac = + Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + hmac.update(salt); + hmac.update(&[0, 0, 0, 1]); + let mut prev = hmac.finalize().into_bytes(); + + let mut hi = prev; + + for i in 1..iterations { + let mut hmac = Hmac::::new_from_slice(str).expect("already checked above"); + hmac.update(&prev); + prev = hmac.finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(prev) { + *hi ^= prev; + } + // yield every ~250us + // hopefully reduces tail latencies + if i % 1024 == 0 { + yield_now().await + } + } + + hi.into() +} + +enum ChannelBindingInner { + Unrequested, + Unsupported, + TlsServerEndPoint(Vec), +} + +/// The channel binding configuration for a SCRAM authentication exchange. +pub struct ChannelBinding(ChannelBindingInner); + +impl ChannelBinding { + /// The server did not request channel binding. + pub fn unrequested() -> ChannelBinding { + ChannelBinding(ChannelBindingInner::Unrequested) + } + + /// The server requested channel binding but the client is unable to provide it. + pub fn unsupported() -> ChannelBinding { + ChannelBinding(ChannelBindingInner::Unsupported) + } + + /// The server requested channel binding and the client will use the `tls-server-end-point` + /// method. + pub fn tls_server_end_point(signature: Vec) -> ChannelBinding { + ChannelBinding(ChannelBindingInner::TlsServerEndPoint(signature)) + } + + fn gs2_header(&self) -> &'static str { + match self.0 { + ChannelBindingInner::Unrequested => "y,,", + ChannelBindingInner::Unsupported => "n,,", + ChannelBindingInner::TlsServerEndPoint(_) => "p=tls-server-end-point,,", + } + } + + fn cbind_data(&self) -> &[u8] { + match self.0 { + ChannelBindingInner::Unrequested | ChannelBindingInner::Unsupported => &[], + ChannelBindingInner::TlsServerEndPoint(ref buf) => buf, + } + } +} + +/// A pair of keys for the SCRAM-SHA-256 mechanism. +/// See for details. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ScramKeys { + /// Used by server to authenticate client. + pub client_key: [u8; N], + /// Used by client to verify server's signature. + pub server_key: [u8; N], +} + +/// Password or keys which were derived from it. +enum Credentials { + /// A regular password as a vector of bytes. + Password(Vec), + /// A precomputed pair of keys. + Keys(Box>), +} + +enum State { + Update { + nonce: String, + password: Credentials<32>, + channel_binding: ChannelBinding, + }, + Finish { + server_key: [u8; 32], + auth_message: String, + }, + Done, +} + +/// A type which handles the client side of the SCRAM-SHA-256/SCRAM-SHA-256-PLUS authentication +/// process. +/// +/// During the authentication process, if the backend sends an `AuthenticationSASL` message which +/// includes `SCRAM-SHA-256` as an authentication mechanism, this type can be used. +/// +/// After a `ScramSha256` is constructed, the buffer returned by the `message()` method should be +/// sent to the backend in a `SASLInitialResponse` message along with the mechanism name. +/// +/// The server will reply with an `AuthenticationSASLContinue` message. Its contents should be +/// passed to the `update()` method, after which the buffer returned by the `message()` method +/// should be sent to the backend in a `SASLResponse` message. +/// +/// The server will reply with an `AuthenticationSASLFinal` message. Its contents should be passed +/// to the `finish()` method, after which the authentication process is complete. +pub struct ScramSha256 { + message: String, + state: State, +} + +fn nonce() -> String { + // rand 0.5's ThreadRng is cryptographically secure + let mut rng = rand::thread_rng(); + (0..NONCE_LENGTH) + .map(|_| { + let mut v = rng.gen_range(0x21u8..0x7e); + if v == 0x2c { + v = 0x7e + } + v as char + }) + .collect() +} + +impl ScramSha256 { + /// Constructs a new instance which will use the provided password for authentication. + pub fn new(password: &[u8], channel_binding: ChannelBinding) -> ScramSha256 { + let password = Credentials::Password(normalize(password)); + ScramSha256::new_inner(password, channel_binding, nonce()) + } + + /// Constructs a new instance which will use the provided key pair for authentication. + pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 { + let password = Credentials::Keys(keys.into()); + ScramSha256::new_inner(password, channel_binding, nonce()) + } + + fn new_inner( + password: Credentials<32>, + channel_binding: ChannelBinding, + nonce: String, + ) -> ScramSha256 { + ScramSha256 { + message: format!("{}n=,r={}", channel_binding.gs2_header(), nonce), + state: State::Update { + nonce, + password, + channel_binding, + }, + } + } + + /// Returns the message which should be sent to the backend in an `SASLResponse` message. + pub fn message(&self) -> &[u8] { + if let State::Done = self.state { + panic!("invalid SCRAM state"); + } + self.message.as_bytes() + } + + /// Updates the state machine with the response from the backend. + /// + /// This should be called when an `AuthenticationSASLContinue` message is received. + pub async fn update(&mut self, message: &[u8]) -> io::Result<()> { + let (client_nonce, password, channel_binding) = + match mem::replace(&mut self.state, State::Done) { + State::Update { + nonce, + password, + channel_binding, + } => (nonce, password, channel_binding), + _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + }; + + let message = + str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + + let parsed = Parser::new(message).server_first_message()?; + + if !parsed.nonce.starts_with(&client_nonce) { + return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid nonce")); + } + + let (client_key, server_key) = match password { + Credentials::Password(password) => { + let salt = match base64::decode(parsed.salt) { + Ok(salt) => salt, + Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), + }; + + let salted_password = hi(&password, &salt, parsed.iteration_count).await; + + let make_key = |name| { + let mut hmac = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes"); + hmac.update(name); + + let mut key = [0u8; 32]; + key.copy_from_slice(hmac.finalize().into_bytes().as_slice()); + key + }; + + (make_key(b"Client Key"), make_key(b"Server Key")) + } + Credentials::Keys(keys) => (keys.client_key, keys.server_key), + }; + + let mut hash = Sha256::default(); + hash.update(client_key); + let stored_key = hash.finalize_fixed(); + + let mut cbind_input = vec![]; + cbind_input.extend(channel_binding.gs2_header().as_bytes()); + cbind_input.extend(channel_binding.cbind_data()); + let cbind_input = base64::encode(&cbind_input); + + self.message.clear(); + write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap(); + + let auth_message = format!("n=,r={},{},{}", client_nonce, message, self.message); + + let mut hmac = Hmac::::new_from_slice(&stored_key) + .expect("HMAC is able to accept all key sizes"); + hmac.update(auth_message.as_bytes()); + let client_signature = hmac.finalize().into_bytes(); + + let mut client_proof = client_key; + for (proof, signature) in client_proof.iter_mut().zip(client_signature) { + *proof ^= signature; + } + + write!(&mut self.message, ",p={}", base64::encode(client_proof)).unwrap(); + + self.state = State::Finish { + server_key, + auth_message, + }; + Ok(()) + } + + /// Finalizes the authentication process. + /// + /// This should be called when the backend sends an `AuthenticationSASLFinal` message. + /// Authentication has only succeeded if this method returns `Ok(())`. + pub fn finish(&mut self, message: &[u8]) -> io::Result<()> { + let (server_key, auth_message) = match mem::replace(&mut self.state, State::Done) { + State::Finish { + server_key, + auth_message, + } => (server_key, auth_message), + _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + }; + + let message = + str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + + let parsed = Parser::new(message).server_final_message()?; + + let verifier = match parsed { + ServerFinalMessage::Error(e) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("SCRAM error: {}", e), + )); + } + ServerFinalMessage::Verifier(verifier) => verifier, + }; + + let verifier = match base64::decode(verifier) { + Ok(verifier) => verifier, + Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), + }; + + let mut hmac = Hmac::::new_from_slice(&server_key) + .expect("HMAC is able to accept all key sizes"); + hmac.update(auth_message.as_bytes()); + hmac.verify_slice(&verifier) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "SCRAM verification error")) + } +} + +struct Parser<'a> { + s: &'a str, + it: iter::Peekable>, +} + +impl<'a> Parser<'a> { + fn new(s: &'a str) -> Parser<'a> { + Parser { + s, + it: s.char_indices().peekable(), + } + } + + fn eat(&mut self, target: char) -> io::Result<()> { + match self.it.next() { + Some((_, c)) if c == target => Ok(()), + Some((i, c)) => { + let m = format!( + "unexpected character at byte {}: expected `{}` but got `{}", + i, target, c + ); + Err(io::Error::new(io::ErrorKind::InvalidInput, m)) + } + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } + } + + fn take_while(&mut self, f: F) -> io::Result<&'a str> + where + F: Fn(char) -> bool, + { + let start = match self.it.peek() { + Some(&(i, _)) => i, + None => return Ok(""), + }; + + loop { + match self.it.peek() { + Some(&(_, c)) if f(c) => { + self.it.next(); + } + Some(&(i, _)) => return Ok(&self.s[start..i]), + None => return Ok(&self.s[start..]), + } + } + } + + fn printable(&mut self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, '\x21'..='\x2b' | '\x2d'..='\x7e')) + } + + fn nonce(&mut self) -> io::Result<&'a str> { + self.eat('r')?; + self.eat('=')?; + self.printable() + } + + fn base64(&mut self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '+' | '=')) + } + + fn salt(&mut self) -> io::Result<&'a str> { + self.eat('s')?; + self.eat('=')?; + self.base64() + } + + fn posit_number(&mut self) -> io::Result { + let n = self.take_while(|c| c.is_ascii_digit())?; + n.parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) + } + + fn iteration_count(&mut self) -> io::Result { + self.eat('i')?; + self.eat('=')?; + self.posit_number() + } + + fn eof(&mut self) -> io::Result<()> { + match self.it.peek() { + Some(&(i, _)) => Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unexpected trailing data at byte {}", i), + )), + None => Ok(()), + } + } + + fn server_first_message(&mut self) -> io::Result> { + let nonce = self.nonce()?; + self.eat(',')?; + let salt = self.salt()?; + self.eat(',')?; + let iteration_count = self.iteration_count()?; + self.eof()?; + + Ok(ServerFirstMessage { + nonce, + salt, + iteration_count, + }) + } + + fn value(&mut self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, '\0' | '=' | ',')) + } + + fn server_error(&mut self) -> io::Result> { + match self.it.peek() { + Some(&(_, 'e')) => {} + _ => return Ok(None), + } + + self.eat('e')?; + self.eat('=')?; + self.value().map(Some) + } + + fn verifier(&mut self) -> io::Result<&'a str> { + self.eat('v')?; + self.eat('=')?; + self.base64() + } + + fn server_final_message(&mut self) -> io::Result> { + let message = match self.server_error()? { + Some(error) => ServerFinalMessage::Error(error), + None => ServerFinalMessage::Verifier(self.verifier()?), + }; + self.eof()?; + Ok(message) + } +} + +struct ServerFirstMessage<'a> { + nonce: &'a str, + salt: &'a str, + iteration_count: u32, +} + +enum ServerFinalMessage<'a> { + Error(&'a str), + Verifier(&'a str), +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn parse_server_first_message() { + let message = "r=fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j,s=QSXCR+Q6sek8bf92,i=4096"; + let message = Parser::new(message).server_first_message().unwrap(); + assert_eq!(message.nonce, "fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j"); + assert_eq!(message.salt, "QSXCR+Q6sek8bf92"); + assert_eq!(message.iteration_count, 4096); + } + + // recorded auth exchange from psql + #[tokio::test] + async fn exchange() { + let password = "foobar"; + let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB"; + + let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB"; + let server_first = + "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ + =4096"; + let client_final = + "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ + 1NTlQYNs5BTeQjdHdk7lOflDo5re2an8="; + let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw="; + + let mut scram = ScramSha256::new_inner( + Credentials::Password(normalize(password.as_bytes())), + ChannelBinding::unsupported(), + nonce.to_string(), + ); + assert_eq!(str::from_utf8(scram.message()).unwrap(), client_first); + + scram.update(server_first.as_bytes()).await.unwrap(); + assert_eq!(str::from_utf8(scram.message()).unwrap(), client_final); + + scram.finish(server_final.as_bytes()).unwrap(); + } +} diff --git a/libs/proxy/postgres-protocol2/src/escape/mod.rs b/libs/proxy/postgres-protocol2/src/escape/mod.rs new file mode 100644 index 0000000000..0ba7efdcac --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/escape/mod.rs @@ -0,0 +1,93 @@ +//! Provides functions for escaping literals and identifiers for use +//! in SQL queries. +//! +//! Prefer parameterized queries where possible. Do not escape +//! parameters in a parameterized query. + +#[cfg(test)] +mod test; + +/// Escape a literal and surround result with single quotes. Not +/// recommended in most cases. +/// +/// If input contains backslashes, result will be of the form ` +/// E'...'` so it is safe to use regardless of the setting of +/// standard_conforming_strings. +pub fn escape_literal(input: &str) -> String { + escape_internal(input, false) +} + +/// Escape an identifier and surround result with double quotes. +pub fn escape_identifier(input: &str) -> String { + escape_internal(input, true) +} + +// Translation of PostgreSQL libpq's PQescapeInternal(). Does not +// require a connection because input string is known to be valid +// UTF-8. +// +// Escape arbitrary strings. If as_ident is true, we escape the +// result as an identifier; if false, as a literal. The result is +// returned in a newly allocated buffer. If we fail due to an +// encoding violation or out of memory condition, we return NULL, +// storing an error message into conn. +fn escape_internal(input: &str, as_ident: bool) -> String { + let mut num_backslashes = 0; + let mut num_quotes = 0; + let quote_char = if as_ident { '"' } else { '\'' }; + + // Scan the string for characters that must be escaped. + for ch in input.chars() { + if ch == quote_char { + num_quotes += 1; + } else if ch == '\\' { + num_backslashes += 1; + } + } + + // Allocate output String. + let mut result_size = input.len() + num_quotes + 3; // two quotes, plus a NUL + if !as_ident && num_backslashes > 0 { + result_size += num_backslashes + 2; + } + + let mut output = String::with_capacity(result_size); + + // If we are escaping a literal that contains backslashes, we use + // the escape string syntax so that the result is correct under + // either value of standard_conforming_strings. We also emit a + // leading space in this case, to guard against the possibility + // that the result might be interpolated immediately following an + // identifier. + if !as_ident && num_backslashes > 0 { + output.push(' '); + output.push('E'); + } + + // Opening quote. + output.push(quote_char); + + // Use fast path if possible. + // + // We've already verified that the input string is well-formed in + // the current encoding. If it contains no quotes and, in the + // case of literal-escaping, no backslashes, then we can just copy + // it directly to the output buffer, adding the necessary quotes. + // + // If not, we must rescan the input and process each character + // individually. + if num_quotes == 0 && (num_backslashes == 0 || as_ident) { + output.push_str(input); + } else { + for ch in input.chars() { + if ch == quote_char || (!as_ident && ch == '\\') { + output.push(ch); + } + output.push(ch); + } + } + + output.push(quote_char); + + output +} diff --git a/libs/proxy/postgres-protocol2/src/escape/test.rs b/libs/proxy/postgres-protocol2/src/escape/test.rs new file mode 100644 index 0000000000..4816a103b7 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/escape/test.rs @@ -0,0 +1,17 @@ +use crate::escape::{escape_identifier, escape_literal}; + +#[test] +fn test_escape_idenifier() { + assert_eq!(escape_identifier("foo"), String::from("\"foo\"")); + assert_eq!(escape_identifier("f\\oo"), String::from("\"f\\oo\"")); + assert_eq!(escape_identifier("f'oo"), String::from("\"f'oo\"")); + assert_eq!(escape_identifier("f\"oo"), String::from("\"f\"\"oo\"")); +} + +#[test] +fn test_escape_literal() { + assert_eq!(escape_literal("foo"), String::from("'foo'")); + assert_eq!(escape_literal("f\\oo"), String::from(" E'f\\\\oo'")); + assert_eq!(escape_literal("f'oo"), String::from("'f''oo'")); + assert_eq!(escape_literal("f\"oo"), String::from("'f\"oo'")); +} diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs new file mode 100644 index 0000000000..947f2f835d --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -0,0 +1,78 @@ +//! Low level Postgres protocol APIs. +//! +//! This crate implements the low level components of Postgres's communication +//! protocol, including message and value serialization and deserialization. +//! It is designed to be used as a building block by higher level APIs such as +//! `rust-postgres`, and should not typically be used directly. +//! +//! # Note +//! +//! This library assumes that the `client_encoding` backend parameter has been +//! set to `UTF8`. It will most likely not behave properly if that is not the case. +#![doc(html_root_url = "https://docs.rs/postgres-protocol/0.6")] +#![warn(missing_docs, rust_2018_idioms, clippy::all)] + +use byteorder::{BigEndian, ByteOrder}; +use bytes::{BufMut, BytesMut}; +use std::io; + +pub mod authentication; +pub mod escape; +pub mod message; +pub mod password; +pub mod types; + +/// A Postgres OID. +pub type Oid = u32; + +/// A Postgres Log Sequence Number (LSN). +pub type Lsn = u64; + +/// An enum indicating if a value is `NULL` or not. +pub enum IsNull { + /// The value is `NULL`. + Yes, + /// The value is not `NULL`. + No, +} + +fn write_nullable(serializer: F, buf: &mut BytesMut) -> Result<(), E> +where + F: FnOnce(&mut BytesMut) -> Result, + E: From, +{ + let base = buf.len(); + buf.put_i32(0); + let size = match serializer(buf)? { + IsNull::No => i32::from_usize(buf.len() - base - 4)?, + IsNull::Yes => -1, + }; + BigEndian::write_i32(&mut buf[base..], size); + + Ok(()) +} + +trait FromUsize: Sized { + fn from_usize(x: usize) -> Result; +} + +macro_rules! from_usize { + ($t:ty) => { + impl FromUsize for $t { + #[inline] + fn from_usize(x: usize) -> io::Result<$t> { + if x > <$t>::MAX as usize { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + "value too large to transmit", + )) + } else { + Ok(x as $t) + } + } + } + }; +} + +from_usize!(i16); +from_usize!(i32); diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs new file mode 100644 index 0000000000..356d142f3f --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -0,0 +1,766 @@ +#![allow(missing_docs)] + +use byteorder::{BigEndian, ByteOrder, ReadBytesExt}; +use bytes::{Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use memchr::memchr; +use std::cmp; +use std::io::{self, Read}; +use std::ops::Range; +use std::str; + +use crate::Oid; + +// top-level message tags +const PARSE_COMPLETE_TAG: u8 = b'1'; +const BIND_COMPLETE_TAG: u8 = b'2'; +const CLOSE_COMPLETE_TAG: u8 = b'3'; +pub const NOTIFICATION_RESPONSE_TAG: u8 = b'A'; +const COPY_DONE_TAG: u8 = b'c'; +const COMMAND_COMPLETE_TAG: u8 = b'C'; +const COPY_DATA_TAG: u8 = b'd'; +const DATA_ROW_TAG: u8 = b'D'; +const ERROR_RESPONSE_TAG: u8 = b'E'; +const COPY_IN_RESPONSE_TAG: u8 = b'G'; +const COPY_OUT_RESPONSE_TAG: u8 = b'H'; +const COPY_BOTH_RESPONSE_TAG: u8 = b'W'; +const EMPTY_QUERY_RESPONSE_TAG: u8 = b'I'; +const BACKEND_KEY_DATA_TAG: u8 = b'K'; +pub const NO_DATA_TAG: u8 = b'n'; +pub const NOTICE_RESPONSE_TAG: u8 = b'N'; +const AUTHENTICATION_TAG: u8 = b'R'; +const PORTAL_SUSPENDED_TAG: u8 = b's'; +pub const PARAMETER_STATUS_TAG: u8 = b'S'; +const PARAMETER_DESCRIPTION_TAG: u8 = b't'; +const ROW_DESCRIPTION_TAG: u8 = b'T'; +pub const READY_FOR_QUERY_TAG: u8 = b'Z'; + +#[derive(Debug, Copy, Clone)] +pub struct Header { + tag: u8, + len: i32, +} + +#[allow(clippy::len_without_is_empty)] +impl Header { + #[inline] + pub fn parse(buf: &[u8]) -> io::Result> { + if buf.len() < 5 { + return Ok(None); + } + + let tag = buf[0]; + let len = BigEndian::read_i32(&buf[1..]); + + if len < 4 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid message length: header length < 4", + )); + } + + Ok(Some(Header { tag, len })) + } + + #[inline] + pub fn tag(self) -> u8 { + self.tag + } + + #[inline] + pub fn len(self) -> i32 { + self.len + } +} + +/// An enum representing Postgres backend messages. +#[non_exhaustive] +pub enum Message { + AuthenticationCleartextPassword, + AuthenticationGss, + AuthenticationKerberosV5, + AuthenticationMd5Password(AuthenticationMd5PasswordBody), + AuthenticationOk, + AuthenticationScmCredential, + AuthenticationSspi, + AuthenticationGssContinue, + AuthenticationSasl(AuthenticationSaslBody), + AuthenticationSaslContinue(AuthenticationSaslContinueBody), + AuthenticationSaslFinal(AuthenticationSaslFinalBody), + BackendKeyData(BackendKeyDataBody), + BindComplete, + CloseComplete, + CommandComplete(CommandCompleteBody), + CopyData, + CopyDone, + CopyInResponse, + CopyOutResponse, + CopyBothResponse, + DataRow(DataRowBody), + EmptyQueryResponse, + ErrorResponse(ErrorResponseBody), + NoData, + NoticeResponse(NoticeResponseBody), + NotificationResponse(NotificationResponseBody), + ParameterDescription(ParameterDescriptionBody), + ParameterStatus(ParameterStatusBody), + ParseComplete, + PortalSuspended, + ReadyForQuery(ReadyForQueryBody), + RowDescription(RowDescriptionBody), +} + +impl Message { + #[inline] + pub fn parse(buf: &mut BytesMut) -> io::Result> { + if buf.len() < 5 { + let to_read = 5 - buf.len(); + buf.reserve(to_read); + return Ok(None); + } + + let tag = buf[0]; + let len = (&buf[1..5]).read_u32::().unwrap(); + + if len < 4 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: parsing u32", + )); + } + + let total_len = len as usize + 1; + if buf.len() < total_len { + let to_read = total_len - buf.len(); + buf.reserve(to_read); + return Ok(None); + } + + let mut buf = Buffer { + bytes: buf.split_to(total_len).freeze(), + idx: 5, + }; + + let message = match tag { + PARSE_COMPLETE_TAG => Message::ParseComplete, + BIND_COMPLETE_TAG => Message::BindComplete, + CLOSE_COMPLETE_TAG => Message::CloseComplete, + NOTIFICATION_RESPONSE_TAG => { + let process_id = buf.read_i32::()?; + let channel = buf.read_cstr()?; + let message = buf.read_cstr()?; + Message::NotificationResponse(NotificationResponseBody { + process_id, + channel, + message, + }) + } + COPY_DONE_TAG => Message::CopyDone, + COMMAND_COMPLETE_TAG => { + let tag = buf.read_cstr()?; + Message::CommandComplete(CommandCompleteBody { tag }) + } + COPY_DATA_TAG => Message::CopyData, + DATA_ROW_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::DataRow(DataRowBody { storage, len }) + } + ERROR_RESPONSE_TAG => { + let storage = buf.read_all(); + Message::ErrorResponse(ErrorResponseBody { storage }) + } + COPY_IN_RESPONSE_TAG => Message::CopyInResponse, + COPY_OUT_RESPONSE_TAG => Message::CopyOutResponse, + COPY_BOTH_RESPONSE_TAG => Message::CopyBothResponse, + EMPTY_QUERY_RESPONSE_TAG => Message::EmptyQueryResponse, + BACKEND_KEY_DATA_TAG => { + let process_id = buf.read_i32::()?; + let secret_key = buf.read_i32::()?; + Message::BackendKeyData(BackendKeyDataBody { + process_id, + secret_key, + }) + } + NO_DATA_TAG => Message::NoData, + NOTICE_RESPONSE_TAG => { + let storage = buf.read_all(); + Message::NoticeResponse(NoticeResponseBody { storage }) + } + AUTHENTICATION_TAG => match buf.read_i32::()? { + 0 => Message::AuthenticationOk, + 2 => Message::AuthenticationKerberosV5, + 3 => Message::AuthenticationCleartextPassword, + 5 => { + let mut salt = [0; 4]; + buf.read_exact(&mut salt)?; + Message::AuthenticationMd5Password(AuthenticationMd5PasswordBody { salt }) + } + 6 => Message::AuthenticationScmCredential, + 7 => Message::AuthenticationGss, + 8 => Message::AuthenticationGssContinue, + 9 => Message::AuthenticationSspi, + 10 => { + let storage = buf.read_all(); + Message::AuthenticationSasl(AuthenticationSaslBody(storage)) + } + 11 => { + let storage = buf.read_all(); + Message::AuthenticationSaslContinue(AuthenticationSaslContinueBody(storage)) + } + 12 => { + let storage = buf.read_all(); + Message::AuthenticationSaslFinal(AuthenticationSaslFinalBody(storage)) + } + tag => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unknown authentication tag `{}`", tag), + )); + } + }, + PORTAL_SUSPENDED_TAG => Message::PortalSuspended, + PARAMETER_STATUS_TAG => { + let name = buf.read_cstr()?; + let value = buf.read_cstr()?; + Message::ParameterStatus(ParameterStatusBody { name, value }) + } + PARAMETER_DESCRIPTION_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::ParameterDescription(ParameterDescriptionBody { storage, len }) + } + ROW_DESCRIPTION_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::RowDescription(RowDescriptionBody { storage, len }) + } + READY_FOR_QUERY_TAG => { + let status = buf.read_u8()?; + Message::ReadyForQuery(ReadyForQueryBody { status }) + } + tag => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unknown message tag `{}`", tag), + )); + } + }; + + if !buf.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: expected buffer to be empty", + )); + } + + Ok(Some(message)) + } +} + +struct Buffer { + bytes: Bytes, + idx: usize, +} + +impl Buffer { + #[inline] + fn slice(&self) -> &[u8] { + &self.bytes[self.idx..] + } + + #[inline] + fn is_empty(&self) -> bool { + self.slice().is_empty() + } + + #[inline] + fn read_cstr(&mut self) -> io::Result { + match memchr(0, self.slice()) { + Some(pos) => { + let start = self.idx; + let end = start + pos; + let cstr = self.bytes.slice(start..end); + self.idx = end + 1; + Ok(cstr) + } + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } + } + + #[inline] + fn read_all(&mut self) -> Bytes { + let buf = self.bytes.slice(self.idx..); + self.idx = self.bytes.len(); + buf + } +} + +impl Read for Buffer { + #[inline] + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let len = { + let slice = self.slice(); + let len = cmp::min(slice.len(), buf.len()); + buf[..len].copy_from_slice(&slice[..len]); + len + }; + self.idx += len; + Ok(len) + } +} + +pub struct AuthenticationMd5PasswordBody { + salt: [u8; 4], +} + +impl AuthenticationMd5PasswordBody { + #[inline] + pub fn salt(&self) -> [u8; 4] { + self.salt + } +} + +pub struct AuthenticationSaslBody(Bytes); + +impl AuthenticationSaslBody { + #[inline] + pub fn mechanisms(&self) -> SaslMechanisms<'_> { + SaslMechanisms(&self.0) + } +} + +pub struct SaslMechanisms<'a>(&'a [u8]); + +impl<'a> FallibleIterator for SaslMechanisms<'a> { + type Item = &'a str; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result> { + let value_end = find_null(self.0, 0)?; + if value_end == 0 { + if self.0.len() != 1 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid message length: expected to be at end of iterator for sasl", + )); + } + Ok(None) + } else { + let value = get_str(&self.0[..value_end])?; + self.0 = &self.0[value_end + 1..]; + Ok(Some(value)) + } + } +} + +pub struct AuthenticationSaslContinueBody(Bytes); + +impl AuthenticationSaslContinueBody { + #[inline] + pub fn data(&self) -> &[u8] { + &self.0 + } +} + +pub struct AuthenticationSaslFinalBody(Bytes); + +impl AuthenticationSaslFinalBody { + #[inline] + pub fn data(&self) -> &[u8] { + &self.0 + } +} + +pub struct BackendKeyDataBody { + process_id: i32, + secret_key: i32, +} + +impl BackendKeyDataBody { + #[inline] + pub fn process_id(&self) -> i32 { + self.process_id + } + + #[inline] + pub fn secret_key(&self) -> i32 { + self.secret_key + } +} + +pub struct CommandCompleteBody { + tag: Bytes, +} + +impl CommandCompleteBody { + #[inline] + pub fn tag(&self) -> io::Result<&str> { + get_str(&self.tag) + } +} + +#[derive(Debug)] +pub struct DataRowBody { + storage: Bytes, + len: u16, +} + +impl DataRowBody { + #[inline] + pub fn ranges(&self) -> DataRowRanges<'_> { + DataRowRanges { + buf: &self.storage, + len: self.storage.len(), + remaining: self.len, + } + } + + #[inline] + pub fn buffer(&self) -> &[u8] { + &self.storage + } +} + +pub struct DataRowRanges<'a> { + buf: &'a [u8], + len: usize, + remaining: u16, +} + +impl FallibleIterator for DataRowRanges<'_> { + type Item = Option>; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result>>> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: datarowrange is not empty", + )); + } + } + + self.remaining -= 1; + let len = self.buf.read_i32::()?; + if len < 0 { + Ok(Some(None)) + } else { + let len = len as usize; + if self.buf.len() < len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )); + } + let base = self.len - self.buf.len(); + self.buf = &self.buf[len..]; + Ok(Some(Some(base..base + len))) + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +pub struct ErrorResponseBody { + storage: Bytes, +} + +impl ErrorResponseBody { + #[inline] + pub fn fields(&self) -> ErrorFields<'_> { + ErrorFields { buf: &self.storage } + } +} + +pub struct ErrorFields<'a> { + buf: &'a [u8], +} + +impl<'a> FallibleIterator for ErrorFields<'a> { + type Item = ErrorField<'a>; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result>> { + let type_ = self.buf.read_u8()?; + if type_ == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: error fields is not drained", + )); + } + } + + let value_end = find_null(self.buf, 0)?; + let value = get_str(&self.buf[..value_end])?; + self.buf = &self.buf[value_end + 1..]; + + Ok(Some(ErrorField { type_, value })) + } +} + +pub struct ErrorField<'a> { + type_: u8, + value: &'a str, +} + +impl ErrorField<'_> { + #[inline] + pub fn type_(&self) -> u8 { + self.type_ + } + + #[inline] + pub fn value(&self) -> &str { + self.value + } +} + +pub struct NoticeResponseBody { + storage: Bytes, +} + +impl NoticeResponseBody { + #[inline] + pub fn fields(&self) -> ErrorFields<'_> { + ErrorFields { buf: &self.storage } + } +} + +pub struct NotificationResponseBody { + process_id: i32, + channel: Bytes, + message: Bytes, +} + +impl NotificationResponseBody { + #[inline] + pub fn process_id(&self) -> i32 { + self.process_id + } + + #[inline] + pub fn channel(&self) -> io::Result<&str> { + get_str(&self.channel) + } + + #[inline] + pub fn message(&self) -> io::Result<&str> { + get_str(&self.message) + } +} + +pub struct ParameterDescriptionBody { + storage: Bytes, + len: u16, +} + +impl ParameterDescriptionBody { + #[inline] + pub fn parameters(&self) -> Parameters<'_> { + Parameters { + buf: &self.storage, + remaining: self.len, + } + } +} + +pub struct Parameters<'a> { + buf: &'a [u8], + remaining: u16, +} + +impl FallibleIterator for Parameters<'_> { + type Item = Oid; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: parameters is not drained", + )); + } + } + + self.remaining -= 1; + self.buf.read_u32::().map(Some) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +pub struct ParameterStatusBody { + name: Bytes, + value: Bytes, +} + +impl ParameterStatusBody { + #[inline] + pub fn name(&self) -> io::Result<&str> { + get_str(&self.name) + } + + #[inline] + pub fn value(&self) -> io::Result<&str> { + get_str(&self.value) + } +} + +pub struct ReadyForQueryBody { + status: u8, +} + +impl ReadyForQueryBody { + #[inline] + pub fn status(&self) -> u8 { + self.status + } +} + +pub struct RowDescriptionBody { + storage: Bytes, + len: u16, +} + +impl RowDescriptionBody { + #[inline] + pub fn fields(&self) -> Fields<'_> { + Fields { + buf: &self.storage, + remaining: self.len, + } + } +} + +pub struct Fields<'a> { + buf: &'a [u8], + remaining: u16, +} + +impl<'a> FallibleIterator for Fields<'a> { + type Item = Field<'a>; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result>> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: field is not drained", + )); + } + } + + self.remaining -= 1; + let name_end = find_null(self.buf, 0)?; + let name = get_str(&self.buf[..name_end])?; + self.buf = &self.buf[name_end + 1..]; + let table_oid = self.buf.read_u32::()?; + let column_id = self.buf.read_i16::()?; + let type_oid = self.buf.read_u32::()?; + let type_size = self.buf.read_i16::()?; + let type_modifier = self.buf.read_i32::()?; + let format = self.buf.read_i16::()?; + + Ok(Some(Field { + name, + table_oid, + column_id, + type_oid, + type_size, + type_modifier, + format, + })) + } +} + +pub struct Field<'a> { + name: &'a str, + table_oid: Oid, + column_id: i16, + type_oid: Oid, + type_size: i16, + type_modifier: i32, + format: i16, +} + +impl<'a> Field<'a> { + #[inline] + pub fn name(&self) -> &'a str { + self.name + } + + #[inline] + pub fn table_oid(&self) -> Oid { + self.table_oid + } + + #[inline] + pub fn column_id(&self) -> i16 { + self.column_id + } + + #[inline] + pub fn type_oid(&self) -> Oid { + self.type_oid + } + + #[inline] + pub fn type_size(&self) -> i16 { + self.type_size + } + + #[inline] + pub fn type_modifier(&self) -> i32 { + self.type_modifier + } + + #[inline] + pub fn format(&self) -> i16 { + self.format + } +} + +#[inline] +fn find_null(buf: &[u8], start: usize) -> io::Result { + match memchr(0, &buf[start..]) { + Some(pos) => Ok(pos + start), + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } +} + +#[inline] +fn get_str(buf: &[u8]) -> io::Result<&str> { + str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) +} diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs new file mode 100644 index 0000000000..5d0a8ff8c8 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -0,0 +1,297 @@ +//! Frontend message serialization. +#![allow(missing_docs)] + +use byteorder::{BigEndian, ByteOrder}; +use bytes::{Buf, BufMut, BytesMut}; +use std::convert::TryFrom; +use std::error::Error; +use std::io; +use std::marker; + +use crate::{write_nullable, FromUsize, IsNull, Oid}; + +#[inline] +fn write_body(buf: &mut BytesMut, f: F) -> Result<(), E> +where + F: FnOnce(&mut BytesMut) -> Result<(), E>, + E: From, +{ + let base = buf.len(); + buf.extend_from_slice(&[0; 4]); + + f(buf)?; + + let size = i32::from_usize(buf.len() - base)?; + BigEndian::write_i32(&mut buf[base..], size); + Ok(()) +} + +pub enum BindError { + Conversion(Box), + Serialization(io::Error), +} + +impl From> for BindError { + #[inline] + fn from(e: Box) -> BindError { + BindError::Conversion(e) + } +} + +impl From for BindError { + #[inline] + fn from(e: io::Error) -> BindError { + BindError::Serialization(e) + } +} + +#[inline] +pub fn bind( + portal: &str, + statement: &str, + formats: I, + values: J, + mut serializer: F, + result_formats: K, + buf: &mut BytesMut, +) -> Result<(), BindError> +where + I: IntoIterator, + J: IntoIterator, + F: FnMut(T, &mut BytesMut) -> Result>, + K: IntoIterator, +{ + buf.put_u8(b'B'); + + write_body(buf, |buf| { + write_cstr(portal.as_bytes(), buf)?; + write_cstr(statement.as_bytes(), buf)?; + write_counted( + formats, + |f, buf| { + buf.put_i16(f); + Ok::<_, io::Error>(()) + }, + buf, + )?; + write_counted( + values, + |v, buf| write_nullable(|buf| serializer(v, buf), buf), + buf, + )?; + write_counted( + result_formats, + |f, buf| { + buf.put_i16(f); + Ok::<_, io::Error>(()) + }, + buf, + )?; + + Ok(()) + }) +} + +#[inline] +fn write_counted(items: I, mut serializer: F, buf: &mut BytesMut) -> Result<(), E> +where + I: IntoIterator, + F: FnMut(T, &mut BytesMut) -> Result<(), E>, + E: From, +{ + let base = buf.len(); + buf.extend_from_slice(&[0; 2]); + let mut count = 0; + for item in items { + serializer(item, buf)?; + count += 1; + } + let count = i16::from_usize(count)?; + BigEndian::write_i16(&mut buf[base..], count); + + Ok(()) +} + +#[inline] +pub fn cancel_request(process_id: i32, secret_key: i32, buf: &mut BytesMut) { + write_body(buf, |buf| { + buf.put_i32(80_877_102); + buf.put_i32(process_id); + buf.put_i32(secret_key); + Ok::<_, io::Error>(()) + }) + .unwrap(); +} + +#[inline] +pub fn close(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'C'); + write_body(buf, |buf| { + buf.put_u8(variant); + write_cstr(name.as_bytes(), buf) + }) +} + +pub struct CopyData { + buf: T, + len: i32, +} + +impl CopyData +where + T: Buf, +{ + pub fn new(buf: T) -> io::Result> { + let len = buf + .remaining() + .checked_add(4) + .and_then(|l| i32::try_from(l).ok()) + .ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidInput, "message length overflow") + })?; + + Ok(CopyData { buf, len }) + } + + pub fn write(self, out: &mut BytesMut) { + out.put_u8(b'd'); + out.put_i32(self.len); + out.put(self.buf); + } +} + +#[inline] +pub fn copy_done(buf: &mut BytesMut) { + buf.put_u8(b'c'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +#[inline] +pub fn copy_fail(message: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'f'); + write_body(buf, |buf| write_cstr(message.as_bytes(), buf)) +} + +#[inline] +pub fn describe(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'D'); + write_body(buf, |buf| { + buf.put_u8(variant); + write_cstr(name.as_bytes(), buf) + }) +} + +#[inline] +pub fn execute(portal: &str, max_rows: i32, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'E'); + write_body(buf, |buf| { + write_cstr(portal.as_bytes(), buf)?; + buf.put_i32(max_rows); + Ok(()) + }) +} + +#[inline] +pub fn parse(name: &str, query: &str, param_types: I, buf: &mut BytesMut) -> io::Result<()> +where + I: IntoIterator, +{ + buf.put_u8(b'P'); + write_body(buf, |buf| { + write_cstr(name.as_bytes(), buf)?; + write_cstr(query.as_bytes(), buf)?; + write_counted( + param_types, + |t, buf| { + buf.put_u32(t); + Ok::<_, io::Error>(()) + }, + buf, + )?; + Ok(()) + }) +} + +#[inline] +pub fn password_message(password: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| write_cstr(password, buf)) +} + +#[inline] +pub fn query(query: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'Q'); + write_body(buf, |buf| write_cstr(query.as_bytes(), buf)) +} + +#[inline] +pub fn sasl_initial_response(mechanism: &str, data: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| { + write_cstr(mechanism.as_bytes(), buf)?; + let len = i32::from_usize(data.len())?; + buf.put_i32(len); + buf.put_slice(data); + Ok(()) + }) +} + +#[inline] +pub fn sasl_response(data: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| { + buf.put_slice(data); + Ok(()) + }) +} + +#[inline] +pub fn ssl_request(buf: &mut BytesMut) { + write_body(buf, |buf| { + buf.put_i32(80_877_103); + Ok::<_, io::Error>(()) + }) + .unwrap(); +} + +#[inline] +pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()> +where + I: IntoIterator, +{ + write_body(buf, |buf| { + // postgres protocol version 3.0(196608) in bigger-endian + buf.put_i32(0x00_03_00_00); + for (key, value) in parameters { + write_cstr(key.as_bytes(), buf)?; + write_cstr(value.as_bytes(), buf)?; + } + buf.put_u8(0); + Ok(()) + }) +} + +#[inline] +pub fn sync(buf: &mut BytesMut) { + buf.put_u8(b'S'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +#[inline] +pub fn terminate(buf: &mut BytesMut) { + buf.put_u8(b'X'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +#[inline] +fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { + if s.contains(&0) { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "string contains embedded null", + )); + } + buf.put_slice(s); + buf.put_u8(0); + Ok(()) +} diff --git a/libs/proxy/postgres-protocol2/src/message/mod.rs b/libs/proxy/postgres-protocol2/src/message/mod.rs new file mode 100644 index 0000000000..9e5d997548 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/mod.rs @@ -0,0 +1,8 @@ +//! Postgres message protocol support. +//! +//! See [Postgres's documentation][docs] for more information on message flow. +//! +//! [docs]: https://www.postgresql.org/docs/9.5/static/protocol-flow.html + +pub mod backend; +pub mod frontend; diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs new file mode 100644 index 0000000000..e669e80f3f --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -0,0 +1,107 @@ +//! Functions to encrypt a password in the client. +//! +//! This is intended to be used by client applications that wish to +//! send commands like `ALTER USER joe PASSWORD 'pwd'`. The password +//! need not be sent in cleartext if it is encrypted on the client +//! side. This is good because it ensures the cleartext password won't +//! end up in logs pg_stat displays, etc. + +use crate::authentication::sasl; +use hmac::{Hmac, Mac}; +use md5::Md5; +use rand::RngCore; +use sha2::digest::FixedOutput; +use sha2::{Digest, Sha256}; + +#[cfg(test)] +mod test; + +const SCRAM_DEFAULT_ITERATIONS: u32 = 4096; +const SCRAM_DEFAULT_SALT_LEN: usize = 16; + +/// Hash password using SCRAM-SHA-256 with a randomly-generated +/// salt. +/// +/// The client may assume the returned string doesn't contain any +/// special characters that would require escaping in an SQL command. +pub async fn scram_sha_256(password: &[u8]) -> String { + let mut salt: [u8; SCRAM_DEFAULT_SALT_LEN] = [0; SCRAM_DEFAULT_SALT_LEN]; + let mut rng = rand::thread_rng(); + rng.fill_bytes(&mut salt); + scram_sha_256_salt(password, salt).await +} + +// Internal implementation of scram_sha_256 with a caller-provided +// salt. This is useful for testing. +pub(crate) async fn scram_sha_256_salt( + password: &[u8], + salt: [u8; SCRAM_DEFAULT_SALT_LEN], +) -> String { + // Prepare the password, per [RFC + // 4013](https://tools.ietf.org/html/rfc4013), if possible. + // + // Postgres treats passwords as byte strings (without embedded NUL + // bytes), but SASL expects passwords to be valid UTF-8. + // + // Follow the behavior of libpq's PQencryptPasswordConn(), and + // also the backend. If the password is not valid UTF-8, or if it + // contains prohibited characters (such as non-ASCII whitespace), + // just skip the SASLprep step and use the original byte + // sequence. + let prepared: Vec = match std::str::from_utf8(password) { + Ok(password_str) => { + match stringprep::saslprep(password_str) { + Ok(p) => p.into_owned().into_bytes(), + // contains invalid characters; skip saslprep + Err(_) => Vec::from(password), + } + } + // not valid UTF-8; skip saslprep + Err(_) => Vec::from(password), + }; + + // salt password + let salted_password = sasl::hi(&prepared, &salt, SCRAM_DEFAULT_ITERATIONS).await; + + // client key + let mut hmac = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes"); + hmac.update(b"Client Key"); + let client_key = hmac.finalize().into_bytes(); + + // stored key + let mut hash = Sha256::default(); + hash.update(client_key.as_slice()); + let stored_key = hash.finalize_fixed(); + + // server key + let mut hmac = Hmac::::new_from_slice(&salted_password) + .expect("HMAC is able to accept all key sizes"); + hmac.update(b"Server Key"); + let server_key = hmac.finalize().into_bytes(); + + format!( + "SCRAM-SHA-256${}:{}${}:{}", + SCRAM_DEFAULT_ITERATIONS, + base64::encode(salt), + base64::encode(stored_key), + base64::encode(server_key) + ) +} + +/// **Not recommended, as MD5 is not considered to be secure.** +/// +/// Hash password using MD5 with the username as the salt. +/// +/// The client may assume the returned string doesn't contain any +/// special characters that would require escaping. +pub fn md5(password: &[u8], username: &str) -> String { + // salt password with username + let mut salted_password = Vec::from(password); + salted_password.extend_from_slice(username.as_bytes()); + + let mut hash = Md5::new(); + hash.update(&salted_password); + let digest = hash.finalize(); + format!("md5{:x}", digest) +} diff --git a/libs/proxy/postgres-protocol2/src/password/test.rs b/libs/proxy/postgres-protocol2/src/password/test.rs new file mode 100644 index 0000000000..c9d340f09d --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/password/test.rs @@ -0,0 +1,19 @@ +use crate::password; + +#[tokio::test] +async fn test_encrypt_scram_sha_256() { + // Specify the salt to make the test deterministic. Any bytes will do. + let salt: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + assert_eq!( + password::scram_sha_256_salt(b"secret", salt).await, + "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA=" + ); +} + +#[test] +fn test_encrypt_md5() { + assert_eq!( + password::md5(b"secret", "foo"), + "md54ab2c5d00339c4b2a4e921d2dc4edec7" + ); +} diff --git a/libs/proxy/postgres-protocol2/src/types/mod.rs b/libs/proxy/postgres-protocol2/src/types/mod.rs new file mode 100644 index 0000000000..78131c05bf --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/types/mod.rs @@ -0,0 +1,294 @@ +//! Conversions to and from Postgres's binary format for various types. +use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{BufMut, BytesMut}; +use fallible_iterator::FallibleIterator; +use std::boxed::Box as StdBox; +use std::error::Error; +use std::str; + +use crate::Oid; + +#[cfg(test)] +mod test; + +/// Serializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value. +#[inline] +pub fn text_to_sql(v: &str, buf: &mut BytesMut) { + buf.put_slice(v.as_bytes()); +} + +/// Deserializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value. +#[inline] +pub fn text_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + Ok(str::from_utf8(buf)?) +} + +/// Deserializes a `"char"` value. +#[inline] +pub fn char_from_sql(mut buf: &[u8]) -> Result> { + let v = buf.read_i8()?; + if !buf.is_empty() { + return Err("invalid buffer size".into()); + } + Ok(v) +} + +/// Serializes an `OID` value. +#[inline] +pub fn oid_to_sql(v: Oid, buf: &mut BytesMut) { + buf.put_u32(v); +} + +/// Deserializes an `OID` value. +#[inline] +pub fn oid_from_sql(mut buf: &[u8]) -> Result> { + let v = buf.read_u32::()?; + if !buf.is_empty() { + return Err("invalid buffer size".into()); + } + Ok(v) +} + +/// A fallible iterator over `HSTORE` entries. +pub struct HstoreEntries<'a> { + remaining: i32, + buf: &'a [u8], +} + +impl<'a> FallibleIterator for HstoreEntries<'a> { + type Item = (&'a str, Option<&'a str>); + type Error = StdBox; + + #[inline] + #[allow(clippy::type_complexity)] + fn next( + &mut self, + ) -> Result)>, StdBox> { + if self.remaining == 0 { + if !self.buf.is_empty() { + return Err("invalid buffer size".into()); + } + return Ok(None); + } + + self.remaining -= 1; + + let key_len = self.buf.read_i32::()?; + if key_len < 0 { + return Err("invalid key length".into()); + } + let (key, buf) = self.buf.split_at(key_len as usize); + let key = str::from_utf8(key)?; + self.buf = buf; + + let value_len = self.buf.read_i32::()?; + let value = if value_len < 0 { + None + } else { + let (value, buf) = self.buf.split_at(value_len as usize); + let value = str::from_utf8(value)?; + self.buf = buf; + Some(value) + }; + + Ok(Some((key, value))) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +/// Deserializes an array value. +#[inline] +pub fn array_from_sql(mut buf: &[u8]) -> Result, StdBox> { + let dimensions = buf.read_i32::()?; + if dimensions < 0 { + return Err("invalid dimension count".into()); + } + + let mut r = buf; + let mut elements = 1i32; + for _ in 0..dimensions { + let len = r.read_i32::()?; + if len < 0 { + return Err("invalid dimension size".into()); + } + let _lower_bound = r.read_i32::()?; + elements = match elements.checked_mul(len) { + Some(elements) => elements, + None => return Err("too many array elements".into()), + }; + } + + if dimensions == 0 { + elements = 0; + } + + Ok(Array { + dimensions, + elements, + buf, + }) +} + +/// A Postgres array. +pub struct Array<'a> { + dimensions: i32, + elements: i32, + buf: &'a [u8], +} + +impl<'a> Array<'a> { + /// Returns an iterator over the dimensions of the array. + #[inline] + pub fn dimensions(&self) -> ArrayDimensions<'a> { + ArrayDimensions(&self.buf[..self.dimensions as usize * 8]) + } + + /// Returns an iterator over the values of the array. + #[inline] + pub fn values(&self) -> ArrayValues<'a> { + ArrayValues { + remaining: self.elements, + buf: &self.buf[self.dimensions as usize * 8..], + } + } +} + +/// An iterator over the dimensions of an array. +pub struct ArrayDimensions<'a>(&'a [u8]); + +impl FallibleIterator for ArrayDimensions<'_> { + type Item = ArrayDimension; + type Error = StdBox; + + #[inline] + fn next(&mut self) -> Result, StdBox> { + if self.0.is_empty() { + return Ok(None); + } + + let len = self.0.read_i32::()?; + let lower_bound = self.0.read_i32::()?; + + Ok(Some(ArrayDimension { len, lower_bound })) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.0.len() / 8; + (len, Some(len)) + } +} + +/// Information about a dimension of an array. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct ArrayDimension { + /// The length of this dimension. + pub len: i32, + + /// The base value used to index into this dimension. + pub lower_bound: i32, +} + +/// An iterator over the values of an array, in row-major order. +pub struct ArrayValues<'a> { + remaining: i32, + buf: &'a [u8], +} + +impl<'a> FallibleIterator for ArrayValues<'a> { + type Item = Option<&'a [u8]>; + type Error = StdBox; + + #[inline] + fn next(&mut self) -> Result>, StdBox> { + if self.remaining == 0 { + if !self.buf.is_empty() { + return Err("invalid message length: arrayvalue not drained".into()); + } + return Ok(None); + } + self.remaining -= 1; + + let len = self.buf.read_i32::()?; + let val = if len < 0 { + None + } else { + if self.buf.len() < len as usize { + return Err("invalid value length".into()); + } + + let (val, buf) = self.buf.split_at(len as usize); + self.buf = buf; + Some(val) + }; + + Ok(Some(val)) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +/// Serializes a Postgres ltree string +#[inline] +pub fn ltree_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an ltree string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres ltree string +#[inline] +pub fn ltree_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the ltree per spec + [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), + _ => Err("ltree version 1 only supported".into()), + } +} + +/// Serializes a Postgres lquery string +#[inline] +pub fn lquery_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an lquery string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres lquery string +#[inline] +pub fn lquery_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the lquery per spec + [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), + _ => Err("lquery version 1 only supported".into()), + } +} + +/// Serializes a Postgres ltxtquery string +#[inline] +pub fn ltxtquery_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an ltxtquery string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres ltxtquery string +#[inline] +pub fn ltxtquery_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the ltxtquery per spec + [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), + _ => Err("ltxtquery version 1 only supported".into()), + } +} diff --git a/libs/proxy/postgres-protocol2/src/types/test.rs b/libs/proxy/postgres-protocol2/src/types/test.rs new file mode 100644 index 0000000000..96cc055bc3 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/types/test.rs @@ -0,0 +1,87 @@ +use bytes::{Buf, BytesMut}; + +use super::*; + +#[test] +fn ltree_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + let mut buf = BytesMut::new(); + + ltree_to_sql("A.B.C", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn ltree_str() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn ltree_wrong_version() { + let mut query = vec![2u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_err()) +} + +#[test] +fn lquery_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + let mut buf = BytesMut::new(); + + lquery_to_sql("A.B.C", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn lquery_str() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(lquery_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn lquery_wrong_version() { + let mut query = vec![2u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(lquery_from_sql(query.as_slice()).is_err()) +} + +#[test] +fn ltxtquery_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("a & b*".as_bytes()); + + let mut buf = BytesMut::new(); + + ltree_to_sql("a & b*", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn ltxtquery_str() { + let mut query = vec![1u8]; + query.extend_from_slice("a & b*".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn ltxtquery_wrong_version() { + let mut query = vec![2u8]; + query.extend_from_slice("a & b*".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_err()) +} diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml new file mode 100644 index 0000000000..58cfb5571f --- /dev/null +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "postgres-types2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +bytes.workspace = true +fallible-iterator.workspace = true +postgres-protocol2 = { path = "../postgres-protocol2" } diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs new file mode 100644 index 0000000000..18ba032151 --- /dev/null +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -0,0 +1,477 @@ +//! Conversions to and from Postgres types. +//! +//! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it +//! unless you want to define your own `ToSql` or `FromSql` definitions. +#![doc(html_root_url = "https://docs.rs/postgres-types/0.2")] +#![warn(clippy::all, rust_2018_idioms, missing_docs)] + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::types; +use std::any::type_name; +use std::error::Error; +use std::fmt; +use std::sync::Arc; + +use crate::type_gen::{Inner, Other}; + +#[doc(inline)] +pub use postgres_protocol2::Oid; + +use bytes::BytesMut; + +/// Generates a simple implementation of `ToSql::accepts` which accepts the +/// types passed to it. +macro_rules! accepts { + ($($expected:ident),+) => ( + fn accepts(ty: &$crate::Type) -> bool { + matches!(*ty, $($crate::Type::$expected)|+) + } + ) +} + +/// Generates an implementation of `ToSql::to_sql_checked`. +/// +/// All `ToSql` implementations should use this macro. +macro_rules! to_sql_checked { + () => { + fn to_sql_checked( + &self, + ty: &$crate::Type, + out: &mut $crate::private::BytesMut, + ) -> ::std::result::Result< + $crate::IsNull, + Box, + > { + $crate::__to_sql_checked(self, ty, out) + } + }; +} + +// WARNING: this function is not considered part of this crate's public API. +// It is subject to change at any time. +#[doc(hidden)] +pub fn __to_sql_checked( + v: &T, + ty: &Type, + out: &mut BytesMut, +) -> Result> +where + T: ToSql, +{ + if !T::accepts(ty) { + return Err(Box::new(WrongType::new::(ty.clone()))); + } + v.to_sql(ty, out) +} + +// mod pg_lsn; +#[doc(hidden)] +pub mod private; +// mod special; +mod type_gen; + +/// A Postgres type. +#[derive(PartialEq, Eq, Clone, Hash)] +pub struct Type(Inner); + +impl fmt::Debug for Type { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.0, fmt) + } +} + +impl fmt::Display for Type { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.schema() { + "public" | "pg_catalog" => {} + schema => write!(fmt, "{}.", schema)?, + } + fmt.write_str(self.name()) + } +} + +impl Type { + /// Creates a new `Type`. + pub fn new(name: String, oid: Oid, kind: Kind, schema: String) -> Type { + Type(Inner::Other(Arc::new(Other { + name, + oid, + kind, + schema, + }))) + } + + /// Returns the `Type` corresponding to the provided `Oid` if it + /// corresponds to a built-in type. + pub fn from_oid(oid: Oid) -> Option { + Inner::from_oid(oid).map(Type) + } + + /// Returns the OID of the `Type`. + pub fn oid(&self) -> Oid { + self.0.oid() + } + + /// Returns the kind of this type. + pub fn kind(&self) -> &Kind { + self.0.kind() + } + + /// Returns the schema of this type. + pub fn schema(&self) -> &str { + match self.0 { + Inner::Other(ref u) => &u.schema, + _ => "pg_catalog", + } + } + + /// Returns the name of this type. + pub fn name(&self) -> &str { + self.0.name() + } +} + +/// Represents the kind of a Postgres type. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum Kind { + /// A simple type like `VARCHAR` or `INTEGER`. + Simple, + /// An enumerated type along with its variants. + Enum(Vec), + /// A pseudo-type. + Pseudo, + /// An array type along with the type of its elements. + Array(Type), + /// A range type along with the type of its elements. + Range(Type), + /// A multirange type along with the type of its elements. + Multirange(Type), + /// A domain type along with its underlying type. + Domain(Type), + /// A composite type along with information about its fields. + Composite(Vec), +} + +/// Information about a field of a composite type. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Field { + name: String, + type_: Type, +} + +impl Field { + /// Creates a new `Field`. + pub fn new(name: String, type_: Type) -> Field { + Field { name, type_ } + } + + /// Returns the name of the field. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns the type of the field. + pub fn type_(&self) -> &Type { + &self.type_ + } +} + +/// An error indicating that a `NULL` Postgres value was passed to a `FromSql` +/// implementation that does not support `NULL` values. +#[derive(Debug, Clone, Copy)] +pub struct WasNull; + +impl fmt::Display for WasNull { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.write_str("a Postgres value was `NULL`") + } +} + +impl Error for WasNull {} + +/// An error indicating that a conversion was attempted between incompatible +/// Rust and Postgres types. +#[derive(Debug)] +pub struct WrongType { + postgres: Type, + rust: &'static str, +} + +impl fmt::Display for WrongType { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + fmt, + "cannot convert between the Rust type `{}` and the Postgres type `{}`", + self.rust, self.postgres, + ) + } +} + +impl Error for WrongType {} + +impl WrongType { + /// Creates a new `WrongType` error. + pub fn new(ty: Type) -> WrongType { + WrongType { + postgres: ty, + rust: type_name::(), + } + } +} + +/// An error indicating that a as_text conversion was attempted on a binary +/// result. +#[derive(Debug)] +pub struct WrongFormat {} + +impl Error for WrongFormat {} + +impl fmt::Display for WrongFormat { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + fmt, + "cannot read column as text while it is in binary format" + ) + } +} + +/// A trait for types that can be created from a Postgres value. +pub trait FromSql<'a>: Sized { + /// Creates a new value of this type from a buffer of data of the specified + /// Postgres `Type` in its binary format. + /// + /// The caller of this method is responsible for ensuring that this type + /// is compatible with the Postgres `Type`. + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result>; + + /// Creates a new value of this type from a `NULL` SQL value. + /// + /// The caller of this method is responsible for ensuring that this type + /// is compatible with the Postgres `Type`. + /// + /// The default implementation returns `Err(Box::new(WasNull))`. + #[allow(unused_variables)] + fn from_sql_null(ty: &Type) -> Result> { + Err(Box::new(WasNull)) + } + + /// A convenience function that delegates to `from_sql` and `from_sql_null` depending on the + /// value of `raw`. + fn from_sql_nullable( + ty: &Type, + raw: Option<&'a [u8]>, + ) -> Result> { + match raw { + Some(raw) => Self::from_sql(ty, raw), + None => Self::from_sql_null(ty), + } + } + + /// Determines if a value of this type can be created from the specified + /// Postgres `Type`. + fn accepts(ty: &Type) -> bool; +} + +/// A trait for types which can be created from a Postgres value without borrowing any data. +/// +/// This is primarily useful for trait bounds on functions. +pub trait FromSqlOwned: for<'a> FromSql<'a> {} + +impl FromSqlOwned for T where T: for<'a> FromSql<'a> {} + +impl<'a, T: FromSql<'a>> FromSql<'a> for Option { + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result, Box> { + ::from_sql(ty, raw).map(Some) + } + + fn from_sql_null(_: &Type) -> Result, Box> { + Ok(None) + } + + fn accepts(ty: &Type) -> bool { + ::accepts(ty) + } +} + +impl<'a, T: FromSql<'a>> FromSql<'a> for Vec { + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result, Box> { + let member_type = match *ty.kind() { + Kind::Array(ref member) => member, + _ => panic!("expected array type"), + }; + + let array = types::array_from_sql(raw)?; + if array.dimensions().count()? > 1 { + return Err("array contains too many dimensions".into()); + } + + array + .values() + .map(|v| T::from_sql_nullable(member_type, v)) + .collect() + } + + fn accepts(ty: &Type) -> bool { + match *ty.kind() { + Kind::Array(ref inner) => T::accepts(inner), + _ => false, + } + } +} + +impl<'a> FromSql<'a> for String { + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result> { + <&str as FromSql>::from_sql(ty, raw).map(ToString::to_string) + } + + fn accepts(ty: &Type) -> bool { + <&str as FromSql>::accepts(ty) + } +} + +impl<'a> FromSql<'a> for &'a str { + fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<&'a str, Box> { + match *ty { + ref ty if ty.name() == "ltree" => types::ltree_from_sql(raw), + ref ty if ty.name() == "lquery" => types::lquery_from_sql(raw), + ref ty if ty.name() == "ltxtquery" => types::ltxtquery_from_sql(raw), + _ => types::text_from_sql(raw), + } + } + + fn accepts(ty: &Type) -> bool { + match *ty { + Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true, + ref ty + if (ty.name() == "citext" + || ty.name() == "ltree" + || ty.name() == "lquery" + || ty.name() == "ltxtquery") => + { + true + } + _ => false, + } + } +} + +macro_rules! simple_from { + ($t:ty, $f:ident, $($expected:ident),+) => { + impl<'a> FromSql<'a> for $t { + fn from_sql(_: &Type, raw: &'a [u8]) -> Result<$t, Box> { + types::$f(raw) + } + + accepts!($($expected),+); + } + } +} + +simple_from!(i8, char_from_sql, CHAR); +simple_from!(u32, oid_from_sql, OID); + +/// An enum representing the nullability of a Postgres value. +pub enum IsNull { + /// The value is NULL. + Yes, + /// The value is not NULL. + No, +} + +/// A trait for types that can be converted into Postgres values. +pub trait ToSql: fmt::Debug { + /// Converts the value of `self` into the binary format of the specified + /// Postgres `Type`, appending it to `out`. + /// + /// The caller of this method is responsible for ensuring that this type + /// is compatible with the Postgres `Type`. + /// + /// The return value indicates if this value should be represented as + /// `NULL`. If this is the case, implementations **must not** write + /// anything to `out`. + fn to_sql(&self, ty: &Type, out: &mut BytesMut) -> Result> + where + Self: Sized; + + /// Determines if a value of this type can be converted to the specified + /// Postgres `Type`. + fn accepts(ty: &Type) -> bool + where + Self: Sized; + + /// An adaptor method used internally by Rust-Postgres. + /// + /// *All* implementations of this method should be generated by the + /// `to_sql_checked!()` macro. + fn to_sql_checked( + &self, + ty: &Type, + out: &mut BytesMut, + ) -> Result>; + + /// Specify the encode format + fn encode_format(&self, _ty: &Type) -> Format { + Format::Binary + } +} + +/// Supported Postgres message format types +/// +/// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8` +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Format { + /// Text format (UTF-8) + Text, + /// Compact, typed binary format + Binary, +} + +impl ToSql for &str { + fn to_sql(&self, ty: &Type, w: &mut BytesMut) -> Result> { + match *ty { + ref ty if ty.name() == "ltree" => types::ltree_to_sql(self, w), + ref ty if ty.name() == "lquery" => types::lquery_to_sql(self, w), + ref ty if ty.name() == "ltxtquery" => types::ltxtquery_to_sql(self, w), + _ => types::text_to_sql(self, w), + } + Ok(IsNull::No) + } + + fn accepts(ty: &Type) -> bool { + match *ty { + Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true, + ref ty + if (ty.name() == "citext" + || ty.name() == "ltree" + || ty.name() == "lquery" + || ty.name() == "ltxtquery") => + { + true + } + _ => false, + } + } + + to_sql_checked!(); +} + +macro_rules! simple_to { + ($t:ty, $f:ident, $($expected:ident),+) => { + impl ToSql for $t { + fn to_sql(&self, + _: &Type, + w: &mut BytesMut) + -> Result> { + types::$f(*self, w); + Ok(IsNull::No) + } + + accepts!($($expected),+); + + to_sql_checked!(); + } + } +} + +simple_to!(u32, oid_to_sql, OID); diff --git a/libs/proxy/postgres-types2/src/private.rs b/libs/proxy/postgres-types2/src/private.rs new file mode 100644 index 0000000000..774f9a301c --- /dev/null +++ b/libs/proxy/postgres-types2/src/private.rs @@ -0,0 +1,34 @@ +use crate::{FromSql, Type}; +pub use bytes::BytesMut; +use std::error::Error; + +pub fn read_be_i32(buf: &mut &[u8]) -> Result> { + if buf.len() < 4 { + return Err("invalid buffer size".into()); + } + let mut bytes = [0; 4]; + bytes.copy_from_slice(&buf[..4]); + *buf = &buf[4..]; + Ok(i32::from_be_bytes(bytes)) +} + +pub fn read_value<'a, T>( + type_: &Type, + buf: &mut &'a [u8], +) -> Result> +where + T: FromSql<'a>, +{ + let len = read_be_i32(buf)?; + let value = if len < 0 { + None + } else { + if len as usize > buf.len() { + return Err("invalid buffer size".into()); + } + let (head, tail) = buf.split_at(len as usize); + *buf = tail; + Some(head) + }; + T::from_sql_nullable(type_, value) +} diff --git a/libs/proxy/postgres-types2/src/type_gen.rs b/libs/proxy/postgres-types2/src/type_gen.rs new file mode 100644 index 0000000000..a1bc3f85c0 --- /dev/null +++ b/libs/proxy/postgres-types2/src/type_gen.rs @@ -0,0 +1,1524 @@ +// Autogenerated file - DO NOT EDIT +use std::sync::Arc; + +use crate::{Kind, Oid, Type}; + +#[derive(PartialEq, Eq, Debug, Hash)] +pub struct Other { + pub name: String, + pub oid: Oid, + pub kind: Kind, + pub schema: String, +} + +#[derive(PartialEq, Eq, Clone, Debug, Hash)] +pub enum Inner { + Bool, + Bytea, + Char, + Name, + Int8, + Int2, + Int2Vector, + Int4, + Regproc, + Text, + Oid, + Tid, + Xid, + Cid, + OidVector, + PgDdlCommand, + Json, + Xml, + XmlArray, + PgNodeTree, + JsonArray, + TableAmHandler, + Xid8Array, + IndexAmHandler, + Point, + Lseg, + Path, + Box, + Polygon, + Line, + LineArray, + Cidr, + CidrArray, + Float4, + Float8, + Unknown, + Circle, + CircleArray, + Macaddr8, + Macaddr8Array, + Money, + MoneyArray, + Macaddr, + Inet, + BoolArray, + ByteaArray, + CharArray, + NameArray, + Int2Array, + Int2VectorArray, + Int4Array, + RegprocArray, + TextArray, + TidArray, + XidArray, + CidArray, + OidVectorArray, + BpcharArray, + VarcharArray, + Int8Array, + PointArray, + LsegArray, + PathArray, + BoxArray, + Float4Array, + Float8Array, + PolygonArray, + OidArray, + Aclitem, + AclitemArray, + MacaddrArray, + InetArray, + Bpchar, + Varchar, + Date, + Time, + Timestamp, + TimestampArray, + DateArray, + TimeArray, + Timestamptz, + TimestamptzArray, + Interval, + IntervalArray, + NumericArray, + CstringArray, + Timetz, + TimetzArray, + Bit, + BitArray, + Varbit, + VarbitArray, + Numeric, + Refcursor, + RefcursorArray, + Regprocedure, + Regoper, + Regoperator, + Regclass, + Regtype, + RegprocedureArray, + RegoperArray, + RegoperatorArray, + RegclassArray, + RegtypeArray, + Record, + Cstring, + Any, + Anyarray, + Void, + Trigger, + LanguageHandler, + Internal, + Anyelement, + RecordArray, + Anynonarray, + TxidSnapshotArray, + Uuid, + UuidArray, + TxidSnapshot, + FdwHandler, + PgLsn, + PgLsnArray, + TsmHandler, + PgNdistinct, + PgDependencies, + Anyenum, + TsVector, + Tsquery, + GtsVector, + TsVectorArray, + GtsVectorArray, + TsqueryArray, + Regconfig, + RegconfigArray, + Regdictionary, + RegdictionaryArray, + Jsonb, + JsonbArray, + AnyRange, + EventTrigger, + Int4Range, + Int4RangeArray, + NumRange, + NumRangeArray, + TsRange, + TsRangeArray, + TstzRange, + TstzRangeArray, + DateRange, + DateRangeArray, + Int8Range, + Int8RangeArray, + Jsonpath, + JsonpathArray, + Regnamespace, + RegnamespaceArray, + Regrole, + RegroleArray, + Regcollation, + RegcollationArray, + Int4multiRange, + NummultiRange, + TsmultiRange, + TstzmultiRange, + DatemultiRange, + Int8multiRange, + AnymultiRange, + AnycompatiblemultiRange, + PgBrinBloomSummary, + PgBrinMinmaxMultiSummary, + PgMcvList, + PgSnapshot, + PgSnapshotArray, + Xid8, + Anycompatible, + Anycompatiblearray, + Anycompatiblenonarray, + AnycompatibleRange, + Int4multiRangeArray, + NummultiRangeArray, + TsmultiRangeArray, + TstzmultiRangeArray, + DatemultiRangeArray, + Int8multiRangeArray, + Other(Arc), +} + +impl Inner { + pub fn from_oid(oid: Oid) -> Option { + match oid { + 16 => Some(Inner::Bool), + 17 => Some(Inner::Bytea), + 18 => Some(Inner::Char), + 19 => Some(Inner::Name), + 20 => Some(Inner::Int8), + 21 => Some(Inner::Int2), + 22 => Some(Inner::Int2Vector), + 23 => Some(Inner::Int4), + 24 => Some(Inner::Regproc), + 25 => Some(Inner::Text), + 26 => Some(Inner::Oid), + 27 => Some(Inner::Tid), + 28 => Some(Inner::Xid), + 29 => Some(Inner::Cid), + 30 => Some(Inner::OidVector), + 32 => Some(Inner::PgDdlCommand), + 114 => Some(Inner::Json), + 142 => Some(Inner::Xml), + 143 => Some(Inner::XmlArray), + 194 => Some(Inner::PgNodeTree), + 199 => Some(Inner::JsonArray), + 269 => Some(Inner::TableAmHandler), + 271 => Some(Inner::Xid8Array), + 325 => Some(Inner::IndexAmHandler), + 600 => Some(Inner::Point), + 601 => Some(Inner::Lseg), + 602 => Some(Inner::Path), + 603 => Some(Inner::Box), + 604 => Some(Inner::Polygon), + 628 => Some(Inner::Line), + 629 => Some(Inner::LineArray), + 650 => Some(Inner::Cidr), + 651 => Some(Inner::CidrArray), + 700 => Some(Inner::Float4), + 701 => Some(Inner::Float8), + 705 => Some(Inner::Unknown), + 718 => Some(Inner::Circle), + 719 => Some(Inner::CircleArray), + 774 => Some(Inner::Macaddr8), + 775 => Some(Inner::Macaddr8Array), + 790 => Some(Inner::Money), + 791 => Some(Inner::MoneyArray), + 829 => Some(Inner::Macaddr), + 869 => Some(Inner::Inet), + 1000 => Some(Inner::BoolArray), + 1001 => Some(Inner::ByteaArray), + 1002 => Some(Inner::CharArray), + 1003 => Some(Inner::NameArray), + 1005 => Some(Inner::Int2Array), + 1006 => Some(Inner::Int2VectorArray), + 1007 => Some(Inner::Int4Array), + 1008 => Some(Inner::RegprocArray), + 1009 => Some(Inner::TextArray), + 1010 => Some(Inner::TidArray), + 1011 => Some(Inner::XidArray), + 1012 => Some(Inner::CidArray), + 1013 => Some(Inner::OidVectorArray), + 1014 => Some(Inner::BpcharArray), + 1015 => Some(Inner::VarcharArray), + 1016 => Some(Inner::Int8Array), + 1017 => Some(Inner::PointArray), + 1018 => Some(Inner::LsegArray), + 1019 => Some(Inner::PathArray), + 1020 => Some(Inner::BoxArray), + 1021 => Some(Inner::Float4Array), + 1022 => Some(Inner::Float8Array), + 1027 => Some(Inner::PolygonArray), + 1028 => Some(Inner::OidArray), + 1033 => Some(Inner::Aclitem), + 1034 => Some(Inner::AclitemArray), + 1040 => Some(Inner::MacaddrArray), + 1041 => Some(Inner::InetArray), + 1042 => Some(Inner::Bpchar), + 1043 => Some(Inner::Varchar), + 1082 => Some(Inner::Date), + 1083 => Some(Inner::Time), + 1114 => Some(Inner::Timestamp), + 1115 => Some(Inner::TimestampArray), + 1182 => Some(Inner::DateArray), + 1183 => Some(Inner::TimeArray), + 1184 => Some(Inner::Timestamptz), + 1185 => Some(Inner::TimestamptzArray), + 1186 => Some(Inner::Interval), + 1187 => Some(Inner::IntervalArray), + 1231 => Some(Inner::NumericArray), + 1263 => Some(Inner::CstringArray), + 1266 => Some(Inner::Timetz), + 1270 => Some(Inner::TimetzArray), + 1560 => Some(Inner::Bit), + 1561 => Some(Inner::BitArray), + 1562 => Some(Inner::Varbit), + 1563 => Some(Inner::VarbitArray), + 1700 => Some(Inner::Numeric), + 1790 => Some(Inner::Refcursor), + 2201 => Some(Inner::RefcursorArray), + 2202 => Some(Inner::Regprocedure), + 2203 => Some(Inner::Regoper), + 2204 => Some(Inner::Regoperator), + 2205 => Some(Inner::Regclass), + 2206 => Some(Inner::Regtype), + 2207 => Some(Inner::RegprocedureArray), + 2208 => Some(Inner::RegoperArray), + 2209 => Some(Inner::RegoperatorArray), + 2210 => Some(Inner::RegclassArray), + 2211 => Some(Inner::RegtypeArray), + 2249 => Some(Inner::Record), + 2275 => Some(Inner::Cstring), + 2276 => Some(Inner::Any), + 2277 => Some(Inner::Anyarray), + 2278 => Some(Inner::Void), + 2279 => Some(Inner::Trigger), + 2280 => Some(Inner::LanguageHandler), + 2281 => Some(Inner::Internal), + 2283 => Some(Inner::Anyelement), + 2287 => Some(Inner::RecordArray), + 2776 => Some(Inner::Anynonarray), + 2949 => Some(Inner::TxidSnapshotArray), + 2950 => Some(Inner::Uuid), + 2951 => Some(Inner::UuidArray), + 2970 => Some(Inner::TxidSnapshot), + 3115 => Some(Inner::FdwHandler), + 3220 => Some(Inner::PgLsn), + 3221 => Some(Inner::PgLsnArray), + 3310 => Some(Inner::TsmHandler), + 3361 => Some(Inner::PgNdistinct), + 3402 => Some(Inner::PgDependencies), + 3500 => Some(Inner::Anyenum), + 3614 => Some(Inner::TsVector), + 3615 => Some(Inner::Tsquery), + 3642 => Some(Inner::GtsVector), + 3643 => Some(Inner::TsVectorArray), + 3644 => Some(Inner::GtsVectorArray), + 3645 => Some(Inner::TsqueryArray), + 3734 => Some(Inner::Regconfig), + 3735 => Some(Inner::RegconfigArray), + 3769 => Some(Inner::Regdictionary), + 3770 => Some(Inner::RegdictionaryArray), + 3802 => Some(Inner::Jsonb), + 3807 => Some(Inner::JsonbArray), + 3831 => Some(Inner::AnyRange), + 3838 => Some(Inner::EventTrigger), + 3904 => Some(Inner::Int4Range), + 3905 => Some(Inner::Int4RangeArray), + 3906 => Some(Inner::NumRange), + 3907 => Some(Inner::NumRangeArray), + 3908 => Some(Inner::TsRange), + 3909 => Some(Inner::TsRangeArray), + 3910 => Some(Inner::TstzRange), + 3911 => Some(Inner::TstzRangeArray), + 3912 => Some(Inner::DateRange), + 3913 => Some(Inner::DateRangeArray), + 3926 => Some(Inner::Int8Range), + 3927 => Some(Inner::Int8RangeArray), + 4072 => Some(Inner::Jsonpath), + 4073 => Some(Inner::JsonpathArray), + 4089 => Some(Inner::Regnamespace), + 4090 => Some(Inner::RegnamespaceArray), + 4096 => Some(Inner::Regrole), + 4097 => Some(Inner::RegroleArray), + 4191 => Some(Inner::Regcollation), + 4192 => Some(Inner::RegcollationArray), + 4451 => Some(Inner::Int4multiRange), + 4532 => Some(Inner::NummultiRange), + 4533 => Some(Inner::TsmultiRange), + 4534 => Some(Inner::TstzmultiRange), + 4535 => Some(Inner::DatemultiRange), + 4536 => Some(Inner::Int8multiRange), + 4537 => Some(Inner::AnymultiRange), + 4538 => Some(Inner::AnycompatiblemultiRange), + 4600 => Some(Inner::PgBrinBloomSummary), + 4601 => Some(Inner::PgBrinMinmaxMultiSummary), + 5017 => Some(Inner::PgMcvList), + 5038 => Some(Inner::PgSnapshot), + 5039 => Some(Inner::PgSnapshotArray), + 5069 => Some(Inner::Xid8), + 5077 => Some(Inner::Anycompatible), + 5078 => Some(Inner::Anycompatiblearray), + 5079 => Some(Inner::Anycompatiblenonarray), + 5080 => Some(Inner::AnycompatibleRange), + 6150 => Some(Inner::Int4multiRangeArray), + 6151 => Some(Inner::NummultiRangeArray), + 6152 => Some(Inner::TsmultiRangeArray), + 6153 => Some(Inner::TstzmultiRangeArray), + 6155 => Some(Inner::DatemultiRangeArray), + 6157 => Some(Inner::Int8multiRangeArray), + _ => None, + } + } + + pub fn oid(&self) -> Oid { + match *self { + Inner::Bool => 16, + Inner::Bytea => 17, + Inner::Char => 18, + Inner::Name => 19, + Inner::Int8 => 20, + Inner::Int2 => 21, + Inner::Int2Vector => 22, + Inner::Int4 => 23, + Inner::Regproc => 24, + Inner::Text => 25, + Inner::Oid => 26, + Inner::Tid => 27, + Inner::Xid => 28, + Inner::Cid => 29, + Inner::OidVector => 30, + Inner::PgDdlCommand => 32, + Inner::Json => 114, + Inner::Xml => 142, + Inner::XmlArray => 143, + Inner::PgNodeTree => 194, + Inner::JsonArray => 199, + Inner::TableAmHandler => 269, + Inner::Xid8Array => 271, + Inner::IndexAmHandler => 325, + Inner::Point => 600, + Inner::Lseg => 601, + Inner::Path => 602, + Inner::Box => 603, + Inner::Polygon => 604, + Inner::Line => 628, + Inner::LineArray => 629, + Inner::Cidr => 650, + Inner::CidrArray => 651, + Inner::Float4 => 700, + Inner::Float8 => 701, + Inner::Unknown => 705, + Inner::Circle => 718, + Inner::CircleArray => 719, + Inner::Macaddr8 => 774, + Inner::Macaddr8Array => 775, + Inner::Money => 790, + Inner::MoneyArray => 791, + Inner::Macaddr => 829, + Inner::Inet => 869, + Inner::BoolArray => 1000, + Inner::ByteaArray => 1001, + Inner::CharArray => 1002, + Inner::NameArray => 1003, + Inner::Int2Array => 1005, + Inner::Int2VectorArray => 1006, + Inner::Int4Array => 1007, + Inner::RegprocArray => 1008, + Inner::TextArray => 1009, + Inner::TidArray => 1010, + Inner::XidArray => 1011, + Inner::CidArray => 1012, + Inner::OidVectorArray => 1013, + Inner::BpcharArray => 1014, + Inner::VarcharArray => 1015, + Inner::Int8Array => 1016, + Inner::PointArray => 1017, + Inner::LsegArray => 1018, + Inner::PathArray => 1019, + Inner::BoxArray => 1020, + Inner::Float4Array => 1021, + Inner::Float8Array => 1022, + Inner::PolygonArray => 1027, + Inner::OidArray => 1028, + Inner::Aclitem => 1033, + Inner::AclitemArray => 1034, + Inner::MacaddrArray => 1040, + Inner::InetArray => 1041, + Inner::Bpchar => 1042, + Inner::Varchar => 1043, + Inner::Date => 1082, + Inner::Time => 1083, + Inner::Timestamp => 1114, + Inner::TimestampArray => 1115, + Inner::DateArray => 1182, + Inner::TimeArray => 1183, + Inner::Timestamptz => 1184, + Inner::TimestamptzArray => 1185, + Inner::Interval => 1186, + Inner::IntervalArray => 1187, + Inner::NumericArray => 1231, + Inner::CstringArray => 1263, + Inner::Timetz => 1266, + Inner::TimetzArray => 1270, + Inner::Bit => 1560, + Inner::BitArray => 1561, + Inner::Varbit => 1562, + Inner::VarbitArray => 1563, + Inner::Numeric => 1700, + Inner::Refcursor => 1790, + Inner::RefcursorArray => 2201, + Inner::Regprocedure => 2202, + Inner::Regoper => 2203, + Inner::Regoperator => 2204, + Inner::Regclass => 2205, + Inner::Regtype => 2206, + Inner::RegprocedureArray => 2207, + Inner::RegoperArray => 2208, + Inner::RegoperatorArray => 2209, + Inner::RegclassArray => 2210, + Inner::RegtypeArray => 2211, + Inner::Record => 2249, + Inner::Cstring => 2275, + Inner::Any => 2276, + Inner::Anyarray => 2277, + Inner::Void => 2278, + Inner::Trigger => 2279, + Inner::LanguageHandler => 2280, + Inner::Internal => 2281, + Inner::Anyelement => 2283, + Inner::RecordArray => 2287, + Inner::Anynonarray => 2776, + Inner::TxidSnapshotArray => 2949, + Inner::Uuid => 2950, + Inner::UuidArray => 2951, + Inner::TxidSnapshot => 2970, + Inner::FdwHandler => 3115, + Inner::PgLsn => 3220, + Inner::PgLsnArray => 3221, + Inner::TsmHandler => 3310, + Inner::PgNdistinct => 3361, + Inner::PgDependencies => 3402, + Inner::Anyenum => 3500, + Inner::TsVector => 3614, + Inner::Tsquery => 3615, + Inner::GtsVector => 3642, + Inner::TsVectorArray => 3643, + Inner::GtsVectorArray => 3644, + Inner::TsqueryArray => 3645, + Inner::Regconfig => 3734, + Inner::RegconfigArray => 3735, + Inner::Regdictionary => 3769, + Inner::RegdictionaryArray => 3770, + Inner::Jsonb => 3802, + Inner::JsonbArray => 3807, + Inner::AnyRange => 3831, + Inner::EventTrigger => 3838, + Inner::Int4Range => 3904, + Inner::Int4RangeArray => 3905, + Inner::NumRange => 3906, + Inner::NumRangeArray => 3907, + Inner::TsRange => 3908, + Inner::TsRangeArray => 3909, + Inner::TstzRange => 3910, + Inner::TstzRangeArray => 3911, + Inner::DateRange => 3912, + Inner::DateRangeArray => 3913, + Inner::Int8Range => 3926, + Inner::Int8RangeArray => 3927, + Inner::Jsonpath => 4072, + Inner::JsonpathArray => 4073, + Inner::Regnamespace => 4089, + Inner::RegnamespaceArray => 4090, + Inner::Regrole => 4096, + Inner::RegroleArray => 4097, + Inner::Regcollation => 4191, + Inner::RegcollationArray => 4192, + Inner::Int4multiRange => 4451, + Inner::NummultiRange => 4532, + Inner::TsmultiRange => 4533, + Inner::TstzmultiRange => 4534, + Inner::DatemultiRange => 4535, + Inner::Int8multiRange => 4536, + Inner::AnymultiRange => 4537, + Inner::AnycompatiblemultiRange => 4538, + Inner::PgBrinBloomSummary => 4600, + Inner::PgBrinMinmaxMultiSummary => 4601, + Inner::PgMcvList => 5017, + Inner::PgSnapshot => 5038, + Inner::PgSnapshotArray => 5039, + Inner::Xid8 => 5069, + Inner::Anycompatible => 5077, + Inner::Anycompatiblearray => 5078, + Inner::Anycompatiblenonarray => 5079, + Inner::AnycompatibleRange => 5080, + Inner::Int4multiRangeArray => 6150, + Inner::NummultiRangeArray => 6151, + Inner::TsmultiRangeArray => 6152, + Inner::TstzmultiRangeArray => 6153, + Inner::DatemultiRangeArray => 6155, + Inner::Int8multiRangeArray => 6157, + Inner::Other(ref u) => u.oid, + } + } + + pub fn kind(&self) -> &Kind { + match *self { + Inner::Bool => &Kind::Simple, + Inner::Bytea => &Kind::Simple, + Inner::Char => &Kind::Simple, + Inner::Name => &Kind::Simple, + Inner::Int8 => &Kind::Simple, + Inner::Int2 => &Kind::Simple, + Inner::Int2Vector => &Kind::Array(Type(Inner::Int2)), + Inner::Int4 => &Kind::Simple, + Inner::Regproc => &Kind::Simple, + Inner::Text => &Kind::Simple, + Inner::Oid => &Kind::Simple, + Inner::Tid => &Kind::Simple, + Inner::Xid => &Kind::Simple, + Inner::Cid => &Kind::Simple, + Inner::OidVector => &Kind::Array(Type(Inner::Oid)), + Inner::PgDdlCommand => &Kind::Pseudo, + Inner::Json => &Kind::Simple, + Inner::Xml => &Kind::Simple, + Inner::XmlArray => &Kind::Array(Type(Inner::Xml)), + Inner::PgNodeTree => &Kind::Simple, + Inner::JsonArray => &Kind::Array(Type(Inner::Json)), + Inner::TableAmHandler => &Kind::Pseudo, + Inner::Xid8Array => &Kind::Array(Type(Inner::Xid8)), + Inner::IndexAmHandler => &Kind::Pseudo, + Inner::Point => &Kind::Simple, + Inner::Lseg => &Kind::Simple, + Inner::Path => &Kind::Simple, + Inner::Box => &Kind::Simple, + Inner::Polygon => &Kind::Simple, + Inner::Line => &Kind::Simple, + Inner::LineArray => &Kind::Array(Type(Inner::Line)), + Inner::Cidr => &Kind::Simple, + Inner::CidrArray => &Kind::Array(Type(Inner::Cidr)), + Inner::Float4 => &Kind::Simple, + Inner::Float8 => &Kind::Simple, + Inner::Unknown => &Kind::Simple, + Inner::Circle => &Kind::Simple, + Inner::CircleArray => &Kind::Array(Type(Inner::Circle)), + Inner::Macaddr8 => &Kind::Simple, + Inner::Macaddr8Array => &Kind::Array(Type(Inner::Macaddr8)), + Inner::Money => &Kind::Simple, + Inner::MoneyArray => &Kind::Array(Type(Inner::Money)), + Inner::Macaddr => &Kind::Simple, + Inner::Inet => &Kind::Simple, + Inner::BoolArray => &Kind::Array(Type(Inner::Bool)), + Inner::ByteaArray => &Kind::Array(Type(Inner::Bytea)), + Inner::CharArray => &Kind::Array(Type(Inner::Char)), + Inner::NameArray => &Kind::Array(Type(Inner::Name)), + Inner::Int2Array => &Kind::Array(Type(Inner::Int2)), + Inner::Int2VectorArray => &Kind::Array(Type(Inner::Int2Vector)), + Inner::Int4Array => &Kind::Array(Type(Inner::Int4)), + Inner::RegprocArray => &Kind::Array(Type(Inner::Regproc)), + Inner::TextArray => &Kind::Array(Type(Inner::Text)), + Inner::TidArray => &Kind::Array(Type(Inner::Tid)), + Inner::XidArray => &Kind::Array(Type(Inner::Xid)), + Inner::CidArray => &Kind::Array(Type(Inner::Cid)), + Inner::OidVectorArray => &Kind::Array(Type(Inner::OidVector)), + Inner::BpcharArray => &Kind::Array(Type(Inner::Bpchar)), + Inner::VarcharArray => &Kind::Array(Type(Inner::Varchar)), + Inner::Int8Array => &Kind::Array(Type(Inner::Int8)), + Inner::PointArray => &Kind::Array(Type(Inner::Point)), + Inner::LsegArray => &Kind::Array(Type(Inner::Lseg)), + Inner::PathArray => &Kind::Array(Type(Inner::Path)), + Inner::BoxArray => &Kind::Array(Type(Inner::Box)), + Inner::Float4Array => &Kind::Array(Type(Inner::Float4)), + Inner::Float8Array => &Kind::Array(Type(Inner::Float8)), + Inner::PolygonArray => &Kind::Array(Type(Inner::Polygon)), + Inner::OidArray => &Kind::Array(Type(Inner::Oid)), + Inner::Aclitem => &Kind::Simple, + Inner::AclitemArray => &Kind::Array(Type(Inner::Aclitem)), + Inner::MacaddrArray => &Kind::Array(Type(Inner::Macaddr)), + Inner::InetArray => &Kind::Array(Type(Inner::Inet)), + Inner::Bpchar => &Kind::Simple, + Inner::Varchar => &Kind::Simple, + Inner::Date => &Kind::Simple, + Inner::Time => &Kind::Simple, + Inner::Timestamp => &Kind::Simple, + Inner::TimestampArray => &Kind::Array(Type(Inner::Timestamp)), + Inner::DateArray => &Kind::Array(Type(Inner::Date)), + Inner::TimeArray => &Kind::Array(Type(Inner::Time)), + Inner::Timestamptz => &Kind::Simple, + Inner::TimestamptzArray => &Kind::Array(Type(Inner::Timestamptz)), + Inner::Interval => &Kind::Simple, + Inner::IntervalArray => &Kind::Array(Type(Inner::Interval)), + Inner::NumericArray => &Kind::Array(Type(Inner::Numeric)), + Inner::CstringArray => &Kind::Array(Type(Inner::Cstring)), + Inner::Timetz => &Kind::Simple, + Inner::TimetzArray => &Kind::Array(Type(Inner::Timetz)), + Inner::Bit => &Kind::Simple, + Inner::BitArray => &Kind::Array(Type(Inner::Bit)), + Inner::Varbit => &Kind::Simple, + Inner::VarbitArray => &Kind::Array(Type(Inner::Varbit)), + Inner::Numeric => &Kind::Simple, + Inner::Refcursor => &Kind::Simple, + Inner::RefcursorArray => &Kind::Array(Type(Inner::Refcursor)), + Inner::Regprocedure => &Kind::Simple, + Inner::Regoper => &Kind::Simple, + Inner::Regoperator => &Kind::Simple, + Inner::Regclass => &Kind::Simple, + Inner::Regtype => &Kind::Simple, + Inner::RegprocedureArray => &Kind::Array(Type(Inner::Regprocedure)), + Inner::RegoperArray => &Kind::Array(Type(Inner::Regoper)), + Inner::RegoperatorArray => &Kind::Array(Type(Inner::Regoperator)), + Inner::RegclassArray => &Kind::Array(Type(Inner::Regclass)), + Inner::RegtypeArray => &Kind::Array(Type(Inner::Regtype)), + Inner::Record => &Kind::Pseudo, + Inner::Cstring => &Kind::Pseudo, + Inner::Any => &Kind::Pseudo, + Inner::Anyarray => &Kind::Pseudo, + Inner::Void => &Kind::Pseudo, + Inner::Trigger => &Kind::Pseudo, + Inner::LanguageHandler => &Kind::Pseudo, + Inner::Internal => &Kind::Pseudo, + Inner::Anyelement => &Kind::Pseudo, + Inner::RecordArray => &Kind::Pseudo, + Inner::Anynonarray => &Kind::Pseudo, + Inner::TxidSnapshotArray => &Kind::Array(Type(Inner::TxidSnapshot)), + Inner::Uuid => &Kind::Simple, + Inner::UuidArray => &Kind::Array(Type(Inner::Uuid)), + Inner::TxidSnapshot => &Kind::Simple, + Inner::FdwHandler => &Kind::Pseudo, + Inner::PgLsn => &Kind::Simple, + Inner::PgLsnArray => &Kind::Array(Type(Inner::PgLsn)), + Inner::TsmHandler => &Kind::Pseudo, + Inner::PgNdistinct => &Kind::Simple, + Inner::PgDependencies => &Kind::Simple, + Inner::Anyenum => &Kind::Pseudo, + Inner::TsVector => &Kind::Simple, + Inner::Tsquery => &Kind::Simple, + Inner::GtsVector => &Kind::Simple, + Inner::TsVectorArray => &Kind::Array(Type(Inner::TsVector)), + Inner::GtsVectorArray => &Kind::Array(Type(Inner::GtsVector)), + Inner::TsqueryArray => &Kind::Array(Type(Inner::Tsquery)), + Inner::Regconfig => &Kind::Simple, + Inner::RegconfigArray => &Kind::Array(Type(Inner::Regconfig)), + Inner::Regdictionary => &Kind::Simple, + Inner::RegdictionaryArray => &Kind::Array(Type(Inner::Regdictionary)), + Inner::Jsonb => &Kind::Simple, + Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)), + Inner::AnyRange => &Kind::Pseudo, + Inner::EventTrigger => &Kind::Pseudo, + Inner::Int4Range => &Kind::Range(Type(Inner::Int4)), + Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)), + Inner::NumRange => &Kind::Range(Type(Inner::Numeric)), + Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)), + Inner::TsRange => &Kind::Range(Type(Inner::Timestamp)), + Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)), + Inner::TstzRange => &Kind::Range(Type(Inner::Timestamptz)), + Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)), + Inner::DateRange => &Kind::Range(Type(Inner::Date)), + Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)), + Inner::Int8Range => &Kind::Range(Type(Inner::Int8)), + Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)), + Inner::Jsonpath => &Kind::Simple, + Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)), + Inner::Regnamespace => &Kind::Simple, + Inner::RegnamespaceArray => &Kind::Array(Type(Inner::Regnamespace)), + Inner::Regrole => &Kind::Simple, + Inner::RegroleArray => &Kind::Array(Type(Inner::Regrole)), + Inner::Regcollation => &Kind::Simple, + Inner::RegcollationArray => &Kind::Array(Type(Inner::Regcollation)), + Inner::Int4multiRange => &Kind::Multirange(Type(Inner::Int4)), + Inner::NummultiRange => &Kind::Multirange(Type(Inner::Numeric)), + Inner::TsmultiRange => &Kind::Multirange(Type(Inner::Timestamp)), + Inner::TstzmultiRange => &Kind::Multirange(Type(Inner::Timestamptz)), + Inner::DatemultiRange => &Kind::Multirange(Type(Inner::Date)), + Inner::Int8multiRange => &Kind::Multirange(Type(Inner::Int8)), + Inner::AnymultiRange => &Kind::Pseudo, + Inner::AnycompatiblemultiRange => &Kind::Pseudo, + Inner::PgBrinBloomSummary => &Kind::Simple, + Inner::PgBrinMinmaxMultiSummary => &Kind::Simple, + Inner::PgMcvList => &Kind::Simple, + Inner::PgSnapshot => &Kind::Simple, + Inner::PgSnapshotArray => &Kind::Array(Type(Inner::PgSnapshot)), + Inner::Xid8 => &Kind::Simple, + Inner::Anycompatible => &Kind::Pseudo, + Inner::Anycompatiblearray => &Kind::Pseudo, + Inner::Anycompatiblenonarray => &Kind::Pseudo, + Inner::AnycompatibleRange => &Kind::Pseudo, + Inner::Int4multiRangeArray => &Kind::Array(Type(Inner::Int4multiRange)), + Inner::NummultiRangeArray => &Kind::Array(Type(Inner::NummultiRange)), + Inner::TsmultiRangeArray => &Kind::Array(Type(Inner::TsmultiRange)), + Inner::TstzmultiRangeArray => &Kind::Array(Type(Inner::TstzmultiRange)), + Inner::DatemultiRangeArray => &Kind::Array(Type(Inner::DatemultiRange)), + Inner::Int8multiRangeArray => &Kind::Array(Type(Inner::Int8multiRange)), + Inner::Other(ref u) => &u.kind, + } + } + + pub fn name(&self) -> &str { + match *self { + Inner::Bool => "bool", + Inner::Bytea => "bytea", + Inner::Char => "char", + Inner::Name => "name", + Inner::Int8 => "int8", + Inner::Int2 => "int2", + Inner::Int2Vector => "int2vector", + Inner::Int4 => "int4", + Inner::Regproc => "regproc", + Inner::Text => "text", + Inner::Oid => "oid", + Inner::Tid => "tid", + Inner::Xid => "xid", + Inner::Cid => "cid", + Inner::OidVector => "oidvector", + Inner::PgDdlCommand => "pg_ddl_command", + Inner::Json => "json", + Inner::Xml => "xml", + Inner::XmlArray => "_xml", + Inner::PgNodeTree => "pg_node_tree", + Inner::JsonArray => "_json", + Inner::TableAmHandler => "table_am_handler", + Inner::Xid8Array => "_xid8", + Inner::IndexAmHandler => "index_am_handler", + Inner::Point => "point", + Inner::Lseg => "lseg", + Inner::Path => "path", + Inner::Box => "box", + Inner::Polygon => "polygon", + Inner::Line => "line", + Inner::LineArray => "_line", + Inner::Cidr => "cidr", + Inner::CidrArray => "_cidr", + Inner::Float4 => "float4", + Inner::Float8 => "float8", + Inner::Unknown => "unknown", + Inner::Circle => "circle", + Inner::CircleArray => "_circle", + Inner::Macaddr8 => "macaddr8", + Inner::Macaddr8Array => "_macaddr8", + Inner::Money => "money", + Inner::MoneyArray => "_money", + Inner::Macaddr => "macaddr", + Inner::Inet => "inet", + Inner::BoolArray => "_bool", + Inner::ByteaArray => "_bytea", + Inner::CharArray => "_char", + Inner::NameArray => "_name", + Inner::Int2Array => "_int2", + Inner::Int2VectorArray => "_int2vector", + Inner::Int4Array => "_int4", + Inner::RegprocArray => "_regproc", + Inner::TextArray => "_text", + Inner::TidArray => "_tid", + Inner::XidArray => "_xid", + Inner::CidArray => "_cid", + Inner::OidVectorArray => "_oidvector", + Inner::BpcharArray => "_bpchar", + Inner::VarcharArray => "_varchar", + Inner::Int8Array => "_int8", + Inner::PointArray => "_point", + Inner::LsegArray => "_lseg", + Inner::PathArray => "_path", + Inner::BoxArray => "_box", + Inner::Float4Array => "_float4", + Inner::Float8Array => "_float8", + Inner::PolygonArray => "_polygon", + Inner::OidArray => "_oid", + Inner::Aclitem => "aclitem", + Inner::AclitemArray => "_aclitem", + Inner::MacaddrArray => "_macaddr", + Inner::InetArray => "_inet", + Inner::Bpchar => "bpchar", + Inner::Varchar => "varchar", + Inner::Date => "date", + Inner::Time => "time", + Inner::Timestamp => "timestamp", + Inner::TimestampArray => "_timestamp", + Inner::DateArray => "_date", + Inner::TimeArray => "_time", + Inner::Timestamptz => "timestamptz", + Inner::TimestamptzArray => "_timestamptz", + Inner::Interval => "interval", + Inner::IntervalArray => "_interval", + Inner::NumericArray => "_numeric", + Inner::CstringArray => "_cstring", + Inner::Timetz => "timetz", + Inner::TimetzArray => "_timetz", + Inner::Bit => "bit", + Inner::BitArray => "_bit", + Inner::Varbit => "varbit", + Inner::VarbitArray => "_varbit", + Inner::Numeric => "numeric", + Inner::Refcursor => "refcursor", + Inner::RefcursorArray => "_refcursor", + Inner::Regprocedure => "regprocedure", + Inner::Regoper => "regoper", + Inner::Regoperator => "regoperator", + Inner::Regclass => "regclass", + Inner::Regtype => "regtype", + Inner::RegprocedureArray => "_regprocedure", + Inner::RegoperArray => "_regoper", + Inner::RegoperatorArray => "_regoperator", + Inner::RegclassArray => "_regclass", + Inner::RegtypeArray => "_regtype", + Inner::Record => "record", + Inner::Cstring => "cstring", + Inner::Any => "any", + Inner::Anyarray => "anyarray", + Inner::Void => "void", + Inner::Trigger => "trigger", + Inner::LanguageHandler => "language_handler", + Inner::Internal => "internal", + Inner::Anyelement => "anyelement", + Inner::RecordArray => "_record", + Inner::Anynonarray => "anynonarray", + Inner::TxidSnapshotArray => "_txid_snapshot", + Inner::Uuid => "uuid", + Inner::UuidArray => "_uuid", + Inner::TxidSnapshot => "txid_snapshot", + Inner::FdwHandler => "fdw_handler", + Inner::PgLsn => "pg_lsn", + Inner::PgLsnArray => "_pg_lsn", + Inner::TsmHandler => "tsm_handler", + Inner::PgNdistinct => "pg_ndistinct", + Inner::PgDependencies => "pg_dependencies", + Inner::Anyenum => "anyenum", + Inner::TsVector => "tsvector", + Inner::Tsquery => "tsquery", + Inner::GtsVector => "gtsvector", + Inner::TsVectorArray => "_tsvector", + Inner::GtsVectorArray => "_gtsvector", + Inner::TsqueryArray => "_tsquery", + Inner::Regconfig => "regconfig", + Inner::RegconfigArray => "_regconfig", + Inner::Regdictionary => "regdictionary", + Inner::RegdictionaryArray => "_regdictionary", + Inner::Jsonb => "jsonb", + Inner::JsonbArray => "_jsonb", + Inner::AnyRange => "anyrange", + Inner::EventTrigger => "event_trigger", + Inner::Int4Range => "int4range", + Inner::Int4RangeArray => "_int4range", + Inner::NumRange => "numrange", + Inner::NumRangeArray => "_numrange", + Inner::TsRange => "tsrange", + Inner::TsRangeArray => "_tsrange", + Inner::TstzRange => "tstzrange", + Inner::TstzRangeArray => "_tstzrange", + Inner::DateRange => "daterange", + Inner::DateRangeArray => "_daterange", + Inner::Int8Range => "int8range", + Inner::Int8RangeArray => "_int8range", + Inner::Jsonpath => "jsonpath", + Inner::JsonpathArray => "_jsonpath", + Inner::Regnamespace => "regnamespace", + Inner::RegnamespaceArray => "_regnamespace", + Inner::Regrole => "regrole", + Inner::RegroleArray => "_regrole", + Inner::Regcollation => "regcollation", + Inner::RegcollationArray => "_regcollation", + Inner::Int4multiRange => "int4multirange", + Inner::NummultiRange => "nummultirange", + Inner::TsmultiRange => "tsmultirange", + Inner::TstzmultiRange => "tstzmultirange", + Inner::DatemultiRange => "datemultirange", + Inner::Int8multiRange => "int8multirange", + Inner::AnymultiRange => "anymultirange", + Inner::AnycompatiblemultiRange => "anycompatiblemultirange", + Inner::PgBrinBloomSummary => "pg_brin_bloom_summary", + Inner::PgBrinMinmaxMultiSummary => "pg_brin_minmax_multi_summary", + Inner::PgMcvList => "pg_mcv_list", + Inner::PgSnapshot => "pg_snapshot", + Inner::PgSnapshotArray => "_pg_snapshot", + Inner::Xid8 => "xid8", + Inner::Anycompatible => "anycompatible", + Inner::Anycompatiblearray => "anycompatiblearray", + Inner::Anycompatiblenonarray => "anycompatiblenonarray", + Inner::AnycompatibleRange => "anycompatiblerange", + Inner::Int4multiRangeArray => "_int4multirange", + Inner::NummultiRangeArray => "_nummultirange", + Inner::TsmultiRangeArray => "_tsmultirange", + Inner::TstzmultiRangeArray => "_tstzmultirange", + Inner::DatemultiRangeArray => "_datemultirange", + Inner::Int8multiRangeArray => "_int8multirange", + Inner::Other(ref u) => &u.name, + } + } +} +impl Type { + /// BOOL - boolean, 'true'/'false' + pub const BOOL: Type = Type(Inner::Bool); + + /// BYTEA - variable-length string, binary values escaped + pub const BYTEA: Type = Type(Inner::Bytea); + + /// CHAR - single character + pub const CHAR: Type = Type(Inner::Char); + + /// NAME - 63-byte type for storing system identifiers + pub const NAME: Type = Type(Inner::Name); + + /// INT8 - ~18 digit integer, 8-byte storage + pub const INT8: Type = Type(Inner::Int8); + + /// INT2 - -32 thousand to 32 thousand, 2-byte storage + pub const INT2: Type = Type(Inner::Int2); + + /// INT2VECTOR - array of int2, used in system tables + pub const INT2_VECTOR: Type = Type(Inner::Int2Vector); + + /// INT4 - -2 billion to 2 billion integer, 4-byte storage + pub const INT4: Type = Type(Inner::Int4); + + /// REGPROC - registered procedure + pub const REGPROC: Type = Type(Inner::Regproc); + + /// TEXT - variable-length string, no limit specified + pub const TEXT: Type = Type(Inner::Text); + + /// OID - object identifier(oid), maximum 4 billion + pub const OID: Type = Type(Inner::Oid); + + /// TID - (block, offset), physical location of tuple + pub const TID: Type = Type(Inner::Tid); + + /// XID - transaction id + pub const XID: Type = Type(Inner::Xid); + + /// CID - command identifier type, sequence in transaction id + pub const CID: Type = Type(Inner::Cid); + + /// OIDVECTOR - array of oids, used in system tables + pub const OID_VECTOR: Type = Type(Inner::OidVector); + + /// PG_DDL_COMMAND - internal type for passing CollectedCommand + pub const PG_DDL_COMMAND: Type = Type(Inner::PgDdlCommand); + + /// JSON - JSON stored as text + pub const JSON: Type = Type(Inner::Json); + + /// XML - XML content + pub const XML: Type = Type(Inner::Xml); + + /// XML[] + pub const XML_ARRAY: Type = Type(Inner::XmlArray); + + /// PG_NODE_TREE - string representing an internal node tree + pub const PG_NODE_TREE: Type = Type(Inner::PgNodeTree); + + /// JSON[] + pub const JSON_ARRAY: Type = Type(Inner::JsonArray); + + /// TABLE_AM_HANDLER + pub const TABLE_AM_HANDLER: Type = Type(Inner::TableAmHandler); + + /// XID8[] + pub const XID8_ARRAY: Type = Type(Inner::Xid8Array); + + /// INDEX_AM_HANDLER - pseudo-type for the result of an index AM handler function + pub const INDEX_AM_HANDLER: Type = Type(Inner::IndexAmHandler); + + /// POINT - geometric point '(x, y)' + pub const POINT: Type = Type(Inner::Point); + + /// LSEG - geometric line segment '(pt1,pt2)' + pub const LSEG: Type = Type(Inner::Lseg); + + /// PATH - geometric path '(pt1,...)' + pub const PATH: Type = Type(Inner::Path); + + /// BOX - geometric box '(lower left,upper right)' + pub const BOX: Type = Type(Inner::Box); + + /// POLYGON - geometric polygon '(pt1,...)' + pub const POLYGON: Type = Type(Inner::Polygon); + + /// LINE - geometric line + pub const LINE: Type = Type(Inner::Line); + + /// LINE[] + pub const LINE_ARRAY: Type = Type(Inner::LineArray); + + /// CIDR - network IP address/netmask, network address + pub const CIDR: Type = Type(Inner::Cidr); + + /// CIDR[] + pub const CIDR_ARRAY: Type = Type(Inner::CidrArray); + + /// FLOAT4 - single-precision floating point number, 4-byte storage + pub const FLOAT4: Type = Type(Inner::Float4); + + /// FLOAT8 - double-precision floating point number, 8-byte storage + pub const FLOAT8: Type = Type(Inner::Float8); + + /// UNKNOWN - pseudo-type representing an undetermined type + pub const UNKNOWN: Type = Type(Inner::Unknown); + + /// CIRCLE - geometric circle '(center,radius)' + pub const CIRCLE: Type = Type(Inner::Circle); + + /// CIRCLE[] + pub const CIRCLE_ARRAY: Type = Type(Inner::CircleArray); + + /// MACADDR8 - XX:XX:XX:XX:XX:XX:XX:XX, MAC address + pub const MACADDR8: Type = Type(Inner::Macaddr8); + + /// MACADDR8[] + pub const MACADDR8_ARRAY: Type = Type(Inner::Macaddr8Array); + + /// MONEY - monetary amounts, $d,ddd.cc + pub const MONEY: Type = Type(Inner::Money); + + /// MONEY[] + pub const MONEY_ARRAY: Type = Type(Inner::MoneyArray); + + /// MACADDR - XX:XX:XX:XX:XX:XX, MAC address + pub const MACADDR: Type = Type(Inner::Macaddr); + + /// INET - IP address/netmask, host address, netmask optional + pub const INET: Type = Type(Inner::Inet); + + /// BOOL[] + pub const BOOL_ARRAY: Type = Type(Inner::BoolArray); + + /// BYTEA[] + pub const BYTEA_ARRAY: Type = Type(Inner::ByteaArray); + + /// CHAR[] + pub const CHAR_ARRAY: Type = Type(Inner::CharArray); + + /// NAME[] + pub const NAME_ARRAY: Type = Type(Inner::NameArray); + + /// INT2[] + pub const INT2_ARRAY: Type = Type(Inner::Int2Array); + + /// INT2VECTOR[] + pub const INT2_VECTOR_ARRAY: Type = Type(Inner::Int2VectorArray); + + /// INT4[] + pub const INT4_ARRAY: Type = Type(Inner::Int4Array); + + /// REGPROC[] + pub const REGPROC_ARRAY: Type = Type(Inner::RegprocArray); + + /// TEXT[] + pub const TEXT_ARRAY: Type = Type(Inner::TextArray); + + /// TID[] + pub const TID_ARRAY: Type = Type(Inner::TidArray); + + /// XID[] + pub const XID_ARRAY: Type = Type(Inner::XidArray); + + /// CID[] + pub const CID_ARRAY: Type = Type(Inner::CidArray); + + /// OIDVECTOR[] + pub const OID_VECTOR_ARRAY: Type = Type(Inner::OidVectorArray); + + /// BPCHAR[] + pub const BPCHAR_ARRAY: Type = Type(Inner::BpcharArray); + + /// VARCHAR[] + pub const VARCHAR_ARRAY: Type = Type(Inner::VarcharArray); + + /// INT8[] + pub const INT8_ARRAY: Type = Type(Inner::Int8Array); + + /// POINT[] + pub const POINT_ARRAY: Type = Type(Inner::PointArray); + + /// LSEG[] + pub const LSEG_ARRAY: Type = Type(Inner::LsegArray); + + /// PATH[] + pub const PATH_ARRAY: Type = Type(Inner::PathArray); + + /// BOX[] + pub const BOX_ARRAY: Type = Type(Inner::BoxArray); + + /// FLOAT4[] + pub const FLOAT4_ARRAY: Type = Type(Inner::Float4Array); + + /// FLOAT8[] + pub const FLOAT8_ARRAY: Type = Type(Inner::Float8Array); + + /// POLYGON[] + pub const POLYGON_ARRAY: Type = Type(Inner::PolygonArray); + + /// OID[] + pub const OID_ARRAY: Type = Type(Inner::OidArray); + + /// ACLITEM - access control list + pub const ACLITEM: Type = Type(Inner::Aclitem); + + /// ACLITEM[] + pub const ACLITEM_ARRAY: Type = Type(Inner::AclitemArray); + + /// MACADDR[] + pub const MACADDR_ARRAY: Type = Type(Inner::MacaddrArray); + + /// INET[] + pub const INET_ARRAY: Type = Type(Inner::InetArray); + + /// BPCHAR - char(length), blank-padded string, fixed storage length + pub const BPCHAR: Type = Type(Inner::Bpchar); + + /// VARCHAR - varchar(length), non-blank-padded string, variable storage length + pub const VARCHAR: Type = Type(Inner::Varchar); + + /// DATE - date + pub const DATE: Type = Type(Inner::Date); + + /// TIME - time of day + pub const TIME: Type = Type(Inner::Time); + + /// TIMESTAMP - date and time + pub const TIMESTAMP: Type = Type(Inner::Timestamp); + + /// TIMESTAMP[] + pub const TIMESTAMP_ARRAY: Type = Type(Inner::TimestampArray); + + /// DATE[] + pub const DATE_ARRAY: Type = Type(Inner::DateArray); + + /// TIME[] + pub const TIME_ARRAY: Type = Type(Inner::TimeArray); + + /// TIMESTAMPTZ - date and time with time zone + pub const TIMESTAMPTZ: Type = Type(Inner::Timestamptz); + + /// TIMESTAMPTZ[] + pub const TIMESTAMPTZ_ARRAY: Type = Type(Inner::TimestamptzArray); + + /// INTERVAL - @ <number> <units>, time interval + pub const INTERVAL: Type = Type(Inner::Interval); + + /// INTERVAL[] + pub const INTERVAL_ARRAY: Type = Type(Inner::IntervalArray); + + /// NUMERIC[] + pub const NUMERIC_ARRAY: Type = Type(Inner::NumericArray); + + /// CSTRING[] + pub const CSTRING_ARRAY: Type = Type(Inner::CstringArray); + + /// TIMETZ - time of day with time zone + pub const TIMETZ: Type = Type(Inner::Timetz); + + /// TIMETZ[] + pub const TIMETZ_ARRAY: Type = Type(Inner::TimetzArray); + + /// BIT - fixed-length bit string + pub const BIT: Type = Type(Inner::Bit); + + /// BIT[] + pub const BIT_ARRAY: Type = Type(Inner::BitArray); + + /// VARBIT - variable-length bit string + pub const VARBIT: Type = Type(Inner::Varbit); + + /// VARBIT[] + pub const VARBIT_ARRAY: Type = Type(Inner::VarbitArray); + + /// NUMERIC - numeric(precision, decimal), arbitrary precision number + pub const NUMERIC: Type = Type(Inner::Numeric); + + /// REFCURSOR - reference to cursor (portal name) + pub const REFCURSOR: Type = Type(Inner::Refcursor); + + /// REFCURSOR[] + pub const REFCURSOR_ARRAY: Type = Type(Inner::RefcursorArray); + + /// REGPROCEDURE - registered procedure (with args) + pub const REGPROCEDURE: Type = Type(Inner::Regprocedure); + + /// REGOPER - registered operator + pub const REGOPER: Type = Type(Inner::Regoper); + + /// REGOPERATOR - registered operator (with args) + pub const REGOPERATOR: Type = Type(Inner::Regoperator); + + /// REGCLASS - registered class + pub const REGCLASS: Type = Type(Inner::Regclass); + + /// REGTYPE - registered type + pub const REGTYPE: Type = Type(Inner::Regtype); + + /// REGPROCEDURE[] + pub const REGPROCEDURE_ARRAY: Type = Type(Inner::RegprocedureArray); + + /// REGOPER[] + pub const REGOPER_ARRAY: Type = Type(Inner::RegoperArray); + + /// REGOPERATOR[] + pub const REGOPERATOR_ARRAY: Type = Type(Inner::RegoperatorArray); + + /// REGCLASS[] + pub const REGCLASS_ARRAY: Type = Type(Inner::RegclassArray); + + /// REGTYPE[] + pub const REGTYPE_ARRAY: Type = Type(Inner::RegtypeArray); + + /// RECORD - pseudo-type representing any composite type + pub const RECORD: Type = Type(Inner::Record); + + /// CSTRING - C-style string + pub const CSTRING: Type = Type(Inner::Cstring); + + /// ANY - pseudo-type representing any type + pub const ANY: Type = Type(Inner::Any); + + /// ANYARRAY - pseudo-type representing a polymorphic array type + pub const ANYARRAY: Type = Type(Inner::Anyarray); + + /// VOID - pseudo-type for the result of a function with no real result + pub const VOID: Type = Type(Inner::Void); + + /// TRIGGER - pseudo-type for the result of a trigger function + pub const TRIGGER: Type = Type(Inner::Trigger); + + /// LANGUAGE_HANDLER - pseudo-type for the result of a language handler function + pub const LANGUAGE_HANDLER: Type = Type(Inner::LanguageHandler); + + /// INTERNAL - pseudo-type representing an internal data structure + pub const INTERNAL: Type = Type(Inner::Internal); + + /// ANYELEMENT - pseudo-type representing a polymorphic base type + pub const ANYELEMENT: Type = Type(Inner::Anyelement); + + /// RECORD[] + pub const RECORD_ARRAY: Type = Type(Inner::RecordArray); + + /// ANYNONARRAY - pseudo-type representing a polymorphic base type that is not an array + pub const ANYNONARRAY: Type = Type(Inner::Anynonarray); + + /// TXID_SNAPSHOT[] + pub const TXID_SNAPSHOT_ARRAY: Type = Type(Inner::TxidSnapshotArray); + + /// UUID - UUID datatype + pub const UUID: Type = Type(Inner::Uuid); + + /// UUID[] + pub const UUID_ARRAY: Type = Type(Inner::UuidArray); + + /// TXID_SNAPSHOT - txid snapshot + pub const TXID_SNAPSHOT: Type = Type(Inner::TxidSnapshot); + + /// FDW_HANDLER - pseudo-type for the result of an FDW handler function + pub const FDW_HANDLER: Type = Type(Inner::FdwHandler); + + /// PG_LSN - PostgreSQL LSN datatype + pub const PG_LSN: Type = Type(Inner::PgLsn); + + /// PG_LSN[] + pub const PG_LSN_ARRAY: Type = Type(Inner::PgLsnArray); + + /// TSM_HANDLER - pseudo-type for the result of a tablesample method function + pub const TSM_HANDLER: Type = Type(Inner::TsmHandler); + + /// PG_NDISTINCT - multivariate ndistinct coefficients + pub const PG_NDISTINCT: Type = Type(Inner::PgNdistinct); + + /// PG_DEPENDENCIES - multivariate dependencies + pub const PG_DEPENDENCIES: Type = Type(Inner::PgDependencies); + + /// ANYENUM - pseudo-type representing a polymorphic base type that is an enum + pub const ANYENUM: Type = Type(Inner::Anyenum); + + /// TSVECTOR - text representation for text search + pub const TS_VECTOR: Type = Type(Inner::TsVector); + + /// TSQUERY - query representation for text search + pub const TSQUERY: Type = Type(Inner::Tsquery); + + /// GTSVECTOR - GiST index internal text representation for text search + pub const GTS_VECTOR: Type = Type(Inner::GtsVector); + + /// TSVECTOR[] + pub const TS_VECTOR_ARRAY: Type = Type(Inner::TsVectorArray); + + /// GTSVECTOR[] + pub const GTS_VECTOR_ARRAY: Type = Type(Inner::GtsVectorArray); + + /// TSQUERY[] + pub const TSQUERY_ARRAY: Type = Type(Inner::TsqueryArray); + + /// REGCONFIG - registered text search configuration + pub const REGCONFIG: Type = Type(Inner::Regconfig); + + /// REGCONFIG[] + pub const REGCONFIG_ARRAY: Type = Type(Inner::RegconfigArray); + + /// REGDICTIONARY - registered text search dictionary + pub const REGDICTIONARY: Type = Type(Inner::Regdictionary); + + /// REGDICTIONARY[] + pub const REGDICTIONARY_ARRAY: Type = Type(Inner::RegdictionaryArray); + + /// JSONB - Binary JSON + pub const JSONB: Type = Type(Inner::Jsonb); + + /// JSONB[] + pub const JSONB_ARRAY: Type = Type(Inner::JsonbArray); + + /// ANYRANGE - pseudo-type representing a range over a polymorphic base type + pub const ANY_RANGE: Type = Type(Inner::AnyRange); + + /// EVENT_TRIGGER - pseudo-type for the result of an event trigger function + pub const EVENT_TRIGGER: Type = Type(Inner::EventTrigger); + + /// INT4RANGE - range of integers + pub const INT4_RANGE: Type = Type(Inner::Int4Range); + + /// INT4RANGE[] + pub const INT4_RANGE_ARRAY: Type = Type(Inner::Int4RangeArray); + + /// NUMRANGE - range of numerics + pub const NUM_RANGE: Type = Type(Inner::NumRange); + + /// NUMRANGE[] + pub const NUM_RANGE_ARRAY: Type = Type(Inner::NumRangeArray); + + /// TSRANGE - range of timestamps without time zone + pub const TS_RANGE: Type = Type(Inner::TsRange); + + /// TSRANGE[] + pub const TS_RANGE_ARRAY: Type = Type(Inner::TsRangeArray); + + /// TSTZRANGE - range of timestamps with time zone + pub const TSTZ_RANGE: Type = Type(Inner::TstzRange); + + /// TSTZRANGE[] + pub const TSTZ_RANGE_ARRAY: Type = Type(Inner::TstzRangeArray); + + /// DATERANGE - range of dates + pub const DATE_RANGE: Type = Type(Inner::DateRange); + + /// DATERANGE[] + pub const DATE_RANGE_ARRAY: Type = Type(Inner::DateRangeArray); + + /// INT8RANGE - range of bigints + pub const INT8_RANGE: Type = Type(Inner::Int8Range); + + /// INT8RANGE[] + pub const INT8_RANGE_ARRAY: Type = Type(Inner::Int8RangeArray); + + /// JSONPATH - JSON path + pub const JSONPATH: Type = Type(Inner::Jsonpath); + + /// JSONPATH[] + pub const JSONPATH_ARRAY: Type = Type(Inner::JsonpathArray); + + /// REGNAMESPACE - registered namespace + pub const REGNAMESPACE: Type = Type(Inner::Regnamespace); + + /// REGNAMESPACE[] + pub const REGNAMESPACE_ARRAY: Type = Type(Inner::RegnamespaceArray); + + /// REGROLE - registered role + pub const REGROLE: Type = Type(Inner::Regrole); + + /// REGROLE[] + pub const REGROLE_ARRAY: Type = Type(Inner::RegroleArray); + + /// REGCOLLATION - registered collation + pub const REGCOLLATION: Type = Type(Inner::Regcollation); + + /// REGCOLLATION[] + pub const REGCOLLATION_ARRAY: Type = Type(Inner::RegcollationArray); + + /// INT4MULTIRANGE - multirange of integers + pub const INT4MULTI_RANGE: Type = Type(Inner::Int4multiRange); + + /// NUMMULTIRANGE - multirange of numerics + pub const NUMMULTI_RANGE: Type = Type(Inner::NummultiRange); + + /// TSMULTIRANGE - multirange of timestamps without time zone + pub const TSMULTI_RANGE: Type = Type(Inner::TsmultiRange); + + /// TSTZMULTIRANGE - multirange of timestamps with time zone + pub const TSTZMULTI_RANGE: Type = Type(Inner::TstzmultiRange); + + /// DATEMULTIRANGE - multirange of dates + pub const DATEMULTI_RANGE: Type = Type(Inner::DatemultiRange); + + /// INT8MULTIRANGE - multirange of bigints + pub const INT8MULTI_RANGE: Type = Type(Inner::Int8multiRange); + + /// ANYMULTIRANGE - pseudo-type representing a polymorphic base type that is a multirange + pub const ANYMULTI_RANGE: Type = Type(Inner::AnymultiRange); + + /// ANYCOMPATIBLEMULTIRANGE - pseudo-type representing a multirange over a polymorphic common type + pub const ANYCOMPATIBLEMULTI_RANGE: Type = Type(Inner::AnycompatiblemultiRange); + + /// PG_BRIN_BLOOM_SUMMARY - BRIN bloom summary + pub const PG_BRIN_BLOOM_SUMMARY: Type = Type(Inner::PgBrinBloomSummary); + + /// PG_BRIN_MINMAX_MULTI_SUMMARY - BRIN minmax-multi summary + pub const PG_BRIN_MINMAX_MULTI_SUMMARY: Type = Type(Inner::PgBrinMinmaxMultiSummary); + + /// PG_MCV_LIST - multivariate MCV list + pub const PG_MCV_LIST: Type = Type(Inner::PgMcvList); + + /// PG_SNAPSHOT - snapshot + pub const PG_SNAPSHOT: Type = Type(Inner::PgSnapshot); + + /// PG_SNAPSHOT[] + pub const PG_SNAPSHOT_ARRAY: Type = Type(Inner::PgSnapshotArray); + + /// XID8 - full transaction id + pub const XID8: Type = Type(Inner::Xid8); + + /// ANYCOMPATIBLE - pseudo-type representing a polymorphic common type + pub const ANYCOMPATIBLE: Type = Type(Inner::Anycompatible); + + /// ANYCOMPATIBLEARRAY - pseudo-type representing an array of polymorphic common type elements + pub const ANYCOMPATIBLEARRAY: Type = Type(Inner::Anycompatiblearray); + + /// ANYCOMPATIBLENONARRAY - pseudo-type representing a polymorphic common type that is not an array + pub const ANYCOMPATIBLENONARRAY: Type = Type(Inner::Anycompatiblenonarray); + + /// ANYCOMPATIBLERANGE - pseudo-type representing a range over a polymorphic common type + pub const ANYCOMPATIBLE_RANGE: Type = Type(Inner::AnycompatibleRange); + + /// INT4MULTIRANGE[] + pub const INT4MULTI_RANGE_ARRAY: Type = Type(Inner::Int4multiRangeArray); + + /// NUMMULTIRANGE[] + pub const NUMMULTI_RANGE_ARRAY: Type = Type(Inner::NummultiRangeArray); + + /// TSMULTIRANGE[] + pub const TSMULTI_RANGE_ARRAY: Type = Type(Inner::TsmultiRangeArray); + + /// TSTZMULTIRANGE[] + pub const TSTZMULTI_RANGE_ARRAY: Type = Type(Inner::TstzmultiRangeArray); + + /// DATEMULTIRANGE[] + pub const DATEMULTI_RANGE_ARRAY: Type = Type(Inner::DatemultiRangeArray); + + /// INT8MULTIRANGE[] + pub const INT8MULTI_RANGE_ARRAY: Type = Type(Inner::Int8multiRangeArray); +} diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml new file mode 100644 index 0000000000..7130c1b726 --- /dev/null +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "tokio-postgres2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +async-trait.workspace = true +bytes.workspace = true +byteorder.workspace = true +fallible-iterator.workspace = true +futures-util = { workspace = true, features = ["sink"] } +log = "0.4" +parking_lot.workspace = true +percent-encoding = "2.0" +pin-project-lite.workspace = true +phf = "0.11" +postgres-protocol2 = { path = "../postgres-protocol2" } +postgres-types2 = { path = "../postgres-types2" } +tokio = { workspace = true, features = ["io-util", "time", "net"] } +tokio-util = { workspace = true, features = ["codec"] } diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs new file mode 100644 index 0000000000..cddbf16336 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -0,0 +1,40 @@ +use tokio::net::TcpStream; + +use crate::client::SocketConfig; +use crate::config::{Host, SslMode}; +use crate::tls::MakeTlsConnect; +use crate::{cancel_query_raw, connect_socket, Error}; +use std::io; + +pub(crate) async fn cancel_query( + config: Option, + ssl_mode: SslMode, + mut tls: T, + process_id: i32, + secret_key: i32, +) -> Result<(), Error> +where + T: MakeTlsConnect, +{ + let config = match config { + Some(config) => config, + None => { + return Err(Error::connect(io::Error::new( + io::ErrorKind::InvalidInput, + "unknown host", + ))) + } + }; + + let hostname = match &config.host { + Host::Tcp(host) => &**host, + }; + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; + + let socket = + connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?; + + cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await +} diff --git a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs new file mode 100644 index 0000000000..8c08296435 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs @@ -0,0 +1,29 @@ +use crate::config::SslMode; +use crate::tls::TlsConnect; +use crate::{connect_tls, Error}; +use bytes::BytesMut; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; + +pub async fn cancel_query_raw( + stream: S, + mode: SslMode, + tls: T, + process_id: i32, + secret_key: i32, +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + let mut stream = connect_tls::connect_tls(stream, mode, tls).await?; + + let mut buf = BytesMut::new(); + frontend::cancel_request(process_id, secret_key, &mut buf); + + stream.write_all(&buf).await.map_err(Error::io)?; + stream.flush().await.map_err(Error::io)?; + stream.shutdown().await.map_err(Error::io)?; + + Ok(()) +} diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs new file mode 100644 index 0000000000..b949bf358f --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -0,0 +1,62 @@ +use crate::config::SslMode; +use crate::tls::TlsConnect; + +use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; +use crate::{cancel_query_raw, Error}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpStream; + +/// The capability to request cancellation of in-progress queries on a +/// connection. +#[derive(Clone)] +pub struct CancelToken { + pub(crate) socket_config: Option, + pub(crate) ssl_mode: SslMode, + pub(crate) process_id: i32, + pub(crate) secret_key: i32, +} + +impl CancelToken { + /// Attempts to cancel the in-progress query on the connection associated + /// with this `CancelToken`. + /// + /// The server provides no information about whether a cancellation attempt was successful or not. An error will + /// only be returned if the client was unable to connect to the database. + /// + /// Cancellation is inherently racy. There is no guarantee that the + /// cancellation request will reach the server before the query terminates + /// normally, or that the connection associated with this token is still + /// active. + /// + /// Requires the `runtime` Cargo feature (enabled by default). + pub async fn cancel_query(&self, tls: T) -> Result<(), Error> + where + T: MakeTlsConnect, + { + cancel_query::cancel_query( + self.socket_config.clone(), + self.ssl_mode, + tls, + self.process_id, + self.secret_key, + ) + .await + } + + /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new + /// connection itself. + pub async fn cancel_query_raw(&self, stream: S, tls: T) -> Result<(), Error> + where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, + { + cancel_query_raw::cancel_query_raw( + stream, + self.ssl_mode, + tls, + self.process_id, + self.secret_key, + ) + .await + } +} diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs new file mode 100644 index 0000000000..96200b71e7 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -0,0 +1,439 @@ +use crate::codec::{BackendMessages, FrontendMessage}; + +use crate::config::Host; +use crate::config::SslMode; +use crate::connection::{Request, RequestMessages}; + +use crate::query::RowStream; +use crate::simple_query::SimpleQueryStream; + +use crate::types::{Oid, ToSql, Type}; + +use crate::{ + prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, + SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, +}; +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{future, ready, TryStreamExt}; +use parking_lot::Mutex; +use postgres_protocol2::message::{backend::Message, frontend}; +use std::collections::HashMap; +use std::fmt; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tokio::sync::mpsc; + +use std::time::Duration; + +pub struct Responses { + receiver: mpsc::Receiver, + cur: BackendMessages, +} + +impl Responses { + pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll> { + loop { + match self.cur.next().map_err(Error::parse)? { + Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))), + Some(message) => return Poll::Ready(Ok(message)), + None => {} + } + + match ready!(self.receiver.poll_recv(cx)) { + Some(messages) => self.cur = messages, + None => return Poll::Ready(Err(Error::closed())), + } + } + } + + pub async fn next(&mut self) -> Result { + future::poll_fn(|cx| self.poll_next(cx)).await + } +} + +/// A cache of type info and prepared statements for fetching type info +/// (corresponding to the queries in the [prepare] module). +#[derive(Default)] +struct CachedTypeInfo { + /// A statement for basic information for a type from its + /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// fallback). + typeinfo: Option, + /// A statement for getting information for a composite type from its OID. + /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + typeinfo_composite: Option, + /// A statement for getting information for a composite type from its OID. + /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or + /// its fallback). + typeinfo_enum: Option, + + /// Cache of types already looked up. + types: HashMap, +} + +pub struct InnerClient { + sender: mpsc::UnboundedSender, + cached_typeinfo: Mutex, + + /// A buffer to use when writing out postgres commands. + buffer: Mutex, +} + +impl InnerClient { + pub fn send(&self, messages: RequestMessages) -> Result { + let (sender, receiver) = mpsc::channel(1); + let request = Request { messages, sender }; + self.sender.send(request).map_err(|_| Error::closed())?; + + Ok(Responses { + receiver, + cur: BackendMessages::empty(), + }) + } + + pub fn typeinfo(&self) -> Option { + self.cached_typeinfo.lock().typeinfo.clone() + } + + pub fn set_typeinfo(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo = Some(statement.clone()); + } + + pub fn typeinfo_composite(&self) -> Option { + self.cached_typeinfo.lock().typeinfo_composite.clone() + } + + pub fn set_typeinfo_composite(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone()); + } + + pub fn typeinfo_enum(&self) -> Option { + self.cached_typeinfo.lock().typeinfo_enum.clone() + } + + pub fn set_typeinfo_enum(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone()); + } + + pub fn type_(&self, oid: Oid) -> Option { + self.cached_typeinfo.lock().types.get(&oid).cloned() + } + + pub fn set_type(&self, oid: Oid, type_: &Type) { + self.cached_typeinfo.lock().types.insert(oid, type_.clone()); + } + + /// Call the given function with a buffer to be used when writing out + /// postgres commands. + pub fn with_buf(&self, f: F) -> R + where + F: FnOnce(&mut BytesMut) -> R, + { + let mut buffer = self.buffer.lock(); + let r = f(&mut buffer); + buffer.clear(); + r + } +} + +#[derive(Clone)] +pub(crate) struct SocketConfig { + pub host: Host, + pub port: u16, + pub connect_timeout: Option, + // pub keepalive: Option, +} + +/// An asynchronous PostgreSQL client. +/// +/// The client is one half of what is returned when a connection is established. Users interact with the database +/// through this client object. +pub struct Client { + inner: Arc, + + socket_config: Option, + ssl_mode: SslMode, + process_id: i32, + secret_key: i32, +} + +impl Client { + pub(crate) fn new( + sender: mpsc::UnboundedSender, + ssl_mode: SslMode, + process_id: i32, + secret_key: i32, + ) -> Client { + Client { + inner: Arc::new(InnerClient { + sender, + cached_typeinfo: Default::default(), + buffer: Default::default(), + }), + + socket_config: None, + ssl_mode, + process_id, + secret_key, + } + } + + /// Returns process_id. + pub fn get_process_id(&self) -> i32 { + self.process_id + } + + pub(crate) fn inner(&self) -> &Arc { + &self.inner + } + + pub(crate) fn set_socket_config(&mut self, socket_config: SocketConfig) { + self.socket_config = Some(socket_config); + } + + /// Creates a new prepared statement. + /// + /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), + /// which are set when executed. Prepared statements can only be used with the connection that created them. + pub async fn prepare(&self, query: &str) -> Result { + self.prepare_typed(query, &[]).await + } + + /// Like `prepare`, but allows the types of query parameters to be explicitly specified. + /// + /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be + /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. + pub async fn prepare_typed( + &self, + query: &str, + parameter_types: &[Type], + ) -> Result { + prepare::prepare(&self.inner, query, parameter_types).await + } + + /// Executes a statement, returning a vector of the resulting rows. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + pub async fn query( + &self, + statement: &T, + params: &[&(dyn ToSql + Sync)], + ) -> Result, Error> + where + T: ?Sized + ToStatement, + { + self.query_raw(statement, slice_iter(params)) + .await? + .try_collect() + .await + } + + /// The maximally flexible version of [`query`]. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + /// + /// [`query`]: #method.query + pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + where + T: ?Sized + ToStatement, + I: IntoIterator, + I::IntoIter: ExactSizeIterator, + { + let statement = statement.__convert().into_statement(self).await?; + query::query(&self.inner, statement, params).await + } + + /// Pass text directly to the Postgres backend to allow it to sort out typing itself and + /// to save a roundtrip + pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, + { + query::query_txt(&self.inner, statement, params).await + } + + /// Executes a statement, returning the number of rows modified. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + pub async fn execute( + &self, + statement: &T, + params: &[&(dyn ToSql + Sync)], + ) -> Result + where + T: ?Sized + ToStatement, + { + self.execute_raw(statement, slice_iter(params)).await + } + + /// The maximally flexible version of [`execute`]. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + /// + /// [`execute`]: #method.execute + pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + where + T: ?Sized + ToStatement, + I: IntoIterator, + I::IntoIter: ExactSizeIterator, + { + let statement = statement.__convert().into_statement(self).await?; + query::execute(self.inner(), statement, params).await + } + + /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. + /// + /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that + /// point. The simple query protocol returns the values in rows as strings rather than in their binary encodings, + /// so the associated row type doesn't work with the `FromSql` trait. Rather than simply returning a list of the + /// rows, this method returns a list of an enum which indicates either the completion of one of the commands, + /// or a row of data. This preserves the framing between the separate statements in the request. + /// + /// # Warning + /// + /// Prepared statements should be use for any query which contains user-specified data, as they provided the + /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass + /// them to this method! + pub async fn simple_query(&self, query: &str) -> Result, Error> { + self.simple_query_raw(query).await?.try_collect().await + } + + pub(crate) async fn simple_query_raw(&self, query: &str) -> Result { + simple_query::simple_query(self.inner(), query).await + } + + /// Executes a sequence of SQL statements using the simple query protocol. + /// + /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that + /// point. This is intended for use when, for example, initializing a database schema. + /// + /// # Warning + /// + /// Prepared statements should be use for any query which contains user-specified data, as they provided the + /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass + /// them to this method! + pub async fn batch_execute(&self, query: &str) -> Result { + simple_query::batch_execute(self.inner(), query).await + } + + /// Begins a new database transaction. + /// + /// The transaction will roll back by default - use the `commit` method to commit it. + pub async fn transaction(&mut self) -> Result, Error> { + struct RollbackIfNotDone<'me> { + client: &'me Client, + done: bool, + } + + impl Drop for RollbackIfNotDone<'_> { + fn drop(&mut self) { + if self.done { + return; + } + + let buf = self.client.inner().with_buf(|buf| { + frontend::query("ROLLBACK", buf).unwrap(); + buf.split().freeze() + }); + let _ = self + .client + .inner() + .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } + } + + // This is done, as `Future` created by this method can be dropped after + // `RequestMessages` is synchronously send to the `Connection` by + // `batch_execute()`, but before `Responses` is asynchronously polled to + // completion. In that case `Transaction` won't be created and thus + // won't be rolled back. + { + let mut cleaner = RollbackIfNotDone { + client: self, + done: false, + }; + self.batch_execute("BEGIN").await?; + cleaner.done = true; + } + + Ok(Transaction::new(self)) + } + + /// Returns a builder for a transaction with custom settings. + /// + /// Unlike the `transaction` method, the builder can be used to control the transaction's isolation level and other + /// attributes. + pub fn build_transaction(&mut self) -> TransactionBuilder<'_> { + TransactionBuilder::new(self) + } + + /// Constructs a cancellation token that can later be used to request cancellation of a query running on the + /// connection associated with this client. + pub fn cancel_token(&self) -> CancelToken { + CancelToken { + socket_config: self.socket_config.clone(), + ssl_mode: self.ssl_mode, + process_id: self.process_id, + secret_key: self.secret_key, + } + } + + /// Query for type information + pub async fn get_type(&self, oid: Oid) -> Result { + crate::prepare::get_type(&self.inner, oid).await + } + + /// Determines if the connection to the server has already closed. + /// + /// In that case, all future queries will fail. + pub fn is_closed(&self) -> bool { + self.inner.sender.is_closed() + } +} + +impl fmt::Debug for Client { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Client").finish() + } +} diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs new file mode 100644 index 0000000000..7412db785b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -0,0 +1,109 @@ +use bytes::{Buf, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend; +use postgres_protocol2::message::frontend::CopyData; +use std::io; +use tokio_util::codec::{Decoder, Encoder}; + +pub enum FrontendMessage { + Raw(Bytes), + CopyData(CopyData>), +} + +pub enum BackendMessage { + Normal { + messages: BackendMessages, + request_complete: bool, + }, + Async(backend::Message), +} + +pub struct BackendMessages(BytesMut); + +impl BackendMessages { + pub fn empty() -> BackendMessages { + BackendMessages(BytesMut::new()) + } +} + +impl FallibleIterator for BackendMessages { + type Item = backend::Message; + type Error = io::Error; + + fn next(&mut self) -> io::Result> { + backend::Message::parse(&mut self.0) + } +} + +pub struct PostgresCodec { + pub max_message_size: Option, +} + +impl Encoder for PostgresCodec { + type Error = io::Error; + + fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> { + match item { + FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf), + FrontendMessage::CopyData(data) => data.write(dst), + } + + Ok(()) + } +} + +impl Decoder for PostgresCodec { + type Item = BackendMessage; + type Error = io::Error; + + fn decode(&mut self, src: &mut BytesMut) -> Result, io::Error> { + let mut idx = 0; + let mut request_complete = false; + + while let Some(header) = backend::Header::parse(&src[idx..])? { + let len = header.len() as usize + 1; + if src[idx..].len() < len { + break; + } + + if let Some(max) = self.max_message_size { + if len > max { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "message too large", + )); + } + } + + match header.tag() { + backend::NOTICE_RESPONSE_TAG + | backend::NOTIFICATION_RESPONSE_TAG + | backend::PARAMETER_STATUS_TAG => { + if idx == 0 { + let message = backend::Message::parse(src)?.unwrap(); + return Ok(Some(BackendMessage::Async(message))); + } else { + break; + } + } + _ => {} + } + + idx += len; + + if header.tag() == backend::READY_FOR_QUERY_TAG { + request_complete = true; + break; + } + } + + if idx == 0 { + Ok(None) + } else { + Ok(Some(BackendMessage::Normal { + messages: BackendMessages(src.split_to(idx)), + request_complete, + })) + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs new file mode 100644 index 0000000000..969c20ba47 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -0,0 +1,897 @@ +//! Connection configuration. + +use crate::connect::connect; +use crate::connect_raw::connect_raw; +use crate::tls::MakeTlsConnect; +use crate::tls::TlsConnect; +use crate::{Client, Connection, Error}; +use std::borrow::Cow; +use std::str; +use std::str::FromStr; +use std::time::Duration; +use std::{error, fmt, iter, mem}; +use tokio::io::{AsyncRead, AsyncWrite}; + +pub use postgres_protocol2::authentication::sasl::ScramKeys; +use tokio::net::TcpStream; + +/// Properties required of a session. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum TargetSessionAttrs { + /// No special properties are required. + Any, + /// The session must allow writes. + ReadWrite, +} + +/// TLS configuration. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum SslMode { + /// Do not use TLS. + Disable, + /// Attempt to connect with TLS but allow sessions without. + Prefer, + /// Require the use of TLS. + Require, +} + +/// Channel binding configuration. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum ChannelBinding { + /// Do not use channel binding. + Disable, + /// Attempt to use channel binding but allow sessions without. + Prefer, + /// Require the use of channel binding. + Require, +} + +/// Replication mode configuration. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum ReplicationMode { + /// Physical replication. + Physical, + /// Logical replication. + Logical, +} + +/// A host specification. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Host { + /// A TCP hostname. + Tcp(String), +} + +/// Precomputed keys which may override password during auth. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AuthKeys { + /// A `ClientKey` & `ServerKey` pair for `SCRAM-SHA-256`. + ScramSha256(ScramKeys<32>), +} + +/// Connection configuration. +/// +/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats: +/// +/// # Key-Value +/// +/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain +/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped. +/// +/// ## Keys +/// +/// * `user` - The username to authenticate with. Required. +/// * `password` - The password to authenticate with. +/// * `dbname` - The name of the database to connect to. Defaults to the username. +/// * `options` - Command line options used to configure the server. +/// * `application_name` - Sets the `application_name` parameter on the server. +/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used +/// if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`. +/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the +/// path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts +/// can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting +/// with the `connect` method. +/// * `port` - The port to connect to. Multiple ports can be specified, separated by commas. The number of ports must be +/// either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if +/// omitted or the empty string. +/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames +/// can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout. +/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that +/// the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server +/// in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`. +/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel +/// binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise. +/// If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`. +/// +/// ## Examples +/// +/// ```not_rust +/// host=localhost user=postgres connect_timeout=10 keepalives=0 +/// ``` +/// +/// ```not_rust +/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces' +/// ``` +/// +/// ```not_rust +/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write +/// ``` +/// +/// # Url +/// +/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional, +/// and the format accepts query parameters for all of the key-value pairs described in the section above. Multiple +/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded, +/// as the path component of the URL specifies the database name. +/// +/// ## Examples +/// +/// ```not_rust +/// postgresql://user@localhost +/// ``` +/// +/// ```not_rust +/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10 +/// ``` +/// +/// ```not_rust +/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write +/// ``` +/// +/// ```not_rust +/// postgresql:///mydb?user=user&host=/var/lib/postgresql +/// ``` +#[derive(Clone, PartialEq, Eq)] +pub struct Config { + pub(crate) user: Option, + pub(crate) password: Option>, + pub(crate) auth_keys: Option>, + pub(crate) dbname: Option, + pub(crate) options: Option, + pub(crate) application_name: Option, + pub(crate) ssl_mode: SslMode, + pub(crate) host: Vec, + pub(crate) port: Vec, + pub(crate) connect_timeout: Option, + pub(crate) target_session_attrs: TargetSessionAttrs, + pub(crate) channel_binding: ChannelBinding, + pub(crate) replication_mode: Option, + pub(crate) max_backend_message_size: Option, +} + +impl Default for Config { + fn default() -> Config { + Config::new() + } +} + +impl Config { + /// Creates a new configuration. + pub fn new() -> Config { + Config { + user: None, + password: None, + auth_keys: None, + dbname: None, + options: None, + application_name: None, + ssl_mode: SslMode::Prefer, + host: vec![], + port: vec![], + connect_timeout: None, + target_session_attrs: TargetSessionAttrs::Any, + channel_binding: ChannelBinding::Prefer, + replication_mode: None, + max_backend_message_size: None, + } + } + + /// Sets the user to authenticate with. + /// + /// Required. + pub fn user(&mut self, user: &str) -> &mut Config { + self.user = Some(user.to_string()); + self + } + + /// Gets the user to authenticate with, if one has been configured with + /// the `user` method. + pub fn get_user(&self) -> Option<&str> { + self.user.as_deref() + } + + /// Sets the password to authenticate with. + pub fn password(&mut self, password: T) -> &mut Config + where + T: AsRef<[u8]>, + { + self.password = Some(password.as_ref().to_vec()); + self + } + + /// Gets the password to authenticate with, if one has been configured with + /// the `password` method. + pub fn get_password(&self) -> Option<&[u8]> { + self.password.as_deref() + } + + /// Sets precomputed protocol-specific keys to authenticate with. + /// When set, this option will override `password`. + /// See [`AuthKeys`] for more information. + pub fn auth_keys(&mut self, keys: AuthKeys) -> &mut Config { + self.auth_keys = Some(Box::new(keys)); + self + } + + /// Gets precomputed protocol-specific keys to authenticate with. + /// if one has been configured with the `auth_keys` method. + pub fn get_auth_keys(&self) -> Option { + self.auth_keys.as_deref().copied() + } + + /// Sets the name of the database to connect to. + /// + /// Defaults to the user. + pub fn dbname(&mut self, dbname: &str) -> &mut Config { + self.dbname = Some(dbname.to_string()); + self + } + + /// Gets the name of the database to connect to, if one has been configured + /// with the `dbname` method. + pub fn get_dbname(&self) -> Option<&str> { + self.dbname.as_deref() + } + + /// Sets command line options used to configure the server. + pub fn options(&mut self, options: &str) -> &mut Config { + self.options = Some(options.to_string()); + self + } + + /// Gets the command line options used to configure the server, if the + /// options have been set with the `options` method. + pub fn get_options(&self) -> Option<&str> { + self.options.as_deref() + } + + /// Sets the value of the `application_name` runtime parameter. + pub fn application_name(&mut self, application_name: &str) -> &mut Config { + self.application_name = Some(application_name.to_string()); + self + } + + /// Gets the value of the `application_name` runtime parameter, if it has + /// been set with the `application_name` method. + pub fn get_application_name(&self) -> Option<&str> { + self.application_name.as_deref() + } + + /// Sets the SSL configuration. + /// + /// Defaults to `prefer`. + pub fn ssl_mode(&mut self, ssl_mode: SslMode) -> &mut Config { + self.ssl_mode = ssl_mode; + self + } + + /// Gets the SSL configuration. + pub fn get_ssl_mode(&self) -> SslMode { + self.ssl_mode + } + + /// Adds a host to the configuration. + /// + /// Multiple hosts can be specified by calling this method multiple times, and each will be tried in order. + pub fn host(&mut self, host: &str) -> &mut Config { + self.host.push(Host::Tcp(host.to_string())); + self + } + + /// Gets the hosts that have been added to the configuration with `host`. + pub fn get_hosts(&self) -> &[Host] { + &self.host + } + + /// Adds a port to the configuration. + /// + /// Multiple ports can be specified by calling this method multiple times. There must either be no ports, in which + /// case the default of 5432 is used, a single port, in which it is used for all hosts, or the same number of ports + /// as hosts. + pub fn port(&mut self, port: u16) -> &mut Config { + self.port.push(port); + self + } + + /// Gets the ports that have been added to the configuration with `port`. + pub fn get_ports(&self) -> &[u16] { + &self.port + } + + /// Sets the timeout applied to socket-level connection attempts. + /// + /// Note that hostnames can resolve to multiple IP addresses, and this timeout will apply to each address of each + /// host separately. Defaults to no limit. + pub fn connect_timeout(&mut self, connect_timeout: Duration) -> &mut Config { + self.connect_timeout = Some(connect_timeout); + self + } + + /// Gets the connection timeout, if one has been set with the + /// `connect_timeout` method. + pub fn get_connect_timeout(&self) -> Option<&Duration> { + self.connect_timeout.as_ref() + } + + /// Sets the requirements of the session. + /// + /// This can be used to connect to the primary server in a clustered database rather than one of the read-only + /// secondary servers. Defaults to `Any`. + pub fn target_session_attrs( + &mut self, + target_session_attrs: TargetSessionAttrs, + ) -> &mut Config { + self.target_session_attrs = target_session_attrs; + self + } + + /// Gets the requirements of the session. + pub fn get_target_session_attrs(&self) -> TargetSessionAttrs { + self.target_session_attrs + } + + /// Sets the channel binding behavior. + /// + /// Defaults to `prefer`. + pub fn channel_binding(&mut self, channel_binding: ChannelBinding) -> &mut Config { + self.channel_binding = channel_binding; + self + } + + /// Gets the channel binding behavior. + pub fn get_channel_binding(&self) -> ChannelBinding { + self.channel_binding + } + + /// Set replication mode. + pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config { + self.replication_mode = Some(replication_mode); + self + } + + /// Get replication mode. + pub fn get_replication_mode(&self) -> Option { + self.replication_mode + } + + /// Set limit for backend messages size. + pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config { + self.max_backend_message_size = Some(max_backend_message_size); + self + } + + /// Get limit for backend messages size. + pub fn get_max_backend_message_size(&self) -> Option { + self.max_backend_message_size + } + + fn param(&mut self, key: &str, value: &str) -> Result<(), Error> { + match key { + "user" => { + self.user(value); + } + "password" => { + self.password(value); + } + "dbname" => { + self.dbname(value); + } + "options" => { + self.options(value); + } + "application_name" => { + self.application_name(value); + } + "sslmode" => { + let mode = match value { + "disable" => SslMode::Disable, + "prefer" => SslMode::Prefer, + "require" => SslMode::Require, + _ => return Err(Error::config_parse(Box::new(InvalidValue("sslmode")))), + }; + self.ssl_mode(mode); + } + "host" => { + for host in value.split(',') { + self.host(host); + } + } + "port" => { + for port in value.split(',') { + let port = if port.is_empty() { + 5432 + } else { + port.parse() + .map_err(|_| Error::config_parse(Box::new(InvalidValue("port"))))? + }; + self.port(port); + } + } + "connect_timeout" => { + let timeout = value + .parse::() + .map_err(|_| Error::config_parse(Box::new(InvalidValue("connect_timeout"))))?; + if timeout > 0 { + self.connect_timeout(Duration::from_secs(timeout as u64)); + } + } + "target_session_attrs" => { + let target_session_attrs = match value { + "any" => TargetSessionAttrs::Any, + "read-write" => TargetSessionAttrs::ReadWrite, + _ => { + return Err(Error::config_parse(Box::new(InvalidValue( + "target_session_attrs", + )))); + } + }; + self.target_session_attrs(target_session_attrs); + } + "channel_binding" => { + let channel_binding = match value { + "disable" => ChannelBinding::Disable, + "prefer" => ChannelBinding::Prefer, + "require" => ChannelBinding::Require, + _ => { + return Err(Error::config_parse(Box::new(InvalidValue( + "channel_binding", + )))) + } + }; + self.channel_binding(channel_binding); + } + "max_backend_message_size" => { + let limit = value.parse::().map_err(|_| { + Error::config_parse(Box::new(InvalidValue("max_backend_message_size"))) + })?; + if limit > 0 { + self.max_backend_message_size(limit); + } + } + key => { + return Err(Error::config_parse(Box::new(UnknownOption( + key.to_string(), + )))); + } + } + + Ok(()) + } + + /// Opens a connection to a PostgreSQL database. + /// + /// Requires the `runtime` Cargo feature (enabled by default). + pub async fn connect( + &self, + tls: T, + ) -> Result<(Client, Connection), Error> + where + T: MakeTlsConnect, + { + connect(tls, self).await + } + + /// Connects to a PostgreSQL database over an arbitrary stream. + /// + /// All of the settings other than `user`, `password`, `dbname`, `options`, and `application_name` name are ignored. + pub async fn connect_raw( + &self, + stream: S, + tls: T, + ) -> Result<(Client, Connection), Error> + where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, + { + connect_raw(stream, tls, self).await + } +} + +impl FromStr for Config { + type Err = Error; + + fn from_str(s: &str) -> Result { + match UrlParser::parse(s)? { + Some(config) => Ok(config), + None => Parser::parse(s), + } + } +} + +// Omit password from debug output +impl fmt::Debug for Config { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + struct Redaction {} + impl fmt::Debug for Redaction { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "_") + } + } + + f.debug_struct("Config") + .field("user", &self.user) + .field("password", &self.password.as_ref().map(|_| Redaction {})) + .field("dbname", &self.dbname) + .field("options", &self.options) + .field("application_name", &self.application_name) + .field("ssl_mode", &self.ssl_mode) + .field("host", &self.host) + .field("port", &self.port) + .field("connect_timeout", &self.connect_timeout) + .field("target_session_attrs", &self.target_session_attrs) + .field("channel_binding", &self.channel_binding) + .field("replication", &self.replication_mode) + .finish() + } +} + +#[derive(Debug)] +struct UnknownOption(String); + +impl fmt::Display for UnknownOption { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "unknown option `{}`", self.0) + } +} + +impl error::Error for UnknownOption {} + +#[derive(Debug)] +struct InvalidValue(&'static str); + +impl fmt::Display for InvalidValue { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "invalid value for option `{}`", self.0) + } +} + +impl error::Error for InvalidValue {} + +struct Parser<'a> { + s: &'a str, + it: iter::Peekable>, +} + +impl<'a> Parser<'a> { + fn parse(s: &'a str) -> Result { + let mut parser = Parser { + s, + it: s.char_indices().peekable(), + }; + + let mut config = Config::new(); + + while let Some((key, value)) = parser.parameter()? { + config.param(key, &value)?; + } + + Ok(config) + } + + fn skip_ws(&mut self) { + self.take_while(char::is_whitespace); + } + + fn take_while(&mut self, f: F) -> &'a str + where + F: Fn(char) -> bool, + { + let start = match self.it.peek() { + Some(&(i, _)) => i, + None => return "", + }; + + loop { + match self.it.peek() { + Some(&(_, c)) if f(c) => { + self.it.next(); + } + Some(&(i, _)) => return &self.s[start..i], + None => return &self.s[start..], + } + } + } + + fn eat(&mut self, target: char) -> Result<(), Error> { + match self.it.next() { + Some((_, c)) if c == target => Ok(()), + Some((i, c)) => { + let m = format!( + "unexpected character at byte {}: expected `{}` but got `{}`", + i, target, c + ); + Err(Error::config_parse(m.into())) + } + None => Err(Error::config_parse("unexpected EOF".into())), + } + } + + fn eat_if(&mut self, target: char) -> bool { + match self.it.peek() { + Some(&(_, c)) if c == target => { + self.it.next(); + true + } + _ => false, + } + } + + fn keyword(&mut self) -> Option<&'a str> { + let s = self.take_while(|c| match c { + c if c.is_whitespace() => false, + '=' => false, + _ => true, + }); + + if s.is_empty() { + None + } else { + Some(s) + } + } + + fn value(&mut self) -> Result { + let value = if self.eat_if('\'') { + let value = self.quoted_value()?; + self.eat('\'')?; + value + } else { + self.simple_value()? + }; + + Ok(value) + } + + fn simple_value(&mut self) -> Result { + let mut value = String::new(); + + while let Some(&(_, c)) = self.it.peek() { + if c.is_whitespace() { + break; + } + + self.it.next(); + if c == '\\' { + if let Some((_, c2)) = self.it.next() { + value.push(c2); + } + } else { + value.push(c); + } + } + + if value.is_empty() { + return Err(Error::config_parse("unexpected EOF".into())); + } + + Ok(value) + } + + fn quoted_value(&mut self) -> Result { + let mut value = String::new(); + + while let Some(&(_, c)) = self.it.peek() { + if c == '\'' { + return Ok(value); + } + + self.it.next(); + if c == '\\' { + if let Some((_, c2)) = self.it.next() { + value.push(c2); + } + } else { + value.push(c); + } + } + + Err(Error::config_parse( + "unterminated quoted connection parameter value".into(), + )) + } + + fn parameter(&mut self) -> Result, Error> { + self.skip_ws(); + let keyword = match self.keyword() { + Some(keyword) => keyword, + None => return Ok(None), + }; + self.skip_ws(); + self.eat('=')?; + self.skip_ws(); + let value = self.value()?; + + Ok(Some((keyword, value))) + } +} + +// This is a pretty sloppy "URL" parser, but it matches the behavior of libpq, where things really aren't very strict +struct UrlParser<'a> { + s: &'a str, + config: Config, +} + +impl<'a> UrlParser<'a> { + fn parse(s: &'a str) -> Result, Error> { + let s = match Self::remove_url_prefix(s) { + Some(s) => s, + None => return Ok(None), + }; + + let mut parser = UrlParser { + s, + config: Config::new(), + }; + + parser.parse_credentials()?; + parser.parse_host()?; + parser.parse_path()?; + parser.parse_params()?; + + Ok(Some(parser.config)) + } + + fn remove_url_prefix(s: &str) -> Option<&str> { + for prefix in &["postgres://", "postgresql://"] { + if let Some(stripped) = s.strip_prefix(prefix) { + return Some(stripped); + } + } + + None + } + + fn take_until(&mut self, end: &[char]) -> Option<&'a str> { + match self.s.find(end) { + Some(pos) => { + let (head, tail) = self.s.split_at(pos); + self.s = tail; + Some(head) + } + None => None, + } + } + + fn take_all(&mut self) -> &'a str { + mem::take(&mut self.s) + } + + fn eat_byte(&mut self) { + self.s = &self.s[1..]; + } + + fn parse_credentials(&mut self) -> Result<(), Error> { + let creds = match self.take_until(&['@']) { + Some(creds) => creds, + None => return Ok(()), + }; + self.eat_byte(); + + let mut it = creds.splitn(2, ':'); + let user = self.decode(it.next().unwrap())?; + self.config.user(&user); + + if let Some(password) = it.next() { + let password = Cow::from(percent_encoding::percent_decode(password.as_bytes())); + self.config.password(password); + } + + Ok(()) + } + + fn parse_host(&mut self) -> Result<(), Error> { + let host = match self.take_until(&['/', '?']) { + Some(host) => host, + None => self.take_all(), + }; + + if host.is_empty() { + return Ok(()); + } + + for chunk in host.split(',') { + let (host, port) = if chunk.starts_with('[') { + let idx = match chunk.find(']') { + Some(idx) => idx, + None => return Err(Error::config_parse(InvalidValue("host").into())), + }; + + let host = &chunk[1..idx]; + let remaining = &chunk[idx + 1..]; + let port = if let Some(port) = remaining.strip_prefix(':') { + Some(port) + } else if remaining.is_empty() { + None + } else { + return Err(Error::config_parse(InvalidValue("host").into())); + }; + + (host, port) + } else { + let mut it = chunk.splitn(2, ':'); + (it.next().unwrap(), it.next()) + }; + + self.host_param(host)?; + let port = self.decode(port.unwrap_or("5432"))?; + self.config.param("port", &port)?; + } + + Ok(()) + } + + fn parse_path(&mut self) -> Result<(), Error> { + if !self.s.starts_with('/') { + return Ok(()); + } + self.eat_byte(); + + let dbname = match self.take_until(&['?']) { + Some(dbname) => dbname, + None => self.take_all(), + }; + + if !dbname.is_empty() { + self.config.dbname(&self.decode(dbname)?); + } + + Ok(()) + } + + fn parse_params(&mut self) -> Result<(), Error> { + if !self.s.starts_with('?') { + return Ok(()); + } + self.eat_byte(); + + while !self.s.is_empty() { + let key = match self.take_until(&['=']) { + Some(key) => self.decode(key)?, + None => return Err(Error::config_parse("unterminated parameter".into())), + }; + self.eat_byte(); + + let value = match self.take_until(&['&']) { + Some(value) => { + self.eat_byte(); + value + } + None => self.take_all(), + }; + + if key == "host" { + self.host_param(value)?; + } else { + let value = self.decode(value)?; + self.config.param(&key, &value)?; + } + } + + Ok(()) + } + + fn host_param(&mut self, s: &str) -> Result<(), Error> { + let s = self.decode(s)?; + self.config.param("host", &s) + } + + fn decode(&self, s: &'a str) -> Result, Error> { + percent_encoding::percent_decode(s.as_bytes()) + .decode_utf8() + .map_err(|e| Error::config_parse(e.into())) + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs new file mode 100644 index 0000000000..7517fe0cde --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -0,0 +1,112 @@ +use crate::client::SocketConfig; +use crate::config::{Host, TargetSessionAttrs}; +use crate::connect_raw::connect_raw; +use crate::connect_socket::connect_socket; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Client, Config, Connection, Error, SimpleQueryMessage}; +use futures_util::{future, pin_mut, Future, FutureExt, Stream}; +use std::io; +use std::task::Poll; +use tokio::net::TcpStream; + +pub async fn connect( + mut tls: T, + config: &Config, +) -> Result<(Client, Connection), Error> +where + T: MakeTlsConnect, +{ + if config.host.is_empty() { + return Err(Error::config("host missing".into())); + } + + if config.port.len() > 1 && config.port.len() != config.host.len() { + return Err(Error::config("invalid number of ports".into())); + } + + let mut error = None; + for (i, host) in config.host.iter().enumerate() { + let port = config + .port + .get(i) + .or_else(|| config.port.first()) + .copied() + .unwrap_or(5432); + + let hostname = match host { + Host::Tcp(host) => host.as_str(), + }; + + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; + + match connect_once(host, port, tls, config).await { + Ok((client, connection)) => return Ok((client, connection)), + Err(e) => error = Some(e), + } + } + + Err(error.unwrap()) +} + +async fn connect_once( + host: &Host, + port: u16, + tls: T, + config: &Config, +) -> Result<(Client, Connection), Error> +where + T: TlsConnect, +{ + let socket = connect_socket(host, port, config.connect_timeout).await?; + let (mut client, mut connection) = connect_raw(socket, tls, config).await?; + + if let TargetSessionAttrs::ReadWrite = config.target_session_attrs { + let rows = client.simple_query_raw("SHOW transaction_read_only"); + pin_mut!(rows); + + let rows = future::poll_fn(|cx| { + if connection.poll_unpin(cx)?.is_ready() { + return Poll::Ready(Err(Error::closed())); + } + + rows.as_mut().poll(cx) + }) + .await?; + pin_mut!(rows); + + loop { + let next = future::poll_fn(|cx| { + if connection.poll_unpin(cx)?.is_ready() { + return Poll::Ready(Some(Err(Error::closed()))); + } + + rows.as_mut().poll_next(cx) + }); + + match next.await.transpose()? { + Some(SimpleQueryMessage::Row(row)) => { + if row.try_get(0)? == Some("on") { + return Err(Error::connect(io::Error::new( + io::ErrorKind::PermissionDenied, + "database does not allow writes", + ))); + } else { + break; + } + } + Some(_) => {} + None => return Err(Error::unexpected_message()), + } + } + } + + client.set_socket_config(SocketConfig { + host: host.clone(), + port, + connect_timeout: config.connect_timeout, + }); + + Ok((client, connection)) +} diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs new file mode 100644 index 0000000000..80677af969 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -0,0 +1,359 @@ +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::config::{self, AuthKeys, Config, ReplicationMode}; +use crate::connect_tls::connect_tls; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::{TlsConnect, TlsStream}; +use crate::{Client, Connection, Error}; +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; +use postgres_protocol2::authentication; +use postgres_protocol2::authentication::sasl; +use postgres_protocol2::authentication::sasl::ScramSha256; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message}; +use postgres_protocol2::message::frontend; +use std::collections::{HashMap, VecDeque}; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::mpsc; +use tokio_util::codec::Framed; + +pub struct StartupStream { + inner: Framed, PostgresCodec>, + buf: BackendMessages, + delayed: VecDeque, +} + +impl Sink for StartupStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + type Error = io::Error; + + fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_ready(cx) + } + + fn start_send(mut self: Pin<&mut Self>, item: FrontendMessage) -> io::Result<()> { + Pin::new(&mut self.inner).start_send(item) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_close(cx) + } +} + +impl Stream for StartupStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + type Item = io::Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + loop { + match self.buf.next() { + Ok(Some(message)) => return Poll::Ready(Some(Ok(message))), + Ok(None) => {} + Err(e) => return Poll::Ready(Some(Err(e))), + } + + match ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(BackendMessage::Normal { messages, .. })) => self.buf = messages, + Some(Ok(BackendMessage::Async(message))) => return Poll::Ready(Some(Ok(message))), + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + None => return Poll::Ready(None), + } + } + } +} + +pub async fn connect_raw( + stream: S, + tls: T, + config: &Config, +) -> Result<(Client, Connection), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + let stream = connect_tls(stream, config.ssl_mode, tls).await?; + + let mut stream = StartupStream { + inner: Framed::new( + stream, + PostgresCodec { + max_message_size: config.max_backend_message_size, + }, + ), + buf: BackendMessages::empty(), + delayed: VecDeque::new(), + }; + + startup(&mut stream, config).await?; + authenticate(&mut stream, config).await?; + let (process_id, secret_key, parameters) = read_info(&mut stream).await?; + + let (sender, receiver) = mpsc::unbounded_channel(); + let client = Client::new(sender, config.ssl_mode, process_id, secret_key); + let connection = Connection::new(stream.inner, stream.delayed, parameters, receiver); + + Ok((client, connection)) +} + +async fn startup(stream: &mut StartupStream, config: &Config) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut params = vec![("client_encoding", "UTF8")]; + if let Some(user) = &config.user { + params.push(("user", &**user)); + } + if let Some(dbname) = &config.dbname { + params.push(("database", &**dbname)); + } + if let Some(options) = &config.options { + params.push(("options", &**options)); + } + if let Some(application_name) = &config.application_name { + params.push(("application_name", &**application_name)); + } + if let Some(replication_mode) = &config.replication_mode { + match replication_mode { + ReplicationMode::Physical => params.push(("replication", "true")), + ReplicationMode::Logical => params.push(("replication", "database")), + } + } + + let mut buf = BytesMut::new(); + frontend::startup_message(params, &mut buf).map_err(Error::encode)?; + + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io) +} + +async fn authenticate(stream: &mut StartupStream, config: &Config) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationOk) => { + can_skip_channel_binding(config)?; + return Ok(()); + } + Some(Message::AuthenticationCleartextPassword) => { + can_skip_channel_binding(config)?; + + let pass = config + .password + .as_ref() + .ok_or_else(|| Error::config("password missing".into()))?; + + authenticate_password(stream, pass).await?; + } + Some(Message::AuthenticationMd5Password(body)) => { + can_skip_channel_binding(config)?; + + let user = config + .user + .as_ref() + .ok_or_else(|| Error::config("user missing".into()))?; + let pass = config + .password + .as_ref() + .ok_or_else(|| Error::config("password missing".into()))?; + + let output = authentication::md5_hash(user.as_bytes(), pass, body.salt()); + authenticate_password(stream, output.as_bytes()).await?; + } + Some(Message::AuthenticationSasl(body)) => { + authenticate_sasl(stream, body, config).await?; + } + Some(Message::AuthenticationKerberosV5) + | Some(Message::AuthenticationScmCredential) + | Some(Message::AuthenticationGss) + | Some(Message::AuthenticationSspi) => { + return Err(Error::authentication( + "unsupported authentication method".into(), + )) + } + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + } + + match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationOk) => Ok(()), + Some(Message::ErrorResponse(body)) => Err(Error::db(body)), + Some(_) => Err(Error::unexpected_message()), + None => Err(Error::closed()), + } +} + +fn can_skip_channel_binding(config: &Config) -> Result<(), Error> { + match config.channel_binding { + config::ChannelBinding::Disable | config::ChannelBinding::Prefer => Ok(()), + config::ChannelBinding::Require => Err(Error::authentication( + "server did not use channel binding".into(), + )), + } +} + +async fn authenticate_password( + stream: &mut StartupStream, + password: &[u8], +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut buf = BytesMut::new(); + frontend::password_message(password, &mut buf).map_err(Error::encode)?; + + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io) +} + +async fn authenticate_sasl( + stream: &mut StartupStream, + body: AuthenticationSaslBody, + config: &Config, +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + let mut has_scram = false; + let mut has_scram_plus = false; + let mut mechanisms = body.mechanisms(); + while let Some(mechanism) = mechanisms.next().map_err(Error::parse)? { + match mechanism { + sasl::SCRAM_SHA_256 => has_scram = true, + sasl::SCRAM_SHA_256_PLUS => has_scram_plus = true, + _ => {} + } + } + + let channel_binding = stream + .inner + .get_ref() + .channel_binding() + .tls_server_end_point + .filter(|_| config.channel_binding != config::ChannelBinding::Disable) + .map(sasl::ChannelBinding::tls_server_end_point); + + let (channel_binding, mechanism) = if has_scram_plus { + match channel_binding { + Some(channel_binding) => (channel_binding, sasl::SCRAM_SHA_256_PLUS), + None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256), + } + } else if has_scram { + match channel_binding { + Some(_) => (sasl::ChannelBinding::unrequested(), sasl::SCRAM_SHA_256), + None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256), + } + } else { + return Err(Error::authentication("unsupported SASL mechanism".into())); + }; + + if mechanism != sasl::SCRAM_SHA_256_PLUS { + can_skip_channel_binding(config)?; + } + + let mut scram = if let Some(AuthKeys::ScramSha256(keys)) = config.get_auth_keys() { + ScramSha256::new_with_keys(keys, channel_binding) + } else if let Some(password) = config.get_password() { + ScramSha256::new(password, channel_binding) + } else { + return Err(Error::config("password or auth keys missing".into())); + }; + + let mut buf = BytesMut::new(); + frontend::sasl_initial_response(mechanism, scram.message(), &mut buf).map_err(Error::encode)?; + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io)?; + + let body = match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationSaslContinue(body)) => body, + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + }; + + scram + .update(body.data()) + .await + .map_err(|e| Error::authentication(e.into()))?; + + let mut buf = BytesMut::new(); + frontend::sasl_response(scram.message(), &mut buf).map_err(Error::encode)?; + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io)?; + + let body = match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationSaslFinal(body)) => body, + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + }; + + scram + .finish(body.data()) + .map_err(|e| Error::authentication(e.into()))?; + + Ok(()) +} + +async fn read_info( + stream: &mut StartupStream, +) -> Result<(i32, i32, HashMap), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut process_id = 0; + let mut secret_key = 0; + let mut parameters = HashMap::new(); + + loop { + match stream.try_next().await.map_err(Error::io)? { + Some(Message::BackendKeyData(body)) => { + process_id = body.process_id(); + secret_key = body.secret_key(); + } + Some(Message::ParameterStatus(body)) => { + parameters.insert( + body.name().map_err(Error::parse)?.to_string(), + body.value().map_err(Error::parse)?.to_string(), + ); + } + Some(msg @ Message::NoticeResponse(_)) => { + stream.delayed.push_back(BackendMessage::Async(msg)) + } + Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)), + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs new file mode 100644 index 0000000000..336a13317f --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -0,0 +1,65 @@ +use crate::config::Host; +use crate::Error; +use std::future::Future; +use std::io; +use std::time::Duration; +use tokio::net::{self, TcpStream}; +use tokio::time; + +pub(crate) async fn connect_socket( + host: &Host, + port: u16, + connect_timeout: Option, +) -> Result { + match host { + Host::Tcp(host) => { + let addrs = net::lookup_host((&**host, port)) + .await + .map_err(Error::connect)?; + + let mut last_err = None; + + for addr in addrs { + let stream = + match connect_with_timeout(TcpStream::connect(addr), connect_timeout).await { + Ok(stream) => stream, + Err(e) => { + last_err = Some(e); + continue; + } + }; + + stream.set_nodelay(true).map_err(Error::connect)?; + + return Ok(stream); + } + + Err(last_err.unwrap_or_else(|| { + Error::connect(io::Error::new( + io::ErrorKind::InvalidInput, + "could not resolve any addresses", + )) + })) + } + } +} + +async fn connect_with_timeout(connect: F, timeout: Option) -> Result +where + F: Future>, +{ + match timeout { + Some(timeout) => match time::timeout(timeout, connect).await { + Ok(Ok(socket)) => Ok(socket), + Ok(Err(e)) => Err(Error::connect(e)), + Err(_) => Err(Error::connect(io::Error::new( + io::ErrorKind::TimedOut, + "connection timed out", + ))), + }, + None => match connect.await { + Ok(socket) => Ok(socket), + Err(e) => Err(Error::connect(e)), + }, + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect_tls.rs b/libs/proxy/tokio-postgres2/src/connect_tls.rs new file mode 100644 index 0000000000..64b0b68abc --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_tls.rs @@ -0,0 +1,48 @@ +use crate::config::SslMode; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::private::ForcePrivateApi; +use crate::tls::TlsConnect; +use crate::Error; +use bytes::BytesMut; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; + +pub async fn connect_tls( + mut stream: S, + mode: SslMode, + tls: T, +) -> Result, Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + match mode { + SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), + SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => { + return Ok(MaybeTlsStream::Raw(stream)) + } + SslMode::Prefer | SslMode::Require => {} + } + + let mut buf = BytesMut::new(); + frontend::ssl_request(&mut buf); + stream.write_all(&buf).await.map_err(Error::io)?; + + let mut buf = [0]; + stream.read_exact(&mut buf).await.map_err(Error::io)?; + + if buf[0] != b'S' { + if SslMode::Require == mode { + return Err(Error::tls("server does not support TLS".into())); + } else { + return Ok(MaybeTlsStream::Raw(stream)); + } + } + + let stream = tls + .connect(stream) + .await + .map_err(|e| Error::tls(e.into()))?; + + Ok(MaybeTlsStream::Tls(stream)) +} diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs new file mode 100644 index 0000000000..0aa5c77e22 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connection.rs @@ -0,0 +1,323 @@ +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::error::DbError; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::{AsyncMessage, Error, Notification}; +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Sink, Stream}; +use log::{info, trace}; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use std::collections::{HashMap, VecDeque}; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::mpsc; +use tokio_util::codec::Framed; +use tokio_util::sync::PollSender; + +pub enum RequestMessages { + Single(FrontendMessage), +} + +pub struct Request { + pub messages: RequestMessages, + pub sender: mpsc::Sender, +} + +pub struct Response { + sender: PollSender, +} + +#[derive(PartialEq, Debug)] +enum State { + Active, + Terminating, + Closing, +} + +/// A connection to a PostgreSQL database. +/// +/// This is one half of what is returned when a new connection is established. It performs the actual IO with the +/// server, and should generally be spawned off onto an executor to run in the background. +/// +/// `Connection` implements `Future`, and only resolves when the connection is closed, either because a fatal error has +/// occurred, or because its associated `Client` has dropped and all outstanding work has completed. +#[must_use = "futures do nothing unless polled"] +pub struct Connection { + /// HACK: we need this in the Neon Proxy. + pub stream: Framed, PostgresCodec>, + /// HACK: we need this in the Neon Proxy to forward params. + pub parameters: HashMap, + receiver: mpsc::UnboundedReceiver, + pending_request: Option, + pending_responses: VecDeque, + responses: VecDeque, + state: State, +} + +impl Connection +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + pub(crate) fn new( + stream: Framed, PostgresCodec>, + pending_responses: VecDeque, + parameters: HashMap, + receiver: mpsc::UnboundedReceiver, + ) -> Connection { + Connection { + stream, + parameters, + receiver, + pending_request: None, + pending_responses, + responses: VecDeque::new(), + state: State::Active, + } + } + + fn poll_response( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + if let Some(message) = self.pending_responses.pop_front() { + trace!("retrying pending response"); + return Poll::Ready(Some(Ok(message))); + } + + Pin::new(&mut self.stream) + .poll_next(cx) + .map(|o| o.map(|r| r.map_err(Error::io))) + } + + fn poll_read(&mut self, cx: &mut Context<'_>) -> Result, Error> { + if self.state != State::Active { + trace!("poll_read: done"); + return Ok(None); + } + + loop { + let message = match self.poll_response(cx)? { + Poll::Ready(Some(message)) => message, + Poll::Ready(None) => return Err(Error::closed()), + Poll::Pending => { + trace!("poll_read: waiting on response"); + return Ok(None); + } + }; + + let (mut messages, request_complete) = match message { + BackendMessage::Async(Message::NoticeResponse(body)) => { + let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?; + return Ok(Some(AsyncMessage::Notice(error))); + } + BackendMessage::Async(Message::NotificationResponse(body)) => { + let notification = Notification { + process_id: body.process_id(), + channel: body.channel().map_err(Error::parse)?.to_string(), + payload: body.message().map_err(Error::parse)?.to_string(), + }; + return Ok(Some(AsyncMessage::Notification(notification))); + } + BackendMessage::Async(Message::ParameterStatus(body)) => { + self.parameters.insert( + body.name().map_err(Error::parse)?.to_string(), + body.value().map_err(Error::parse)?.to_string(), + ); + continue; + } + BackendMessage::Async(_) => unreachable!(), + BackendMessage::Normal { + messages, + request_complete, + } => (messages, request_complete), + }; + + let mut response = match self.responses.pop_front() { + Some(response) => response, + None => match messages.next().map_err(Error::parse)? { + Some(Message::ErrorResponse(error)) => return Err(Error::db(error)), + _ => return Err(Error::unexpected_message()), + }, + }; + + match response.sender.poll_reserve(cx) { + Poll::Ready(Ok(())) => { + let _ = response.sender.send_item(messages); + if !request_complete { + self.responses.push_front(response); + } + } + Poll::Ready(Err(_)) => { + // we need to keep paging through the rest of the messages even if the receiver's hung up + if !request_complete { + self.responses.push_front(response); + } + } + Poll::Pending => { + self.responses.push_front(response); + self.pending_responses.push_back(BackendMessage::Normal { + messages, + request_complete, + }); + trace!("poll_read: waiting on sender"); + return Ok(None); + } + } + } + } + + fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll> { + if let Some(messages) = self.pending_request.take() { + trace!("retrying pending request"); + return Poll::Ready(Some(messages)); + } + + if self.receiver.is_closed() { + return Poll::Ready(None); + } + + match self.receiver.poll_recv(cx) { + Poll::Ready(Some(request)) => { + trace!("polled new request"); + self.responses.push_back(Response { + sender: PollSender::new(request.sender), + }); + Poll::Ready(Some(request.messages)) + } + Poll::Ready(None) => Poll::Ready(None), + Poll::Pending => Poll::Pending, + } + } + + fn poll_write(&mut self, cx: &mut Context<'_>) -> Result { + loop { + if self.state == State::Closing { + trace!("poll_write: done"); + return Ok(false); + } + + if Pin::new(&mut self.stream) + .poll_ready(cx) + .map_err(Error::io)? + .is_pending() + { + trace!("poll_write: waiting on socket"); + return Ok(false); + } + + let request = match self.poll_request(cx) { + Poll::Ready(Some(request)) => request, + Poll::Ready(None) if self.responses.is_empty() && self.state == State::Active => { + trace!("poll_write: at eof, terminating"); + self.state = State::Terminating; + let mut request = BytesMut::new(); + frontend::terminate(&mut request); + RequestMessages::Single(FrontendMessage::Raw(request.freeze())) + } + Poll::Ready(None) => { + trace!( + "poll_write: at eof, pending responses {}", + self.responses.len() + ); + return Ok(true); + } + Poll::Pending => { + trace!("poll_write: waiting on request"); + return Ok(true); + } + }; + + match request { + RequestMessages::Single(request) => { + Pin::new(&mut self.stream) + .start_send(request) + .map_err(Error::io)?; + if self.state == State::Terminating { + trace!("poll_write: sent eof, closing"); + self.state = State::Closing; + } + } + } + } + } + + fn poll_flush(&mut self, cx: &mut Context<'_>) -> Result<(), Error> { + match Pin::new(&mut self.stream) + .poll_flush(cx) + .map_err(Error::io)? + { + Poll::Ready(()) => trace!("poll_flush: flushed"), + Poll::Pending => trace!("poll_flush: waiting on socket"), + } + Ok(()) + } + + fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll> { + if self.state != State::Closing { + return Poll::Pending; + } + + match Pin::new(&mut self.stream) + .poll_close(cx) + .map_err(Error::io)? + { + Poll::Ready(()) => { + trace!("poll_shutdown: complete"); + Poll::Ready(Ok(())) + } + Poll::Pending => { + trace!("poll_shutdown: waiting on socket"); + Poll::Pending + } + } + } + + /// Returns the value of a runtime parameter for this connection. + pub fn parameter(&self, name: &str) -> Option<&str> { + self.parameters.get(name).map(|s| &**s) + } + + /// Polls for asynchronous messages from the server. + /// + /// The server can send notices as well as notifications asynchronously to the client. Applications that wish to + /// examine those messages should use this method to drive the connection rather than its `Future` implementation. + pub fn poll_message( + &mut self, + cx: &mut Context<'_>, + ) -> Poll>> { + let message = self.poll_read(cx)?; + let want_flush = self.poll_write(cx)?; + if want_flush { + self.poll_flush(cx)?; + } + match message { + Some(message) => Poll::Ready(Some(Ok(message))), + None => match self.poll_shutdown(cx) { + Poll::Ready(Ok(())) => Poll::Ready(None), + Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))), + Poll::Pending => Poll::Pending, + }, + } + } +} + +impl Future for Connection +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + type Output = Result<(), Error>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + while let Some(message) = ready!(self.poll_message(cx)?) { + if let AsyncMessage::Notice(notice) = message { + info!("{}: {}", notice.severity(), notice.message()); + } + } + Poll::Ready(Ok(())) + } +} diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs new file mode 100644 index 0000000000..6514322250 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -0,0 +1,501 @@ +//! Errors. + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody}; +use std::error::{self, Error as _Error}; +use std::fmt; +use std::io; + +pub use self::sqlstate::*; + +#[allow(clippy::unreadable_literal)] +mod sqlstate; + +/// The severity of a Postgres error or notice. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum Severity { + /// PANIC + Panic, + /// FATAL + Fatal, + /// ERROR + Error, + /// WARNING + Warning, + /// NOTICE + Notice, + /// DEBUG + Debug, + /// INFO + Info, + /// LOG + Log, +} + +impl fmt::Display for Severity { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match *self { + Severity::Panic => "PANIC", + Severity::Fatal => "FATAL", + Severity::Error => "ERROR", + Severity::Warning => "WARNING", + Severity::Notice => "NOTICE", + Severity::Debug => "DEBUG", + Severity::Info => "INFO", + Severity::Log => "LOG", + }; + fmt.write_str(s) + } +} + +impl Severity { + fn from_str(s: &str) -> Option { + match s { + "PANIC" => Some(Severity::Panic), + "FATAL" => Some(Severity::Fatal), + "ERROR" => Some(Severity::Error), + "WARNING" => Some(Severity::Warning), + "NOTICE" => Some(Severity::Notice), + "DEBUG" => Some(Severity::Debug), + "INFO" => Some(Severity::Info), + "LOG" => Some(Severity::Log), + _ => None, + } + } +} + +/// A Postgres error or notice. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DbError { + severity: String, + parsed_severity: Option, + code: SqlState, + message: String, + detail: Option, + hint: Option, + position: Option, + where_: Option, + schema: Option, + table: Option, + column: Option, + datatype: Option, + constraint: Option, + file: Option, + line: Option, + routine: Option, +} + +impl DbError { + pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result { + let mut severity = None; + let mut parsed_severity = None; + let mut code = None; + let mut message = None; + let mut detail = None; + let mut hint = None; + let mut normal_position = None; + let mut internal_position = None; + let mut internal_query = None; + let mut where_ = None; + let mut schema = None; + let mut table = None; + let mut column = None; + let mut datatype = None; + let mut constraint = None; + let mut file = None; + let mut line = None; + let mut routine = None; + + while let Some(field) = fields.next()? { + match field.type_() { + b'S' => severity = Some(field.value().to_owned()), + b'C' => code = Some(SqlState::from_code(field.value())), + b'M' => message = Some(field.value().to_owned()), + b'D' => detail = Some(field.value().to_owned()), + b'H' => hint = Some(field.value().to_owned()), + b'P' => { + normal_position = Some(field.value().parse::().map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`P` field did not contain an integer", + ) + })?); + } + b'p' => { + internal_position = Some(field.value().parse::().map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`p` field did not contain an integer", + ) + })?); + } + b'q' => internal_query = Some(field.value().to_owned()), + b'W' => where_ = Some(field.value().to_owned()), + b's' => schema = Some(field.value().to_owned()), + b't' => table = Some(field.value().to_owned()), + b'c' => column = Some(field.value().to_owned()), + b'd' => datatype = Some(field.value().to_owned()), + b'n' => constraint = Some(field.value().to_owned()), + b'F' => file = Some(field.value().to_owned()), + b'L' => { + line = Some(field.value().parse::().map_err(|_| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`L` field did not contain an integer", + ) + })?); + } + b'R' => routine = Some(field.value().to_owned()), + b'V' => { + parsed_severity = Some(Severity::from_str(field.value()).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`V` field contained an invalid value", + ) + })?); + } + _ => {} + } + } + + Ok(DbError { + severity: severity + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`S` field missing"))?, + parsed_severity, + code: code + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`C` field missing"))?, + message: message + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`M` field missing"))?, + detail, + hint, + position: match normal_position { + Some(position) => Some(ErrorPosition::Original(position)), + None => match internal_position { + Some(position) => Some(ErrorPosition::Internal { + position, + query: internal_query.ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "`q` field missing but `p` field present", + ) + })?, + }), + None => None, + }, + }, + where_, + schema, + table, + column, + datatype, + constraint, + file, + line, + routine, + }) + } + + /// The field contents are ERROR, FATAL, or PANIC (in an error message), + /// or WARNING, NOTICE, DEBUG, INFO, or LOG (in a notice message), or a + /// localized translation of one of these. + pub fn severity(&self) -> &str { + &self.severity + } + + /// A parsed, nonlocalized version of `severity`. (PostgreSQL 9.6+) + pub fn parsed_severity(&self) -> Option { + self.parsed_severity + } + + /// The SQLSTATE code for the error. + pub fn code(&self) -> &SqlState { + &self.code + } + + /// The primary human-readable error message. + /// + /// This should be accurate but terse (typically one line). + pub fn message(&self) -> &str { + &self.message + } + + /// An optional secondary error message carrying more detail about the + /// problem. + /// + /// Might run to multiple lines. + pub fn detail(&self) -> Option<&str> { + self.detail.as_deref() + } + + /// An optional suggestion what to do about the problem. + /// + /// This is intended to differ from `detail` in that it offers advice + /// (potentially inappropriate) rather than hard facts. Might run to + /// multiple lines. + pub fn hint(&self) -> Option<&str> { + self.hint.as_deref() + } + + /// An optional error cursor position into either the original query string + /// or an internally generated query. + pub fn position(&self) -> Option<&ErrorPosition> { + self.position.as_ref() + } + + /// An indication of the context in which the error occurred. + /// + /// Presently this includes a call stack traceback of active procedural + /// language functions and internally-generated queries. The trace is one + /// entry per line, most recent first. + pub fn where_(&self) -> Option<&str> { + self.where_.as_deref() + } + + /// If the error was associated with a specific database object, the name + /// of the schema containing that object, if any. (PostgreSQL 9.3+) + pub fn schema(&self) -> Option<&str> { + self.schema.as_deref() + } + + /// If the error was associated with a specific table, the name of the + /// table. (Refer to the schema name field for the name of the table's + /// schema.) (PostgreSQL 9.3+) + pub fn table(&self) -> Option<&str> { + self.table.as_deref() + } + + /// If the error was associated with a specific table column, the name of + /// the column. + /// + /// (Refer to the schema and table name fields to identify the table.) + /// (PostgreSQL 9.3+) + pub fn column(&self) -> Option<&str> { + self.column.as_deref() + } + + /// If the error was associated with a specific data type, the name of the + /// data type. (Refer to the schema name field for the name of the data + /// type's schema.) (PostgreSQL 9.3+) + pub fn datatype(&self) -> Option<&str> { + self.datatype.as_deref() + } + + /// If the error was associated with a specific constraint, the name of the + /// constraint. + /// + /// Refer to fields listed above for the associated table or domain. + /// (For this purpose, indexes are treated as constraints, even if they + /// weren't created with constraint syntax.) (PostgreSQL 9.3+) + pub fn constraint(&self) -> Option<&str> { + self.constraint.as_deref() + } + + /// The file name of the source-code location where the error was reported. + pub fn file(&self) -> Option<&str> { + self.file.as_deref() + } + + /// The line number of the source-code location where the error was + /// reported. + pub fn line(&self) -> Option { + self.line + } + + /// The name of the source-code routine reporting the error. + pub fn routine(&self) -> Option<&str> { + self.routine.as_deref() + } +} + +impl fmt::Display for DbError { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "{}: {}", self.severity, self.message)?; + if let Some(detail) = &self.detail { + write!(fmt, "\nDETAIL: {}", detail)?; + } + if let Some(hint) = &self.hint { + write!(fmt, "\nHINT: {}", hint)?; + } + Ok(()) + } +} + +impl error::Error for DbError {} + +/// Represents the position of an error in a query. +#[derive(Clone, PartialEq, Eq, Debug)] +pub enum ErrorPosition { + /// A position in the original query. + Original(u32), + /// A position in an internally generated query. + Internal { + /// The byte position. + position: u32, + /// A query generated by the Postgres server. + query: String, + }, +} + +#[derive(Debug, PartialEq)] +enum Kind { + Io, + UnexpectedMessage, + Tls, + ToSql(usize), + FromSql(usize), + Column(String), + Closed, + Db, + Parse, + Encode, + Authentication, + ConfigParse, + Config, + Connect, + Timeout, +} + +struct ErrorInner { + kind: Kind, + cause: Option>, +} + +/// An error communicating with the Postgres server. +pub struct Error(Box); + +impl fmt::Debug for Error { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Error") + .field("kind", &self.0.kind) + .field("cause", &self.0.cause) + .finish() + } +} + +impl fmt::Display for Error { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match &self.0.kind { + Kind::Io => fmt.write_str("error communicating with the server")?, + Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?, + Kind::Tls => fmt.write_str("error performing TLS handshake")?, + Kind::ToSql(idx) => write!(fmt, "error serializing parameter {}", idx)?, + Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?, + Kind::Column(column) => write!(fmt, "invalid column `{}`", column)?, + Kind::Closed => fmt.write_str("connection closed")?, + Kind::Db => fmt.write_str("db error")?, + Kind::Parse => fmt.write_str("error parsing response from server")?, + Kind::Encode => fmt.write_str("error encoding message to server")?, + Kind::Authentication => fmt.write_str("authentication error")?, + Kind::ConfigParse => fmt.write_str("invalid connection string")?, + Kind::Config => fmt.write_str("invalid configuration")?, + Kind::Connect => fmt.write_str("error connecting to server")?, + Kind::Timeout => fmt.write_str("timeout waiting for server")?, + }; + if let Some(ref cause) = self.0.cause { + write!(fmt, ": {}", cause)?; + } + Ok(()) + } +} + +impl error::Error for Error { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { + self.0.cause.as_ref().map(|e| &**e as _) + } +} + +impl Error { + /// Consumes the error, returning its cause. + pub fn into_source(self) -> Option> { + self.0.cause + } + + /// Returns the source of this error if it was a `DbError`. + /// + /// This is a simple convenience method. + pub fn as_db_error(&self) -> Option<&DbError> { + self.source().and_then(|e| e.downcast_ref::()) + } + + /// Determines if the error was associated with closed connection. + pub fn is_closed(&self) -> bool { + self.0.kind == Kind::Closed + } + + /// Returns the SQLSTATE error code associated with the error. + /// + /// This is a convenience method that downcasts the cause to a `DbError` and returns its code. + pub fn code(&self) -> Option<&SqlState> { + self.as_db_error().map(DbError::code) + } + + fn new(kind: Kind, cause: Option>) -> Error { + Error(Box::new(ErrorInner { kind, cause })) + } + + pub(crate) fn closed() -> Error { + Error::new(Kind::Closed, None) + } + + pub(crate) fn unexpected_message() -> Error { + Error::new(Kind::UnexpectedMessage, None) + } + + #[allow(clippy::needless_pass_by_value)] + pub(crate) fn db(error: ErrorResponseBody) -> Error { + match DbError::parse(&mut error.fields()) { + Ok(e) => Error::new(Kind::Db, Some(Box::new(e))), + Err(e) => Error::new(Kind::Parse, Some(Box::new(e))), + } + } + + pub(crate) fn parse(e: io::Error) -> Error { + Error::new(Kind::Parse, Some(Box::new(e))) + } + + pub(crate) fn encode(e: io::Error) -> Error { + Error::new(Kind::Encode, Some(Box::new(e))) + } + + #[allow(clippy::wrong_self_convention)] + pub(crate) fn to_sql(e: Box, idx: usize) -> Error { + Error::new(Kind::ToSql(idx), Some(e)) + } + + pub(crate) fn from_sql(e: Box, idx: usize) -> Error { + Error::new(Kind::FromSql(idx), Some(e)) + } + + pub(crate) fn column(column: String) -> Error { + Error::new(Kind::Column(column), None) + } + + pub(crate) fn tls(e: Box) -> Error { + Error::new(Kind::Tls, Some(e)) + } + + pub(crate) fn io(e: io::Error) -> Error { + Error::new(Kind::Io, Some(Box::new(e))) + } + + pub(crate) fn authentication(e: Box) -> Error { + Error::new(Kind::Authentication, Some(e)) + } + + pub(crate) fn config_parse(e: Box) -> Error { + Error::new(Kind::ConfigParse, Some(e)) + } + + pub(crate) fn config(e: Box) -> Error { + Error::new(Kind::Config, Some(e)) + } + + pub(crate) fn connect(e: io::Error) -> Error { + Error::new(Kind::Connect, Some(Box::new(e))) + } + + #[doc(hidden)] + pub fn __private_api_timeout() -> Error { + Error::new(Kind::Timeout, None) + } +} diff --git a/libs/proxy/tokio-postgres2/src/error/sqlstate.rs b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs new file mode 100644 index 0000000000..13a1d75f95 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs @@ -0,0 +1,1670 @@ +// Autogenerated file - DO NOT EDIT + +/// A SQLSTATE error code +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct SqlState(Inner); + +impl SqlState { + /// Creates a `SqlState` from its error code. + pub fn from_code(s: &str) -> SqlState { + match SQLSTATE_MAP.get(s) { + Some(state) => state.clone(), + None => SqlState(Inner::Other(s.into())), + } + } + + /// Returns the error code corresponding to the `SqlState`. + pub fn code(&self) -> &str { + match &self.0 { + Inner::E00000 => "00000", + Inner::E01000 => "01000", + Inner::E0100C => "0100C", + Inner::E01008 => "01008", + Inner::E01003 => "01003", + Inner::E01007 => "01007", + Inner::E01006 => "01006", + Inner::E01004 => "01004", + Inner::E01P01 => "01P01", + Inner::E02000 => "02000", + Inner::E02001 => "02001", + Inner::E03000 => "03000", + Inner::E08000 => "08000", + Inner::E08003 => "08003", + Inner::E08006 => "08006", + Inner::E08001 => "08001", + Inner::E08004 => "08004", + Inner::E08007 => "08007", + Inner::E08P01 => "08P01", + Inner::E09000 => "09000", + Inner::E0A000 => "0A000", + Inner::E0B000 => "0B000", + Inner::E0F000 => "0F000", + Inner::E0F001 => "0F001", + Inner::E0L000 => "0L000", + Inner::E0LP01 => "0LP01", + Inner::E0P000 => "0P000", + Inner::E0Z000 => "0Z000", + Inner::E0Z002 => "0Z002", + Inner::E20000 => "20000", + Inner::E21000 => "21000", + Inner::E22000 => "22000", + Inner::E2202E => "2202E", + Inner::E22021 => "22021", + Inner::E22008 => "22008", + Inner::E22012 => "22012", + Inner::E22005 => "22005", + Inner::E2200B => "2200B", + Inner::E22022 => "22022", + Inner::E22015 => "22015", + Inner::E2201E => "2201E", + Inner::E22014 => "22014", + Inner::E22016 => "22016", + Inner::E2201F => "2201F", + Inner::E2201G => "2201G", + Inner::E22018 => "22018", + Inner::E22007 => "22007", + Inner::E22019 => "22019", + Inner::E2200D => "2200D", + Inner::E22025 => "22025", + Inner::E22P06 => "22P06", + Inner::E22010 => "22010", + Inner::E22023 => "22023", + Inner::E22013 => "22013", + Inner::E2201B => "2201B", + Inner::E2201W => "2201W", + Inner::E2201X => "2201X", + Inner::E2202H => "2202H", + Inner::E2202G => "2202G", + Inner::E22009 => "22009", + Inner::E2200C => "2200C", + Inner::E2200G => "2200G", + Inner::E22004 => "22004", + Inner::E22002 => "22002", + Inner::E22003 => "22003", + Inner::E2200H => "2200H", + Inner::E22026 => "22026", + Inner::E22001 => "22001", + Inner::E22011 => "22011", + Inner::E22027 => "22027", + Inner::E22024 => "22024", + Inner::E2200F => "2200F", + Inner::E22P01 => "22P01", + Inner::E22P02 => "22P02", + Inner::E22P03 => "22P03", + Inner::E22P04 => "22P04", + Inner::E22P05 => "22P05", + Inner::E2200L => "2200L", + Inner::E2200M => "2200M", + Inner::E2200N => "2200N", + Inner::E2200S => "2200S", + Inner::E2200T => "2200T", + Inner::E22030 => "22030", + Inner::E22031 => "22031", + Inner::E22032 => "22032", + Inner::E22033 => "22033", + Inner::E22034 => "22034", + Inner::E22035 => "22035", + Inner::E22036 => "22036", + Inner::E22037 => "22037", + Inner::E22038 => "22038", + Inner::E22039 => "22039", + Inner::E2203A => "2203A", + Inner::E2203B => "2203B", + Inner::E2203C => "2203C", + Inner::E2203D => "2203D", + Inner::E2203E => "2203E", + Inner::E2203F => "2203F", + Inner::E2203G => "2203G", + Inner::E23000 => "23000", + Inner::E23001 => "23001", + Inner::E23502 => "23502", + Inner::E23503 => "23503", + Inner::E23505 => "23505", + Inner::E23514 => "23514", + Inner::E23P01 => "23P01", + Inner::E24000 => "24000", + Inner::E25000 => "25000", + Inner::E25001 => "25001", + Inner::E25002 => "25002", + Inner::E25008 => "25008", + Inner::E25003 => "25003", + Inner::E25004 => "25004", + Inner::E25005 => "25005", + Inner::E25006 => "25006", + Inner::E25007 => "25007", + Inner::E25P01 => "25P01", + Inner::E25P02 => "25P02", + Inner::E25P03 => "25P03", + Inner::E26000 => "26000", + Inner::E27000 => "27000", + Inner::E28000 => "28000", + Inner::E28P01 => "28P01", + Inner::E2B000 => "2B000", + Inner::E2BP01 => "2BP01", + Inner::E2D000 => "2D000", + Inner::E2F000 => "2F000", + Inner::E2F005 => "2F005", + Inner::E2F002 => "2F002", + Inner::E2F003 => "2F003", + Inner::E2F004 => "2F004", + Inner::E34000 => "34000", + Inner::E38000 => "38000", + Inner::E38001 => "38001", + Inner::E38002 => "38002", + Inner::E38003 => "38003", + Inner::E38004 => "38004", + Inner::E39000 => "39000", + Inner::E39001 => "39001", + Inner::E39004 => "39004", + Inner::E39P01 => "39P01", + Inner::E39P02 => "39P02", + Inner::E39P03 => "39P03", + Inner::E3B000 => "3B000", + Inner::E3B001 => "3B001", + Inner::E3D000 => "3D000", + Inner::E3F000 => "3F000", + Inner::E40000 => "40000", + Inner::E40002 => "40002", + Inner::E40001 => "40001", + Inner::E40003 => "40003", + Inner::E40P01 => "40P01", + Inner::E42000 => "42000", + Inner::E42601 => "42601", + Inner::E42501 => "42501", + Inner::E42846 => "42846", + Inner::E42803 => "42803", + Inner::E42P20 => "42P20", + Inner::E42P19 => "42P19", + Inner::E42830 => "42830", + Inner::E42602 => "42602", + Inner::E42622 => "42622", + Inner::E42939 => "42939", + Inner::E42804 => "42804", + Inner::E42P18 => "42P18", + Inner::E42P21 => "42P21", + Inner::E42P22 => "42P22", + Inner::E42809 => "42809", + Inner::E428C9 => "428C9", + Inner::E42703 => "42703", + Inner::E42883 => "42883", + Inner::E42P01 => "42P01", + Inner::E42P02 => "42P02", + Inner::E42704 => "42704", + Inner::E42701 => "42701", + Inner::E42P03 => "42P03", + Inner::E42P04 => "42P04", + Inner::E42723 => "42723", + Inner::E42P05 => "42P05", + Inner::E42P06 => "42P06", + Inner::E42P07 => "42P07", + Inner::E42712 => "42712", + Inner::E42710 => "42710", + Inner::E42702 => "42702", + Inner::E42725 => "42725", + Inner::E42P08 => "42P08", + Inner::E42P09 => "42P09", + Inner::E42P10 => "42P10", + Inner::E42611 => "42611", + Inner::E42P11 => "42P11", + Inner::E42P12 => "42P12", + Inner::E42P13 => "42P13", + Inner::E42P14 => "42P14", + Inner::E42P15 => "42P15", + Inner::E42P16 => "42P16", + Inner::E42P17 => "42P17", + Inner::E44000 => "44000", + Inner::E53000 => "53000", + Inner::E53100 => "53100", + Inner::E53200 => "53200", + Inner::E53300 => "53300", + Inner::E53400 => "53400", + Inner::E54000 => "54000", + Inner::E54001 => "54001", + Inner::E54011 => "54011", + Inner::E54023 => "54023", + Inner::E55000 => "55000", + Inner::E55006 => "55006", + Inner::E55P02 => "55P02", + Inner::E55P03 => "55P03", + Inner::E55P04 => "55P04", + Inner::E57000 => "57000", + Inner::E57014 => "57014", + Inner::E57P01 => "57P01", + Inner::E57P02 => "57P02", + Inner::E57P03 => "57P03", + Inner::E57P04 => "57P04", + Inner::E57P05 => "57P05", + Inner::E58000 => "58000", + Inner::E58030 => "58030", + Inner::E58P01 => "58P01", + Inner::E58P02 => "58P02", + Inner::E72000 => "72000", + Inner::EF0000 => "F0000", + Inner::EF0001 => "F0001", + Inner::EHV000 => "HV000", + Inner::EHV005 => "HV005", + Inner::EHV002 => "HV002", + Inner::EHV010 => "HV010", + Inner::EHV021 => "HV021", + Inner::EHV024 => "HV024", + Inner::EHV007 => "HV007", + Inner::EHV008 => "HV008", + Inner::EHV004 => "HV004", + Inner::EHV006 => "HV006", + Inner::EHV091 => "HV091", + Inner::EHV00B => "HV00B", + Inner::EHV00C => "HV00C", + Inner::EHV00D => "HV00D", + Inner::EHV090 => "HV090", + Inner::EHV00A => "HV00A", + Inner::EHV009 => "HV009", + Inner::EHV014 => "HV014", + Inner::EHV001 => "HV001", + Inner::EHV00P => "HV00P", + Inner::EHV00J => "HV00J", + Inner::EHV00K => "HV00K", + Inner::EHV00Q => "HV00Q", + Inner::EHV00R => "HV00R", + Inner::EHV00L => "HV00L", + Inner::EHV00M => "HV00M", + Inner::EHV00N => "HV00N", + Inner::EP0000 => "P0000", + Inner::EP0001 => "P0001", + Inner::EP0002 => "P0002", + Inner::EP0003 => "P0003", + Inner::EP0004 => "P0004", + Inner::EXX000 => "XX000", + Inner::EXX001 => "XX001", + Inner::EXX002 => "XX002", + Inner::Other(code) => code, + } + } + + /// 00000 + pub const SUCCESSFUL_COMPLETION: SqlState = SqlState(Inner::E00000); + + /// 01000 + pub const WARNING: SqlState = SqlState(Inner::E01000); + + /// 0100C + pub const WARNING_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E0100C); + + /// 01008 + pub const WARNING_IMPLICIT_ZERO_BIT_PADDING: SqlState = SqlState(Inner::E01008); + + /// 01003 + pub const WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION: SqlState = SqlState(Inner::E01003); + + /// 01007 + pub const WARNING_PRIVILEGE_NOT_GRANTED: SqlState = SqlState(Inner::E01007); + + /// 01006 + pub const WARNING_PRIVILEGE_NOT_REVOKED: SqlState = SqlState(Inner::E01006); + + /// 01004 + pub const WARNING_STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E01004); + + /// 01P01 + pub const WARNING_DEPRECATED_FEATURE: SqlState = SqlState(Inner::E01P01); + + /// 02000 + pub const NO_DATA: SqlState = SqlState(Inner::E02000); + + /// 02001 + pub const NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E02001); + + /// 03000 + pub const SQL_STATEMENT_NOT_YET_COMPLETE: SqlState = SqlState(Inner::E03000); + + /// 08000 + pub const CONNECTION_EXCEPTION: SqlState = SqlState(Inner::E08000); + + /// 08003 + pub const CONNECTION_DOES_NOT_EXIST: SqlState = SqlState(Inner::E08003); + + /// 08006 + pub const CONNECTION_FAILURE: SqlState = SqlState(Inner::E08006); + + /// 08001 + pub const SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION: SqlState = SqlState(Inner::E08001); + + /// 08004 + pub const SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION: SqlState = SqlState(Inner::E08004); + + /// 08007 + pub const TRANSACTION_RESOLUTION_UNKNOWN: SqlState = SqlState(Inner::E08007); + + /// 08P01 + pub const PROTOCOL_VIOLATION: SqlState = SqlState(Inner::E08P01); + + /// 09000 + pub const TRIGGERED_ACTION_EXCEPTION: SqlState = SqlState(Inner::E09000); + + /// 0A000 + pub const FEATURE_NOT_SUPPORTED: SqlState = SqlState(Inner::E0A000); + + /// 0B000 + pub const INVALID_TRANSACTION_INITIATION: SqlState = SqlState(Inner::E0B000); + + /// 0F000 + pub const LOCATOR_EXCEPTION: SqlState = SqlState(Inner::E0F000); + + /// 0F001 + pub const L_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E0F001); + + /// 0L000 + pub const INVALID_GRANTOR: SqlState = SqlState(Inner::E0L000); + + /// 0LP01 + pub const INVALID_GRANT_OPERATION: SqlState = SqlState(Inner::E0LP01); + + /// 0P000 + pub const INVALID_ROLE_SPECIFICATION: SqlState = SqlState(Inner::E0P000); + + /// 0Z000 + pub const DIAGNOSTICS_EXCEPTION: SqlState = SqlState(Inner::E0Z000); + + /// 0Z002 + pub const STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER: SqlState = + SqlState(Inner::E0Z002); + + /// 20000 + pub const CASE_NOT_FOUND: SqlState = SqlState(Inner::E20000); + + /// 21000 + pub const CARDINALITY_VIOLATION: SqlState = SqlState(Inner::E21000); + + /// 22000 + pub const DATA_EXCEPTION: SqlState = SqlState(Inner::E22000); + + /// 2202E + pub const ARRAY_ELEMENT_ERROR: SqlState = SqlState(Inner::E2202E); + + /// 2202E + pub const ARRAY_SUBSCRIPT_ERROR: SqlState = SqlState(Inner::E2202E); + + /// 22021 + pub const CHARACTER_NOT_IN_REPERTOIRE: SqlState = SqlState(Inner::E22021); + + /// 22008 + pub const DATETIME_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22008); + + /// 22008 + pub const DATETIME_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22008); + + /// 22012 + pub const DIVISION_BY_ZERO: SqlState = SqlState(Inner::E22012); + + /// 22005 + pub const ERROR_IN_ASSIGNMENT: SqlState = SqlState(Inner::E22005); + + /// 2200B + pub const ESCAPE_CHARACTER_CONFLICT: SqlState = SqlState(Inner::E2200B); + + /// 22022 + pub const INDICATOR_OVERFLOW: SqlState = SqlState(Inner::E22022); + + /// 22015 + pub const INTERVAL_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22015); + + /// 2201E + pub const INVALID_ARGUMENT_FOR_LOG: SqlState = SqlState(Inner::E2201E); + + /// 22014 + pub const INVALID_ARGUMENT_FOR_NTILE: SqlState = SqlState(Inner::E22014); + + /// 22016 + pub const INVALID_ARGUMENT_FOR_NTH_VALUE: SqlState = SqlState(Inner::E22016); + + /// 2201F + pub const INVALID_ARGUMENT_FOR_POWER_FUNCTION: SqlState = SqlState(Inner::E2201F); + + /// 2201G + pub const INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION: SqlState = SqlState(Inner::E2201G); + + /// 22018 + pub const INVALID_CHARACTER_VALUE_FOR_CAST: SqlState = SqlState(Inner::E22018); + + /// 22007 + pub const INVALID_DATETIME_FORMAT: SqlState = SqlState(Inner::E22007); + + /// 22019 + pub const INVALID_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22019); + + /// 2200D + pub const INVALID_ESCAPE_OCTET: SqlState = SqlState(Inner::E2200D); + + /// 22025 + pub const INVALID_ESCAPE_SEQUENCE: SqlState = SqlState(Inner::E22025); + + /// 22P06 + pub const NONSTANDARD_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22P06); + + /// 22010 + pub const INVALID_INDICATOR_PARAMETER_VALUE: SqlState = SqlState(Inner::E22010); + + /// 22023 + pub const INVALID_PARAMETER_VALUE: SqlState = SqlState(Inner::E22023); + + /// 22013 + pub const INVALID_PRECEDING_OR_FOLLOWING_SIZE: SqlState = SqlState(Inner::E22013); + + /// 2201B + pub const INVALID_REGULAR_EXPRESSION: SqlState = SqlState(Inner::E2201B); + + /// 2201W + pub const INVALID_ROW_COUNT_IN_LIMIT_CLAUSE: SqlState = SqlState(Inner::E2201W); + + /// 2201X + pub const INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE: SqlState = SqlState(Inner::E2201X); + + /// 2202H + pub const INVALID_TABLESAMPLE_ARGUMENT: SqlState = SqlState(Inner::E2202H); + + /// 2202G + pub const INVALID_TABLESAMPLE_REPEAT: SqlState = SqlState(Inner::E2202G); + + /// 22009 + pub const INVALID_TIME_ZONE_DISPLACEMENT_VALUE: SqlState = SqlState(Inner::E22009); + + /// 2200C + pub const INVALID_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E2200C); + + /// 2200G + pub const MOST_SPECIFIC_TYPE_MISMATCH: SqlState = SqlState(Inner::E2200G); + + /// 22004 + pub const NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E22004); + + /// 22002 + pub const NULL_VALUE_NO_INDICATOR_PARAMETER: SqlState = SqlState(Inner::E22002); + + /// 22003 + pub const NUMERIC_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22003); + + /// 2200H + pub const SEQUENCE_GENERATOR_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E2200H); + + /// 22026 + pub const STRING_DATA_LENGTH_MISMATCH: SqlState = SqlState(Inner::E22026); + + /// 22001 + pub const STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E22001); + + /// 22011 + pub const SUBSTRING_ERROR: SqlState = SqlState(Inner::E22011); + + /// 22027 + pub const TRIM_ERROR: SqlState = SqlState(Inner::E22027); + + /// 22024 + pub const UNTERMINATED_C_STRING: SqlState = SqlState(Inner::E22024); + + /// 2200F + pub const ZERO_LENGTH_CHARACTER_STRING: SqlState = SqlState(Inner::E2200F); + + /// 22P01 + pub const FLOATING_POINT_EXCEPTION: SqlState = SqlState(Inner::E22P01); + + /// 22P02 + pub const INVALID_TEXT_REPRESENTATION: SqlState = SqlState(Inner::E22P02); + + /// 22P03 + pub const INVALID_BINARY_REPRESENTATION: SqlState = SqlState(Inner::E22P03); + + /// 22P04 + pub const BAD_COPY_FILE_FORMAT: SqlState = SqlState(Inner::E22P04); + + /// 22P05 + pub const UNTRANSLATABLE_CHARACTER: SqlState = SqlState(Inner::E22P05); + + /// 2200L + pub const NOT_AN_XML_DOCUMENT: SqlState = SqlState(Inner::E2200L); + + /// 2200M + pub const INVALID_XML_DOCUMENT: SqlState = SqlState(Inner::E2200M); + + /// 2200N + pub const INVALID_XML_CONTENT: SqlState = SqlState(Inner::E2200N); + + /// 2200S + pub const INVALID_XML_COMMENT: SqlState = SqlState(Inner::E2200S); + + /// 2200T + pub const INVALID_XML_PROCESSING_INSTRUCTION: SqlState = SqlState(Inner::E2200T); + + /// 22030 + pub const DUPLICATE_JSON_OBJECT_KEY_VALUE: SqlState = SqlState(Inner::E22030); + + /// 22031 + pub const INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION: SqlState = SqlState(Inner::E22031); + + /// 22032 + pub const INVALID_JSON_TEXT: SqlState = SqlState(Inner::E22032); + + /// 22033 + pub const INVALID_SQL_JSON_SUBSCRIPT: SqlState = SqlState(Inner::E22033); + + /// 22034 + pub const MORE_THAN_ONE_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22034); + + /// 22035 + pub const NO_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22035); + + /// 22036 + pub const NON_NUMERIC_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22036); + + /// 22037 + pub const NON_UNIQUE_KEYS_IN_A_JSON_OBJECT: SqlState = SqlState(Inner::E22037); + + /// 22038 + pub const SINGLETON_SQL_JSON_ITEM_REQUIRED: SqlState = SqlState(Inner::E22038); + + /// 22039 + pub const SQL_JSON_ARRAY_NOT_FOUND: SqlState = SqlState(Inner::E22039); + + /// 2203A + pub const SQL_JSON_MEMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203A); + + /// 2203B + pub const SQL_JSON_NUMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203B); + + /// 2203C + pub const SQL_JSON_OBJECT_NOT_FOUND: SqlState = SqlState(Inner::E2203C); + + /// 2203D + pub const TOO_MANY_JSON_ARRAY_ELEMENTS: SqlState = SqlState(Inner::E2203D); + + /// 2203E + pub const TOO_MANY_JSON_OBJECT_MEMBERS: SqlState = SqlState(Inner::E2203E); + + /// 2203F + pub const SQL_JSON_SCALAR_REQUIRED: SqlState = SqlState(Inner::E2203F); + + /// 2203G + pub const SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE: SqlState = SqlState(Inner::E2203G); + + /// 23000 + pub const INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E23000); + + /// 23001 + pub const RESTRICT_VIOLATION: SqlState = SqlState(Inner::E23001); + + /// 23502 + pub const NOT_NULL_VIOLATION: SqlState = SqlState(Inner::E23502); + + /// 23503 + pub const FOREIGN_KEY_VIOLATION: SqlState = SqlState(Inner::E23503); + + /// 23505 + pub const UNIQUE_VIOLATION: SqlState = SqlState(Inner::E23505); + + /// 23514 + pub const CHECK_VIOLATION: SqlState = SqlState(Inner::E23514); + + /// 23P01 + pub const EXCLUSION_VIOLATION: SqlState = SqlState(Inner::E23P01); + + /// 24000 + pub const INVALID_CURSOR_STATE: SqlState = SqlState(Inner::E24000); + + /// 25000 + pub const INVALID_TRANSACTION_STATE: SqlState = SqlState(Inner::E25000); + + /// 25001 + pub const ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25001); + + /// 25002 + pub const BRANCH_TRANSACTION_ALREADY_ACTIVE: SqlState = SqlState(Inner::E25002); + + /// 25008 + pub const HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL: SqlState = SqlState(Inner::E25008); + + /// 25003 + pub const INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25003); + + /// 25004 + pub const INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION: SqlState = + SqlState(Inner::E25004); + + /// 25005 + pub const NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25005); + + /// 25006 + pub const READ_ONLY_SQL_TRANSACTION: SqlState = SqlState(Inner::E25006); + + /// 25007 + pub const SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED: SqlState = SqlState(Inner::E25007); + + /// 25P01 + pub const NO_ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P01); + + /// 25P02 + pub const IN_FAILED_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P02); + + /// 25P03 + pub const IDLE_IN_TRANSACTION_SESSION_TIMEOUT: SqlState = SqlState(Inner::E25P03); + + /// 26000 + pub const INVALID_SQL_STATEMENT_NAME: SqlState = SqlState(Inner::E26000); + + /// 26000 + pub const UNDEFINED_PSTATEMENT: SqlState = SqlState(Inner::E26000); + + /// 27000 + pub const TRIGGERED_DATA_CHANGE_VIOLATION: SqlState = SqlState(Inner::E27000); + + /// 28000 + pub const INVALID_AUTHORIZATION_SPECIFICATION: SqlState = SqlState(Inner::E28000); + + /// 28P01 + pub const INVALID_PASSWORD: SqlState = SqlState(Inner::E28P01); + + /// 2B000 + pub const DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST: SqlState = SqlState(Inner::E2B000); + + /// 2BP01 + pub const DEPENDENT_OBJECTS_STILL_EXIST: SqlState = SqlState(Inner::E2BP01); + + /// 2D000 + pub const INVALID_TRANSACTION_TERMINATION: SqlState = SqlState(Inner::E2D000); + + /// 2F000 + pub const SQL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E2F000); + + /// 2F005 + pub const S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT: SqlState = SqlState(Inner::E2F005); + + /// 2F002 + pub const S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F002); + + /// 2F003 + pub const S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E2F003); + + /// 2F004 + pub const S_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F004); + + /// 34000 + pub const INVALID_CURSOR_NAME: SqlState = SqlState(Inner::E34000); + + /// 34000 + pub const UNDEFINED_CURSOR: SqlState = SqlState(Inner::E34000); + + /// 38000 + pub const EXTERNAL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E38000); + + /// 38001 + pub const E_R_E_CONTAINING_SQL_NOT_PERMITTED: SqlState = SqlState(Inner::E38001); + + /// 38002 + pub const E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38002); + + /// 38003 + pub const E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E38003); + + /// 38004 + pub const E_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38004); + + /// 39000 + pub const EXTERNAL_ROUTINE_INVOCATION_EXCEPTION: SqlState = SqlState(Inner::E39000); + + /// 39001 + pub const E_R_I_E_INVALID_SQLSTATE_RETURNED: SqlState = SqlState(Inner::E39001); + + /// 39004 + pub const E_R_I_E_NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E39004); + + /// 39P01 + pub const E_R_I_E_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P01); + + /// 39P02 + pub const E_R_I_E_SRF_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P02); + + /// 39P03 + pub const E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P03); + + /// 3B000 + pub const SAVEPOINT_EXCEPTION: SqlState = SqlState(Inner::E3B000); + + /// 3B001 + pub const S_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E3B001); + + /// 3D000 + pub const INVALID_CATALOG_NAME: SqlState = SqlState(Inner::E3D000); + + /// 3D000 + pub const UNDEFINED_DATABASE: SqlState = SqlState(Inner::E3D000); + + /// 3F000 + pub const INVALID_SCHEMA_NAME: SqlState = SqlState(Inner::E3F000); + + /// 3F000 + pub const UNDEFINED_SCHEMA: SqlState = SqlState(Inner::E3F000); + + /// 40000 + pub const TRANSACTION_ROLLBACK: SqlState = SqlState(Inner::E40000); + + /// 40002 + pub const T_R_INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E40002); + + /// 40001 + pub const T_R_SERIALIZATION_FAILURE: SqlState = SqlState(Inner::E40001); + + /// 40003 + pub const T_R_STATEMENT_COMPLETION_UNKNOWN: SqlState = SqlState(Inner::E40003); + + /// 40P01 + pub const T_R_DEADLOCK_DETECTED: SqlState = SqlState(Inner::E40P01); + + /// 42000 + pub const SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION: SqlState = SqlState(Inner::E42000); + + /// 42601 + pub const SYNTAX_ERROR: SqlState = SqlState(Inner::E42601); + + /// 42501 + pub const INSUFFICIENT_PRIVILEGE: SqlState = SqlState(Inner::E42501); + + /// 42846 + pub const CANNOT_COERCE: SqlState = SqlState(Inner::E42846); + + /// 42803 + pub const GROUPING_ERROR: SqlState = SqlState(Inner::E42803); + + /// 42P20 + pub const WINDOWING_ERROR: SqlState = SqlState(Inner::E42P20); + + /// 42P19 + pub const INVALID_RECURSION: SqlState = SqlState(Inner::E42P19); + + /// 42830 + pub const INVALID_FOREIGN_KEY: SqlState = SqlState(Inner::E42830); + + /// 42602 + pub const INVALID_NAME: SqlState = SqlState(Inner::E42602); + + /// 42622 + pub const NAME_TOO_LONG: SqlState = SqlState(Inner::E42622); + + /// 42939 + pub const RESERVED_NAME: SqlState = SqlState(Inner::E42939); + + /// 42804 + pub const DATATYPE_MISMATCH: SqlState = SqlState(Inner::E42804); + + /// 42P18 + pub const INDETERMINATE_DATATYPE: SqlState = SqlState(Inner::E42P18); + + /// 42P21 + pub const COLLATION_MISMATCH: SqlState = SqlState(Inner::E42P21); + + /// 42P22 + pub const INDETERMINATE_COLLATION: SqlState = SqlState(Inner::E42P22); + + /// 42809 + pub const WRONG_OBJECT_TYPE: SqlState = SqlState(Inner::E42809); + + /// 428C9 + pub const GENERATED_ALWAYS: SqlState = SqlState(Inner::E428C9); + + /// 42703 + pub const UNDEFINED_COLUMN: SqlState = SqlState(Inner::E42703); + + /// 42883 + pub const UNDEFINED_FUNCTION: SqlState = SqlState(Inner::E42883); + + /// 42P01 + pub const UNDEFINED_TABLE: SqlState = SqlState(Inner::E42P01); + + /// 42P02 + pub const UNDEFINED_PARAMETER: SqlState = SqlState(Inner::E42P02); + + /// 42704 + pub const UNDEFINED_OBJECT: SqlState = SqlState(Inner::E42704); + + /// 42701 + pub const DUPLICATE_COLUMN: SqlState = SqlState(Inner::E42701); + + /// 42P03 + pub const DUPLICATE_CURSOR: SqlState = SqlState(Inner::E42P03); + + /// 42P04 + pub const DUPLICATE_DATABASE: SqlState = SqlState(Inner::E42P04); + + /// 42723 + pub const DUPLICATE_FUNCTION: SqlState = SqlState(Inner::E42723); + + /// 42P05 + pub const DUPLICATE_PSTATEMENT: SqlState = SqlState(Inner::E42P05); + + /// 42P06 + pub const DUPLICATE_SCHEMA: SqlState = SqlState(Inner::E42P06); + + /// 42P07 + pub const DUPLICATE_TABLE: SqlState = SqlState(Inner::E42P07); + + /// 42712 + pub const DUPLICATE_ALIAS: SqlState = SqlState(Inner::E42712); + + /// 42710 + pub const DUPLICATE_OBJECT: SqlState = SqlState(Inner::E42710); + + /// 42702 + pub const AMBIGUOUS_COLUMN: SqlState = SqlState(Inner::E42702); + + /// 42725 + pub const AMBIGUOUS_FUNCTION: SqlState = SqlState(Inner::E42725); + + /// 42P08 + pub const AMBIGUOUS_PARAMETER: SqlState = SqlState(Inner::E42P08); + + /// 42P09 + pub const AMBIGUOUS_ALIAS: SqlState = SqlState(Inner::E42P09); + + /// 42P10 + pub const INVALID_COLUMN_REFERENCE: SqlState = SqlState(Inner::E42P10); + + /// 42611 + pub const INVALID_COLUMN_DEFINITION: SqlState = SqlState(Inner::E42611); + + /// 42P11 + pub const INVALID_CURSOR_DEFINITION: SqlState = SqlState(Inner::E42P11); + + /// 42P12 + pub const INVALID_DATABASE_DEFINITION: SqlState = SqlState(Inner::E42P12); + + /// 42P13 + pub const INVALID_FUNCTION_DEFINITION: SqlState = SqlState(Inner::E42P13); + + /// 42P14 + pub const INVALID_PSTATEMENT_DEFINITION: SqlState = SqlState(Inner::E42P14); + + /// 42P15 + pub const INVALID_SCHEMA_DEFINITION: SqlState = SqlState(Inner::E42P15); + + /// 42P16 + pub const INVALID_TABLE_DEFINITION: SqlState = SqlState(Inner::E42P16); + + /// 42P17 + pub const INVALID_OBJECT_DEFINITION: SqlState = SqlState(Inner::E42P17); + + /// 44000 + pub const WITH_CHECK_OPTION_VIOLATION: SqlState = SqlState(Inner::E44000); + + /// 53000 + pub const INSUFFICIENT_RESOURCES: SqlState = SqlState(Inner::E53000); + + /// 53100 + pub const DISK_FULL: SqlState = SqlState(Inner::E53100); + + /// 53200 + pub const OUT_OF_MEMORY: SqlState = SqlState(Inner::E53200); + + /// 53300 + pub const TOO_MANY_CONNECTIONS: SqlState = SqlState(Inner::E53300); + + /// 53400 + pub const CONFIGURATION_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E53400); + + /// 54000 + pub const PROGRAM_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E54000); + + /// 54001 + pub const STATEMENT_TOO_COMPLEX: SqlState = SqlState(Inner::E54001); + + /// 54011 + pub const TOO_MANY_COLUMNS: SqlState = SqlState(Inner::E54011); + + /// 54023 + pub const TOO_MANY_ARGUMENTS: SqlState = SqlState(Inner::E54023); + + /// 55000 + pub const OBJECT_NOT_IN_PREREQUISITE_STATE: SqlState = SqlState(Inner::E55000); + + /// 55006 + pub const OBJECT_IN_USE: SqlState = SqlState(Inner::E55006); + + /// 55P02 + pub const CANT_CHANGE_RUNTIME_PARAM: SqlState = SqlState(Inner::E55P02); + + /// 55P03 + pub const LOCK_NOT_AVAILABLE: SqlState = SqlState(Inner::E55P03); + + /// 55P04 + pub const UNSAFE_NEW_ENUM_VALUE_USAGE: SqlState = SqlState(Inner::E55P04); + + /// 57000 + pub const OPERATOR_INTERVENTION: SqlState = SqlState(Inner::E57000); + + /// 57014 + pub const QUERY_CANCELED: SqlState = SqlState(Inner::E57014); + + /// 57P01 + pub const ADMIN_SHUTDOWN: SqlState = SqlState(Inner::E57P01); + + /// 57P02 + pub const CRASH_SHUTDOWN: SqlState = SqlState(Inner::E57P02); + + /// 57P03 + pub const CANNOT_CONNECT_NOW: SqlState = SqlState(Inner::E57P03); + + /// 57P04 + pub const DATABASE_DROPPED: SqlState = SqlState(Inner::E57P04); + + /// 57P05 + pub const IDLE_SESSION_TIMEOUT: SqlState = SqlState(Inner::E57P05); + + /// 58000 + pub const SYSTEM_ERROR: SqlState = SqlState(Inner::E58000); + + /// 58030 + pub const IO_ERROR: SqlState = SqlState(Inner::E58030); + + /// 58P01 + pub const UNDEFINED_FILE: SqlState = SqlState(Inner::E58P01); + + /// 58P02 + pub const DUPLICATE_FILE: SqlState = SqlState(Inner::E58P02); + + /// 72000 + pub const SNAPSHOT_TOO_OLD: SqlState = SqlState(Inner::E72000); + + /// F0000 + pub const CONFIG_FILE_ERROR: SqlState = SqlState(Inner::EF0000); + + /// F0001 + pub const LOCK_FILE_EXISTS: SqlState = SqlState(Inner::EF0001); + + /// HV000 + pub const FDW_ERROR: SqlState = SqlState(Inner::EHV000); + + /// HV005 + pub const FDW_COLUMN_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV005); + + /// HV002 + pub const FDW_DYNAMIC_PARAMETER_VALUE_NEEDED: SqlState = SqlState(Inner::EHV002); + + /// HV010 + pub const FDW_FUNCTION_SEQUENCE_ERROR: SqlState = SqlState(Inner::EHV010); + + /// HV021 + pub const FDW_INCONSISTENT_DESCRIPTOR_INFORMATION: SqlState = SqlState(Inner::EHV021); + + /// HV024 + pub const FDW_INVALID_ATTRIBUTE_VALUE: SqlState = SqlState(Inner::EHV024); + + /// HV007 + pub const FDW_INVALID_COLUMN_NAME: SqlState = SqlState(Inner::EHV007); + + /// HV008 + pub const FDW_INVALID_COLUMN_NUMBER: SqlState = SqlState(Inner::EHV008); + + /// HV004 + pub const FDW_INVALID_DATA_TYPE: SqlState = SqlState(Inner::EHV004); + + /// HV006 + pub const FDW_INVALID_DATA_TYPE_DESCRIPTORS: SqlState = SqlState(Inner::EHV006); + + /// HV091 + pub const FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER: SqlState = SqlState(Inner::EHV091); + + /// HV00B + pub const FDW_INVALID_HANDLE: SqlState = SqlState(Inner::EHV00B); + + /// HV00C + pub const FDW_INVALID_OPTION_INDEX: SqlState = SqlState(Inner::EHV00C); + + /// HV00D + pub const FDW_INVALID_OPTION_NAME: SqlState = SqlState(Inner::EHV00D); + + /// HV090 + pub const FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH: SqlState = SqlState(Inner::EHV090); + + /// HV00A + pub const FDW_INVALID_STRING_FORMAT: SqlState = SqlState(Inner::EHV00A); + + /// HV009 + pub const FDW_INVALID_USE_OF_NULL_POINTER: SqlState = SqlState(Inner::EHV009); + + /// HV014 + pub const FDW_TOO_MANY_HANDLES: SqlState = SqlState(Inner::EHV014); + + /// HV001 + pub const FDW_OUT_OF_MEMORY: SqlState = SqlState(Inner::EHV001); + + /// HV00P + pub const FDW_NO_SCHEMAS: SqlState = SqlState(Inner::EHV00P); + + /// HV00J + pub const FDW_OPTION_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV00J); + + /// HV00K + pub const FDW_REPLY_HANDLE: SqlState = SqlState(Inner::EHV00K); + + /// HV00Q + pub const FDW_SCHEMA_NOT_FOUND: SqlState = SqlState(Inner::EHV00Q); + + /// HV00R + pub const FDW_TABLE_NOT_FOUND: SqlState = SqlState(Inner::EHV00R); + + /// HV00L + pub const FDW_UNABLE_TO_CREATE_EXECUTION: SqlState = SqlState(Inner::EHV00L); + + /// HV00M + pub const FDW_UNABLE_TO_CREATE_REPLY: SqlState = SqlState(Inner::EHV00M); + + /// HV00N + pub const FDW_UNABLE_TO_ESTABLISH_CONNECTION: SqlState = SqlState(Inner::EHV00N); + + /// P0000 + pub const PLPGSQL_ERROR: SqlState = SqlState(Inner::EP0000); + + /// P0001 + pub const RAISE_EXCEPTION: SqlState = SqlState(Inner::EP0001); + + /// P0002 + pub const NO_DATA_FOUND: SqlState = SqlState(Inner::EP0002); + + /// P0003 + pub const TOO_MANY_ROWS: SqlState = SqlState(Inner::EP0003); + + /// P0004 + pub const ASSERT_FAILURE: SqlState = SqlState(Inner::EP0004); + + /// XX000 + pub const INTERNAL_ERROR: SqlState = SqlState(Inner::EXX000); + + /// XX001 + pub const DATA_CORRUPTED: SqlState = SqlState(Inner::EXX001); + + /// XX002 + pub const INDEX_CORRUPTED: SqlState = SqlState(Inner::EXX002); +} + +#[derive(PartialEq, Eq, Clone, Debug)] +#[allow(clippy::upper_case_acronyms)] +enum Inner { + E00000, + E01000, + E0100C, + E01008, + E01003, + E01007, + E01006, + E01004, + E01P01, + E02000, + E02001, + E03000, + E08000, + E08003, + E08006, + E08001, + E08004, + E08007, + E08P01, + E09000, + E0A000, + E0B000, + E0F000, + E0F001, + E0L000, + E0LP01, + E0P000, + E0Z000, + E0Z002, + E20000, + E21000, + E22000, + E2202E, + E22021, + E22008, + E22012, + E22005, + E2200B, + E22022, + E22015, + E2201E, + E22014, + E22016, + E2201F, + E2201G, + E22018, + E22007, + E22019, + E2200D, + E22025, + E22P06, + E22010, + E22023, + E22013, + E2201B, + E2201W, + E2201X, + E2202H, + E2202G, + E22009, + E2200C, + E2200G, + E22004, + E22002, + E22003, + E2200H, + E22026, + E22001, + E22011, + E22027, + E22024, + E2200F, + E22P01, + E22P02, + E22P03, + E22P04, + E22P05, + E2200L, + E2200M, + E2200N, + E2200S, + E2200T, + E22030, + E22031, + E22032, + E22033, + E22034, + E22035, + E22036, + E22037, + E22038, + E22039, + E2203A, + E2203B, + E2203C, + E2203D, + E2203E, + E2203F, + E2203G, + E23000, + E23001, + E23502, + E23503, + E23505, + E23514, + E23P01, + E24000, + E25000, + E25001, + E25002, + E25008, + E25003, + E25004, + E25005, + E25006, + E25007, + E25P01, + E25P02, + E25P03, + E26000, + E27000, + E28000, + E28P01, + E2B000, + E2BP01, + E2D000, + E2F000, + E2F005, + E2F002, + E2F003, + E2F004, + E34000, + E38000, + E38001, + E38002, + E38003, + E38004, + E39000, + E39001, + E39004, + E39P01, + E39P02, + E39P03, + E3B000, + E3B001, + E3D000, + E3F000, + E40000, + E40002, + E40001, + E40003, + E40P01, + E42000, + E42601, + E42501, + E42846, + E42803, + E42P20, + E42P19, + E42830, + E42602, + E42622, + E42939, + E42804, + E42P18, + E42P21, + E42P22, + E42809, + E428C9, + E42703, + E42883, + E42P01, + E42P02, + E42704, + E42701, + E42P03, + E42P04, + E42723, + E42P05, + E42P06, + E42P07, + E42712, + E42710, + E42702, + E42725, + E42P08, + E42P09, + E42P10, + E42611, + E42P11, + E42P12, + E42P13, + E42P14, + E42P15, + E42P16, + E42P17, + E44000, + E53000, + E53100, + E53200, + E53300, + E53400, + E54000, + E54001, + E54011, + E54023, + E55000, + E55006, + E55P02, + E55P03, + E55P04, + E57000, + E57014, + E57P01, + E57P02, + E57P03, + E57P04, + E57P05, + E58000, + E58030, + E58P01, + E58P02, + E72000, + EF0000, + EF0001, + EHV000, + EHV005, + EHV002, + EHV010, + EHV021, + EHV024, + EHV007, + EHV008, + EHV004, + EHV006, + EHV091, + EHV00B, + EHV00C, + EHV00D, + EHV090, + EHV00A, + EHV009, + EHV014, + EHV001, + EHV00P, + EHV00J, + EHV00K, + EHV00Q, + EHV00R, + EHV00L, + EHV00M, + EHV00N, + EP0000, + EP0001, + EP0002, + EP0003, + EP0004, + EXX000, + EXX001, + EXX002, + Other(Box), +} + +#[rustfmt::skip] +static SQLSTATE_MAP: phf::Map<&'static str, SqlState> = +::phf::Map { + key: 12913932095322966823, + disps: &[ + (0, 24), + (0, 12), + (0, 74), + (0, 109), + (0, 11), + (0, 9), + (0, 0), + (4, 38), + (3, 155), + (0, 6), + (1, 242), + (0, 66), + (0, 53), + (5, 180), + (3, 221), + (7, 230), + (0, 125), + (1, 46), + (0, 11), + (1, 2), + (0, 5), + (0, 13), + (0, 171), + (0, 15), + (0, 4), + (0, 22), + (1, 85), + (0, 75), + (2, 0), + (1, 25), + (7, 47), + (0, 45), + (0, 35), + (0, 7), + (7, 124), + (0, 0), + (14, 104), + (1, 183), + (61, 50), + (3, 76), + (0, 12), + (0, 7), + (4, 189), + (0, 1), + (64, 102), + (0, 0), + (16, 192), + (24, 19), + (0, 5), + (0, 87), + (0, 89), + (0, 14), + ], + entries: &[ + ("2F000", SqlState::SQL_ROUTINE_EXCEPTION), + ("01008", SqlState::WARNING_IMPLICIT_ZERO_BIT_PADDING), + ("42501", SqlState::INSUFFICIENT_PRIVILEGE), + ("22000", SqlState::DATA_EXCEPTION), + ("0100C", SqlState::WARNING_DYNAMIC_RESULT_SETS_RETURNED), + ("2200N", SqlState::INVALID_XML_CONTENT), + ("40001", SqlState::T_R_SERIALIZATION_FAILURE), + ("28P01", SqlState::INVALID_PASSWORD), + ("38000", SqlState::EXTERNAL_ROUTINE_EXCEPTION), + ("25006", SqlState::READ_ONLY_SQL_TRANSACTION), + ("2203D", SqlState::TOO_MANY_JSON_ARRAY_ELEMENTS), + ("42P09", SqlState::AMBIGUOUS_ALIAS), + ("F0000", SqlState::CONFIG_FILE_ERROR), + ("42P18", SqlState::INDETERMINATE_DATATYPE), + ("40002", SqlState::T_R_INTEGRITY_CONSTRAINT_VIOLATION), + ("22009", SqlState::INVALID_TIME_ZONE_DISPLACEMENT_VALUE), + ("42P08", SqlState::AMBIGUOUS_PARAMETER), + ("08000", SqlState::CONNECTION_EXCEPTION), + ("25P01", SqlState::NO_ACTIVE_SQL_TRANSACTION), + ("22024", SqlState::UNTERMINATED_C_STRING), + ("55000", SqlState::OBJECT_NOT_IN_PREREQUISITE_STATE), + ("25001", SqlState::ACTIVE_SQL_TRANSACTION), + ("03000", SqlState::SQL_STATEMENT_NOT_YET_COMPLETE), + ("42710", SqlState::DUPLICATE_OBJECT), + ("2D000", SqlState::INVALID_TRANSACTION_TERMINATION), + ("2200G", SqlState::MOST_SPECIFIC_TYPE_MISMATCH), + ("22022", SqlState::INDICATOR_OVERFLOW), + ("55006", SqlState::OBJECT_IN_USE), + ("53200", SqlState::OUT_OF_MEMORY), + ("22012", SqlState::DIVISION_BY_ZERO), + ("P0002", SqlState::NO_DATA_FOUND), + ("XX001", SqlState::DATA_CORRUPTED), + ("22P05", SqlState::UNTRANSLATABLE_CHARACTER), + ("40003", SqlState::T_R_STATEMENT_COMPLETION_UNKNOWN), + ("22021", SqlState::CHARACTER_NOT_IN_REPERTOIRE), + ("25000", SqlState::INVALID_TRANSACTION_STATE), + ("42P15", SqlState::INVALID_SCHEMA_DEFINITION), + ("0B000", SqlState::INVALID_TRANSACTION_INITIATION), + ("22004", SqlState::NULL_VALUE_NOT_ALLOWED), + ("42804", SqlState::DATATYPE_MISMATCH), + ("42803", SqlState::GROUPING_ERROR), + ("02001", SqlState::NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED), + ("25002", SqlState::BRANCH_TRANSACTION_ALREADY_ACTIVE), + ("28000", SqlState::INVALID_AUTHORIZATION_SPECIFICATION), + ("HV009", SqlState::FDW_INVALID_USE_OF_NULL_POINTER), + ("22P01", SqlState::FLOATING_POINT_EXCEPTION), + ("2B000", SqlState::DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST), + ("42723", SqlState::DUPLICATE_FUNCTION), + ("21000", SqlState::CARDINALITY_VIOLATION), + ("0Z002", SqlState::STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER), + ("23505", SqlState::UNIQUE_VIOLATION), + ("HV00J", SqlState::FDW_OPTION_NAME_NOT_FOUND), + ("23P01", SqlState::EXCLUSION_VIOLATION), + ("39P03", SqlState::E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED), + ("42P10", SqlState::INVALID_COLUMN_REFERENCE), + ("2202H", SqlState::INVALID_TABLESAMPLE_ARGUMENT), + ("55P04", SqlState::UNSAFE_NEW_ENUM_VALUE_USAGE), + ("P0000", SqlState::PLPGSQL_ERROR), + ("2F005", SqlState::S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT), + ("HV00M", SqlState::FDW_UNABLE_TO_CREATE_REPLY), + ("0A000", SqlState::FEATURE_NOT_SUPPORTED), + ("24000", SqlState::INVALID_CURSOR_STATE), + ("25008", SqlState::HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL), + ("01003", SqlState::WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION), + ("42712", SqlState::DUPLICATE_ALIAS), + ("HV014", SqlState::FDW_TOO_MANY_HANDLES), + ("58030", SqlState::IO_ERROR), + ("2201W", SqlState::INVALID_ROW_COUNT_IN_LIMIT_CLAUSE), + ("22033", SqlState::INVALID_SQL_JSON_SUBSCRIPT), + ("2BP01", SqlState::DEPENDENT_OBJECTS_STILL_EXIST), + ("HV005", SqlState::FDW_COLUMN_NAME_NOT_FOUND), + ("25004", SqlState::INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION), + ("54000", SqlState::PROGRAM_LIMIT_EXCEEDED), + ("20000", SqlState::CASE_NOT_FOUND), + ("2203G", SqlState::SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE), + ("22038", SqlState::SINGLETON_SQL_JSON_ITEM_REQUIRED), + ("22007", SqlState::INVALID_DATETIME_FORMAT), + ("08004", SqlState::SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION), + ("2200H", SqlState::SEQUENCE_GENERATOR_LIMIT_EXCEEDED), + ("HV00D", SqlState::FDW_INVALID_OPTION_NAME), + ("P0004", SqlState::ASSERT_FAILURE), + ("22018", SqlState::INVALID_CHARACTER_VALUE_FOR_CAST), + ("0L000", SqlState::INVALID_GRANTOR), + ("22P04", SqlState::BAD_COPY_FILE_FORMAT), + ("22031", SqlState::INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION), + ("01P01", SqlState::WARNING_DEPRECATED_FEATURE), + ("0LP01", SqlState::INVALID_GRANT_OPERATION), + ("58P02", SqlState::DUPLICATE_FILE), + ("26000", SqlState::INVALID_SQL_STATEMENT_NAME), + ("54001", SqlState::STATEMENT_TOO_COMPLEX), + ("22010", SqlState::INVALID_INDICATOR_PARAMETER_VALUE), + ("HV00C", SqlState::FDW_INVALID_OPTION_INDEX), + ("22008", SqlState::DATETIME_FIELD_OVERFLOW), + ("42P06", SqlState::DUPLICATE_SCHEMA), + ("25007", SqlState::SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED), + ("42P20", SqlState::WINDOWING_ERROR), + ("HV091", SqlState::FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER), + ("HV021", SqlState::FDW_INCONSISTENT_DESCRIPTOR_INFORMATION), + ("42702", SqlState::AMBIGUOUS_COLUMN), + ("02000", SqlState::NO_DATA), + ("54011", SqlState::TOO_MANY_COLUMNS), + ("HV004", SqlState::FDW_INVALID_DATA_TYPE), + ("01006", SqlState::WARNING_PRIVILEGE_NOT_REVOKED), + ("42701", SqlState::DUPLICATE_COLUMN), + ("08P01", SqlState::PROTOCOL_VIOLATION), + ("42622", SqlState::NAME_TOO_LONG), + ("P0003", SqlState::TOO_MANY_ROWS), + ("22003", SqlState::NUMERIC_VALUE_OUT_OF_RANGE), + ("42P03", SqlState::DUPLICATE_CURSOR), + ("23001", SqlState::RESTRICT_VIOLATION), + ("57000", SqlState::OPERATOR_INTERVENTION), + ("22027", SqlState::TRIM_ERROR), + ("42P12", SqlState::INVALID_DATABASE_DEFINITION), + ("3B000", SqlState::SAVEPOINT_EXCEPTION), + ("2201B", SqlState::INVALID_REGULAR_EXPRESSION), + ("22030", SqlState::DUPLICATE_JSON_OBJECT_KEY_VALUE), + ("2F004", SqlState::S_R_E_READING_SQL_DATA_NOT_PERMITTED), + ("428C9", SqlState::GENERATED_ALWAYS), + ("2200S", SqlState::INVALID_XML_COMMENT), + ("22039", SqlState::SQL_JSON_ARRAY_NOT_FOUND), + ("42809", SqlState::WRONG_OBJECT_TYPE), + ("2201X", SqlState::INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE), + ("39001", SqlState::E_R_I_E_INVALID_SQLSTATE_RETURNED), + ("25P02", SqlState::IN_FAILED_SQL_TRANSACTION), + ("0P000", SqlState::INVALID_ROLE_SPECIFICATION), + ("HV00N", SqlState::FDW_UNABLE_TO_ESTABLISH_CONNECTION), + ("53100", SqlState::DISK_FULL), + ("42601", SqlState::SYNTAX_ERROR), + ("23000", SqlState::INTEGRITY_CONSTRAINT_VIOLATION), + ("HV006", SqlState::FDW_INVALID_DATA_TYPE_DESCRIPTORS), + ("HV00B", SqlState::FDW_INVALID_HANDLE), + ("HV00Q", SqlState::FDW_SCHEMA_NOT_FOUND), + ("01000", SqlState::WARNING), + ("42883", SqlState::UNDEFINED_FUNCTION), + ("57P01", SqlState::ADMIN_SHUTDOWN), + ("22037", SqlState::NON_UNIQUE_KEYS_IN_A_JSON_OBJECT), + ("00000", SqlState::SUCCESSFUL_COMPLETION), + ("55P03", SqlState::LOCK_NOT_AVAILABLE), + ("42P01", SqlState::UNDEFINED_TABLE), + ("42830", SqlState::INVALID_FOREIGN_KEY), + ("22005", SqlState::ERROR_IN_ASSIGNMENT), + ("22025", SqlState::INVALID_ESCAPE_SEQUENCE), + ("XX002", SqlState::INDEX_CORRUPTED), + ("42P16", SqlState::INVALID_TABLE_DEFINITION), + ("55P02", SqlState::CANT_CHANGE_RUNTIME_PARAM), + ("22019", SqlState::INVALID_ESCAPE_CHARACTER), + ("P0001", SqlState::RAISE_EXCEPTION), + ("72000", SqlState::SNAPSHOT_TOO_OLD), + ("42P11", SqlState::INVALID_CURSOR_DEFINITION), + ("40P01", SqlState::T_R_DEADLOCK_DETECTED), + ("57P02", SqlState::CRASH_SHUTDOWN), + ("HV00A", SqlState::FDW_INVALID_STRING_FORMAT), + ("2F002", SqlState::S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED), + ("23503", SqlState::FOREIGN_KEY_VIOLATION), + ("40000", SqlState::TRANSACTION_ROLLBACK), + ("22032", SqlState::INVALID_JSON_TEXT), + ("2202E", SqlState::ARRAY_ELEMENT_ERROR), + ("42P19", SqlState::INVALID_RECURSION), + ("42611", SqlState::INVALID_COLUMN_DEFINITION), + ("42P13", SqlState::INVALID_FUNCTION_DEFINITION), + ("25003", SqlState::INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION), + ("39P02", SqlState::E_R_I_E_SRF_PROTOCOL_VIOLATED), + ("XX000", SqlState::INTERNAL_ERROR), + ("08006", SqlState::CONNECTION_FAILURE), + ("57P04", SqlState::DATABASE_DROPPED), + ("42P07", SqlState::DUPLICATE_TABLE), + ("22P03", SqlState::INVALID_BINARY_REPRESENTATION), + ("22035", SqlState::NO_SQL_JSON_ITEM), + ("42P14", SqlState::INVALID_PSTATEMENT_DEFINITION), + ("01007", SqlState::WARNING_PRIVILEGE_NOT_GRANTED), + ("38004", SqlState::E_R_E_READING_SQL_DATA_NOT_PERMITTED), + ("42P21", SqlState::COLLATION_MISMATCH), + ("0Z000", SqlState::DIAGNOSTICS_EXCEPTION), + ("HV001", SqlState::FDW_OUT_OF_MEMORY), + ("0F000", SqlState::LOCATOR_EXCEPTION), + ("22013", SqlState::INVALID_PRECEDING_OR_FOLLOWING_SIZE), + ("2201E", SqlState::INVALID_ARGUMENT_FOR_LOG), + ("22011", SqlState::SUBSTRING_ERROR), + ("42602", SqlState::INVALID_NAME), + ("01004", SqlState::WARNING_STRING_DATA_RIGHT_TRUNCATION), + ("42P02", SqlState::UNDEFINED_PARAMETER), + ("2203C", SqlState::SQL_JSON_OBJECT_NOT_FOUND), + ("HV002", SqlState::FDW_DYNAMIC_PARAMETER_VALUE_NEEDED), + ("0F001", SqlState::L_E_INVALID_SPECIFICATION), + ("58P01", SqlState::UNDEFINED_FILE), + ("38001", SqlState::E_R_E_CONTAINING_SQL_NOT_PERMITTED), + ("42703", SqlState::UNDEFINED_COLUMN), + ("57P05", SqlState::IDLE_SESSION_TIMEOUT), + ("57P03", SqlState::CANNOT_CONNECT_NOW), + ("HV007", SqlState::FDW_INVALID_COLUMN_NAME), + ("22014", SqlState::INVALID_ARGUMENT_FOR_NTILE), + ("22P06", SqlState::NONSTANDARD_USE_OF_ESCAPE_CHARACTER), + ("2203F", SqlState::SQL_JSON_SCALAR_REQUIRED), + ("2200F", SqlState::ZERO_LENGTH_CHARACTER_STRING), + ("09000", SqlState::TRIGGERED_ACTION_EXCEPTION), + ("2201F", SqlState::INVALID_ARGUMENT_FOR_POWER_FUNCTION), + ("08003", SqlState::CONNECTION_DOES_NOT_EXIST), + ("38002", SqlState::E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED), + ("F0001", SqlState::LOCK_FILE_EXISTS), + ("42P22", SqlState::INDETERMINATE_COLLATION), + ("2200C", SqlState::INVALID_USE_OF_ESCAPE_CHARACTER), + ("2203E", SqlState::TOO_MANY_JSON_OBJECT_MEMBERS), + ("23514", SqlState::CHECK_VIOLATION), + ("22P02", SqlState::INVALID_TEXT_REPRESENTATION), + ("54023", SqlState::TOO_MANY_ARGUMENTS), + ("2200T", SqlState::INVALID_XML_PROCESSING_INSTRUCTION), + ("22016", SqlState::INVALID_ARGUMENT_FOR_NTH_VALUE), + ("25P03", SqlState::IDLE_IN_TRANSACTION_SESSION_TIMEOUT), + ("3B001", SqlState::S_E_INVALID_SPECIFICATION), + ("08001", SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + ("22036", SqlState::NON_NUMERIC_SQL_JSON_ITEM), + ("3F000", SqlState::INVALID_SCHEMA_NAME), + ("39P01", SqlState::E_R_I_E_TRIGGER_PROTOCOL_VIOLATED), + ("22026", SqlState::STRING_DATA_LENGTH_MISMATCH), + ("42P17", SqlState::INVALID_OBJECT_DEFINITION), + ("22034", SqlState::MORE_THAN_ONE_SQL_JSON_ITEM), + ("HV000", SqlState::FDW_ERROR), + ("2200B", SqlState::ESCAPE_CHARACTER_CONFLICT), + ("HV008", SqlState::FDW_INVALID_COLUMN_NUMBER), + ("34000", SqlState::INVALID_CURSOR_NAME), + ("2201G", SqlState::INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), + ("44000", SqlState::WITH_CHECK_OPTION_VIOLATION), + ("HV010", SqlState::FDW_FUNCTION_SEQUENCE_ERROR), + ("39004", SqlState::E_R_I_E_NULL_VALUE_NOT_ALLOWED), + ("22001", SqlState::STRING_DATA_RIGHT_TRUNCATION), + ("3D000", SqlState::INVALID_CATALOG_NAME), + ("25005", SqlState::NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION), + ("2200L", SqlState::NOT_AN_XML_DOCUMENT), + ("27000", SqlState::TRIGGERED_DATA_CHANGE_VIOLATION), + ("HV090", SqlState::FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH), + ("42939", SqlState::RESERVED_NAME), + ("58000", SqlState::SYSTEM_ERROR), + ("2200M", SqlState::INVALID_XML_DOCUMENT), + ("HV00L", SqlState::FDW_UNABLE_TO_CREATE_EXECUTION), + ("57014", SqlState::QUERY_CANCELED), + ("23502", SqlState::NOT_NULL_VIOLATION), + ("22002", SqlState::NULL_VALUE_NO_INDICATOR_PARAMETER), + ("HV00R", SqlState::FDW_TABLE_NOT_FOUND), + ("HV00P", SqlState::FDW_NO_SCHEMAS), + ("38003", SqlState::E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + ("39000", SqlState::EXTERNAL_ROUTINE_INVOCATION_EXCEPTION), + ("22015", SqlState::INTERVAL_FIELD_OVERFLOW), + ("HV00K", SqlState::FDW_REPLY_HANDLE), + ("HV024", SqlState::FDW_INVALID_ATTRIBUTE_VALUE), + ("2200D", SqlState::INVALID_ESCAPE_OCTET), + ("08007", SqlState::TRANSACTION_RESOLUTION_UNKNOWN), + ("2F003", SqlState::S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + ("42725", SqlState::AMBIGUOUS_FUNCTION), + ("2203A", SqlState::SQL_JSON_MEMBER_NOT_FOUND), + ("42846", SqlState::CANNOT_COERCE), + ("42P04", SqlState::DUPLICATE_DATABASE), + ("42000", SqlState::SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION), + ("2203B", SqlState::SQL_JSON_NUMBER_NOT_FOUND), + ("42P05", SqlState::DUPLICATE_PSTATEMENT), + ("53300", SqlState::TOO_MANY_CONNECTIONS), + ("53400", SqlState::CONFIGURATION_LIMIT_EXCEEDED), + ("42704", SqlState::UNDEFINED_OBJECT), + ("2202G", SqlState::INVALID_TABLESAMPLE_REPEAT), + ("22023", SqlState::INVALID_PARAMETER_VALUE), + ("53000", SqlState::INSUFFICIENT_RESOURCES), + ], +}; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs new file mode 100644 index 0000000000..768213f8ed --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -0,0 +1,64 @@ +use crate::query::RowStream; +use crate::types::Type; +use crate::{Client, Error, Transaction}; +use async_trait::async_trait; +use postgres_protocol2::Oid; + +mod private { + pub trait Sealed {} +} + +/// A trait allowing abstraction over connections and transactions. +/// +/// This trait is "sealed", and cannot be implemented outside of this crate. +#[async_trait] +pub trait GenericClient: private::Sealed { + /// Like `Client::query_raw_txt`. + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send; + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result; +} + +impl private::Sealed for Client {} + +#[async_trait] +impl GenericClient for Client { + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send, + { + self.query_raw_txt(statement, params).await + } + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result { + self.get_type(oid).await + } +} + +impl private::Sealed for Transaction<'_> {} + +#[async_trait] +#[allow(clippy::needless_lifetimes)] +impl GenericClient for Transaction<'_> { + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send, + { + self.query_raw_txt(statement, params).await + } + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result { + self.client().get_type(oid).await + } +} diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs new file mode 100644 index 0000000000..72ba8172b2 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -0,0 +1,148 @@ +//! An asynchronous, pipelined, PostgreSQL client. +#![warn(rust_2018_idioms, clippy::all, missing_docs)] + +pub use crate::cancel_token::CancelToken; +pub use crate::client::Client; +pub use crate::config::Config; +pub use crate::connection::Connection; +use crate::error::DbError; +pub use crate::error::Error; +pub use crate::generic_client::GenericClient; +pub use crate::query::RowStream; +pub use crate::row::{Row, SimpleQueryRow}; +pub use crate::simple_query::SimpleQueryStream; +pub use crate::statement::{Column, Statement}; +use crate::tls::MakeTlsConnect; +pub use crate::tls::NoTls; +pub use crate::to_statement::ToStatement; +pub use crate::transaction::Transaction; +pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; +use crate::types::ToSql; +use postgres_protocol2::message::backend::ReadyForQueryBody; +use tokio::net::TcpStream; + +/// After executing a query, the connection will be in one of these states +#[derive(Clone, Copy, Debug, PartialEq)] +#[repr(u8)] +pub enum ReadyForQueryStatus { + /// Connection state is unknown + Unknown, + /// Connection is idle (no transactions) + Idle = b'I', + /// Connection is in a transaction block + Transaction = b'T', + /// Connection is in a failed transaction block + FailedTransaction = b'E', +} + +impl From for ReadyForQueryStatus { + fn from(value: ReadyForQueryBody) -> Self { + match value.status() { + b'I' => Self::Idle, + b'T' => Self::Transaction, + b'E' => Self::FailedTransaction, + _ => Self::Unknown, + } + } +} + +mod cancel_query; +mod cancel_query_raw; +mod cancel_token; +mod client; +mod codec; +pub mod config; +mod connect; +mod connect_raw; +mod connect_socket; +mod connect_tls; +mod connection; +pub mod error; +mod generic_client; +pub mod maybe_tls_stream; +mod prepare; +mod query; +pub mod row; +mod simple_query; +mod statement; +pub mod tls; +mod to_statement; +mod transaction; +mod transaction_builder; +pub mod types; + +/// A convenience function which parses a connection string and connects to the database. +/// +/// See the documentation for [`Config`] for details on the connection string format. +/// +/// Requires the `runtime` Cargo feature (enabled by default). +/// +/// [`Config`]: config/struct.Config.html +pub async fn connect( + config: &str, + tls: T, +) -> Result<(Client, Connection), Error> +where + T: MakeTlsConnect, +{ + let config = config.parse::()?; + config.connect(tls).await +} + +/// An asynchronous notification. +#[derive(Clone, Debug)] +pub struct Notification { + process_id: i32, + channel: String, + payload: String, +} + +impl Notification { + /// The process ID of the notifying backend process. + pub fn process_id(&self) -> i32 { + self.process_id + } + + /// The name of the channel that the notify has been raised on. + pub fn channel(&self) -> &str { + &self.channel + } + + /// The "payload" string passed from the notifying process. + pub fn payload(&self) -> &str { + &self.payload + } +} + +/// An asynchronous message from the server. +#[allow(clippy::large_enum_variant)] +#[derive(Debug, Clone)] +#[non_exhaustive] +pub enum AsyncMessage { + /// A notice. + /// + /// Notices use the same format as errors, but aren't "errors" per-se. + Notice(DbError), + /// A notification. + /// + /// Connections can subscribe to notifications with the `LISTEN` command. + Notification(Notification), +} + +/// Message returned by the `SimpleQuery` stream. +#[derive(Debug)] +#[non_exhaustive] +pub enum SimpleQueryMessage { + /// A row of data. + Row(SimpleQueryRow), + /// A statement in the query has completed. + /// + /// The number of rows modified or selected is returned. + CommandComplete(u64), +} + +fn slice_iter<'a>( + s: &'a [&'a (dyn ToSql + Sync)], +) -> impl ExactSizeIterator + 'a { + s.iter().map(|s| *s as _) +} diff --git a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs new file mode 100644 index 0000000000..9a7e248997 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs @@ -0,0 +1,77 @@ +//! MaybeTlsStream. +//! +//! Represents a stream that may or may not be encrypted with TLS. +use crate::tls::{ChannelBinding, TlsStream}; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +/// A stream that may or may not be encrypted with TLS. +pub enum MaybeTlsStream { + /// An unencrypted stream. + Raw(S), + /// An encrypted stream. + Tls(T), +} + +impl AsyncRead for MaybeTlsStream +where + S: AsyncRead + Unpin, + T: AsyncRead + Unpin, +{ + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_read(cx, buf), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_read(cx, buf), + } + } +} + +impl AsyncWrite for MaybeTlsStream +where + S: AsyncWrite + Unpin, + T: AsyncWrite + Unpin, +{ + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_write(cx, buf), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_write(cx, buf), + } + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_flush(cx), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_flush(cx), + } + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_shutdown(cx), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_shutdown(cx), + } + } +} + +impl TlsStream for MaybeTlsStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + fn channel_binding(&self) -> ChannelBinding { + match self { + MaybeTlsStream::Raw(_) => ChannelBinding::none(), + MaybeTlsStream::Tls(s) => s.channel_binding(), + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs new file mode 100644 index 0000000000..da0c755c5b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -0,0 +1,262 @@ +use crate::client::InnerClient; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::error::SqlState; +use crate::types::{Field, Kind, Oid, Type}; +use crate::{query, slice_iter}; +use crate::{Column, Error, Statement}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{pin_mut, TryStreamExt}; +use log::debug; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +pub(crate) const TYPEINFO_QUERY: &str = "\ +SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid +FROM pg_catalog.pg_type t +LEFT OUTER JOIN pg_catalog.pg_range r ON r.rngtypid = t.oid +INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +WHERE t.oid = $1 +"; + +// Range types weren't added until Postgres 9.2, so pg_range may not exist +const TYPEINFO_FALLBACK_QUERY: &str = "\ +SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid +FROM pg_catalog.pg_type t +INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +WHERE t.oid = $1 +"; + +const TYPEINFO_ENUM_QUERY: &str = "\ +SELECT enumlabel +FROM pg_catalog.pg_enum +WHERE enumtypid = $1 +ORDER BY enumsortorder +"; + +// Postgres 9.0 didn't have enumsortorder +const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ +SELECT enumlabel +FROM pg_catalog.pg_enum +WHERE enumtypid = $1 +ORDER BY oid +"; + +pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ +SELECT attname, atttypid +FROM pg_catalog.pg_attribute +WHERE attrelid = $1 +AND NOT attisdropped +AND attnum > 0 +ORDER BY attnum +"; + +static NEXT_ID: AtomicUsize = AtomicUsize::new(0); + +pub async fn prepare( + client: &Arc, + query: &str, + types: &[Type], +) -> Result { + let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); + let buf = encode(client, &name, query, types)?; + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::ParseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let parameter_description = match responses.next().await? { + Message::ParameterDescription(body) => body, + _ => return Err(Error::unexpected_message()), + }; + + let row_description = match responses.next().await? { + Message::RowDescription(body) => Some(body), + Message::NoData => None, + _ => return Err(Error::unexpected_message()), + }; + + let mut parameters = vec![]; + let mut it = parameter_description.parameters(); + while let Some(oid) = it.next().map_err(Error::parse)? { + let type_ = get_type(client, oid).await?; + parameters.push(type_); + } + + let mut columns = vec![]; + if let Some(row_description) = row_description { + let mut it = row_description.fields(); + while let Some(field) = it.next().map_err(Error::parse)? { + let type_ = get_type(client, field.type_oid()).await?; + let column = Column::new(field.name().to_string(), type_, field); + columns.push(column); + } + } + + Ok(Statement::new(client, name, parameters, columns)) +} + +fn prepare_rec<'a>( + client: &'a Arc, + query: &'a str, + types: &'a [Type], +) -> Pin> + 'a + Send>> { + Box::pin(prepare(client, query, types)) +} + +fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { + if types.is_empty() { + debug!("preparing query {}: {}", name, query); + } else { + debug!("preparing query {} with types {:?}: {}", name, types, query); + } + + client.with_buf(|buf| { + frontend::parse(name, query, types.iter().map(Type::oid), buf).map_err(Error::encode)?; + frontend::describe(b'S', name, buf).map_err(Error::encode)?; + frontend::sync(buf); + Ok(buf.split().freeze()) + }) +} + +pub async fn get_type(client: &Arc, oid: Oid) -> Result { + if let Some(type_) = Type::from_oid(oid) { + return Ok(type_); + } + + if let Some(type_) = client.type_(oid) { + return Ok(type_); + } + + let stmt = typeinfo_statement(client).await?; + + let rows = query::query(client, stmt, slice_iter(&[&oid])).await?; + pin_mut!(rows); + + let row = match rows.try_next().await? { + Some(row) => row, + None => return Err(Error::unexpected_message()), + }; + + let name: String = row.try_get(0)?; + let type_: i8 = row.try_get(1)?; + let elem_oid: Oid = row.try_get(2)?; + let rngsubtype: Option = row.try_get(3)?; + let basetype: Oid = row.try_get(4)?; + let schema: String = row.try_get(5)?; + let relid: Oid = row.try_get(6)?; + + let kind = if type_ == b'e' as i8 { + let variants = get_enum_variants(client, oid).await?; + Kind::Enum(variants) + } else if type_ == b'p' as i8 { + Kind::Pseudo + } else if basetype != 0 { + let type_ = get_type_rec(client, basetype).await?; + Kind::Domain(type_) + } else if elem_oid != 0 { + let type_ = get_type_rec(client, elem_oid).await?; + Kind::Array(type_) + } else if relid != 0 { + let fields = get_composite_fields(client, relid).await?; + Kind::Composite(fields) + } else if let Some(rngsubtype) = rngsubtype { + let type_ = get_type_rec(client, rngsubtype).await?; + Kind::Range(type_) + } else { + Kind::Simple + }; + + let type_ = Type::new(name, oid, kind, schema); + client.set_type(oid, &type_); + + Ok(type_) +} + +fn get_type_rec<'a>( + client: &'a Arc, + oid: Oid, +) -> Pin> + Send + 'a>> { + Box::pin(get_type(client, oid)) +} + +async fn typeinfo_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo() { + return Ok(stmt); + } + + let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await { + Ok(stmt) => stmt, + Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { + prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? + } + Err(e) => return Err(e), + }; + + client.set_typeinfo(&stmt); + Ok(stmt) +} + +async fn get_enum_variants(client: &Arc, oid: Oid) -> Result, Error> { + let stmt = typeinfo_enum_statement(client).await?; + + query::query(client, stmt, slice_iter(&[&oid])) + .await? + .and_then(|row| async move { row.try_get(0) }) + .try_collect() + .await +} + +async fn typeinfo_enum_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo_enum() { + return Ok(stmt); + } + + let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await { + Ok(stmt) => stmt, + Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { + prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? + } + Err(e) => return Err(e), + }; + + client.set_typeinfo_enum(&stmt); + Ok(stmt) +} + +async fn get_composite_fields(client: &Arc, oid: Oid) -> Result, Error> { + let stmt = typeinfo_composite_statement(client).await?; + + let rows = query::query(client, stmt, slice_iter(&[&oid])) + .await? + .try_collect::>() + .await?; + + let mut fields = vec![]; + for row in rows { + let name = row.try_get(0)?; + let oid = row.try_get(1)?; + let type_ = get_type_rec(client, oid).await?; + fields.push(Field::new(name, type_)); + } + + Ok(fields) +} + +async fn typeinfo_composite_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo_composite() { + return Ok(stmt); + } + + let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?; + + client.set_typeinfo_composite(&stmt); + Ok(stmt) +} diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs new file mode 100644 index 0000000000..534195a707 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -0,0 +1,340 @@ +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::IsNull; +use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; +use bytes::{BufMut, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Stream}; +use log::{debug, log_enabled, Level}; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use postgres_types2::{Format, ToSql, Type}; +use std::fmt; +use std::marker::PhantomPinned; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]); + +impl fmt::Debug for BorrowToSqlParamsDebug<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list().entries(self.0.iter()).finish() + } +} + +pub async fn query<'a, I>( + client: &InnerClient, + statement: Statement, + params: I, +) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let buf = if log_enabled!(Level::Debug) { + let params = params.into_iter().collect::>(); + debug!( + "executing statement {} with parameters: {:?}", + statement.name(), + BorrowToSqlParamsDebug(params.as_slice()), + ); + encode(client, &statement, params)? + } else { + encode(client, &statement, params)? + }; + let responses = start(client, buf).await?; + Ok(RowStream { + statement, + responses, + command_tag: None, + status: ReadyForQueryStatus::Unknown, + output_format: Format::Binary, + _p: PhantomPinned, + }) +} + +pub async fn query_txt( + client: &Arc, + query: &str, + params: I, +) -> Result +where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, +{ + let params = params.into_iter(); + + let buf = client.with_buf(|buf| { + frontend::parse( + "", // unnamed prepared statement + query, // query to parse + std::iter::empty(), // give no type info + buf, + ) + .map_err(Error::encode)?; + frontend::describe(b'S', "", buf).map_err(Error::encode)?; + // Bind, pass params as text, retrieve as binary + match frontend::bind( + "", // empty string selects the unnamed portal + "", // unnamed prepared statement + std::iter::empty(), // all parameters use the default format (text) + params, + |param, buf| match param { + Some(param) => { + buf.put_slice(param.as_ref().as_bytes()); + Ok(postgres_protocol2::IsNull::No) + } + None => Ok(postgres_protocol2::IsNull::Yes), + }, + Some(0), // all text + buf, + ) { + Ok(()) => Ok(()), + Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, 0)), + Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)), + }?; + + // Execute + frontend::execute("", 0, buf).map_err(Error::encode)?; + // Sync + frontend::sync(buf); + + Ok(buf.split().freeze()) + })?; + + // now read the responses + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::ParseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let parameter_description = match responses.next().await? { + Message::ParameterDescription(body) => body, + _ => return Err(Error::unexpected_message()), + }; + + let row_description = match responses.next().await? { + Message::RowDescription(body) => Some(body), + Message::NoData => None, + _ => return Err(Error::unexpected_message()), + }; + + match responses.next().await? { + Message::BindComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let mut parameters = vec![]; + let mut it = parameter_description.parameters(); + while let Some(oid) = it.next().map_err(Error::parse)? { + let type_ = Type::from_oid(oid).unwrap_or(Type::UNKNOWN); + parameters.push(type_); + } + + let mut columns = vec![]; + if let Some(row_description) = row_description { + let mut it = row_description.fields(); + while let Some(field) = it.next().map_err(Error::parse)? { + let type_ = Type::from_oid(field.type_oid()).unwrap_or(Type::UNKNOWN); + let column = Column::new(field.name().to_string(), type_, field); + columns.push(column); + } + } + + Ok(RowStream { + statement: Statement::new_anonymous(parameters, columns), + responses, + command_tag: None, + status: ReadyForQueryStatus::Unknown, + output_format: Format::Text, + _p: PhantomPinned, + }) +} + +pub async fn execute<'a, I>( + client: &InnerClient, + statement: Statement, + params: I, +) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let buf = if log_enabled!(Level::Debug) { + let params = params.into_iter().collect::>(); + debug!( + "executing statement {} with parameters: {:?}", + statement.name(), + BorrowToSqlParamsDebug(params.as_slice()), + ); + encode(client, &statement, params)? + } else { + encode(client, &statement, params)? + }; + let mut responses = start(client, buf).await?; + + let mut rows = 0; + loop { + match responses.next().await? { + Message::DataRow(_) => {} + Message::CommandComplete(body) => { + rows = body + .tag() + .map_err(Error::parse)? + .rsplit(' ') + .next() + .unwrap() + .parse() + .unwrap_or(0); + } + Message::EmptyQueryResponse => rows = 0, + Message::ReadyForQuery(_) => return Ok(rows), + _ => return Err(Error::unexpected_message()), + } + } +} + +async fn start(client: &InnerClient, buf: Bytes) -> Result { + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::BindComplete => {} + _ => return Err(Error::unexpected_message()), + } + + Ok(responses) +} + +pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + client.with_buf(|buf| { + encode_bind(statement, params, "", buf)?; + frontend::execute("", 0, buf).map_err(Error::encode)?; + frontend::sync(buf); + Ok(buf.split().freeze()) + }) +} + +pub fn encode_bind<'a, I>( + statement: &Statement, + params: I, + portal: &str, + buf: &mut BytesMut, +) -> Result<(), Error> +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let param_types = statement.params(); + let params = params.into_iter(); + + assert!( + param_types.len() == params.len(), + "expected {} parameters but got {}", + param_types.len(), + params.len() + ); + + let (param_formats, params): (Vec<_>, Vec<_>) = params + .zip(param_types.iter()) + .map(|(p, ty)| (p.encode_format(ty) as i16, p)) + .unzip(); + + let params = params.into_iter(); + + let mut error_idx = 0; + let r = frontend::bind( + portal, + statement.name(), + param_formats, + params.zip(param_types).enumerate(), + |(idx, (param, ty)), buf| match param.to_sql_checked(ty, buf) { + Ok(IsNull::No) => Ok(postgres_protocol2::IsNull::No), + Ok(IsNull::Yes) => Ok(postgres_protocol2::IsNull::Yes), + Err(e) => { + error_idx = idx; + Err(e) + } + }, + Some(1), + buf, + ); + match r { + Ok(()) => Ok(()), + Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, error_idx)), + Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)), + } +} + +pin_project! { + /// A stream of table rows. + pub struct RowStream { + statement: Statement, + responses: Responses, + command_tag: Option, + output_format: Format, + status: ReadyForQueryStatus, + #[pin] + _p: PhantomPinned, + } +} + +impl Stream for RowStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + loop { + match ready!(this.responses.poll_next(cx)?) { + Message::DataRow(body) => { + return Poll::Ready(Some(Ok(Row::new( + this.statement.clone(), + body, + *this.output_format, + )?))) + } + Message::EmptyQueryResponse | Message::PortalSuspended => {} + Message::CommandComplete(body) => { + if let Ok(tag) = body.tag() { + *this.command_tag = Some(tag.to_string()); + } + } + Message::ReadyForQuery(status) => { + *this.status = status.into(); + return Poll::Ready(None); + } + _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), + } + } + } +} + +impl RowStream { + /// Returns information about the columns of data in the row. + pub fn columns(&self) -> &[Column] { + self.statement.columns() + } + + /// Returns the command tag of this query. + /// + /// This is only available after the stream has been exhausted. + pub fn command_tag(&self) -> Option { + self.command_tag.clone() + } + + /// Returns if the connection is ready for querying, with the status of the connection. + /// + /// This might be available only after the stream has been exhausted. + pub fn ready_status(&self) -> ReadyForQueryStatus { + self.status + } +} diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs new file mode 100644 index 0000000000..10e130707d --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -0,0 +1,300 @@ +//! Rows. + +use crate::row::sealed::{AsName, Sealed}; +use crate::simple_query::SimpleColumn; +use crate::statement::Column; +use crate::types::{FromSql, Type, WrongType}; +use crate::{Error, Statement}; +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::DataRowBody; +use postgres_types2::{Format, WrongFormat}; +use std::fmt; +use std::ops::Range; +use std::str; +use std::sync::Arc; + +mod sealed { + pub trait Sealed {} + + pub trait AsName { + fn as_name(&self) -> &str; + } +} + +impl AsName for Column { + fn as_name(&self) -> &str { + self.name() + } +} + +impl AsName for String { + fn as_name(&self) -> &str { + self + } +} + +/// A trait implemented by types that can index into columns of a row. +/// +/// This cannot be implemented outside of this crate. +pub trait RowIndex: Sealed { + #[doc(hidden)] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName; +} + +impl Sealed for usize {} + +impl RowIndex for usize { + #[inline] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName, + { + if *self >= columns.len() { + None + } else { + Some(*self) + } + } +} + +impl Sealed for str {} + +impl RowIndex for str { + #[inline] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName, + { + if let Some(idx) = columns.iter().position(|d| d.as_name() == self) { + return Some(idx); + }; + + // FIXME ASCII-only case insensitivity isn't really the right thing to + // do. Postgres itself uses a dubious wrapper around tolower and JDBC + // uses the US locale. + columns + .iter() + .position(|d| d.as_name().eq_ignore_ascii_case(self)) + } +} + +impl Sealed for &T where T: ?Sized + Sealed {} + +impl RowIndex for &T +where + T: ?Sized + RowIndex, +{ + #[inline] + fn __idx(&self, columns: &[U]) -> Option + where + U: AsName, + { + T::__idx(*self, columns) + } +} + +/// A row of data returned from the database by a query. +pub struct Row { + statement: Statement, + output_format: Format, + body: DataRowBody, + ranges: Vec>>, +} + +impl fmt::Debug for Row { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Row") + .field("columns", &self.columns()) + .finish() + } +} + +impl Row { + pub(crate) fn new( + statement: Statement, + body: DataRowBody, + output_format: Format, + ) -> Result { + let ranges = body.ranges().collect().map_err(Error::parse)?; + Ok(Row { + statement, + body, + ranges, + output_format, + }) + } + + /// Returns information about the columns of data in the row. + pub fn columns(&self) -> &[Column] { + self.statement.columns() + } + + /// Determines if the row contains no values. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of values in the row. + pub fn len(&self) -> usize { + self.columns().len() + } + + /// Deserializes a value from the row. + /// + /// The value can be specified either by its numeric index in the row, or by its column name. + /// + /// # Panics + /// + /// Panics if the index is out of bounds or if the value cannot be converted to the specified type. + pub fn get<'a, I, T>(&'a self, idx: I) -> T + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + match self.get_inner(&idx) { + Ok(ok) => ok, + Err(err) => panic!("error retrieving column {}: {}", idx, err), + } + } + + /// Like `Row::get`, but returns a `Result` rather than panicking. + pub fn try_get<'a, I, T>(&'a self, idx: I) -> Result + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + self.get_inner(&idx) + } + + fn get_inner<'a, I, T>(&'a self, idx: &I) -> Result + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + let idx = match idx.__idx(self.columns()) { + Some(idx) => idx, + None => return Err(Error::column(idx.to_string())), + }; + + let ty = self.columns()[idx].type_(); + if !T::accepts(ty) { + return Err(Error::from_sql( + Box::new(WrongType::new::(ty.clone())), + idx, + )); + } + + FromSql::from_sql_nullable(ty, self.col_buffer(idx)).map_err(|e| Error::from_sql(e, idx)) + } + + /// Get the raw bytes for the column at the given index. + fn col_buffer(&self, idx: usize) -> Option<&[u8]> { + let range = self.ranges.get(idx)?.to_owned()?; + Some(&self.body.buffer()[range]) + } + + /// Interpret the column at the given index as text + /// + /// Useful when using query_raw_txt() which sets text transfer mode + pub fn as_text(&self, idx: usize) -> Result, Error> { + if self.output_format == Format::Text { + match self.col_buffer(idx) { + Some(raw) => { + FromSql::from_sql(&Type::TEXT, raw).map_err(|e| Error::from_sql(e, idx)) + } + None => Ok(None), + } + } else { + Err(Error::from_sql(Box::new(WrongFormat {}), idx)) + } + } + + /// Row byte size + pub fn body_len(&self) -> usize { + self.body.buffer().len() + } +} + +impl AsName for SimpleColumn { + fn as_name(&self) -> &str { + self.name() + } +} + +/// A row of data returned from the database by a simple query. +#[derive(Debug)] +pub struct SimpleQueryRow { + columns: Arc<[SimpleColumn]>, + body: DataRowBody, + ranges: Vec>>, +} + +impl SimpleQueryRow { + #[allow(clippy::new_ret_no_self)] + pub(crate) fn new( + columns: Arc<[SimpleColumn]>, + body: DataRowBody, + ) -> Result { + let ranges = body.ranges().collect().map_err(Error::parse)?; + Ok(SimpleQueryRow { + columns, + body, + ranges, + }) + } + + /// Returns information about the columns of data in the row. + pub fn columns(&self) -> &[SimpleColumn] { + &self.columns + } + + /// Determines if the row contains no values. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of values in the row. + pub fn len(&self) -> usize { + self.columns.len() + } + + /// Returns a value from the row. + /// + /// The value can be specified either by its numeric index in the row, or by its column name. + /// + /// # Panics + /// + /// Panics if the index is out of bounds or if the value cannot be converted to the specified type. + pub fn get(&self, idx: I) -> Option<&str> + where + I: RowIndex + fmt::Display, + { + match self.get_inner(&idx) { + Ok(ok) => ok, + Err(err) => panic!("error retrieving column {}: {}", idx, err), + } + } + + /// Like `SimpleQueryRow::get`, but returns a `Result` rather than panicking. + pub fn try_get(&self, idx: I) -> Result, Error> + where + I: RowIndex + fmt::Display, + { + self.get_inner(&idx) + } + + fn get_inner(&self, idx: &I) -> Result, Error> + where + I: RowIndex + fmt::Display, + { + let idx = match idx.__idx(&self.columns) { + Some(idx) => idx, + None => return Err(Error::column(idx.to_string())), + }; + + let buf = self.ranges[idx].clone().map(|r| &self.body.buffer()[r]); + FromSql::from_sql_nullable(&Type::TEXT, buf).map_err(|e| Error::from_sql(e, idx)) + } +} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs new file mode 100644 index 0000000000..fb2550377b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -0,0 +1,142 @@ +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Stream}; +use log::debug; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use std::marker::PhantomPinned; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +/// Information about a column of a single query row. +#[derive(Debug)] +pub struct SimpleColumn { + name: String, +} + +impl SimpleColumn { + pub(crate) fn new(name: String) -> SimpleColumn { + SimpleColumn { name } + } + + /// Returns the name of the column. + pub fn name(&self) -> &str { + &self.name + } +} + +pub async fn simple_query(client: &InnerClient, query: &str) -> Result { + debug!("executing simple query: {}", query); + + let buf = encode(client, query)?; + let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + Ok(SimpleQueryStream { + responses, + columns: None, + status: ReadyForQueryStatus::Unknown, + _p: PhantomPinned, + }) +} + +pub async fn batch_execute( + client: &InnerClient, + query: &str, +) -> Result { + debug!("executing statement batch: {}", query); + + let buf = encode(client, query)?; + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + loop { + match responses.next().await? { + Message::ReadyForQuery(status) => return Ok(status.into()), + Message::CommandComplete(_) + | Message::EmptyQueryResponse + | Message::RowDescription(_) + | Message::DataRow(_) => {} + _ => return Err(Error::unexpected_message()), + } + } +} + +pub(crate) fn encode(client: &InnerClient, query: &str) -> Result { + client.with_buf(|buf| { + frontend::query(query, buf).map_err(Error::encode)?; + Ok(buf.split().freeze()) + }) +} + +pin_project! { + /// A stream of simple query results. + pub struct SimpleQueryStream { + responses: Responses, + columns: Option>, + status: ReadyForQueryStatus, + #[pin] + _p: PhantomPinned, + } +} + +impl SimpleQueryStream { + /// Returns if the connection is ready for querying, with the status of the connection. + /// + /// This might be available only after the stream has been exhausted. + pub fn ready_status(&self) -> ReadyForQueryStatus { + self.status + } +} + +impl Stream for SimpleQueryStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + loop { + match ready!(this.responses.poll_next(cx)?) { + Message::CommandComplete(body) => { + let rows = body + .tag() + .map_err(Error::parse)? + .rsplit(' ') + .next() + .unwrap() + .parse() + .unwrap_or(0); + return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(rows)))); + } + Message::EmptyQueryResponse => { + return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(0)))); + } + Message::RowDescription(body) => { + let columns = body + .fields() + .map(|f| Ok(SimpleColumn::new(f.name().to_string()))) + .collect::>() + .map_err(Error::parse)? + .into(); + + *this.columns = Some(columns); + } + Message::DataRow(body) => { + let row = match &this.columns { + Some(columns) => SimpleQueryRow::new(columns.clone(), body)?, + None => return Poll::Ready(Some(Err(Error::unexpected_message()))), + }; + return Poll::Ready(Some(Ok(SimpleQueryMessage::Row(row)))); + } + Message::ReadyForQuery(s) => { + *this.status = s.into(); + return Poll::Ready(None); + } + _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), + } + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs new file mode 100644 index 0000000000..22e160fc05 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -0,0 +1,157 @@ +use crate::client::InnerClient; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::Type; +use postgres_protocol2::{ + message::{backend::Field, frontend}, + Oid, +}; +use std::{ + fmt, + sync::{Arc, Weak}, +}; + +struct StatementInner { + client: Weak, + name: String, + params: Vec, + columns: Vec, +} + +impl Drop for StatementInner { + fn drop(&mut self) { + if let Some(client) = self.client.upgrade() { + let buf = client.with_buf(|buf| { + frontend::close(b'S', &self.name, buf).unwrap(); + frontend::sync(buf); + buf.split().freeze() + }); + let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } + } +} + +/// A prepared statement. +/// +/// Prepared statements can only be used with the connection that created them. +#[derive(Clone)] +pub struct Statement(Arc); + +impl Statement { + pub(crate) fn new( + inner: &Arc, + name: String, + params: Vec, + columns: Vec, + ) -> Statement { + Statement(Arc::new(StatementInner { + client: Arc::downgrade(inner), + name, + params, + columns, + })) + } + + pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { + Statement(Arc::new(StatementInner { + client: Weak::new(), + name: String::new(), + params, + columns, + })) + } + + pub(crate) fn name(&self) -> &str { + &self.0.name + } + + /// Returns the expected types of the statement's parameters. + pub fn params(&self) -> &[Type] { + &self.0.params + } + + /// Returns information about the columns returned when the statement is queried. + pub fn columns(&self) -> &[Column] { + &self.0.columns + } +} + +/// Information about a column of a query. +pub struct Column { + name: String, + type_: Type, + + // raw fields from RowDescription + table_oid: Oid, + column_id: i16, + format: i16, + + // that better be stored in self.type_, but that is more radical refactoring + type_oid: Oid, + type_size: i16, + type_modifier: i32, +} + +impl Column { + pub(crate) fn new(name: String, type_: Type, raw_field: Field<'_>) -> Column { + Column { + name, + type_, + table_oid: raw_field.table_oid(), + column_id: raw_field.column_id(), + format: raw_field.format(), + type_oid: raw_field.type_oid(), + type_size: raw_field.type_size(), + type_modifier: raw_field.type_modifier(), + } + } + + /// Returns the name of the column. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns the type of the column. + pub fn type_(&self) -> &Type { + &self.type_ + } + + /// Returns the table OID of the column. + pub fn table_oid(&self) -> Oid { + self.table_oid + } + + /// Returns the column ID of the column. + pub fn column_id(&self) -> i16 { + self.column_id + } + + /// Returns the format of the column. + pub fn format(&self) -> i16 { + self.format + } + + /// Returns the type OID of the column. + pub fn type_oid(&self) -> Oid { + self.type_oid + } + + /// Returns the type size of the column. + pub fn type_size(&self) -> i16 { + self.type_size + } + + /// Returns the type modifier of the column. + pub fn type_modifier(&self) -> i32 { + self.type_modifier + } +} + +impl fmt::Debug for Column { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Column") + .field("name", &self.name) + .field("type", &self.type_) + .finish() + } +} diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs new file mode 100644 index 0000000000..dc8140719f --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/tls.rs @@ -0,0 +1,162 @@ +//! TLS support. + +use std::error::Error; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::{fmt, io}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +pub(crate) mod private { + pub struct ForcePrivateApi; +} + +/// Channel binding information returned from a TLS handshake. +pub struct ChannelBinding { + pub(crate) tls_server_end_point: Option>, +} + +impl ChannelBinding { + /// Creates a `ChannelBinding` containing no information. + pub fn none() -> ChannelBinding { + ChannelBinding { + tls_server_end_point: None, + } + } + + /// Creates a `ChannelBinding` containing `tls-server-end-point` channel binding information. + pub fn tls_server_end_point(tls_server_end_point: Vec) -> ChannelBinding { + ChannelBinding { + tls_server_end_point: Some(tls_server_end_point), + } + } +} + +/// A constructor of `TlsConnect`ors. +/// +/// Requires the `runtime` Cargo feature (enabled by default). +pub trait MakeTlsConnect { + /// The stream type created by the `TlsConnect` implementation. + type Stream: TlsStream + Unpin; + /// The `TlsConnect` implementation created by this type. + type TlsConnect: TlsConnect; + /// The error type returned by the `TlsConnect` implementation. + type Error: Into>; + + /// Creates a new `TlsConnect`or. + /// + /// The domain name is provided for certificate verification and SNI. + fn make_tls_connect(&mut self, domain: &str) -> Result; +} + +/// An asynchronous function wrapping a stream in a TLS session. +pub trait TlsConnect { + /// The stream returned by the future. + type Stream: TlsStream + Unpin; + /// The error returned by the future. + type Error: Into>; + /// The future returned by the connector. + type Future: Future>; + + /// Returns a future performing a TLS handshake over the stream. + fn connect(self, stream: S) -> Self::Future; + + #[doc(hidden)] + fn can_connect(&self, _: private::ForcePrivateApi) -> bool { + true + } +} + +/// A TLS-wrapped connection to a PostgreSQL database. +pub trait TlsStream: AsyncRead + AsyncWrite { + /// Returns channel binding information for the session. + fn channel_binding(&self) -> ChannelBinding; +} + +/// A `MakeTlsConnect` and `TlsConnect` implementation which simply returns an error. +/// +/// This can be used when `sslmode` is `none` or `prefer`. +#[derive(Debug, Copy, Clone)] +pub struct NoTls; + +impl MakeTlsConnect for NoTls { + type Stream = NoTlsStream; + type TlsConnect = NoTls; + type Error = NoTlsError; + + fn make_tls_connect(&mut self, _: &str) -> Result { + Ok(NoTls) + } +} + +impl TlsConnect for NoTls { + type Stream = NoTlsStream; + type Error = NoTlsError; + type Future = NoTlsFuture; + + fn connect(self, _: S) -> NoTlsFuture { + NoTlsFuture(()) + } + + fn can_connect(&self, _: private::ForcePrivateApi) -> bool { + false + } +} + +/// The future returned by `NoTls`. +pub struct NoTlsFuture(()); + +impl Future for NoTlsFuture { + type Output = Result; + + fn poll(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll { + Poll::Ready(Err(NoTlsError(()))) + } +} + +/// The TLS "stream" type produced by the `NoTls` connector. +/// +/// Since `NoTls` doesn't support TLS, this type is uninhabited. +pub enum NoTlsStream {} + +impl AsyncRead for NoTlsStream { + fn poll_read( + self: Pin<&mut Self>, + _: &mut Context<'_>, + _: &mut ReadBuf<'_>, + ) -> Poll> { + match *self {} + } +} + +impl AsyncWrite for NoTlsStream { + fn poll_write(self: Pin<&mut Self>, _: &mut Context<'_>, _: &[u8]) -> Poll> { + match *self {} + } + + fn poll_flush(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> { + match *self {} + } + + fn poll_shutdown(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> { + match *self {} + } +} + +impl TlsStream for NoTlsStream { + fn channel_binding(&self) -> ChannelBinding { + match *self {} + } +} + +/// The error returned by `NoTls`. +#[derive(Debug)] +pub struct NoTlsError(()); + +impl fmt::Display for NoTlsError { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.write_str("no TLS implementation configured") + } +} + +impl Error for NoTlsError {} diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs new file mode 100644 index 0000000000..427f77dd79 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/to_statement.rs @@ -0,0 +1,57 @@ +use crate::to_statement::private::{Sealed, ToStatementType}; +use crate::Statement; + +mod private { + use crate::{Client, Error, Statement}; + + pub trait Sealed {} + + pub enum ToStatementType<'a> { + Statement(&'a Statement), + Query(&'a str), + } + + impl<'a> ToStatementType<'a> { + pub async fn into_statement(self, client: &Client) -> Result { + match self { + ToStatementType::Statement(s) => Ok(s.clone()), + ToStatementType::Query(s) => client.prepare(s).await, + } + } + } +} + +/// A trait abstracting over prepared and unprepared statements. +/// +/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which +/// was prepared previously. +/// +/// This trait is "sealed" and cannot be implemented by anything outside this crate. +pub trait ToStatement: Sealed { + #[doc(hidden)] + fn __convert(&self) -> ToStatementType<'_>; +} + +impl ToStatement for Statement { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Statement(self) + } +} + +impl Sealed for Statement {} + +impl ToStatement for str { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Query(self) + } +} + +impl Sealed for str {} + +impl ToStatement for String { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Query(self) + } +} + +impl Sealed for String {} diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs new file mode 100644 index 0000000000..03a57e4947 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -0,0 +1,74 @@ +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::query::RowStream; +use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; +use postgres_protocol2::message::frontend; + +/// A representation of a PostgreSQL database transaction. +/// +/// Transactions will implicitly roll back when dropped. Use the `commit` method to commit the changes made in the +/// transaction. Transactions can be nested, with inner transactions implemented via safepoints. +pub struct Transaction<'a> { + client: &'a mut Client, + done: bool, +} + +impl Drop for Transaction<'_> { + fn drop(&mut self) { + if self.done { + return; + } + + let buf = self.client.inner().with_buf(|buf| { + frontend::query("ROLLBACK", buf).unwrap(); + buf.split().freeze() + }); + let _ = self + .client + .inner() + .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } +} + +impl<'a> Transaction<'a> { + pub(crate) fn new(client: &'a mut Client) -> Transaction<'a> { + Transaction { + client, + done: false, + } + } + + /// Consumes the transaction, committing all changes made within it. + pub async fn commit(mut self) -> Result { + self.done = true; + self.client.batch_execute("COMMIT").await + } + + /// Rolls the transaction back, discarding all changes made within it. + /// + /// This is equivalent to `Transaction`'s `Drop` implementation, but provides any error encountered to the caller. + pub async fn rollback(mut self) -> Result { + self.done = true; + self.client.batch_execute("ROLLBACK").await + } + + /// Like `Client::query_raw_txt`. + pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, + { + self.client.query_raw_txt(statement, params).await + } + + /// Like `Client::cancel_token`. + pub fn cancel_token(&self) -> CancelToken { + self.client.cancel_token() + } + + /// Returns a reference to the underlying `Client`. + pub fn client(&self) -> &Client { + self.client + } +} diff --git a/libs/proxy/tokio-postgres2/src/transaction_builder.rs b/libs/proxy/tokio-postgres2/src/transaction_builder.rs new file mode 100644 index 0000000000..9718ac588c --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/transaction_builder.rs @@ -0,0 +1,113 @@ +use crate::{Client, Error, Transaction}; + +/// The isolation level of a database transaction. +#[derive(Debug, Copy, Clone)] +#[non_exhaustive] +pub enum IsolationLevel { + /// Equivalent to `ReadCommitted`. + ReadUncommitted, + + /// An individual statement in the transaction will see rows committed before it began. + ReadCommitted, + + /// All statements in the transaction will see the same view of rows committed before the first query in the + /// transaction. + RepeatableRead, + + /// The reads and writes in this transaction must be able to be committed as an atomic "unit" with respect to reads + /// and writes of all other concurrent serializable transactions without interleaving. + Serializable, +} + +/// A builder for database transactions. +pub struct TransactionBuilder<'a> { + client: &'a mut Client, + isolation_level: Option, + read_only: Option, + deferrable: Option, +} + +impl<'a> TransactionBuilder<'a> { + pub(crate) fn new(client: &'a mut Client) -> TransactionBuilder<'a> { + TransactionBuilder { + client, + isolation_level: None, + read_only: None, + deferrable: None, + } + } + + /// Sets the isolation level of the transaction. + pub fn isolation_level(mut self, isolation_level: IsolationLevel) -> Self { + self.isolation_level = Some(isolation_level); + self + } + + /// Sets the access mode of the transaction. + pub fn read_only(mut self, read_only: bool) -> Self { + self.read_only = Some(read_only); + self + } + + /// Sets the deferrability of the transaction. + /// + /// If the transaction is also serializable and read only, creation of the transaction may block, but when it + /// completes the transaction is able to run with less overhead and a guarantee that it will not be aborted due to + /// serialization failure. + pub fn deferrable(mut self, deferrable: bool) -> Self { + self.deferrable = Some(deferrable); + self + } + + /// Begins the transaction. + /// + /// The transaction will roll back by default - use the `commit` method to commit it. + pub async fn start(self) -> Result, Error> { + let mut query = "START TRANSACTION".to_string(); + let mut first = true; + + if let Some(level) = self.isolation_level { + first = false; + + query.push_str(" ISOLATION LEVEL "); + let level = match level { + IsolationLevel::ReadUncommitted => "READ UNCOMMITTED", + IsolationLevel::ReadCommitted => "READ COMMITTED", + IsolationLevel::RepeatableRead => "REPEATABLE READ", + IsolationLevel::Serializable => "SERIALIZABLE", + }; + query.push_str(level); + } + + if let Some(read_only) = self.read_only { + if !first { + query.push(','); + } + first = false; + + let s = if read_only { + " READ ONLY" + } else { + " READ WRITE" + }; + query.push_str(s); + } + + if let Some(deferrable) = self.deferrable { + if !first { + query.push(','); + } + + let s = if deferrable { + " DEFERRABLE" + } else { + " NOT DEFERRABLE" + }; + query.push_str(s); + } + + self.client.batch_execute(&query).await?; + + Ok(Transaction::new(self.client)) + } +} diff --git a/libs/proxy/tokio-postgres2/src/types.rs b/libs/proxy/tokio-postgres2/src/types.rs new file mode 100644 index 0000000000..e571d7ee00 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/types.rs @@ -0,0 +1,6 @@ +//! Types. +//! +//! This module is a reexport of the `postgres_types` crate. + +#[doc(inline)] +pub use postgres_types2::*; diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 1665d6361a..0d774d529d 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -55,6 +55,7 @@ parquet.workspace = true parquet_derive.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true +postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true prometheus.workspace = true rand.workspace = true @@ -80,8 +81,7 @@ subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tokio-postgres = { workspace = true, features = ["with-serde_json-1"] } -tokio-postgres-rustls.workspace = true +tokio-postgres = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } @@ -96,7 +96,6 @@ utils.workspace = true uuid.workspace = true rustls-native-certs.workspace = true x509-parser.workspace = true -postgres-protocol.workspace = true redis.workspace = true zerocopy.workspace = true @@ -117,6 +116,5 @@ tokio-tungstenite.workspace = true pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true -tokio-postgres-rustls.workspace = true walkdir.workspace = true rand_distr = "0.4" diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 8408d4720b..2abe88ac88 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -13,7 +13,6 @@ use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; -use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -24,6 +23,7 @@ use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::postgres_rustls::MakeRustlsConnect; use crate::proxy::neon_option; use crate::types::Host; @@ -244,7 +244,6 @@ impl ConnCfg { let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); let host = match host { Host::Tcp(host) => host.as_str(), - Host::Unix(_) => continue, // unix sockets are not welcome here }; match connect_once(host, *port).await { @@ -315,7 +314,7 @@ impl ConnCfg { }; let client_config = client_config.with_no_client_auth(); - let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); let tls = >::make_tls_connect( &mut mk_tls, host, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 5c19a23e36..4a063a5faa 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -414,6 +414,7 @@ impl RequestContextInner { outcome, }); } + if let Some(tx) = self.sender.take() { // If type changes, this error handling needs to be updated. let tx: mpsc::UnboundedSender = tx; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ad7e1d2771..ba69f9cf2d 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -88,6 +88,7 @@ pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; +pub mod postgres_rustls; pub mod protocol2; pub mod proxy; pub mod rate_limiter; diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/postgres_rustls/mod.rs new file mode 100644 index 0000000000..31e7915e89 --- /dev/null +++ b/proxy/src/postgres_rustls/mod.rs @@ -0,0 +1,158 @@ +use std::convert::TryFrom; +use std::sync::Arc; + +use rustls::pki_types::ServerName; +use rustls::ClientConfig; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_postgres::tls::MakeTlsConnect; + +mod private { + use std::future::Future; + use std::io; + use std::pin::Pin; + use std::task::{Context, Poll}; + + use rustls::pki_types::ServerName; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + use tokio_postgres::tls::{ChannelBinding, TlsConnect}; + use tokio_rustls::client::TlsStream; + use tokio_rustls::TlsConnector; + + use crate::config::TlsServerEndPoint; + + pub struct TlsConnectFuture { + inner: tokio_rustls::Connect, + } + + impl Future for TlsConnectFuture + where + S: AsyncRead + AsyncWrite + Unpin, + { + type Output = io::Result>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + Pin::new(&mut self.inner).poll(cx).map_ok(RustlsStream) + } + } + + pub struct RustlsConnect(pub RustlsConnectData); + + pub struct RustlsConnectData { + pub hostname: ServerName<'static>, + pub connector: TlsConnector, + } + + impl TlsConnect for RustlsConnect + where + S: AsyncRead + AsyncWrite + Unpin + Send + 'static, + { + type Stream = RustlsStream; + type Error = io::Error; + type Future = TlsConnectFuture; + + fn connect(self, stream: S) -> Self::Future { + TlsConnectFuture { + inner: self.0.connector.connect(self.0.hostname, stream), + } + } + } + + pub struct RustlsStream(TlsStream); + + impl tokio_postgres::tls::TlsStream for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn channel_binding(&self) -> ChannelBinding { + let (_, session) = self.0.get_ref(); + match session.peer_certificates() { + Some([cert, ..]) => TlsServerEndPoint::new(cert) + .ok() + .and_then(|cb| match cb { + TlsServerEndPoint::Sha256(hash) => Some(hash), + TlsServerEndPoint::Undefined => None, + }) + .map_or_else(ChannelBinding::none, |hash| { + ChannelBinding::tls_server_end_point(hash.to_vec()) + }), + _ => ChannelBinding::none(), + } + } + } + + impl AsyncRead for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_read(cx, buf) + } + } + + impl AsyncWrite for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + Pin::new(&mut self.0).poll_write(cx, buf) + } + + fn poll_flush( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_flush(cx) + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_shutdown(cx) + } + } +} + +/// A `MakeTlsConnect` implementation using `rustls`. +/// +/// That way you can connect to PostgreSQL using `rustls` as the TLS stack. +#[derive(Clone)] +pub struct MakeRustlsConnect { + config: Arc, +} + +impl MakeRustlsConnect { + /// Creates a new `MakeRustlsConnect` from the provided `ClientConfig`. + #[must_use] + pub fn new(config: ClientConfig) -> Self { + Self { + config: Arc::new(config), + } + } +} + +impl MakeTlsConnect for MakeRustlsConnect +where + S: AsyncRead + AsyncWrite + Unpin + Send + 'static, +{ + type Stream = private::RustlsStream; + type TlsConnect = private::RustlsConnect; + type Error = rustls::pki_types::InvalidDnsNameError; + + fn make_tls_connect(&mut self, hostname: &str) -> Result { + ServerName::try_from(hostname).map(|dns_name| { + private::RustlsConnect(private::RustlsConnectData { + hostname: dns_name.to_owned(), + connector: Arc::clone(&self.config).into(), + }) + }) + } +} diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3de8ca8736..2c2c2964b6 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -14,7 +14,6 @@ use rustls::pki_types; use tokio::io::DuplexStream; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::MakeRustlsConnect; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -29,6 +28,7 @@ use crate::control_plane::{ self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; +use crate::postgres_rustls::MakeRustlsConnect; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{sasl, scram}; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 3037e20888..75909f3358 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -333,7 +333,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.query("select auth.init()", &[]).await { + if let Err(e) = client.execute("select auth.init()", &[]).await { discard.discard(); return Err(e.into()); } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index bd262f45ed..c302eac568 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -6,9 +6,10 @@ use std::task::{ready, Poll}; use futures::future::poll_fn; use futures::Future; use smallvec::SmallVec; +use tokio::net::TcpStream; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::{AsyncMessage, Socket}; +use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; #[cfg(test)] @@ -57,7 +58,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 9abe35db08..db9ac49dae 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -24,10 +24,11 @@ use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use serde_json::value::RawValue; use signature::Signer; +use tokio::net::TcpStream; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; -use tokio_postgres::{AsyncMessage, Socket}; +use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, info_span, warn, Instrument}; @@ -163,7 +164,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: tokio_postgres::Connection, key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, @@ -286,11 +287,11 @@ impl ClientInnerCommon { let token = resign_jwt(&local_data.key, payload, local_data.jti)?; // initiates the auth session - self.inner.simple_query("discard all").await?; + self.inner.batch_execute("discard all").await?; self.inner - .query( + .execute( "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], + &[&&*token as &(dyn ToSql + Sync)], ) .await?; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a73d9d6352..c0a3abc377 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,6 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", default-features = false, features = ["with-serde_json-1"] } prost = { version = "0.13", features = ["prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -79,8 +78,7 @@ subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } tikv-jemalloc-sys = { version = "0.6", features = ["stats"] } time = { version = "0.3", features = ["macros", "serde-well-known"] } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", features = ["with-serde_json-1"] } +tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } From 008616cfe696b219d39a53baee396748e75f9477 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 29 Nov 2024 13:27:49 +0000 Subject: [PATCH 10/65] storage controller: use proper ScheduleContext when evacuating a node (#9908) ## Problem When picking locations for a shard, we should use a ScheduleContext that includes all the other shards in the tenant, so that we apply proper anti-affinity between shards. If we don't do this, then it can lead to unstable scheduling, where we place a shard somewhere that the optimizer will then immediately move it away from. We didn't always do this, because it was a bit awkward to accumulate the context for a tenant rather than just walking tenants. This was a TODO in `handle_node_availability_transition`: ``` // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters // for tenants without secondary locations: if they have a secondary location, then this // schedule() call is just promoting an existing secondary) ``` This is a precursor to https://github.com/neondatabase/neon/issues/8264, where the current imperfect scheduling during node evacuation hampers testing. ## Summary of changes - Add an iterator type that yields each shard along with a schedulecontext that includes all the other shards from the same tenant - Use the iterator to replace hand-crafted logic in optimize_all_plan (functionally identical) - Use the iterator in `handle_node_availability_transition` to apply proper anti-affinity during node evacuation. --- storage_controller/src/scheduler.rs | 17 +- storage_controller/src/service.rs | 200 +++++++----------- .../src/service/context_iterator.rs | 139 ++++++++++++ storage_controller/src/tenant_shard.rs | 11 +- 4 files changed, 245 insertions(+), 122 deletions(-) create mode 100644 storage_controller/src/service/context_iterator.rs diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 2414d95eb8..ecc6b11e47 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -305,7 +305,7 @@ impl std::ops::Add for AffinityScore { /// Hint for whether this is a sincere attempt to schedule, or a speculative /// check for where we _would_ schedule (done during optimization) -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) enum ScheduleMode { Normal, Speculative, @@ -319,7 +319,7 @@ impl Default for ScheduleMode { // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling // it for many shards in the same tenant. -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub(crate) struct ScheduleContext { /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`] pub(crate) nodes: HashMap, @@ -331,6 +331,14 @@ pub(crate) struct ScheduleContext { } impl ScheduleContext { + pub(crate) fn new(mode: ScheduleMode) -> Self { + Self { + nodes: HashMap::new(), + attached_nodes: HashMap::new(), + mode, + } + } + /// Input is a list of nodes we would like to avoid using again within this context. The more /// times a node is passed into this call, the less inclined we are to use it. pub(crate) fn avoid(&mut self, nodes: &[NodeId]) { @@ -355,6 +363,11 @@ impl ScheduleContext { pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { self.attached_nodes.get(&node_id).copied().unwrap_or(0) } + + #[cfg(test)] + pub(crate) fn attach_count(&self) -> usize { + self.attached_nodes.values().sum() + } } pub(crate) enum RefCountUpdate { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 446c476b99..636ccf11a1 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,3 +1,6 @@ +pub mod chaos_injector; +mod context_iterator; + use hyper::Uri; use std::{ borrow::Cow, @@ -95,7 +98,7 @@ use crate::{ }, }; -pub mod chaos_injector; +use context_iterator::TenantShardContextIterator; // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -5498,49 +5501,51 @@ impl Service { let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_shard) in tenants { - if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { - // When a node goes offline, we set its observed configuration to None, indicating unknown: we will - // not assume our knowledge of the node's configuration is accurate until it comes back online - observed_loc.conf = None; - } + for (_tenant_id, mut schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + { + for tenant_shard in shards { + let tenant_shard_id = tenant_shard.tenant_shard_id; + if let Some(observed_loc) = + tenant_shard.observed.locations.get_mut(&node_id) + { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } - if nodes.len() == 1 { - // Special case for single-node cluster: there is no point trying to reschedule - // any tenant shards: avoid doing so, in order to avoid spewing warnings about - // failures to schedule them. - continue; - } + if nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } - if !nodes - .values() - .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) - { - // Special case for when all nodes are unavailable and/or unschedulable: there is no point - // trying to reschedule since there's nowhere else to go. Without this - // branch we incorrectly detach tenants in response to node unavailability. - continue; - } + if !nodes + .values() + .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) + { + // Special case for when all nodes are unavailable and/or unschedulable: there is no point + // trying to reschedule since there's nowhere else to go. Without this + // branch we incorrectly detach tenants in response to node unavailability. + continue; + } - if tenant_shard.intent.demote_attached(scheduler, node_id) { - tenant_shard.sequence = tenant_shard.sequence.next(); + if tenant_shard.intent.demote_attached(scheduler, node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); - // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters - // for tenants without secondary locations: if they have a secondary location, then this - // schedule() call is just promoting an existing secondary) - let mut schedule_context = ScheduleContext::default(); - - match tenant_shard.schedule(scheduler, &mut schedule_context) { - Err(e) => { - // It is possible that some tenants will become unschedulable when too many pageservers - // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantShard a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); - } - Ok(()) => { - if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { - tenants_affected += 1; - }; + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantShard a scheduling error attribute to be queried later. + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); + } + Ok(()) => { + if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { + tenants_affected += 1; + }; + } } } } @@ -6011,14 +6016,8 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); - let mut schedule_context = ScheduleContext::default(); - let mut reconciles_spawned = 0; - for (tenant_shard_id, shard) in tenants.iter_mut() { - if tenant_shard_id.is_shard_zero() { - schedule_context = ScheduleContext::default(); - } - + for shard in tenants.values_mut() { // Skip checking if this shard is already enqueued for reconciliation if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { // If there is something delayed, then return a nonzero count so that @@ -6033,8 +6032,6 @@ impl Service { if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; } - - schedule_context.avoid(&shard.intent.all_pageservers()); } reconciles_spawned @@ -6103,95 +6100,62 @@ impl Service { } fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { - let mut schedule_context = ScheduleContext::default(); - - let mut tenant_shards: Vec<&TenantShard> = Vec::new(); - // How many candidate optimizations we will generate, before evaluating them for readniess: setting // this higher than the execution limit gives us a chance to execute some work even if the first // few optimizations we find are not ready. const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; let mut work = Vec::new(); - let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (tenant_shard_id, shard) in tenants.iter() { - if tenant_shard_id.is_shard_zero() { - // Reset accumulators on the first shard in a tenant - schedule_context = ScheduleContext::default(); - schedule_context.mode = ScheduleMode::Speculative; - tenant_shards.clear(); - } - if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { - break; - } - - match shard.get_scheduling_policy() { - ShardSchedulingPolicy::Active => { - // Ok to do optimization + for (_tenant_id, schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) + { + for shard in shards { + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { + break; } - ShardSchedulingPolicy::Essential - | ShardSchedulingPolicy::Pause - | ShardSchedulingPolicy::Stop => { - // Policy prevents optimizing this shard. - continue; + match shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active => { + // Ok to do optimization + } + ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause + | ShardSchedulingPolicy::Stop => { + // Policy prevents optimizing this shard. + continue; + } } - } - // Accumulate the schedule context for all the shards in a tenant: we must have - // the total view of all shards before we can try to optimize any of them. - schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } - tenant_shards.push(shard); - - // Once we have seen the last shard in the tenant, proceed to search across all shards - // in the tenant for optimizations - if shard.shard.number.0 == shard.shard.count.count() - 1 { - if tenant_shards.iter().any(|s| s.reconciler.is_some()) { + if !matches!(shard.splitting, SplitState::Idle) + || matches!(shard.policy, PlacementPolicy::Detached) + || shard.reconciler.is_some() + { // Do not start any optimizations while another change to the tenant is ongoing: this // is not necessary for correctness, but simplifies operations and implicitly throttles // optimization changes to happen in a "trickle" over time. continue; } - if tenant_shards.iter().any(|s| { - !matches!(s.splitting, SplitState::Idle) - || matches!(s.policy, PlacementPolicy::Detached) - }) { - // Never attempt to optimize a tenant that is currently being split, or - // a tenant that is meant to be detached - continue; - } - // TODO: optimization calculations are relatively expensive: create some fast-path for // the common idle case (avoiding the search on tenants that we have recently checked) - - for shard in &tenant_shards { - if let Some(optimization) = - // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to - // its primary location based on soft constraints, cut it over. - shard.optimize_attachment(nodes, &schedule_context) - { - work.push((shard.tenant_shard_id, optimization)); - break; - } else if let Some(optimization) = - // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be - // better placed on another node, based on ScheduleContext, then adjust it. This - // covers cases like after a shard split, where we might have too many shards - // in the same tenant with secondary locations on the node where they originally split. - shard.optimize_secondary(scheduler, &schedule_context) - { - work.push((shard.tenant_shard_id, optimization)); - break; - } - - // TODO: extend this mechanism to prefer attaching on nodes with fewer attached - // tenants (i.e. extend schedule state to distinguish attached from secondary counts), - // for the total number of attachments on a node (not just within a tenant.) + if let Some(optimization) = + // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // its primary location based on soft constraints, cut it over. + shard.optimize_attachment(nodes, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } else if let Some(optimization) = + // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be + // better placed on another node, based on ScheduleContext, then adjust it. This + // covers cases like after a shard split, where we might have too many shards + // in the same tenant with secondary locations on the node where they originally split. + shard.optimize_secondary(scheduler, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; } } } diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs new file mode 100644 index 0000000000..d38010a27e --- /dev/null +++ b/storage_controller/src/service/context_iterator.rs @@ -0,0 +1,139 @@ +use std::collections::BTreeMap; + +use utils::id::TenantId; +use utils::shard::TenantShardId; + +use crate::scheduler::{ScheduleContext, ScheduleMode}; +use crate::tenant_shard::TenantShard; + +/// When making scheduling decisions, it is useful to have the ScheduleContext for a whole +/// tenant while considering the individual shards within it. This iterator is a helper +/// that gathers all the shards in a tenant and then yields them together with a ScheduleContext +/// for the tenant. +pub(super) struct TenantShardContextIterator<'a> { + schedule_mode: ScheduleMode, + inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>, +} + +impl<'a> TenantShardContextIterator<'a> { + pub(super) fn new( + tenants: &'a mut BTreeMap, + schedule_mode: ScheduleMode, + ) -> Self { + Self { + schedule_mode, + inner: tenants.iter_mut(), + } + } +} + +impl<'a> Iterator for TenantShardContextIterator<'a> { + type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>); + + fn next(&mut self) -> Option { + let mut tenant_shards = Vec::new(); + let mut schedule_context = ScheduleContext::new(self.schedule_mode.clone()); + loop { + let (tenant_shard_id, shard) = self.inner.next()?; + + if tenant_shard_id.is_shard_zero() { + // Cleared on last shard of previous tenant + assert!(tenant_shards.is_empty()); + } + + // Accumulate the schedule context for all the shards in a tenant + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + tenant_shards.push(shard); + + if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 { + return Some((tenant_shard_id.tenant_id, schedule_context, tenant_shards)); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::BTreeMap, str::FromStr}; + + use pageserver_api::controller_api::PlacementPolicy; + use utils::shard::{ShardCount, ShardNumber}; + + use crate::{ + scheduler::test_utils::make_test_nodes, service::Scheduler, + tenant_shard::tests::make_test_tenant_with_id, + }; + + use super::*; + + #[test] + fn test_context_iterator() { + // Hand-crafted tenant IDs to ensure they appear in the expected order when put into + // a btreemap & iterated + let mut t_1_shards = make_test_tenant_with_id( + TenantId::from_str("af0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(1), + None, + ); + let t_2_shards = make_test_tenant_with_id( + TenantId::from_str("bf0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(4), + None, + ); + let mut t_3_shards = make_test_tenant_with_id( + TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(1), + None, + ); + + let t1_id = t_1_shards[0].tenant_shard_id.tenant_id; + let t2_id = t_2_shards[0].tenant_shard_id.tenant_id; + let t3_id = t_3_shards[0].tenant_shard_id.tenant_id; + + let mut tenants = BTreeMap::new(); + tenants.insert(t_1_shards[0].tenant_shard_id, t_1_shards.pop().unwrap()); + for shard in t_2_shards { + tenants.insert(shard.tenant_shard_id, shard); + } + tenants.insert(t_3_shards[0].tenant_shard_id, t_3_shards.pop().unwrap()); + + let nodes = make_test_nodes(3, &[]); + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + for shard in tenants.values_mut() { + shard.schedule(&mut scheduler, &mut context).unwrap(); + } + + let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative); + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t1_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards.len(), 1); + assert_eq!(context.attach_count(), 1); + + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t2_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards[1].tenant_shard_id.shard_number, ShardNumber(1)); + assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2)); + assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3)); + assert_eq!(shards.len(), 4); + assert_eq!(context.attach_count(), 4); + + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t3_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards.len(), 1); + assert_eq!(context.attach_count(), 1); + + for shard in tenants.values_mut() { + shard.intent.clear(&mut scheduler); + } + } +} diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 27c97d3b86..2eb98ee825 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1574,13 +1574,20 @@ pub(crate) mod tests { ) } - fn make_test_tenant( + pub(crate) fn make_test_tenant( policy: PlacementPolicy, shard_count: ShardCount, preferred_az: Option, ) -> Vec { - let tenant_id = TenantId::generate(); + make_test_tenant_with_id(TenantId::generate(), policy, shard_count, preferred_az) + } + pub(crate) fn make_test_tenant_with_id( + tenant_id: TenantId, + policy: PlacementPolicy, + shard_count: ShardCount, + preferred_az: Option, + ) -> Vec { (0..shard_count.count()) .map(|i| { let shard_number = ShardNumber(i); From 4bb9554e4a12005a22135f4a0f5fbd164f55875a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 29 Nov 2024 15:38:04 +0200 Subject: [PATCH 11/65] safekeeper: use jemalloc (#9780) ## Problem To add Safekeeper heap profiling in #9778, we need to switch to an allocator that supports it. Pageserver and proxy already use jemalloc. Touches #9534. ## Summary of changes Use jemalloc in Safekeeper. --- Cargo.lock | 1 + safekeeper/Cargo.toml | 1 + safekeeper/benches/receive_wal.rs | 30 +++++++++++++++++++++++++++++- safekeeper/src/bin/safekeeper.rs | 3 +++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index f05c6311dd..abe69525c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5409,6 +5409,7 @@ dependencies = [ "strum", "strum_macros", "thiserror", + "tikv-jemallocator", "tokio", "tokio-io-timeout", "tokio-postgres", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 635a9222e1..0422c46ab1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -41,6 +41,7 @@ serde_json.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } tokio-io-timeout.workspace = true diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index c637b4fb24..8c4281cf52 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -6,6 +6,7 @@ mod benchutils; use std::io::Write as _; use benchutils::Env; +use bytes::BytesMut; use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; use itertools::Itertools as _; @@ -23,6 +24,9 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + // Register benchmarks with Criterion. criterion_group!( name = benches; @@ -30,7 +34,8 @@ criterion_group!( targets = bench_process_msg, bench_wal_acceptor, bench_wal_acceptor_throughput, - bench_file_write + bench_file_write, + bench_bytes_reserve, ); criterion_main!(benches); @@ -341,3 +346,26 @@ fn bench_file_write(c: &mut Criterion) { Ok(()) } } + +/// Benchmarks the cost of memory allocations when receiving WAL messages. This emulates the logic +/// in FeMessage::parse, which extends the read buffer. It is primarily intended to test jemalloc. +fn bench_bytes_reserve(c: &mut Criterion) { + let mut g = c.benchmark_group("bytes_reserve"); + for size in [1, 64, KB, 8 * KB, 128 * KB] { + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); + } + + fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { + let mut bytes = BytesMut::new(); + let data = vec![0; size]; + + b.iter(|| { + bytes.reserve(size); + bytes.extend_from_slice(&data); + bytes.split_to(size).freeze(); + }); + + Ok(()) + } +} diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 1248428d33..3659bcd7e0 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -48,6 +48,9 @@ use utils::{ tcp_listener, }; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; From 0a550f3e7dcec29cc5b162665ef5ff16daf5eaf5 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 29 Nov 2024 14:55:56 +0100 Subject: [PATCH 12/65] feat(compute_ctl): Always set application_name (#9934) ## Problem It was not always possible to judge what exactly some `cloud_admin` connections were doing because we didn't consistently set `application_name` everywhere. ## Summary of changes Unify the way we connect to Postgres: 1. Switch to building configs everywhere 2. Always set `application_name` and make naming consistent Follow-up for #9919 Part of neondatabase/cloud#20948 --- compute_tools/src/bin/compute_ctl.rs | 10 ++++- compute_tools/src/catalog.rs | 7 +--- compute_tools/src/checker.rs | 3 +- compute_tools/src/compute.rs | 49 ++++++++++++++++------- compute_tools/src/http/api.rs | 11 +++-- compute_tools/src/installed_extensions.rs | 14 +++---- compute_tools/src/monitor.rs | 13 +++--- 7 files changed, 64 insertions(+), 43 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 6b670de2ea..b178d7abd6 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -37,6 +37,7 @@ use std::collections::HashMap; use std::fs::File; use std::path::Path; use std::process::exit; +use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; use std::{thread, time::Duration}; @@ -322,8 +323,15 @@ fn wait_spec( } else { spec_set = false; } + let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?; + let conn_conf = postgres::config::Config::from_str(connstr.as_str()) + .context("cannot build postgres config from connstr")?; + let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) + .context("cannot build tokio postgres config from connstr")?; let compute_node = ComputeNode { - connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, + connstr, + conn_conf, + tokio_conn_conf, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), pgversion: get_pg_version_string(pgbin), diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 08ae8bf44d..72198a9479 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -6,7 +6,6 @@ use tokio::{ process::Command, spawn, }; -use tokio_postgres::connect; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; @@ -16,10 +15,8 @@ use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgr use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { - let connstr = compute.connstr.clone(); - - let (client, connection): (tokio_postgres::Client, _) = - connect(connstr.as_str(), NoTls).await?; + let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles")); + let (client, connection): (tokio_postgres::Client, _) = conf.connect(NoTls).await?; spawn(async move { if let Err(e) = connection.await { diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index cec2b1bed8..62d61a8bc9 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -9,7 +9,8 @@ use crate::compute::ComputeNode; #[instrument(skip_all)] pub async fn check_writability(compute: &ComputeNode) -> Result<()> { // Connect to the database. - let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; + let conf = compute.get_tokio_conn_conf(Some("compute_ctl:availability_checker")); + let (client, connection) = conf.connect(NoTls).await?; if client.is_closed() { return Err(anyhow!("connection to postgres closed")); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1a026a4014..da1caf1a9b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -20,8 +20,9 @@ use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; use nix::unistd::Pid; +use postgres; use postgres::error::SqlState; -use postgres::{Client, NoTls}; +use postgres::NoTls; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -58,6 +59,10 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0); pub struct ComputeNode { // Url type maintains proper escaping pub connstr: url::Url, + // We connect to Postgres from many different places, so build configs once + // and reuse them where needed. + pub conn_conf: postgres::config::Config, + pub tokio_conn_conf: tokio_postgres::config::Config, pub pgdata: String, pub pgbin: String, pub pgversion: String, @@ -800,10 +805,10 @@ impl ComputeNode { /// version. In the future, it may upgrade all 3rd-party extensions. #[instrument(skip_all)] pub fn post_apply_config(&self) -> Result<()> { - let connstr = self.connstr.clone(); + let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config")); thread::spawn(move || { let func = || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; + let mut client = conf.connect(NoTls)?; handle_neon_extension_upgrade(&mut client) .context("handle_neon_extension_upgrade")?; Ok::<_, anyhow::Error>(()) @@ -815,12 +820,27 @@ impl ComputeNode { Ok(()) } + pub fn get_conn_conf(&self, application_name: Option<&str>) -> postgres::Config { + let mut conf = self.conn_conf.clone(); + if let Some(application_name) = application_name { + conf.application_name(application_name); + } + conf + } + + pub fn get_tokio_conn_conf(&self, application_name: Option<&str>) -> tokio_postgres::Config { + let mut conf = self.tokio_conn_conf.clone(); + if let Some(application_name) = application_name { + conf.application_name(application_name); + } + conf + } + async fn get_maintenance_client( conf: &tokio_postgres::Config, ) -> Result { let mut conf = conf.clone(); - - conf.application_name("apply_config"); + conf.application_name("compute_ctl:apply_config"); let (client, conn) = match conf.connect(NoTls).await { // If connection fails, it may be the old node with `zenith_admin` superuser. @@ -837,6 +857,7 @@ impl ComputeNode { e ); let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); + zenith_admin_conf.application_name("compute_ctl:apply_config"); zenith_admin_conf.user("zenith_admin"); let mut client = @@ -1134,8 +1155,7 @@ impl ComputeNode { /// Do initial configuration of the already started Postgres. #[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); + let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config")); let conf = Arc::new(conf); let spec = Arc::new( @@ -1161,7 +1181,7 @@ impl ComputeNode { thread::spawn(move || { let conf = conf.as_ref().clone(); let mut conf = postgres::config::Config::from(conf); - conf.application_name("migrations"); + conf.application_name("compute_ctl:migrations"); let mut client = conf.connect(NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") @@ -1369,9 +1389,9 @@ impl ComputeNode { } self.post_apply_config()?; - let connstr = self.connstr.clone(); + let conf = self.get_conn_conf(None); thread::spawn(move || { - let res = get_installed_extensions(&connstr); + let res = get_installed_extensions(conf); match res { Ok(extensions) => { info!( @@ -1510,7 +1530,8 @@ impl ComputeNode { /// Select `pg_stat_statements` data and return it as a stringified JSON pub async fn collect_insights(&self) -> String { let mut result_rows: Vec = Vec::new(); - let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await; + let conf = self.get_tokio_conn_conf(Some("compute_ctl:collect_insights")); + let connect_result = conf.connect(NoTls).await; let (client, connection) = connect_result.unwrap(); tokio::spawn(async move { if let Err(e) = connection.await { @@ -1636,10 +1657,9 @@ LIMIT 100", privileges: &[Privilege], role_name: &PgIdent, ) -> Result<()> { - use tokio_postgres::config::Config; use tokio_postgres::NoTls; - let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:set_role_grants")); conf.dbname(db_name); let (db_client, conn) = conf @@ -1676,10 +1696,9 @@ LIMIT 100", db_name: &PgIdent, ext_version: ExtVersion, ) -> Result { - use tokio_postgres::config::Config; use tokio_postgres::NoTls; - let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:install_extension")); conf.dbname(db_name); let (db_client, conn) = conf diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index a6c6cff20a..7fa6426d8f 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -295,12 +295,11 @@ async fn routes(req: Request, compute: &Arc) -> Response render_json(Body::from(serde_json::to_string(&res).unwrap())), diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index f473c29a55..5f62f08858 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -10,8 +10,6 @@ use metrics::core::Collector; use metrics::{register_uint_gauge_vec, UIntGaugeVec}; use once_cell::sync::Lazy; -use crate::pg_helpers::postgres_conf_for_db; - /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. /// @@ -41,14 +39,16 @@ fn list_dbs(client: &mut Client) -> Result> { /// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. -pub fn get_installed_extensions(connstr: &url::Url) -> Result { - let mut client = Client::connect(connstr.as_str(), NoTls)?; +pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result { + conf.application_name("compute_ctl:get_installed_extensions"); + let mut client = conf.connect(NoTls)?; + let databases: Vec = list_dbs(&mut client)?; let mut extensions_map: HashMap = HashMap::new(); for db in databases.iter() { - let config = postgres_conf_for_db(connstr, db)?; - let mut db_client = config.connect(NoTls)?; + conf.dbname(db); + let mut db_client = conf.connect(NoTls)?; let extensions: Vec<(String, String)> = db_client .query( "SELECT extname, extversion FROM pg_catalog.pg_extension;", @@ -82,7 +82,7 @@ pub fn get_installed_extensions(connstr: &url::Url) -> Result = None; @@ -57,7 +54,7 @@ fn watch_compute_activity(compute: &ComputeNode) { info!("connection to Postgres is closed, trying to reconnect"); // Connection is closed, reconnect and try again. - client = Client::connect(connstr, NoTls); + client = conf.connect(NoTls); continue; } @@ -196,7 +193,7 @@ fn watch_compute_activity(compute: &ComputeNode) { debug!("could not connect to Postgres: {}, retrying", e); // Establish a new connection and try again. - client = Client::connect(connstr, NoTls); + client = conf.connect(NoTls); } } } From 4f3649461512e5696a64f65972a68f5a115224e7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 29 Nov 2024 15:11:44 +0000 Subject: [PATCH 13/65] pageserver: download small objects using a smaller timeout (#9938) ## Problem It appears that the Azure storage API tends to hang TCP connections more than S3 does. Currently we use a 2 minute timeout for all downloads. This is large because sometimes the objects we download are large. However, waiting 2 minutes when doing something like downloading a manifest on tenant attach is problematic, because when someone is doing a "create tenant, create timeline" workflow, that 2 minutes is long enough for them reasonably to give up creating that timeline. Rather than propagate oversized timeouts further up the stack, we should use a different timeout for objects that we expect to be small. Closes: https://github.com/neondatabase/neon/issues/9836 ## Summary of changes - Add a `small_timeout` configuration attribute to remote storage, defaulting to 30 seconds (still a very generous period to do something like download an index) - Add a DownloadKind parameter to DownloadOpts, so that callers can indicate whether they expect the object to be small or large. - In the azure client, use small timeout for HEAD requests, and for GET requests if DownloadKind::Small is used. - Use DownloadKind::Small for manifests, indices, and heatmap downloads. This PR intentionally does not make the equivalent change to the S3 client, to reduce blast radius in case this has unexpected consequences (we could accomplish the same thing by editing lots of configs, but just skipping the code is simpler for right now) --- libs/remote_storage/src/azure_blob.rs | 23 ++++++++++++++--- libs/remote_storage/src/config.rs | 25 ++++++++++++++++--- libs/remote_storage/src/lib.rs | 20 ++++++++++++++- libs/remote_storage/tests/test_real_azure.rs | 3 ++- libs/remote_storage/tests/test_real_s3.rs | 1 + pageserver/src/deletion_queue.rs | 1 + pageserver/src/tenant.rs | 1 + .../tenant/remote_timeline_client/download.rs | 21 +++++++++++++--- pageserver/src/tenant/secondary/downloader.rs | 3 ++- .../import_pgdata/importbucket_client.rs | 4 ++- proxy/src/context/parquet.rs | 2 ++ 11 files changed, 89 insertions(+), 15 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 840917ef68..8d1962fa29 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -35,6 +35,7 @@ use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; +use crate::DownloadKind; use crate::{ config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, @@ -49,10 +50,17 @@ pub struct AzureBlobStorage { concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. pub timeout: Duration, + + // Alternative timeout used for metadata objects which are expected to be small + pub small_timeout: Duration, } impl AzureBlobStorage { - pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result { + pub fn new( + azure_config: &AzureConfig, + timeout: Duration, + small_timeout: Duration, + ) -> Result { debug!( "Creating azure remote storage for azure container {}", azure_config.container_name @@ -94,6 +102,7 @@ impl AzureBlobStorage { max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), timeout, + small_timeout, }) } @@ -133,6 +142,7 @@ impl AzureBlobStorage { async fn download_for_builder( &self, builder: GetBlobBuilder, + timeout: Duration, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::Get; @@ -156,7 +166,7 @@ impl AzureBlobStorage { .map_err(to_download_error); // apply per request timeout - let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = tokio_stream::StreamExt::timeout(response, timeout); // flatten let response = response.map(|res| match res { @@ -415,7 +425,7 @@ impl RemoteStorage for AzureBlobStorage { let blob_client = self.client.blob_client(self.relative_path_to_name(key)); let properties_future = blob_client.get_properties().into_future(); - let properties_future = tokio::time::timeout(self.timeout, properties_future); + let properties_future = tokio::time::timeout(self.small_timeout, properties_future); let res = tokio::select! { res = properties_future => res, @@ -521,7 +531,12 @@ impl RemoteStorage for AzureBlobStorage { }); } - self.download_for_builder(builder, cancel).await + let timeout = match opts.kind { + DownloadKind::Small => self.small_timeout, + DownloadKind::Large => self.timeout, + }; + + self.download_for_builder(builder, timeout, cancel).await } async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index e99ae4f747..f6ef31077c 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -24,6 +24,13 @@ pub struct RemoteStorageConfig { skip_serializing_if = "is_default_timeout" )] pub timeout: Duration, + /// Alternative timeout used for metadata objects which are expected to be small + #[serde( + with = "humantime_serde", + default = "default_small_timeout", + skip_serializing_if = "is_default_small_timeout" + )] + pub small_timeout: Duration, } impl RemoteStorageKind { @@ -40,10 +47,18 @@ fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } +fn default_small_timeout() -> Duration { + RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT +} + fn is_default_timeout(d: &Duration) -> bool { *d == RemoteStorageConfig::DEFAULT_TIMEOUT } +fn is_default_small_timeout(d: &Duration) -> bool { + *d == RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT +} + /// A kind of a remote storage to connect to, with its connection configuration. #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] #[serde(untagged)] @@ -184,6 +199,7 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + pub const DEFAULT_SMALL_TIMEOUT: Duration = std::time::Duration::from_secs(30); pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { Ok(utils::toml_edit_ext::deserialize_item(toml)?) @@ -219,7 +235,8 @@ timeout = '5s'"; storage: RemoteStorageKind::LocalFs { local_path: Utf8PathBuf::from(".") }, - timeout: Duration::from_secs(5) + timeout: Duration::from_secs(5), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } @@ -247,7 +264,8 @@ timeout = '5s'"; max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, upload_storage_class: Some(StorageClass::IntelligentTiering), }), - timeout: Duration::from_secs(7) + timeout: Duration::from_secs(7), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } @@ -299,7 +317,8 @@ timeout = '5s'"; concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, }), - timeout: Duration::from_secs(7) + timeout: Duration::from_secs(7), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 719608dd5f..0ece29d99e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -178,6 +178,15 @@ pub struct DownloadOpts { /// The end of the byte range to download, or unbounded. Must be after the /// start bound. pub byte_end: Bound, + /// Indicate whether we're downloading something small or large: this indirectly controls + /// timeouts: for something like an index/manifest/heatmap, we should time out faster than + /// for layer files + pub kind: DownloadKind, +} + +pub enum DownloadKind { + Large, + Small, } impl Default for DownloadOpts { @@ -186,6 +195,7 @@ impl Default for DownloadOpts { etag: Default::default(), byte_start: Bound::Unbounded, byte_end: Bound::Unbounded, + kind: DownloadKind::Large, } } } @@ -584,6 +594,10 @@ impl GenericRemoteStorage> { impl GenericRemoteStorage { pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; + + // If somkeone overrides timeout to be small without adjusting small_timeout, then adjust it automatically + let small_timeout = std::cmp::min(storage_config.small_timeout, timeout); + Ok(match &storage_config.storage { RemoteStorageKind::LocalFs { local_path: path } => { info!("Using fs root '{path}' as a remote storage"); @@ -606,7 +620,11 @@ impl GenericRemoteStorage { .unwrap_or(""); info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); - Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?)) + Self::AzureBlob(Arc::new(AzureBlobStorage::new( + azure_config, + timeout, + small_timeout, + )?)) } }) } diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 3a20649490..92d579fec8 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -219,7 +219,8 @@ async fn create_azure_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), - timeout: Duration::from_secs(120), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 3e99a65fac..e60ec18c93 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -396,6 +396,7 @@ async fn create_s3_client( upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e74c8ecf5a..1d508f5fe9 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -838,6 +838,7 @@ mod test { local_path: remote_fs_dir.clone(), }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; let storage = GenericRemoteStorage::from_config(&storage_config) .await diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 339a3ca1bb..cd0690bb1a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5423,6 +5423,7 @@ pub(crate) mod harness { local_path: remote_fs_dir.clone(), }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap(); let deletion_queue = MockDeletionQueue::new(Some(remote_storage.clone())); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d632e595ad..739615be9c 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -30,7 +30,9 @@ use crate::tenant::Generation; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath}; +use remote_storage::{ + DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, +}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use utils::pausable_failpoint; @@ -345,12 +347,13 @@ pub async fn list_remote_timelines( async fn do_download_remote_path_retry_forever( storage: &GenericRemoteStorage, remote_path: &RemotePath, + download_opts: DownloadOpts, cancel: &CancellationToken, ) -> Result<(Vec, SystemTime), DownloadError> { download_retry_forever( || async { let download = storage - .download(remote_path, &DownloadOpts::default(), cancel) + .download(remote_path, &download_opts, cancel) .await?; let mut bytes = Vec::new(); @@ -377,8 +380,13 @@ async fn do_download_tenant_manifest( ) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> { let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); + let download_opts = DownloadOpts { + kind: DownloadKind::Small, + ..Default::default() + }; + let (manifest_bytes, manifest_bytes_mtime) = - do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; + do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?; let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes) .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}")) @@ -398,8 +406,13 @@ async fn do_download_index_part( timeline_id.expect("A timeline ID is always provided when downloading an index"); let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); + let download_opts = DownloadOpts { + kind: DownloadKind::Small, + ..Default::default() + }; + let (index_part_bytes, index_part_mtime) = - do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; + do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| format!("deserialize index part file at {remote_path:?}")) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 7443261a9c..8d771dc405 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -49,7 +49,7 @@ use futures::Future; use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage}; +use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -946,6 +946,7 @@ impl<'a> TenantDownloader<'a> { let cancel = &self.secondary_state.cancel; let opts = DownloadOpts { etag: prev_etag.cloned(), + kind: DownloadKind::Small, ..Default::default() }; diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index 8d5ab1780f..bc4d148a29 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -4,7 +4,8 @@ use anyhow::Context; use bytes::Bytes; use postgres_ffi::ControlFileData; use remote_storage::{ - Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, + Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing, + ListingObject, RemotePath, }; use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; @@ -239,6 +240,7 @@ impl RemoteStorageWrapper { .download( path, &DownloadOpts { + kind: DownloadKind::Large, etag: None, byte_start: Bound::Included(start_inclusive), byte_end: Bound::Excluded(end_exclusive) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e328c6de79..b375eb886e 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -486,6 +486,7 @@ mod tests { upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }) ); assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); @@ -545,6 +546,7 @@ mod tests { local_path: tmpdir.to_path_buf(), }, timeout: std::time::Duration::from_secs(120), + small_timeout: std::time::Duration::from_secs(30), }; let storage = GenericRemoteStorage::from_config(&remote_storage_config) .await From 93f29a0065ff820b2fa16f97a824d358655282f3 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Fri, 29 Nov 2024 17:58:36 +0000 Subject: [PATCH 14/65] Fixed fast_import pgbin in calling get_pg_version (#9933) Was working on https://github.com/neondatabase/cloud/pull/20795 and discovered that fast_import is not working normally. --- compute_tools/src/bin/fast_import.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 6716cc6234..b6db3eb11a 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -21,7 +21,7 @@ //! - Build the image with the following command: //! //! ```bash -//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com +//! docker buildx build --platform linux/amd64 --build-arg DEBIAN_VERSION=bullseye --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/compute-node.Dockerfile . //! docker push localhost:3030/localregistry/compute-node-v14:latest //! ``` @@ -132,7 +132,8 @@ pub(crate) async fn main() -> anyhow::Result<()> { // // Initialize pgdata // - let pg_version = match get_pg_version(pg_bin_dir.as_str()) { + let pgbin = pg_bin_dir.join("postgres"); + let pg_version = match get_pg_version(pgbin.as_ref()) { PostgresMajorVersion::V14 => 14, PostgresMajorVersion::V15 => 15, PostgresMajorVersion::V16 => 16, @@ -155,7 +156,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { // // Launch postgres process // - let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres")) + let mut postgres_proc = tokio::process::Command::new(pgbin) .arg("-D") .arg(&pgdata_dir) .args(["-c", "wal_level=minimal"]) From 547b2d28278ccb69bb6d782e771790151bc62651 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 29 Nov 2024 20:10:26 +0100 Subject: [PATCH 15/65] Fix timeout value used in XLogWaitForReplayOf (#9937) The previous value assumed usec precision, while the timeout used is in milliseconds, causing replica backends to wait for (potentially) many hours for WAL replay without the expected progress reports in logs. This fixes the issue. Reported-By: Alexander Lakhin ## Problem https://github.com/neondatabase/postgres/pull/279#issuecomment-2507671817 The timeout value was configured with the assumption the indicated value would be microseconds, where it's actually milliseconds. That causes the backend to wait for much longer (2h46m40s) before it emits the "I'm waiting for recovery" message. While we do have wait events configured on this, it's not great to have stuck backends without clear logs, so this fixes the timeout value in all our PostgreSQL branches. ## PG PRs * PG14: https://github.com/neondatabase/postgres/pull/542 * PG15: https://github.com/neondatabase/postgres/pull/543 * PG16: https://github.com/neondatabase/postgres/pull/544 * PG17: https://github.com/neondatabase/postgres/pull/545 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 284ae56be2..c1989c934d 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5 +Subproject commit c1989c934d46e04e78b3c496c8a34bcd40ddceeb diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index aed79ee87b..d929b9a8b9 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05 +Subproject commit d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index f5cfc6fa89..13e9e35394 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff +Subproject commit 13e9e3539419003e79bd9aa29e1bc44f3fd555dd diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 3c15b6565f..faebe5e5af 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f +Subproject commit faebe5e5aff5687908504453623778f8515529db diff --git a/vendor/revisions.json b/vendor/revisions.json index 4dae88e73d..abeddcadf7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f" + "faebe5e5aff5687908504453623778f8515529db" ], "v16": [ "16.6", - "f5cfc6fa898544050e821ac688adafece1ac3cff" + "13e9e3539419003e79bd9aa29e1bc44f3fd555dd" ], "v15": [ "15.10", - "aed79ee87b94779cc52ec13e3b74eba6ada93f05" + "d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6" ], "v14": [ "14.15", - "284ae56be2397fd3eaf20777fa220b2d0ad968f5" + "c1989c934d46e04e78b3c496c8a34bcd40ddceeb" ] } From 5dad89acd4f7cc8231e939cc62cb6b5e048ea67e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 30 Nov 2024 01:16:24 +0100 Subject: [PATCH 16/65] page_service: rewrite batching to work without a timeout (#9851) # Problem The timeout-based batching adds latency to unbatchable workloads. We can choose a short batching timeout (e.g. 10us) but that requires high-resolution timers, which tokio doesn't have. I thoroughly explored options to use OS timers (see [this](https://github.com/neondatabase/neon/pull/9822) abandoned PR). In short, it's not an attractive option because any timer implementation adds non-trivial overheads. # Solution The insight is that, in the steady state of a batchable workload, the time we spend in `get_vectored` will be hundreds of microseconds anyway. If we prepare the next batch concurrently to `get_vectored`, we will have a sizeable batch ready once `get_vectored` of the current batch is done and do not need an explicit timeout. This can be reasonably described as **pipelining of the protocol handler**. # Implementation We model the sub-protocol handler for pagestream requests (`handle_pagrequests`) as two futures that form a pipeline: 2. Batching: read requests from the connection and fill the current batch 3. Execution: `take` the current batch, execute it using `get_vectored`, and send the response. The Reading and Batching stage are connected through a new type of channel called `spsc_fold`. See the long comment in the `handle_pagerequests_pipelined` for details. # Changes - Refactor `handle_pagerequests` - separate functions for - reading one protocol message; produces a `BatchedFeMessage` with just one page request in it - batching; tried to merge an incoming `BatchedFeMessage` into an existing `BatchedFeMessage`; returns `None` on success and returns back the incoming message in case merging isn't possible - execution of a batched message - unify the timeline handle acquisition & request span construction; it now happen in the function that reads the protocol message - Implement serial and pipelined model - serial: what we had before any of the batching changes - read one protocol message - execute protocol messages - pipelined: the design described above - optionality for execution of the pipeline: either via concurrent futures vs tokio tasks - Pageserver config - remove batching timeout field - add ability to configure pipelining mode - add ability to limit max batch size for pipelined configurations (required for the rollout, cf https://github.com/neondatabase/cloud/issues/20620 ) - ability to configure execution mode - Tests - remove `batch_timeout` parametrization - rename `test_getpage_merge_smoke` to `test_throughput` - add parametrization to test different max batch sizes and execution moes - rename `test_timer_precision` to `test_latency` - rename the test case file to `test_page_service_batching.py` - better descriptions of what the tests actually do ## On the holding The `TimelineHandle` in the pending batch While batching, we hold the `TimelineHandle` in the pending batch. Therefore, the timeline will not finish shutting down while we're batching. This is not a problem in practice because the concurrently ongoing `get_vectored` call will fail quickly with an error indicating that the timeline is shutting down. This results in the Execution stage returning a `QueryError::Shutdown`, which causes the pipeline / entire page service connection to shut down. This drops all references to the `Arc>>>` object, thereby dropping the contained `TimelineHandle`s. - => fixes https://github.com/neondatabase/neon/issues/9850 # Performance Local run of the benchmarks, results in [this empty commit](https://github.com/neondatabase/neon/pull/9851/commits/1cf5b1463f69ba5066cbb0713912aec7bb5579ad) in the PR branch. Key take-aways: * `concurrent-futures` and `tasks` deliver identical `batching_factor` * tail latency impact unknown, cf https://github.com/neondatabase/neon/issues/9837 * `concurrent-futures` has higher throughput than `tasks` in all workloads (=lower `time` metric) * In unbatchable workloads, `concurrent-futures` has 5% higher `CPU-per-throughput` than that of `tasks`, and 15% higher than that of `serial`. * In batchable-32 workload, `concurrent-futures` has 8% lower `CPU-per-throughput` than that of `tasks` (comparison to tput of `serial` is irrelevant) * in unbatchable workloads, mean and tail latencies of `concurrent-futures` is practically identical to `serial`, whereas `tasks` adds 20-30us of overhead Overall, `concurrent-futures` seems like a slightly more attractive choice. # Rollout This change is disabled-by-default. Rollout plan: - https://github.com/neondatabase/cloud/issues/20620 # Refs - epic: https://github.com/neondatabase/neon/issues/9376 - this sub-task: https://github.com/neondatabase/neon/issues/9377 - the abandoned attempt to improve batching timeout resolution: https://github.com/neondatabase/neon/pull/9820 - closes https://github.com/neondatabase/neon/issues/9850 - fixes https://github.com/neondatabase/neon/issues/9835 --- Cargo.lock | 10 +- Cargo.toml | 1 + libs/pageserver_api/src/config.rs | 30 +- libs/utils/Cargo.toml | 2 + libs/utils/src/sync.rs | 2 + libs/utils/src/sync/spsc_fold.rs | 452 +++++++ pageserver/src/config.rs | 10 +- pageserver/src/lib.rs | 19 + pageserver/src/page_service.rs | 1059 ++++++++++------- test_runner/fixtures/neon_fixtures.py | 3 +- ...merge.py => test_page_service_batching.py} | 131 +- 11 files changed, 1262 insertions(+), 457 deletions(-) create mode 100644 libs/utils/src/sync/spsc_fold.rs rename test_runner/performance/pageserver/{test_pageserver_getpage_merge.py => test_page_service_batching.py} (69%) diff --git a/Cargo.lock b/Cargo.lock index abe69525c9..313222cf3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "RustyXML" @@ -1717,6 +1717,12 @@ dependencies = [ "utils", ] +[[package]] +name = "diatomic-waker" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab03c107fafeb3ee9f5925686dbb7a73bc76e3932abb0d2b365cb64b169cf04c" + [[package]] name = "diesel" version = "2.2.3" @@ -7045,6 +7051,7 @@ dependencies = [ "chrono", "const_format", "criterion", + "diatomic-waker", "fail", "futures", "git-version", @@ -7063,6 +7070,7 @@ dependencies = [ "rand 0.8.5", "regex", "routerify", + "scopeguard", "sentry", "serde", "serde_assert", diff --git a/Cargo.toml b/Cargo.toml index 742201d0f5..64c384f17a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,6 +83,7 @@ comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" dashmap = { version = "5.5.0", features = ["raw-api"] } +diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 721d97404b..e49d15ba87 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -118,9 +118,8 @@ pub struct ConfigToml { pub virtual_file_io_mode: Option, #[serde(skip_serializing_if = "Option::is_none")] pub no_sync: Option, - #[serde(with = "humantime_serde")] - pub server_side_batch_timeout: Option, pub wal_receiver_protocol: PostgresClientProtocol, + pub page_service_pipelining: PageServicePipeliningConfig, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -137,6 +136,28 @@ pub struct DiskUsageEvictionTaskConfig { pub eviction_order: EvictionOrder, } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case")] +#[serde(deny_unknown_fields)] +pub enum PageServicePipeliningConfig { + Serial, + Pipelined(PageServicePipeliningConfigPipelined), +} +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PageServicePipeliningConfigPipelined { + /// Causes runtime errors if larger than max get_vectored batch size. + pub max_batch_size: NonZeroUsize, + pub execution: PageServiceProtocolPipelinedExecutionStrategy, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum PageServiceProtocolPipelinedExecutionStrategy { + ConcurrentFutures, + Tasks, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -332,8 +353,6 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; - pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None; - pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = utils::postgres_client::PostgresClientProtocol::Vanilla; } @@ -420,11 +439,10 @@ impl Default for ConfigToml { ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, - server_side_batch_timeout: DEFAULT_SERVER_SIDE_BATCH_TIMEOUT - .map(|duration| humantime::parse_duration(duration).unwrap()), tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, + page_service_pipelining: PageServicePipeliningConfig::Serial, } } } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index f440b81d8f..5648072a83 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,6 +19,7 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true +diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true @@ -45,6 +46,7 @@ tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true +scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs index 2ee8f35449..7aa26e24bc 100644 --- a/libs/utils/src/sync.rs +++ b/libs/utils/src/sync.rs @@ -1,3 +1,5 @@ pub mod heavier_once_cell; pub mod gate; + +pub mod spsc_fold; diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs new file mode 100644 index 0000000000..b44f766ef0 --- /dev/null +++ b/libs/utils/src/sync/spsc_fold.rs @@ -0,0 +1,452 @@ +use core::{future::poll_fn, task::Poll}; +use std::sync::{Arc, Mutex}; + +use diatomic_waker::DiatomicWaker; + +pub struct Sender { + state: Arc>, +} + +pub struct Receiver { + state: Arc>, +} + +struct Inner { + wake_receiver: DiatomicWaker, + wake_sender: DiatomicWaker, + value: Mutex>, +} + +enum State { + NoData, + HasData(T), + TryFoldFailed, // transient state + SenderWaitsForReceiverToConsume(T), + SenderGone(Option), + ReceiverGone, + AllGone, + SenderDropping, // transient state + ReceiverDropping, // transient state +} + +pub fn channel() -> (Sender, Receiver) { + let inner = Inner { + wake_receiver: DiatomicWaker::new(), + wake_sender: DiatomicWaker::new(), + value: Mutex::new(State::NoData), + }; + + let state = Arc::new(inner); + ( + Sender { + state: state.clone(), + }, + Receiver { state }, + ) +} + +#[derive(Debug, thiserror::Error)] +pub enum SendError { + #[error("receiver is gone")] + ReceiverGone, +} + +impl Sender { + /// # Panics + /// + /// If `try_fold` panics, any subsequent call to `send` panic. + pub async fn send(&mut self, value: T, try_fold: F) -> Result<(), SendError> + where + F: Fn(&mut T, T) -> Result<(), T>, + { + let mut value = Some(value); + poll_fn(|cx| { + let mut guard = self.state.value.lock().unwrap(); + match &mut *guard { + State::NoData => { + *guard = State::HasData(value.take().unwrap()); + self.state.wake_receiver.notify(); + Poll::Ready(Ok(())) + } + State::HasData(_) => { + let State::HasData(acc_mut) = &mut *guard else { + unreachable!("this match arm guarantees that the guard is HasData"); + }; + match try_fold(acc_mut, value.take().unwrap()) { + Ok(()) => { + // no need to wake receiver, if it was waiting it already + // got a wake-up when we transitioned from NoData to HasData + Poll::Ready(Ok(())) + } + Err(unfoldable_value) => { + value = Some(unfoldable_value); + let State::HasData(acc) = + std::mem::replace(&mut *guard, State::TryFoldFailed) + else { + unreachable!("this match arm guarantees that the guard is HasData"); + }; + *guard = State::SenderWaitsForReceiverToConsume(acc); + // SAFETY: send is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_sender.register(cx.waker()); + } + Poll::Pending + } + } + } + State::SenderWaitsForReceiverToConsume(_data) => { + // Really, we shouldn't be polled until receiver has consumed and wakes us. + Poll::Pending + } + State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)), + State::SenderGone(_) + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping + | State::TryFoldFailed => { + unreachable!(); + } + } + }) + .await + } +} + +impl Drop for Sender { + fn drop(&mut self) { + scopeguard::defer! { + self.state.wake_receiver.notify() + }; + let Ok(mut guard) = self.state.value.lock() else { + return; + }; + *guard = match std::mem::replace(&mut *guard, State::SenderDropping) { + State::NoData => State::SenderGone(None), + State::HasData(data) | State::SenderWaitsForReceiverToConsume(data) => { + State::SenderGone(Some(data)) + } + State::ReceiverGone => State::AllGone, + State::TryFoldFailed + | State::SenderGone(_) + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => { + unreachable!("unreachable state {:?}", guard.discriminant_str()) + } + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum RecvError { + #[error("sender is gone")] + SenderGone, +} + +impl Receiver { + pub async fn recv(&mut self) -> Result { + poll_fn(|cx| { + let mut guard = self.state.value.lock().unwrap(); + match &mut *guard { + State::NoData => { + // SAFETY: recv is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_receiver.register(cx.waker()); + } + Poll::Pending + } + guard @ State::HasData(_) + | guard @ State::SenderWaitsForReceiverToConsume(_) + | guard @ State::SenderGone(Some(_)) => { + let data = guard + .take_data() + .expect("in these states, data is guaranteed to be present"); + self.state.wake_sender.notify(); + Poll::Ready(Ok(data)) + } + State::SenderGone(None) => Poll::Ready(Err(RecvError::SenderGone)), + State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping + | State::TryFoldFailed => { + unreachable!("unreachable state {:?}", guard.discriminant_str()); + } + } + }) + .await + } +} + +impl Drop for Receiver { + fn drop(&mut self) { + scopeguard::defer! { + self.state.wake_sender.notify() + }; + let Ok(mut guard) = self.state.value.lock() else { + return; + }; + *guard = match std::mem::replace(&mut *guard, State::ReceiverDropping) { + State::NoData => State::ReceiverGone, + State::HasData(_) | State::SenderWaitsForReceiverToConsume(_) => State::ReceiverGone, + State::SenderGone(_) => State::AllGone, + State::TryFoldFailed + | State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => { + unreachable!("unreachable state {:?}", guard.discriminant_str()) + } + } + } +} + +impl State { + fn take_data(&mut self) -> Option { + match self { + State::HasData(_) => { + let State::HasData(data) = std::mem::replace(self, State::NoData) else { + unreachable!("this match arm guarantees that the state is HasData"); + }; + Some(data) + } + State::SenderWaitsForReceiverToConsume(_) => { + let State::SenderWaitsForReceiverToConsume(data) = + std::mem::replace(self, State::NoData) + else { + unreachable!( + "this match arm guarantees that the state is SenderWaitsForReceiverToConsume" + ); + }; + Some(data) + } + State::SenderGone(data) => Some(data.take().unwrap()), + State::NoData + | State::TryFoldFailed + | State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => None, + } + } + fn discriminant_str(&self) -> &'static str { + match self { + State::NoData => "NoData", + State::HasData(_) => "HasData", + State::TryFoldFailed => "TryFoldFailed", + State::SenderWaitsForReceiverToConsume(_) => "SenderWaitsForReceiverToConsume", + State::SenderGone(_) => "SenderGone", + State::ReceiverGone => "ReceiverGone", + State::AllGone => "AllGone", + State::SenderDropping => "SenderDropping", + State::ReceiverDropping => "ReceiverDropping", + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); + + #[tokio::test] + async fn test_send_recv() { + let (mut sender, mut receiver) = channel(); + + sender + .send(42, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + + let received = receiver.recv().await.unwrap(); + assert_eq!(received, 42); + } + + #[tokio::test] + async fn test_send_recv_with_fold() { + let (mut sender, mut receiver) = channel(); + + sender + .send(1, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + sender + .send(2, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + + let received = receiver.recv().await.unwrap(); + assert_eq!(received, 3); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_waits_for_receiver_if_try_fold_fails() { + let (mut sender, mut receiver) = channel(); + + sender.send(23, |_, _| panic!("first send")).await.unwrap(); + + let send_fut = sender.send(42, |_, val| Err(val)); + let mut send_fut = std::pin::pin!(send_fut); + + tokio::select! { + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut send_fut => { + panic!("send should not complete"); + }, + } + + let val = receiver.recv().await.unwrap(); + assert_eq!(val, 23); + + tokio::select! { + _ = tokio::time::sleep(FOREVER) => { + panic!("receiver should have consumed the value"); + }, + _ = &mut send_fut => { }, + } + + let val = receiver.recv().await.unwrap(); + assert_eq!(val, 42); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_errors_if_waits_for_receiver_and_receiver_drops() { + let (mut sender, receiver) = channel(); + + sender.send(23, |_, _| unreachable!()).await.unwrap(); + + let send_fut = sender.send(42, |_, val| Err(val)); + let send_fut = std::pin::pin!(send_fut); + + drop(receiver); + + let result = send_fut.await; + assert!(matches!(result, Err(SendError::ReceiverGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_errors_if_waits_for_sender_and_sender_drops() { + let (sender, mut receiver) = channel::<()>(); + + let recv_fut = receiver.recv(); + let recv_fut = std::pin::pin!(recv_fut); + + drop(sender); + + let result = recv_fut.await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_errors_if_waits_for_sender_and_sender_drops_with_data() { + let (mut sender, mut receiver) = channel(); + + sender.send(42, |_, _| unreachable!()).await.unwrap(); + + { + let recv_fut = receiver.recv(); + let recv_fut = std::pin::pin!(recv_fut); + + drop(sender); + + let val = recv_fut.await.unwrap(); + assert_eq!(val, 42); + } + + let result = receiver.recv().await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_waits_for_sender_if_no_data() { + let (mut sender, mut receiver) = channel(); + + let recv_fut = receiver.recv(); + let mut recv_fut = std::pin::pin!(recv_fut); + + tokio::select! { + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut recv_fut => { + panic!("recv should not complete"); + }, + } + + sender.send(42, |_, _| Ok(())).await.unwrap(); + + let val = recv_fut.await.unwrap(); + assert_eq!(val, 42); + } + + #[tokio::test] + async fn test_receiver_gone_while_nodata() { + let (mut sender, receiver) = channel(); + drop(receiver); + + let result = sender.send(42, |_, _| Ok(())).await; + assert!(matches!(result, Err(SendError::ReceiverGone))); + } + + #[tokio::test] + async fn test_sender_gone_while_nodata() { + let (sender, mut receiver) = super::channel::(); + drop(sender); + + let result = receiver.recv().await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_drops_after_sender_went_to_sleep() { + let (mut sender, receiver) = channel(); + let state = receiver.state.clone(); + + sender.send(23, |_, _| unreachable!()).await.unwrap(); + + let send_task = tokio::spawn(async move { sender.send(42, |_, v| Err(v)).await }); + + tokio::time::sleep(FOREVER).await; + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::SenderWaitsForReceiverToConsume(_) + )); + + drop(receiver); + + let err = send_task + .await + .unwrap() + .expect_err("should unblock immediately"); + assert!(matches!(err, SendError::ReceiverGone)); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_drops_after_receiver_went_to_sleep() { + let (sender, mut receiver) = channel::(); + let state = sender.state.clone(); + + let recv_task = tokio::spawn(async move { receiver.recv().await }); + + tokio::time::sleep(FOREVER).await; + + assert!(matches!(&*state.value.lock().unwrap(), &State::NoData)); + + drop(sender); + + let err = recv_task.await.unwrap().expect_err("should error"); + assert!(matches!(err, RecvError::SenderGone)); + } +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 2cf237e72b..1651db8500 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -188,11 +188,9 @@ pub struct PageServerConf { /// Optionally disable disk syncs (unsafe!) pub no_sync: bool, - /// Maximum amount of time for which a get page request request - /// might be held up for request merging. - pub server_side_batch_timeout: Option, - pub wal_receiver_protocol: PostgresClientProtocol, + + pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, } /// Token for authentication to safekeepers @@ -350,10 +348,10 @@ impl PageServerConf { concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, - server_side_batch_timeout, tenant_config, no_sync, wal_receiver_protocol, + page_service_pipelining, } = config_toml; let mut conf = PageServerConf { @@ -393,11 +391,11 @@ impl PageServerConf { image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, - server_side_batch_timeout, import_pgdata_upcall_api, import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), import_pgdata_aws_endpoint_url, wal_receiver_protocol, + page_service_pipelining, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ef6711397a..ff6af3566c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -356,6 +356,25 @@ async fn timed( } } +/// Like [`timed`], but the warning timeout only starts after `cancel` has been cancelled. +async fn timed_after_cancellation( + fut: Fut, + name: &str, + warn_at: std::time::Duration, + cancel: &CancellationToken, +) -> ::Output { + let mut fut = std::pin::pin!(fut); + + tokio::select! { + _ = cancel.cancelled() => { + timed(fut, name, warn_at).await + } + ret = &mut fut => { + ret + } + } +} + #[cfg(test)] mod timed_tests { use super::timed; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 5fd02d8749..1917e7f5b7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,6 +7,10 @@ use bytes::Buf; use futures::FutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; +use pageserver_api::config::{ + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + PageServiceProtocolPipelinedExecutionStrategy, +}; use pageserver_api::models::{self, TenantState}; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -16,12 +20,15 @@ use pageserver_api::models::{ PagestreamProtocolVersion, }; use pageserver_api::shard::TenantShardId; -use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use postgres_backend::{ + is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError, +}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; use std::io; +use std::num::NonZeroUsize; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -32,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::sync::spsc_fold; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -40,7 +48,6 @@ use utils::{ }; use crate::auth::check_permission; -use crate::basebackup; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -58,6 +65,7 @@ use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; +use crate::{basebackup, timed_after_cancellation}; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; @@ -105,7 +113,7 @@ pub fn spawn( pg_auth, tcp_listener, conf.pg_auth_type, - conf.server_side_batch_timeout, + conf.page_service_pipelining.clone(), libpq_ctx, cancel.clone(), ) @@ -154,7 +162,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: tokio::net::TcpListener, auth_type: AuthType, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { @@ -185,7 +193,7 @@ pub async fn libpq_listener_main( local_auth, socket, auth_type, - server_side_batch_timeout, + pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), )); @@ -213,7 +221,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, ) -> ConnectionHandlerResult { @@ -256,7 +264,7 @@ async fn page_service_conn_main( // a while: we will tear down this PageServerHandler and instantiate a new one if/when // they reconnect. socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); - let socket = std::pin::pin!(socket); + let socket = Box::pin(socket); fail::fail_point!("ps::connection-start::pre-login"); @@ -267,7 +275,7 @@ async fn page_service_conn_main( let mut conn_handler = PageServerHandler::new( tenant_manager, auth, - server_side_batch_timeout, + pipelining_config, connection_ctx, cancel.clone(), ); @@ -283,7 +291,7 @@ async fn page_service_conn_main( info!("Postgres client disconnected ({io_error})"); Ok(()) } else { - let tenant_id = conn_handler.timeline_handles.tenant_id(); + let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); Err(io_error).context(format!( "Postgres connection error for tenant_id={:?} client at peer_addr={}", tenant_id, peer_addr @@ -291,7 +299,7 @@ async fn page_service_conn_main( } } other => { - let tenant_id = conn_handler.timeline_handles.tenant_id(); + let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); other.context(format!( "Postgres query error for tenant_id={:?} client peer_addr={}", tenant_id, peer_addr @@ -312,13 +320,10 @@ struct PageServerHandler { cancel: CancellationToken, - timeline_handles: TimelineHandles, + /// None only while pagestream protocol is being processed. + timeline_handles: Option, - /// Messages queued up for the next processing batch - next_batch: Option, - - /// See [`PageServerConf::server_side_batch_timeout`] - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, } struct TimelineHandles { @@ -535,10 +540,12 @@ impl From for QueryError { enum BatchedFeMessage { Exists { span: Span, + shard: timeline::handle::Handle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, + shard: timeline::handle::Handle, req: models::PagestreamNblocksRequest, }, GetPage { @@ -549,10 +556,12 @@ enum BatchedFeMessage { }, DbSize { span: Span, + shard: timeline::handle::Handle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, + shard: timeline::handle::Handle, req: models::PagestreamGetSlruSegmentRequest, }, RespondError { @@ -561,18 +570,11 @@ enum BatchedFeMessage { }, } -enum BatchOrEof { - /// In the common case, this has one entry. - /// At most, it has two entries: the first is the leftover batch, the second is an error. - Batch(smallvec::SmallVec<[BatchedFeMessage; 1]>), - Eof, -} - impl PageServerHandler { pub fn new( tenant_manager: Arc, auth: Option>, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, ) -> Self { @@ -580,10 +582,9 @@ impl PageServerHandler { auth, claims: None, connection_ctx, - timeline_handles: TimelineHandles::new(tenant_manager), + timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, - next_batch: None, - server_side_batch_timeout, + pipelining_config, } } @@ -611,219 +612,356 @@ impl PageServerHandler { ) } - async fn read_batch_from_connection( - &mut self, - pgb: &mut PostgresBackend, - tenant_id: &TenantId, - timeline_id: &TimelineId, + async fn pagestream_read_message( + pgb: &mut PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + timeline_handles: &mut TimelineHandles, + cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result, QueryError> + parent_span: Span, + ) -> Result, QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + let msg = tokio::select! { + biased; + _ = cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + msg = pgb.read_message() => { msg } + }; + + let copy_data_bytes = match msg? { + Some(FeMessage::CopyData(bytes)) => bytes, + Some(FeMessage::Terminate) => { + return Ok(None); + } + Some(m) => { + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message: {m:?} during COPY" + ))); + } + None => { + return Ok(None); + } // client disconnected + }; + trace!("query: {copy_data_bytes:?}"); + + fail::fail_point!("ps::handle-pagerequest-message"); + + // parse request + let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + + let batched_msg = match neon_fe_msg { + PagestreamFeMessage::Exists(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + BatchedFeMessage::Exists { span, shard, req } + } + PagestreamFeMessage::Nblocks(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + BatchedFeMessage::Nblocks { span, shard, req } + } + PagestreamFeMessage::DbSize(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + BatchedFeMessage::DbSize { span, shard, req } + } + PagestreamFeMessage::GetSlruSegment(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + BatchedFeMessage::GetSlruSegment { span, shard, req } + } + PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + request_lsn, + not_modified_since, + rel, + blkno, + }) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn); + + macro_rules! respond_error { + ($error:expr) => {{ + let error = BatchedFeMessage::RespondError { + span, + error: $error, + }; + Ok(Some(error)) + }}; + } + + let key = rel_block_to_key(rel, blkno); + let shard = match timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Page(key)) + .instrument(span.clone()) // sets `shard_id` field + .await + { + Ok(tl) => tl, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return respond_error!(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into() + )); + } + Err(e) => { + return respond_error!(e.into()); + } + }; + let effective_request_lsn = match Self::wait_or_get_last_lsn( + &shard, + request_lsn, + not_modified_since, + &shard.get_latest_gc_cutoff_lsn(), + ctx, + ) + // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait + .await + { + Ok(lsn) => lsn, + Err(e) => { + return respond_error!(e); + } + }; + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages: smallvec::smallvec![(rel, blkno)], + } + } + }; + Ok(Some(batched_msg)) + } + + /// Post-condition: `batch` is Some() + #[instrument(skip_all, level = tracing::Level::TRACE)] + #[allow(clippy::boxed_local)] + fn pagestream_do_batch( + max_batch_size: NonZeroUsize, + batch: &mut Result, + this_msg: Result, + ) -> Result<(), Result> { + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + + let this_msg = match this_msg { + Ok(this_msg) => this_msg, + Err(e) => return Err(Err(e)), + }; + + match (&mut *batch, this_msg) { + // something batched already, let's see if we can add this message to the batch + ( + Ok(BatchedFeMessage::GetPage { + span: _, + shard: accum_shard, + pages: ref mut accum_pages, + effective_request_lsn: accum_lsn, + }), + BatchedFeMessage::GetPage { + span: _, + shard: this_shard, + pages: this_pages, + effective_request_lsn: this_lsn, + }, + ) if (|| { + assert_eq!(this_pages.len(), 1); + if accum_pages.len() >= max_batch_size.get() { + trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_pages.len(), max_batch_size.get()); + return false; + } + if (accum_shard.tenant_shard_id, accum_shard.timeline_id) + != (this_shard.tenant_shard_id, this_shard.timeline_id) + { + trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return false; + } + // the vectored get currently only supports a single LSN, so, bounce as soon + // as the effective request_lsn changes + if *accum_lsn != this_lsn { + trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed"); + return false; + } + true + })() => + { + // ok to batch + accum_pages.extend(this_pages); + Ok(()) + } + // something batched already but this message is unbatchable + (_, this_msg) => { + // by default, don't continue batching + Err(Ok(this_msg)) + } + } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all)] + async fn pagesteam_handle_batched_message( + &mut self, + pgb_writer: &mut PostgresBackend, + batch: BatchedFeMessage, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - let mut batch = self.next_batch.take(); - let mut batch_started_at: Option = None; - - let next_batch: Option = loop { - let sleep_fut = match (self.server_side_batch_timeout, batch_started_at) { - (Some(batch_timeout), Some(started_at)) => futures::future::Either::Left( - tokio::time::sleep_until((started_at + batch_timeout).into()), - ), - _ => futures::future::Either::Right(futures::future::pending()), - }; - - let msg = tokio::select! { - biased; - _ = self.cancel.cancelled() => { - return Err(QueryError::Shutdown) - } - msg = pgb.read_message() => { - msg - } - _ = sleep_fut => { - assert!(batch.is_some()); - break None; - } - }; - let copy_data_bytes = match msg? { - Some(FeMessage::CopyData(bytes)) => bytes, - Some(FeMessage::Terminate) => { - return Ok(Some(BatchOrEof::Eof)); - } - Some(m) => { - return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message: {m:?} during COPY" - ))); - } - None => { - return Ok(Some(BatchOrEof::Eof)); - } // client disconnected - }; - trace!("query: {copy_data_bytes:?}"); - fail::fail_point!("ps::handle-pagerequest-message"); - - // parse request - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; - - let this_msg = match neon_fe_msg { - PagestreamFeMessage::Exists(req) => BatchedFeMessage::Exists { - span: tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::Nblocks(req) => BatchedFeMessage::Nblocks { - span: tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::DbSize(req) => BatchedFeMessage::DbSize { - span: tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::GetSlruSegment(req) => BatchedFeMessage::GetSlruSegment { - span: tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, - rel, - blkno, - }) => { - // shard_id is filled in by the handler - let span = tracing::info_span!( - "handle_get_page_at_lsn_request_batched", - %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, - batch_size = tracing::field::Empty, batch_id = tracing::field::Empty - ); - - macro_rules! current_batch_and_error { - ($error:expr) => {{ - let error = BatchedFeMessage::RespondError { - span, - error: $error, - }; - let batch_and_error = match batch { - Some(b) => smallvec::smallvec![b, error], - None => smallvec::smallvec![error], - }; - Ok(Some(BatchOrEof::Batch(batch_and_error))) - }}; - } - - let key = rel_block_to_key(rel, blkno); - let shard = match self - .timeline_handles - .get(*tenant_id, *timeline_id, ShardSelector::Page(key)) - .instrument(span.clone()) - .await - { - Ok(tl) => tl, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. - return current_batch_and_error!(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into() - )); - } - Err(e) => { - return current_batch_and_error!(e.into()); - } - }; - let effective_request_lsn = match Self::wait_or_get_last_lsn( - &shard, - request_lsn, - not_modified_since, - &shard.get_latest_gc_cutoff_lsn(), - ctx, - ) - // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait - .await - { - Ok(lsn) => lsn, - Err(e) => { - return current_batch_and_error!(e); - } - }; - BatchedFeMessage::GetPage { + // invoke handler function + let (handler_results, span): (Vec>, _) = + match batch { + BatchedFeMessage::Exists { span, shard, req } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![ + self.handle_get_rel_exists_request(&shard, &req, ctx) + .instrument(span.clone()) + .await, + ], span, - shard, - effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno)], - } + ) + } + BatchedFeMessage::Nblocks { span, shard, req } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![ + self.handle_get_nblocks_request(&shard, &req, ctx) + .instrument(span.clone()) + .await, + ], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &shard, + effective_request_lsn, + pages, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { span, shard, req } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![ + self.handle_db_size_request(&shard, &req, ctx) + .instrument(span.clone()) + .await, + ], + span, + ) + } + BatchedFeMessage::GetSlruSegment { span, shard, req } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![ + self.handle_get_slru_segment_request(&shard, &req, ctx) + .instrument(span.clone()) + .await, + ], + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) } }; - let batch_timeout = match self.server_side_batch_timeout { - Some(value) => value, - None => { - // Batching is not enabled - stop on the first message. - return Ok(Some(BatchOrEof::Batch(smallvec::smallvec![this_msg]))); - } + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + for handler_result in handler_results { + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. + span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. + let full = utils::error::report_compact_sources(&e); + span.in_scope(|| { + error!("error reading relation or page version: {full:#}") + }); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + } + }, + Ok(response_msg) => response_msg, }; - // check if we can batch - match (&mut batch, this_msg) { - (None, this_msg) => { - batch = Some(this_msg); - } - ( - Some(BatchedFeMessage::GetPage { - span: _, - shard: accum_shard, - pages: accum_pages, - effective_request_lsn: accum_lsn, - }), - BatchedFeMessage::GetPage { - span: _, - shard: this_shard, - pages: this_pages, - effective_request_lsn: this_lsn, - }, - ) if async { - assert_eq!(this_pages.len(), 1); - if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize { - assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize); - return false; - } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { - // TODO: we _could_ batch & execute each shard seperately (and in parallel). - // But the current logic for keeping responses in order does not support that. - return false; - } - // the vectored get currently only supports a single LSN, so, bounce as soon - // as the effective request_lsn changes - if *accum_lsn != this_lsn { - return false; - } - true - } - .await => - { - // ok to batch - accum_pages.extend(this_pages); - } - (Some(_), this_msg) => { - // by default, don't continue batching - break Some(this_msg); - } + // marshal & transmit response message + pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + } + tokio::select! { + biased; + _ = cancel.cancelled() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) } - - // batching impl piece - let started_at = batch_started_at.get_or_insert_with(Instant::now); - if started_at.elapsed() > batch_timeout { - break None; + res = pgb_writer.flush() => { + res?; } - }; - - self.next_batch = next_batch; - Ok(batch.map(|b| BatchOrEof::Batch(smallvec::smallvec![b]))) + } + Ok(()) } /// Pagestream sub-protocol handler. @@ -845,7 +983,7 @@ impl PageServerHandler { ctx: RequestContext, ) -> Result<(), QueryError> where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); @@ -861,169 +999,283 @@ impl PageServerHandler { } } - // If [`PageServerHandler`] is reused for multiple pagestreams, - // then make sure to not process requests from the previous ones. - self.next_batch = None; + let pgb_reader = pgb + .split() + .context("implementation error: split pgb into reader and writer")?; - loop { - let maybe_batched = self - .read_batch_from_connection(pgb, &tenant_id, &timeline_id, &ctx) - .await?; - let batched = match maybe_batched { - Some(BatchOrEof::Batch(b)) => b, - Some(BatchOrEof::Eof) => { - break; - } + let timeline_handles = self + .timeline_handles + .take() + .expect("implementation error: timeline_handles should not be locked"); + + let request_span = info_span!("request", shard_id = tracing::field::Empty); + let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() { + PageServicePipeliningConfig::Pipelined(pipelining_config) => { + self.handle_pagerequests_pipelined( + pgb, + pgb_reader, + tenant_id, + timeline_id, + timeline_handles, + request_span, + pipelining_config, + &ctx, + ) + .await + } + PageServicePipeliningConfig::Serial => { + self.handle_pagerequests_serial( + pgb, + pgb_reader, + tenant_id, + timeline_id, + timeline_handles, + request_span, + &ctx, + ) + .await + } + }; + + debug!("pagestream subprotocol shut down cleanly"); + + pgb.unsplit(pgb_reader) + .context("implementation error: unsplit pgb")?; + + let replaced = self.timeline_handles.replace(timeline_handles); + assert!(replaced.is_none()); + + result + } + + #[allow(clippy::too_many_arguments)] + async fn handle_pagerequests_serial( + &mut self, + pgb_writer: &mut PostgresBackend, + mut pgb_reader: PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + mut timeline_handles: TimelineHandles, + request_span: Span, + ctx: &RequestContext, + ) -> ( + (PostgresBackendReader, TimelineHandles), + Result<(), QueryError>, + ) + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + let cancel = self.cancel.clone(); + let err = loop { + let msg = Self::pagestream_read_message( + &mut pgb_reader, + tenant_id, + timeline_id, + &mut timeline_handles, + &cancel, + ctx, + request_span.clone(), + ) + .await; + let msg = match msg { + Ok(msg) => msg, + Err(e) => break e, + }; + let msg = match msg { + Some(msg) => msg, None => { - continue; + debug!("pagestream subprotocol end observed"); + return ((pgb_reader, timeline_handles), Ok(())); } }; + let err = self + .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) + .await; + match err { + Ok(()) => {} + Err(e) => break e, + } + }; + ((pgb_reader, timeline_handles), Err(err)) + } - for batch in batched { - // invoke handler function - let (handler_results, span): ( - Vec>, - _, - ) = match batch { - BatchedFeMessage::Exists { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![ - self.handle_get_rel_exists_request( - tenant_id, - timeline_id, - &req, - &ctx, - ) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::Nblocks { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![ - self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetPage { - span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - let res = self - .handle_get_page_at_lsn_request_batched( - &shard, - effective_request_lsn, - pages, - &ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![ - self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetSlruSegment { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![ - self.handle_get_slru_segment_request( - tenant_id, - timeline_id, - &req, - &ctx, - ) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. - (vec![Err(error)], span) - } - }; + /// # Cancel-Safety + /// + /// May leak tokio tasks if not polled to completion. + #[allow(clippy::too_many_arguments)] + async fn handle_pagerequests_pipelined( + &mut self, + pgb_writer: &mut PostgresBackend, + pgb_reader: PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + mut timeline_handles: TimelineHandles, + request_span: Span, + pipelining_config: PageServicePipeliningConfigPipelined, + ctx: &RequestContext, + ) -> ( + (PostgresBackendReader, TimelineHandles), + Result<(), QueryError>, + ) + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + // + // Pipelined pagestream handling consists of + // - a Batcher that reads requests off the wire and + // and batches them if possible, + // - an Executor that processes the batched requests. + // + // The batch is built up inside an `spsc_fold` channel, + // shared betwen Batcher (Sender) and Executor (Receiver). + // + // The Batcher continously folds client requests into the batch, + // while the Executor can at any time take out what's in the batch + // in order to process it. + // This means the next batch builds up while the Executor + // executes the last batch. + // + // CANCELLATION + // + // We run both Batcher and Executor futures to completion before + // returning from this function. + // + // If Executor exits first, it signals cancellation to the Batcher + // via a CancellationToken that is child of `self.cancel`. + // If Batcher exits first, it signals cancellation to the Executor + // by dropping the spsc_fold channel Sender. + // + // CLEAN SHUTDOWN + // + // Clean shutdown means that the client ends the COPYBOTH session. + // In response to such a client message, the Batcher exits. + // The Executor continues to run, draining the spsc_fold channel. + // Once drained, the spsc_fold recv will fail with a distinct error + // indicating that the sender disconnected. + // The Executor exits with Ok(()) in response to that error. + // + // Server initiated shutdown is not clean shutdown, but instead + // is an error Err(QueryError::Shutdown) that is propagated through + // error propagation. + // + // ERROR PROPAGATION + // + // When the Batcher encounter an error, it sends it as a value + // through the spsc_fold channel and exits afterwards. + // When the Executor observes such an error in the channel, + // it exits returning that error value. + // + // This design ensures that the Executor stage will still process + // the batch that was in flight when the Batcher encountered an error, + // thereby beahving identical to a serial implementation. - // Map handler result to protocol behavior. - // Some handler errors cause exit from pagestream protocol. - // Other handler errors are sent back as an error message and we stay in pagestream protocol. - for handler_result in handler_results { - let response_msg = match handler_result { - Err(e) => match &e { - PageStreamError::Shutdown => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - PageStreamError::Reconnect(reason) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - PageStreamError::Read(_) - | PageStreamError::LsnTimeout(_) - | PageStreamError::NotFound(_) - | PageStreamError::BadRequest(_) => { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. - let full = utils::error::report_compact_sources(&e); - span.in_scope(|| { - error!("error reading relation or page version: {full:#}") - }); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - } - }, - Ok(response_msg) => response_msg, - }; + let PageServicePipeliningConfigPipelined { + max_batch_size, + execution, + } = pipelining_config; - // marshal & transmit response message - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + // Macro to _define_ a pipeline stage. + macro_rules! pipeline_stage { + ($name:literal, $cancel:expr, $make_fut:expr) => {{ + let cancel: CancellationToken = $cancel; + let stage_fut = $make_fut(cancel.clone()); + async move { + scopeguard::defer! { + debug!("exiting"); + } + timed_after_cancellation(stage_fut, $name, Duration::from_millis(100), &cancel) + .await } - tokio::select! { - biased; - _ = self.cancel.cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); - return Err(QueryError::Shutdown) - } - res = pgb.flush() => { - res?; - } + .instrument(tracing::info_span!($name)) + }}; + } + + // + // Batcher + // + + let cancel_batcher = self.cancel.child_token(); + let (mut batch_tx, mut batch_rx) = spsc_fold::channel(); + let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| { + let ctx = ctx.attached_child(); + async move { + let mut pgb_reader = pgb_reader; + let mut exit = false; + while !exit { + let read_res = Self::pagestream_read_message( + &mut pgb_reader, + tenant_id, + timeline_id, + &mut timeline_handles, + &cancel_batcher, + &ctx, + request_span.clone(), + ) + .await; + let Some(read_res) = read_res.transpose() else { + debug!("client-initiated shutdown"); + break; + }; + exit |= read_res.is_err(); + let could_send = batch_tx + .send(read_res, |batch, res| { + Self::pagestream_do_batch(max_batch_size, batch, res) + }) + .await; + exit |= could_send.is_err(); + } + (pgb_reader, timeline_handles) + } + }); + + // + // Executor + // + + let executor = pipeline_stage!("executor", self.cancel.clone(), move |cancel| { + let ctx = ctx.attached_child(); + async move { + let _cancel_batcher = cancel_batcher.drop_guard(); + loop { + let maybe_batch = batch_rx.recv().await; + let batch = match maybe_batch { + Ok(batch) => batch, + Err(spsc_fold::RecvError::SenderGone) => { + debug!("upstream gone"); + return Ok(()); + } + }; + let batch = match batch { + Ok(batch) => batch, + Err(e) => { + return Err(e); + } + }; + self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) + .await?; } } + }); + + // + // Execute the stages. + // + + match execution { + PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => { + tokio::join!(batcher, executor) + } + PageServiceProtocolPipelinedExecutionStrategy::Tasks => { + // These tasks are not tracked anywhere. + let read_messages_task = tokio::spawn(batcher); + let (read_messages_task_res, executor_res_) = + tokio::join!(read_messages_task, executor,); + ( + read_messages_task_res.expect("propagated panic from read_messages"), + executor_res_, + ) + } } - Ok(()) } /// Helper function to handle the LSN from client request. @@ -1131,6 +1383,8 @@ impl PageServerHandler { { let timeline = self .timeline_handles + .as_mut() + .unwrap() .get( tenant_shard_id.tenant_id, timeline_id, @@ -1165,22 +1419,17 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1200,23 +1449,17 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1236,23 +1479,17 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_db_size_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1300,23 +1537,17 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_get_slru_segment_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1374,6 +1605,8 @@ impl PageServerHandler { let timeline = self .timeline_handles + .as_mut() + .unwrap() .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; @@ -1716,7 +1949,7 @@ impl PageServiceCmd { impl postgres_backend::Handler for PageServerHandler where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { fn check_auth_jwt( &mut self, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e3c88e9965..9bcfffeb9c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3804,9 +3804,10 @@ class Endpoint(PgProtocol, LogUtils): # shared_buffers = 512kB to make postgres use LFC intensively # neon.max_file_cache_size and neon.file_cache size limit are # set to 1MB because small LFC is better for testing (helps to find more problems) + lfc_path_escaped = str(lfc_path).replace("'", "''") config_lines = [ "shared_buffers = 512kB", - f"neon.file_cache_path = '{self.lfc_path()}'", + f"neon.file_cache_path = '{lfc_path_escaped}'", "neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB", ] + config_lines diff --git a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py b/test_runner/performance/pageserver/test_page_service_batching.py similarity index 69% rename from test_runner/performance/pageserver/test_pageserver_getpage_merge.py rename to test_runner/performance/pageserver/test_page_service_batching.py index 34cce9900b..c47a849fec 100644 --- a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -11,36 +11,95 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn from fixtures.utils import humantime_to_ms -TARGET_RUNTIME = 60 +TARGET_RUNTIME = 30 + + +@dataclass +class PageServicePipeliningConfig: + pass + + +@dataclass +class PageServicePipeliningConfigSerial(PageServicePipeliningConfig): + mode: str = "serial" + + +@dataclass +class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): + max_batch_size: int + execution: str + mode: str = "pipelined" + + +EXECUTION = ["concurrent-futures", "tasks"] + +NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 32]: + for execution in EXECUTION: + NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + +BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 2, 4, 8, 16, 32]: + for execution in EXECUTION: + BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) -@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") @pytest.mark.parametrize( - "tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", [ - # the next 4 cases demonstrate how not-batchable workloads suffer from batching timeout - (50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"), - (50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"), - (50, "1ms", TARGET_RUNTIME, 1, 128, "not batchable 1ms timeout"), - # the next 4 cases demonstrate how batchable workloads benefit from batching - (50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"), - (50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"), - (50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"), - (50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"), + # non-batchable workloads + # (A separate benchmark will consider latency). + *[ + ( + 50, + config, + TARGET_RUNTIME, + 1, + 128, + f"not batchable {dataclasses.asdict(config)}", + ) + for config in NON_BATCHABLE + ], + # batchable workloads should show throughput and CPU efficiency improvements + *[ + ( + 50, + config, + TARGET_RUNTIME, + 100, + 128, + f"batchable {dataclasses.asdict(config)}", + ) + for config in BATCHABLE + ], ], ) -def test_getpage_merge_smoke( +def test_throughput( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, tablesize_mib: int, - batch_timeout: str | None, + pipelining_config: PageServicePipeliningConfig, target_runtime: int, effective_io_concurrency: int, readhead_buffer_size: int, name: str, ): """ - Do a bunch of sequential scans and ensure that the pageserver does some merging. + Do a bunch of sequential scans with varying compute and pipelining configurations. + Primary performance metrics are the achieved batching factor and throughput (wall clock time). + Resource utilization is also interesting - we currently measure CPU time. + + The test is a fixed-runtime based type of test (target_runtime). + Hence, the results are normalized to the number of iterations completed within target runtime. + + If the compute doesn't provide pipeline depth (effective_io_concurrency=1), + performance should be about identical in all configurations. + Pipelining can still yield improvements in these scenarios because it parses the + next request while the current one is still being executed. + + If the compute provides pipeline depth (effective_io_concurrency=100), then + pipelining configs, especially with max_batch_size>1 should yield dramatic improvements + in all performance metrics. """ # @@ -51,14 +110,16 @@ def test_getpage_merge_smoke( params.update( { "tablesize_mib": (tablesize_mib, {"unit": "MiB"}), - "batch_timeout": ( - -1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout), - {"unit": "us"}, - ), # target_runtime is just a polite ask to the workload to run for this long "effective_io_concurrency": (effective_io_concurrency, {}), "readhead_buffer_size": (readhead_buffer_size, {}), - # name is not a metric + # name is not a metric, we just use it to identify the test easily in the `test_...[...]`` notation + } + ) + params.update( + { + f"pipelining_config.{k}": (v, {}) + for k, v in dataclasses.asdict(pipelining_config).items() } ) @@ -170,7 +231,9 @@ def test_getpage_merge_smoke( after = get_metrics() return (after - before).normalize(iters - 1) - env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout}) + env.pageserver.patch_config_toml_nonrecursive( + {"page_service_pipelining": dataclasses.asdict(pipelining_config)} + ) env.pageserver.restart() metrics = workload() @@ -199,23 +262,30 @@ def test_getpage_merge_smoke( ) -@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") +PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 32]: + for execution in EXECUTION: + PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + + @pytest.mark.parametrize( - "batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"] + "pipelining_config,name", + [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS], ) -def test_timer_precision( +def test_latency( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, - batch_timeout: str | None, + pipelining_config: PageServicePipeliningConfig, + name: str, ): """ - Determine the batching timeout precision (mean latency) and tail latency impact. + Measure the latency impact of pipelining in an un-batchable workloads. - The baseline is `None`; an ideal batching timeout implementation would increase - the mean latency by exactly `batch_timeout`. + An ideal implementation should not increase average or tail latencies for such workloads. - That is not the case with the current implementation, will be addressed in future changes. + We don't have support in pagebench to create queue depth yet. + => https://github.com/neondatabase/neon/issues/9837 """ # @@ -223,7 +293,8 @@ def test_timer_precision( # def patch_ps_config(ps_config): - ps_config["server_side_batch_timeout"] = batch_timeout + if pipelining_config is not None: + ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) neon_env_builder.pageserver_config_override = patch_ps_config From 8b7e9ed820e5ae4a4da6cd2f88ed89ba47739e4f Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Sat, 30 Nov 2024 11:11:37 +0100 Subject: [PATCH 17/65] Merge the consumption metric pushes (#9939) #8564 ## Problem The main and backup consumption metric pushes are completely independent, resulting in different event time windows and different idempotency keys. ## Summary of changes * Merge the push tasks, but keep chunks the same size. --- Cargo.lock | 1 + libs/consumption_metrics/src/lib.rs | 5 +- proxy/Cargo.toml | 1 + proxy/src/bin/proxy.rs | 4 - proxy/src/usage_metrics.rs | 351 ++++++++++++++-------------- 5 files changed, 181 insertions(+), 181 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 313222cf3c..5ce27a7d45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4501,6 +4501,7 @@ dependencies = [ "ecdsa 0.16.9", "env_logger", "fallible-iterator", + "flate2", "framed-websockets", "futures", "hashbrown 0.14.5", diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index fbe2e6830f..448134f31a 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -103,11 +103,12 @@ impl<'a> IdempotencyKey<'a> { } } +/// Split into chunks of 1000 metrics to avoid exceeding the max request size. pub const CHUNK_SIZE: usize = 1000; // Just a wrapper around a slice of events // to serialize it as `{"events" : [ ] } -#[derive(serde::Serialize, Deserialize)] -pub struct EventChunk<'a, T: Clone> { +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +pub struct EventChunk<'a, T: Clone + PartialEq> { pub events: std::borrow::Cow<'a, [T]>, } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0d774d529d..f5934c8a89 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -112,6 +112,7 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true fallible-iterator.workspace = true +flate2.workspace = true tokio-tungstenite.workspace = true pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index a935378162..b772a987ee 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -517,10 +517,6 @@ async fn main() -> anyhow::Result<()> { if let Some(metrics_config) = &config.metric_collection { // TODO: Add gc regardles of the metric collection being enabled. maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); - client_tasks.spawn(usage_metrics::task_backup( - &metrics_config.backup_metric_collection_config, - cancellation_token.clone(), - )); } if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index c5e8588623..65e74466f2 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,19 +1,18 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. +use std::borrow::Cow; use std::convert::Infallible; -use std::pin::pin; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Duration; -use anyhow::Context; +use anyhow::{bail, Context}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::mapref::entry::Entry; use dashmap::DashMap; -use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; @@ -23,7 +22,7 @@ use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; -use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::config::MetricCollectionConfig; use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; use crate::http; use crate::intern::{BranchIdInt, EndpointIdInt}; @@ -58,55 +57,21 @@ trait MetricCounterReporter { fn move_metrics(&self) -> (u64, usize); } -#[derive(Debug)] -struct MetricBackupCounter { - transmitted: AtomicU64, - opened_connections: AtomicUsize, -} - -impl MetricCounterRecorder for MetricBackupCounter { - fn record_egress(&self, bytes: u64) { - self.transmitted.fetch_add(bytes, Ordering::AcqRel); - } - - fn record_connection(&self, count: usize) { - self.opened_connections.fetch_add(count, Ordering::AcqRel); - } -} - -impl MetricCounterReporter for MetricBackupCounter { - fn get_metrics(&mut self) -> (u64, usize) { - ( - *self.transmitted.get_mut(), - *self.opened_connections.get_mut(), - ) - } - fn move_metrics(&self) -> (u64, usize) { - ( - self.transmitted.swap(0, Ordering::AcqRel), - self.opened_connections.swap(0, Ordering::AcqRel), - ) - } -} - #[derive(Debug)] pub(crate) struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, - backup: Arc, } impl MetricCounterRecorder for MetricCounter { /// Record that some bytes were sent from the proxy to the client fn record_egress(&self, bytes: u64) { - self.transmitted.fetch_add(bytes, Ordering::AcqRel); - self.backup.record_egress(bytes); + self.transmitted.fetch_add(bytes, Ordering::Relaxed); } /// Record that some connections were opened fn record_connection(&self, count: usize) { - self.opened_connections.fetch_add(count, Ordering::AcqRel); - self.backup.record_connection(count); + self.opened_connections.fetch_add(count, Ordering::Relaxed); } } @@ -119,8 +84,8 @@ impl MetricCounterReporter for MetricCounter { } fn move_metrics(&self) -> (u64, usize) { ( - self.transmitted.swap(0, Ordering::AcqRel), - self.opened_connections.swap(0, Ordering::AcqRel), + self.transmitted.swap(0, Ordering::Relaxed), + self.opened_connections.swap(0, Ordering::Relaxed), ) } } @@ -173,26 +138,11 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { endpoints: DashMap, FastHasher>, - backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub(crate) fn register(&self, ids: Ids) -> Arc { - let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { - entry.clone() - } else { - self.backup_endpoints - .entry(ids.clone()) - .or_insert_with(|| { - Arc::new(MetricBackupCounter { - transmitted: AtomicU64::new(0), - opened_connections: AtomicUsize::new(0), - }) - }) - .clone() - }; - let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -202,7 +152,6 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), - backup: backup.clone(), }) }) .clone() @@ -227,6 +176,21 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result( now: DateTime, chunk_size: usize, ) -> impl Iterator>> + 'a { - // Split into chunks of 1000 metrics to avoid exceeding the max request size metrics_to_send .chunks(chunk_size) .map(move |chunk| EventChunk { @@ -303,11 +268,14 @@ fn create_event_chunks<'a>( }) } +#[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( endpoints: &DashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, + storage: Option<&GenericRemoteStorage>, + outer_chunk_size: usize, hostname: &str, prev: DateTime, now: DateTime, @@ -323,17 +291,54 @@ async fn collect_metrics_iteration( trace!("no new metrics to send"); } + let cancel = CancellationToken::new(); + let path_prefix = create_remote_path_prefix(now); + // Send metrics. - for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, outer_chunk_size) { + tokio::join!( + upload_main_events_chunked(client, metric_collection_endpoint, &chunk, CHUNK_SIZE), + async { + if let Err(e) = upload_backup_events(storage, &chunk, &path_prefix, &cancel).await { + error!("failed to upload consumption events to remote storage: {e:?}"); + } + } + ); + } +} + +fn create_remote_path_prefix(now: DateTime) -> String { + format!( + "year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z", + year = now.year(), + month = now.month(), + day = now.day(), + hour = now.hour(), + minute = now.minute(), + second = now.second(), + ) +} + +async fn upload_main_events_chunked( + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + chunk: &EventChunk<'_, Event>, + subchunk_size: usize, +) { + // Split into smaller chunks to avoid exceeding the max request size + for subchunk in chunk.events.chunks(subchunk_size).map(|c| EventChunk { + events: Cow::Borrowed(c), + }) { let res = client .post(metric_collection_endpoint.clone()) - .json(&chunk) + .json(&subchunk) .send() .await; let res = match res { Ok(x) => x, Err(err) => { + // TODO: retry? error!("failed to send metrics: {:?}", err); continue; } @@ -341,7 +346,7 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { + for metric in subchunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large warn!("potentially abnormal metric value: {:?}", metric); } @@ -349,113 +354,34 @@ async fn collect_metrics_iteration( } } -pub async fn task_backup( - backup_config: &MetricBackupCollectionConfig, - cancellation_token: CancellationToken, -) -> anyhow::Result<()> { - info!("metrics backup config: {backup_config:?}"); - scopeguard::defer! { - info!("metrics backup has shut down"); - } - // Even if the remote storage is not configured, we still want to clear the metrics. - let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { - Some( - GenericRemoteStorage::from_config(config) - .await - .context("remote storage init")?, - ) - } else { - None - }; - let mut ticker = tokio::time::interval(backup_config.interval); - let mut prev = Utc::now(); - let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); - loop { - select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; - let now = Utc::now(); - collect_metrics_backup_iteration( - &USAGE_METRICS.backup_endpoints, - storage.as_ref(), - &hostname, - prev, - now, - backup_config.chunk_size, - ) - .await; - - prev = now; - if cancellation_token.is_cancelled() { - info!("metrics backup has been cancelled"); - break; - } - } - Ok(()) -} - -#[instrument(skip_all)] -async fn collect_metrics_backup_iteration( - endpoints: &DashMap, FastHasher>, +async fn upload_backup_events( storage: Option<&GenericRemoteStorage>, - hostname: &str, - prev: DateTime, - now: DateTime, - chunk_size: usize, -) { - let year = now.year(); - let month = now.month(); - let day = now.day(); - let hour = now.hour(); - let minute = now.minute(); - let second = now.second(); - let cancel = CancellationToken::new(); - - info!("starting collect_metrics_backup_iteration"); - - let metrics_to_send = collect_and_clear_metrics(endpoints); - - if metrics_to_send.is_empty() { - trace!("no new metrics to send"); - } - - // Send metrics. - for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) { - let real_now = Utc::now(); - let id = uuid::Uuid::new_v7(Timestamp::from_unix( - NoContext, - real_now.second().into(), - real_now.nanosecond(), - )); - let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz"); - let remote_path = match RemotePath::from_string(&path) { - Ok(remote_path) => remote_path, - Err(e) => { - error!("failed to create remote path from str {path}: {:?}", e); - continue; - } - }; - - let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await; - - if let Err(e) = res { - error!( - "failed to upload consumption events to remote storage: {:?}", - e - ); - } - } -} - -async fn upload_events_chunk( - storage: Option<&GenericRemoteStorage>, - chunk: EventChunk<'_, Event>, - remote_path: &RemotePath, + chunk: &EventChunk<'_, Event>, + path_prefix: &str, cancel: &CancellationToken, ) -> anyhow::Result<()> { let Some(storage) = storage else { - error!("no remote storage configured"); + warn!("no remote storage configured"); return Ok(()); }; - let data = serde_json::to_vec(&chunk).context("serialize metrics")?; + + let real_now = Utc::now(); + let id = uuid::Uuid::new_v7(Timestamp::from_unix( + NoContext, + real_now.second().into(), + real_now.nanosecond(), + )); + let path = format!("{path_prefix}_{id}.json.gz"); + let remote_path = match RemotePath::from_string(&path) { + Ok(remote_path) => remote_path, + Err(e) => { + bail!("failed to create remote path from str {path}: {:?}", e); + } + }; + + // TODO: This is async compression from Vec to Vec. Rewrite as byte stream. + // Use sync compression in blocking threadpool. + let data = serde_json::to_vec(chunk).context("serialize metrics")?; let mut encoder = GzipEncoder::new(Vec::new()); encoder.write_all(&data).await.context("compress metrics")?; encoder.shutdown().await.context("compress metrics")?; @@ -464,7 +390,7 @@ async fn upload_events_chunk( || async { let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); storage - .upload(stream, compressed_data.len(), remote_path, None, cancel) + .upload(stream, compressed_data.len(), &remote_path, None, cancel) .await }, TimeoutOrCancel::caused_by_cancel, @@ -482,9 +408,12 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { + use std::fs; + use std::io::BufReader; use std::sync::{Arc, Mutex}; use anyhow::Error; + use camino_tempfile::tempdir; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; @@ -493,6 +422,7 @@ mod tests { use hyper::service::service_fn; use hyper::{Request, Response}; use hyper_util::rt::TokioIo; + use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use tokio::net::TcpListener; use url::Url; @@ -538,8 +468,34 @@ mod tests { let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); + let storage_test_dir = tempdir().unwrap(); + let local_fs_path = storage_test_dir.path().join("usage_metrics"); + fs::create_dir_all(&local_fs_path).unwrap(); + let storage = GenericRemoteStorage::from_config(&RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs { + local_path: local_fs_path.clone(), + }, + timeout: Duration::from_secs(10), + small_timeout: Duration::from_secs(1), + }) + .await + .unwrap(); + + let mut pushed_chunks: Vec = Vec::new(); + let mut stored_chunks: Vec = Vec::new(); + // no counters have been registered - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); @@ -551,39 +507,84 @@ mod tests { }); // the counter should be observed despite 0 egress - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); assert_eq!(r[0].events[0].value, 0); + pushed_chunks.extend(r); // record egress counter.record_egress(1); // egress should be observered - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); assert_eq!(r[0].events[0].value, 1); + pushed_chunks.extend(r); // release counter drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered assert!(metrics.endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) - .await; - assert!(!metrics.backup_endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) - .await; - // backup counter is unregistered after the second iteration - assert!(metrics.backup_endpoints.is_empty()); + let path_prefix = create_remote_path_prefix(now); + for entry in walkdir::WalkDir::new(&local_fs_path) + .into_iter() + .filter_map(|e| e.ok()) + { + let path = local_fs_path.join(&path_prefix).to_string(); + if entry.path().to_str().unwrap().starts_with(&path) { + let chunk = serde_json::from_reader(flate2::bufread::GzDecoder::new( + BufReader::new(fs::File::open(entry.into_path()).unwrap()), + )) + .unwrap(); + stored_chunks.push(chunk); + } + } + storage_test_dir.close().ok(); + + // sort by first event's idempotency key because the order of files is nondeterministic + pushed_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone()); + stored_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone()); + assert_eq!(pushed_chunks, stored_chunks); } } From 62b74bdc2c3f8123a14380ef17c7e506479cba83 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 1 Dec 2024 14:23:10 +0200 Subject: [PATCH 18/65] Add GUC controlling whether to pause recovery if some critical GUCs at replica have smaller value than on primary (#9057) ## Problem See https://github.com/neondatabase/neon/issues/9023 ## Summary of changes Ass GUC `recovery_pause_on_misconfig` allowing not to pause in case of replica and primary configuration mismatch See https://github.com/neondatabase/postgres/pull/501 See https://github.com/neondatabase/postgres/pull/502 See https://github.com/neondatabase/postgres/pull/503 See https://github.com/neondatabase/postgres/pull/504 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pgxn/neon/neon.c | 13 ++ .../regress/test_physical_replication.py | 221 +++++++++++++++++- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 +- 7 files changed, 241 insertions(+), 9 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 51b9f58bbc..ff08f9164d 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -15,6 +15,9 @@ #include "access/subtrans.h" #include "access/twophase.h" #include "access/xlog.h" +#if PG_MAJORVERSION_NUM >= 15 +#include "access/xlogrecovery.h" +#endif #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" @@ -432,6 +435,16 @@ _PG_init(void) restore_running_xacts_callback = RestoreRunningXactsFromClog; + DefineCustomBoolVariable( + "neon.allow_replica_misconfig", + "Allow replica startup when some critical GUCs have smaller value than on primary node", + NULL, + &allowReplicaMisconfig, + true, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + DefineCustomEnumVariable( "neon.running_xacts_overflow_policy", "Action performed on snapshot overflow when restoring runnings xacts from CLOG", diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 043aff686b..6cb11b825d 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -4,6 +4,10 @@ import random import time from typing import TYPE_CHECKING +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import wait_replica_caughtup + if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -19,8 +23,8 @@ def test_physical_replication(neon_simple_env: NeonEnv): p_cur.execute( "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))" ) - time.sleep(1) with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: + wait_replica_caughtup(primary, secondary) with primary.connect() as p_con: with p_con.cursor() as p_cur: with secondary.connect() as s_con: @@ -42,3 +46,218 @@ def test_physical_replication(neon_simple_env: NeonEnv): s_cur.execute( "select * from t where pk=%s", (random.randrange(1, 2 * pk),) ) + + +def test_physical_replication_config_mismatch_max_connections(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_connections). + PostgreSQL enforces that settings that affect how many transactions can be open at the same time + have values equal to or higher in a hot standby replica than in the primary. If they don't, the replica refuses + to start up. If the settings are changed in the primary, it emits a WAL record with the new settings, and + when the replica sees that record it pauses the replay. + + PostgreSQL enforces this to ensure that the replica can hold all the XIDs in the so-called + "known-assigned XIDs" array, which is a fixed size array that needs to be allocated + upfront and server startup. That's pretty pessimistic, though; usually you can get + away with smaller settings, because we allocate space for 64 subtransactions per + transaction too. If you get unlucky and you run out of space, WAL redo dies with + "ERROR: too many KnownAssignedXids". It's better to take the chances than refuse + to start up, especially in Neon: if the WAL redo dies, the server is restarted, which is + no worse than refusing to start up in the first place. Furthermore, the control plane + tries to ensure that on restart, the settings are set high enough, so most likely it will + work after restart. Because of that, we have patched Postgres to disable to checks when + the `recovery_pause_on_misconfig` setting is set to `false` (which is the default on neon). + + This test tests all those cases of running out of space in known-assigned XIDs array that + we can hit with `recovery_pause_on_misconfig=false`, which are unreachable in unpatched + Postgres. + There's a similar check for `max_locks_per_transactions` too, which is related to running out + of space in the lock manager rather than known-assigned XIDs. Similar story with that, although + running out of space in the lock manager is possible in unmodified Postgres too. Enforcing the + check for `max_locks_per_transactions` ensures that you don't run out of space in the lock manager + when there are no read-only queries holding locks in the replica, but you can still run out if you have + those. + """ + env = neon_simple_env + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute( + "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))" + ) + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["max_connections=5"], + ) as secondary: + wait_replica_caughtup(primary, secondary) + with secondary.connect() as s_con: + with s_con.cursor() as s_cur: + cursors = [] + for i in range(10): + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute("insert into t (pk) values (%s)", (i,)) + cursors.append(p_cur) + + for p_cur in cursors: + p_cur.execute("commit") + + wait_replica_caughtup(primary, secondary) + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == 10 + + +def test_physical_replication_config_mismatch_max_prepared(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_prepared_transactions). + If number of transactions at primary exceeds its limit at replica then WAL replay is terminated. + """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=["max_prepared_transactions=10"], + ) + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))") + + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["max_prepared_transactions=5"], + ) + wait_replica_caughtup(primary, secondary) + + s_con = secondary.connect() + s_cur = s_con.cursor() + cursors = [] + for i in range(10): + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute("insert into t (pk) values (%s)", (i,)) + p_cur.execute(f"prepare transaction 't{i}'") + cursors.append(p_cur) + + for i in range(10): + cursors[i].execute(f"commit prepared 't{i}'") + + time.sleep(5) + with pytest.raises(Exception) as e: + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == 10 + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("maximum number of prepared transactions reached") + + +def connect(ep): + max_reconnect_attempts = 10 + for _ in range(max_reconnect_attempts): + try: + return ep.connect() + except Exception as e: + log.info(f"Failed to connect with primary: {e}") + time.sleep(1) + + +def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_connections). + In this case large difference in this setting and larger number of concurrent transactions at primary + # cause too many known xids error at replica. + """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=[ + "max_connections=1000", + "shared_buffers=128MB", # prevent "no unpinned buffers available" error + ], + ) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_connections=2", + "autovacuum_max_workers=1", + "max_worker_processes=5", + "max_wal_senders=1", + "superuser_reserved_connections=0", + ], + ) + + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(x integer)") + + n_connections = 990 + cursors = [] + for i in range(n_connections): + p_con = connect(primary) + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute(f"insert into t values({i})") + cursors.append(p_cur) + + for cur in cursors: + cur.execute("commit") + + time.sleep(5) + with pytest.raises(Exception) as e: + s_con = secondary.connect() + s_cur = s_con.cursor() + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == n_connections + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("too many KnownAssignedXids") + + +def test_physical_replication_config_mismatch_max_locks_per_transaction(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_locks_per_transaction). + In conjunction with different number of max_connections at primary and standby it can cause "out of shared memory" + error if the primary obtains more AccessExclusiveLocks than the standby can hold. + """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=[ + "max_locks_per_transaction = 100", + ], + ) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_connections=10", + "max_locks_per_transaction = 10", + ], + ) + + n_tables = 1000 + + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + for i in range(n_tables): + p_cur.execute(f"CREATE TABLE t_{i}(x integer)") + p_cur.execute("commit") + + with pytest.raises(Exception) as e: + wait_replica_caughtup(primary, secondary) + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("You might need to increase") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c1989c934d..373f9decad 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c1989c934d46e04e78b3c496c8a34bcd40ddceeb +Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index d929b9a8b9..972e325e62 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6 +Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 13e9e35394..dff6615a8e 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 13e9e3539419003e79bd9aa29e1bc44f3fd555dd +Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index faebe5e5af..a10d95be67 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit faebe5e5aff5687908504453623778f8515529db +Subproject commit a10d95be67265e0f10a422ba0457f5a7af01de71 diff --git a/vendor/revisions.json b/vendor/revisions.json index abeddcadf7..8a73e14dcf 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "faebe5e5aff5687908504453623778f8515529db" + "a10d95be67265e0f10a422ba0457f5a7af01de71" ], "v16": [ "16.6", - "13e9e3539419003e79bd9aa29e1bc44f3fd555dd" + "dff6615a8e48a10bb17a03fa3c00635f1ace7a92" ], "v15": [ "15.10", - "d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6" + "972e325e62b455957adbbdd8580e31275bb5b8c9" ], "v14": [ "14.15", - "c1989c934d46e04e78b3c496c8a34bcd40ddceeb" + "373f9decad933d2d46f321231032ae8b0da81acd" ] } From 030810ed3e44acf2ff9845c4ce57b2f75d01dfcf Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sun, 1 Dec 2024 13:04:37 +0000 Subject: [PATCH 19/65] Compute image: prepare Postgres v14-v16 for Debian 12 (#9954) ## Problem Current compute images for Postgres 14-16 don't build on Debian 12 because of issues with extensions. This PR fixes that, but for the current setup, it is mostly a no-op change. ## Summary of changes - Use `/bin/bash -euo pipefail` as SHELL to fail earlier - Fix `plv8` build: backport a trivial patch for v8 - Fix `postgis` build: depend `sfgal` version on Debian version instead of Postgres version Tested in: https://github.com/neondatabase/neon/pull/9849 --- compute/compute-node.Dockerfile | 17 ++++++++----- compute/patches/plv8-3.1.10.patch | 42 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 compute/patches/plv8-3.1.10.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 2fcd9985bc..9567018053 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -14,6 +14,9 @@ ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim FROM debian:$DEBIAN_FLAVOR AS build-deps ARG DEBIAN_VERSION +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. @@ -106,6 +109,7 @@ RUN cd postgres && \ # ######################################################################################### FROM build-deps AS postgis-build +ARG DEBIAN_VERSION ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -122,12 +126,12 @@ RUN apt update && \ # and also we must check backward compatibility with older versions of PostGIS. # # Use new version only for v17 -RUN case "${PG_VERSION}" in \ - "v17") \ +RUN case "${DEBIAN_VERSION}" in \ + "bookworm") \ export SFCGAL_VERSION=1.4.1 \ export SFCGAL_CHECKSUM=1800c8a26241588f11cddcf433049e9b9aea902e923414d2ecef33a3295626c3 \ ;; \ - "v14" | "v15" | "v16") \ + "bullseye") \ export SFCGAL_VERSION=1.3.10 \ export SFCGAL_CHECKSUM=4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 \ ;; \ @@ -228,6 +232,8 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch + RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang @@ -239,8 +245,6 @@ RUN apt update && \ # # Use new version only for v17 # because since v3.2, plv8 doesn't include plcoffee and plls extensions -ENV PLV8_TAG=v3.2.3 - RUN case "${PG_VERSION}" in \ "v17") \ export PLV8_TAG=v3.2.3 \ @@ -255,8 +259,9 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ + if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \ # generate and copy upgrade scripts - mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ + mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ diff --git a/compute/patches/plv8-3.1.10.patch b/compute/patches/plv8-3.1.10.patch new file mode 100644 index 0000000000..43cdb479f7 --- /dev/null +++ b/compute/patches/plv8-3.1.10.patch @@ -0,0 +1,42 @@ +commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e +Author: Alexander Bayandin +Date: Sat Nov 30 18:29:32 2024 +0000 + + Fix v8 9.7.37 compilation on Debian 12 + +diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch +new file mode 100644 +index 0000000..f0a5dc7 +--- /dev/null ++++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch +@@ -0,0 +1,30 @@ ++From 84cf3230a9680aac3b73c410c2b758760b6d3066 Mon Sep 17 00:00:00 2001 ++From: Michael Lippautz ++Date: Thu, 27 Jan 2022 14:14:11 +0100 ++Subject: [PATCH] cppgc: Fix include ++ ++Add to cover for std::exchange. ++ ++Bug: v8:12585 ++Change-Id: Ida65144e93e466be8914527d0e646f348c136bcb ++Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3420309 ++Auto-Submit: Michael Lippautz ++Reviewed-by: Omer Katz ++Commit-Queue: Michael Lippautz ++Cr-Commit-Position: refs/heads/main@{#78820} ++--- ++ src/heap/cppgc/prefinalizer-handler.h | 1 + ++ 1 file changed, 1 insertion(+) ++ ++diff --git a/src/heap/cppgc/prefinalizer-handler.h b/src/heap/cppgc/prefinalizer-handler.h ++index bc17c99b1838..c82c91ff5a45 100644 ++--- a/src/heap/cppgc/prefinalizer-handler.h +++++ b/src/heap/cppgc/prefinalizer-handler.h ++@@ -5,6 +5,7 @@ ++ #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ ++ #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ ++ +++#include ++ #include ++ ++ #include "include/cppgc/prefinalizer.h" From 1e08b5dccc13030f76081d9a6c0cd343c8c29393 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 1 Dec 2024 17:47:28 +0200 Subject: [PATCH 20/65] Fix issues with prefetch ring buffer resize (#9847) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1732110190129479 We observe the following error in the logs ``` [XX000] ERROR: [NEON_SMGR] [shard 3] Incorrect prefetch read: status=1 response=0x7fafef335138 my=128 receive=128 ``` most likely caused by changing `neon.readahead_buffer_size` ## Summary of changes 1. Copy shard state 2. Do not use prefetch_set_unused in readahead_buffer_resize 3. Change prefetch buffer overflow criteria --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 13 +++++- .../regress/test_prefetch_buffer_resize.py | 40 +++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 test_runner/regress/test_prefetch_buffer_resize.py diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index cbb0e2ae6d..a5e0c402fb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -439,6 +439,8 @@ readahead_buffer_resize(int newsize, void *extra) newPState->ring_unused = newsize; newPState->ring_receive = newsize; newPState->ring_flush = newsize; + newPState->max_shard_no = MyPState->max_shard_no; + memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); /* * Copy over the prefetches. @@ -495,7 +497,11 @@ readahead_buffer_resize(int newsize, void *extra) for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { - prefetch_set_unused(end); + PrefetchRequest *slot = GetPrfSlot(end); + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + } } prfh_destroy(MyPState->prf_hash); @@ -944,6 +950,9 @@ Retry: Assert(entry == NULL); Assert(slot == NULL); + /* There should be no buffer overflow */ + Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); + /* * If the prefetch queue is full, we need to make room by clearing the * oldest slot. If the oldest slot holds a buffer that was already @@ -958,7 +967,7 @@ Retry: * a prefetch request kind of goes against the principles of * prefetching) */ - if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused) + if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused) { uint64 cleanup_index = MyPState->ring_last; diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py new file mode 100644 index 0000000000..7676b78b0e --- /dev/null +++ b/test_runner/regress/test_prefetch_buffer_resize.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +import random + +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.parametrize("shard_count", [None, 4]) +@pytest.mark.timeout(600) +def test_prefetch(neon_env_builder: NeonEnvBuilder, shard_count: int | None): + if shard_count is not None: + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count, + ) + n_iter = 10 + n_rec = 100000 + + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "shared_buffers=10MB", + ], + ) + + cur = endpoint.connect().cursor() + + cur.execute("CREATE TABLE t(pk integer, filler text default repeat('?', 200))") + cur.execute(f"insert into t (pk) values (generate_series(1,{n_rec}))") + + cur.execute("set statement_timeout=0") + cur.execute("set effective_io_concurrency=20") + cur.execute("set max_parallel_workers_per_gather=0") + + for _ in range(n_iter): + buf_size = random.randrange(16, 32) + cur.execute(f"set neon.readahead_buffer_size={buf_size}") + limit = random.randrange(1, n_rec) + cur.execute(f"select sum(pk) from (select pk from t limit {limit}) s") From fdf231c237c1929f1b11c893dc9666e4c89e1722 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sun, 1 Dec 2024 18:09:58 +0000 Subject: [PATCH 21/65] storcon: don't take any Service locks in /status and /ready (#9944) ## Problem We saw unexpected container terminations when running in k8s with with small CPU resource requests. The /status and /ready handlers called `maybe_forward`, which always takes the lock on Service::inner. If there is a lot of writer lock contention, and the container is starved of CPU, this increases the likelihood that we will get killed by the kubelet. It isn't certain that this was a cause of issues, but it is a potential source that we can eliminate. ## Summary of changes - Revise logic to return immediately if the URL is in the non-forwarded list, rather than calling maybe_forward --- storage_controller/src/http.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 9b5d4caf31..39e078ba7c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1452,10 +1452,15 @@ async fn maybe_forward(req: Request) -> ForwardOutcome { let uri = req.uri().to_string(); let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str()); + // Fast return before trying to take any Service locks, if we will never forward anyway + if !uri_for_forward { + return ForwardOutcome::NotForwarded(req); + } + let state = get_state(&req); let leadership_status = state.service.get_leadership_status(); - if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward { + if leadership_status != LeadershipStatus::SteppedDown { return ForwardOutcome::NotForwarded(req); } From 7fc2912d06a79e70091bdd51d422835848048be3 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 2 Dec 2024 10:10:51 +0000 Subject: [PATCH 22/65] Update pgvector to 0.8.0 (#9733) --- compute/compute-node.Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 9567018053..222a0cb88b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -358,10 +358,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. # -# vector 0.7.4 supports v17 -# last release v0.7.4 - Aug 5, 2024 -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \ - echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \ +# vector >0.7.4 supports v17 +# last release v0.8.0 - Oct 30, 2024 +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \ + echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ From 0e1a33660754bb216cd954e61ff90085a64e2d24 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 2 Dec 2024 12:26:15 +0200 Subject: [PATCH 23/65] test_runner: improve `wait_until` (#9936) Improves `wait_until` by: * Use `timeout` instead of `iterations`. This allows changing the timeout/interval parameters independently. * Make `timeout` and `interval` optional (default 20s and 0.5s). Most callers don't care. * Only output status every 1s by default, and add optional `status_interval` parameter. * Remove `show_intermediate_error`, this was always emitted anyway. Most callers have been updated to use the defaults, except where they had good reason otherwise. --- test_runner/fixtures/neon_fixtures.py | 14 ++-- test_runner/fixtures/pageserver/utils.py | 17 +---- test_runner/fixtures/safekeeper/http.py | 2 +- test_runner/fixtures/safekeeper/utils.py | 2 +- test_runner/fixtures/utils.py | 35 +++++----- test_runner/logical_repl/test_clickhouse.py | 6 +- test_runner/logical_repl/test_debezium.py | 12 +--- .../performance/test_branch_creation.py | 8 +-- .../regress/test_attach_tenant_config.py | 2 - test_runner/regress/test_compaction.py | 2 +- .../regress/test_disk_usage_eviction.py | 10 +-- test_runner/regress/test_hot_standby.py | 6 +- .../regress/test_layers_from_future.py | 2 +- test_runner/regress/test_logging.py | 4 +- .../regress/test_logical_replication.py | 6 +- test_runner/regress/test_lsn_mapping.py | 4 +- test_runner/regress/test_neon_superuser.py | 2 +- test_runner/regress/test_ondemand_download.py | 16 ++--- test_runner/regress/test_pageserver_api.py | 12 +--- .../regress/test_pageserver_generations.py | 14 ++-- .../test_pageserver_getpage_throttle.py | 7 +- .../regress/test_pageserver_layer_rolling.py | 16 ++--- .../regress/test_pageserver_restart.py | 2 +- .../regress/test_pageserver_secondary.py | 6 +- test_runner/regress/test_readonly_node.py | 2 - test_runner/regress/test_remote_storage.py | 42 ++++++----- test_runner/regress/test_replica_start.py | 2 +- test_runner/regress/test_sharding.py | 18 ++--- .../regress/test_storage_controller.py | 70 +++++++++---------- test_runner/regress/test_storage_scrubber.py | 2 - .../regress/test_subscriber_restart.py | 2 +- test_runner/regress/test_tenant_conf.py | 6 +- test_runner/regress/test_tenant_delete.py | 14 ++-- test_runner/regress/test_tenant_detach.py | 8 +-- test_runner/regress/test_tenant_relocation.py | 6 +- test_runner/regress/test_tenant_size.py | 4 +- test_runner/regress/test_tenant_tasks.py | 2 +- test_runner/regress/test_tenants.py | 2 +- .../test_tenants_with_remote_storage.py | 12 +--- test_runner/regress/test_timeline_archive.py | 6 +- test_runner/regress/test_timeline_delete.py | 61 +++++----------- .../regress/test_timeline_detach_ancestor.py | 30 ++++---- .../regress/test_timeline_gc_blocking.py | 2 +- test_runner/regress/test_timeline_size.py | 36 +++++----- test_runner/regress/test_wal_acceptor.py | 22 +++--- test_runner/regress/test_wal_receiver.py | 4 +- 46 files changed, 234 insertions(+), 326 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9bcfffeb9c..5709a3b82b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1736,7 +1736,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def storage_controller_ready(): assert self.ready() is True - wait_until(30, 1, storage_controller_ready) + wait_until(storage_controller_ready) return time.time() - t1 def attach_hook_issue( @@ -2574,7 +2574,7 @@ class NeonPageserver(PgProtocol, LogUtils): log.info(f"any_unstable={any_unstable}") assert not any_unstable - wait_until(20, 0.5, complete) + wait_until(complete) def __enter__(self) -> Self: return self @@ -3973,7 +3973,7 @@ class Endpoint(PgProtocol, LogUtils): migration_id: int = cur.fetchall()[0][0] assert migration_id >= num_migrations - wait_until(20, 0.5, check_migrations_done) + wait_until(check_migrations_done) # Mock the extension part of spec passed from control plane for local testing # endpooint.rs adds content of this file as a part of the spec.json @@ -4489,12 +4489,10 @@ class Safekeeper(LogUtils): ) assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn() - # xxx: max wait is long because we might be waiting for reconnection from - # pageserver to this safekeeper - wait_until(30, 1, are_lsns_advanced) + wait_until(are_lsns_advanced) client.checkpoint(tenant_id, timeline_id) if wait_wal_removal: - wait_until(30, 1, are_segments_removed) + wait_until(are_segments_removed) def wait_until_paused(self, failpoint: str): msg = f"at failpoint {failpoint}" @@ -4503,7 +4501,7 @@ class Safekeeper(LogUtils): log.info(f"waiting for hitting failpoint {failpoint}") self.assert_log_contains(msg) - wait_until(20, 0.5, paused) + wait_until(paused) class NeonBroker(LogUtils): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 46700e3fe3..7c10edc5fc 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -13,7 +13,7 @@ from mypy_boto3_s3.type_defs import ( from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage +from fixtures.remote_storage import RemoteStorage, S3Storage from fixtures.utils import wait_until if TYPE_CHECKING: @@ -269,12 +269,7 @@ def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - iterations: int, - interval: float | None = None, ): - if interval is None: - interval = 0.25 - def timeline_is_missing(): data = {} try: @@ -287,19 +282,17 @@ def wait_timeline_detail_404( raise RuntimeError(f"Timeline exists state {data.get('state')}") - wait_until(iterations, interval, func=timeline_is_missing) + wait_until(timeline_is_missing) def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - iterations: int = 20, - interval: float | None = None, **delete_args, ) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) - wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) + wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id) # remote_storage must not be None, but that's easier for callers to make mypy happy @@ -453,7 +446,3 @@ def many_small_layers_tenant_config() -> dict[str, Any]: "checkpoint_distance": 1024**2, "image_creation_threshold": 100, } - - -def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: - return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15 diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 094188c0b5..286f80ba69 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -175,7 +175,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert s > Lsn(0) return s - return wait_until(30, 1, timeline_start_lsn_non_zero) + return wait_until(timeline_start_lsn_non_zero) def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 0246916470..922cdedccc 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -19,4 +19,4 @@ def wait_walreceivers_absent( log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") assert len(status.walreceivers) == 0 - wait_until(30, 0.5, walreceivers_absent) + wait_until(walreceivers_absent) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 04e98fe494..c34ac298d1 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -9,6 +9,7 @@ import tarfile import threading import time from collections.abc import Callable, Iterable +from datetime import datetime, timedelta from hashlib import sha256 from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar @@ -380,15 +381,10 @@ def start_in_background( if return_code is not None: error = f"expected subprocess to run but it exited with code {return_code}" else: - attempts = 10 try: - wait_until( - number_of_iterations=attempts, - interval=1, - func=is_started, - ) + wait_until(is_started, timeout=10) except Exception: - error = f"Failed to get correct status from subprocess in {attempts} attempts" + error = "Failed to get correct status from subprocess" except Exception as e: error = f"expected subprocess to start but it failed with exception: {e}" @@ -402,28 +398,31 @@ def start_in_background( def wait_until( - number_of_iterations: int, - interval: float, func: Callable[[], WaitUntilRet], - show_intermediate_error: bool = False, + name: str | None = None, + timeout: float = 20.0, # seconds + interval: float = 0.5, # seconds + status_interval: float = 1.0, # seconds ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. """ + if name is None: + name = getattr(func, "__name__", repr(func)) + deadline = datetime.now() + timedelta(seconds=timeout) + next_status = datetime.now() last_exception = None - for i in range(number_of_iterations): + while datetime.now() <= deadline: try: - res = func() + return func() except Exception as e: - log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) + if datetime.now() >= next_status: + log.info("waiting for %s: %s", name, e) + next_status = datetime.now() + timedelta(seconds=status_interval) last_exception = e - if show_intermediate_error: - log.info(e) time.sleep(interval) - continue - return res - raise Exception(f"timed out while waiting for {func}") from last_exception + raise Exception(f"timed out while waiting for {name}") from last_exception def assert_eq(a, b) -> None: diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py index 8e03bbe5d4..6b522fa46d 100644 --- a/test_runner/logical_repl/test_clickhouse.py +++ b/test_runner/logical_repl/test_clickhouse.py @@ -60,24 +60,22 @@ def test_clickhouse(remote_pg: RemotePostgres): "SETTINGS materialized_postgresql_tables_list = 'table1';" ) wait_until( - 120, - 0.5, lambda: query_clickhouse( client, "select * from db1_postgres.table1 order by 1", "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", ), + timeout=60, ) cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") conn.commit() wait_until( - 120, - 0.5, lambda: query_clickhouse( client, "select * from db1_postgres.table1 order by 1", "9eba2daaf7e4d7d27ac849525f68b562ab53947d", ), + timeout=60, ) log.debug("Sleeping before final checking if Neon is still alive") time.sleep(3) diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py index d2cb087c92..8023d64d3d 100644 --- a/test_runner/logical_repl/test_debezium.py +++ b/test_runner/logical_repl/test_debezium.py @@ -148,14 +148,12 @@ def test_debezium(debezium): ) conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, ), - show_intermediate_error=True, + timeout=60, ) ts_ms = time.time() * 1000 log.info("Insert 2 ts_ms: %s", ts_ms) @@ -165,28 +163,24 @@ def test_debezium(debezium): ) conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, ), - show_intermediate_error=True, + timeout=60, ) ts_ms = time.time() * 1000 log.info("Update ts_ms: %s", ts_ms) cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "Alexander"}, ), - show_intermediate_error=True, + timeout=60, ) time.sleep(3) cur.execute("select 1") diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index c50c4ad432..3ce27d6cd3 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -137,7 +137,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: startup_line = "INFO version: git(-env)?:" # find the first line of the log file so we can find the next start later - _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + _, first_start = wait_until(lambda: env.pageserver.assert_log_contains(startup_line)) # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier @@ -156,7 +156,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: ) _, second_start = wait_until( - 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + lambda: env.pageserver.assert_log_contains(startup_line, first_start), ) env.pageserver.quiesce_tenants() @@ -164,8 +164,6 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # wait for compaction to complete, which most likely has already done so multiple times msg, _ = wait_until( - 30, - 1, lambda: env.pageserver.assert_log_contains( f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start ), @@ -205,7 +203,7 @@ def wait_and_record_startup_metrics( assert len(matching) == len(expected_labels) return matching - samples = wait_until(10, 1, metrics_are_filled) + samples = wait_until(metrics_are_filled) for sample in samples: phase = sample.labels["phase"] diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 670c2698f5..45112fd67e 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -64,8 +64,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N ) wait_until( - 50, - 0.1, lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), ) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 302a8fd0d1..b6741aed68 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -385,7 +385,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) # Wait for enough failures to break the circuit breaker # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s - wait_until(60, 1, assert_broken) + wait_until(assert_broken, timeout=60) # Sleep for a while, during which time we expect that compaction will _not_ be retried time.sleep(10) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 1807511008..05956b5b93 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -211,7 +211,7 @@ class EvictionEnv: pageserver.assert_log_contains(".*running mocked statvfs.*") # we most likely have already completed multiple runs - wait_until(10, 1, statvfs_called) + wait_until(statvfs_called) def count_layers_per_tenant( @@ -772,14 +772,14 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): ) wait_until( - 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") ) def less_than_max_usage_pct(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - wait_until(2, 2, less_than_max_usage_pct) + wait_until(less_than_max_usage_pct, timeout=5) # Disk usage candidate collection only takes into account active tenants. # However, the statvfs call takes into account the entire tenants directory, @@ -825,7 +825,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): ) wait_until( - 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved"), ) def more_than_min_avail_bytes_freed(): @@ -834,7 +834,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): total_size - post_eviction_total_size >= min_avail_bytes ), f"we requested at least {min_avail_bytes} worth of free space" - wait_until(2, 2, more_than_min_avail_bytes_freed) + wait_until(more_than_min_avail_bytes_freed, timeout=5) def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 0b1ac11c16..4044f25b37 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -257,7 +257,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # Wait until we see that the pgbench_accounts is created + filled on replica *and* # index is created. Otherwise index creation would conflict with # read queries and hs feedback won't save us. - wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) + wait_until(partial(pgbench_accounts_initialized, secondary), timeout=60) # Test should fail if hs feedback is disabled anyway, but cross # check that walproposer sets some xmin. @@ -269,7 +269,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"xmin is {slot_xmin}") assert int(slot_xmin) > 0 - wait_until(10, 1.0, xmin_is_not_null) + wait_until(xmin_is_not_null) for _ in range(1, 5): # in debug mode takes about 5-7s balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") @@ -286,7 +286,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"xmin is {slot_xmin}") assert slot_xmin is None - wait_until(10, 1.0, xmin_is_null) + wait_until(xmin_is_null) # Test race condition between WAL replay and backends performing queries diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 761ec7568f..8818b40712 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -206,7 +206,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): future_layers = set(get_future_layers()) assert future_layer not in future_layers - wait_until(10, 0.5, future_layer_is_gone_from_index_part) + wait_until(future_layer_is_gone_from_index_part) # We already make deletion stuck here, but we don't necessarily hit the failpoint # because deletions are batched. diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index f6fbdcabfd..d94c786f49 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -37,7 +37,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): return env.pageserver.assert_log_contains(f".*{msg_id}.*") - wait_until(10, 0.5, assert_logged) + wait_until(assert_logged) # make sure it's counted def assert_metric_value(): @@ -49,4 +49,4 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): log.info("libmetrics_tracing_event_count: %s", val) assert val > (before or 0.0) - wait_until(10, 1, assert_metric_value) + wait_until(assert_metric_value) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ba471b7147..db18e1758c 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -207,7 +207,7 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgre log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) + wait_until(partial(slot_removed, endpoint)) def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): @@ -519,7 +519,7 @@ def test_replication_shutdown(neon_simple_env: NeonEnv): assert len(res) == 4 assert [r[0] for r in res] == [10, 20, 30, 40] - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: @@ -549,7 +549,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication ) assert flush_lsn >= publisher_flush_lsn - wait_until(30, 0.5, check_caughtup) + wait_until(check_caughtup) return publisher_flush_lsn diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 7f0b541128..e42e71646d 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -169,7 +169,7 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) with pytest.raises(ReadTimeout): @@ -178,8 +178,6 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): client.configure_failpoints((failpoint, "off")) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains( "Cancelled request finished with an error: Cancelled$", offset ), diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 7118127a1f..49cd91906f 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -77,7 +77,7 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): assert len(res) == 4 assert [r[0] for r in res] == [10, 20, 30, 40] - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) # Test that pg_monitor is working for neon_superuser role cur.execute("SELECT query from pg_stat_activity LIMIT 1") diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index e1caaeb6c1..028d1c2e49 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -256,7 +256,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): ##### Second start, restore the data and ensure it's the same env.pageserver.start() - wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) # The current_physical_size reports the sum of layers loaded in the layer # map, regardless of where the layer files are located. So even though we @@ -413,7 +413,7 @@ def test_download_remote_layers_api( ] ) - wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) ###### Phase 1: exercise download error code path @@ -705,7 +705,7 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) location_conf = {"mode": "Detached", "tenant_conf": {}} @@ -713,8 +713,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains( "closing is taking longer than expected", offset ), @@ -734,8 +732,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "pause")) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), ) @@ -750,8 +746,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), ) @@ -805,7 +799,7 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) # ensure enough time while paused to trip the timeout time.sleep(2) @@ -824,8 +818,6 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): # capture the next offset for a new synchronization with the failpoint _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), ) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 05e81b82e0..55fd7a8608 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -117,19 +117,11 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): # We need to wait here because it's possible that we don't have access to # the latest WAL yet, when the `timeline_detail` API is first called. # See: https://github.com/neondatabase/neon/issues/1768. - lsn = wait_until( - number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None), - ) + lsn = wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None)) # Make a DB modification then expect getting a new WAL receiver's data. endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')") - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn), - ) + wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn)) def test_pageserver_http_api_client(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 6ba5753420..7e5bb45242 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -352,7 +352,7 @@ def test_deletion_queue_recovery( def assert_some_validations(): assert get_deletion_queue_validated(ps_http) > 0 - wait_until(20, 1, assert_some_validations) + wait_until(assert_some_validations) # The validatated keys statistic advances before the header is written, so we # also wait to see the header hit the disk: this seems paranoid but the race @@ -360,7 +360,7 @@ def test_deletion_queue_recovery( def assert_header_written(): assert (main_pageserver.workdir / "deletion" / "header-01").exists() - wait_until(20, 1, assert_header_written) + wait_until(assert_header_written) # If we will lose attachment, then our expectation on restart is that only the ones # we already validated will execute. Act like only those were present in the queue. @@ -382,11 +382,11 @@ def test_deletion_queue_recovery( # After restart, issue a flush to kick the deletion frontend to do recovery. # It should recover all the operations we submitted before the restart. ps_http.deletion_queue_flush(execute=False) - wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth)) + wait_until(lambda: assert_deletions_submitted(before_restart_depth)) # The queue should drain through completely if we flush it ps_http.deletion_queue_flush(execute=True) - wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) + wait_until(lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) if keep_attachment == KeepAttachment.KEEP: # - If we kept the attachment, then our pre-restart deletions should execute @@ -564,7 +564,7 @@ def test_multi_attach( ) # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) with pytest.raises(PageserverApiException): http_clients[1].timeline_detail(tenant_id, timeline_id) @@ -579,8 +579,8 @@ def test_multi_attach( pageservers[1].tenant_attach(env.initial_tenant) pageservers[2].tenant_attach(env.initial_tenant) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) # Now they all have it attached _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients]) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index f1aad85fe9..ba6a1d9045 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -81,9 +81,7 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P marker = uuid.uuid4().hex ps_http.post_tracing_event("info", marker) - _, marker_offset = wait_until( - 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) - ) + _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None)) log.info("run pagebench") duration_secs = 10 @@ -103,12 +101,11 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P log.info("validate that we logged the throttling") wait_until( - 10, - compaction_period / 10, lambda: env.pageserver.assert_log_contains( f".*{tenant_id}.*shard was throttled in the last n_seconds.*", offset=marker_offset, ), + timeout=compaction_period, ) log.info("validate that the metric doesn't include throttle wait time") diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index f6a7bfa1ad..706da1e35e 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -84,7 +84,7 @@ def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: # The metric gets initialised on the first update. # Retry a few times, but return 0 if it's stable. try: - return float(wait_until(3, 0.5, query)) + return float(wait_until(query, timeout=2, interval=0.5)) except Exception: return 0 @@ -131,7 +131,7 @@ def test_pageserver_small_inmemory_layers( wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + wait_until(lambda: assert_dirty_bytes_nonzero(env)) ps_http_client = env.pageserver.http_client() total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) @@ -139,7 +139,7 @@ def test_pageserver_small_inmemory_layers( # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) + wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS) # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they # must be uploaded to remain visible to the pageserver after restart. @@ -180,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + wait_until(lambda: assert_dirty_bytes_nonzero(env)) # Stop the safekeepers, so that we cannot have any more WAL receiver connections for sk in env.safekeepers: @@ -193,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) + wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS) # The code below verifies that we do not flush on the first write # after an idle period longer than the checkpoint timeout. @@ -210,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) ) - dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + dirty_after_write = wait_until(lambda: assert_dirty_bytes_nonzero(env)) # We shouldn't flush since we've just opened a new layer waited_for = 0 @@ -305,11 +305,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): # Wait until enough layers have rolled that the amount of dirty data is under the threshold. # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. - wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) + wait_until(assert_bytes_rolled, timeout=2 * compaction_period_s) # The end state should also have the reported metric under the limit def assert_dirty_data_limited(): dirty_bytes = get_dirty_bytes(env) assert dirty_bytes < max_dirty_data - wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) + wait_until(lambda: assert_dirty_data_limited(), timeout=2 * compaction_period_s) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 4bf5705517..835ccbd5d4 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -103,7 +103,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): raise AssertionError("No 'complete' metric yet") - wait_until(30, 1.0, assert_complete) + wait_until(assert_complete) # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value expectations = [ diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index a264f4d3c9..1292682f9e 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -356,7 +356,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): ) assert destination_lsn >= origin_lsn - wait_until(100, 0.1, caught_up) + wait_until(caught_up) # The destination should accept writes workload.churn_rows(64, pageserver_b.id) @@ -411,7 +411,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): assert submitted is not None assert submitted > 0 - wait_until(10, 0.1, blocked_deletions_drained) + wait_until(blocked_deletions_drained) workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) @@ -702,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): else: timeout = int(deadline - now) + 1 try: - wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) + wait_until(lambda: pageserver.assert_log_contains(expression), timeout=timeout) except: log.error(f"Timed out waiting for '{expression}'") raise diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 70d558ac5a..c13bea7ee1 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -215,8 +215,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # wait for lease renewal before running query. _, offset = wait_until( - 20, - 0.5, lambda: ep_static.assert_log_contains( "lsn_lease_bg_task.*Request succeeded", offset=offset ), diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 137e75f784..76a42ef4a2 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -300,9 +300,9 @@ def test_remote_storage_upload_queue_retries( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -333,16 +333,28 @@ def test_remote_storage_upload_queue_retries( # wait for churn thread's data to get stuck in the upload queue # Exponential back-off in upload queue, so, gracious timeouts. - wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until( + lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 + ) # unblock churn operations configure_storage_sync_failpoints("off") - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 + ) # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. @@ -580,7 +592,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( > 0 ) - wait_until(200, 0.1, assert_compacted_and_uploads_queued) + wait_until(assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. @@ -598,9 +610,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ] ) - # Generous timeout, because currently deletions can get blocked waiting for compaction - # This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed. - timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1) + timeline_delete_wait_completed(client, tenant_id, timeline_id) assert not timeline_path.exists() @@ -826,22 +836,16 @@ def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 ), ) wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 ), ) wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 ), diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 8e7c01f950..e2a22cc769 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -378,7 +378,7 @@ def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): return None raise RuntimeError("connection succeeded") - wait_until(20, 0.5, check_replica_crashed) + wait_until(check_replica_crashed) assert secondary.log_contains("too many KnownAssignedXids") # Replica is crashed, so ignore stop result diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 411574bd86..c86ba0d4ea 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -836,7 +836,7 @@ def test_sharding_split_stripe_size( assert len(notifications) == 3 assert notifications[2] == expect_after - wait_until(10, 1, assert_restart_notification) + wait_until(assert_restart_notification) # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're @@ -1025,7 +1025,7 @@ def test_sharding_ingest_gaps( assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn # We set a short checkpoint timeout: expect things to get frozen+flushed within that - wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + wait_until(assert_all_disk_consistent, timeout=3 * checkpoint_interval_secs) def assert_all_remote_consistent(): """ @@ -1037,7 +1037,7 @@ def test_sharding_ingest_gaps( assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn # We set a short checkpoint timeout: expect things to get frozen+flushed within that - wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + wait_until(assert_all_remote_consistent, timeout=3 * checkpoint_interval_secs) workload.validate() @@ -1405,14 +1405,14 @@ def test_sharding_split_failures( # e.g. while waiting for a storage controller to re-attach a parent shard if we failed # inside the pageserver and the storage controller responds by detaching children and attaching # parents concurrently (https://github.com/neondatabase/neon/issues/7148) - wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) + wait_until(lambda: workload.churn_rows(10, upload=False, ingest=False)) workload.validate() if failure.fails_forward(env): log.info("Fail-forward failure, checking split eventually completes...") # A failure type which results in eventual completion of the split - wait_until(30, 1, assert_split_done) + wait_until(assert_split_done) elif failure.can_mitigate(): log.info("Mitigating failure...") # Mitigation phase: we expect to be able to proceed with a successful shard split @@ -1420,21 +1420,21 @@ def test_sharding_split_failures( # The split should appear to be rolled back from the point of view of all pageservers # apart from the one that is offline - wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) + wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) finish_split() - wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) + wait_until(lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) # Having cleared the failure, everything should converge to a pristine state failure.clear(env) - wait_until(30, 1, assert_split_done) + wait_until(assert_split_done) else: # Once we restore the faulty pageserver's API to good health, rollback should # eventually complete. log.info("Clearing failure...") failure.clear(env) - wait_until(30, 1, assert_rolled_back) + wait_until(assert_rolled_back) # Having rolled back, the tenant should be working workload.churn_rows(10) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 13bc54a114..e93e251b4f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -154,7 +154,7 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) counts = get_node_shard_counts(env, tenant_ids) assert counts[node_id] == 0 - wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + wait_until(lambda: node_evacuated(env.pageservers[0].id)) # Let all the reconciliations after marking the node offline complete env.storage_controller.reconcile_until_idle() @@ -222,7 +222,7 @@ def test_node_status_after_restart( def is_ready(): assert env.storage_controller.ready() is True - wait_until(30, 1, is_ready) + wait_until(is_ready) # We loaded nodes from database on restart nodes = env.storage_controller.node_list() @@ -606,7 +606,7 @@ def test_storage_controller_compute_hook( counts = get_node_shard_counts(env, [env.initial_tenant]) assert counts[node_id] == 0 - wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + wait_until(lambda: node_evacuated(env.pageservers[0].id)) # Additional notification from migration log.info(f"notifications: {notifications}") @@ -620,7 +620,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 2 assert notifications[1] == expect - wait_until(20, 0.25, received_migration_notification) + wait_until(received_migration_notification) # When we restart, we should re-emit notifications for all tenants env.storage_controller.stop() @@ -630,7 +630,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 3 assert notifications[2] == expect - wait_until(10, 1, received_restart_notification) + wait_until(received_restart_notification) # Splitting a tenant should cause its stripe size to become visible in the compute notification env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) @@ -647,7 +647,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 4 assert notifications[3] == expect - wait_until(10, 1, received_split_notification) + wait_until(received_split_notification) # If the compute hook is unavailable, that should not block creating a tenant and # creating a timeline. This simulates a control plane refusing to accept notifications @@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook( def logged_stuck(): env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG) - wait_until(10, 0.25, logged_stuck) + wait_until(logged_stuck) contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG) assert contains_r is not None # Appease mypy (_, log_cursor) = contains_r @@ -764,7 +764,7 @@ def test_storage_controller_stuck_compute_hook( def logged_stuck_again(): env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) - wait_until(10, 0.25, logged_stuck_again) + wait_until(logged_stuck_again) assert migrate_fut.running() # This time, the compute hook remains stuck, but we mark the origin node offline: this should @@ -865,7 +865,7 @@ def test_storage_controller_compute_hook_revert( assert latest["shards"] is not None assert latest["shards"][0]["node_id"] == ps_id - wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + wait_until(lambda: notified_ps(pageserver_a.id)) env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) @@ -880,7 +880,7 @@ def test_storage_controller_compute_hook_revert( # Although the migration API failed, the hook should still see pageserver B (it remembers what # was posted even when returning an error code) - wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) + wait_until(lambda: notified_ps(pageserver_b.id)) # Although the migration API failed, the tenant should still have moved to the right pageserver assert len(pageserver_b.http_client().tenant_list()) == 1 @@ -898,7 +898,7 @@ def test_storage_controller_compute_hook_revert( def logged_giving_up(): env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") - wait_until(30, 1, logged_giving_up) + wait_until(logged_giving_up) pageserver_a.start() @@ -919,7 +919,7 @@ def test_storage_controller_compute_hook_revert( handle_params["status"] = 200 env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) - wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + wait_until(lambda: notified_ps(pageserver_a.id)) def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): @@ -1453,7 +1453,7 @@ def test_storage_controller_heartbeats( # Check that each node got one tenant assert all(len(ts) == 1 for ts in node_to_tenants.values()) - wait_until(10, 1, tenants_placed) + wait_until(tenants_placed) # ... then we apply the failure offline_node_ids = set(failure.nodes()) @@ -1476,7 +1476,7 @@ def test_storage_controller_heartbeats( assert node["availability"] == "Offline" start = time.time() - wait_until(failure.offline_timeout, 1, nodes_offline) + wait_until(nodes_offline, timeout=failure.offline_timeout) detected_after = time.time() - start log.info(f"Detected node failures after {detected_after}s") @@ -1497,7 +1497,7 @@ def test_storage_controller_heartbeats( assert observed_tenants == set(tenant_ids) - wait_until(10, 1, tenant_migrated) + wait_until(tenant_migrated) # ... then we clear the failure failure.clear(env) @@ -1509,7 +1509,7 @@ def test_storage_controller_heartbeats( if node["id"] in online_node_ids: assert node["availability"] == "Active" - wait_until(10, 1, nodes_online) + wait_until(nodes_online) time.sleep(5) @@ -1562,7 +1562,7 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): # We could pre-empty this by configuring the node to Offline, but it's preferable to test # the realistic path we would take when a node restarts uncleanly. # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local - wait_until(30, 1, failed_over) + wait_until(failed_over) reconciles_before_restart = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -1640,12 +1640,12 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui assert e > n return e - errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + errs = wait_until(lambda: assert_errors_gt(0)) # Try reconciling again, it should fail again with pytest.raises(StorageControllerApiException): env.storage_controller.reconcile_all() - errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + errs = wait_until(lambda: assert_errors_gt(errs)) # Configure the tenant to disable reconciles env.storage_controller.tenant_policy_update( @@ -1674,7 +1674,7 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui return o # We should see a successful reconciliation - wait_until(10, 1, lambda: assert_ok_gt(0)) + wait_until(lambda: assert_ok_gt(0)) # And indeed the tenant should be attached assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 @@ -2073,7 +2073,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P raise Exception(f"Secondary lag not big enough: {lag}") log.info(f"Looking for lag to develop on the secondary {secondary}") - wait_until(10, 1, secondary_is_lagging) + wait_until(secondary_is_lagging) log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") env.storage_controller.retryable_node_operation( @@ -2107,7 +2107,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P if lag > 1 * 1024 * 1024: raise Exception(f"Secondary lag not big enough: {lag}") - wait_until(10, 1, lag_is_acceptable) + wait_until(lag_is_acceptable) env.storage_controller.node_configure(primary, {"scheduling": "Active"}) @@ -2227,7 +2227,7 @@ def test_storage_controller_node_deletion( log.info(f"Shards on nodes other than on victim: {elsewhere}") assert elsewhere == tenant_count * shard_count_per_tenant - wait_until(30, 1, assert_shards_migrated) + wait_until(assert_shards_migrated) log.info(f"Deleting pageserver {victim.id}") env.storage_controller.node_delete(victim.id) @@ -2240,7 +2240,7 @@ def test_storage_controller_node_deletion( log.info(f"Shards on node {victim.id}: {count}") assert count == 0 - wait_until(30, 1, assert_victim_evacuated) + wait_until(assert_victim_evacuated) # The node should be gone from the list API assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] @@ -2569,7 +2569,7 @@ def test_storage_controller_leadership_transfer( == StorageControllerLeadershipStatus.STEPPED_DOWN ) - wait_until(5, 1, previous_stepped_down) + wait_until(previous_stepped_down) storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") @@ -2579,7 +2579,7 @@ def test_storage_controller_leadership_transfer( == StorageControllerLeadershipStatus.LEADER ) - wait_until(15, 1, new_becomes_leader) + wait_until(new_becomes_leader) leader = env.storage_controller.get_leader() assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" @@ -2624,7 +2624,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)")) env.storage_controller.node_drain(attached.id) - wait_until(10, 0.5, attached_is_draining) + wait_until(attached_is_draining) attached.restart() @@ -2646,7 +2646,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) # allow for small delay between actually having cancelled and being able reconfigure again - wait_until(4, 0.5, reconfigure_node_again) + wait_until(reconfigure_node_again) def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder): @@ -2691,7 +2691,7 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder) ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers ) - wait_until(10, 1, has_hit_failpoint) + wait_until(has_hit_failpoint) # Migrate the tenant while the timeline creation is in progress: this migration will complete once it # can detach from the old pageserver, which will happen once the failpoint completes. @@ -2775,7 +2775,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB def has_hit_compaction_failpoint(): assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}") - wait_until(10, 1, has_hit_compaction_failpoint) + wait_until(has_hit_compaction_failpoint) # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep, # after incrementing generation and attaching the new location @@ -2794,7 +2794,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB # before it reaches this point. The timeout is because the AttachedStale transition includes # a flush of remote storage, and if the compaction already enqueued an index upload this cannot # make progress. - wait_until(60, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint, timeout=60) # Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) @@ -2917,7 +2917,7 @@ def test_storage_controller_proxy_during_migration( log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) # This request should be routed to whichever pageserver holds the highest generation tenant_info = env.storage_controller.pageserver_api().tenant_status( @@ -2934,7 +2934,7 @@ def test_storage_controller_proxy_during_migration( # We expect request to land on the origin assert tenant_info["generation"] == 1 - wait_until(10, 1, long_migration_metric_published) + wait_until(long_migration_metric_published) # Eventually migration completes env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) @@ -3113,7 +3113,7 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) env.storage_controller.pageserver_api().timeline_delete( tenant_id=tenant_id, timeline_id=timeline_id @@ -3182,7 +3182,7 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) timeline_id = TimelineId.generate() env.storage_controller.pageserver_api().timeline_create( diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 3991bd7061..b16dc54c24 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -431,8 +431,6 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): # Let the controller reach the failpoint wait_until( - 10, - 1, lambda: env.storage_controller.assert_log_contains( 'failpoint "shard-split-post-remote-sleep": sleeping' ), diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d37eeb1e6e..7d4f66d044 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -56,4 +56,4 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(60, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 1dd46ec3d1..f8f240cfdc 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -234,11 +234,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" env.pageserver.tenant_attach(tenant_id) - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(http_client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(http_client, tenant_id, "Active")) env.config_tenant(tenant_id, {"gc_horizon": "1000000"}) contents_first = config_path.read_text() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 47df3ead70..48e55c1ab1 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -185,21 +185,21 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE deletion = None try: - wait_until(10, 1, has_hit_failpoint) + wait_until(has_hit_failpoint) # it should start ok, sync up with the stuck creation, then hang waiting for the timeline # to shut down. deletion = Thread(target=start_deletion) deletion.start() - wait_until(10, 1, deletion_has_started_waiting_for_timelines) + wait_until(deletion_has_started_waiting_for_timelines) pageserver_http.configure_failpoints((failpoint, "off")) creation.join() deletion.join() - wait_until(10, 1, tenant_is_deleted) + wait_until(tenant_is_deleted) finally: creation.join() if deletion is not None: @@ -264,7 +264,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) def hit_initdb_upload_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(100, 0.1, hit_initdb_upload_failpoint) + wait_until(hit_initdb_upload_failpoint) def creation_connection_timed_out(): env.pageserver.assert_log_contains( @@ -273,7 +273,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) # Wait so that we hit the timeout and the connection is dropped # (But timeline creation still continues) - wait_until(100, 0.1, creation_connection_timed_out) + wait_until(creation_connection_timed_out) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) @@ -281,7 +281,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) def tenant_delete_inner(): ps_http.tenant_delete(tenant_id) - wait_until(100, 0.5, tenant_delete_inner) + wait_until(tenant_delete_inner) Thread(target=tenant_delete).start() @@ -290,7 +290,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" ) - wait_until(100, 0.1, deletion_arrived) + wait_until(deletion_arrived) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8d7ca7bc4e..3f21dc895a 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -212,7 +212,7 @@ def test_tenant_reattach_while_busy( nonlocal updates_started, updates_finished, updates_to_perform # Wait until we have performed some updates - wait_until(20, 0.5, lambda: updates_finished > 500) + wait_until(lambda: updates_finished > 500) log.info("Detaching tenant") pageserver_http.tenant_detach(tenant_id) @@ -512,7 +512,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - wait_until(10, 0.5, found_broken) + wait_until(found_broken) client.tenant_detach(env.initial_tenant) @@ -524,7 +524,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(broken) == 0 and len(broken_set) == 0 - wait_until(10, 0.5, found_cleaned_up) + wait_until(found_cleaned_up) env.pageserver.tenant_attach(env.initial_tenant) @@ -536,4 +536,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(active) == 1 and len(broken_set) == 0 - wait_until(10, 0.5, found_active) + wait_until(found_active) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index bf6120aa0a..df53a98e92 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -298,11 +298,7 @@ def test_tenant_relocation( destination_ps.tenant_attach(tenant_id) # wait for tenant to finish attaching - wait_until( - number_of_iterations=10, - interval=1, - func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(destination_http, tenant_id, "Active")) check_timeline_attached( destination_http, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 8b733da0c6..713f89c60f 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -638,7 +638,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with ThreadPoolExecutor(max_workers=1) as exec: completion = exec.submit(client.tenant_size, env.initial_tenant) _, last_offset = wait_until( - 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) timeline_delete_wait_completed(client, env.initial_tenant, branch_id) @@ -656,8 +656,6 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with ThreadPoolExecutor(max_workers=1) as exec: completion = exec.submit(client.tenant_size, env.initial_tenant) wait_until( - 10, - 1.0, lambda: env.pageserver.assert_log_contains( f"at failpoint {failpoint}", offset=last_offset ), diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 72183f5778..4c26b64d22 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -77,4 +77,4 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): assert tasks_started == tasks_ended assert tasks_panicked is None or int(tasks_panicked) == 0 - wait_until(10, 0.2, assert_tasks_finish) + wait_until(assert_tasks_finish) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 158c3fddb0..d31901b384 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -330,7 +330,7 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): assert len(tenants) == 1 assert all(t["state"]["slug"] != "Attaching" for t in tenants) - wait_until(10, 0.2, not_attaching) + wait_until(not_attaching) tenants = client.tenant_list() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 8d3ddf7e54..6b27c41d1c 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -178,11 +178,7 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): env.pageserver.start() client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) assert ( @@ -257,11 +253,7 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.start() client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) assert ( diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index bc2e048f69..5a1e493bbe 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -227,8 +227,8 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) assert timeline_offloaded_logged(leaf_timeline_id) - wait_until(30, 1, leaf_offloaded) - wait_until(30, 1, parent_offloaded) + wait_until(leaf_offloaded) + wait_until(parent_offloaded) # Offloaded child timelines should still prevent deletion with pytest.raises( @@ -331,7 +331,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) assert timeline_offloaded_api(child_timeline_id) - wait_until(30, 1, child_offloaded) + wait_until(child_offloaded) assert timeline_offloaded_api(child_timeline_id) assert not timeline_offloaded_api(root_timeline_id) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 155709e106..fbece68367 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -21,7 +21,6 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, assert_prefix_not_empty, many_small_layers_tenant_config, - poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, @@ -94,12 +93,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver - # TODO: review whether this wait_until is actually necessary, we do an await() internally - wait_until( - number_of_iterations=3, - interval=0.2, - func=lambda: timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id), - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id) assert not timeline_path.exists() @@ -111,13 +105,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) assert exc.value.status_code == 404 - wait_until( - number_of_iterations=3, - interval=0.2, - func=lambda: timeline_delete_wait_completed( - ps_http, env.initial_tenant, parent_timeline_id - ), - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, parent_timeline_id) # Check that we didn't pick up the timeline again after restart. # See https://github.com/neondatabase/neon/issues/3560 @@ -226,8 +214,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http.configure_failpoints((failpoint, "return")) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - # These failpoints are earlier than background task is spawned. # so they result in api request failure. if failpoint in ( @@ -244,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -257,25 +243,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints( env.pageserver.stop() env.pageserver.start() - wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations) + wait_until_tenant_active(ps_http, env.initial_tenant) if failpoint == "timeline-delete-before-index-deleted-at": # We crashed before persisting this to remote storage, need to retry delete request timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) else: # Pageserver should've resumed deletion after restart. - wait_timeline_detail_404( - ps_http, env.initial_tenant, timeline_id, iterations=iterations - ) + wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state ps_http.configure_failpoints((failpoint, "off")) - timeline_delete_wait_completed( - ps_http, env.initial_tenant, timeline_id, iterations=iterations - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) # Check remote is empty if remote_storage_kind is RemoteStorageKind.MOCK_S3: @@ -378,7 +360,7 @@ def test_timeline_resurrection_on_attach( env.pageserver.tenant_attach(tenant_id=tenant_id) - wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5) + wait_until_tenant_active(ps_http, tenant_id=tenant_id) timelines = ps_http.timeline_list(tenant_id=tenant_id) assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { @@ -439,7 +421,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild # Wait for tenant to finish loading. wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1) - wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id, iterations=4) + wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id) assert ( not leaf_timeline_path.exists() @@ -481,11 +463,10 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) # for some reason the check above doesnt immediately take effect for the below. - # Assume it is mock server incosistency and check twice. + # Assume it is mock server incosistency and check a few times. wait_until( - 2, - 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), + timeout=2, ) # We deleted our only tenant, and the scrubber fails if it detects nothing @@ -544,7 +525,7 @@ def test_concurrent_timeline_delete_stuck_on( f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) - wait_until(50, 0.1, first_call_hit_failpoint) + wait_until(first_call_hit_failpoint, interval=0.1, status_interval=1.0) # make the second call and assert behavior log.info("second call start") @@ -613,7 +594,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def hit_failpoint(): env.pageserver.assert_log_contains(at_failpoint_log_message) - wait_until(50, 0.1, hit_failpoint) + wait_until(hit_failpoint, interval=0.1) # we log this error if a client hangs up # might as well use it as another indicator that the test works @@ -623,7 +604,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def got_hangup_log_message(): env.pageserver.assert_log_contains(hangup_log_message) - wait_until(50, 0.1, got_hangup_log_message) + wait_until(got_hangup_log_message, interval=0.1) # check that the timeline is still present ps_http.timeline_detail(env.initial_tenant, child_timeline_id) @@ -635,10 +616,10 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" env.pageserver.assert_log_contains(message) - wait_until(50, 0.1, first_request_finished) + wait_until(first_request_finished, interval=0.1) # check that the timeline is gone - wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=10) + wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id) def test_timeline_delete_works_for_remote_smoke( @@ -707,7 +688,7 @@ def test_timeline_delete_works_for_remote_smoke( # for some reason the check above doesnt immediately take effect for the below. # Assume it is mock server inconsistency and check twice. - wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + wait_until(lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) # We deleted our only tenant, and the scrubber fails if it detects nothing neon_env_builder.disable_scrub_on_exit() @@ -753,15 +734,13 @@ def test_delete_orphaned_objects( env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}") - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - ps_http.timeline_delete(env.initial_tenant, timeline_id) timeline_info = wait_until_timeline_state( pageserver_http=ps_http, tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -827,8 +806,6 @@ def test_timeline_delete_resumed_on_attach( ) ) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - ps_http.timeline_delete(tenant_id, timeline_id) timeline_info = wait_until_timeline_state( @@ -836,7 +813,7 @@ def test_timeline_delete_resumed_on_attach( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -871,7 +848,7 @@ def test_timeline_delete_resumed_on_attach( env.pageserver.tenant_attach(tenant_id=tenant_id) # delete should be resumed - wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations) + wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id) assert not tenant_path.exists() diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 9c7e851ba8..2c3ee38bae 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -203,7 +203,7 @@ def test_ancestor_detach_branched_from( ) client.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different # as there is always "PREV_LSN: invalid" for "before" @@ -336,10 +336,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # delete the timelines to confirm detach actually worked client.timeline_delete(env.initial_tenant, after) - wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, after) client.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): @@ -973,17 +973,17 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(at_failpoint) delete = pool.submit(start_delete) - offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(lambda: at_waiting_on_gate_close(offset)) victim_http.configure_failpoints((pausepoint, "off")) delete.result() - assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + assert wait_until(is_deleted), f"unimplemented mode {mode}" # TODO: match the error with pytest.raises(PageserverApiException) as exc: @@ -1115,11 +1115,11 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv with ThreadPoolExecutor(max_workers=1) as pool: try: fut = pool.submit(detach_timeline) - wait_until(10, 1.0, paused_at_failpoint) + wait_until(paused_at_failpoint) # let stuck complete stuck_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_completed) + wait_until(first_completed) if mode == "delete_reparentable_timeline": assert first_branch is not None @@ -1127,7 +1127,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv env.initial_tenant, first_branch ) victim_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_branch_gone) + wait_until(first_branch_gone) elif mode == "create_reparentable_timeline": first_branch = create_reparentable_timeline() victim_http.configure_failpoints((pausepoint, "off")) @@ -1271,11 +1271,11 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( with ThreadPoolExecutor(max_workers=1) as pool: try: fut = pool.submit(detach_timeline) - wait_until(10, 1.0, paused_at_failpoint) + wait_until(paused_at_failpoint) # let stuck complete stuck_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_completed) + wait_until(first_completed) victim_http.configure_failpoints((pausepoint, "off")) @@ -1456,7 +1456,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon # other tests take the "detach? reparent complete", but this only hits # "complete". http.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline) http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) @@ -1518,7 +1518,7 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( with ThreadPoolExecutor(max_workers=1) as pool: detach = pool.submit(detach_and_get_stuck) - offset = wait_until(10, 1.0, request_processing_noted_in_log) + offset = wait_until(request_processing_noted_in_log) # make this named fn tor more clear failure test output logging def pausepoint_hit_with_gc_paused() -> LogCursor: @@ -1529,11 +1529,11 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( ) return at - offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + offset = wait_until(pausepoint_hit_with_gc_paused) delete_detached() - wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + wait_timeline_detail_404(http, env.initial_tenant, detached) http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 5a5ca3290a..7605e1f758 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -61,7 +61,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool # deletion unblocks gc http.timeline_delete(env.initial_tenant, foo_branch) - wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) + wait_timeline_detail_404(http, env.initial_tenant, foo_branch) wait_for_another_gc_round() pss.assert_log_contains(gc_active_line) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 4528bc6180..95bf9106cd 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -396,11 +396,7 @@ def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): # Wait for the tenant to be loaded client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, env.initial_tenant, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, env.initial_tenant, "Active")) assert_physical_size_invariants( get_physical_size_values(env, env.initial_tenant, new_timeline_id), @@ -433,7 +429,7 @@ def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder get_physical_size_values(env, env.initial_tenant, new_timeline_id), ) - wait_until(10, 1, check) + wait_until(check) def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): @@ -721,7 +717,7 @@ def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int def condition(): assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count - wait_until(5, 1.0, condition) + wait_until(condition) def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): @@ -768,7 +764,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert "Active" in set(get_tenant_states().values()) # One tenant should activate, then get stuck in their logical size calculation - wait_until(10, 1, at_least_one_active) + wait_until(at_least_one_active) # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate time.sleep(5) @@ -836,13 +832,13 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): def all_active(): assert all(s == "Active" for s in get_tenant_states().values()) - wait_until(10, 1, all_active) + wait_until(all_active) # Final control check: restarting with no failpoints at all results in all tenants coming active # without being prompted by client I/O env.pageserver.stop() env.pageserver.start() - wait_until(10, 1, all_active) + wait_until(all_active) assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants @@ -856,7 +852,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} ) - wait_until(10, 1, at_least_one_active) + wait_until(at_least_one_active) detach_tenant_id = list( [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] @@ -881,7 +877,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one # we detached) - wait_until(10, 1, all_active) + wait_until(all_active) assert len(get_tenant_states()) == n_tenants - 2 @@ -908,7 +904,7 @@ def delete_lazy_activating( try: # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then # hang because of our failpoint blocking activation. - wait_until(10, 1, shutting_down) + wait_until(shutting_down) finally: log.info("Clearing failpoint") pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) @@ -1030,13 +1026,13 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): log.info(f"{states}") assert len(states["Active"]) == 1 - wait_until(10, 1, one_is_active) + wait_until(one_is_active) def other_is_attaching(): states = get_tenant_states() assert len(states["Attaching"]) == 1 - wait_until(10, 1, other_is_attaching) + wait_until(other_is_attaching) def eager_tenant_is_active(): resp = client.tenant_status(eager_tenant) @@ -1053,7 +1049,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): }, lazy=False, ) - wait_until(10, 1, eager_tenant_is_active) + wait_until(eager_tenant_is_active) other_is_attaching() @@ -1096,7 +1092,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met resp = client.tenant_status(env.initial_tenant) assert resp["state"]["slug"] == "Active" - wait_until(10, 1, initial_tenant_is_active) + wait_until(initial_tenant_is_active) # even though the initial tenant is now active, because it was startup time # attach, it will consume the only permit because logical size calculation @@ -1119,7 +1115,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met assert resp["state"]["slug"] == "Attaching" # paused logical size calculation of env.initial_tenant is keeping it attaching - wait_until(10, 1, lazy_tenant_is_attaching) + wait_until(lazy_tenant_is_attaching) for _ in range(5): lazy_tenant_is_attaching() @@ -1132,10 +1128,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met if activation_method == "endpoint": with env.endpoints.create_start("main", tenant_id=lazy_tenant): # starting up the endpoint should make it jump the queue - wait_until(10, 1, lazy_tenant_is_active) + wait_until(lazy_tenant_is_active) elif activation_method == "branch": env.create_timeline("second_branch", lazy_tenant) - wait_until(10, 1, lazy_tenant_is_active) + wait_until(lazy_tenant_is_active) elif activation_method == "delete": delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8fa33b81a9..23d4f23cdb 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2136,7 +2136,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # Check that on source no segment files are present assert src_sk.list_segments(tenant_id, timeline_id) == [] - wait_until(60, 1, evicted_on_source) + wait_until(evicted_on_source, timeout=60) # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk, # destination should import the control file only & go into evicted mode immediately @@ -2155,7 +2155,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # This should be fast, it is a wait_until because eviction state is updated # in the background wrt pull_timeline. - wait_until(10, 0.1, evicted_on_destination) + wait_until(evicted_on_destination, timeout=1.0, interval=0.1) # Delete the timeline on the source, to prove that deletion works on an # evicted timeline _and_ that the final compute test is really not using @@ -2178,7 +2178,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines") assert n_evicted == 0 - wait_until(10, 1, unevicted_on_dest) + wait_until(unevicted_on_dest, interval=0.1, timeout=1.0) # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries @@ -2606,10 +2606,10 @@ def test_s3_eviction( assert n_evicted # make mypy happy assert int(n_evicted) == n_timelines - wait_until(60, 0.5, all_evicted) + wait_until(all_evicted, timeout=30) # restart should preserve the metric value sk.stop().start() - wait_until(60, 0.5, all_evicted) + wait_until(all_evicted) # and endpoint start should reduce is endpoints[0].start() @@ -2618,7 +2618,7 @@ def test_s3_eviction( assert n_evicted # make mypy happy assert int(n_evicted) < n_timelines - wait_until(60, 0.5, one_unevicted) + wait_until(one_unevicted) # Test resetting uploaded partial segment state. @@ -2666,7 +2666,7 @@ def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder): if isinstance(eviction_state, str) and eviction_state == "Present": raise Exception("eviction didn't happen yet") - wait_until(30, 1, evicted) + wait_until(evicted) # it must have uploaded something uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) log.info(f"uploaded segments before reset: {uploaded_segs}") @@ -2763,7 +2763,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde raise Exception("Partial segment not uploaded yet") - source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) + source_partial_segment = wait_until(source_partial_segment_uploaded) log.info( f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" ) @@ -2787,7 +2787,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde if evictions is None or evictions == 0: raise Exception("Eviction did not happen on source safekeeper yet") - wait_until(30, 1, evicted) + wait_until(evicted) endpoint.start(safekeepers=[2, 3]) @@ -2804,7 +2804,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde ) endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") - wait_until(15, 1, new_partial_segment_uploaded) + wait_until(new_partial_segment_uploaded) log.info( f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" @@ -2833,4 +2833,4 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde if unevictions is None or unevictions == 0: raise Exception("Uneviction did not happen on source safekeeper yet") - wait_until(10, 1, unevicted) + wait_until(unevicted) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 294f86ffa7..d22a900c59 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -97,7 +97,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil str(safekeeper.id) in exception_string ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" - wait_until(60, 0.5, all_sks_in_wareceiver_state) + wait_until(all_sks_in_wareceiver_state, timeout=30) stopped_safekeeper = env.safekeepers[-1] stopped_safekeeper_id = stopped_safekeeper.id @@ -124,7 +124,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil str(safekeeper.id) in exception_string ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" - wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state) + wait_until(all_but_stopped_sks_in_wareceiver_state, timeout=30) def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): From 9ce0dd4e5511573487120cac763d22633a07d40a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 2 Dec 2024 11:50:22 +0000 Subject: [PATCH 24/65] storcon: add metric for AZ scheduling violations (#9949) ## Problem We can't easily tell how far the state of shards is from their AZ preferences. This can be a cause of performance issues, so it's important for diagnosability that we can tell easily if there are significant numbers of shards that aren't running in their preferred AZ. Related: https://github.com/neondatabase/cloud/issues/15413 ## Summary of changes - In reconcile_all, count shards that are scheduled into the wrong AZ (if they have a preference), and publish it as a prometheus gauge. - Also calculate a statistic for how many shards wanted to reconcile but couldn't. This is clearly a lazy calculation: reconcile all only runs periodically. But that's okay: shards in the wrong AZ is something that only matters if it stays that way for some period of time. --- storage_controller/src/metrics.rs | 6 ++++++ storage_controller/src/service.rs | 32 +++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index a1f7bc2457..6d5885eba6 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -50,6 +50,12 @@ pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we make an optimization change to a tenant's scheduling pub(crate) storage_controller_schedule_optimization: measured::Counter, + /// How many shards are not scheduled into their preferred AZ + pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + + /// How many shards would like to reconcile but were blocked by concurrency limits + pub(crate) storage_controller_pending_reconciles: measured::Gauge, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 636ccf11a1..631fdb4923 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6016,14 +6016,33 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); + // This function is an efficient place to update lazy statistics, since we are walking + // all tenants. + let mut pending_reconciles = 0; + let mut az_violations = 0; + let mut reconciles_spawned = 0; for shard in tenants.values_mut() { + // Accumulate scheduling statistics + if let (Some(attached), Some(preferred)) = + (shard.intent.get_attached(), shard.preferred_az()) + { + let node_az = nodes + .get(attached) + .expect("Nodes exist if referenced") + .get_availability_zone_id(); + if node_az != preferred { + az_violations += 1; + } + } + // Skip checking if this shard is already enqueued for reconciliation if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { // If there is something delayed, then return a nonzero count so that // callers like reconcile_all_now do not incorrectly get the impression // that the system is in a quiescent state. reconciles_spawned = std::cmp::max(1, reconciles_spawned); + pending_reconciles += 1; continue; } @@ -6031,9 +6050,22 @@ impl Service { // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; + } else if shard.delayed_reconcile { + // Shard wanted to reconcile but for some reason couldn't. + pending_reconciles += 1; } } + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_az_violation + .set(az_violations as i64); + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pending_reconciles + .set(pending_reconciles as i64); + reconciles_spawned } From a750c14735502019a91055d46351d417ccd2bec1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 2 Dec 2024 12:29:57 +0000 Subject: [PATCH 25/65] fix(proxy): forward notifications from authentication (#9948) Fixes https://github.com/neondatabase/cloud/issues/20973. This refactors `connect_raw` in order to return direct access to the delayed notices. I cannot find a way to test this with psycopg2 unfortunately, although testing it with psql does return the expected results. --- libs/pq_proto/src/lib.rs | 6 +++ .../postgres-protocol2/src/message/backend.rs | 4 ++ .../proxy/tokio-postgres2/src/cancel_token.rs | 8 ++-- libs/proxy/tokio-postgres2/src/client.rs | 13 +++--- libs/proxy/tokio-postgres2/src/config.rs | 6 +-- libs/proxy/tokio-postgres2/src/connect.rs | 42 +++++++++++++++---- libs/proxy/tokio-postgres2/src/connect_raw.rs | 37 +++++++++------- libs/proxy/tokio-postgres2/src/lib.rs | 5 ++- proxy/src/compute.rs | 38 ++++++++++++----- proxy/src/proxy/mod.rs | 8 ++-- proxy/src/proxy/tests/mod.rs | 8 ++-- 11 files changed, 117 insertions(+), 58 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 4b0331999d..43dfbc22a4 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -565,6 +565,8 @@ pub enum BeMessage<'a> { /// Batch of interpreted, shard filtered WAL records, /// ready for the pageserver to ingest InterpretedWalRecords(InterpretedWalRecordsBody<'a>), + + Raw(u8, &'a [u8]), } /// Common shorthands. @@ -754,6 +756,10 @@ impl BeMessage<'_> { /// one more buffer. pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> { match message { + BeMessage::Raw(code, data) => { + buf.put_u8(*code); + write_body(buf, |b| b.put_slice(data)) + } BeMessage::AuthenticationOk => { buf.put_u8(b'R'); write_body(buf, |buf| { diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 356d142f3f..33d77fc252 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -541,6 +541,10 @@ impl NoticeResponseBody { pub fn fields(&self) -> ErrorFields<'_> { ErrorFields { buf: &self.storage } } + + pub fn as_bytes(&self) -> &[u8] { + &self.storage + } } pub struct NotificationResponseBody { diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index b949bf358f..a10e8bf5c3 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -10,10 +10,10 @@ use tokio::net::TcpStream; /// connection. #[derive(Clone)] pub struct CancelToken { - pub(crate) socket_config: Option, - pub(crate) ssl_mode: SslMode, - pub(crate) process_id: i32, - pub(crate) secret_key: i32, + pub socket_config: Option, + pub ssl_mode: SslMode, + pub process_id: i32, + pub secret_key: i32, } impl CancelToken { diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 96200b71e7..a7cd53afc3 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -138,7 +138,7 @@ impl InnerClient { } #[derive(Clone)] -pub(crate) struct SocketConfig { +pub struct SocketConfig { pub host: Host, pub port: u16, pub connect_timeout: Option, @@ -152,7 +152,7 @@ pub(crate) struct SocketConfig { pub struct Client { inner: Arc, - socket_config: Option, + socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, @@ -161,6 +161,7 @@ pub struct Client { impl Client { pub(crate) fn new( sender: mpsc::UnboundedSender, + socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, @@ -172,7 +173,7 @@ impl Client { buffer: Default::default(), }), - socket_config: None, + socket_config, ssl_mode, process_id, secret_key, @@ -188,10 +189,6 @@ impl Client { &self.inner } - pub(crate) fn set_socket_config(&mut self, socket_config: SocketConfig) { - self.socket_config = Some(socket_config); - } - /// Creates a new prepared statement. /// /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), @@ -412,7 +409,7 @@ impl Client { /// connection associated with this client. pub fn cancel_token(&self) -> CancelToken { CancelToken { - socket_config: self.socket_config.clone(), + socket_config: Some(self.socket_config.clone()), ssl_mode: self.ssl_mode, process_id: self.process_id, secret_key: self.secret_key, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 969c20ba47..26124b38ef 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -2,6 +2,7 @@ use crate::connect::connect; use crate::connect_raw::connect_raw; +use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; @@ -485,14 +486,11 @@ impl Config { connect(tls, self).await } - /// Connects to a PostgreSQL database over an arbitrary stream. - /// - /// All of the settings other than `user`, `password`, `dbname`, `options`, and `application_name` name are ignored. pub async fn connect_raw( &self, stream: S, tls: T, - ) -> Result<(Client, Connection), Error> + ) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 7517fe0cde..98067d91f9 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,13 +1,16 @@ use crate::client::SocketConfig; +use crate::codec::BackendMessage; use crate::config::{Host, TargetSessionAttrs}; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; -use crate::{Client, Config, Connection, Error, SimpleQueryMessage}; +use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage}; use futures_util::{future, pin_mut, Future, FutureExt, Stream}; +use postgres_protocol2::message::backend::Message; use std::io; use std::task::Poll; use tokio::net::TcpStream; +use tokio::sync::mpsc; pub async fn connect( mut tls: T, @@ -60,7 +63,36 @@ where T: TlsConnect, { let socket = connect_socket(host, port, config.connect_timeout).await?; - let (mut client, mut connection) = connect_raw(socket, tls, config).await?; + let RawConnection { + stream, + parameters, + delayed_notice, + process_id, + secret_key, + } = connect_raw(socket, tls, config).await?; + + let socket_config = SocketConfig { + host: host.clone(), + port, + connect_timeout: config.connect_timeout, + }; + + let (sender, receiver) = mpsc::unbounded_channel(); + let client = Client::new( + sender, + socket_config, + config.ssl_mode, + process_id, + secret_key, + ); + + // delayed notices are always sent as "Async" messages. + let delayed = delayed_notice + .into_iter() + .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) + .collect(); + + let mut connection = Connection::new(stream, delayed, parameters, receiver); if let TargetSessionAttrs::ReadWrite = config.target_session_attrs { let rows = client.simple_query_raw("SHOW transaction_read_only"); @@ -102,11 +134,5 @@ where } } - client.set_socket_config(SocketConfig { - host: host.clone(), - port, - connect_timeout: config.connect_timeout, - }); - Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 80677af969..9c6f1a2552 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -3,27 +3,26 @@ use crate::config::{self, AuthKeys, Config, ReplicationMode}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; -use crate::{Client, Connection, Error}; +use crate::Error; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; use postgres_protocol2::authentication; use postgres_protocol2::authentication::sasl; use postgres_protocol2::authentication::sasl::ScramSha256; -use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message}; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; use postgres_protocol2::message::frontend; -use std::collections::{HashMap, VecDeque}; +use std::collections::HashMap; use std::io; use std::pin::Pin; use std::task::{Context, Poll}; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::sync::mpsc; use tokio_util::codec::Framed; pub struct StartupStream { inner: Framed, PostgresCodec>, buf: BackendMessages, - delayed: VecDeque, + delayed_notice: Vec, } impl Sink for StartupStream @@ -78,11 +77,19 @@ where } } +pub struct RawConnection { + pub stream: Framed, PostgresCodec>, + pub parameters: HashMap, + pub delayed_notice: Vec, + pub process_id: i32, + pub secret_key: i32, +} + pub async fn connect_raw( stream: S, tls: T, config: &Config, -) -> Result<(Client, Connection), Error> +) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, @@ -97,18 +104,20 @@ where }, ), buf: BackendMessages::empty(), - delayed: VecDeque::new(), + delayed_notice: Vec::new(), }; startup(&mut stream, config).await?; authenticate(&mut stream, config).await?; let (process_id, secret_key, parameters) = read_info(&mut stream).await?; - let (sender, receiver) = mpsc::unbounded_channel(); - let client = Client::new(sender, config.ssl_mode, process_id, secret_key); - let connection = Connection::new(stream.inner, stream.delayed, parameters, receiver); - - Ok((client, connection)) + Ok(RawConnection { + stream: stream.inner, + parameters, + delayed_notice: stream.delayed_notice, + process_id, + secret_key, + }) } async fn startup(stream: &mut StartupStream, config: &Config) -> Result<(), Error> @@ -347,9 +356,7 @@ where body.value().map_err(Error::parse)?.to_string(), ); } - Some(msg @ Message::NoticeResponse(_)) => { - stream.delayed.push_back(BackendMessage::Async(msg)) - } + Some(Message::NoticeResponse(body)) => stream.delayed_notice.push(body), Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)), Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 72ba8172b2..57c639a7de 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,9 +1,10 @@ //! An asynchronous, pipelined, PostgreSQL client. -#![warn(rust_2018_idioms, clippy::all, missing_docs)] +#![warn(rust_2018_idioms, clippy::all)] pub use crate::cancel_token::CancelToken; -pub use crate::client::Client; +pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; +pub use crate::connect_raw::RawConnection; pub use crate::connection::Connection; use crate::error::DbError; pub use crate::error::Error; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 2abe88ac88..b689b97a21 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,6 +6,7 @@ use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; +use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; use rustls::crypto::ring; @@ -13,6 +14,7 @@ use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres::{CancelToken, RawConnection}; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -277,6 +279,8 @@ pub(crate) struct PostgresConnection { pub(crate) cancel_closure: CancelClosure, /// Labels for proxy's metrics. pub(crate) aux: MetricsAuxInfo, + /// Notices received from compute after authenticating + pub(crate) delayed_notice: Vec, _guage: NumDbConnectionsGuard<'static>, } @@ -322,10 +326,19 @@ impl ConnCfg { // connect_raw() will not use TLS if sslmode is "disable" let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = self.0.connect_raw(stream, tls).await?; + let connection = self.0.connect_raw(stream, tls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); - let stream = connection.stream.into_inner(); + + let RawConnection { + stream, + parameters, + delayed_notice, + process_id, + secret_key, + } = connection; + + tracing::Span::current().record("pid", tracing::field::display(process_id)); + let stream = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( @@ -334,18 +347,23 @@ impl ConnCfg { self.0.get_ssl_mode() ); - // This is very ugly but as of now there's no better way to - // extract the connection parameters from tokio-postgres' connection. - // TODO: solve this problem in a more elegant manner (e.g. the new library). - let params = connection.parameters; - // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]); + let cancel_closure = CancelClosure::new( + socket_addr, + CancelToken { + socket_config: None, + ssl_mode: self.0.get_ssl_mode(), + process_id, + secret_key, + }, + vec![], + ); let connection = PostgresConnection { stream, - params, + params: parameters, + delayed_notice, cancel_closure, aux, _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 956036d29d..af97fb3d71 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -384,11 +384,13 @@ pub(crate) async fn prepare_client_connection

( // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); + // Forward all deferred notices to the client. + for notice in &node.delayed_notice { + stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?; + } + // Forward all postgres connection params to the client. - // Right now the implementation is very hacky and inefficent (ideally, - // we don't need an intermediate hashmap), but at least it should be correct. for (name, value) in &node.params { - // TODO: Theoretically, this could result in a big pile of params... stream.write_message_noflush(&Be::ParameterStatus { name: name.as_bytes(), value: value.as_bytes(), diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 2c2c2964b6..15be6c9724 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,7 +296,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .channel_binding(tokio_postgres::config::ChannelBinding::Require) .user("user") .dbname("db") @@ -320,7 +320,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .channel_binding(tokio_postgres::config::ChannelBinding::Disable) .user("user") .dbname("db") From 4f6c5949739d7d85a0f3535494a68a345b9e7b6d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 2 Dec 2024 12:46:07 +0000 Subject: [PATCH 26/65] CI(replication-tests): fix notifications about replication-tests failures (#9950) ## Problem `if: ${{ github.event.schedule }}` gets skipped if a previous step has failed, but we want to run the step for both `success` and `failure` ## Summary of changes - Add `!cancelled()` to notification step if-condition, to skip only cancelled jobs --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ea8fee80c2..7621d72f64 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -249,7 +249,7 @@ jobs: # Post both success and failure to the Slack channel - name: Post to a Slack channel - if: ${{ github.event.schedule }} + if: ${{ github.event.schedule && !cancelled() }} uses: slackapi/slack-github-action@v1 with: channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream From b6f93dcec94ec6481ef4a427c5ca1b349216967f Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 2 Dec 2024 16:38:12 +0100 Subject: [PATCH 27/65] proxy: Create Elasticache credentials provider lazily (#9967) ## Problem The credentials providers tries to connect to AWS STS even when we use plain Redis connections. ## Summary of changes * Construct the CredentialsProvider only when needed ("irsa"). --- proxy/src/bin/proxy.rs | 49 +++++--------------------------- proxy/src/redis/elasticache.rs | 51 ++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index b772a987ee..c929b97d78 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -3,14 +3,6 @@ use std::pin::pin; use std::sync::Arc; use anyhow::bail; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::meta::region::RegionProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::provider_config::ProviderConfig; -use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; -use aws_config::Region; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; @@ -314,39 +306,7 @@ async fn main() -> anyhow::Result<()> { }; info!("Using region: {}", args.aws_region); - let region_provider = - RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); - let provider_conf = - ProviderConfig::without_region().with_region(region_provider.region().await); - let aws_credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; - let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( - elasticache::AWSIRSAConfig::new( - args.aws_region.clone(), - args.redis_cluster_name, - args.redis_user_id, - ), - aws_credentials_provider, - )); + // TODO: untangle the config args let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { ("plain", redis_url) => match redis_url { None => { @@ -361,7 +321,12 @@ async fn main() -> anyhow::Result<()> { ConnectionWithCredentialsProvider::new_with_credentials_provider( host.to_string(), port, - elasticache_credentials_provider.clone(), + elasticache::CredentialsProvider::new( + args.aws_region, + args.redis_cluster_name, + args.redis_user_id, + ) + .await, ), ), (None, None) => { diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index d118c8f412..bf6dde9332 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -1,6 +1,14 @@ +use std::sync::Arc; use std::time::{Duration, SystemTime}; +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, @@ -45,12 +53,45 @@ pub struct CredentialsProvider { } impl CredentialsProvider { - pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self { - CredentialsProvider { - config, - credentials_provider, - } + pub async fn new( + aws_region: String, + redis_cluster_name: Option, + redis_user_id: Option, + ) -> Arc { + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(aws_region.clone())); + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try( + "env", + EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + Arc::new(CredentialsProvider { + config: AWSIRSAConfig::new(aws_region, redis_cluster_name, redis_user_id), + credentials_provider: aws_credentials_provider, + }) } + pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { let aws_credentials = self .credentials_provider From 84687b743d2323cb289321c2cc290798fe71b7b5 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 2 Dec 2024 19:10:44 +0300 Subject: [PATCH 28/65] Update consensus protocol spec (#9607) The spec was written for the buggy protocol which we had before the one more similar to Raft was implemented. Update the spec with what we currently have. ref https://github.com/neondatabase/neon/issues/8699 --- safekeeper/spec/.gitignore | 3 + safekeeper/spec/MCProposerAcceptorStatic.tla | 31 + safekeeper/spec/ProposerAcceptorConsensus.cfg | 34 - safekeeper/spec/ProposerAcceptorConsensus.tla | 363 ---- safekeeper/spec/ProposerAcceptorStatic.tla | 449 +++++ safekeeper/spec/modelcheck.sh | 49 + .../MCProposerAcceptorStatic_p2_a3_t2_l2.cfg | 19 + .../MCProposerAcceptorStatic_p2_a3_t3_l2.cfg | 19 + .../MCProposerAcceptorStatic_p2_a3_t3_l3.cfg | 17 + .../MCProposerAcceptorStatic_p2_a3_t4_l4.cfg | 17 + .../MCProposerAcceptorStatic_p2_a5_t2_l2.cfg | 16 + .../MCProposerAcceptorStatic_p2_a5_t3_l3.cfg | 16 + .../MCProposerAcceptorStatic_p2_a5_t4_l3.cfg | 16 + safekeeper/spec/readme.md | 12 + ...c_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log | 63 + ...c_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log | 69 + ...c_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log | 72 + ...c_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log | 1466 +++++++++++++++++ ...c_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log | 1374 +++++++++++++++ ...c_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log | 89 + 20 files changed, 3797 insertions(+), 397 deletions(-) create mode 100644 safekeeper/spec/.gitignore create mode 100644 safekeeper/spec/MCProposerAcceptorStatic.tla delete mode 100644 safekeeper/spec/ProposerAcceptorConsensus.cfg delete mode 100644 safekeeper/spec/ProposerAcceptorConsensus.tla create mode 100644 safekeeper/spec/ProposerAcceptorStatic.tla create mode 100755 safekeeper/spec/modelcheck.sh create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg create mode 100644 safekeeper/spec/readme.md create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log diff --git a/safekeeper/spec/.gitignore b/safekeeper/spec/.gitignore new file mode 100644 index 0000000000..7233153039 --- /dev/null +++ b/safekeeper/spec/.gitignore @@ -0,0 +1,3 @@ +*TTrace* +*.toolbox/ +states/ diff --git a/safekeeper/spec/MCProposerAcceptorStatic.tla b/safekeeper/spec/MCProposerAcceptorStatic.tla new file mode 100644 index 0000000000..be3d99c697 --- /dev/null +++ b/safekeeper/spec/MCProposerAcceptorStatic.tla @@ -0,0 +1,31 @@ +---- MODULE MCProposerAcceptorStatic ---- +EXTENDS TLC, ProposerAcceptorStatic + +\* Augments the spec with model checking constraints. + +\* For model checking. +CONSTANTS + max_entries, \* model constraint: max log entries acceptor/proposer can hold + max_term \* model constraint: max allowed term + +ASSUME max_entries \in Nat /\ max_term \in Nat + +\* Model space constraint. +StateConstraint == \A p \in proposers: + /\ prop_state[p].term <= max_term + /\ Len(prop_state[p].wal) <= max_entries +\* Sets of proposers and acceptors are symmetric because we don't take any +\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN +\* ...) +ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors) + +\* enforce order of the vars in the error trace with ALIAS +\* Note that ALIAS is supported only since version 1.8.0 which is pre-release +\* as of writing this. +Alias == [ + prop_state |-> prop_state, + acc_state |-> acc_state, + committed |-> committed + ] + +==== diff --git a/safekeeper/spec/ProposerAcceptorConsensus.cfg b/safekeeper/spec/ProposerAcceptorConsensus.cfg deleted file mode 100644 index 989c86e47d..0000000000 --- a/safekeeper/spec/ProposerAcceptorConsensus.cfg +++ /dev/null @@ -1,34 +0,0 @@ -\* MV CONSTANT declarations -CONSTANT NULL = NULL -CONSTANTS -p1 = p1 -p2 = p2 -p3 = p3 -a1 = a1 -a2 = a2 -a3 = a3 -\* MV CONSTANT definitions -CONSTANT -proposers = {p1, p2} -acceptors = {a1, a2, a3} -\* SYMMETRY definition -SYMMETRY perms -\* CONSTANT definitions -CONSTANT -max_term = 3 -CONSTANT -max_entries = 3 -\* INIT definition -INIT -Init -\* NEXT definition -NEXT -Next -\* INVARIANT definition -INVARIANT -TypeOk -ElectionSafety -LogIsMonotonic -LogSafety -CommittedNotOverwritten -CHECK_DEADLOCK FALSE \ No newline at end of file diff --git a/safekeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla deleted file mode 100644 index e5f0bb270f..0000000000 --- a/safekeeper/spec/ProposerAcceptorConsensus.tla +++ /dev/null @@ -1,363 +0,0 @@ ----- MODULE ProposerAcceptorConsensus ---- - -\* Differences from current implementation: -\* - unified not-globally-unique epoch & term (node_id) -\* Simplifications: -\* - instant message delivery -\* - feedback is not modeled separately, commit_lsn is updated directly - -EXTENDS Integers, Sequences, FiniteSets, TLC - -VARIABLES - prop_state, \* prop_state[p] is state of proposer p - acc_state, \* acc_state[a] is state of acceptor a - commit_lsns \* map of acceptor -> commit_lsn - -CONSTANT - acceptors, - proposers, - max_entries, \* model constraint: max log entries acceptor/proposer can hold - max_term \* model constraint: max allowed term - -CONSTANT NULL - -ASSUME max_entries \in Nat /\ max_term \in Nat - -\* For specifying symmetry set in manual cfg file, see -\* https://github.com/tlaplus/tlaplus/issues/404 -perms == Permutations(proposers) \union Permutations(acceptors) - -\******************************************************************************** -\* Helpers -\******************************************************************************** - -Maximum(S) == - (*************************************************************************) - (* If S is a set of numbers, then this define Maximum(S) to be the *) - (* maximum of those numbers, or -1 if S is empty. *) - (*************************************************************************) - IF S = {} THEN -1 - ELSE CHOOSE n \in S : \A m \in S : n \geq m - -\* minimum of numbers in the set, error if set is empty -Minimum(S) == - CHOOSE min \in S : \A n \in S : min <= n - -\* Min of two numbers -Min(a, b) == IF a < b THEN a ELSE b - -\* Set of values of function f. XXX is there a such builtin? -FValues(f) == {f[a] : a \in DOMAIN f} - -\* Sort of 0 for functions -EmptyF == [x \in {} |-> 42] -IsEmptyF(f) == DOMAIN f = {} - -\* Next entry proposer p will push to acceptor a or NULL. -NextEntry(p, a) == - IF Len(prop_state[p].wal) >= prop_state[p].next_send_lsn[a] THEN - CHOOSE r \in FValues(prop_state[p].wal) : r.lsn = prop_state[p].next_send_lsn[a] - ELSE - NULL - - -\***************** - -NumAccs == Cardinality(acceptors) - -\* does acc_set form the quorum? -Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) -\* all quorums of acceptors -Quorums == {subset \in SUBSET acceptors: Quorum(subset)} - -\* flush_lsn of acceptor a. -FlushLsn(a) == Len(acc_state[a].wal) - - -\******************************************************************************** -\* Type assertion -\******************************************************************************** -\* Defining sets of all possible tuples and using them in TypeOk in usual -\* all-tuples constructor is not practical because such definitions force -\* TLC to enumerate them, while they are are horribly enormous -\* (TLC screams "Attempted to construct a set with too many elements"). -\* So instead check types manually. -TypeOk == - /\ \A p \in proposers: - /\ DOMAIN prop_state[p] = {"state", "term", "votes", "donor_epoch", "vcl", "wal", "next_send_lsn"} - \* in campaign proposer sends RequestVote and waits for acks; - \* in leader he is elected - /\ prop_state[p].state \in {"campaign", "leader"} - \* 0..max_term should be actually Nat in the unbounded model, but TLC won't - \* swallow it - /\ prop_state[p].term \in 0..max_term - \* votes received - /\ \A voter \in DOMAIN prop_state[p].votes: - /\ voter \in acceptors - /\ prop_state[p].votes[voter] \in [epoch: 0..max_term, flush_lsn: 0..max_entries] - /\ prop_state[p].donor_epoch \in 0..max_term - \* wal is sequence of just records - /\ \A i \in DOMAIN prop_state[p].wal: - prop_state[p].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] - \* Following implementation, we skew the original Aurora meaning of this; - \* here it is lsn of highest definitely committed record as set by proposer - \* when it is elected; it doesn't change since then - /\ prop_state[p].vcl \in 0..max_entries - \* map of acceptor -> next lsn to send - /\ \A a \in DOMAIN prop_state[p].next_send_lsn: - /\ a \in acceptors - /\ prop_state[p].next_send_lsn[a] \in 1..(max_entries + 1) - /\ \A a \in acceptors: - /\ DOMAIN acc_state[a] = {"term", "epoch", "wal"} - /\ acc_state[a].term \in 0..max_term - /\ acc_state[a].epoch \in 0..max_term - /\ \A i \in DOMAIN acc_state[a].wal: - acc_state[a].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] - /\ \A a \in DOMAIN commit_lsns: - /\ a \in acceptors - /\ commit_lsns[a] \in 0..max_entries - -\******************************************************************************** -\* Initial -\******************************************************************************** - -Init == - /\ prop_state = [p \in proposers |-> [ - state |-> "campaign", - term |-> 1, - votes |-> EmptyF, - donor_epoch |-> 0, - vcl |-> 0, - wal |-> << >>, - next_send_lsn |-> EmptyF - ]] - /\ acc_state = [a \in acceptors |-> [ - \* there will be no leader in this term, 1 is the first real - term |-> 0, - epoch |-> 0, - wal |-> << >> - ]] - /\ commit_lsns = [a \in acceptors |-> 0] - - -\******************************************************************************** -\* Actions -\******************************************************************************** - -\* Proposer loses all state. -\* For simplicity (and to reduct state space), we assume it immediately gets -\* current state from quorum q of acceptors determining the term he will request -\* to vote for. -RestartProposer(p, q) == - /\ Quorum(q) - /\ LET - new_term == Maximum({acc_state[a].term : a \in q}) + 1 - IN - /\ new_term <= max_term - /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", - ![p].term = new_term, - ![p].votes = EmptyF, - ![p].donor_epoch = 0, - ![p].vcl = 0, - ![p].wal = << >>, - ![p].next_send_lsn = EmptyF] - /\ UNCHANGED <> - -\* Acceptor a immediately votes for proposer p. -Vote(p, a) == - /\ prop_state[p].state = "campaign" - /\ acc_state[a].term < prop_state[p].term \* main voting condition - /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] - /\ LET - vote == [epoch |-> acc_state[a].epoch, flush_lsn |-> FlushLsn(a)] - IN - prop_state' = [prop_state EXCEPT ![p].votes = prop_state[p].votes @@ (a :> vote)] - /\ UNCHANGED <> - - -\* Proposer p gets elected. -BecomeLeader(p) == - /\ prop_state[p].state = "campaign" - /\ Quorum(DOMAIN prop_state[p].votes) - /\ LET - max_epoch == Maximum({v.epoch : v \in FValues(prop_state[p].votes)}) - max_epoch_votes == {v \in FValues(prop_state[p].votes) : v.epoch = max_epoch} - donor == CHOOSE dv \in DOMAIN prop_state[p].votes : - /\ prop_state[p].votes[dv].epoch = max_epoch - /\ \A v \in max_epoch_votes: - prop_state[p].votes[dv].flush_lsn >= v.flush_lsn - max_vote == prop_state[p].votes[donor] - \* Establish lsn to stream from for voters. - \* At some point it seemed like we can regard log as correct and only - \* append to it if has in the max_epoch, however TLC showed that's not - \* the case; we must always stream since first not matching record. - next_send_lsn == [voter \in DOMAIN prop_state[p].votes |-> 1] - IN - \* we fetch log from the most advanced node (this is separate - \* roundtrip), make sure node is still on one term with us - /\ acc_state[donor].term = prop_state[p].term - /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", - \* fetch the log from donor - ![p].wal = acc_state[donor].wal, - ![p].donor_epoch = max_epoch, - ![p].vcl = max_vote.flush_lsn, - ![p].next_send_lsn = next_send_lsn] - /\ UNCHANGED <> - - -\* acceptor a learns about elected proposer p's term. -UpdateTerm(p, a) == - /\ prop_state[p].state = "leader" - /\ acc_state[a].term < prop_state[p].term - /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] - /\ UNCHANGED <> - - -\* Acceptor a which didn't participate in voting connects to elected proposer p -\* and p sets the streaming point -HandshakeWithLeader(p, a) == - /\ prop_state[p].state = "leader" - /\ acc_state[a].term = prop_state[p].term - /\ a \notin DOMAIN prop_state[p].next_send_lsn - /\ LET - next_send_lsn == prop_state[p].next_send_lsn @@ (a :> 1) - IN - prop_state' = [prop_state EXCEPT ![p].next_send_lsn = next_send_lsn] - /\ UNCHANGED <> - - -\* Append new log entry to elected proposer -NewEntry(p) == - /\ prop_state[p].state = "leader" - /\ Len(prop_state[p].wal) < max_entries \* model constraint - /\ LET - new_lsn == IF Len(prop_state[p].wal) = 0 THEN - prop_state[p].vcl + 1 - ELSE - \* lsn of last record + 1 - prop_state[p].wal[Len(prop_state[p].wal)].lsn + 1 - new_entry == [lsn |-> new_lsn, epoch |-> prop_state[p].term] - IN - /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] - /\ UNCHANGED <> - - -\* Write entry new_e to log wal, rolling back all higher entries if e is different. -\* If bump_epoch is TRUE, it means we get record with lsn=vcl and going to update -\* the epoch. Truncate log in this case as well, as we might have correct <= vcl -\* part and some outdated entries behind it which we want to purge before -\* declaring us as recovered. Another way to accomplish this (in previous commit) -\* is wait for first-entry-from-new-epoch before bumping it. -WriteEntry(wal, new_e, bump_epoch) == - (new_e.lsn :> new_e) @@ - \* If wal has entry with such lsn and it is different, truncate all higher log. - IF \/ (new_e.lsn \in DOMAIN wal /\ wal[new_e.lsn] /= new_e) - \/ bump_epoch THEN - SelectSeq(wal, LAMBDA e: e.lsn < new_e.lsn) - ELSE - wal - - -\* Try to transfer entry from elected proposer p to acceptor a -TransferEntry(p, a) == - /\ prop_state[p].state = "leader" - /\ prop_state[p].term = acc_state[a].term - /\ a \in DOMAIN prop_state[p].next_send_lsn - /\ LET - next_e == NextEntry(p, a) - IN - /\ next_e /= NULL - /\ LET - \* Consider bumping epoch if getting this entry recovers the acceptor, - \* that is, we reach first record behind VCL. - new_epoch == - IF /\ acc_state[a].epoch < prop_state[p].term - /\ next_e.lsn >= prop_state[p].vcl - THEN - prop_state[p].term - ELSE - acc_state[a].epoch - \* Also check whether this entry allows to advance commit_lsn and - \* if so, bump it where possible. Modeling this as separate action - \* significantly bloats the space (5m vs 15m on max_entries=3 max_term=3, - \* so act immediately. - entry_owners == {o \in acceptors: - /\ o /= a - \* only recovered acceptors advance commit_lsn - /\ acc_state[o].epoch = prop_state[p].term - /\ next_e \in FValues(acc_state[o].wal)} \cup {a} - IN - /\ acc_state' = [acc_state EXCEPT ![a].wal = WriteEntry(acc_state[a].wal, next_e, new_epoch /= acc_state[a].epoch), - ![a].epoch = new_epoch] - /\ prop_state' = [prop_state EXCEPT ![p].next_send_lsn[a] = - prop_state[p].next_send_lsn[a] + 1] - /\ commit_lsns' = IF /\ new_epoch = prop_state[p].term - /\ Quorum(entry_owners) - THEN - [acc \in acceptors |-> - IF /\ acc \in entry_owners - /\ next_e.lsn > commit_lsns[acc] - THEN - next_e.lsn - ELSE - commit_lsns[acc]] - ELSE - commit_lsns - - -\******************************************************************************* -\* Final spec -\******************************************************************************* - -Next == - \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) - \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) - \/ \E p \in proposers: BecomeLeader(p) - \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) - \/ \E p \in proposers: \E a \in acceptors: HandshakeWithLeader(p, a) - \/ \E p \in proposers: NewEntry(p) - \/ \E p \in proposers: \E a \in acceptors: TransferEntry(p, a) - -Spec == Init /\ [][Next]_<> - - -\******************************************************************************** -\* Invariants -\******************************************************************************** - -\* we don't track history, but this property is fairly convincing anyway -ElectionSafety == - \A p1, p2 \in proposers: - (/\ prop_state[p1].state = "leader" - /\ prop_state[p2].state = "leader" - /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) - -LogIsMonotonic == - \A a \in acceptors: - \A i \in DOMAIN acc_state[a].wal: \A j \in DOMAIN acc_state[a].wal: - (i > j) => (/\ acc_state[a].wal[i].lsn > acc_state[a].wal[j].lsn - /\ acc_state[a].wal[i].epoch >= acc_state[a].wal[j].epoch) - -\* Main invariant: log under commit_lsn must match everywhere. -LogSafety == - \A a1 \in acceptors: \A a2 \in acceptors: - LET - common_len == Min(commit_lsns[a1], commit_lsns[a2]) - IN - SubSeq(acc_state[a1].wal, 1, common_len) = SubSeq(acc_state[a2].wal, 1, common_len) - -\* Next record we are going to push to acceptor must never overwrite committed -\* different record. -CommittedNotOverwritten == - \A p \in proposers: \A a \in acceptors: - (/\ prop_state[p].state = "leader" - /\ prop_state[p].term = acc_state[a].term - /\ a \in DOMAIN prop_state[p].next_send_lsn) => - LET - next_e == NextEntry(p, a) - IN - (next_e /= NULL) => - ((commit_lsns[a] >= next_e.lsn) => (acc_state[a].wal[next_e.lsn] = next_e)) - - -==== \ No newline at end of file diff --git a/safekeeper/spec/ProposerAcceptorStatic.tla b/safekeeper/spec/ProposerAcceptorStatic.tla new file mode 100644 index 0000000000..b2d2f005db --- /dev/null +++ b/safekeeper/spec/ProposerAcceptorStatic.tla @@ -0,0 +1,449 @@ +---- MODULE ProposerAcceptorStatic ---- + +(* + The protocol is very similar to Raft. The key differences are: + - Leaders (proposers) are separated from storage nodes (acceptors), which has + been already an established way to think about Paxos. + - We don't want to stamp each log record with term, so instead carry around + term histories which are sequences of pairs. + As a bonus (and subtlety) this allows the proposer to commit entries from + previous terms without writing new records -- if acceptor's log is caught + up, update of term history on it updates last_log_term as well. +*) + +\* Model simplifications: +\* - Instant message delivery. Notably, ProposerElected message (TruncateWal action) is not +\* delayed, so we don't attempt to truncate WAL when the same wp already appended something +\* on the acceptor since common point had been calculated (this should be rejected). +\* - old WAL is immediately copied to proposer on its election, without on-demand fetch later. + +\* Some ideas how to break it to play around to get a feeling: +\* - replace Quorums with BadQuorums. +\* - remove 'don't commit entries from previous terms separately' rule in +\* CommitEntries and observe figure 8 from the raft paper. +\* With p2a3t4l4 32 steps error was found in 1h on 80 cores. + +EXTENDS Integers, Sequences, FiniteSets, TLC + +VARIABLES + prop_state, \* prop_state[p] is state of proposer p + acc_state, \* acc_state[a] is state of acceptor a + committed, \* bag (set) of ever committed <> entries + elected_history \* counter for elected terms, see TypeOk for details + +CONSTANT + acceptors, + proposers + +CONSTANT NULL + +\******************************************************************************** +\* Helpers +\******************************************************************************** + +Maximum(S) == + (*************************************************************************) + (* If S is a set of numbers, then this define Maximum(S) to be the *) + (* maximum of those numbers, or -1 if S is empty. *) + (*************************************************************************) + IF S = {} THEN -1 ELSE CHOOSE n \in S : \A m \in S : n \geq m + +\* minimum of numbers in the set, error if set is empty +Minimum(S) == CHOOSE min \in S : \A n \in S : min <= n + +\* Min of two numbers +Min(a, b) == IF a < b THEN a ELSE b + +\* Sort of 0 for functions +EmptyF == [x \in {} |-> 42] +IsEmptyF(f) == DOMAIN f = {} + +\* Set of values (image) of the function f. Apparently no such builtin. +Range(f) == {f[x] : x \in DOMAIN f} + +\* If key k is in function f, map it using l, otherwise insert v. Returns the +\* updated function. +Upsert(f, k, v, l(_)) == + LET new_val == IF k \in DOMAIN f THEN l(f[k]) ELSE v IN + (k :> new_val) @@ f + +\***************** + +NumAccs == Cardinality(acceptors) + +\* does acc_set form the quorum? +Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) +\* all quorums of acceptors +Quorums == {subset \in SUBSET acceptors: Quorum(subset)} + +\* For substituting Quorums and seeing what happens. +BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2) +BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)} + +\* flushLsn (end of WAL, i.e. index of next entry) of acceptor a. +FlushLsn(a) == Len(acc_state[a].wal) + 1 + +\* Typedefs. Note that TLA+ Nat includes zero. +Terms == Nat +Lsns == Nat + +\******************************************************************************** +\* Type assertion +\******************************************************************************** +\* Defining sets of all possible tuples and using them in TypeOk in usual +\* all-tuples constructor is not practical because such definitions force +\* TLC to enumerate them, while they are are horribly enormous +\* (TLC screams "Attempted to construct a set with too many elements"). +\* So instead check types manually. + + +\* Term history is a sequence of pairs. +IsTermHistory(th) == + \A th_entry \in Range(th): th_entry.term \in Terms /\ th_entry.lsn \in Lsns + +IsWal(w) == + \A i \in DOMAIN w: + /\ i \in Lsns + /\ w[i] \in Terms + +TypeOk == + /\ \A p \in proposers: + \* '_' in field names hinders pretty printing + \* https://github.com/tlaplus/tlaplus/issues/1051 + \* so use camel case. + /\ DOMAIN prop_state[p] = {"state", "term", "votes", "termHistory", "wal", "nextSendLsn"} + \* In campaign proposer sends RequestVote and waits for acks; + \* in leader he is elected. + /\ prop_state[p].state \in {"campaign", "leader"} + \* term for which it will campaign, or won term in leader state + /\ prop_state[p].term \in Terms + \* votes received + /\ \A voter \in DOMAIN prop_state[p].votes: voter \in acceptors + /\ \A vote \in Range(prop_state[p].votes): + /\ IsTermHistory(vote.termHistory) + /\ vote.flushLsn \in Lsns + \* Proposer's term history. Empty while proposer is in "campaign". + /\ IsTermHistory(prop_state[p].termHistory) + \* In the model we identify WAL entries only by pairs + \* without additional unique id, which is enough for its purposes. + \* It means that with term history fully modeled wal becomes + \* redundant as it can be computed from term history + WAL length. + \* However, we still keep it here and at acceptors as explicit sequence + \* where index is LSN and value is the term to avoid artificial mapping to + \* figure out real entries. It shouldn't bloat model much because this + \* doesn't increase number of distinct states. + /\ IsWal(prop_state[p].wal) + \* Map of acceptor -> next lsn to send. It is set when truncate_wal is + \* done so sending entries is allowed only after that. In the impl TCP + \* ensures this ordering. + /\ \A a \in DOMAIN prop_state[p].nextSendLsn: + /\ a \in acceptors + /\ prop_state[p].nextSendLsn[a] \in Lsns + /\ \A a \in acceptors: + /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"} + /\ acc_state[a].term \in Terms + /\ IsTermHistory(acc_state[a].termHistory) + /\ IsWal(acc_state[a].wal) + /\ \A c \in committed: + /\ c.term \in Terms + /\ c.lsn \in Lsns + \* elected_history is a retrospective map of term -> number of times it was + \* elected, for use in ElectionSafetyFull invariant. For static spec it is + \* fairly convincing that it holds, but with membership change it is less + \* trivial. And as we identify log entries only with , importance + \* of it is quite high as violation of log safety might go undetected if + \* election safety is violated. Note though that this is not always the + \* case, i.e. you can imagine (and TLC should find) schedule where log + \* safety violation is still detected because two leaders with the same term + \* commit histories which are different in previous terms, so it is not that + \* crucial. Plus if spec allows ElectionSafetyFull violation, likely + \* ElectionSafety will also be violated in some schedules. But neither it + \* should bloat the model too much. + /\ \A term \in DOMAIN elected_history: + /\ term \in Terms + /\ elected_history[term] \in Nat + +\******************************************************************************** +\* Initial +\******************************************************************************** + +Init == + /\ prop_state = [p \in proposers |-> [ + state |-> "campaign", + term |-> 1, + votes |-> EmptyF, + termHistory |-> << >>, + wal |-> << >>, + nextSendLsn |-> EmptyF + ]] + /\ acc_state = [a \in acceptors |-> [ + \* There will be no leader in zero term, 1 is the first + \* real. + term |-> 0, + \* Again, leader in term 0 doesn't exist, but we initialize + \* term histories with it to always have common point in + \* them. Lsn is 1 because TLA+ sequences are indexed from 1 + \* (we don't want to truncate WAL out of range). + termHistory |-> << [term |-> 0, lsn |-> 1] >>, + wal |-> << >> + ]] + /\ committed = {} + /\ elected_history = EmptyF + + +\******************************************************************************** +\* Actions +\******************************************************************************** + +\* Proposer loses all state. +\* For simplicity (and to reduct state space), we assume it immediately gets +\* current state from quorum q of acceptors determining the term he will request +\* to vote for. +RestartProposer(p, q) == + /\ Quorum(q) + /\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN + /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", + ![p].term = new_term, + ![p].votes = EmptyF, + ![p].termHistory = << >>, + ![p].wal = << >>, + ![p].nextSendLsn = EmptyF] + /\ UNCHANGED <> + +\* Term history of acceptor a's WAL: the one saved truncated to contain only <= +\* local FlushLsn entries. +AcceptorTermHistory(a) == + SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a)) + +\* Acceptor a immediately votes for proposer p. +Vote(p, a) == + /\ prop_state[p].state = "campaign" + /\ acc_state[a].term < prop_state[p].term \* main voting condition + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ LET + vote == [termHistory |-> AcceptorTermHistory(a), flushLsn |-> FlushLsn(a)] + IN + prop_state' = [prop_state EXCEPT ![p].votes = (a :> vote) @@ prop_state[p].votes] + /\ UNCHANGED <> + + +\* Get lastLogTerm from term history th. +LastLogTerm(th) == th[Len(th)].term + +\* Proposer p gets elected. +BecomeLeader(p) == + /\ prop_state[p].state = "campaign" + /\ Quorum(DOMAIN prop_state[p].votes) + /\ LET + \* Find acceptor with the highest vote. + max_vote_acc == + CHOOSE a \in DOMAIN prop_state[p].votes: + LET v == prop_state[p].votes[a] + IN \A v2 \in Range(prop_state[p].votes): + /\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory) + /\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn) + max_vote == prop_state[p].votes[max_vote_acc] + prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) + IN + \* We copy all log preceding proposer's term from the max vote node so + \* make sure it is still on one term with us. This is a model + \* simplification which can be removed, in impl we fetch WAL on demand + \* from safekeeper which has it later. Note though that in case of on + \* demand fetch we must check on donor not only term match, but that + \* truncate_wal had already been done (if it is not max_vote_acc). + /\ acc_state[max_vote_acc].term = prop_state[p].term + /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", + ![p].termHistory = prop_th, + ![p].wal = acc_state[max_vote_acc].wal + ] + /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) + /\ UNCHANGED <> + + +\* Acceptor a learns about elected proposer p's term. In impl it matches to +\* VoteRequest/VoteResponse exchange when leader is already elected and is not +\* interested in the vote result. +UpdateTerm(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term < prop_state[p].term + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ UNCHANGED <> + +\* Find highest common point (LSN of the first divergent record) in the logs of +\* proposer p and acceptor a. Returns of the highest common point. +FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) == + LET + \* First find index of the highest common term. + \* It must exist because we initialize th with <0, 1>. + last_common_idx == Maximum({i \in 1..Min(Len(prop_th), Len(acc_th)): prop_th[i].term = acc_th[i].term}) + last_common_term == prop_th[last_common_idx].term + \* Now find where it ends at both prop and acc and take min. End of term + \* is the start of the next unless it is the last one; there it is + \* flush_lsn in case of acceptor. In case of proposer it is the current + \* writing position, but it can't be less than flush_lsn, so we + \* take flush_lsn. + acc_common_term_end == IF last_common_idx = Len(acc_th) THEN acc_flush_lsn ELSE acc_th[last_common_idx + 1].lsn + prop_common_term_end == IF last_common_idx = Len(prop_th) THEN acc_flush_lsn ELSE prop_th[last_common_idx + 1].lsn + IN + [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)] + +\* Elected proposer p immediately truncates WAL (and term history) of acceptor a +\* before starting streaming. Establishes nextSendLsn for a. +\* +\* In impl this happens at each reconnection, here we also allow to do it multiple times. +TruncateWal(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + next_send_lsn == (a :> hcp.lsn) @@ prop_state[p].nextSendLsn + IN + \* Acceptor persists full history immediately; reads adjust it to the + \* really existing wal with AcceptorTermHistory. + /\ acc_state' = [acc_state EXCEPT ![a].termHistory = prop_state[p].termHistory, + \* note: SubSeq is inclusive, hence -1. + ![a].wal = SubSeq(acc_state[a].wal, 1, hcp.lsn - 1) + ] + /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn = next_send_lsn] + /\ UNCHANGED <> + +\* Append new log entry to elected proposer +NewEntry(p) == + /\ prop_state[p].state = "leader" + /\ LET + \* entry consists only of term, index serves as LSN. + new_entry == prop_state[p].term + IN + /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] + /\ UNCHANGED <> + +\* Immediately append next entry from elected proposer to acceptor a. +AppendEntry(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal + /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send + /\ LET + send_lsn == prop_state[p].nextSendLsn[a] + entry == prop_state[p].wal[send_lsn] + \* Since message delivery is instant we don't check that send_lsn follows + \* the last acc record, it must always be true. + IN + /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn[a] = send_lsn + 1] + /\ acc_state' = [acc_state EXCEPT ![a].wal = Append(acc_state[a].wal, entry)] + /\ UNCHANGED <> + +\* LSN where elected proposer p starts writing its records. +PropStartLsn(p) == + IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL + +\* Proposer p commits all entries it can using quorum q. Note that unlike +\* will62794/logless-reconfig this allows to commit entries from previous terms +\* (when conditions for that are met). +CommitEntries(p, q) == + /\ prop_state[p].state = "leader" + /\ \A a \in q: + /\ acc_state[a].term = prop_state[p].term + \* nextSendLsn existence means TruncateWal has happened, it ensures + \* acceptor's WAL (and FlushLsn) are from proper proposer's history. + \* Alternatively we could compare LastLogTerm here, but that's closer to + \* what we do in the impl (we check flushLsn in AppendResponse, but + \* AppendRequest is processed only if HandleElected handling was good). + /\ a \in DOMAIN prop_state[p].nextSendLsn + \* Now find the LSN present on all the quorum. + /\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN + \* This is the basic Raft rule of not committing entries from previous + \* terms except along with current term entry (commit them only when + \* quorum recovers, i.e. last_log_term on it reaches leader's term). + /\ quorum_lsn >= PropStartLsn(p) + /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)} + /\ UNCHANGED <> + +\******************************************************************************* +\* Final spec +\******************************************************************************* + +Next == + \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) + \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) + \/ \E p \in proposers: BecomeLeader(p) + \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) + \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) + \/ \E p \in proposers: NewEntry(p) + \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) + \/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q) + +Spec == Init /\ [][Next]_<> + + +\******************************************************************************** +\* Invariants +\******************************************************************************** + +\* Lighter version of ElectionSafetyFull which doesn't require elected_history. +ElectionSafety == + \A p1, p2 \in proposers: + (/\ prop_state[p1].state = "leader" + /\ prop_state[p2].state = "leader" + /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) + +\* Single term must never be elected more than once. +ElectionSafetyFull == \A term \in DOMAIN elected_history: elected_history[term] <= 1 + +\* Log is expected to be monotonic by comparison. This is not true +\* in variants of multi Paxos, but in Raft (and here) it is. +LogIsMonotonic == + \A a \in acceptors: + \A i, j \in DOMAIN acc_state[a].wal: + (i > j) => (acc_state[a].wal[i] >= acc_state[a].wal[j]) + +\* Main invariant: If two entries are committed at the same LSN, they must be +\* the same entry. +LogSafety == + \A c1, c2 \in committed: (c1.lsn = c2.lsn) => (c1 = c2) + + +\******************************************************************************** +\* Invariants which don't need to hold, but useful for playing/debugging. +\******************************************************************************** + +\* Limits term of elected proposers +MaxTerm == \A p \in proposers: (prop_state[p].state = "leader" => prop_state[p].term < 2) + +MaxAccWalLen == \A a \in acceptors: Len(acc_state[a].wal) < 2 + +\* Limits max number of committed entries. That way we can check that we'are +\* actually committing something. +MaxCommitLsn == Cardinality(committed) < 2 + +\* How many records with different terms can be removed in single WAL +\* truncation. +MaxTruncatedTerms == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term) => + LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn} + truncated_records_terms == {acc_state[a].wal[lsn]: lsn \in truncated_lsns} + IN + Cardinality(truncated_records_terms) < 2 + +\* Check that TruncateWal never deletes committed record. +\* It might seem that this should an invariant, but it is not. +\* With 5 nodes, it is legit to truncate record which had been +\* globally committed: e.g. nodes abc can commit record of term 1 in +\* term 3, and after that leader of term 2 can delete such record +\* on d. On 10 cores TLC can find such a trace in ~7 hours. +CommittedNotTruncated == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term) => + LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn} + truncated_records == {[term |-> acc_state[a].wal[lsn], lsn |-> lsn]: lsn \in truncated_lsns} + IN + \A r \in truncated_records: r \notin committed + +==== diff --git a/safekeeper/spec/modelcheck.sh b/safekeeper/spec/modelcheck.sh new file mode 100755 index 0000000000..21ead7dad8 --- /dev/null +++ b/safekeeper/spec/modelcheck.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Usage: ./modelcheck.sh , e.g. +# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla +CONFIG=$1 +SPEC=$2 + +MEM=7G +TOOLSPATH="/opt/TLA+Toolbox/tla2tools.jar" + +mkdir -p "tlc-results" +CONFIG_FILE=$(basename -- "$CONFIG") +outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log +outfile="tlc-results/$outfilename" +touch $outfile + +# Save some info about the run. +GIT_REV=`git rev-parse --short HEAD` +INFO=`uname -a` + +# First for Linux, second for Mac. +CPUNAMELinux=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1') +CPUCORESLinux=`nproc` +CPUNAMEMac=`sysctl -n machdep.cpu.brand_string` +CPUCORESMac=`sysctl -n machdep.cpu.thread_count` + +echo "git revision: $GIT_REV" >> $outfile +echo "Platform: $INFO" >> $outfile +echo "CPU Info Linux: $CPUNAMELinux" >> $outfile +echo "CPU Cores Linux: $CPUCORESLinux" >> $outfile +echo "CPU Info Mac: $CPUNAMEMac" >> $outfile +echo "CPU Cores Mac: $CPUCORESMac" >> $outfile +echo "Spec: $SPEC" >> $outfile +echo "Config: $CONFIG" >> $outfile +echo "----" >> $outfile +cat $CONFIG >> $outfile +echo "" >> $outfile +echo "----" >> $outfile +echo "" >> $outfile + +# see +# https://lamport.azurewebsites.net/tla/current-tools.pdf +# for TLC options. +# OffHeapDiskFPSet is the optimal fingerprint set implementation +# https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets +# +# Add -simulate to run in infinite simulation mode. +java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \ + -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg new file mode 100644 index 0000000000..c06109c601 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg @@ -0,0 +1,19 @@ +\* A very small model just to play. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg new file mode 100644 index 0000000000..5d10fa960f --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg @@ -0,0 +1,19 @@ +\* A model next to the smallest one. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg new file mode 100644 index 0000000000..8ba8ce95a4 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg @@ -0,0 +1,17 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg new file mode 100644 index 0000000000..4763a34ec4 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg @@ -0,0 +1,17 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg new file mode 100644 index 0000000000..ebf4724633 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg new file mode 100644 index 0000000000..bb77350c58 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg new file mode 100644 index 0000000000..9a5e142f99 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 4 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/readme.md b/safekeeper/spec/readme.md new file mode 100644 index 0000000000..ec2649d87d --- /dev/null +++ b/safekeeper/spec/readme.md @@ -0,0 +1,12 @@ +The specifications, models and results of running of them of the compute <-> +safekeepers consensus algorithm for committing WAL on the fleet of safekeepers. +Following Paxos parlance, compute which writes WAL is called (WAL) proposer here +and safekeepers which persist it are called (WAL) acceptors. + +Directory structure: +- Use modelcheck.sh to run TLC. +- MC*.tla contains bits of TLA+ needed for TLC like constraining the state space, and models/ actual models. +- Other .tla files are the actual specs. + +Structure is partially borrowed from +[logless-reconfig](https://github.com/will62794/logless-reconfig), thanks to it. diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log new file mode 100644 index 0000000000..768722b1eb --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log @@ -0,0 +1,63 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg +---- +\* A very small model just to play. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 110 and seed 3949669318051689745 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 46037] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11123278435718411444/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11123278435718411444/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-11123278435718411444/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-11123278435718411444/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-11123278435718411444/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-11123278435718411444/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-11123278435718411444/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 13:44:18) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 13:44:20. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 2.9E-9 + based on the actual fingerprints: val = 4.1E-10 +922134 states generated, 61249 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 31. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 6 and the 95th percentile is 3). +Finished in 11s at (2024-11-06 13:44:28) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log new file mode 100644 index 0000000000..ae3ba98da6 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log @@ -0,0 +1,69 @@ +git revision: bcbff084a +Platform: Linux nonlibrem 6.10.11-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.10.11-1 (2024-09-22) x86_64 GNU/Linux +CPU Info Linux: 13th Gen Intel(R) Core(TM) i7-1355U +CPU Cores Linux: 10 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg +---- +\* A model next to the smallest one. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: cc65eef) +Running breadth-first search Model-Checking with fp 41 and seed -3061068726727581619 with 10 workers on 10 cores with 6372MB heap and 7168MB offheap memory [pid: 1250346] (Linux 6.10.11-amd64 amd64, Debian 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/ars/neon/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3023124431504466774/TLC.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/ars/neon/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3023124431504466774/_TLCTrace.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-3023124431504466774/Integers.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-3023124431504466774/Sequences.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-3023124431504466774/FiniteSets.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-3023124431504466774/Naturals.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-3023124431504466774/TLCExt.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-15 12:09:59) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-15 12:10:00. +Progress(19) at 2024-11-15 12:10:03: 464,696 states generated (464,696 s/min), 57,859 distinct states found (57,859 ds/min), 21,435 states left on queue. +Progress(26) at 2024-11-15 12:11:03: 8,813,399 states generated (8,348,703 s/min), 877,254 distinct states found (819,395 ds/min), 214,794 states left on queue. +Progress(27) at 2024-11-15 12:12:03: 16,121,858 states generated (7,308,459 s/min), 1,464,707 distinct states found (587,453 ds/min), 274,230 states left on queue. +Progress(29) at 2024-11-15 12:13:03: 23,073,903 states generated (6,952,045 s/min), 1,948,802 distinct states found (484,095 ds/min), 263,697 states left on queue. +Progress(31) at 2024-11-15 12:14:03: 29,740,681 states generated (6,666,778 s/min), 2,331,052 distinct states found (382,250 ds/min), 185,484 states left on queue. +Progress(34) at 2024-11-15 12:15:03: 36,085,876 states generated (6,345,195 s/min), 2,602,370 distinct states found (271,318 ds/min), 31,659 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 4.9E-6 + based on the actual fingerprints: val = 6.9E-7 +36896322 states generated, 2623542 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 39. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). +Finished in 05min 14s at (2024-11-15 12:15:13) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log new file mode 100644 index 0000000000..46f21cee72 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log @@ -0,0 +1,72 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 126 and seed 2302892334567572769 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 39701] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-15178810317173795942/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-15178810317173795942/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-15178810317173795942/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-15178810317173795942/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-15178810317173795942/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-15178810317173795942/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-15178810317173795942/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 13:03:52) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 13:03:55. +Progress(21) at 2024-11-06 13:03:58: 846,240 states generated (846,240 s/min), 106,298 distinct states found (106,298 ds/min), 41,028 states left on queue. +Progress(28) at 2024-11-06 13:04:58: 27,538,211 states generated (26,691,971 s/min), 2,768,793 distinct states found (2,662,495 ds/min), 782,984 states left on queue. +Progress(30) at 2024-11-06 13:05:58: 54,048,763 states generated (26,510,552 s/min), 5,076,745 distinct states found (2,307,952 ds/min), 1,241,301 states left on queue. +Progress(31) at 2024-11-06 13:06:58: 80,554,724 states generated (26,505,961 s/min), 7,199,201 distinct states found (2,122,456 ds/min), 1,541,574 states left on queue. +Progress(32) at 2024-11-06 13:07:58: 106,991,261 states generated (26,436,537 s/min), 9,121,549 distinct states found (1,922,348 ds/min), 1,686,289 states left on queue. +Progress(33) at 2024-11-06 13:08:58: 133,354,665 states generated (26,363,404 s/min), 10,935,451 distinct states found (1,813,902 ds/min), 1,739,977 states left on queue. +Progress(34) at 2024-11-06 13:09:58: 159,631,385 states generated (26,276,720 s/min), 12,605,372 distinct states found (1,669,921 ds/min), 1,677,447 states left on queue. +Progress(35) at 2024-11-06 13:10:58: 185,862,196 states generated (26,230,811 s/min), 14,138,409 distinct states found (1,533,037 ds/min), 1,501,760 states left on queue. +Progress(36) at 2024-11-06 13:11:58: 212,021,688 states generated (26,159,492 s/min), 15,538,990 distinct states found (1,400,581 ds/min), 1,216,621 states left on queue. +Progress(37) at 2024-11-06 13:12:58: 238,046,160 states generated (26,024,472 s/min), 16,778,583 distinct states found (1,239,593 ds/min), 797,230 states left on queue. +Progress(39) at 2024-11-06 13:13:58: 263,931,163 states generated (25,885,003 s/min), 17,820,786 distinct states found (1,042,203 ds/min), 209,400 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 2.5E-4 + based on the actual fingerprints: val = 7.9E-5 +270257170 states generated, 18005639 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 47. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). +Finished in 10min 25s at (2024-11-06 13:14:17) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log new file mode 100644 index 0000000000..c7cc853af0 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log @@ -0,0 +1,1466 @@ +# Shows LogSafety violation when "don't commit separately entries from previous terms" check is disabled. +git revision: 4f1ee6331 +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 12 and seed -5379034126224420237 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 52295] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-4533438058229992850/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-4533438058229992850/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-4533438058229992850/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-4533438058229992850/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-4533438058229992850/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-4533438058229992850/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-4533438058229992850/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 14:20:26) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 14:20:29. +Progress(20) at 2024-11-06 14:20:32: 1,011,898 states generated (1,011,898 s/min), 140,947 distinct states found (140,947 ds/min), 60,535 states left on queue. +Progress(26) at 2024-11-06 14:21:32: 30,146,518 states generated (29,134,620 s/min), 3,742,736 distinct states found (3,601,789 ds/min), 1,438,779 states left on queue. +Progress(27) at 2024-11-06 14:22:32: 59,362,708 states generated (29,216,190 s/min), 7,210,233 distinct states found (3,467,497 ds/min), 2,708,295 states left on queue. +Progress(28) at 2024-11-06 14:23:32: 88,589,291 states generated (29,226,583 s/min), 10,552,781 distinct states found (3,342,548 ds/min), 3,874,296 states left on queue. +Progress(29) at 2024-11-06 14:24:32: 117,894,209 states generated (29,304,918 s/min), 13,932,498 distinct states found (3,379,717 ds/min), 5,069,960 states left on queue. +Progress(29) at 2024-11-06 14:25:32: 147,338,882 states generated (29,444,673 s/min), 17,180,069 distinct states found (3,247,571 ds/min), 6,146,371 states left on queue. +Progress(29) at 2024-11-06 14:26:32: 176,498,135 states generated (29,159,253 s/min), 20,547,926 distinct states found (3,367,857 ds/min), 7,338,835 states left on queue. +Progress(30) at 2024-11-06 14:27:32: 205,957,044 states generated (29,458,909 s/min), 23,661,090 distinct states found (3,113,164 ds/min), 8,293,570 states left on queue. +Progress(30) at 2024-11-06 14:28:32: 235,390,133 states generated (29,433,089 s/min), 26,892,306 distinct states found (3,231,216 ds/min), 9,369,229 states left on queue. +Progress(30) at 2024-11-06 14:29:32: 264,571,938 states generated (29,181,805 s/min), 30,176,971 distinct states found (3,284,665 ds/min), 10,493,429 states left on queue. +Progress(31) at 2024-11-06 14:30:32: 293,928,191 states generated (29,356,253 s/min), 33,296,160 distinct states found (3,119,189 ds/min), 11,463,686 states left on queue. +Progress(31) at 2024-11-06 14:31:32: 323,436,668 states generated (29,508,477 s/min), 36,347,973 distinct states found (3,051,813 ds/min), 12,365,578 states left on queue. +Progress(31) at 2024-11-06 14:32:32: 352,943,790 states generated (29,507,122 s/min), 39,465,244 distinct states found (3,117,271 ds/min), 13,349,544 states left on queue. +Progress(31) at 2024-11-06 14:33:32: 382,292,863 states generated (29,349,073 s/min), 42,654,621 distinct states found (3,189,377 ds/min), 14,384,363 states left on queue. +Progress(31) at 2024-11-06 14:34:32: 411,385,854 states generated (29,092,991 s/min), 45,941,145 distinct states found (3,286,524 ds/min), 15,509,450 states left on queue. +Progress(31) at 2024-11-06 14:35:32: 440,738,756 states generated (29,352,902 s/min), 48,984,566 distinct states found (3,043,421 ds/min), 16,419,882 states left on queue. +Progress(32) at 2024-11-06 14:36:32: 470,251,558 states generated (29,512,802 s/min), 51,925,693 distinct states found (2,941,127 ds/min), 17,211,457 states left on queue. +Progress(32) at 2024-11-06 14:37:32: 499,714,013 states generated (29,462,455 s/min), 54,955,581 distinct states found (3,029,888 ds/min), 18,114,624 states left on queue. +Progress(32) at 2024-11-06 14:38:32: 529,254,608 states generated (29,540,595 s/min), 57,938,914 distinct states found (2,983,333 ds/min), 18,996,128 states left on queue. +Progress(32) at 2024-11-06 14:39:32: 558,774,398 states generated (29,519,790 s/min), 61,072,943 distinct states found (3,134,029 ds/min), 19,975,689 states left on queue. +Progress(32) at 2024-11-06 14:40:32: 588,134,665 states generated (29,360,267 s/min), 64,148,888 distinct states found (3,075,945 ds/min), 20,922,407 states left on queue. +Progress(32) at 2024-11-06 14:41:32: 617,464,374 states generated (29,329,709 s/min), 67,306,855 distinct states found (3,157,967 ds/min), 21,928,799 states left on queue. +Progress(32) at 2024-11-06 14:42:32: 646,525,281 states generated (29,060,907 s/min), 70,425,194 distinct states found (3,118,339 ds/min), 22,895,971 states left on queue. +Progress(32) at 2024-11-06 14:43:32: 676,054,893 states generated (29,529,612 s/min), 73,351,905 distinct states found (2,926,711 ds/min), 23,703,779 states left on queue. +Progress(33) at 2024-11-06 14:44:32: 705,581,782 states generated (29,526,889 s/min), 76,200,615 distinct states found (2,848,710 ds/min), 24,414,094 states left on queue. +Progress(33) at 2024-11-06 14:45:32: 735,069,836 states generated (29,488,054 s/min), 79,168,244 distinct states found (2,967,629 ds/min), 25,255,224 states left on queue. +Progress(33) at 2024-11-06 14:46:32: 764,659,188 states generated (29,589,352 s/min), 82,024,430 distinct states found (2,856,186 ds/min), 26,011,047 states left on queue. +Progress(33) at 2024-11-06 14:47:32: 794,276,423 states generated (29,617,235 s/min), 84,974,312 distinct states found (2,949,882 ds/min), 26,868,750 states left on queue. +Progress(33) at 2024-11-06 14:48:32: 823,875,831 states generated (29,599,408 s/min), 88,004,386 distinct states found (3,030,074 ds/min), 27,771,984 states left on queue. +Progress(33) at 2024-11-06 14:49:32: 853,138,894 states generated (29,263,063 s/min), 91,006,890 distinct states found (3,002,504 ds/min), 28,636,661 states left on queue. +Checkpointing of run states/24-11-06-14-20-25.868 +Checkpointing completed at (2024-11-06 14:50:32) +Progress(33) at 2024-11-06 14:50:32: 882,514,167 states generated (29,375,273 s/min), 94,011,000 distinct states found (3,004,110 ds/min), 29,534,516 states left on queue. +Progress(33) at 2024-11-06 14:51:32: 911,838,377 states generated (29,324,210 s/min), 97,108,937 distinct states found (3,097,937 ds/min), 30,498,587 states left on queue. +Progress(33) at 2024-11-06 14:52:32: 940,646,920 states generated (28,808,543 s/min), 100,248,865 distinct states found (3,139,928 ds/min), 31,472,191 states left on queue. +Progress(33) at 2024-11-06 14:53:32: 970,074,175 states generated (29,427,255 s/min), 103,170,815 distinct states found (2,921,950 ds/min), 32,265,691 states left on queue. +Progress(33) at 2024-11-06 14:54:32: 999,627,974 states generated (29,553,799 s/min), 106,004,823 distinct states found (2,834,008 ds/min), 33,009,618 states left on queue. +Progress(34) at 2024-11-06 14:55:32: 1,029,148,983 states generated (29,521,009 s/min), 108,740,783 distinct states found (2,735,960 ds/min), 33,616,222 states left on queue. +Progress(34) at 2024-11-06 14:56:32: 1,058,582,001 states generated (29,433,018 s/min), 111,612,965 distinct states found (2,872,182 ds/min), 34,375,212 states left on queue. +Progress(34) at 2024-11-06 14:57:32: 1,088,123,602 states generated (29,541,601 s/min), 114,464,196 distinct states found (2,851,231 ds/min), 35,116,195 states left on queue. +Progress(34) at 2024-11-06 14:58:32: 1,117,684,936 states generated (29,561,334 s/min), 117,252,198 distinct states found (2,788,002 ds/min), 35,817,205 states left on queue. +Progress(34) at 2024-11-06 14:59:32: 1,147,356,249 states generated (29,671,313 s/min), 120,014,476 distinct states found (2,762,278 ds/min), 36,517,255 states left on queue. +Progress(34) at 2024-11-06 15:00:32: 1,176,921,098 states generated (29,564,849 s/min), 122,859,312 distinct states found (2,844,836 ds/min), 37,291,096 states left on queue. +Progress(34) at 2024-11-06 15:01:32: 1,206,454,440 states generated (29,533,342 s/min), 125,830,942 distinct states found (2,971,630 ds/min), 38,147,762 states left on queue. +Progress(34) at 2024-11-06 15:02:32: 1,235,721,673 states generated (29,267,233 s/min), 128,869,493 distinct states found (3,038,551 ds/min), 39,035,481 states left on queue. +Progress(34) at 2024-11-06 15:03:32: 1,265,097,779 states generated (29,376,106 s/min), 131,669,552 distinct states found (2,800,059 ds/min), 39,746,864 states left on queue. +Progress(34) at 2024-11-06 15:04:32: 1,294,408,098 states generated (29,310,319 s/min), 134,604,630 distinct states found (2,935,078 ds/min), 40,584,235 states left on queue. +Progress(34) at 2024-11-06 15:05:32: 1,323,792,755 states generated (29,384,657 s/min), 137,579,390 distinct states found (2,974,760 ds/min), 41,446,478 states left on queue. +Progress(34) at 2024-11-06 15:06:32: 1,353,085,163 states generated (29,292,408 s/min), 140,575,723 distinct states found (2,996,333 ds/min), 42,309,510 states left on queue. +Progress(34) at 2024-11-06 15:07:32: 1,381,809,417 states generated (28,724,254 s/min), 143,655,566 distinct states found (3,079,843 ds/min), 43,220,682 states left on queue. +Progress(34) at 2024-11-06 15:08:32: 1,411,255,848 states generated (29,446,431 s/min), 146,482,192 distinct states found (2,826,626 ds/min), 43,944,938 states left on queue. +Progress(34) at 2024-11-06 15:09:32: 1,440,646,323 states generated (29,390,475 s/min), 149,419,989 distinct states found (2,937,797 ds/min), 44,763,293 states left on queue. +Progress(34) at 2024-11-06 15:10:32: 1,470,298,568 states generated (29,652,245 s/min), 152,041,419 distinct states found (2,621,430 ds/min), 45,311,911 states left on queue. +Progress(35) at 2024-11-06 15:11:32: 1,499,747,712 states generated (29,449,144 s/min), 154,696,867 distinct states found (2,655,448 ds/min), 45,842,895 states left on queue. +Progress(35) at 2024-11-06 15:12:32: 1,529,256,993 states generated (29,509,281 s/min), 157,493,365 distinct states found (2,796,498 ds/min), 46,535,472 states left on queue. +Progress(35) at 2024-11-06 15:13:32: 1,558,829,306 states generated (29,572,313 s/min), 160,256,575 distinct states found (2,763,210 ds/min), 47,212,471 states left on queue. +Progress(35) at 2024-11-06 15:14:32: 1,588,345,878 states generated (29,516,572 s/min), 163,002,602 distinct states found (2,746,027 ds/min), 47,862,117 states left on queue. +Progress(35) at 2024-11-06 15:15:32: 1,617,885,675 states generated (29,539,797 s/min), 165,699,121 distinct states found (2,696,519 ds/min), 48,472,896 states left on queue. +Progress(35) at 2024-11-06 15:16:32: 1,647,559,965 states generated (29,674,290 s/min), 168,343,286 distinct states found (2,644,165 ds/min), 49,065,377 states left on queue. +Progress(35) at 2024-11-06 15:17:32: 1,677,033,250 states generated (29,473,285 s/min), 171,134,409 distinct states found (2,791,123 ds/min), 49,823,330 states left on queue. +Progress(35) at 2024-11-06 15:18:32: 1,706,730,266 states generated (29,697,016 s/min), 173,860,974 distinct states found (2,726,565 ds/min), 50,493,221 states left on queue. +Error: Invariant LogSafety is violated. +Error: The behavior up to this point is: +State 1: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 2: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 3: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 4: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 5: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 6: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 7: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 8: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 9: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 10: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + nextSendLsn |-> (a1 :> 1) ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 11: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 12: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 13: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 14: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 15: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 16: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 3) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 17: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 18: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 19: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 20: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 21: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 22: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 1) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {} + +State 23: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {} + +State 24: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 25: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 26: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 27: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 28: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 29: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 30: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 1 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 31: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 32: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1], [term |-> 4, lsn |-> 1]} + +1712918117 states generated, 174460942 distinct states found, 50658619 states left on queue. +The depth of the complete state graph search is 35. +Finished in 58min 19s at (2024-11-06 15:18:45) +Trace exploration spec path: ./MCProposerAcceptorStatic_TTrace_1730902825.tla diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log new file mode 100644 index 0000000000..8248240ded --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log @@ -0,0 +1,1374 @@ +git revision: 4f1ee6331 +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 84 and seed -1069171980999686913 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 62544] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-6542850091824737097/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-6542850091824737097/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-6542850091824737097/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-6542850091824737097/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-6542850091824737097/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-6542850091824737097/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-6542850091824737097/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 15:30:45) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 15:30:48. +Progress(20) at 2024-11-06 15:30:51: 956,386 states generated (956,386 s/min), 134,121 distinct states found (134,121 ds/min), 57,996 states left on queue. +Progress(27) at 2024-11-06 15:31:51: 30,048,294 states generated (29,091,908 s/min), 3,778,849 distinct states found (3,644,728 ds/min), 1,463,715 states left on queue. +Progress(28) at 2024-11-06 15:32:51: 59,092,248 states generated (29,043,954 s/min), 7,282,332 distinct states found (3,503,483 ds/min), 2,750,944 states left on queue. +Progress(29) at 2024-11-06 15:33:51: 88,333,136 states generated (29,240,888 s/min), 10,694,325 distinct states found (3,411,993 ds/min), 3,955,744 states left on queue. +Progress(29) at 2024-11-06 15:34:51: 117,708,994 states generated (29,375,858 s/min), 14,000,885 distinct states found (3,306,560 ds/min), 5,067,487 states left on queue. +Progress(30) at 2024-11-06 15:35:51: 146,847,667 states generated (29,138,673 s/min), 17,407,824 distinct states found (3,406,939 ds/min), 6,258,337 states left on queue. +Progress(30) at 2024-11-06 15:36:51: 176,211,801 states generated (29,364,134 s/min), 20,626,933 distinct states found (3,219,109 ds/min), 7,302,661 states left on queue. +Progress(31) at 2024-11-06 15:37:51: 205,665,438 states generated (29,453,637 s/min), 23,877,622 distinct states found (3,250,689 ds/min), 8,361,004 states left on queue. +Progress(31) at 2024-11-06 15:38:51: 234,757,357 states generated (29,091,919 s/min), 27,246,813 distinct states found (3,369,191 ds/min), 9,511,916 states left on queue. +Progress(31) at 2024-11-06 15:39:51: 264,154,436 states generated (29,397,079 s/min), 30,383,069 distinct states found (3,136,256 ds/min), 10,494,238 states left on queue. +Progress(31) at 2024-11-06 15:40:51: 293,638,121 states generated (29,483,685 s/min), 33,498,433 distinct states found (3,115,364 ds/min), 11,429,812 states left on queue. +Progress(32) at 2024-11-06 15:41:51: 323,039,991 states generated (29,401,870 s/min), 36,709,338 distinct states found (3,210,905 ds/min), 12,463,752 states left on queue. +Progress(32) at 2024-11-06 15:42:51: 352,081,458 states generated (29,041,467 s/min), 39,979,938 distinct states found (3,270,600 ds/min), 13,531,461 states left on queue. +Progress(32) at 2024-11-06 15:43:51: 381,472,323 states generated (29,390,865 s/min), 43,147,359 distinct states found (3,167,421 ds/min), 14,513,444 states left on queue. +Progress(32) at 2024-11-06 15:44:51: 410,911,764 states generated (29,439,441 s/min), 46,200,793 distinct states found (3,053,434 ds/min), 15,418,951 states left on queue. +Progress(32) at 2024-11-06 15:45:51: 440,514,627 states generated (29,602,863 s/min), 49,210,279 distinct states found (3,009,486 ds/min), 16,263,879 states left on queue. +Progress(33) at 2024-11-06 15:46:51: 470,070,180 states generated (29,555,553 s/min), 52,317,535 distinct states found (3,107,256 ds/min), 17,200,875 states left on queue. +Progress(33) at 2024-11-06 15:47:51: 499,387,268 states generated (29,317,088 s/min), 55,489,376 distinct states found (3,171,841 ds/min), 18,196,719 states left on queue. +Progress(33) at 2024-11-06 15:48:51: 528,308,354 states generated (28,921,086 s/min), 58,716,400 distinct states found (3,227,024 ds/min), 19,225,822 states left on queue. +Progress(33) at 2024-11-06 15:49:51: 557,626,508 states generated (29,318,154 s/min), 61,861,039 distinct states found (3,144,639 ds/min), 20,172,391 states left on queue. +Progress(33) at 2024-11-06 15:50:51: 587,011,551 states generated (29,385,043 s/min), 64,911,520 distinct states found (3,050,481 ds/min), 21,068,246 states left on queue. +Progress(33) at 2024-11-06 15:51:51: 616,469,665 states generated (29,458,114 s/min), 67,862,377 distinct states found (2,950,857 ds/min), 21,888,495 states left on queue. +Progress(33) at 2024-11-06 15:52:51: 646,037,901 states generated (29,568,236 s/min), 70,774,601 distinct states found (2,912,224 ds/min), 22,642,487 states left on queue. +Progress(33) at 2024-11-06 15:53:51: 675,679,292 states generated (29,641,391 s/min), 73,753,124 distinct states found (2,978,523 ds/min), 23,459,982 states left on queue. +Progress(34) at 2024-11-06 15:54:51: 705,213,119 states generated (29,533,827 s/min), 76,751,356 distinct states found (2,998,232 ds/min), 24,319,315 states left on queue. +Progress(34) at 2024-11-06 15:55:51: 734,548,637 states generated (29,335,518 s/min), 79,865,504 distinct states found (3,114,148 ds/min), 25,270,867 states left on queue. +Progress(34) at 2024-11-06 15:56:51: 763,724,351 states generated (29,175,714 s/min), 82,969,406 distinct states found (3,103,902 ds/min), 26,203,099 states left on queue. +Progress(34) at 2024-11-06 15:57:51: 792,795,916 states generated (29,071,565 s/min), 86,092,913 distinct states found (3,123,507 ds/min), 27,124,641 states left on queue. +Progress(34) at 2024-11-06 15:58:51: 822,084,221 states generated (29,288,305 s/min), 89,196,548 distinct states found (3,103,635 ds/min), 28,028,058 states left on queue. +Progress(34) at 2024-11-06 15:59:51: 851,516,510 states generated (29,432,289 s/min), 92,135,078 distinct states found (2,938,530 ds/min), 28,822,750 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 16:00:51) +Progress(34) at 2024-11-06 16:00:51: 880,891,436 states generated (29,374,926 s/min), 95,133,622 distinct states found (2,998,544 ds/min), 29,669,470 states left on queue. +Progress(34) at 2024-11-06 16:01:51: 910,262,536 states generated (29,371,100 s/min), 98,019,631 distinct states found (2,886,009 ds/min), 30,433,293 states left on queue. +Progress(34) at 2024-11-06 16:02:51: 939,689,255 states generated (29,426,719 s/min), 100,814,884 distinct states found (2,795,253 ds/min), 31,083,132 states left on queue. +Progress(34) at 2024-11-06 16:03:51: 969,299,651 states generated (29,610,396 s/min), 103,664,772 distinct states found (2,849,888 ds/min), 31,821,093 states left on queue. +Progress(34) at 2024-11-06 16:04:51: 999,051,292 states generated (29,751,641 s/min), 106,544,287 distinct states found (2,879,515 ds/min), 32,536,946 states left on queue. +Progress(35) at 2024-11-06 16:05:51: 1,028,690,576 states generated (29,639,284 s/min), 109,444,362 distinct states found (2,900,075 ds/min), 33,326,316 states left on queue. +Progress(35) at 2024-11-06 16:06:51: 1,058,155,400 states generated (29,464,824 s/min), 112,439,937 distinct states found (2,995,575 ds/min), 34,167,604 states left on queue. +Progress(35) at 2024-11-06 16:07:51: 1,087,496,744 states generated (29,341,344 s/min), 115,461,649 distinct states found (3,021,712 ds/min), 35,032,974 states left on queue. +Progress(35) at 2024-11-06 16:08:51: 1,116,663,767 states generated (29,167,023 s/min), 118,482,838 distinct states found (3,021,189 ds/min), 35,902,651 states left on queue. +Progress(35) at 2024-11-06 16:09:51: 1,145,439,918 states generated (28,776,151 s/min), 121,562,159 distinct states found (3,079,321 ds/min), 36,785,088 states left on queue. +Progress(35) at 2024-11-06 16:10:51: 1,174,812,354 states generated (29,372,436 s/min), 124,511,721 distinct states found (2,949,562 ds/min), 37,555,204 states left on queue. +Progress(35) at 2024-11-06 16:11:51: 1,204,150,178 states generated (29,337,824 s/min), 127,579,155 distinct states found (3,067,434 ds/min), 38,425,790 states left on queue. +Progress(35) at 2024-11-06 16:12:51: 1,233,620,353 states generated (29,470,175 s/min), 130,490,427 distinct states found (2,911,272 ds/min), 39,188,412 states left on queue. +Progress(35) at 2024-11-06 16:13:51: 1,263,022,331 states generated (29,401,978 s/min), 133,317,160 distinct states found (2,826,733 ds/min), 39,893,070 states left on queue. +Progress(35) at 2024-11-06 16:14:51: 1,292,411,979 states generated (29,389,648 s/min), 136,229,817 distinct states found (2,912,657 ds/min), 40,666,029 states left on queue. +Progress(35) at 2024-11-06 16:15:51: 1,321,695,856 states generated (29,283,877 s/min), 139,081,910 distinct states found (2,852,093 ds/min), 41,389,715 states left on queue. +Progress(35) at 2024-11-06 16:16:51: 1,351,045,560 states generated (29,349,704 s/min), 141,811,662 distinct states found (2,729,752 ds/min), 41,999,267 states left on queue. +Progress(35) at 2024-11-06 16:17:51: 1,380,677,436 states generated (29,631,876 s/min), 144,516,072 distinct states found (2,704,410 ds/min), 42,579,779 states left on queue. +Progress(35) at 2024-11-06 16:18:51: 1,410,332,660 states generated (29,655,224 s/min), 147,269,848 distinct states found (2,753,776 ds/min), 43,232,732 states left on queue. +Progress(35) at 2024-11-06 16:19:51: 1,440,071,594 states generated (29,738,934 s/min), 150,116,683 distinct states found (2,846,835 ds/min), 43,917,859 states left on queue. +Progress(35) at 2024-11-06 16:20:51: 1,469,737,942 states generated (29,666,348 s/min), 152,881,605 distinct states found (2,764,922 ds/min), 44,594,909 states left on queue. +Progress(36) at 2024-11-06 16:21:51: 1,499,124,482 states generated (29,386,540 s/min), 155,722,313 distinct states found (2,840,708 ds/min), 45,306,186 states left on queue. +Progress(36) at 2024-11-06 16:22:51: 1,528,616,635 states generated (29,492,153 s/min), 158,643,911 distinct states found (2,921,598 ds/min), 46,098,600 states left on queue. +Progress(36) at 2024-11-06 16:23:51: 1,557,820,328 states generated (29,203,693 s/min), 161,651,516 distinct states found (3,007,605 ds/min), 46,958,572 states left on queue. +Progress(36) at 2024-11-06 16:24:51: 1,587,341,565 states generated (29,521,237 s/min), 164,469,424 distinct states found (2,817,908 ds/min), 47,648,932 states left on queue. +Progress(36) at 2024-11-06 16:25:51: 1,616,246,807 states generated (28,905,242 s/min), 167,471,199 distinct states found (3,001,775 ds/min), 48,496,844 states left on queue. +Progress(36) at 2024-11-06 16:26:51: 1,645,107,613 states generated (28,860,806 s/min), 170,454,103 distinct states found (2,982,904 ds/min), 49,283,244 states left on queue. +Progress(36) at 2024-11-06 16:27:51: 1,674,492,314 states generated (29,384,701 s/min), 173,343,045 distinct states found (2,888,942 ds/min), 50,006,895 states left on queue. +Progress(36) at 2024-11-06 16:28:51: 1,703,875,027 states generated (29,382,713 s/min), 176,157,623 distinct states found (2,814,578 ds/min), 50,662,128 states left on queue. +Progress(36) at 2024-11-06 16:29:51: 1,733,099,131 states generated (29,224,104 s/min), 179,186,519 distinct states found (3,028,896 ds/min), 51,498,029 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 16:30:52) +Progress(36) at 2024-11-06 16:30:52: 1,762,724,622 states generated (29,625,491 s/min), 181,958,595 distinct states found (2,772,076 ds/min), 52,142,450 states left on queue. +Progress(36) at 2024-11-06 16:31:52: 1,792,118,288 states generated (29,393,666 s/min), 184,725,090 distinct states found (2,766,495 ds/min), 52,785,705 states left on queue. +Progress(36) at 2024-11-06 16:32:52: 1,821,258,069 states generated (29,139,781 s/min), 187,681,452 distinct states found (2,956,362 ds/min), 53,592,610 states left on queue. +Progress(36) at 2024-11-06 16:33:52: 1,850,729,054 states generated (29,470,985 s/min), 190,451,722 distinct states found (2,770,270 ds/min), 54,239,919 states left on queue. +Progress(36) at 2024-11-06 16:34:52: 1,879,860,913 states generated (29,131,859 s/min), 193,207,770 distinct states found (2,756,048 ds/min), 54,886,748 states left on queue. +Progress(36) at 2024-11-06 16:35:52: 1,909,200,565 states generated (29,339,652 s/min), 195,832,123 distinct states found (2,624,353 ds/min), 55,404,535 states left on queue. +Progress(36) at 2024-11-06 16:36:52: 1,938,403,873 states generated (29,203,308 s/min), 198,569,916 distinct states found (2,737,793 ds/min), 55,993,675 states left on queue. +Progress(36) at 2024-11-06 16:37:52: 1,968,097,695 states generated (29,693,822 s/min), 201,148,799 distinct states found (2,578,883 ds/min), 56,501,179 states left on queue. +Progress(36) at 2024-11-06 16:38:52: 1,997,628,304 states generated (29,530,609 s/min), 203,860,765 distinct states found (2,711,966 ds/min), 57,133,283 states left on queue. +Progress(36) at 2024-11-06 16:39:52: 2,027,338,755 states generated (29,710,451 s/min), 206,496,491 distinct states found (2,635,726 ds/min), 57,649,914 states left on queue. +Progress(36) at 2024-11-06 16:40:52: 2,057,072,538 states generated (29,733,783 s/min), 209,189,488 distinct states found (2,692,997 ds/min), 58,229,449 states left on queue. +Progress(36) at 2024-11-06 16:41:52: 2,086,549,250 states generated (29,476,712 s/min), 211,909,869 distinct states found (2,720,381 ds/min), 58,875,611 states left on queue. +Progress(37) at 2024-11-06 16:42:52: 2,115,953,926 states generated (29,404,676 s/min), 214,630,876 distinct states found (2,721,007 ds/min), 59,494,220 states left on queue. +Progress(37) at 2024-11-06 16:43:52: 2,145,423,196 states generated (29,469,270 s/min), 217,412,888 distinct states found (2,782,012 ds/min), 60,176,423 states left on queue. +Progress(37) at 2024-11-06 16:44:52: 2,174,796,796 states generated (29,373,600 s/min), 220,316,140 distinct states found (2,903,252 ds/min), 60,925,815 states left on queue. +Progress(37) at 2024-11-06 16:45:52: 2,203,907,384 states generated (29,110,588 s/min), 223,255,125 distinct states found (2,938,985 ds/min), 61,739,564 states left on queue. +Progress(37) at 2024-11-06 16:46:52: 2,233,378,272 states generated (29,470,888 s/min), 225,995,858 distinct states found (2,740,733 ds/min), 62,364,627 states left on queue. +Progress(37) at 2024-11-06 16:47:52: 2,262,648,334 states generated (29,270,062 s/min), 228,738,653 distinct states found (2,742,795 ds/min), 63,003,155 states left on queue. +Progress(37) at 2024-11-06 16:48:52: 2,291,309,648 states generated (28,661,314 s/min), 231,720,498 distinct states found (2,981,845 ds/min), 63,816,162 states left on queue. +Progress(37) at 2024-11-06 16:49:52: 2,320,153,384 states generated (28,843,736 s/min), 234,599,475 distinct states found (2,878,977 ds/min), 64,513,886 states left on queue. +Progress(37) at 2024-11-06 16:50:52: 2,349,538,907 states generated (29,385,523 s/min), 237,330,640 distinct states found (2,731,165 ds/min), 65,105,576 states left on queue. +Progress(37) at 2024-11-06 16:51:52: 2,379,015,082 states generated (29,476,175 s/min), 240,064,625 distinct states found (2,733,985 ds/min), 65,704,108 states left on queue. +Progress(37) at 2024-11-06 16:52:52: 2,408,376,582 states generated (29,361,500 s/min), 242,869,889 distinct states found (2,805,264 ds/min), 66,339,299 states left on queue. +Progress(37) at 2024-11-06 16:53:52: 2,437,554,516 states generated (29,177,934 s/min), 245,844,106 distinct states found (2,974,217 ds/min), 67,125,834 states left on queue. +Progress(37) at 2024-11-06 16:54:52: 2,466,925,193 states generated (29,370,677 s/min), 248,540,587 distinct states found (2,696,481 ds/min), 67,707,623 states left on queue. +Progress(37) at 2024-11-06 16:55:52: 2,496,386,977 states generated (29,461,784 s/min), 251,318,893 distinct states found (2,778,306 ds/min), 68,345,796 states left on queue. +Progress(37) at 2024-11-06 16:56:52: 2,525,837,965 states generated (29,450,988 s/min), 253,918,986 distinct states found (2,600,093 ds/min), 68,851,521 states left on queue. +Progress(37) at 2024-11-06 16:57:52: 2,555,073,687 states generated (29,235,722 s/min), 256,806,753 distinct states found (2,887,767 ds/min), 69,596,597 states left on queue. +Progress(37) at 2024-11-06 16:58:52: 2,584,381,294 states generated (29,307,607 s/min), 259,714,054 distinct states found (2,907,301 ds/min), 70,335,539 states left on queue. +Progress(37) at 2024-11-06 16:59:52: 2,613,557,081 states generated (29,175,787 s/min), 262,407,462 distinct states found (2,693,408 ds/min), 70,920,265 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 17:00:53) +Progress(37) at 2024-11-06 17:00:53: 2,643,168,141 states generated (29,611,060 s/min), 264,973,171 distinct states found (2,565,709 ds/min), 71,384,749 states left on queue. +Progress(37) at 2024-11-06 17:01:53: 2,672,453,868 states generated (29,285,727 s/min), 267,551,971 distinct states found (2,578,800 ds/min), 71,854,220 states left on queue. +Progress(37) at 2024-11-06 17:02:53: 2,701,696,399 states generated (29,242,531 s/min), 270,233,135 distinct states found (2,681,164 ds/min), 72,406,567 states left on queue. +Progress(37) at 2024-11-06 17:03:53: 2,731,216,488 states generated (29,520,089 s/min), 272,711,390 distinct states found (2,478,255 ds/min), 72,805,269 states left on queue. +Progress(37) at 2024-11-06 17:04:53: 2,760,788,758 states generated (29,572,270 s/min), 275,307,217 distinct states found (2,595,827 ds/min), 73,313,123 states left on queue. +Progress(37) at 2024-11-06 17:05:53: 2,790,339,552 states generated (29,550,794 s/min), 277,881,113 distinct states found (2,573,896 ds/min), 73,833,900 states left on queue. +Progress(37) at 2024-11-06 17:06:53: 2,820,046,206 states generated (29,706,654 s/min), 280,371,086 distinct states found (2,489,973 ds/min), 74,231,258 states left on queue. +Progress(37) at 2024-11-06 17:07:53: 2,849,787,753 states generated (29,741,547 s/min), 283,097,131 distinct states found (2,726,045 ds/min), 74,814,735 states left on queue. +Progress(37) at 2024-11-06 17:08:53: 2,879,520,949 states generated (29,733,196 s/min), 285,608,053 distinct states found (2,510,922 ds/min), 75,293,894 states left on queue. +Progress(37) at 2024-11-06 17:09:53: 2,908,889,760 states generated (29,368,811 s/min), 288,274,872 distinct states found (2,666,819 ds/min), 75,880,480 states left on queue. +Progress(38) at 2024-11-06 17:10:53: 2,938,412,523 states generated (29,522,763 s/min), 290,877,598 distinct states found (2,602,726 ds/min), 76,391,156 states left on queue. +Progress(38) at 2024-11-06 17:11:53: 2,967,963,455 states generated (29,550,932 s/min), 293,492,146 distinct states found (2,614,548 ds/min), 76,932,124 states left on queue. +Progress(38) at 2024-11-06 17:12:53: 2,997,327,370 states generated (29,363,915 s/min), 296,353,306 distinct states found (2,861,160 ds/min), 77,659,606 states left on queue. +Progress(38) at 2024-11-06 17:13:53: 3,026,713,138 states generated (29,385,768 s/min), 299,173,963 distinct states found (2,820,657 ds/min), 78,342,645 states left on queue. +Progress(38) at 2024-11-06 17:14:53: 3,055,986,492 states generated (29,273,354 s/min), 302,024,049 distinct states found (2,850,086 ds/min), 79,071,501 states left on queue. +Progress(38) at 2024-11-06 17:15:53: 3,085,491,974 states generated (29,505,482 s/min), 304,668,970 distinct states found (2,644,921 ds/min), 79,608,084 states left on queue. +Progress(38) at 2024-11-06 17:16:53: 3,114,898,266 states generated (29,406,292 s/min), 307,272,526 distinct states found (2,603,556 ds/min), 80,132,575 states left on queue. +Progress(38) at 2024-11-06 17:17:53: 3,144,023,490 states generated (29,125,224 s/min), 310,022,073 distinct states found (2,749,547 ds/min), 80,777,238 states left on queue. +Progress(38) at 2024-11-06 17:18:53: 3,172,762,795 states generated (28,739,305 s/min), 312,891,905 distinct states found (2,869,832 ds/min), 81,497,739 states left on queue. +Progress(38) at 2024-11-06 17:19:53: 3,201,314,425 states generated (28,551,630 s/min), 315,766,566 distinct states found (2,874,661 ds/min), 82,171,729 states left on queue. +Progress(38) at 2024-11-06 17:20:53: 3,230,713,777 states generated (29,399,352 s/min), 318,365,612 distinct states found (2,599,046 ds/min), 82,638,018 states left on queue. +Progress(38) at 2024-11-06 17:21:53: 3,260,188,634 states generated (29,474,857 s/min), 321,040,810 distinct states found (2,675,198 ds/min), 83,185,708 states left on queue. +Progress(38) at 2024-11-06 17:22:53: 3,289,654,456 states generated (29,465,822 s/min), 323,660,313 distinct states found (2,619,503 ds/min), 83,689,075 states left on queue. +Progress(38) at 2024-11-06 17:23:53: 3,319,003,677 states generated (29,349,221 s/min), 326,391,347 distinct states found (2,731,034 ds/min), 84,261,368 states left on queue. +Progress(38) at 2024-11-06 17:24:53: 3,348,330,685 states generated (29,327,008 s/min), 329,204,934 distinct states found (2,813,587 ds/min), 84,925,046 states left on queue. +Progress(38) at 2024-11-06 17:25:53: 3,377,572,946 states generated (29,242,261 s/min), 331,997,887 distinct states found (2,792,953 ds/min), 85,533,473 states left on queue. +Progress(38) at 2024-11-06 17:26:53: 3,406,881,714 states generated (29,308,768 s/min), 334,599,745 distinct states found (2,601,858 ds/min), 86,047,276 states left on queue. +Progress(38) at 2024-11-06 17:27:53: 3,436,375,389 states generated (29,493,675 s/min), 337,261,572 distinct states found (2,661,827 ds/min), 86,591,357 states left on queue. +Progress(38) at 2024-11-06 17:28:53: 3,465,811,732 states generated (29,436,343 s/min), 339,829,613 distinct states found (2,568,041 ds/min), 87,057,550 states left on queue. +Progress(38) at 2024-11-06 17:29:53: 3,495,144,983 states generated (29,333,251 s/min), 342,566,275 distinct states found (2,736,662 ds/min), 87,671,131 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 17:30:53) +Progress(38) at 2024-11-06 17:30:53: 3,524,611,246 states generated (29,466,263 s/min), 345,366,358 distinct states found (2,800,083 ds/min), 88,316,673 states left on queue. +Progress(38) at 2024-11-06 17:31:53: 3,553,819,331 states generated (29,208,085 s/min), 348,291,666 distinct states found (2,925,308 ds/min), 89,059,679 states left on queue. +Progress(38) at 2024-11-06 17:32:53: 3,583,208,821 states generated (29,389,490 s/min), 350,796,636 distinct states found (2,504,970 ds/min), 89,478,521 states left on queue. +Progress(38) at 2024-11-06 17:33:53: 3,612,329,910 states generated (29,121,089 s/min), 353,414,448 distinct states found (2,617,812 ds/min), 90,008,568 states left on queue. +Progress(38) at 2024-11-06 17:34:53: 3,641,485,253 states generated (29,155,343 s/min), 356,010,441 distinct states found (2,595,993 ds/min), 90,486,313 states left on queue. +Progress(38) at 2024-11-06 17:35:53: 3,670,761,645 states generated (29,276,392 s/min), 358,411,973 distinct states found (2,401,532 ds/min), 90,799,029 states left on queue. +Progress(38) at 2024-11-06 17:36:53: 3,700,008,207 states generated (29,246,562 s/min), 360,943,422 distinct states found (2,531,449 ds/min), 91,235,694 states left on queue. +Progress(38) at 2024-11-06 17:37:53: 3,729,045,761 states generated (29,037,554 s/min), 363,523,499 distinct states found (2,580,077 ds/min), 91,685,579 states left on queue. +Progress(38) at 2024-11-06 17:38:53: 3,758,697,262 states generated (29,651,501 s/min), 365,860,396 distinct states found (2,336,897 ds/min), 92,003,313 states left on queue. +Progress(38) at 2024-11-06 17:39:53: 3,788,188,489 states generated (29,491,227 s/min), 368,369,398 distinct states found (2,509,002 ds/min), 92,452,083 states left on queue. +Progress(38) at 2024-11-06 17:40:53: 3,817,718,772 states generated (29,530,283 s/min), 370,855,965 distinct states found (2,486,567 ds/min), 92,899,812 states left on queue. +Progress(38) at 2024-11-06 17:41:53: 3,847,372,748 states generated (29,653,976 s/min), 373,231,774 distinct states found (2,375,809 ds/min), 93,202,503 states left on queue. +Progress(38) at 2024-11-06 17:42:53: 3,877,091,950 states generated (29,719,202 s/min), 375,934,374 distinct states found (2,702,600 ds/min), 93,775,105 states left on queue. +Progress(38) at 2024-11-06 17:43:53: 3,906,843,295 states generated (29,751,345 s/min), 378,304,497 distinct states found (2,370,123 ds/min), 94,098,611 states left on queue. +Progress(38) at 2024-11-06 17:44:53: 3,936,304,033 states generated (29,460,738 s/min), 380,793,774 distinct states found (2,489,277 ds/min), 94,560,398 states left on queue. +Progress(38) at 2024-11-06 17:45:53: 3,965,687,311 states generated (29,383,278 s/min), 383,366,376 distinct states found (2,572,602 ds/min), 95,062,163 states left on queue. +Progress(38) at 2024-11-06 17:46:53: 3,995,264,758 states generated (29,577,447 s/min), 385,832,314 distinct states found (2,465,938 ds/min), 95,460,777 states left on queue. +Progress(38) at 2024-11-06 17:47:53: 4,024,519,333 states generated (29,254,575 s/min), 388,384,282 distinct states found (2,551,968 ds/min), 95,931,698 states left on queue. +Progress(38) at 2024-11-06 17:48:53: 4,054,053,752 states generated (29,534,419 s/min), 390,990,581 distinct states found (2,606,299 ds/min), 96,493,705 states left on queue. +Progress(38) at 2024-11-06 17:49:53: 4,083,403,606 states generated (29,349,854 s/min), 393,717,328 distinct states found (2,726,747 ds/min), 97,099,592 states left on queue. +Progress(38) at 2024-11-06 17:50:53: 4,112,753,694 states generated (29,350,088 s/min), 396,441,909 distinct states found (2,724,581 ds/min), 97,694,523 states left on queue. +Progress(38) at 2024-11-06 17:51:53: 4,141,940,951 states generated (29,187,257 s/min), 399,238,612 distinct states found (2,796,703 ds/min), 98,387,103 states left on queue. +Progress(38) at 2024-11-06 17:52:53: 4,171,185,273 states generated (29,244,322 s/min), 401,861,376 distinct states found (2,622,764 ds/min), 98,900,168 states left on queue. +Progress(38) at 2024-11-06 17:53:53: 4,200,735,055 states generated (29,549,782 s/min), 404,419,627 distinct states found (2,558,251 ds/min), 99,388,507 states left on queue. +Progress(38) at 2024-11-06 17:54:53: 4,230,057,902 states generated (29,322,847 s/min), 406,926,477 distinct states found (2,506,850 ds/min), 99,826,562 states left on queue. +Progress(38) at 2024-11-06 17:55:53: 4,259,279,515 states generated (29,221,613 s/min), 409,512,606 distinct states found (2,586,129 ds/min), 100,340,214 states left on queue. +Progress(38) at 2024-11-06 17:56:53: 4,288,265,663 states generated (28,986,148 s/min), 412,254,402 distinct states found (2,741,796 ds/min), 100,966,036 states left on queue. +Progress(38) at 2024-11-06 17:57:53: 4,316,798,413 states generated (28,532,750 s/min), 415,047,481 distinct states found (2,793,079 ds/min), 101,589,869 states left on queue. +Progress(38) at 2024-11-06 17:58:53: 4,345,527,290 states generated (28,728,877 s/min), 417,768,588 distinct states found (2,721,107 ds/min), 102,133,503 states left on queue. +Progress(38) at 2024-11-06 17:59:53: 4,374,924,942 states generated (29,397,652 s/min), 420,254,082 distinct states found (2,485,494 ds/min), 102,500,461 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 18:00:54) +Progress(38) at 2024-11-06 18:00:54: 4,404,604,911 states generated (29,679,969 s/min), 422,801,691 distinct states found (2,547,609 ds/min), 102,936,440 states left on queue. +Progress(38) at 2024-11-06 18:01:54: 4,434,018,901 states generated (29,413,990 s/min), 425,477,119 distinct states found (2,675,428 ds/min), 103,472,987 states left on queue. +Progress(38) at 2024-11-06 18:02:54: 4,463,498,297 states generated (29,479,396 s/min), 427,949,289 distinct states found (2,472,170 ds/min), 103,858,839 states left on queue. +Progress(38) at 2024-11-06 18:03:54: 4,492,775,931 states generated (29,277,634 s/min), 430,592,094 distinct states found (2,642,805 ds/min), 104,353,609 states left on queue. +Progress(38) at 2024-11-06 18:04:54: 4,522,002,300 states generated (29,226,369 s/min), 433,322,584 distinct states found (2,730,490 ds/min), 104,949,753 states left on queue. +Progress(38) at 2024-11-06 18:05:54: 4,551,375,180 states generated (29,372,880 s/min), 436,005,138 distinct states found (2,682,554 ds/min), 105,482,546 states left on queue. +Progress(38) at 2024-11-06 18:06:54: 4,580,718,169 states generated (29,342,989 s/min), 438,516,579 distinct states found (2,511,441 ds/min), 105,868,435 states left on queue. +Progress(38) at 2024-11-06 18:07:54: 4,609,859,344 states generated (29,141,175 s/min), 441,134,700 distinct states found (2,618,121 ds/min), 106,390,335 states left on queue. +Progress(38) at 2024-11-06 18:08:54: 4,639,331,150 states generated (29,471,806 s/min), 443,662,679 distinct states found (2,527,979 ds/min), 106,821,264 states left on queue. +Progress(38) at 2024-11-06 18:09:54: 4,668,696,820 states generated (29,365,670 s/min), 446,222,969 distinct states found (2,560,290 ds/min), 107,277,508 states left on queue. +Progress(38) at 2024-11-06 18:10:54: 4,698,140,829 states generated (29,444,009 s/min), 448,693,022 distinct states found (2,470,053 ds/min), 107,654,262 states left on queue. +Progress(38) at 2024-11-06 18:11:54: 4,727,380,985 states generated (29,240,156 s/min), 451,459,276 distinct states found (2,766,254 ds/min), 108,284,101 states left on queue. +Progress(38) at 2024-11-06 18:12:54: 4,756,654,088 states generated (29,273,103 s/min), 454,180,180 distinct states found (2,720,904 ds/min), 108,879,205 states left on queue. +Progress(38) at 2024-11-06 18:13:54: 4,785,893,104 states generated (29,239,016 s/min), 457,001,077 distinct states found (2,820,897 ds/min), 109,511,015 states left on queue. +Progress(38) at 2024-11-06 18:14:54: 4,815,289,339 states generated (29,396,235 s/min), 459,530,340 distinct states found (2,529,263 ds/min), 109,951,588 states left on queue. +Progress(38) at 2024-11-06 18:15:54: 4,844,354,767 states generated (29,065,428 s/min), 462,144,567 distinct states found (2,614,227 ds/min), 110,455,692 states left on queue. +Progress(38) at 2024-11-06 18:16:54: 4,873,381,465 states generated (29,026,698 s/min), 464,718,128 distinct states found (2,573,561 ds/min), 110,936,992 states left on queue. +Progress(38) at 2024-11-06 18:17:54: 4,902,616,179 states generated (29,234,714 s/min), 467,171,620 distinct states found (2,453,492 ds/min), 111,288,450 states left on queue. +Progress(38) at 2024-11-06 18:18:54: 4,931,808,383 states generated (29,192,204 s/min), 469,593,253 distinct states found (2,421,633 ds/min), 111,607,240 states left on queue. +Progress(38) at 2024-11-06 18:19:54: 4,961,319,800 states generated (29,511,417 s/min), 471,795,067 distinct states found (2,201,814 ds/min), 111,770,077 states left on queue. +Progress(38) at 2024-11-06 18:20:54: 4,990,051,892 states generated (28,732,092 s/min), 474,595,717 distinct states found (2,800,650 ds/min), 112,380,795 states left on queue. +Progress(38) at 2024-11-06 18:21:54: 5,019,620,389 states generated (29,568,497 s/min), 476,860,178 distinct states found (2,264,461 ds/min), 112,610,789 states left on queue. +Progress(38) at 2024-11-06 18:22:54: 5,049,176,225 states generated (29,555,836 s/min), 479,117,000 distinct states found (2,256,822 ds/min), 112,849,809 states left on queue. +Progress(38) at 2024-11-06 18:23:54: 5,078,659,511 states generated (29,483,286 s/min), 481,552,566 distinct states found (2,435,566 ds/min), 113,238,679 states left on queue. +Progress(38) at 2024-11-06 18:24:54: 5,108,186,428 states generated (29,526,917 s/min), 483,970,290 distinct states found (2,417,724 ds/min), 113,645,974 states left on queue. +Progress(38) at 2024-11-06 18:25:54: 5,137,766,496 states generated (29,580,068 s/min), 486,204,445 distinct states found (2,234,155 ds/min), 113,816,273 states left on queue. +Progress(38) at 2024-11-06 18:26:54: 5,167,429,477 states generated (29,662,981 s/min), 488,726,479 distinct states found (2,522,034 ds/min), 114,265,425 states left on queue. +Progress(38) at 2024-11-06 18:27:54: 5,197,227,715 states generated (29,798,238 s/min), 491,213,848 distinct states found (2,487,369 ds/min), 114,645,624 states left on queue. +Progress(38) at 2024-11-06 18:28:54: 5,226,883,420 states generated (29,655,705 s/min), 493,480,968 distinct states found (2,267,120 ds/min), 114,901,786 states left on queue. +Progress(38) at 2024-11-06 18:29:54: 5,256,355,905 states generated (29,472,485 s/min), 495,866,549 distinct states found (2,385,581 ds/min), 115,277,276 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 18:30:55) +Progress(38) at 2024-11-06 18:30:55: 5,286,035,252 states generated (29,679,347 s/min), 498,324,679 distinct states found (2,458,130 ds/min), 115,663,015 states left on queue. +Progress(38) at 2024-11-06 18:31:55: 5,315,467,724 states generated (29,432,472 s/min), 500,723,577 distinct states found (2,398,898 ds/min), 116,023,619 states left on queue. +Progress(38) at 2024-11-06 18:32:55: 5,344,728,453 states generated (29,260,729 s/min), 503,156,876 distinct states found (2,433,299 ds/min), 116,384,801 states left on queue. +Progress(38) at 2024-11-06 18:33:55: 5,374,055,231 states generated (29,326,778 s/min), 505,588,957 distinct states found (2,432,081 ds/min), 116,786,679 states left on queue. +Progress(38) at 2024-11-06 18:34:55: 5,403,566,278 states generated (29,511,047 s/min), 508,096,703 distinct states found (2,507,746 ds/min), 117,258,425 states left on queue. +Progress(38) at 2024-11-06 18:35:55: 5,432,770,932 states generated (29,204,654 s/min), 510,765,370 distinct states found (2,668,667 ds/min), 117,821,443 states left on queue. +Progress(38) at 2024-11-06 18:36:55: 5,462,325,607 states generated (29,554,675 s/min), 513,306,027 distinct states found (2,540,657 ds/min), 118,252,946 states left on queue. +Progress(38) at 2024-11-06 18:37:55: 5,491,531,381 states generated (29,205,774 s/min), 516,017,383 distinct states found (2,711,356 ds/min), 118,857,035 states left on queue. +Progress(38) at 2024-11-06 18:38:55: 5,520,744,572 states generated (29,213,191 s/min), 518,696,783 distinct states found (2,679,400 ds/min), 119,445,954 states left on queue. +Progress(38) at 2024-11-06 18:39:55: 5,549,903,819 states generated (29,159,247 s/min), 521,329,662 distinct states found (2,632,879 ds/min), 119,977,569 states left on queue. +Progress(38) at 2024-11-06 18:40:55: 5,579,474,839 states generated (29,571,020 s/min), 523,702,578 distinct states found (2,372,916 ds/min), 120,289,041 states left on queue. +Progress(38) at 2024-11-06 18:41:55: 5,608,757,550 states generated (29,282,711 s/min), 526,191,629 distinct states found (2,489,051 ds/min), 120,719,632 states left on queue. +Progress(38) at 2024-11-06 18:42:55: 5,638,085,090 states generated (29,327,540 s/min), 528,478,505 distinct states found (2,286,876 ds/min), 120,990,568 states left on queue. +Progress(38) at 2024-11-06 18:43:55: 5,667,141,833 states generated (29,056,743 s/min), 531,035,593 distinct states found (2,557,088 ds/min), 121,480,763 states left on queue. +Progress(38) at 2024-11-06 18:44:55: 5,696,139,104 states generated (28,997,271 s/min), 533,684,330 distinct states found (2,648,737 ds/min), 122,027,516 states left on queue. +Progress(38) at 2024-11-06 18:45:55: 5,724,868,902 states generated (28,729,798 s/min), 536,316,715 distinct states found (2,632,385 ds/min), 122,548,317 states left on queue. +Progress(38) at 2024-11-06 18:46:55: 5,753,438,871 states generated (28,569,969 s/min), 539,001,028 distinct states found (2,684,313 ds/min), 123,041,578 states left on queue. +Progress(38) at 2024-11-06 18:47:55: 5,782,391,778 states generated (28,952,907 s/min), 541,537,259 distinct states found (2,536,231 ds/min), 123,436,184 states left on queue. +Progress(38) at 2024-11-06 18:48:55: 5,811,823,996 states generated (29,432,218 s/min), 543,896,432 distinct states found (2,359,173 ds/min), 123,698,698 states left on queue. +Progress(38) at 2024-11-06 18:49:55: 5,841,258,941 states generated (29,434,945 s/min), 546,273,191 distinct states found (2,376,759 ds/min), 124,012,754 states left on queue. +Progress(38) at 2024-11-06 18:50:55: 5,870,667,995 states generated (29,409,054 s/min), 548,835,686 distinct states found (2,562,495 ds/min), 124,450,482 states left on queue. +Progress(38) at 2024-11-06 18:51:55: 5,900,038,718 states generated (29,370,723 s/min), 551,304,457 distinct states found (2,468,771 ds/min), 124,805,220 states left on queue. +Progress(38) at 2024-11-06 18:52:55: 5,929,442,421 states generated (29,403,703 s/min), 553,776,296 distinct states found (2,471,839 ds/min), 125,178,608 states left on queue. +Progress(38) at 2024-11-06 18:53:55: 5,958,838,496 states generated (29,396,075 s/min), 556,289,762 distinct states found (2,513,466 ds/min), 125,588,158 states left on queue. +Progress(38) at 2024-11-06 18:54:55: 5,988,187,325 states generated (29,348,829 s/min), 558,898,224 distinct states found (2,608,462 ds/min), 126,074,377 states left on queue. +Progress(38) at 2024-11-06 18:55:55: 6,017,546,111 states generated (29,358,786 s/min), 561,530,468 distinct states found (2,632,244 ds/min), 126,579,784 states left on queue. +Progress(38) at 2024-11-06 18:56:55: 6,046,777,143 states generated (29,231,032 s/min), 564,182,546 distinct states found (2,652,078 ds/min), 127,037,883 states left on queue. +Progress(39) at 2024-11-06 18:57:55: 6,076,111,479 states generated (29,334,336 s/min), 566,509,898 distinct states found (2,327,352 ds/min), 127,319,036 states left on queue. +Progress(39) at 2024-11-06 18:58:55: 6,105,215,668 states generated (29,104,189 s/min), 569,000,954 distinct states found (2,491,056 ds/min), 127,724,185 states left on queue. +Progress(39) at 2024-11-06 18:59:55: 6,134,619,650 states generated (29,403,982 s/min), 571,444,199 distinct states found (2,443,245 ds/min), 128,083,849 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 19:00:55) +Progress(39) at 2024-11-06 19:00:55: 6,164,303,226 states generated (29,683,576 s/min), 574,046,920 distinct states found (2,602,721 ds/min), 128,537,330 states left on queue. +Progress(39) at 2024-11-06 19:01:55: 6,193,710,515 states generated (29,407,289 s/min), 576,294,161 distinct states found (2,247,241 ds/min), 128,749,186 states left on queue. +Progress(39) at 2024-11-06 19:02:55: 6,223,050,437 states generated (29,339,922 s/min), 578,840,811 distinct states found (2,546,650 ds/min), 129,198,375 states left on queue. +Progress(39) at 2024-11-06 19:03:55: 6,252,273,339 states generated (29,222,902 s/min), 581,530,481 distinct states found (2,689,670 ds/min), 129,745,195 states left on queue. +Progress(39) at 2024-11-06 19:04:55: 6,281,535,213 states generated (29,261,874 s/min), 584,206,969 distinct states found (2,676,488 ds/min), 130,306,182 states left on queue. +Progress(39) at 2024-11-06 19:05:55: 6,310,569,147 states generated (29,033,934 s/min), 587,031,959 distinct states found (2,824,990 ds/min), 130,922,629 states left on queue. +Progress(39) at 2024-11-06 19:06:55: 6,339,951,741 states generated (29,382,594 s/min), 589,709,668 distinct states found (2,677,709 ds/min), 131,483,555 states left on queue. +Progress(39) at 2024-11-06 19:07:55: 6,369,354,481 states generated (29,402,740 s/min), 591,964,654 distinct states found (2,254,986 ds/min), 131,688,532 states left on queue. +Progress(39) at 2024-11-06 19:08:55: 6,398,254,591 states generated (28,900,110 s/min), 594,604,924 distinct states found (2,640,270 ds/min), 132,195,069 states left on queue. +Progress(39) at 2024-11-06 19:09:55: 6,427,422,756 states generated (29,168,165 s/min), 597,059,083 distinct states found (2,454,159 ds/min), 132,571,626 states left on queue. +Progress(39) at 2024-11-06 19:10:55: 6,456,469,721 states generated (29,046,965 s/min), 599,400,317 distinct states found (2,341,234 ds/min), 132,826,474 states left on queue. +Progress(39) at 2024-11-06 19:11:55: 6,485,733,442 states generated (29,263,721 s/min), 602,040,336 distinct states found (2,640,019 ds/min), 133,286,664 states left on queue. +Progress(39) at 2024-11-06 19:12:55: 6,515,001,998 states generated (29,268,556 s/min), 604,003,958 distinct states found (1,963,622 ds/min), 133,255,252 states left on queue. +Progress(39) at 2024-11-06 19:13:55: 6,544,172,146 states generated (29,170,148 s/min), 606,473,164 distinct states found (2,469,206 ds/min), 133,627,323 states left on queue. +Progress(39) at 2024-11-06 19:14:55: 6,572,975,355 states generated (28,803,209 s/min), 609,043,606 distinct states found (2,570,442 ds/min), 134,023,262 states left on queue. +Progress(39) at 2024-11-06 19:15:55: 6,602,534,934 states generated (29,559,579 s/min), 611,212,652 distinct states found (2,169,046 ds/min), 134,205,070 states left on queue. +Progress(39) at 2024-11-06 19:16:55: 6,632,044,851 states generated (29,509,917 s/min), 613,377,378 distinct states found (2,164,726 ds/min), 134,360,577 states left on queue. +Progress(39) at 2024-11-06 19:17:55: 6,661,465,356 states generated (29,420,505 s/min), 615,729,605 distinct states found (2,352,227 ds/min), 134,679,148 states left on queue. +Progress(39) at 2024-11-06 19:18:55: 6,690,848,776 states generated (29,383,420 s/min), 618,034,126 distinct states found (2,304,521 ds/min), 134,989,999 states left on queue. +Progress(39) at 2024-11-06 19:19:55: 6,720,362,641 states generated (29,513,865 s/min), 620,264,990 distinct states found (2,230,864 ds/min), 135,213,527 states left on queue. +Progress(39) at 2024-11-06 19:20:55: 6,749,995,972 states generated (29,633,331 s/min), 622,424,423 distinct states found (2,159,433 ds/min), 135,336,269 states left on queue. +Progress(39) at 2024-11-06 19:21:55: 6,779,641,479 states generated (29,645,507 s/min), 624,953,002 distinct states found (2,528,579 ds/min), 135,781,717 states left on queue. +Progress(39) at 2024-11-06 19:22:55: 6,809,496,805 states generated (29,855,326 s/min), 627,297,563 distinct states found (2,344,561 ds/min), 136,040,988 states left on queue. +Progress(39) at 2024-11-06 19:23:55: 6,839,096,708 states generated (29,599,903 s/min), 629,464,688 distinct states found (2,167,125 ds/min), 136,210,971 states left on queue. +Progress(39) at 2024-11-06 19:24:55: 6,868,614,311 states generated (29,517,603 s/min), 631,704,627 distinct states found (2,239,939 ds/min), 136,469,731 states left on queue. +Progress(39) at 2024-11-06 19:25:55: 6,897,932,930 states generated (29,318,619 s/min), 633,961,042 distinct states found (2,256,415 ds/min), 136,714,912 states left on queue. +Progress(39) at 2024-11-06 19:26:55: 6,927,200,602 states generated (29,267,672 s/min), 636,414,800 distinct states found (2,453,758 ds/min), 137,101,547 states left on queue. +Progress(39) at 2024-11-06 19:27:55: 6,956,755,074 states generated (29,554,472 s/min), 638,616,489 distinct states found (2,201,689 ds/min), 137,285,238 states left on queue. +Progress(39) at 2024-11-06 19:28:55: 6,985,926,285 states generated (29,171,211 s/min), 640,970,274 distinct states found (2,353,785 ds/min), 137,592,586 states left on queue. +Progress(39) at 2024-11-06 19:29:55: 7,015,240,294 states generated (29,314,009 s/min), 643,310,280 distinct states found (2,340,006 ds/min), 137,914,322 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 19:30:56) +Progress(39) at 2024-11-06 19:30:56: 7,045,112,039 states generated (29,871,745 s/min), 645,650,251 distinct states found (2,339,971 ds/min), 138,248,533 states left on queue. +Progress(39) at 2024-11-06 19:31:56: 7,074,347,122 states generated (29,235,083 s/min), 648,286,341 distinct states found (2,636,090 ds/min), 138,800,606 states left on queue. +Progress(39) at 2024-11-06 19:32:56: 7,103,701,427 states generated (29,354,305 s/min), 650,776,754 distinct states found (2,490,413 ds/min), 139,200,935 states left on queue. +Progress(39) at 2024-11-06 19:33:56: 7,133,125,574 states generated (29,424,147 s/min), 653,222,778 distinct states found (2,446,024 ds/min), 139,553,972 states left on queue. +Progress(39) at 2024-11-06 19:34:56: 7,162,393,954 states generated (29,268,380 s/min), 655,812,815 distinct states found (2,590,037 ds/min), 140,051,736 states left on queue. +Progress(39) at 2024-11-06 19:35:56: 7,191,614,309 states generated (29,220,355 s/min), 658,388,779 distinct states found (2,575,964 ds/min), 140,550,430 states left on queue. +Progress(39) at 2024-11-06 19:36:56: 7,220,841,977 states generated (29,227,668 s/min), 660,885,901 distinct states found (2,497,122 ds/min), 140,973,038 states left on queue. +Progress(39) at 2024-11-06 19:37:56: 7,250,020,241 states generated (29,178,264 s/min), 663,335,701 distinct states found (2,449,800 ds/min), 141,327,800 states left on queue. +Progress(39) at 2024-11-06 19:38:56: 7,279,545,923 states generated (29,525,682 s/min), 665,706,252 distinct states found (2,370,551 ds/min), 141,666,628 states left on queue. +Progress(39) at 2024-11-06 19:39:56: 7,308,806,585 states generated (29,260,662 s/min), 668,059,763 distinct states found (2,353,511 ds/min), 141,985,139 states left on queue. +Progress(39) at 2024-11-06 19:40:56: 7,338,028,888 states generated (29,222,303 s/min), 670,241,848 distinct states found (2,182,085 ds/min), 142,169,842 states left on queue. +Progress(39) at 2024-11-06 19:41:56: 7,367,241,753 states generated (29,212,865 s/min), 672,613,255 distinct states found (2,371,407 ds/min), 142,507,724 states left on queue. +Progress(39) at 2024-11-06 19:42:56: 7,396,269,434 states generated (29,027,681 s/min), 675,112,517 distinct states found (2,499,262 ds/min), 142,941,967 states left on queue. +Progress(39) at 2024-11-06 19:43:56: 7,425,237,701 states generated (28,968,267 s/min), 677,646,850 distinct states found (2,534,333 ds/min), 143,388,301 states left on queue. +Progress(39) at 2024-11-06 19:44:56: 7,453,929,312 states generated (28,691,611 s/min), 680,183,486 distinct states found (2,536,636 ds/min), 143,823,998 states left on queue. +Progress(39) at 2024-11-06 19:45:56: 7,482,605,282 states generated (28,675,970 s/min), 682,751,269 distinct states found (2,567,783 ds/min), 144,211,694 states left on queue. +Progress(39) at 2024-11-06 19:46:56: 7,511,402,194 states generated (28,796,912 s/min), 685,177,338 distinct states found (2,426,069 ds/min), 144,502,576 states left on queue. +Progress(39) at 2024-11-06 19:47:56: 7,540,667,315 states generated (29,265,121 s/min), 687,470,422 distinct states found (2,293,084 ds/min), 144,717,485 states left on queue. +Progress(39) at 2024-11-06 19:48:56: 7,570,065,371 states generated (29,398,056 s/min), 689,724,172 distinct states found (2,253,750 ds/min), 144,895,541 states left on queue. +Progress(39) at 2024-11-06 19:49:56: 7,599,596,791 states generated (29,531,420 s/min), 692,064,101 distinct states found (2,339,929 ds/min), 145,171,911 states left on queue. +Progress(39) at 2024-11-06 19:50:56: 7,629,011,363 states generated (29,414,572 s/min), 694,540,161 distinct states found (2,476,060 ds/min), 145,540,423 states left on queue. +Progress(39) at 2024-11-06 19:51:56: 7,658,453,965 states generated (29,442,602 s/min), 696,912,122 distinct states found (2,371,961 ds/min), 145,809,567 states left on queue. +Progress(39) at 2024-11-06 19:52:56: 7,687,913,137 states generated (29,459,172 s/min), 699,240,630 distinct states found (2,328,508 ds/min), 146,098,273 states left on queue. +Progress(39) at 2024-11-06 19:53:56: 7,717,161,254 states generated (29,248,117 s/min), 701,789,915 distinct states found (2,549,285 ds/min), 146,502,121 states left on queue. +Progress(39) at 2024-11-06 19:54:56: 7,746,587,948 states generated (29,426,694 s/min), 704,037,014 distinct states found (2,247,099 ds/min), 146,684,369 states left on queue. +Progress(39) at 2024-11-06 19:55:56: 7,775,767,241 states generated (29,179,293 s/min), 706,750,225 distinct states found (2,713,211 ds/min), 147,270,858 states left on queue. +Progress(39) at 2024-11-06 19:56:56: 7,805,143,313 states generated (29,376,072 s/min), 709,214,940 distinct states found (2,464,715 ds/min), 147,627,166 states left on queue. +Progress(39) at 2024-11-06 19:57:56: 7,834,403,478 states generated (29,260,165 s/min), 711,759,633 distinct states found (2,544,693 ds/min), 147,996,842 states left on queue. +Progress(40) at 2024-11-06 19:58:56: 7,863,785,909 states generated (29,382,431 s/min), 713,915,903 distinct states found (2,156,270 ds/min), 148,107,480 states left on queue. +Progress(40) at 2024-11-06 19:59:56: 7,892,661,923 states generated (28,876,014 s/min), 716,529,052 distinct states found (2,613,149 ds/min), 148,615,346 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 20:00:57) +Progress(40) at 2024-11-06 20:00:57: 7,922,354,868 states generated (29,692,945 s/min), 718,724,840 distinct states found (2,195,788 ds/min), 148,760,464 states left on queue. +Progress(40) at 2024-11-06 20:01:57: 7,951,821,345 states generated (29,466,477 s/min), 721,199,790 distinct states found (2,474,950 ds/min), 149,133,458 states left on queue. +Progress(40) at 2024-11-06 20:02:57: 7,981,212,562 states generated (29,391,217 s/min), 723,637,084 distinct states found (2,437,294 ds/min), 149,453,388 states left on queue. +Progress(40) at 2024-11-06 20:03:57: 8,010,639,344 states generated (29,426,782 s/min), 725,776,597 distinct states found (2,139,513 ds/min), 149,580,205 states left on queue. +Progress(40) at 2024-11-06 20:04:57: 8,039,970,078 states generated (29,330,734 s/min), 728,145,896 distinct states found (2,369,299 ds/min), 149,873,787 states left on queue. +Progress(40) at 2024-11-06 20:05:57: 8,069,221,501 states generated (29,251,423 s/min), 730,835,980 distinct states found (2,690,084 ds/min), 150,431,663 states left on queue. +Progress(40) at 2024-11-06 20:06:57: 8,098,568,645 states generated (29,347,144 s/min), 733,266,238 distinct states found (2,430,258 ds/min), 150,772,190 states left on queue. +Progress(40) at 2024-11-06 20:07:57: 8,127,646,970 states generated (29,078,325 s/min), 736,001,441 distinct states found (2,735,203 ds/min), 151,368,297 states left on queue. +Progress(40) at 2024-11-06 20:08:57: 8,156,755,007 states generated (29,108,037 s/min), 738,759,675 distinct states found (2,758,234 ds/min), 151,912,929 states left on queue. +Progress(40) at 2024-11-06 20:09:57: 8,186,234,810 states generated (29,479,803 s/min), 741,336,146 distinct states found (2,576,471 ds/min), 152,376,828 states left on queue. +Progress(40) at 2024-11-06 20:10:57: 8,215,641,994 states generated (29,407,184 s/min), 743,647,353 distinct states found (2,311,207 ds/min), 152,617,899 states left on queue. +Progress(40) at 2024-11-06 20:11:57: 8,244,746,445 states generated (29,104,451 s/min), 746,080,007 distinct states found (2,432,654 ds/min), 152,939,104 states left on queue. +Progress(40) at 2024-11-06 20:12:57: 8,273,514,095 states generated (28,767,650 s/min), 748,726,701 distinct states found (2,646,694 ds/min), 153,445,645 states left on queue. +Progress(40) at 2024-11-06 20:13:57: 8,302,647,011 states generated (29,132,916 s/min), 751,041,420 distinct states found (2,314,719 ds/min), 153,711,631 states left on queue. +Progress(40) at 2024-11-06 20:14:57: 8,331,785,512 states generated (29,138,501 s/min), 753,262,324 distinct states found (2,220,904 ds/min), 153,861,206 states left on queue. +Progress(40) at 2024-11-06 20:15:57: 8,361,058,813 states generated (29,273,301 s/min), 755,881,803 distinct states found (2,619,479 ds/min), 154,293,451 states left on queue. +Progress(40) at 2024-11-06 20:16:57: 8,390,323,842 states generated (29,265,029 s/min), 757,769,813 distinct states found (1,888,010 ds/min), 154,184,183 states left on queue. +Progress(40) at 2024-11-06 20:17:57: 8,419,579,524 states generated (29,255,682 s/min), 760,009,795 distinct states found (2,239,982 ds/min), 154,382,656 states left on queue. +Progress(40) at 2024-11-06 20:18:57: 8,448,394,343 states generated (28,814,819 s/min), 762,597,225 distinct states found (2,587,430 ds/min), 154,795,314 states left on queue. +Progress(40) at 2024-11-06 20:19:57: 8,477,530,142 states generated (29,135,799 s/min), 764,903,184 distinct states found (2,305,959 ds/min), 154,997,361 states left on queue. +Progress(40) at 2024-11-06 20:20:57: 8,507,035,930 states generated (29,505,788 s/min), 766,887,142 distinct states found (1,983,958 ds/min), 155,034,831 states left on queue. +Progress(40) at 2024-11-06 20:21:57: 8,536,505,703 states generated (29,469,773 s/min), 769,048,483 distinct states found (2,161,341 ds/min), 155,183,742 states left on queue. +Progress(40) at 2024-11-06 20:22:57: 8,565,867,584 states generated (29,361,881 s/min), 771,258,076 distinct states found (2,209,593 ds/min), 155,385,262 states left on queue. +Progress(40) at 2024-11-06 20:23:57: 8,595,185,764 states generated (29,318,180 s/min), 773,454,985 distinct states found (2,196,909 ds/min), 155,614,111 states left on queue. +Progress(40) at 2024-11-06 20:24:57: 8,624,496,269 states generated (29,310,505 s/min), 775,619,630 distinct states found (2,164,645 ds/min), 155,798,174 states left on queue. +Progress(40) at 2024-11-06 20:25:57: 8,654,080,073 states generated (29,583,804 s/min), 777,637,410 distinct states found (2,017,780 ds/min), 155,782,045 states left on queue. +Progress(40) at 2024-11-06 20:26:57: 8,683,722,009 states generated (29,641,936 s/min), 779,940,399 distinct states found (2,302,989 ds/min), 156,073,330 states left on queue. +Progress(40) at 2024-11-06 20:27:57: 8,713,410,725 states generated (29,688,716 s/min), 782,406,987 distinct states found (2,466,588 ds/min), 156,445,902 states left on queue. +Progress(40) at 2024-11-06 20:28:57: 8,743,158,002 states generated (29,747,277 s/min), 784,542,609 distinct states found (2,135,622 ds/min), 156,539,841 states left on queue. +Progress(40) at 2024-11-06 20:29:57: 8,772,688,809 states generated (29,530,807 s/min), 786,583,608 distinct states found (2,040,999 ds/min), 156,630,041 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 20:30:57) +Progress(40) at 2024-11-06 20:30:57: 8,802,299,219 states generated (29,610,410 s/min), 788,709,007 distinct states found (2,125,399 ds/min), 156,780,966 states left on queue. +Progress(40) at 2024-11-06 20:31:57: 8,831,545,663 states generated (29,246,444 s/min), 790,874,634 distinct states found (2,165,627 ds/min), 156,943,688 states left on queue. +Progress(40) at 2024-11-06 20:32:57: 8,860,742,526 states generated (29,196,863 s/min), 793,218,612 distinct states found (2,343,978 ds/min), 157,247,738 states left on queue. +Progress(40) at 2024-11-06 20:33:57: 8,890,145,689 states generated (29,403,163 s/min), 795,347,746 distinct states found (2,129,134 ds/min), 157,376,715 states left on queue. +Progress(40) at 2024-11-06 20:34:57: 8,919,277,440 states generated (29,131,751 s/min), 797,557,991 distinct states found (2,210,245 ds/min), 157,566,508 states left on queue. +Progress(40) at 2024-11-06 20:35:57: 8,948,368,355 states generated (29,090,915 s/min), 799,870,441 distinct states found (2,312,450 ds/min), 157,825,337 states left on queue. +Progress(40) at 2024-11-06 20:36:57: 8,977,811,769 states generated (29,443,414 s/min), 801,992,418 distinct states found (2,121,977 ds/min), 158,015,008 states left on queue. +Progress(40) at 2024-11-06 20:37:57: 9,007,285,675 states generated (29,473,906 s/min), 804,250,024 distinct states found (2,257,606 ds/min), 158,295,507 states left on queue. +Progress(40) at 2024-11-06 20:38:57: 9,036,450,953 states generated (29,165,278 s/min), 806,795,860 distinct states found (2,545,836 ds/min), 158,767,907 states left on queue. +Progress(40) at 2024-11-06 20:39:57: 9,065,704,268 states generated (29,253,315 s/min), 809,198,438 distinct states found (2,402,578 ds/min), 159,105,121 states left on queue. +Progress(40) at 2024-11-06 20:40:57: 9,095,165,427 states generated (29,461,159 s/min), 811,512,584 distinct states found (2,314,146 ds/min), 159,345,117 states left on queue. +Progress(40) at 2024-11-06 20:41:57: 9,124,541,297 states generated (29,375,870 s/min), 813,905,920 distinct states found (2,393,336 ds/min), 159,672,325 states left on queue. +Progress(40) at 2024-11-06 20:42:57: 9,153,712,591 states generated (29,171,294 s/min), 816,392,570 distinct states found (2,486,650 ds/min), 160,082,547 states left on queue. +Progress(40) at 2024-11-06 20:43:57: 9,182,920,866 states generated (29,208,275 s/min), 818,845,538 distinct states found (2,452,968 ds/min), 160,476,056 states left on queue. +Progress(40) at 2024-11-06 20:44:57: 9,212,093,614 states generated (29,172,748 s/min), 821,212,595 distinct states found (2,367,057 ds/min), 160,787,698 states left on queue. +Progress(40) at 2024-11-06 20:45:57: 9,241,177,362 states generated (29,083,748 s/min), 823,731,111 distinct states found (2,518,516 ds/min), 161,227,975 states left on queue. +Progress(40) at 2024-11-06 20:46:57: 9,270,666,448 states generated (29,489,086 s/min), 825,877,262 distinct states found (2,146,151 ds/min), 161,339,209 states left on queue. +Progress(40) at 2024-11-06 20:47:57: 9,299,985,513 states generated (29,319,065 s/min), 828,195,512 distinct states found (2,318,250 ds/min), 161,644,069 states left on queue. +Progress(40) at 2024-11-06 20:48:57: 9,329,155,005 states generated (29,169,492 s/min), 830,386,518 distinct states found (2,191,006 ds/min), 161,807,802 states left on queue. +Progress(40) at 2024-11-06 20:49:57: 9,358,433,771 states generated (29,278,766 s/min), 832,419,931 distinct states found (2,033,413 ds/min), 161,882,018 states left on queue. +Progress(40) at 2024-11-06 20:50:57: 9,387,665,287 states generated (29,231,516 s/min), 834,751,267 distinct states found (2,331,336 ds/min), 162,183,217 states left on queue. +Progress(40) at 2024-11-06 20:51:57: 9,416,697,647 states generated (29,032,360 s/min), 837,127,657 distinct states found (2,376,390 ds/min), 162,511,558 states left on queue. +Progress(40) at 2024-11-06 20:52:57: 9,445,747,666 states generated (29,050,019 s/min), 839,556,372 distinct states found (2,428,715 ds/min), 162,873,418 states left on queue. +Progress(40) at 2024-11-06 20:53:57: 9,474,599,613 states generated (28,851,947 s/min), 841,985,780 distinct states found (2,429,408 ds/min), 163,231,531 states left on queue. +Progress(40) at 2024-11-06 20:54:57: 9,503,408,525 states generated (28,808,912 s/min), 844,368,680 distinct states found (2,382,900 ds/min), 163,533,407 states left on queue. +Progress(40) at 2024-11-06 20:55:57: 9,532,128,492 states generated (28,719,967 s/min), 846,804,519 distinct states found (2,435,839 ds/min), 163,787,695 states left on queue. +Progress(40) at 2024-11-06 20:56:57: 9,560,935,598 states generated (28,807,106 s/min), 849,075,143 distinct states found (2,270,624 ds/min), 163,946,240 states left on queue. +Progress(40) at 2024-11-06 20:57:57: 9,590,127,374 states generated (29,191,776 s/min), 851,260,378 distinct states found (2,185,235 ds/min), 164,077,372 states left on queue. +Progress(40) at 2024-11-06 20:58:57: 9,619,514,341 states generated (29,386,967 s/min), 853,352,738 distinct states found (2,092,360 ds/min), 164,118,186 states left on queue. +Progress(40) at 2024-11-06 20:59:57: 9,648,985,302 states generated (29,470,961 s/min), 855,543,408 distinct states found (2,190,670 ds/min), 164,279,076 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 21:00:58) +Progress(40) at 2024-11-06 21:00:58: 9,678,677,722 states generated (29,692,420 s/min), 857,894,775 distinct states found (2,351,367 ds/min), 164,516,395 states left on queue. +Progress(40) at 2024-11-06 21:01:58: 9,708,095,509 states generated (29,417,787 s/min), 860,383,155 distinct states found (2,488,380 ds/min), 164,898,153 states left on queue. +Progress(40) at 2024-11-06 21:02:58: 9,737,488,378 states generated (29,392,869 s/min), 862,497,194 distinct states found (2,114,039 ds/min), 164,966,010 states left on queue. +Progress(40) at 2024-11-06 21:03:58: 9,766,895,552 states generated (29,407,174 s/min), 864,819,525 distinct states found (2,322,331 ds/min), 165,232,701 states left on queue. +Progress(40) at 2024-11-06 21:04:58: 9,796,208,300 states generated (29,312,748 s/min), 867,276,841 distinct states found (2,457,316 ds/min), 165,568,613 states left on queue. +Progress(40) at 2024-11-06 21:05:58: 9,825,603,726 states generated (29,395,426 s/min), 869,434,526 distinct states found (2,157,685 ds/min), 165,685,610 states left on queue. +Progress(40) at 2024-11-06 21:06:58: 9,854,789,772 states generated (29,186,046 s/min), 871,934,034 distinct states found (2,499,508 ds/min), 166,084,000 states left on queue. +Progress(40) at 2024-11-06 21:07:58: 9,884,028,390 states generated (29,238,618 s/min), 874,443,659 distinct states found (2,509,625 ds/min), 166,483,652 states left on queue. +Progress(40) at 2024-11-06 21:08:58: 9,913,377,669 states generated (29,349,279 s/min), 876,803,913 distinct states found (2,360,254 ds/min), 166,740,702 states left on queue. +Progress(40) at 2024-11-06 21:09:58: 9,942,721,749 states generated (29,344,080 s/min), 879,187,270 distinct states found (2,383,357 ds/min), 166,953,562 states left on queue. +Progress(41) at 2024-11-06 21:10:58: 9,972,078,704 states generated (29,356,955 s/min), 881,233,361 distinct states found (2,046,091 ds/min), 166,999,841 states left on queue. +Progress(41) at 2024-11-06 21:11:58: 10,000,914,792 states generated (28,836,088 s/min), 883,811,441 distinct states found (2,578,080 ds/min), 167,466,583 states left on queue. +Progress(41) at 2024-11-06 21:12:58: 10,030,210,434 states generated (29,295,642 s/min), 885,899,950 distinct states found (2,088,509 ds/min), 167,531,826 states left on queue. +Progress(41) at 2024-11-06 21:13:58: 10,059,587,070 states generated (29,376,636 s/min), 888,188,669 distinct states found (2,288,719 ds/min), 167,753,242 states left on queue. +Progress(41) at 2024-11-06 21:14:58: 10,089,078,901 states generated (29,491,831 s/min), 890,649,997 distinct states found (2,461,328 ds/min), 168,098,890 states left on queue. +Progress(41) at 2024-11-06 21:15:58: 10,118,348,352 states generated (29,269,451 s/min), 892,695,892 distinct states found (2,045,895 ds/min), 168,141,532 states left on queue. +Progress(41) at 2024-11-06 21:16:58: 10,147,644,676 states generated (29,296,324 s/min), 894,823,997 distinct states found (2,128,105 ds/min), 168,231,032 states left on queue. +Progress(41) at 2024-11-06 21:17:58: 10,176,967,773 states generated (29,323,097 s/min), 897,225,523 distinct states found (2,401,526 ds/min), 168,555,740 states left on queue. +Progress(41) at 2024-11-06 21:18:58: 10,206,275,174 states generated (29,307,401 s/min), 899,814,626 distinct states found (2,589,103 ds/min), 169,020,971 states left on queue. +Progress(41) at 2024-11-06 21:19:58: 10,235,593,993 states generated (29,318,819 s/min), 902,141,356 distinct states found (2,326,730 ds/min), 169,267,251 states left on queue. +Progress(41) at 2024-11-06 21:20:58: 10,264,799,049 states generated (29,205,056 s/min), 904,746,333 distinct states found (2,604,977 ds/min), 169,758,459 states left on queue. +Progress(41) at 2024-11-06 21:21:58: 10,293,910,586 states generated (29,111,537 s/min), 907,433,182 distinct states found (2,686,849 ds/min), 170,277,176 states left on queue. +Progress(41) at 2024-11-06 21:22:58: 10,323,190,750 states generated (29,280,164 s/min), 910,052,108 distinct states found (2,618,926 ds/min), 170,695,212 states left on queue. +Progress(41) at 2024-11-06 21:23:58: 10,352,580,182 states generated (29,389,432 s/min), 912,516,064 distinct states found (2,463,956 ds/min), 171,083,771 states left on queue. +Progress(41) at 2024-11-06 21:24:58: 10,381,951,479 states generated (29,371,297 s/min), 914,781,443 distinct states found (2,265,379 ds/min), 171,281,545 states left on queue. +Progress(41) at 2024-11-06 21:25:58: 10,411,026,945 states generated (29,075,466 s/min), 917,078,052 distinct states found (2,296,609 ds/min), 171,498,613 states left on queue. +Progress(41) at 2024-11-06 21:26:58: 10,439,904,441 states generated (28,877,496 s/min), 919,547,808 distinct states found (2,469,756 ds/min), 171,860,589 states left on queue. +Progress(41) at 2024-11-06 21:27:58: 10,469,008,600 states generated (29,104,159 s/min), 921,912,547 distinct states found (2,364,739 ds/min), 172,121,551 states left on queue. +Progress(41) at 2024-11-06 21:28:58: 10,497,834,986 states generated (28,826,386 s/min), 924,235,840 distinct states found (2,323,293 ds/min), 172,353,661 states left on queue. +Progress(41) at 2024-11-06 21:29:58: 10,527,064,696 states generated (29,229,710 s/min), 926,456,744 distinct states found (2,220,904 ds/min), 172,508,439 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 21:30:59) +Progress(41) at 2024-11-06 21:30:59: 10,556,579,142 states generated (29,514,446 s/min), 928,988,872 distinct states found (2,532,128 ds/min), 172,833,183 states left on queue. +Progress(41) at 2024-11-06 21:31:59: 10,585,719,909 states generated (29,140,767 s/min), 930,745,149 distinct states found (1,756,277 ds/min), 172,622,496 states left on queue. +Progress(41) at 2024-11-06 21:32:59: 10,614,881,115 states generated (29,161,206 s/min), 932,948,083 distinct states found (2,202,934 ds/min), 172,792,818 states left on queue. +Progress(41) at 2024-11-06 21:33:59: 10,643,693,909 states generated (28,812,794 s/min), 935,441,862 distinct states found (2,493,779 ds/min), 173,119,721 states left on queue. +Progress(41) at 2024-11-06 21:34:59: 10,672,671,166 states generated (28,977,257 s/min), 937,653,961 distinct states found (2,212,099 ds/min), 173,216,843 states left on queue. +Progress(41) at 2024-11-06 21:35:59: 10,702,072,440 states generated (29,401,274 s/min), 939,638,920 distinct states found (1,984,959 ds/min), 173,254,076 states left on queue. +Progress(41) at 2024-11-06 21:36:59: 10,731,415,292 states generated (29,342,852 s/min), 941,583,653 distinct states found (1,944,733 ds/min), 173,229,968 states left on queue. +Progress(41) at 2024-11-06 21:37:59: 10,760,802,656 states generated (29,387,364 s/min), 943,770,610 distinct states found (2,186,957 ds/min), 173,412,799 states left on queue. +Progress(41) at 2024-11-06 21:38:59: 10,789,961,996 states generated (29,159,340 s/min), 945,790,519 distinct states found (2,019,909 ds/min), 173,482,204 states left on queue. +Progress(41) at 2024-11-06 21:39:59: 10,819,303,972 states generated (29,341,976 s/min), 947,902,156 distinct states found (2,111,637 ds/min), 173,640,941 states left on queue. +Progress(41) at 2024-11-06 21:40:59: 10,848,636,471 states generated (29,332,499 s/min), 949,908,145 distinct states found (2,005,989 ds/min), 173,684,074 states left on queue. +Progress(41) at 2024-11-06 21:41:59: 10,878,207,345 states generated (29,570,874 s/min), 951,870,784 distinct states found (1,962,639 ds/min), 173,648,255 states left on queue. +Progress(41) at 2024-11-06 21:42:59: 10,907,777,091 states generated (29,569,746 s/min), 954,123,321 distinct states found (2,252,537 ds/min), 173,881,583 states left on queue. +Progress(41) at 2024-11-06 21:43:59: 10,937,383,465 states generated (29,606,374 s/min), 956,486,701 distinct states found (2,363,380 ds/min), 174,173,694 states left on queue. +Progress(41) at 2024-11-06 21:44:59: 10,967,070,713 states generated (29,687,248 s/min), 958,539,717 distinct states found (2,053,016 ds/min), 174,194,592 states left on queue. +Progress(41) at 2024-11-06 21:45:59: 10,996,524,132 states generated (29,453,419 s/min), 960,439,766 distinct states found (1,900,049 ds/min), 174,165,777 states left on queue. +Progress(41) at 2024-11-06 21:46:59: 11,025,919,452 states generated (29,395,320 s/min), 962,518,661 distinct states found (2,078,895 ds/min), 174,284,642 states left on queue. +Progress(41) at 2024-11-06 21:47:59: 11,055,087,136 states generated (29,167,684 s/min), 964,440,130 distinct states found (1,921,469 ds/min), 174,253,951 states left on queue. +Progress(41) at 2024-11-06 21:48:59: 11,084,346,164 states generated (29,259,028 s/min), 966,652,841 distinct states found (2,212,711 ds/min), 174,452,762 states left on queue. +Progress(41) at 2024-11-06 21:49:59: 11,113,503,996 states generated (29,157,832 s/min), 968,786,590 distinct states found (2,133,749 ds/min), 174,578,147 states left on queue. +Progress(41) at 2024-11-06 21:50:59: 11,142,862,327 states generated (29,358,331 s/min), 970,780,918 distinct states found (1,994,328 ds/min), 174,585,050 states left on queue. +Progress(41) at 2024-11-06 21:51:59: 11,171,907,560 states generated (29,045,233 s/min), 972,924,432 distinct states found (2,143,514 ds/min), 174,718,189 states left on queue. +Progress(41) at 2024-11-06 21:52:59: 11,201,055,602 states generated (29,148,042 s/min), 975,106,131 distinct states found (2,181,699 ds/min), 174,874,035 states left on queue. +Progress(41) at 2024-11-06 21:53:59: 11,230,576,268 states generated (29,520,666 s/min), 977,176,048 distinct states found (2,069,917 ds/min), 175,042,666 states left on queue. +Progress(41) at 2024-11-06 21:54:59: 11,259,928,257 states generated (29,351,989 s/min), 979,337,351 distinct states found (2,161,303 ds/min), 175,248,665 states left on queue. +Progress(41) at 2024-11-06 21:55:59: 11,289,190,366 states generated (29,262,109 s/min), 981,837,130 distinct states found (2,499,779 ds/min), 175,680,736 states left on queue. +Progress(41) at 2024-11-06 21:56:59: 11,318,399,828 states generated (29,209,462 s/min), 984,112,195 distinct states found (2,275,065 ds/min), 175,913,580 states left on queue. +Progress(41) at 2024-11-06 21:57:59: 11,347,862,845 states generated (29,463,017 s/min), 986,368,069 distinct states found (2,255,874 ds/min), 176,126,523 states left on queue. +Progress(41) at 2024-11-06 21:58:59: 11,377,318,937 states generated (29,456,092 s/min), 988,548,686 distinct states found (2,180,617 ds/min), 176,253,552 states left on queue. +Progress(41) at 2024-11-06 21:59:59: 11,406,551,913 states generated (29,232,976 s/min), 990,875,071 distinct states found (2,326,385 ds/min), 176,528,465 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 22:00:59) +Progress(41) at 2024-11-06 22:00:59: 11,436,006,666 states generated (29,454,753 s/min), 993,234,999 distinct states found (2,359,928 ds/min), 176,816,755 states left on queue. +Progress(41) at 2024-11-06 22:01:59: 11,465,207,151 states generated (29,200,485 s/min), 995,557,179 distinct states found (2,322,180 ds/min), 177,094,397 states left on queue. +Progress(41) at 2024-11-06 22:02:59: 11,494,298,575 states generated (29,091,424 s/min), 997,927,812 distinct states found (2,370,633 ds/min), 177,411,890 states left on queue. +Progress(41) at 2024-11-06 22:03:59: 11,523,576,632 states generated (29,278,057 s/min), 1,000,196,030 distinct states found (2,268,218 ds/min), 177,640,656 states left on queue. +Progress(41) at 2024-11-06 22:04:59: 11,552,734,483 states generated (29,157,851 s/min), 1,002,452,277 distinct states found (2,256,247 ds/min), 177,827,247 states left on queue. +Progress(41) at 2024-11-06 22:05:59: 11,582,200,298 states generated (29,465,815 s/min), 1,004,593,818 distinct states found (2,141,541 ds/min), 177,983,707 states left on queue. +Progress(41) at 2024-11-06 22:06:59: 11,611,484,149 states generated (29,283,851 s/min), 1,006,774,383 distinct states found (2,180,565 ds/min), 178,161,577 states left on queue. +Progress(41) at 2024-11-06 22:07:59: 11,640,449,232 states generated (28,965,083 s/min), 1,008,870,356 distinct states found (2,095,973 ds/min), 178,245,657 states left on queue. +Progress(41) at 2024-11-06 22:08:59: 11,669,695,402 states generated (29,246,170 s/min), 1,010,743,262 distinct states found (1,872,906 ds/min), 178,199,630 states left on queue. +Progress(41) at 2024-11-06 22:09:59: 11,698,855,657 states generated (29,160,255 s/min), 1,012,993,163 distinct states found (2,249,901 ds/min), 178,433,806 states left on queue. +Progress(41) at 2024-11-06 22:10:59: 11,727,873,536 states generated (29,017,879 s/min), 1,015,222,628 distinct states found (2,229,465 ds/min), 178,645,315 states left on queue. +Progress(41) at 2024-11-06 22:11:59: 11,756,910,696 states generated (29,037,160 s/min), 1,017,493,811 distinct states found (2,271,183 ds/min), 178,885,854 states left on queue. +Progress(41) at 2024-11-06 22:12:59: 11,785,841,957 states generated (28,931,261 s/min), 1,019,798,730 distinct states found (2,304,919 ds/min), 179,138,831 states left on queue. +Progress(41) at 2024-11-06 22:13:59: 11,814,627,351 states generated (28,785,394 s/min), 1,022,115,935 distinct states found (2,317,205 ds/min), 179,401,355 states left on queue. +Progress(41) at 2024-11-06 22:14:59: 11,843,482,288 states generated (28,854,937 s/min), 1,024,372,991 distinct states found (2,257,056 ds/min), 179,570,167 states left on queue. +Progress(41) at 2024-11-06 22:15:59: 11,872,232,503 states generated (28,750,215 s/min), 1,026,655,919 distinct states found (2,282,928 ds/min), 179,704,400 states left on queue. +Progress(41) at 2024-11-06 22:16:59: 11,901,011,327 states generated (28,778,824 s/min), 1,028,780,151 distinct states found (2,124,232 ds/min), 179,744,822 states left on queue. +Progress(41) at 2024-11-06 22:17:59: 11,930,078,061 states generated (29,066,734 s/min), 1,030,863,673 distinct states found (2,083,522 ds/min), 179,790,662 states left on queue. +Progress(41) at 2024-11-06 22:18:59: 11,959,463,901 states generated (29,385,840 s/min), 1,032,840,344 distinct states found (1,976,671 ds/min), 179,738,442 states left on queue. +Progress(41) at 2024-11-06 22:19:59: 11,988,811,132 states generated (29,347,231 s/min), 1,034,897,049 distinct states found (2,056,705 ds/min), 179,788,782 states left on queue. +Progress(41) at 2024-11-06 22:20:59: 12,018,335,911 states generated (29,524,779 s/min), 1,037,158,579 distinct states found (2,261,530 ds/min), 179,978,226 states left on queue. +Progress(41) at 2024-11-06 22:21:59: 12,047,755,593 states generated (29,419,682 s/min), 1,039,437,623 distinct states found (2,279,044 ds/min), 180,177,371 states left on queue. +Progress(41) at 2024-11-06 22:22:59: 12,077,111,001 states generated (29,355,408 s/min), 1,041,672,961 distinct states found (2,235,338 ds/min), 180,336,777 states left on queue. +Progress(41) at 2024-11-06 22:23:59: 12,106,556,177 states generated (29,445,176 s/min), 1,043,675,880 distinct states found (2,002,919 ds/min), 180,345,759 states left on queue. +Progress(41) at 2024-11-06 22:24:59: 12,135,797,446 states generated (29,241,269 s/min), 1,045,966,606 distinct states found (2,290,726 ds/min), 180,552,887 states left on queue. +Progress(41) at 2024-11-06 22:25:59: 12,165,143,756 states generated (29,346,310 s/min), 1,048,373,643 distinct states found (2,407,037 ds/min), 180,860,142 states left on queue. +Progress(41) at 2024-11-06 22:26:59: 12,194,478,236 states generated (29,334,480 s/min), 1,050,403,560 distinct states found (2,029,917 ds/min), 180,873,811 states left on queue. +Progress(41) at 2024-11-06 22:27:59: 12,223,653,080 states generated (29,174,844 s/min), 1,052,798,502 distinct states found (2,394,942 ds/min), 181,184,025 states left on queue. +Progress(41) at 2024-11-06 22:28:59: 12,252,926,784 states generated (29,273,704 s/min), 1,055,243,990 distinct states found (2,445,488 ds/min), 181,542,525 states left on queue. +Progress(41) at 2024-11-06 22:29:59: 12,282,176,071 states generated (29,249,287 s/min), 1,057,488,489 distinct states found (2,244,499 ds/min), 181,704,266 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 22:31:00) +Progress(41) at 2024-11-06 22:31:00: 12,311,654,529 states generated (29,478,458 s/min), 1,059,789,296 distinct states found (2,300,807 ds/min), 181,875,392 states left on queue. +Progress(41) at 2024-11-06 22:32:00: 12,340,837,903 states generated (29,183,374 s/min), 1,061,857,294 distinct states found (2,067,998 ds/min), 181,860,690 states left on queue. +Progress(41) at 2024-11-06 22:33:00: 12,369,978,352 states generated (29,140,449 s/min), 1,063,943,173 distinct states found (2,085,879 ds/min), 181,951,091 states left on queue. +Progress(41) at 2024-11-06 22:34:00: 12,398,820,660 states generated (28,842,308 s/min), 1,066,384,327 distinct states found (2,441,154 ds/min), 182,284,376 states left on queue. +Progress(41) at 2024-11-06 22:35:00: 12,427,966,245 states generated (29,145,585 s/min), 1,068,376,116 distinct states found (1,991,789 ds/min), 182,275,982 states left on queue. +Progress(41) at 2024-11-06 22:36:00: 12,457,300,671 states generated (29,334,426 s/min), 1,070,596,949 distinct states found (2,220,833 ds/min), 182,442,278 states left on queue. +Progress(41) at 2024-11-06 22:37:00: 12,486,769,483 states generated (29,468,812 s/min), 1,072,968,640 distinct states found (2,371,691 ds/min), 182,718,485 states left on queue. +Progress(41) at 2024-11-06 22:38:00: 12,516,031,360 states generated (29,261,877 s/min), 1,075,001,378 distinct states found (2,032,738 ds/min), 182,729,966 states left on queue. +Progress(41) at 2024-11-06 22:39:00: 12,545,265,331 states generated (29,233,971 s/min), 1,076,880,794 distinct states found (1,879,416 ds/min), 182,634,798 states left on queue. +Progress(41) at 2024-11-06 22:40:00: 12,574,495,559 states generated (29,230,228 s/min), 1,079,123,856 distinct states found (2,243,062 ds/min), 182,812,322 states left on queue. +Progress(41) at 2024-11-06 22:41:00: 12,603,757,387 states generated (29,261,828 s/min), 1,081,610,769 distinct states found (2,486,913 ds/min), 183,219,247 states left on queue. +Progress(41) at 2024-11-06 22:42:00: 12,632,909,026 states generated (29,151,639 s/min), 1,083,967,637 distinct states found (2,356,868 ds/min), 183,478,879 states left on queue. +Progress(41) at 2024-11-06 22:43:00: 12,662,254,981 states generated (29,345,955 s/min), 1,086,272,935 distinct states found (2,305,298 ds/min), 183,726,701 states left on queue. +Progress(41) at 2024-11-06 22:44:00: 12,691,400,218 states generated (29,145,237 s/min), 1,088,778,928 distinct states found (2,505,993 ds/min), 184,128,274 states left on queue. +Progress(41) at 2024-11-06 22:45:00: 12,720,528,098 states generated (29,127,880 s/min), 1,091,335,929 distinct states found (2,557,001 ds/min), 184,556,078 states left on queue. +Progress(41) at 2024-11-06 22:46:00: 12,749,701,886 states generated (29,173,788 s/min), 1,093,889,510 distinct states found (2,553,581 ds/min), 184,916,391 states left on queue. +Progress(41) at 2024-11-06 22:47:00: 12,779,153,937 states generated (29,452,051 s/min), 1,096,185,973 distinct states found (2,296,463 ds/min), 185,115,877 states left on queue. +Progress(41) at 2024-11-06 22:48:00: 12,808,440,971 states generated (29,287,034 s/min), 1,098,733,865 distinct states found (2,547,892 ds/min), 185,564,617 states left on queue. +Progress(41) at 2024-11-06 22:49:00: 12,837,695,256 states generated (29,254,285 s/min), 1,100,705,460 distinct states found (1,971,595 ds/min), 185,532,558 states left on queue. +Progress(41) at 2024-11-06 22:50:00: 12,866,801,129 states generated (29,105,873 s/min), 1,103,074,603 distinct states found (2,369,143 ds/min), 185,770,427 states left on queue. +Progress(41) at 2024-11-06 22:51:00: 12,895,682,870 states generated (28,881,741 s/min), 1,105,437,747 distinct states found (2,363,144 ds/min), 186,049,274 states left on queue. +Progress(41) at 2024-11-06 22:52:00: 12,924,655,990 states generated (28,973,120 s/min), 1,107,853,554 distinct states found (2,415,807 ds/min), 186,325,129 states left on queue. +Progress(41) at 2024-11-06 22:53:00: 12,953,616,826 states generated (28,960,836 s/min), 1,110,097,321 distinct states found (2,243,767 ds/min), 186,509,276 states left on queue. +Progress(41) at 2024-11-06 22:54:00: 12,982,711,068 states generated (29,094,242 s/min), 1,112,146,097 distinct states found (2,048,776 ds/min), 186,507,356 states left on queue. +Progress(41) at 2024-11-06 22:55:00: 13,011,962,667 states generated (29,251,599 s/min), 1,114,530,785 distinct states found (2,384,688 ds/min), 186,758,016 states left on queue. +Progress(41) at 2024-11-06 22:56:00: 13,041,163,382 states generated (29,200,715 s/min), 1,116,566,038 distinct states found (2,035,253 ds/min), 186,702,453 states left on queue. +Progress(41) at 2024-11-06 22:57:00: 13,070,416,604 states generated (29,253,222 s/min), 1,118,433,735 distinct states found (1,867,697 ds/min), 186,595,926 states left on queue. +Progress(41) at 2024-11-06 22:58:00: 13,099,393,765 states generated (28,977,161 s/min), 1,120,727,626 distinct states found (2,293,891 ds/min), 186,785,521 states left on queue. +Progress(41) at 2024-11-06 22:59:00: 13,128,309,003 states generated (28,915,238 s/min), 1,123,075,278 distinct states found (2,347,652 ds/min), 186,977,496 states left on queue. +Progress(42) at 2024-11-06 23:00:00: 13,157,492,254 states generated (29,183,251 s/min), 1,125,164,050 distinct states found (2,088,772 ds/min), 186,994,591 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 23:01:01) +Progress(42) at 2024-11-06 23:01:01: 13,187,099,442 states generated (29,607,188 s/min), 1,126,955,828 distinct states found (1,791,778 ds/min), 186,860,457 states left on queue. +Progress(42) at 2024-11-06 23:02:01: 13,216,408,249 states generated (29,308,807 s/min), 1,128,852,586 distinct states found (1,896,758 ds/min), 186,800,218 states left on queue. +Progress(42) at 2024-11-06 23:03:01: 13,245,736,139 states generated (29,327,890 s/min), 1,130,960,381 distinct states found (2,107,795 ds/min), 186,911,118 states left on queue. +Progress(42) at 2024-11-06 23:04:01: 13,274,893,464 states generated (29,157,325 s/min), 1,132,863,930 distinct states found (1,903,549 ds/min), 186,892,129 states left on queue. +Progress(42) at 2024-11-06 23:05:01: 13,304,183,990 states generated (29,290,526 s/min), 1,134,876,306 distinct states found (2,012,376 ds/min), 186,965,153 states left on queue. +Progress(42) at 2024-11-06 23:06:01: 13,333,457,770 states generated (29,273,780 s/min), 1,136,812,237 distinct states found (1,935,931 ds/min), 186,957,506 states left on queue. +Progress(42) at 2024-11-06 23:07:01: 13,362,984,994 states generated (29,527,224 s/min), 1,138,649,876 distinct states found (1,837,639 ds/min), 186,823,887 states left on queue. +Progress(42) at 2024-11-06 23:08:01: 13,392,550,733 states generated (29,565,739 s/min), 1,140,795,722 distinct states found (2,145,846 ds/min), 186,974,795 states left on queue. +Progress(42) at 2024-11-06 23:09:01: 13,422,111,300 states generated (29,560,567 s/min), 1,143,038,611 distinct states found (2,242,889 ds/min), 187,179,197 states left on queue. +Progress(42) at 2024-11-06 23:10:01: 13,451,822,496 states generated (29,711,196 s/min), 1,145,071,502 distinct states found (2,032,891 ds/min), 187,190,480 states left on queue. +Progress(42) at 2024-11-06 23:11:01: 13,481,293,484 states generated (29,470,988 s/min), 1,146,905,806 distinct states found (1,834,304 ds/min), 187,079,661 states left on queue. +Progress(42) at 2024-11-06 23:12:01: 13,510,659,679 states generated (29,366,195 s/min), 1,148,841,643 distinct states found (1,935,837 ds/min), 187,082,815 states left on queue. +Progress(42) at 2024-11-06 23:13:01: 13,539,730,883 states generated (29,071,204 s/min), 1,150,715,436 distinct states found (1,873,793 ds/min), 187,013,975 states left on queue. +Progress(42) at 2024-11-06 23:14:01: 13,568,973,308 states generated (29,242,425 s/min), 1,152,689,735 distinct states found (1,974,299 ds/min), 187,016,208 states left on queue. +Progress(42) at 2024-11-06 23:15:01: 13,598,106,627 states generated (29,133,319 s/min), 1,154,829,869 distinct states found (2,140,134 ds/min), 187,147,884 states left on queue. +Progress(42) at 2024-11-06 23:16:01: 13,627,319,459 states generated (29,212,832 s/min), 1,156,740,070 distinct states found (1,910,201 ds/min), 187,086,942 states left on queue. +Progress(42) at 2024-11-06 23:17:01: 13,656,462,121 states generated (29,142,662 s/min), 1,158,698,307 distinct states found (1,958,237 ds/min), 187,072,201 states left on queue. +Progress(42) at 2024-11-06 23:18:01: 13,685,545,941 states generated (29,083,820 s/min), 1,160,688,939 distinct states found (1,990,632 ds/min), 187,078,553 states left on queue. +Progress(42) at 2024-11-06 23:19:01: 13,714,652,628 states generated (29,106,687 s/min), 1,162,748,633 distinct states found (2,059,694 ds/min), 187,157,229 states left on queue. +Progress(42) at 2024-11-06 23:20:01: 13,744,105,986 states generated (29,453,358 s/min), 1,164,748,782 distinct states found (2,000,149 ds/min), 187,275,480 states left on queue. +Progress(42) at 2024-11-06 23:21:01: 13,773,414,393 states generated (29,308,407 s/min), 1,166,804,740 distinct states found (2,055,958 ds/min), 187,393,312 states left on queue. +Progress(42) at 2024-11-06 23:22:01: 13,802,600,069 states generated (29,185,676 s/min), 1,169,251,493 distinct states found (2,446,753 ds/min), 187,781,298 states left on queue. +Progress(42) at 2024-11-06 23:23:01: 13,831,830,649 states generated (29,230,580 s/min), 1,171,412,176 distinct states found (2,160,683 ds/min), 187,932,991 states left on queue. +Progress(42) at 2024-11-06 23:24:01: 13,861,152,221 states generated (29,321,572 s/min), 1,173,582,994 distinct states found (2,170,818 ds/min), 188,078,037 states left on queue. +Progress(42) at 2024-11-06 23:25:01: 13,890,538,756 states generated (29,386,535 s/min), 1,175,642,901 distinct states found (2,059,907 ds/min), 188,116,794 states left on queue. +Progress(42) at 2024-11-06 23:26:01: 13,919,812,820 states generated (29,274,064 s/min), 1,177,743,048 distinct states found (2,100,147 ds/min), 188,189,399 states left on queue. +Progress(42) at 2024-11-06 23:27:01: 13,948,903,585 states generated (29,090,765 s/min), 1,179,980,470 distinct states found (2,237,422 ds/min), 188,388,309 states left on queue. +Progress(42) at 2024-11-06 23:28:01: 13,978,138,385 states generated (29,234,800 s/min), 1,182,134,981 distinct states found (2,154,511 ds/min), 188,526,735 states left on queue. +Progress(42) at 2024-11-06 23:29:01: 14,007,310,151 states generated (29,171,766 s/min), 1,184,360,360 distinct states found (2,225,379 ds/min), 188,718,575 states left on queue. +Progress(42) at 2024-11-06 23:30:01: 14,036,411,110 states generated (29,100,959 s/min), 1,186,617,835 distinct states found (2,257,475 ds/min), 188,941,068 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 23:31:01) +Progress(42) at 2024-11-06 23:31:01: 14,065,894,113 states generated (29,483,003 s/min), 1,188,743,048 distinct states found (2,125,213 ds/min), 189,035,636 states left on queue. +Progress(42) at 2024-11-06 23:32:01: 14,094,909,096 states generated (29,014,983 s/min), 1,191,096,961 distinct states found (2,353,913 ds/min), 189,332,174 states left on queue. +Progress(42) at 2024-11-06 23:33:01: 14,124,212,567 states generated (29,303,471 s/min), 1,193,012,997 distinct states found (1,916,036 ds/min), 189,266,016 states left on queue. +Progress(42) at 2024-11-06 23:34:01: 14,153,428,768 states generated (29,216,201 s/min), 1,195,170,448 distinct states found (2,157,451 ds/min), 189,430,881 states left on queue. +Progress(42) at 2024-11-06 23:35:01: 14,182,568,290 states generated (29,139,522 s/min), 1,197,127,126 distinct states found (1,956,678 ds/min), 189,423,769 states left on queue. +Progress(42) at 2024-11-06 23:36:01: 14,211,602,024 states generated (29,033,734 s/min), 1,199,044,612 distinct states found (1,917,486 ds/min), 189,380,199 states left on queue. +Progress(42) at 2024-11-06 23:37:01: 14,240,593,845 states generated (28,991,821 s/min), 1,200,900,028 distinct states found (1,855,416 ds/min), 189,324,925 states left on queue. +Progress(42) at 2024-11-06 23:38:01: 14,269,687,808 states generated (29,093,963 s/min), 1,203,034,598 distinct states found (2,134,570 ds/min), 189,466,947 states left on queue. +Progress(42) at 2024-11-06 23:39:01: 14,298,626,140 states generated (28,938,332 s/min), 1,205,190,806 distinct states found (2,156,208 ds/min), 189,608,794 states left on queue. +Progress(42) at 2024-11-06 23:40:01: 14,327,587,116 states generated (28,960,976 s/min), 1,207,339,559 distinct states found (2,148,753 ds/min), 189,750,359 states left on queue. +Progress(42) at 2024-11-06 23:41:01: 14,356,469,494 states generated (28,882,378 s/min), 1,209,518,146 distinct states found (2,178,587 ds/min), 189,892,036 states left on queue. +Progress(42) at 2024-11-06 23:42:01: 14,385,314,696 states generated (28,845,202 s/min), 1,211,701,473 distinct states found (2,183,327 ds/min), 190,050,090 states left on queue. +Progress(42) at 2024-11-06 23:43:01: 14,414,142,550 states generated (28,827,854 s/min), 1,213,859,919 distinct states found (2,158,446 ds/min), 190,161,804 states left on queue. +Progress(42) at 2024-11-06 23:44:01: 14,442,945,644 states generated (28,803,094 s/min), 1,216,005,127 distinct states found (2,145,208 ds/min), 190,173,898 states left on queue. +Progress(42) at 2024-11-06 23:45:01: 14,471,693,798 states generated (28,748,154 s/min), 1,218,030,292 distinct states found (2,025,165 ds/min), 190,127,864 states left on queue. +Progress(42) at 2024-11-06 23:46:01: 14,500,599,025 states generated (28,905,227 s/min), 1,219,996,243 distinct states found (1,965,951 ds/min), 190,069,034 states left on queue. +Progress(42) at 2024-11-06 23:47:01: 14,529,770,118 states generated (29,171,093 s/min), 1,221,890,284 distinct states found (1,894,041 ds/min), 189,948,701 states left on queue. +Progress(42) at 2024-11-06 23:48:01: 14,559,044,399 states generated (29,274,281 s/min), 1,223,772,100 distinct states found (1,881,816 ds/min), 189,844,417 states left on queue. +Progress(42) at 2024-11-06 23:49:01: 14,588,505,088 states generated (29,460,689 s/min), 1,225,870,790 distinct states found (2,098,690 ds/min), 189,921,025 states left on queue. +Progress(42) at 2024-11-06 23:50:01: 14,618,007,797 states generated (29,502,709 s/min), 1,227,944,381 distinct states found (2,073,591 ds/min), 189,947,590 states left on queue. +Progress(42) at 2024-11-06 23:51:01: 14,647,405,532 states generated (29,397,735 s/min), 1,230,287,712 distinct states found (2,343,331 ds/min), 190,200,223 states left on queue. +Progress(42) at 2024-11-06 23:52:01: 14,676,733,478 states generated (29,327,946 s/min), 1,232,303,440 distinct states found (2,015,728 ds/min), 190,178,290 states left on queue. +Progress(42) at 2024-11-06 23:53:01: 14,706,089,483 states generated (29,356,005 s/min), 1,234,269,055 distinct states found (1,965,615 ds/min), 190,175,215 states left on queue. +Progress(42) at 2024-11-06 23:54:01: 14,735,226,809 states generated (29,137,326 s/min), 1,236,451,189 distinct states found (2,182,134 ds/min), 190,293,853 states left on queue. +Progress(42) at 2024-11-06 23:55:01: 14,764,611,146 states generated (29,384,337 s/min), 1,238,780,557 distinct states found (2,329,368 ds/min), 190,528,991 states left on queue. +Progress(42) at 2024-11-06 23:56:01: 14,793,911,038 states generated (29,299,892 s/min), 1,240,745,156 distinct states found (1,964,599 ds/min), 190,493,881 states left on queue. +Progress(42) at 2024-11-06 23:57:01: 14,823,113,635 states generated (29,202,597 s/min), 1,242,984,781 distinct states found (2,239,625 ds/min), 190,675,723 states left on queue. +Progress(42) at 2024-11-06 23:58:01: 14,852,208,056 states generated (29,094,421 s/min), 1,245,341,804 distinct states found (2,357,023 ds/min), 190,959,027 states left on queue. +Progress(42) at 2024-11-06 23:59:01: 14,881,390,523 states generated (29,182,467 s/min), 1,247,530,823 distinct states found (2,189,019 ds/min), 191,085,175 states left on queue. +Progress(42) at 2024-11-07 00:00:01: 14,910,709,837 states generated (29,319,314 s/min), 1,249,665,632 distinct states found (2,134,809 ds/min), 191,148,911 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 00:01:02) +Progress(42) at 2024-11-07 00:01:02: 14,940,301,722 states generated (29,591,885 s/min), 1,251,820,098 distinct states found (2,154,466 ds/min), 191,164,099 states left on queue. +Progress(42) at 2024-11-07 00:02:02: 14,969,468,946 states generated (29,167,224 s/min), 1,253,608,374 distinct states found (1,788,276 ds/min), 190,977,899 states left on queue. +Progress(42) at 2024-11-07 00:03:02: 14,998,469,861 states generated (29,000,915 s/min), 1,255,846,206 distinct states found (2,237,832 ds/min), 191,179,932 states left on queue. +Progress(42) at 2024-11-07 00:04:02: 15,027,424,344 states generated (28,954,483 s/min), 1,258,012,253 distinct states found (2,166,047 ds/min), 191,269,006 states left on queue. +Progress(42) at 2024-11-07 00:05:02: 15,056,595,053 states generated (29,170,709 s/min), 1,259,974,817 distinct states found (1,962,564 ds/min), 191,232,379 states left on queue. +Progress(42) at 2024-11-07 00:06:02: 15,085,857,792 states generated (29,262,739 s/min), 1,262,139,752 distinct states found (2,164,935 ds/min), 191,351,326 states left on queue. +Progress(42) at 2024-11-07 00:07:02: 15,115,386,019 states generated (29,528,227 s/min), 1,264,425,723 distinct states found (2,285,971 ds/min), 191,549,077 states left on queue. +Progress(42) at 2024-11-07 00:08:02: 15,144,705,784 states generated (29,319,765 s/min), 1,266,390,816 distinct states found (1,965,093 ds/min), 191,495,454 states left on queue. +Progress(42) at 2024-11-07 00:09:02: 15,173,877,454 states generated (29,171,670 s/min), 1,268,144,487 distinct states found (1,753,671 ds/min), 191,300,959 states left on queue. +Progress(42) at 2024-11-07 00:10:02: 15,203,080,845 states generated (29,203,391 s/min), 1,270,256,870 distinct states found (2,112,383 ds/min), 191,363,085 states left on queue. +Progress(42) at 2024-11-07 00:11:02: 15,232,426,418 states generated (29,345,573 s/min), 1,272,624,413 distinct states found (2,367,543 ds/min), 191,673,032 states left on queue. +Progress(42) at 2024-11-07 00:12:02: 15,261,677,209 states generated (29,250,791 s/min), 1,274,995,857 distinct states found (2,371,444 ds/min), 191,960,618 states left on queue. +Progress(42) at 2024-11-07 00:13:02: 15,290,882,314 states generated (29,205,105 s/min), 1,277,269,501 distinct states found (2,273,644 ds/min), 192,155,220 states left on queue. +Progress(42) at 2024-11-07 00:14:02: 15,320,166,816 states generated (29,284,502 s/min), 1,279,524,897 distinct states found (2,255,396 ds/min), 192,367,797 states left on queue. +Progress(42) at 2024-11-07 00:15:02: 15,349,391,017 states generated (29,224,201 s/min), 1,281,912,896 distinct states found (2,387,999 ds/min), 192,657,361 states left on queue. +Progress(42) at 2024-11-07 00:16:02: 15,378,510,873 states generated (29,119,856 s/min), 1,284,352,819 distinct states found (2,439,923 ds/min), 192,982,001 states left on queue. +Progress(42) at 2024-11-07 00:17:02: 15,407,729,690 states generated (29,218,817 s/min), 1,286,798,116 distinct states found (2,445,297 ds/min), 193,251,888 states left on queue. +Progress(42) at 2024-11-07 00:18:02: 15,437,122,682 states generated (29,392,992 s/min), 1,289,060,398 distinct states found (2,262,282 ds/min), 193,393,686 states left on queue. +Progress(42) at 2024-11-07 00:19:02: 15,466,437,919 states generated (29,315,237 s/min), 1,291,390,007 distinct states found (2,329,609 ds/min), 193,674,611 states left on queue. +Progress(42) at 2024-11-07 00:20:02: 15,495,795,434 states generated (29,357,515 s/min), 1,293,625,999 distinct states found (2,235,992 ds/min), 193,855,148 states left on queue. +Progress(42) at 2024-11-07 00:21:02: 15,524,856,146 states generated (29,060,712 s/min), 1,295,675,220 distinct states found (2,049,221 ds/min), 193,858,347 states left on queue. +Progress(42) at 2024-11-07 00:22:02: 15,553,951,279 states generated (29,095,133 s/min), 1,297,806,219 distinct states found (2,130,999 ds/min), 193,910,330 states left on queue. +Progress(42) at 2024-11-07 00:23:02: 15,582,781,229 states generated (28,829,950 s/min), 1,300,215,254 distinct states found (2,409,035 ds/min), 194,211,020 states left on queue. +Progress(42) at 2024-11-07 00:24:02: 15,611,889,872 states generated (29,108,643 s/min), 1,302,431,347 distinct states found (2,216,093 ds/min), 194,324,070 states left on queue. +Progress(42) at 2024-11-07 00:25:02: 15,640,778,210 states generated (28,888,338 s/min), 1,304,674,839 distinct states found (2,243,492 ds/min), 194,483,563 states left on queue. +Progress(42) at 2024-11-07 00:26:02: 15,669,830,004 states generated (29,051,794 s/min), 1,306,661,103 distinct states found (1,986,264 ds/min), 194,429,101 states left on queue. +Progress(42) at 2024-11-07 00:27:02: 15,699,049,213 states generated (29,219,209 s/min), 1,308,920,712 distinct states found (2,259,609 ds/min), 194,577,576 states left on queue. +Progress(42) at 2024-11-07 00:28:02: 15,728,283,982 states generated (29,234,769 s/min), 1,310,924,780 distinct states found (2,004,068 ds/min), 194,488,601 states left on queue. +Progress(42) at 2024-11-07 00:29:02: 15,757,507,793 states generated (29,223,811 s/min), 1,312,729,390 distinct states found (1,804,610 ds/min), 194,321,454 states left on queue. +Progress(42) at 2024-11-07 00:30:02: 15,786,513,733 states generated (29,005,940 s/min), 1,314,926,573 distinct states found (2,197,183 ds/min), 194,422,995 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 00:31:03) +Progress(42) at 2024-11-07 00:31:03: 15,815,683,048 states generated (29,169,315 s/min), 1,317,135,461 distinct states found (2,208,888 ds/min), 194,492,192 states left on queue. +Progress(42) at 2024-11-07 00:32:03: 15,844,758,678 states generated (29,075,630 s/min), 1,319,144,875 distinct states found (2,009,414 ds/min), 194,413,387 states left on queue. +Progress(42) at 2024-11-07 00:33:03: 15,873,998,157 states generated (29,239,479 s/min), 1,320,932,025 distinct states found (1,787,150 ds/min), 194,281,981 states left on queue. +Progress(42) at 2024-11-07 00:34:03: 15,903,205,479 states generated (29,207,322 s/min), 1,322,654,400 distinct states found (1,722,375 ds/min), 194,091,121 states left on queue. +Progress(42) at 2024-11-07 00:35:03: 15,932,501,264 states generated (29,295,785 s/min), 1,324,682,430 distinct states found (2,028,030 ds/min), 194,137,494 states left on queue. +Progress(42) at 2024-11-07 00:36:03: 15,961,589,919 states generated (29,088,655 s/min), 1,326,509,334 distinct states found (1,826,904 ds/min), 194,051,639 states left on queue. +Progress(42) at 2024-11-07 00:37:03: 15,990,668,327 states generated (29,078,408 s/min), 1,328,357,672 distinct states found (1,848,338 ds/min), 193,989,585 states left on queue. +Progress(42) at 2024-11-07 00:38:03: 16,019,782,313 states generated (29,113,986 s/min), 1,330,232,446 distinct states found (1,874,774 ds/min), 193,949,446 states left on queue. +Progress(42) at 2024-11-07 00:39:03: 16,049,252,200 states generated (29,469,887 s/min), 1,331,987,412 distinct states found (1,754,966 ds/min), 193,747,896 states left on queue. +Progress(42) at 2024-11-07 00:40:03: 16,078,692,514 states generated (29,440,314 s/min), 1,333,894,185 distinct states found (1,906,773 ds/min), 193,729,942 states left on queue. +Progress(42) at 2024-11-07 00:41:03: 16,108,160,136 states generated (29,467,622 s/min), 1,336,102,661 distinct states found (2,208,476 ds/min), 193,914,624 states left on queue. +Progress(42) at 2024-11-07 00:42:03: 16,137,813,382 states generated (29,653,246 s/min), 1,338,180,836 distinct states found (2,078,175 ds/min), 193,976,996 states left on queue. +Progress(43) at 2024-11-07 00:43:03: 16,167,357,885 states generated (29,544,503 s/min), 1,339,957,139 distinct states found (1,776,303 ds/min), 193,787,392 states left on queue. +Progress(43) at 2024-11-07 00:44:03: 16,196,650,450 states generated (29,292,565 s/min), 1,341,719,088 distinct states found (1,761,949 ds/min), 193,648,551 states left on queue. +Progress(43) at 2024-11-07 00:45:03: 16,225,735,286 states generated (29,084,836 s/min), 1,343,468,127 distinct states found (1,749,039 ds/min), 193,497,590 states left on queue. +Progress(43) at 2024-11-07 00:46:03: 16,254,805,612 states generated (29,070,326 s/min), 1,345,280,226 distinct states found (1,812,099 ds/min), 193,364,788 states left on queue. +Progress(43) at 2024-11-07 00:47:03: 16,283,933,423 states generated (29,127,811 s/min), 1,347,294,879 distinct states found (2,014,653 ds/min), 193,397,713 states left on queue. +Progress(43) at 2024-11-07 00:48:03: 16,312,911,730 states generated (28,978,307 s/min), 1,349,192,377 distinct states found (1,897,498 ds/min), 193,321,503 states left on queue. +Progress(43) at 2024-11-07 00:49:03: 16,342,115,657 states generated (29,203,927 s/min), 1,350,961,684 distinct states found (1,769,307 ds/min), 193,144,596 states left on queue. +Progress(43) at 2024-11-07 00:50:03: 16,370,988,391 states generated (28,872,734 s/min), 1,352,868,904 distinct states found (1,907,220 ds/min), 193,089,969 states left on queue. +Progress(43) at 2024-11-07 00:51:03: 16,400,089,208 states generated (29,100,817 s/min), 1,354,864,448 distinct states found (1,995,544 ds/min), 193,098,377 states left on queue. +Progress(43) at 2024-11-07 00:52:03: 16,429,331,456 states generated (29,242,248 s/min), 1,356,734,632 distinct states found (1,870,184 ds/min), 193,093,615 states left on queue. +Progress(43) at 2024-11-07 00:53:03: 16,458,648,761 states generated (29,317,305 s/min), 1,358,622,917 distinct states found (1,888,285 ds/min), 193,098,172 states left on queue. +Progress(43) at 2024-11-07 00:54:03: 16,487,874,773 states generated (29,226,012 s/min), 1,360,737,908 distinct states found (2,114,991 ds/min), 193,250,949 states left on queue. +Progress(43) at 2024-11-07 00:55:03: 16,517,101,401 states generated (29,226,628 s/min), 1,363,024,072 distinct states found (2,286,164 ds/min), 193,508,719 states left on queue. +Progress(43) at 2024-11-07 00:56:03: 16,546,231,362 states generated (29,129,961 s/min), 1,365,056,771 distinct states found (2,032,699 ds/min), 193,558,441 states left on queue. +Progress(43) at 2024-11-07 00:57:03: 16,575,532,837 states generated (29,301,475 s/min), 1,367,107,709 distinct states found (2,050,938 ds/min), 193,609,354 states left on queue. +Progress(43) at 2024-11-07 00:58:03: 16,604,872,137 states generated (29,339,300 s/min), 1,369,059,417 distinct states found (1,951,708 ds/min), 193,561,420 states left on queue. +Progress(43) at 2024-11-07 00:59:03: 16,634,070,732 states generated (29,198,595 s/min), 1,371,016,928 distinct states found (1,957,511 ds/min), 193,513,278 states left on queue. +Progress(43) at 2024-11-07 01:00:03: 16,663,158,113 states generated (29,087,381 s/min), 1,373,092,542 distinct states found (2,075,614 ds/min), 193,582,661 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 01:01:03) +Progress(43) at 2024-11-07 01:01:03: 16,692,576,110 states generated (29,417,997 s/min), 1,375,200,108 distinct states found (2,107,566 ds/min), 193,664,621 states left on queue. +Progress(43) at 2024-11-07 01:02:03: 16,721,716,479 states generated (29,140,369 s/min), 1,377,247,529 distinct states found (2,047,421 ds/min), 193,708,538 states left on queue. +Progress(43) at 2024-11-07 01:03:03: 16,750,779,523 states generated (29,063,044 s/min), 1,379,368,087 distinct states found (2,120,558 ds/min), 193,813,065 states left on queue. +Progress(43) at 2024-11-07 01:04:03: 16,779,794,524 states generated (29,015,001 s/min), 1,381,371,287 distinct states found (2,003,200 ds/min), 193,825,465 states left on queue. +Progress(43) at 2024-11-07 01:05:03: 16,808,907,203 states generated (29,112,679 s/min), 1,383,515,008 distinct states found (2,143,721 ds/min), 193,953,821 states left on queue. +Progress(43) at 2024-11-07 01:06:03: 16,838,029,628 states generated (29,122,425 s/min), 1,385,629,882 distinct states found (2,114,874 ds/min), 194,038,163 states left on queue. +Progress(43) at 2024-11-07 01:07:03: 16,867,418,111 states generated (29,388,483 s/min), 1,387,561,049 distinct states found (1,931,167 ds/min), 194,004,058 states left on queue. +Progress(43) at 2024-11-07 01:08:03: 16,896,555,416 states generated (29,137,305 s/min), 1,389,592,238 distinct states found (2,031,189 ds/min), 194,058,208 states left on queue. +Progress(43) at 2024-11-07 01:09:03: 16,925,642,685 states generated (29,087,269 s/min), 1,391,404,896 distinct states found (1,812,658 ds/min), 193,924,951 states left on queue. +Progress(43) at 2024-11-07 01:10:03: 16,954,638,533 states generated (28,995,848 s/min), 1,393,186,525 distinct states found (1,781,629 ds/min), 193,784,358 states left on queue. +Progress(43) at 2024-11-07 01:11:03: 16,983,710,894 states generated (29,072,361 s/min), 1,395,018,264 distinct states found (1,831,739 ds/min), 193,697,690 states left on queue. +Progress(43) at 2024-11-07 01:12:03: 17,012,741,316 states generated (29,030,422 s/min), 1,397,039,325 distinct states found (2,021,061 ds/min), 193,755,919 states left on queue. +Progress(43) at 2024-11-07 01:13:03: 17,041,674,538 states generated (28,933,222 s/min), 1,399,086,352 distinct states found (2,047,027 ds/min), 193,799,420 states left on queue. +Progress(43) at 2024-11-07 01:14:03: 17,070,653,912 states generated (28,979,374 s/min), 1,401,092,312 distinct states found (2,005,960 ds/min), 193,820,018 states left on queue. +Progress(43) at 2024-11-07 01:15:03: 17,099,536,446 states generated (28,882,534 s/min), 1,403,159,743 distinct states found (2,067,431 ds/min), 193,867,947 states left on queue. +Progress(43) at 2024-11-07 01:16:03: 17,128,396,670 states generated (28,860,224 s/min), 1,405,244,280 distinct states found (2,084,537 ds/min), 193,945,380 states left on queue. +Progress(43) at 2024-11-07 01:17:03: 17,157,276,177 states generated (28,879,507 s/min), 1,407,274,748 distinct states found (2,030,468 ds/min), 193,944,077 states left on queue. +Progress(43) at 2024-11-07 01:18:03: 17,186,149,639 states generated (28,873,462 s/min), 1,409,283,088 distinct states found (2,008,340 ds/min), 193,881,792 states left on queue. +Progress(43) at 2024-11-07 01:19:03: 17,214,923,206 states generated (28,773,567 s/min), 1,411,167,065 distinct states found (1,883,977 ds/min), 193,711,394 states left on queue. +Progress(43) at 2024-11-07 01:20:03: 17,243,730,245 states generated (28,807,039 s/min), 1,413,023,763 distinct states found (1,856,698 ds/min), 193,546,054 states left on queue. +Progress(43) at 2024-11-07 01:21:03: 17,272,650,525 states generated (28,920,280 s/min), 1,414,802,171 distinct states found (1,778,408 ds/min), 193,345,308 states left on queue. +Progress(43) at 2024-11-07 01:22:03: 17,301,943,589 states generated (29,293,064 s/min), 1,416,599,440 distinct states found (1,797,269 ds/min), 193,158,676 states left on queue. +Progress(43) at 2024-11-07 01:23:03: 17,331,337,313 states generated (29,393,724 s/min), 1,418,547,450 distinct states found (1,948,010 ds/min), 193,112,883 states left on queue. +Progress(43) at 2024-11-07 01:24:03: 17,360,793,100 states generated (29,455,787 s/min), 1,420,576,018 distinct states found (2,028,568 ds/min), 193,100,476 states left on queue. +Progress(43) at 2024-11-07 01:25:03: 17,390,123,392 states generated (29,330,292 s/min), 1,422,693,479 distinct states found (2,117,461 ds/min), 193,171,748 states left on queue. +Progress(43) at 2024-11-07 01:26:03: 17,419,468,515 states generated (29,345,123 s/min), 1,424,783,244 distinct states found (2,089,765 ds/min), 193,228,274 states left on queue. +Progress(43) at 2024-11-07 01:27:03: 17,448,810,016 states generated (29,341,501 s/min), 1,426,560,811 distinct states found (1,777,567 ds/min), 193,036,459 states left on queue. +Progress(43) at 2024-11-07 01:28:03: 17,478,034,472 states generated (29,224,456 s/min), 1,428,663,374 distinct states found (2,102,563 ds/min), 193,125,616 states left on queue. +Progress(43) at 2024-11-07 01:29:03: 17,507,201,835 states generated (29,167,363 s/min), 1,430,735,910 distinct states found (2,072,536 ds/min), 193,146,850 states left on queue. +Progress(43) at 2024-11-07 01:30:03: 17,536,546,498 states generated (29,344,663 s/min), 1,432,877,950 distinct states found (2,142,040 ds/min), 193,230,645 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 01:31:04) +Progress(43) at 2024-11-07 01:31:04: 17,566,061,546 states generated (29,515,048 s/min), 1,434,839,708 distinct states found (1,961,758 ds/min), 193,176,951 states left on queue. +Progress(43) at 2024-11-07 01:32:04: 17,595,015,993 states generated (28,954,447 s/min), 1,436,986,257 distinct states found (2,146,549 ds/min), 193,289,254 states left on queue. +Progress(43) at 2024-11-07 01:33:04: 17,624,137,153 states generated (29,121,160 s/min), 1,439,279,150 distinct states found (2,292,893 ds/min), 193,525,973 states left on queue. +Progress(43) at 2024-11-07 01:34:04: 17,653,328,248 states generated (29,191,095 s/min), 1,441,299,767 distinct states found (2,020,617 ds/min), 193,504,947 states left on queue. +Progress(43) at 2024-11-07 01:35:04: 17,682,562,562 states generated (29,234,314 s/min), 1,443,317,413 distinct states found (2,017,646 ds/min), 193,471,905 states left on queue. +Progress(43) at 2024-11-07 01:36:04: 17,711,829,397 states generated (29,266,835 s/min), 1,445,304,310 distinct states found (1,986,897 ds/min), 193,370,899 states left on queue. +Progress(43) at 2024-11-07 01:37:04: 17,740,910,347 states generated (29,080,950 s/min), 1,447,009,563 distinct states found (1,705,253 ds/min), 193,129,235 states left on queue. +Progress(43) at 2024-11-07 01:38:04: 17,769,836,321 states generated (28,925,974 s/min), 1,449,139,496 distinct states found (2,129,933 ds/min), 193,234,511 states left on queue. +Progress(43) at 2024-11-07 01:39:04: 17,798,713,067 states generated (28,876,746 s/min), 1,451,211,612 distinct states found (2,072,116 ds/min), 193,241,362 states left on queue. +Progress(43) at 2024-11-07 01:40:04: 17,827,794,691 states generated (29,081,624 s/min), 1,453,062,046 distinct states found (1,850,434 ds/min), 193,114,753 states left on queue. +Progress(43) at 2024-11-07 01:41:04: 17,856,974,014 states generated (29,179,323 s/min), 1,455,151,187 distinct states found (2,089,141 ds/min), 193,169,579 states left on queue. +Progress(43) at 2024-11-07 01:42:04: 17,886,446,666 states generated (29,472,652 s/min), 1,457,303,171 distinct states found (2,151,984 ds/min), 193,263,708 states left on queue. +Progress(43) at 2024-11-07 01:43:04: 17,915,744,840 states generated (29,298,174 s/min), 1,459,261,460 distinct states found (1,958,289 ds/min), 193,194,468 states left on queue. +Progress(43) at 2024-11-07 01:44:04: 17,944,793,057 states generated (29,048,217 s/min), 1,460,885,305 distinct states found (1,623,845 ds/min), 192,905,330 states left on queue. +Progress(43) at 2024-11-07 01:45:04: 17,973,952,967 states generated (29,159,910 s/min), 1,462,880,642 distinct states found (1,995,337 ds/min), 192,871,348 states left on queue. +Progress(43) at 2024-11-07 01:46:04: 18,003,158,344 states generated (29,205,377 s/min), 1,465,077,846 distinct states found (2,197,204 ds/min), 193,039,702 states left on queue. +Progress(43) at 2024-11-07 01:47:04: 18,032,464,087 states generated (29,305,743 s/min), 1,467,361,120 distinct states found (2,283,274 ds/min), 193,271,051 states left on queue. +Progress(43) at 2024-11-07 01:48:04: 18,061,597,682 states generated (29,133,595 s/min), 1,469,505,688 distinct states found (2,144,568 ds/min), 193,354,360 states left on queue. +Progress(43) at 2024-11-07 01:49:04: 18,090,888,515 states generated (29,290,833 s/min), 1,471,655,035 distinct states found (2,149,347 ds/min), 193,472,080 states left on queue. +Progress(43) at 2024-11-07 01:50:04: 18,119,855,749 states generated (28,967,234 s/min), 1,473,959,147 distinct states found (2,304,112 ds/min), 193,714,821 states left on queue. +Progress(43) at 2024-11-07 01:51:04: 18,149,035,954 states generated (29,180,205 s/min), 1,476,253,894 distinct states found (2,294,747 ds/min), 193,939,051 states left on queue. +Progress(43) at 2024-11-07 01:52:04: 18,178,210,402 states generated (29,174,448 s/min), 1,478,557,699 distinct states found (2,303,805 ds/min), 194,141,809 states left on queue. +Progress(43) at 2024-11-07 01:53:04: 18,207,377,534 states generated (29,167,132 s/min), 1,480,870,404 distinct states found (2,312,705 ds/min), 194,307,877 states left on queue. +Progress(43) at 2024-11-07 01:54:04: 18,236,577,989 states generated (29,200,455 s/min), 1,483,070,823 distinct states found (2,200,419 ds/min), 194,387,223 states left on queue. +Progress(43) at 2024-11-07 01:55:04: 18,265,859,163 states generated (29,281,174 s/min), 1,485,222,154 distinct states found (2,151,331 ds/min), 194,522,233 states left on queue. +Progress(43) at 2024-11-07 01:56:04: 18,295,148,797 states generated (29,289,634 s/min), 1,487,521,283 distinct states found (2,299,129 ds/min), 194,755,427 states left on queue. +Progress(43) at 2024-11-07 01:57:04: 18,324,289,175 states generated (29,140,378 s/min), 1,489,367,193 distinct states found (1,845,910 ds/min), 194,604,366 states left on queue. +Progress(43) at 2024-11-07 01:58:04: 18,353,385,770 states generated (29,096,595 s/min), 1,491,503,782 distinct states found (2,136,589 ds/min), 194,651,670 states left on queue. +Progress(43) at 2024-11-07 01:59:04: 18,382,277,307 states generated (28,891,537 s/min), 1,493,659,362 distinct states found (2,155,580 ds/min), 194,761,640 states left on queue. +Progress(43) at 2024-11-07 02:00:04: 18,411,146,853 states generated (28,869,546 s/min), 1,495,935,237 distinct states found (2,275,875 ds/min), 194,908,896 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 02:01:05) +Progress(43) at 2024-11-07 02:01:05: 18,440,532,837 states generated (29,385,984 s/min), 1,497,937,249 distinct states found (2,002,012 ds/min), 194,874,096 states left on queue. +Progress(43) at 2024-11-07 02:02:05: 18,469,385,511 states generated (28,852,674 s/min), 1,500,062,084 distinct states found (2,124,835 ds/min), 194,900,930 states left on queue. +Progress(43) at 2024-11-07 02:03:05: 18,498,509,160 states generated (29,123,649 s/min), 1,502,077,387 distinct states found (2,015,303 ds/min), 194,871,250 states left on queue. +Progress(43) at 2024-11-07 02:04:05: 18,527,694,520 states generated (29,185,360 s/min), 1,504,242,136 distinct states found (2,164,749 ds/min), 194,924,748 states left on queue. +Progress(43) at 2024-11-07 02:05:05: 18,556,901,350 states generated (29,206,830 s/min), 1,506,034,687 distinct states found (1,792,551 ds/min), 194,670,609 states left on queue. +Progress(43) at 2024-11-07 02:06:05: 18,586,004,706 states generated (29,103,356 s/min), 1,507,879,191 distinct states found (1,844,504 ds/min), 194,551,782 states left on queue. +Progress(43) at 2024-11-07 02:07:05: 18,614,881,319 states generated (28,876,613 s/min), 1,510,019,997 distinct states found (2,140,806 ds/min), 194,594,957 states left on queue. +Progress(43) at 2024-11-07 02:08:05: 18,643,854,322 states generated (28,973,003 s/min), 1,512,074,165 distinct states found (2,054,168 ds/min), 194,532,832 states left on queue. +Progress(43) at 2024-11-07 02:09:05: 18,672,998,550 states generated (29,144,228 s/min), 1,513,943,120 distinct states found (1,868,955 ds/min), 194,368,599 states left on queue. +Progress(43) at 2024-11-07 02:10:05: 18,702,201,308 states generated (29,202,758 s/min), 1,515,546,068 distinct states found (1,602,948 ds/min), 194,090,755 states left on queue. +Progress(43) at 2024-11-07 02:11:05: 18,731,481,011 states generated (29,279,703 s/min), 1,517,343,788 distinct states found (1,797,720 ds/min), 193,942,961 states left on queue. +Progress(43) at 2024-11-07 02:12:05: 18,760,609,986 states generated (29,128,975 s/min), 1,519,160,050 distinct states found (1,816,262 ds/min), 193,815,502 states left on queue. +Progress(43) at 2024-11-07 02:13:05: 18,789,628,202 states generated (29,018,216 s/min), 1,520,860,123 distinct states found (1,700,073 ds/min), 193,642,399 states left on queue. +Progress(43) at 2024-11-07 02:14:05: 18,818,770,407 states generated (29,142,205 s/min), 1,522,616,126 distinct states found (1,756,003 ds/min), 193,516,180 states left on queue. +Progress(43) at 2024-11-07 02:15:05: 18,847,943,521 states generated (29,173,114 s/min), 1,524,373,878 distinct states found (1,757,752 ds/min), 193,352,389 states left on queue. +Progress(43) at 2024-11-07 02:16:05: 18,877,338,814 states generated (29,395,293 s/min), 1,526,022,199 distinct states found (1,648,321 ds/min), 193,099,089 states left on queue. +Progress(43) at 2024-11-07 02:17:05: 18,906,854,907 states generated (29,516,093 s/min), 1,528,057,287 distinct states found (2,035,088 ds/min), 193,164,007 states left on queue. +Progress(43) at 2024-11-07 02:18:05: 18,936,272,714 states generated (29,417,807 s/min), 1,530,070,868 distinct states found (2,013,581 ds/min), 193,195,191 states left on queue. +Progress(43) at 2024-11-07 02:19:05: 18,965,845,291 states generated (29,572,577 s/min), 1,531,953,514 distinct states found (1,882,646 ds/min), 193,094,610 states left on queue. +Progress(44) at 2024-11-07 02:20:05: 18,995,225,711 states generated (29,380,420 s/min), 1,533,586,486 distinct states found (1,632,972 ds/min), 192,813,292 states left on queue. +Progress(44) at 2024-11-07 02:21:05: 19,024,424,249 states generated (29,198,538 s/min), 1,535,341,846 distinct states found (1,755,360 ds/min), 192,665,431 states left on queue. +Progress(44) at 2024-11-07 02:22:05: 19,053,319,611 states generated (28,895,362 s/min), 1,536,913,652 distinct states found (1,571,806 ds/min), 192,336,687 states left on queue. +Progress(44) at 2024-11-07 02:23:05: 19,082,456,366 states generated (29,136,755 s/min), 1,538,781,638 distinct states found (1,867,986 ds/min), 192,258,068 states left on queue. +Progress(44) at 2024-11-07 02:24:05: 19,111,445,941 states generated (28,989,575 s/min), 1,540,696,734 distinct states found (1,915,096 ds/min), 192,193,602 states left on queue. +Progress(44) at 2024-11-07 02:25:05: 19,140,498,683 states generated (29,052,742 s/min), 1,542,368,994 distinct states found (1,672,260 ds/min), 191,938,239 states left on queue. +Progress(44) at 2024-11-07 02:26:05: 19,169,386,645 states generated (28,887,962 s/min), 1,544,099,236 distinct states found (1,730,242 ds/min), 191,741,059 states left on queue. +Progress(44) at 2024-11-07 02:27:05: 19,198,354,957 states generated (28,968,312 s/min), 1,545,891,836 distinct states found (1,792,600 ds/min), 191,577,211 states left on queue. +Progress(44) at 2024-11-07 02:28:05: 19,227,551,398 states generated (29,196,441 s/min), 1,547,751,807 distinct states found (1,859,971 ds/min), 191,530,291 states left on queue. +Progress(44) at 2024-11-07 02:29:05: 19,256,905,544 states generated (29,354,146 s/min), 1,549,562,753 distinct states found (1,810,946 ds/min), 191,492,536 states left on queue. +Progress(44) at 2024-11-07 02:30:05: 19,286,043,009 states generated (29,137,465 s/min), 1,551,387,062 distinct states found (1,824,309 ds/min), 191,432,131 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 02:31:05) +Progress(44) at 2024-11-07 02:31:05: 19,315,478,636 states generated (29,435,627 s/min), 1,553,684,416 distinct states found (2,297,354 ds/min), 191,685,387 states left on queue. +Progress(44) at 2024-11-07 02:32:05: 19,344,574,433 states generated (29,095,797 s/min), 1,555,642,251 distinct states found (1,957,835 ds/min), 191,687,380 states left on queue. +Progress(44) at 2024-11-07 02:33:05: 19,373,560,321 states generated (28,985,888 s/min), 1,557,576,032 distinct states found (1,933,781 ds/min), 191,644,771 states left on queue. +Progress(44) at 2024-11-07 02:34:05: 19,402,882,849 states generated (29,322,528 s/min), 1,559,483,211 distinct states found (1,907,179 ds/min), 191,584,351 states left on queue. +Progress(44) at 2024-11-07 02:35:05: 19,432,084,827 states generated (29,201,978 s/min), 1,561,305,888 distinct states found (1,822,677 ds/min), 191,432,596 states left on queue. +Progress(44) at 2024-11-07 02:36:05: 19,461,112,335 states generated (29,027,508 s/min), 1,563,180,797 distinct states found (1,874,909 ds/min), 191,330,553 states left on queue. +Progress(44) at 2024-11-07 02:37:05: 19,490,043,498 states generated (28,931,163 s/min), 1,565,137,368 distinct states found (1,956,571 ds/min), 191,292,333 states left on queue. +Progress(44) at 2024-11-07 02:38:05: 19,519,153,014 states generated (29,109,516 s/min), 1,567,034,954 distinct states found (1,897,586 ds/min), 191,229,524 states left on queue. +Progress(44) at 2024-11-07 02:39:05: 19,548,204,678 states generated (29,051,664 s/min), 1,568,989,443 distinct states found (1,954,489 ds/min), 191,191,752 states left on queue. +Progress(44) at 2024-11-07 02:40:05: 19,577,227,470 states generated (29,022,792 s/min), 1,570,981,495 distinct states found (1,992,052 ds/min), 191,200,024 states left on queue. +Progress(44) at 2024-11-07 02:41:05: 19,606,172,601 states generated (28,945,131 s/min), 1,572,870,324 distinct states found (1,888,829 ds/min), 191,115,956 states left on queue. +Progress(44) at 2024-11-07 02:42:05: 19,635,167,481 states generated (28,994,880 s/min), 1,574,894,468 distinct states found (2,024,144 ds/min), 191,139,869 states left on queue. +Progress(44) at 2024-11-07 02:43:05: 19,664,339,049 states generated (29,171,568 s/min), 1,576,906,348 distinct states found (2,011,880 ds/min), 191,137,521 states left on queue. +Progress(44) at 2024-11-07 02:44:05: 19,693,639,689 states generated (29,300,640 s/min), 1,578,748,425 distinct states found (1,842,077 ds/min), 191,040,518 states left on queue. +Progress(44) at 2024-11-07 02:45:05: 19,722,704,536 states generated (29,064,847 s/min), 1,580,671,538 distinct states found (1,923,113 ds/min), 191,001,469 states left on queue. +Progress(44) at 2024-11-07 02:46:05: 19,751,627,669 states generated (28,923,133 s/min), 1,582,340,762 distinct states found (1,669,224 ds/min), 190,750,504 states left on queue. +Progress(44) at 2024-11-07 02:47:05: 19,780,532,535 states generated (28,904,866 s/min), 1,583,965,049 distinct states found (1,624,287 ds/min), 190,492,540 states left on queue. +Progress(44) at 2024-11-07 02:48:05: 19,809,548,743 states generated (29,016,208 s/min), 1,585,820,774 distinct states found (1,855,725 ds/min), 190,422,454 states left on queue. +Progress(44) at 2024-11-07 02:49:05: 19,838,541,075 states generated (28,992,332 s/min), 1,587,731,649 distinct states found (1,910,875 ds/min), 190,386,932 states left on queue. +Progress(44) at 2024-11-07 02:50:05: 19,867,458,320 states generated (28,917,245 s/min), 1,589,622,141 distinct states found (1,890,492 ds/min), 190,310,460 states left on queue. +Progress(44) at 2024-11-07 02:51:05: 19,896,287,158 states generated (28,828,838 s/min), 1,591,517,151 distinct states found (1,895,010 ds/min), 190,235,561 states left on queue. +Progress(44) at 2024-11-07 02:52:05: 19,925,117,820 states generated (28,830,662 s/min), 1,593,453,289 distinct states found (1,936,138 ds/min), 190,176,789 states left on queue. +Progress(44) at 2024-11-07 02:53:05: 19,953,949,651 states generated (28,831,831 s/min), 1,595,392,832 distinct states found (1,939,543 ds/min), 190,137,713 states left on queue. +Progress(44) at 2024-11-07 02:54:05: 19,982,791,590 states generated (28,841,939 s/min), 1,597,295,182 distinct states found (1,902,350 ds/min), 190,030,864 states left on queue. +Progress(44) at 2024-11-07 02:55:05: 20,011,631,796 states generated (28,840,206 s/min), 1,599,162,388 distinct states found (1,867,206 ds/min), 189,857,155 states left on queue. +Progress(44) at 2024-11-07 02:56:05: 20,040,350,017 states generated (28,718,221 s/min), 1,600,882,747 distinct states found (1,720,359 ds/min), 189,556,504 states left on queue. +Progress(44) at 2024-11-07 02:57:05: 20,069,048,267 states generated (28,698,250 s/min), 1,602,583,945 distinct states found (1,701,198 ds/min), 189,276,085 states left on queue. +Progress(44) at 2024-11-07 02:58:05: 20,098,037,079 states generated (28,988,812 s/min), 1,604,245,937 distinct states found (1,661,992 ds/min), 188,968,070 states left on queue. +Progress(44) at 2024-11-07 02:59:05: 20,127,216,730 states generated (29,179,651 s/min), 1,605,916,753 distinct states found (1,670,816 ds/min), 188,703,437 states left on queue. +Progress(44) at 2024-11-07 03:00:05: 20,156,712,917 states generated (29,496,187 s/min), 1,607,868,866 distinct states found (1,952,113 ds/min), 188,640,553 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 03:01:06) +Progress(44) at 2024-11-07 03:01:06: 20,186,396,044 states generated (29,683,127 s/min), 1,609,765,772 distinct states found (1,896,906 ds/min), 188,510,848 states left on queue. +Progress(44) at 2024-11-07 03:02:06: 20,215,754,864 states generated (29,358,820 s/min), 1,611,924,139 distinct states found (2,158,367 ds/min), 188,607,723 states left on queue. +Progress(44) at 2024-11-07 03:03:06: 20,245,041,982 states generated (29,287,118 s/min), 1,613,794,702 distinct states found (1,870,563 ds/min), 188,472,700 states left on queue. +Progress(44) at 2024-11-07 03:04:06: 20,274,294,374 states generated (29,252,392 s/min), 1,615,566,733 distinct states found (1,772,031 ds/min), 188,311,061 states left on queue. +Progress(44) at 2024-11-07 03:05:06: 20,303,317,537 states generated (29,023,163 s/min), 1,617,541,966 distinct states found (1,975,233 ds/min), 188,275,227 states left on queue. +Progress(44) at 2024-11-07 03:06:06: 20,332,555,917 states generated (29,238,380 s/min), 1,619,626,477 distinct states found (2,084,511 ds/min), 188,311,129 states left on queue. +Progress(44) at 2024-11-07 03:07:06: 20,361,814,948 states generated (29,259,031 s/min), 1,621,498,944 distinct states found (1,872,467 ds/min), 188,187,982 states left on queue. +Progress(44) at 2024-11-07 03:08:06: 20,391,066,062 states generated (29,251,114 s/min), 1,623,499,145 distinct states found (2,000,201 ds/min), 188,184,372 states left on queue. +Progress(44) at 2024-11-07 03:09:06: 20,420,013,539 states generated (28,947,477 s/min), 1,625,534,256 distinct states found (2,035,111 ds/min), 188,202,174 states left on queue. +Progress(44) at 2024-11-07 03:10:06: 20,449,116,787 states generated (29,103,248 s/min), 1,627,670,135 distinct states found (2,135,879 ds/min), 188,303,061 states left on queue. +Progress(44) at 2024-11-07 03:11:06: 20,478,265,224 states generated (29,148,437 s/min), 1,629,558,947 distinct states found (1,888,812 ds/min), 188,171,995 states left on queue. +Progress(44) at 2024-11-07 03:12:06: 20,507,459,785 states generated (29,194,561 s/min), 1,631,460,915 distinct states found (1,901,968 ds/min), 188,044,516 states left on queue. +Progress(44) at 2024-11-07 03:13:06: 20,536,655,025 states generated (29,195,240 s/min), 1,633,292,515 distinct states found (1,831,600 ds/min), 187,823,678 states left on queue. +Progress(44) at 2024-11-07 03:14:06: 20,565,699,198 states generated (29,044,173 s/min), 1,634,967,122 distinct states found (1,674,607 ds/min), 187,564,357 states left on queue. +Progress(44) at 2024-11-07 03:15:06: 20,594,568,781 states generated (28,869,583 s/min), 1,636,996,440 distinct states found (2,029,318 ds/min), 187,577,506 states left on queue. +Progress(44) at 2024-11-07 03:16:06: 20,623,463,526 states generated (28,894,745 s/min), 1,638,870,718 distinct states found (1,874,278 ds/min), 187,429,057 states left on queue. +Progress(44) at 2024-11-07 03:17:06: 20,652,517,975 states generated (29,054,449 s/min), 1,640,608,054 distinct states found (1,737,336 ds/min), 187,198,996 states left on queue. +Progress(44) at 2024-11-07 03:18:06: 20,681,729,377 states generated (29,211,402 s/min), 1,642,682,611 distinct states found (2,074,557 ds/min), 187,238,673 states left on queue. +Progress(44) at 2024-11-07 03:19:06: 20,711,226,363 states generated (29,496,986 s/min), 1,644,764,480 distinct states found (2,081,869 ds/min), 187,269,746 states left on queue. +Progress(44) at 2024-11-07 03:20:06: 20,740,520,876 states generated (29,294,513 s/min), 1,646,565,948 distinct states found (1,801,468 ds/min), 187,085,841 states left on queue. +Progress(44) at 2024-11-07 03:21:06: 20,769,532,066 states generated (29,011,190 s/min), 1,648,139,570 distinct states found (1,573,622 ds/min), 186,737,971 states left on queue. +Progress(44) at 2024-11-07 03:22:06: 20,798,731,555 states generated (29,199,489 s/min), 1,650,061,318 distinct states found (1,921,748 ds/min), 186,652,080 states left on queue. +Progress(44) at 2024-11-07 03:23:06: 20,827,864,871 states generated (29,133,316 s/min), 1,652,217,368 distinct states found (2,156,050 ds/min), 186,786,338 states left on queue. +Progress(44) at 2024-11-07 03:24:06: 20,857,114,542 states generated (29,249,671 s/min), 1,654,404,059 distinct states found (2,186,691 ds/min), 186,937,127 states left on queue. +Progress(44) at 2024-11-07 03:25:06: 20,886,216,235 states generated (29,101,693 s/min), 1,656,424,687 distinct states found (2,020,628 ds/min), 186,925,384 states left on queue. +Progress(44) at 2024-11-07 03:26:06: 20,915,415,138 states generated (29,198,903 s/min), 1,658,503,968 distinct states found (2,079,281 ds/min), 186,988,200 states left on queue. +Progress(44) at 2024-11-07 03:27:06: 20,944,436,117 states generated (29,020,979 s/min), 1,660,708,925 distinct states found (2,204,957 ds/min), 187,151,771 states left on queue. +Progress(44) at 2024-11-07 03:28:06: 20,973,637,986 states generated (29,201,869 s/min), 1,662,812,161 distinct states found (2,103,236 ds/min), 187,208,363 states left on queue. +Progress(44) at 2024-11-07 03:29:06: 21,002,664,654 states generated (29,026,668 s/min), 1,665,077,078 distinct states found (2,264,917 ds/min), 187,398,168 states left on queue. +Progress(44) at 2024-11-07 03:30:06: 21,031,900,683 states generated (29,236,029 s/min), 1,667,241,517 distinct states found (2,164,439 ds/min), 187,444,342 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 03:31:07) +Progress(44) at 2024-11-07 03:31:07: 21,061,190,967 states generated (29,290,284 s/min), 1,669,337,346 distinct states found (2,095,829 ds/min), 187,431,388 states left on queue. +Progress(44) at 2024-11-07 03:32:07: 21,090,368,622 states generated (29,177,655 s/min), 1,671,292,120 distinct states found (1,954,774 ds/min), 187,370,395 states left on queue. +Progress(44) at 2024-11-07 03:33:07: 21,119,546,588 states generated (29,177,966 s/min), 1,673,505,061 distinct states found (2,212,941 ds/min), 187,548,275 states left on queue. +Progress(44) at 2024-11-07 03:34:07: 21,148,770,544 states generated (29,223,956 s/min), 1,675,679,331 distinct states found (2,174,270 ds/min), 187,681,477 states left on queue. +Progress(44) at 2024-11-07 03:35:07: 21,177,752,842 states generated (28,982,298 s/min), 1,677,512,003 distinct states found (1,832,672 ds/min), 187,502,663 states left on queue. +Progress(44) at 2024-11-07 03:36:07: 21,206,777,989 states generated (29,025,147 s/min), 1,679,391,120 distinct states found (1,879,117 ds/min), 187,345,876 states left on queue. +Progress(44) at 2024-11-07 03:37:07: 21,235,634,562 states generated (28,856,573 s/min), 1,681,551,461 distinct states found (2,160,341 ds/min), 187,464,887 states left on queue. +Progress(44) at 2024-11-07 03:38:07: 21,264,448,690 states generated (28,814,128 s/min), 1,683,690,836 distinct states found (2,139,375 ds/min), 187,491,922 states left on queue. +Progress(44) at 2024-11-07 03:39:07: 21,293,469,454 states generated (29,020,764 s/min), 1,685,615,643 distinct states found (1,924,807 ds/min), 187,416,199 states left on queue. +Progress(44) at 2024-11-07 03:40:07: 21,322,287,082 states generated (28,817,628 s/min), 1,687,574,723 distinct states found (1,959,080 ds/min), 187,310,981 states left on queue. +Progress(44) at 2024-11-07 03:41:07: 21,351,396,680 states generated (29,109,598 s/min), 1,689,546,445 distinct states found (1,971,722 ds/min), 187,236,923 states left on queue. +Progress(44) at 2024-11-07 03:42:07: 21,380,557,165 states generated (29,160,485 s/min), 1,691,587,169 distinct states found (2,040,724 ds/min), 187,186,480 states left on queue. +Progress(44) at 2024-11-07 03:43:07: 21,409,627,333 states generated (29,070,168 s/min), 1,693,246,645 distinct states found (1,659,476 ds/min), 186,839,410 states left on queue. +Progress(44) at 2024-11-07 03:44:07: 21,438,692,500 states generated (29,065,167 s/min), 1,695,162,088 distinct states found (1,915,443 ds/min), 186,763,843 states left on queue. +Progress(44) at 2024-11-07 03:45:07: 21,467,558,980 states generated (28,866,480 s/min), 1,697,105,328 distinct states found (1,943,240 ds/min), 186,647,091 states left on queue. +Progress(44) at 2024-11-07 03:46:07: 21,496,459,596 states generated (28,900,616 s/min), 1,698,987,134 distinct states found (1,881,806 ds/min), 186,428,411 states left on queue. +Progress(44) at 2024-11-07 03:47:07: 21,525,539,564 states generated (29,079,968 s/min), 1,700,685,335 distinct states found (1,698,201 ds/min), 186,176,831 states left on queue. +Progress(44) at 2024-11-07 03:48:07: 21,554,716,115 states generated (29,176,551 s/min), 1,702,193,633 distinct states found (1,508,298 ds/min), 185,811,852 states left on queue. +Progress(44) at 2024-11-07 03:49:07: 21,583,930,332 states generated (29,214,217 s/min), 1,703,965,186 distinct states found (1,771,553 ds/min), 185,645,122 states left on queue. +Progress(44) at 2024-11-07 03:50:07: 21,612,870,304 states generated (28,939,972 s/min), 1,705,581,017 distinct states found (1,615,831 ds/min), 185,385,482 states left on queue. +Progress(44) at 2024-11-07 03:51:07: 21,641,828,993 states generated (28,958,689 s/min), 1,707,209,695 distinct states found (1,628,678 ds/min), 185,147,878 states left on queue. +Progress(44) at 2024-11-07 03:52:07: 21,670,879,227 states generated (29,050,234 s/min), 1,708,891,056 distinct states found (1,681,361 ds/min), 184,967,950 states left on queue. +Progress(44) at 2024-11-07 03:53:07: 21,700,175,853 states generated (29,296,626 s/min), 1,710,442,845 distinct states found (1,551,789 ds/min), 184,628,950 states left on queue. +Progress(44) at 2024-11-07 03:54:07: 21,729,661,920 states generated (29,486,067 s/min), 1,712,360,375 distinct states found (1,917,530 ds/min), 184,602,047 states left on queue. +Progress(44) at 2024-11-07 03:55:07: 21,759,015,470 states generated (29,353,550 s/min), 1,714,259,170 distinct states found (1,898,795 ds/min), 184,554,564 states left on queue. +Progress(44) at 2024-11-07 03:56:07: 21,788,534,088 states generated (29,518,618 s/min), 1,716,081,999 distinct states found (1,822,829 ds/min), 184,406,994 states left on queue. +Progress(44) at 2024-11-07 03:57:07: 21,817,875,474 states generated (29,341,386 s/min), 1,717,634,611 distinct states found (1,552,612 ds/min), 184,057,660 states left on queue. +Progress(44) at 2024-11-07 03:58:07: 21,847,006,510 states generated (29,131,036 s/min), 1,719,299,741 distinct states found (1,665,130 ds/min), 183,828,258 states left on queue. +Progress(44) at 2024-11-07 03:59:07: 21,875,869,357 states generated (28,862,847 s/min), 1,720,801,722 distinct states found (1,501,981 ds/min), 183,443,083 states left on queue. +Progress(44) at 2024-11-07 04:00:07: 21,904,922,732 states generated (29,053,375 s/min), 1,722,588,504 distinct states found (1,786,782 ds/min), 183,289,094 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 04:01:07) +Progress(44) at 2024-11-07 04:01:07: 21,933,965,695 states generated (29,042,963 s/min), 1,724,285,279 distinct states found (1,696,775 ds/min), 183,029,310 states left on queue. +Progress(44) at 2024-11-07 04:02:07: 21,962,959,341 states generated (28,993,646 s/min), 1,725,868,213 distinct states found (1,582,934 ds/min), 182,699,155 states left on queue. +Progress(44) at 2024-11-07 04:03:07: 21,991,777,816 states generated (28,818,475 s/min), 1,727,519,032 distinct states found (1,650,819 ds/min), 182,433,701 states left on queue. +Progress(44) at 2024-11-07 04:04:07: 22,020,733,433 states generated (28,955,617 s/min), 1,729,219,503 distinct states found (1,700,471 ds/min), 182,216,615 states left on queue. +Progress(44) at 2024-11-07 04:05:07: 22,049,984,634 states generated (29,251,201 s/min), 1,730,967,606 distinct states found (1,748,103 ds/min), 182,140,987 states left on queue. +Progress(44) at 2024-11-07 04:06:07: 22,079,112,674 states generated (29,128,040 s/min), 1,732,648,368 distinct states found (1,680,762 ds/min), 181,963,576 states left on queue. +Progress(44) at 2024-11-07 04:07:07: 22,108,329,917 states generated (29,217,243 s/min), 1,734,711,160 distinct states found (2,062,792 ds/min), 182,074,434 states left on queue. +Progress(44) at 2024-11-07 04:08:07: 22,137,402,322 states generated (29,072,405 s/min), 1,736,773,111 distinct states found (2,061,951 ds/min), 182,163,318 states left on queue. +Progress(44) at 2024-11-07 04:09:07: 22,166,402,243 states generated (28,999,921 s/min), 1,738,573,615 distinct states found (1,800,504 ds/min), 182,034,194 states left on queue. +Progress(44) at 2024-11-07 04:10:07: 22,195,545,763 states generated (29,143,520 s/min), 1,740,349,901 distinct states found (1,776,286 ds/min), 181,869,339 states left on queue. +Progress(44) at 2024-11-07 04:11:07: 22,224,766,309 states generated (29,220,546 s/min), 1,742,110,577 distinct states found (1,760,676 ds/min), 181,671,885 states left on queue. +Progress(44) at 2024-11-07 04:12:07: 22,253,807,692 states generated (29,041,383 s/min), 1,743,796,752 distinct states found (1,686,175 ds/min), 181,407,584 states left on queue. +Progress(44) at 2024-11-07 04:13:07: 22,282,790,947 states generated (28,983,255 s/min), 1,745,617,175 distinct states found (1,820,423 ds/min), 181,265,096 states left on queue. +Progress(44) at 2024-11-07 04:14:07: 22,311,840,917 states generated (29,049,970 s/min), 1,747,424,658 distinct states found (1,807,483 ds/min), 181,110,335 states left on queue. +Progress(44) at 2024-11-07 04:15:07: 22,340,851,116 states generated (29,010,199 s/min), 1,749,204,899 distinct states found (1,780,241 ds/min), 180,933,264 states left on queue. +Progress(44) at 2024-11-07 04:16:07: 22,369,820,191 states generated (28,969,075 s/min), 1,751,058,290 distinct states found (1,853,391 ds/min), 180,819,450 states left on queue. +Progress(44) at 2024-11-07 04:17:07: 22,398,637,854 states generated (28,817,663 s/min), 1,752,838,012 distinct states found (1,779,722 ds/min), 180,641,066 states left on queue. +Progress(44) at 2024-11-07 04:18:07: 22,427,736,775 states generated (29,098,921 s/min), 1,754,678,716 distinct states found (1,840,704 ds/min), 180,523,907 states left on queue. +Progress(44) at 2024-11-07 04:19:07: 22,456,749,604 states generated (29,012,829 s/min), 1,756,653,204 distinct states found (1,974,488 ds/min), 180,502,441 states left on queue. +Progress(44) at 2024-11-07 04:20:07: 22,485,995,309 states generated (29,245,705 s/min), 1,758,406,219 distinct states found (1,753,015 ds/min), 180,303,710 states left on queue. +Progress(44) at 2024-11-07 04:21:07: 22,515,059,607 states generated (29,064,298 s/min), 1,760,239,858 distinct states found (1,833,639 ds/min), 180,203,277 states left on queue. +Progress(44) at 2024-11-07 04:22:07: 22,544,007,885 states generated (28,948,278 s/min), 1,761,871,023 distinct states found (1,631,165 ds/min), 179,919,396 states left on queue. +Progress(44) at 2024-11-07 04:23:07: 22,572,858,704 states generated (28,850,819 s/min), 1,763,420,170 distinct states found (1,549,147 ds/min), 179,579,696 states left on queue. +Progress(44) at 2024-11-07 04:24:07: 22,601,850,297 states generated (28,991,593 s/min), 1,765,118,103 distinct states found (1,697,933 ds/min), 179,386,571 states left on queue. +Progress(44) at 2024-11-07 04:25:07: 22,630,832,111 states generated (28,981,814 s/min), 1,766,934,802 distinct states found (1,816,699 ds/min), 179,271,264 states left on queue. +Progress(44) at 2024-11-07 04:26:07: 22,659,674,047 states generated (28,841,936 s/min), 1,768,697,425 distinct states found (1,762,623 ds/min), 179,093,059 states left on queue. +Progress(44) at 2024-11-07 04:27:07: 22,688,427,580 states generated (28,753,533 s/min), 1,770,450,184 distinct states found (1,752,759 ds/min), 178,899,489 states left on queue. +Progress(44) at 2024-11-07 04:28:07: 22,717,189,869 states generated (28,762,289 s/min), 1,772,256,239 distinct states found (1,806,055 ds/min), 178,731,640 states left on queue. +Progress(44) at 2024-11-07 04:29:07: 22,746,022,343 states generated (28,832,474 s/min), 1,774,044,050 distinct states found (1,787,811 ds/min), 178,570,129 states left on queue. +Progress(44) at 2024-11-07 04:30:07: 22,774,887,995 states generated (28,865,652 s/min), 1,775,840,059 distinct states found (1,796,009 ds/min), 178,368,886 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 04:31:08) +Progress(44) at 2024-11-07 04:31:08: 22,803,877,345 states generated (28,989,350 s/min), 1,777,539,486 distinct states found (1,699,427 ds/min), 178,036,764 states left on queue. +Progress(44) at 2024-11-07 04:32:08: 22,832,344,161 states generated (28,466,816 s/min), 1,779,071,939 distinct states found (1,532,453 ds/min), 177,609,496 states left on queue. +Progress(44) at 2024-11-07 04:33:08: 22,860,965,708 states generated (28,621,547 s/min), 1,780,632,191 distinct states found (1,560,252 ds/min), 177,226,645 states left on queue. +Progress(44) at 2024-11-07 04:34:08: 22,890,116,212 states generated (29,150,504 s/min), 1,782,192,671 distinct states found (1,560,480 ds/min), 176,856,967 states left on queue. +Progress(44) at 2024-11-07 04:35:08: 22,919,394,798 states generated (29,278,586 s/min), 1,783,989,997 distinct states found (1,797,326 ds/min), 176,677,020 states left on queue. +Progress(44) at 2024-11-07 04:36:08: 22,948,717,272 states generated (29,322,474 s/min), 1,785,769,628 distinct states found (1,779,631 ds/min), 176,466,304 states left on queue. +Progress(44) at 2024-11-07 04:37:08: 22,978,008,874 states generated (29,291,602 s/min), 1,787,768,546 distinct states found (1,998,918 ds/min), 176,439,057 states left on queue. +Progress(45) at 2024-11-07 04:38:08: 23,007,259,342 states generated (29,250,468 s/min), 1,789,652,180 distinct states found (1,883,634 ds/min), 176,335,868 states left on queue. +Progress(45) at 2024-11-07 04:39:08: 23,036,414,234 states generated (29,154,892 s/min), 1,791,293,395 distinct states found (1,641,215 ds/min), 176,048,834 states left on queue. +Progress(45) at 2024-11-07 04:40:08: 23,065,467,218 states generated (29,052,984 s/min), 1,793,176,180 distinct states found (1,882,785 ds/min), 175,945,068 states left on queue. +Progress(45) at 2024-11-07 04:41:08: 23,094,601,413 states generated (29,134,195 s/min), 1,795,085,201 distinct states found (1,909,021 ds/min), 175,844,471 states left on queue. +Progress(45) at 2024-11-07 04:42:08: 23,123,835,299 states generated (29,233,886 s/min), 1,796,998,629 distinct states found (1,913,428 ds/min), 175,751,026 states left on queue. +Progress(45) at 2024-11-07 04:43:08: 23,153,014,383 states generated (29,179,084 s/min), 1,798,830,917 distinct states found (1,832,288 ds/min), 175,609,899 states left on queue. +Progress(45) at 2024-11-07 04:44:08: 23,181,848,791 states generated (28,834,408 s/min), 1,800,688,969 distinct states found (1,858,052 ds/min), 175,482,089 states left on queue. +Progress(45) at 2024-11-07 04:45:08: 23,210,960,242 states generated (29,111,451 s/min), 1,802,681,838 distinct states found (1,992,869 ds/min), 175,468,259 states left on queue. +Progress(45) at 2024-11-07 04:46:08: 23,239,931,898 states generated (28,971,656 s/min), 1,804,527,297 distinct states found (1,845,459 ds/min), 175,314,676 states left on queue. +Progress(45) at 2024-11-07 04:47:08: 23,269,110,236 states generated (29,178,338 s/min), 1,806,324,412 distinct states found (1,797,115 ds/min), 175,104,294 states left on queue. +Progress(45) at 2024-11-07 04:48:08: 23,298,261,893 states generated (29,151,657 s/min), 1,808,026,372 distinct states found (1,701,960 ds/min), 174,789,761 states left on queue. +Progress(45) at 2024-11-07 04:49:08: 23,327,194,301 states generated (28,932,408 s/min), 1,809,635,143 distinct states found (1,608,771 ds/min), 174,475,327 states left on queue. +Progress(45) at 2024-11-07 04:50:08: 23,356,033,807 states generated (28,839,506 s/min), 1,811,533,685 distinct states found (1,898,542 ds/min), 174,375,697 states left on queue. +Progress(45) at 2024-11-07 04:51:08: 23,384,783,950 states generated (28,750,143 s/min), 1,813,242,773 distinct states found (1,709,088 ds/min), 174,093,638 states left on queue. +Progress(45) at 2024-11-07 04:52:08: 23,413,868,078 states generated (29,084,128 s/min), 1,814,921,217 distinct states found (1,678,444 ds/min), 173,816,375 states left on queue. +Progress(45) at 2024-11-07 04:53:08: 23,443,072,326 states generated (29,204,248 s/min), 1,816,887,463 distinct states found (1,966,246 ds/min), 173,768,064 states left on queue. +Progress(45) at 2024-11-07 04:54:08: 23,472,531,302 states generated (29,458,976 s/min), 1,818,893,389 distinct states found (2,005,926 ds/min), 173,736,986 states left on queue. +Progress(45) at 2024-11-07 04:55:08: 23,501,670,169 states generated (29,138,867 s/min), 1,820,467,013 distinct states found (1,573,624 ds/min), 173,393,980 states left on queue. +Progress(45) at 2024-11-07 04:56:08: 23,530,619,816 states generated (28,949,647 s/min), 1,822,153,389 distinct states found (1,686,376 ds/min), 173,102,476 states left on queue. +Progress(45) at 2024-11-07 04:57:08: 23,559,730,839 states generated (29,111,023 s/min), 1,824,067,840 distinct states found (1,914,451 ds/min), 173,045,910 states left on queue. +Progress(45) at 2024-11-07 04:58:08: 23,588,956,543 states generated (29,225,704 s/min), 1,826,128,132 distinct states found (2,060,292 ds/min), 173,097,456 states left on queue. +Progress(45) at 2024-11-07 04:59:08: 23,617,943,385 states generated (28,986,842 s/min), 1,828,156,857 distinct states found (2,028,725 ds/min), 173,115,797 states left on queue. +Progress(45) at 2024-11-07 05:00:08: 23,647,052,247 states generated (29,108,862 s/min), 1,830,116,296 distinct states found (1,959,439 ds/min), 173,061,677 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 05:01:09) +Progress(45) at 2024-11-07 05:01:09: 23,676,540,644 states generated (29,488,397 s/min), 1,832,081,172 distinct states found (1,964,876 ds/min), 173,019,523 states left on queue. +Progress(45) at 2024-11-07 05:02:09: 23,705,447,239 states generated (28,906,595 s/min), 1,834,157,962 distinct states found (2,076,790 ds/min), 173,069,444 states left on queue. +Progress(45) at 2024-11-07 05:03:09: 23,734,590,381 states generated (29,143,142 s/min), 1,836,148,599 distinct states found (1,990,637 ds/min), 173,037,041 states left on queue. +Progress(45) at 2024-11-07 05:04:09: 23,763,605,229 states generated (29,014,848 s/min), 1,838,302,051 distinct states found (2,153,452 ds/min), 173,135,339 states left on queue. +Progress(45) at 2024-11-07 05:05:09: 23,792,794,847 states generated (29,189,618 s/min), 1,840,318,078 distinct states found (2,016,027 ds/min), 173,064,676 states left on queue. +Progress(45) at 2024-11-07 05:06:09: 23,821,711,411 states generated (28,916,564 s/min), 1,842,248,819 distinct states found (1,930,741 ds/min), 172,938,116 states left on queue. +Progress(45) at 2024-11-07 05:07:09: 23,850,829,522 states generated (29,118,111 s/min), 1,844,084,520 distinct states found (1,835,701 ds/min), 172,779,569 states left on queue. +Progress(45) at 2024-11-07 05:08:09: 23,880,027,055 states generated (29,197,533 s/min), 1,846,207,907 distinct states found (2,123,387 ds/min), 172,876,875 states left on queue. +Progress(45) at 2024-11-07 05:09:09: 23,909,238,654 states generated (29,211,599 s/min), 1,848,275,162 distinct states found (2,067,255 ds/min), 172,917,710 states left on queue. +Progress(45) at 2024-11-07 05:10:09: 23,938,254,527 states generated (29,015,873 s/min), 1,850,062,508 distinct states found (1,787,346 ds/min), 172,709,939 states left on queue. +Progress(45) at 2024-11-07 05:11:09: 23,967,280,908 states generated (29,026,381 s/min), 1,851,840,844 distinct states found (1,778,336 ds/min), 172,472,809 states left on queue. +Progress(45) at 2024-11-07 05:12:09: 23,996,137,153 states generated (28,856,245 s/min), 1,853,907,711 distinct states found (2,066,867 ds/min), 172,514,422 states left on queue. +Progress(45) at 2024-11-07 05:13:09: 24,025,003,271 states generated (28,866,118 s/min), 1,855,881,596 distinct states found (1,973,885 ds/min), 172,410,581 states left on queue. +Progress(45) at 2024-11-07 05:14:09: 24,053,998,968 states generated (28,995,697 s/min), 1,857,730,142 distinct states found (1,848,546 ds/min), 172,259,468 states left on queue. +Progress(45) at 2024-11-07 05:15:09: 24,082,780,775 states generated (28,781,807 s/min), 1,859,612,879 distinct states found (1,882,737 ds/min), 172,097,889 states left on queue. +Progress(45) at 2024-11-07 05:16:09: 24,111,843,462 states generated (29,062,687 s/min), 1,861,479,353 distinct states found (1,866,474 ds/min), 171,938,834 states left on queue. +Progress(45) at 2024-11-07 05:17:09: 24,140,987,153 states generated (29,143,691 s/min), 1,863,390,493 distinct states found (1,911,140 ds/min), 171,786,752 states left on queue. +Progress(45) at 2024-11-07 05:18:09: 24,170,023,897 states generated (29,036,744 s/min), 1,864,965,603 distinct states found (1,575,110 ds/min), 171,386,848 states left on queue. +Progress(45) at 2024-11-07 05:19:09: 24,198,987,772 states generated (28,963,875 s/min), 1,866,820,638 distinct states found (1,855,035 ds/min), 171,238,575 states left on queue. +Progress(45) at 2024-11-07 05:20:09: 24,227,820,740 states generated (28,832,968 s/min), 1,868,623,853 distinct states found (1,803,215 ds/min), 171,005,974 states left on queue. +Progress(45) at 2024-11-07 05:21:09: 24,256,712,636 states generated (28,891,896 s/min), 1,870,265,139 distinct states found (1,641,286 ds/min), 170,619,838 states left on queue. +Progress(45) at 2024-11-07 05:22:09: 24,285,792,587 states generated (29,079,951 s/min), 1,871,770,548 distinct states found (1,505,409 ds/min), 170,247,019 states left on queue. +Progress(45) at 2024-11-07 05:23:09: 24,315,021,618 states generated (29,229,031 s/min), 1,873,433,426 distinct states found (1,662,878 ds/min), 169,986,497 states left on queue. +Progress(45) at 2024-11-07 05:24:09: 24,343,972,976 states generated (28,951,358 s/min), 1,874,958,509 distinct states found (1,525,083 ds/min), 169,639,357 states left on queue. +Progress(45) at 2024-11-07 05:25:09: 24,372,818,044 states generated (28,845,068 s/min), 1,876,461,909 distinct states found (1,503,400 ds/min), 169,298,313 states left on queue. +Progress(45) at 2024-11-07 05:26:09: 24,401,879,839 states generated (29,061,795 s/min), 1,878,043,093 distinct states found (1,581,184 ds/min), 169,034,999 states left on queue. +Progress(45) at 2024-11-07 05:27:09: 24,431,117,440 states generated (29,237,601 s/min), 1,879,528,913 distinct states found (1,485,820 ds/min), 168,669,766 states left on queue. +Progress(45) at 2024-11-07 05:28:09: 24,460,565,564 states generated (29,448,124 s/min), 1,881,382,841 distinct states found (1,853,928 ds/min), 168,585,549 states left on queue. +Progress(45) at 2024-11-07 05:29:09: 24,489,842,320 states generated (29,276,756 s/min), 1,883,163,526 distinct states found (1,780,685 ds/min), 168,440,866 states left on queue. +Progress(45) at 2024-11-07 05:30:09: 24,519,309,785 states generated (29,467,465 s/min), 1,884,840,978 distinct states found (1,677,452 ds/min), 168,176,100 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 05:31:09) +Progress(45) at 2024-11-07 05:31:09: 24,548,699,426 states generated (29,389,641 s/min), 1,886,346,733 distinct states found (1,505,755 ds/min), 167,794,030 states left on queue. +Progress(45) at 2024-11-07 05:32:09: 24,577,454,860 states generated (28,755,434 s/min), 1,887,761,288 distinct states found (1,414,555 ds/min), 167,342,409 states left on queue. +Progress(45) at 2024-11-07 05:33:09: 24,606,401,929 states generated (28,947,069 s/min), 1,889,451,503 distinct states found (1,690,215 ds/min), 167,115,718 states left on queue. +Progress(45) at 2024-11-07 05:34:09: 24,635,080,181 states generated (28,678,252 s/min), 1,891,013,080 distinct states found (1,561,577 ds/min), 166,760,395 states left on queue. +Progress(45) at 2024-11-07 05:35:09: 24,663,912,233 states generated (28,832,052 s/min), 1,892,486,967 distinct states found (1,473,887 ds/min), 166,347,547 states left on queue. +Progress(45) at 2024-11-07 05:36:09: 24,692,601,003 states generated (28,688,770 s/min), 1,894,014,661 distinct states found (1,527,694 ds/min), 165,980,327 states left on queue. +Progress(45) at 2024-11-07 05:37:09: 24,721,596,280 states generated (28,995,277 s/min), 1,895,667,269 distinct states found (1,652,608 ds/min), 165,766,132 states left on queue. +Progress(45) at 2024-11-07 05:38:09: 24,750,737,270 states generated (29,140,990 s/min), 1,897,304,588 distinct states found (1,637,319 ds/min), 165,602,331 states left on queue. +Progress(45) at 2024-11-07 05:39:09: 24,779,762,621 states generated (29,025,351 s/min), 1,898,944,557 distinct states found (1,639,969 ds/min), 165,399,097 states left on queue. +Progress(45) at 2024-11-07 05:40:09: 24,808,890,636 states generated (29,128,015 s/min), 1,901,039,200 distinct states found (2,094,643 ds/min), 165,505,866 states left on queue. +Progress(45) at 2024-11-07 05:41:09: 24,837,834,330 states generated (28,943,694 s/min), 1,902,825,947 distinct states found (1,786,747 ds/min), 165,385,690 states left on queue. +Progress(45) at 2024-11-07 05:42:09: 24,866,749,194 states generated (28,914,864 s/min), 1,904,509,048 distinct states found (1,683,101 ds/min), 165,143,394 states left on queue. +Progress(45) at 2024-11-07 05:43:09: 24,895,891,462 states generated (29,142,268 s/min), 1,906,186,633 distinct states found (1,677,585 ds/min), 164,907,199 states left on queue. +Progress(45) at 2024-11-07 05:44:09: 24,924,929,592 states generated (29,038,130 s/min), 1,907,774,010 distinct states found (1,587,377 ds/min), 164,567,256 states left on queue. +Progress(45) at 2024-11-07 05:45:09: 24,953,854,731 states generated (28,925,139 s/min), 1,909,438,393 distinct states found (1,664,383 ds/min), 164,297,435 states left on queue. +Progress(45) at 2024-11-07 05:46:09: 24,982,773,173 states generated (28,918,442 s/min), 1,911,115,370 distinct states found (1,676,977 ds/min), 164,029,981 states left on queue. +Progress(45) at 2024-11-07 05:47:09: 25,011,681,639 states generated (28,908,466 s/min), 1,912,739,102 distinct states found (1,623,732 ds/min), 163,722,709 states left on queue. +Progress(45) at 2024-11-07 05:48:09: 25,040,624,886 states generated (28,943,247 s/min), 1,914,465,220 distinct states found (1,726,118 ds/min), 163,504,979 states left on queue. +Progress(45) at 2024-11-07 05:49:09: 25,069,369,631 states generated (28,744,745 s/min), 1,916,123,524 distinct states found (1,658,304 ds/min), 163,227,016 states left on queue. +Progress(45) at 2024-11-07 05:50:09: 25,098,381,973 states generated (29,012,342 s/min), 1,917,856,454 distinct states found (1,732,930 ds/min), 163,020,213 states left on queue. +Progress(45) at 2024-11-07 05:51:09: 25,127,432,010 states generated (29,050,037 s/min), 1,919,715,623 distinct states found (1,859,169 ds/min), 162,903,211 states left on queue. +Progress(45) at 2024-11-07 05:52:09: 25,156,554,852 states generated (29,122,842 s/min), 1,921,381,482 distinct states found (1,665,859 ds/min), 162,640,342 states left on queue. +Progress(45) at 2024-11-07 05:53:09: 25,185,439,752 states generated (28,884,900 s/min), 1,923,074,493 distinct states found (1,693,011 ds/min), 162,418,419 states left on queue. +Progress(45) at 2024-11-07 05:54:09: 25,214,250,620 states generated (28,810,868 s/min), 1,924,599,166 distinct states found (1,524,673 ds/min), 162,035,736 states left on queue. +Progress(45) at 2024-11-07 05:55:09: 25,243,065,684 states generated (28,815,064 s/min), 1,926,028,590 distinct states found (1,429,424 ds/min), 161,647,928 states left on queue. +Progress(45) at 2024-11-07 05:56:09: 25,272,074,106 states generated (29,008,422 s/min), 1,927,788,924 distinct states found (1,760,334 ds/min), 161,469,066 states left on queue. +Progress(45) at 2024-11-07 05:57:09: 25,300,916,527 states generated (28,842,421 s/min), 1,929,427,503 distinct states found (1,638,579 ds/min), 161,203,063 states left on queue. +Progress(45) at 2024-11-07 05:58:09: 25,329,617,957 states generated (28,701,430 s/min), 1,931,016,200 distinct states found (1,588,697 ds/min), 160,883,828 states left on queue. +Progress(45) at 2024-11-07 05:59:09: 25,358,305,874 states generated (28,687,917 s/min), 1,932,700,683 distinct states found (1,684,483 ds/min), 160,613,534 states left on queue. +Progress(45) at 2024-11-07 06:00:09: 25,387,060,807 states generated (28,754,933 s/min), 1,934,352,908 distinct states found (1,652,225 ds/min), 160,340,594 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 06:01:10) +Progress(45) at 2024-11-07 06:01:10: 25,416,167,383 states generated (29,106,576 s/min), 1,936,031,185 distinct states found (1,678,277 ds/min), 160,024,096 states left on queue. +Progress(45) at 2024-11-07 06:02:10: 25,444,775,068 states generated (28,607,685 s/min), 1,937,531,864 distinct states found (1,500,679 ds/min), 159,558,759 states left on queue. +Progress(45) at 2024-11-07 06:03:10: 25,473,218,014 states generated (28,442,946 s/min), 1,938,932,593 distinct states found (1,400,729 ds/min), 159,031,186 states left on queue. +Progress(45) at 2024-11-07 06:04:10: 25,502,153,601 states generated (28,935,587 s/min), 1,940,366,906 distinct states found (1,434,313 ds/min), 158,550,067 states left on queue. +Progress(45) at 2024-11-07 06:05:10: 25,531,409,924 states generated (29,256,323 s/min), 1,942,031,081 distinct states found (1,664,175 ds/min), 158,260,393 states left on queue. +Progress(45) at 2024-11-07 06:06:10: 25,560,798,500 states generated (29,388,576 s/min), 1,943,755,697 distinct states found (1,724,616 ds/min), 158,001,838 states left on queue. +Progress(45) at 2024-11-07 06:07:10: 25,590,101,236 states generated (29,302,736 s/min), 1,945,659,191 distinct states found (1,903,494 ds/min), 157,894,541 states left on queue. +Progress(45) at 2024-11-07 06:08:10: 25,619,347,006 states generated (29,245,770 s/min), 1,947,436,584 distinct states found (1,777,393 ds/min), 157,703,839 states left on queue. +Progress(45) at 2024-11-07 06:09:10: 25,648,466,795 states generated (29,119,789 s/min), 1,949,039,117 distinct states found (1,602,533 ds/min), 157,391,298 states left on queue. +Progress(45) at 2024-11-07 06:10:10: 25,677,360,883 states generated (28,894,088 s/min), 1,950,787,656 distinct states found (1,748,539 ds/min), 157,176,854 states left on queue. +Progress(45) at 2024-11-07 06:11:10: 25,706,625,655 states generated (29,264,772 s/min), 1,952,700,166 distinct states found (1,912,510 ds/min), 157,069,408 states left on queue. +Progress(46) at 2024-11-07 06:12:10: 25,735,830,172 states generated (29,204,517 s/min), 1,954,444,069 distinct states found (1,743,903 ds/min), 156,852,227 states left on queue. +Progress(46) at 2024-11-07 06:13:10: 25,764,811,792 states generated (28,981,620 s/min), 1,956,165,433 distinct states found (1,721,364 ds/min), 156,618,900 states left on queue. +Progress(46) at 2024-11-07 06:14:10: 25,793,740,486 states generated (28,928,694 s/min), 1,957,961,862 distinct states found (1,796,429 ds/min), 156,441,787 states left on queue. +Progress(46) at 2024-11-07 06:15:10: 25,822,741,831 states generated (29,001,345 s/min), 1,959,749,416 distinct states found (1,787,554 ds/min), 156,253,838 states left on queue. +Progress(46) at 2024-11-07 06:16:10: 25,851,804,688 states generated (29,062,857 s/min), 1,961,466,422 distinct states found (1,717,006 ds/min), 155,977,351 states left on queue. +Progress(46) at 2024-11-07 06:17:10: 25,880,868,584 states generated (29,063,896 s/min), 1,963,090,742 distinct states found (1,624,320 ds/min), 155,628,145 states left on queue. +Progress(46) at 2024-11-07 06:18:10: 25,909,824,307 states generated (28,955,723 s/min), 1,964,570,100 distinct states found (1,479,358 ds/min), 155,182,107 states left on queue. +Progress(46) at 2024-11-07 06:19:10: 25,938,584,425 states generated (28,760,118 s/min), 1,966,303,642 distinct states found (1,733,542 ds/min), 154,946,766 states left on queue. +Progress(46) at 2024-11-07 06:20:10: 25,967,304,223 states generated (28,719,798 s/min), 1,967,883,207 distinct states found (1,579,565 ds/min), 154,558,935 states left on queue. +Progress(46) at 2024-11-07 06:21:10: 25,996,402,469 states generated (29,098,246 s/min), 1,969,591,000 distinct states found (1,707,793 ds/min), 154,302,069 states left on queue. +Progress(46) at 2024-11-07 06:22:10: 26,025,623,943 states generated (29,221,474 s/min), 1,971,434,403 distinct states found (1,843,403 ds/min), 154,157,059 states left on queue. +Progress(46) at 2024-11-07 06:23:10: 26,055,038,054 states generated (29,414,111 s/min), 1,973,261,720 distinct states found (1,827,317 ds/min), 153,981,317 states left on queue. +Progress(46) at 2024-11-07 06:24:10: 26,083,986,220 states generated (28,948,166 s/min), 1,974,670,648 distinct states found (1,408,928 ds/min), 153,508,388 states left on queue. +Progress(46) at 2024-11-07 06:25:10: 26,113,067,907 states generated (29,081,687 s/min), 1,976,391,547 distinct states found (1,720,899 ds/min), 153,263,845 states left on queue. +Progress(46) at 2024-11-07 06:26:10: 26,142,186,839 states generated (29,118,932 s/min), 1,978,379,881 distinct states found (1,988,334 ds/min), 153,253,200 states left on queue. +Progress(46) at 2024-11-07 06:27:10: 26,171,338,068 states generated (29,151,229 s/min), 1,980,293,569 distinct states found (1,913,688 ds/min), 153,185,559 states left on queue. +Progress(46) at 2024-11-07 06:28:10: 26,200,319,869 states generated (28,981,801 s/min), 1,982,130,034 distinct states found (1,836,465 ds/min), 153,039,826 states left on queue. +Progress(46) at 2024-11-07 06:29:10: 26,229,451,237 states generated (29,131,368 s/min), 1,984,117,981 distinct states found (1,987,947 ds/min), 153,030,792 states left on queue. +Progress(46) at 2024-11-07 06:30:10: 26,258,476,767 states generated (29,025,530 s/min), 1,985,981,073 distinct states found (1,863,092 ds/min), 152,917,939 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 06:31:11) +Progress(46) at 2024-11-07 06:31:11: 26,287,657,848 states generated (29,181,081 s/min), 1,987,875,178 distinct states found (1,894,105 ds/min), 152,784,901 states left on queue. +Progress(46) at 2024-11-07 06:32:11: 26,316,549,803 states generated (28,891,955 s/min), 1,989,821,141 distinct states found (1,945,963 ds/min), 152,728,813 states left on queue. +Progress(46) at 2024-11-07 06:33:11: 26,345,570,902 states generated (29,021,099 s/min), 1,991,762,973 distinct states found (1,941,832 ds/min), 152,648,662 states left on queue. +Progress(46) at 2024-11-07 06:34:11: 26,374,519,051 states generated (28,948,149 s/min), 1,993,605,958 distinct states found (1,842,985 ds/min), 152,446,201 states left on queue. +Progress(46) at 2024-11-07 06:35:11: 26,403,403,284 states generated (28,884,233 s/min), 1,995,379,328 distinct states found (1,773,370 ds/min), 152,189,032 states left on queue. +Progress(46) at 2024-11-07 06:36:11: 26,432,512,518 states generated (29,109,234 s/min), 1,997,205,848 distinct states found (1,826,520 ds/min), 152,060,823 states left on queue. +Progress(46) at 2024-11-07 06:37:11: 26,461,635,963 states generated (29,123,445 s/min), 1,999,221,288 distinct states found (2,015,440 ds/min), 152,052,317 states left on queue. +Progress(46) at 2024-11-07 06:38:11: 26,490,692,408 states generated (29,056,445 s/min), 2,001,003,940 distinct states found (1,782,652 ds/min), 151,869,333 states left on queue. +Progress(46) at 2024-11-07 06:39:11: 26,519,611,691 states generated (28,919,283 s/min), 2,002,772,264 distinct states found (1,768,324 ds/min), 151,637,576 states left on queue. +Progress(46) at 2024-11-07 06:40:11: 26,548,405,773 states generated (28,794,082 s/min), 2,004,530,832 distinct states found (1,758,568 ds/min), 151,415,653 states left on queue. +Progress(46) at 2024-11-07 06:41:11: 26,577,168,173 states generated (28,762,400 s/min), 2,006,431,383 distinct states found (1,900,551 ds/min), 151,293,655 states left on queue. +Progress(46) at 2024-11-07 06:42:11: 26,606,013,565 states generated (28,845,392 s/min), 2,008,118,930 distinct states found (1,687,547 ds/min), 150,979,607 states left on queue. +Progress(46) at 2024-11-07 06:43:11: 26,634,840,454 states generated (28,826,889 s/min), 2,010,033,233 distinct states found (1,914,303 ds/min), 150,859,233 states left on queue. +Progress(46) at 2024-11-07 06:44:11: 26,663,791,564 states generated (28,951,110 s/min), 2,011,764,506 distinct states found (1,731,273 ds/min), 150,592,176 states left on queue. +Progress(46) at 2024-11-07 06:45:11: 26,692,845,560 states generated (29,053,996 s/min), 2,013,541,948 distinct states found (1,777,442 ds/min), 150,346,125 states left on queue. +Progress(46) at 2024-11-07 06:46:11: 26,721,838,462 states generated (28,992,902 s/min), 2,015,055,311 distinct states found (1,513,363 ds/min), 149,898,025 states left on queue. +Progress(46) at 2024-11-07 06:47:11: 26,750,784,724 states generated (28,946,262 s/min), 2,016,795,791 distinct states found (1,740,480 ds/min), 149,636,143 states left on queue. +Progress(46) at 2024-11-07 06:48:11: 26,779,537,729 states generated (28,753,005 s/min), 2,018,420,817 distinct states found (1,625,026 ds/min), 149,264,338 states left on queue. +Progress(46) at 2024-11-07 06:49:11: 26,808,414,064 states generated (28,876,335 s/min), 2,019,941,133 distinct states found (1,520,316 ds/min), 148,833,851 states left on queue. +Progress(46) at 2024-11-07 06:50:11: 26,837,552,895 states generated (29,138,831 s/min), 2,021,402,334 distinct states found (1,461,201 ds/min), 148,423,082 states left on queue. +Progress(46) at 2024-11-07 06:51:11: 26,866,488,521 states generated (28,935,626 s/min), 2,022,896,299 distinct states found (1,493,965 ds/min), 148,037,640 states left on queue. +Progress(46) at 2024-11-07 06:52:11: 26,895,259,654 states generated (28,771,133 s/min), 2,024,306,180 distinct states found (1,409,881 ds/min), 147,623,626 states left on queue. +Progress(46) at 2024-11-07 06:53:11: 26,924,324,639 states generated (29,064,985 s/min), 2,025,751,691 distinct states found (1,445,511 ds/min), 147,237,191 states left on queue. +Progress(46) at 2024-11-07 06:54:11: 26,953,575,306 states generated (29,250,667 s/min), 2,027,292,041 distinct states found (1,540,350 ds/min), 146,929,253 states left on queue. +Progress(46) at 2024-11-07 06:55:11: 26,982,863,734 states generated (29,288,428 s/min), 2,029,056,116 distinct states found (1,764,075 ds/min), 146,774,179 states left on queue. +Progress(46) at 2024-11-07 06:56:11: 27,012,217,899 states generated (29,354,165 s/min), 2,030,705,091 distinct states found (1,648,975 ds/min), 146,523,776 states left on queue. +Progress(46) at 2024-11-07 06:57:11: 27,041,431,406 states generated (29,213,507 s/min), 2,032,122,917 distinct states found (1,417,826 ds/min), 146,066,712 states left on queue. +Progress(46) at 2024-11-07 06:58:11: 27,070,230,233 states generated (28,798,827 s/min), 2,033,502,867 distinct states found (1,379,950 ds/min), 145,580,465 states left on queue. +Progress(46) at 2024-11-07 06:59:11: 27,099,119,410 states generated (28,889,177 s/min), 2,035,080,295 distinct states found (1,577,428 ds/min), 145,255,429 states left on queue. +Progress(46) at 2024-11-07 07:00:11: 27,127,802,546 states generated (28,683,136 s/min), 2,036,480,069 distinct states found (1,399,774 ds/min), 144,763,326 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 07:01:11) +Progress(46) at 2024-11-07 07:01:11: 27,156,729,000 states generated (28,926,454 s/min), 2,037,888,171 distinct states found (1,408,102 ds/min), 144,282,188 states left on queue. +Progress(46) at 2024-11-07 07:02:11: 27,185,673,878 states generated (28,944,878 s/min), 2,039,404,499 distinct states found (1,516,328 ds/min), 143,933,899 states left on queue. +Progress(46) at 2024-11-07 07:03:11: 27,214,800,380 states generated (29,126,502 s/min), 2,040,991,907 distinct states found (1,587,408 ds/min), 143,736,528 states left on queue. +Progress(46) at 2024-11-07 07:04:11: 27,243,805,336 states generated (29,004,956 s/min), 2,042,560,493 distinct states found (1,568,586 ds/min), 143,474,607 states left on queue. +Progress(46) at 2024-11-07 07:05:11: 27,272,912,902 states generated (29,107,566 s/min), 2,044,549,687 distinct states found (1,989,194 ds/min), 143,501,934 states left on queue. +Progress(46) at 2024-11-07 07:06:11: 27,301,850,628 states generated (28,937,726 s/min), 2,046,213,816 distinct states found (1,664,129 ds/min), 143,280,971 states left on queue. +Progress(46) at 2024-11-07 07:07:11: 27,330,744,799 states generated (28,894,171 s/min), 2,047,777,121 distinct states found (1,563,305 ds/min), 142,943,602 states left on queue. +Progress(46) at 2024-11-07 07:08:11: 27,359,855,477 states generated (29,110,678 s/min), 2,049,356,015 distinct states found (1,578,894 ds/min), 142,631,188 states left on queue. +Progress(46) at 2024-11-07 07:09:11: 27,388,745,464 states generated (28,889,987 s/min), 2,050,822,496 distinct states found (1,466,481 ds/min), 142,190,439 states left on queue. +Progress(46) at 2024-11-07 07:10:11: 27,417,576,550 states generated (28,831,086 s/min), 2,052,379,523 distinct states found (1,557,027 ds/min), 141,821,153 states left on queue. +Progress(46) at 2024-11-07 07:11:11: 27,446,546,405 states generated (28,969,855 s/min), 2,053,934,499 distinct states found (1,554,976 ds/min), 141,462,097 states left on queue. +Progress(46) at 2024-11-07 07:12:11: 27,475,398,683 states generated (28,852,278 s/min), 2,055,510,649 distinct states found (1,576,150 ds/min), 141,116,110 states left on queue. +Progress(46) at 2024-11-07 07:13:11: 27,504,113,194 states generated (28,714,511 s/min), 2,057,051,677 distinct states found (1,541,028 ds/min), 140,743,906 states left on queue. +Progress(46) at 2024-11-07 07:14:11: 27,532,983,174 states generated (28,869,980 s/min), 2,058,669,649 distinct states found (1,617,972 ds/min), 140,436,853 states left on queue. +Progress(46) at 2024-11-07 07:15:11: 27,562,088,285 states generated (29,105,111 s/min), 2,060,404,146 distinct states found (1,734,497 ds/min), 140,213,296 states left on queue. +Progress(46) at 2024-11-07 07:16:11: 27,591,079,273 states generated (28,990,988 s/min), 2,061,979,907 distinct states found (1,575,761 ds/min), 139,895,056 states left on queue. +Progress(46) at 2024-11-07 07:17:11: 27,619,876,413 states generated (28,797,140 s/min), 2,063,482,225 distinct states found (1,502,318 ds/min), 139,506,174 states left on queue. +Progress(46) at 2024-11-07 07:18:11: 27,648,595,649 states generated (28,719,236 s/min), 2,064,847,355 distinct states found (1,365,130 ds/min), 139,035,783 states left on queue. +Progress(46) at 2024-11-07 07:19:11: 27,677,544,192 states generated (28,948,543 s/min), 2,066,507,355 distinct states found (1,660,000 ds/min), 138,783,592 states left on queue. +Progress(46) at 2024-11-07 07:20:11: 27,706,306,461 states generated (28,762,269 s/min), 2,068,019,192 distinct states found (1,511,837 ds/min), 138,418,256 states left on queue. +Progress(46) at 2024-11-07 07:21:11: 27,734,873,733 states generated (28,567,272 s/min), 2,069,467,142 distinct states found (1,447,950 ds/min), 137,977,630 states left on queue. +Progress(46) at 2024-11-07 07:22:11: 27,763,678,204 states generated (28,804,471 s/min), 2,071,034,824 distinct states found (1,567,682 ds/min), 137,622,296 states left on queue. +Progress(46) at 2024-11-07 07:23:11: 27,792,322,332 states generated (28,644,128 s/min), 2,072,586,226 distinct states found (1,551,402 ds/min), 137,231,762 states left on queue. +Progress(46) at 2024-11-07 07:24:11: 27,821,040,127 states generated (28,717,795 s/min), 2,074,030,831 distinct states found (1,444,605 ds/min), 136,731,600 states left on queue. +Progress(46) at 2024-11-07 07:25:11: 27,849,404,654 states generated (28,364,527 s/min), 2,075,273,409 distinct states found (1,242,578 ds/min), 136,082,131 states left on queue. +Progress(46) at 2024-11-07 07:26:11: 27,878,356,417 states generated (28,951,763 s/min), 2,076,656,601 distinct states found (1,383,192 ds/min), 135,570,796 states left on queue. +Progress(46) at 2024-11-07 07:27:11: 27,907,776,802 states generated (29,420,385 s/min), 2,078,383,391 distinct states found (1,726,790 ds/min), 135,306,248 states left on queue. +Progress(46) at 2024-11-07 07:28:11: 27,937,070,294 states generated (29,293,492 s/min), 2,080,076,828 distinct states found (1,693,437 ds/min), 135,034,380 states left on queue. +Progress(46) at 2024-11-07 07:29:11: 27,966,287,907 states generated (29,217,613 s/min), 2,081,855,223 distinct states found (1,778,395 ds/min), 134,839,763 states left on queue. +Progress(46) at 2024-11-07 07:30:11: 27,995,330,759 states generated (29,042,852 s/min), 2,083,372,197 distinct states found (1,516,974 ds/min), 134,461,641 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 07:31:12) +Progress(46) at 2024-11-07 07:31:12: 28,024,387,579 states generated (29,056,820 s/min), 2,085,018,193 distinct states found (1,645,996 ds/min), 134,150,126 states left on queue. +Progress(46) at 2024-11-07 07:32:12: 28,053,564,379 states generated (29,176,800 s/min), 2,086,850,158 distinct states found (1,831,965 ds/min), 133,983,084 states left on queue. +Progress(46) at 2024-11-07 07:33:12: 28,082,556,747 states generated (28,992,368 s/min), 2,088,444,271 distinct states found (1,594,113 ds/min), 133,651,269 states left on queue. +Progress(47) at 2024-11-07 07:34:12: 28,111,323,007 states generated (28,766,260 s/min), 2,090,072,790 distinct states found (1,628,519 ds/min), 133,350,640 states left on queue. +Progress(47) at 2024-11-07 07:35:12: 28,140,191,163 states generated (28,868,156 s/min), 2,091,740,224 distinct states found (1,667,434 ds/min), 133,070,266 states left on queue. +Progress(47) at 2024-11-07 07:36:12: 28,169,054,601 states generated (28,863,438 s/min), 2,093,375,975 distinct states found (1,635,751 ds/min), 132,752,319 states left on queue. +Progress(47) at 2024-11-07 07:37:12: 28,197,994,162 states generated (28,939,561 s/min), 2,094,929,793 distinct states found (1,553,818 ds/min), 132,356,738 states left on queue. +Progress(47) at 2024-11-07 07:38:12: 28,226,808,491 states generated (28,814,329 s/min), 2,096,311,441 distinct states found (1,381,648 ds/min), 131,832,292 states left on queue. +Progress(47) at 2024-11-07 07:39:12: 28,255,451,016 states generated (28,642,525 s/min), 2,097,907,185 distinct states found (1,595,744 ds/min), 131,487,862 states left on queue. +Progress(47) at 2024-11-07 07:40:12: 28,284,015,286 states generated (28,564,270 s/min), 2,099,332,452 distinct states found (1,425,267 ds/min), 130,982,897 states left on queue. +Progress(47) at 2024-11-07 07:41:12: 28,313,051,806 states generated (29,036,520 s/min), 2,101,053,792 distinct states found (1,721,340 ds/min), 130,744,522 states left on queue. +Progress(47) at 2024-11-07 07:42:12: 28,342,348,160 states generated (29,296,354 s/min), 2,102,778,970 distinct states found (1,725,178 ds/min), 130,505,777 states left on queue. +Progress(47) at 2024-11-07 07:43:12: 28,371,533,935 states generated (29,185,775 s/min), 2,104,337,778 distinct states found (1,558,808 ds/min), 130,144,304 states left on queue. +Progress(47) at 2024-11-07 07:44:12: 28,400,351,066 states generated (28,817,131 s/min), 2,105,835,284 distinct states found (1,497,506 ds/min), 129,719,871 states left on queue. +Progress(47) at 2024-11-07 07:45:12: 28,429,411,463 states generated (29,060,397 s/min), 2,107,704,752 distinct states found (1,869,468 ds/min), 129,618,749 states left on queue. +Progress(47) at 2024-11-07 07:46:12: 28,458,488,093 states generated (29,076,630 s/min), 2,109,483,825 distinct states found (1,779,073 ds/min), 129,439,723 states left on queue. +Progress(47) at 2024-11-07 07:47:12: 28,487,338,391 states generated (28,850,298 s/min), 2,111,230,358 distinct states found (1,746,533 ds/min), 129,232,124 states left on queue. +Progress(47) at 2024-11-07 07:48:12: 28,516,411,931 states generated (29,073,540 s/min), 2,113,150,785 distinct states found (1,920,427 ds/min), 129,168,385 states left on queue. +Progress(47) at 2024-11-07 07:49:12: 28,545,299,037 states generated (28,887,106 s/min), 2,114,878,071 distinct states found (1,727,286 ds/min), 128,948,735 states left on queue. +Progress(47) at 2024-11-07 07:50:12: 28,574,186,091 states generated (28,887,054 s/min), 2,116,622,746 distinct states found (1,744,675 ds/min), 128,711,386 states left on queue. +Progress(47) at 2024-11-07 07:51:12: 28,603,057,442 states generated (28,871,351 s/min), 2,118,435,710 distinct states found (1,812,964 ds/min), 128,543,573 states left on queue. +Progress(47) at 2024-11-07 07:52:12: 28,632,042,720 states generated (28,985,278 s/min), 2,120,240,818 distinct states found (1,805,108 ds/min), 128,349,742 states left on queue. +Progress(47) at 2024-11-07 07:53:12: 28,660,885,097 states generated (28,842,377 s/min), 2,121,904,885 distinct states found (1,664,067 ds/min), 128,002,987 states left on queue. +Progress(47) at 2024-11-07 07:54:12: 28,689,690,902 states generated (28,805,805 s/min), 2,123,498,767 distinct states found (1,593,882 ds/min), 127,622,035 states left on queue. +Progress(47) at 2024-11-07 07:55:12: 28,718,827,206 states generated (29,136,304 s/min), 2,125,375,087 distinct states found (1,876,320 ds/min), 127,518,682 states left on queue. +Progress(47) at 2024-11-07 07:56:12: 28,747,988,287 states generated (29,161,081 s/min), 2,127,234,055 distinct states found (1,858,968 ds/min), 127,390,123 states left on queue. +Progress(47) at 2024-11-07 07:57:12: 28,776,918,449 states generated (28,930,162 s/min), 2,128,896,639 distinct states found (1,662,584 ds/min), 127,099,202 states left on queue. +Progress(47) at 2024-11-07 07:58:12: 28,805,826,521 states generated (28,908,072 s/min), 2,130,485,896 distinct states found (1,589,257 ds/min), 126,731,846 states left on queue. +Progress(47) at 2024-11-07 07:59:12: 28,834,550,061 states generated (28,723,540 s/min), 2,132,267,049 distinct states found (1,781,153 ds/min), 126,524,859 states left on queue. +Progress(47) at 2024-11-07 08:00:12: 28,863,218,037 states generated (28,667,976 s/min), 2,133,901,471 distinct states found (1,634,422 ds/min), 126,149,810 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 08:01:13) +Progress(47) at 2024-11-07 08:01:13: 28,892,405,277 states generated (29,187,240 s/min), 2,135,683,266 distinct states found (1,781,795 ds/min), 125,938,046 states left on queue. +Progress(47) at 2024-11-07 08:02:13: 28,921,188,007 states generated (28,782,730 s/min), 2,137,299,589 distinct states found (1,616,323 ds/min), 125,575,223 states left on queue. +Progress(47) at 2024-11-07 08:03:13: 28,950,198,581 states generated (29,010,574 s/min), 2,138,945,715 distinct states found (1,646,126 ds/min), 125,225,825 states left on queue. +Progress(47) at 2024-11-07 08:04:13: 28,979,052,322 states generated (28,853,741 s/min), 2,140,384,312 distinct states found (1,438,597 ds/min), 124,739,890 states left on queue. +Progress(47) at 2024-11-07 08:05:13: 29,007,862,556 states generated (28,810,234 s/min), 2,142,020,690 distinct states found (1,636,378 ds/min), 124,389,570 states left on queue. +Progress(47) at 2024-11-07 08:06:13: 29,036,639,997 states generated (28,777,441 s/min), 2,143,436,769 distinct states found (1,416,079 ds/min), 123,853,456 states left on queue. +Progress(47) at 2024-11-07 08:07:13: 29,065,681,489 states generated (29,041,492 s/min), 2,144,841,718 distinct states found (1,404,949 ds/min), 123,385,042 states left on queue. +Progress(47) at 2024-11-07 08:08:13: 29,094,462,032 states generated (28,780,543 s/min), 2,146,214,867 distinct states found (1,373,149 ds/min), 122,908,921 states left on queue. +Progress(47) at 2024-11-07 08:09:13: 29,123,289,758 states generated (28,827,726 s/min), 2,147,553,984 distinct states found (1,339,117 ds/min), 122,446,193 states left on queue. +Progress(47) at 2024-11-07 08:10:13: 29,152,503,386 states generated (29,213,628 s/min), 2,148,942,911 distinct states found (1,388,927 ds/min), 122,030,640 states left on queue. +Progress(47) at 2024-11-07 08:11:13: 29,181,728,737 states generated (29,225,351 s/min), 2,150,631,619 distinct states found (1,688,708 ds/min), 121,816,919 states left on queue. +Progress(47) at 2024-11-07 08:12:13: 29,211,003,478 states generated (29,274,741 s/min), 2,152,175,254 distinct states found (1,543,635 ds/min), 121,489,774 states left on queue. +Progress(47) at 2024-11-07 08:13:13: 29,240,102,268 states generated (29,098,790 s/min), 2,153,537,952 distinct states found (1,362,698 ds/min), 120,992,206 states left on queue. +Progress(47) at 2024-11-07 08:14:13: 29,268,843,458 states generated (28,741,190 s/min), 2,154,896,522 distinct states found (1,358,570 ds/min), 120,481,830 states left on queue. +Progress(47) at 2024-11-07 08:15:13: 29,297,458,982 states generated (28,615,524 s/min), 2,156,228,693 distinct states found (1,332,171 ds/min), 119,935,590 states left on queue. +Progress(47) at 2024-11-07 08:16:13: 29,326,133,934 states generated (28,674,952 s/min), 2,157,558,222 distinct states found (1,329,529 ds/min), 119,402,611 states left on queue. +Progress(47) at 2024-11-07 08:17:13: 29,355,133,179 states generated (28,999,245 s/min), 2,159,036,229 distinct states found (1,478,007 ds/min), 119,059,305 states left on queue. +Progress(47) at 2024-11-07 08:18:13: 29,384,094,216 states generated (28,961,037 s/min), 2,160,401,726 distinct states found (1,365,497 ds/min), 118,659,528 states left on queue. +Progress(47) at 2024-11-07 08:19:13: 29,413,210,497 states generated (29,116,281 s/min), 2,162,252,062 distinct states found (1,850,336 ds/min), 118,605,990 states left on queue. +Progress(47) at 2024-11-07 08:20:13: 29,442,123,726 states generated (28,913,229 s/min), 2,163,968,572 distinct states found (1,716,510 ds/min), 118,430,828 states left on queue. +Progress(47) at 2024-11-07 08:21:13: 29,470,933,813 states generated (28,810,087 s/min), 2,165,411,802 distinct states found (1,443,230 ds/min), 118,017,068 states left on queue. +Progress(47) at 2024-11-07 08:22:13: 29,499,968,878 states generated (29,035,065 s/min), 2,166,884,069 distinct states found (1,472,267 ds/min), 117,620,342 states left on queue. +Progress(47) at 2024-11-07 08:23:13: 29,528,752,811 states generated (28,783,933 s/min), 2,168,252,577 distinct states found (1,368,508 ds/min), 117,101,560 states left on queue. +Progress(47) at 2024-11-07 08:24:13: 29,557,568,598 states generated (28,815,787 s/min), 2,169,705,158 distinct states found (1,452,581 ds/min), 116,651,662 states left on queue. +Progress(47) at 2024-11-07 08:25:13: 29,586,373,945 states generated (28,805,347 s/min), 2,171,138,563 distinct states found (1,433,405 ds/min), 116,184,414 states left on queue. +Progress(47) at 2024-11-07 08:26:13: 29,614,983,668 states generated (28,609,723 s/min), 2,172,585,802 distinct states found (1,447,239 ds/min), 115,737,683 states left on queue. +Progress(47) at 2024-11-07 08:27:13: 29,643,800,320 states generated (28,816,652 s/min), 2,174,078,381 distinct states found (1,492,579 ds/min), 115,326,021 states left on queue. +Progress(47) at 2024-11-07 08:28:13: 29,672,907,645 states generated (29,107,325 s/min), 2,175,677,520 distinct states found (1,599,139 ds/min), 114,997,885 states left on queue. +Progress(47) at 2024-11-07 08:29:13: 29,701,705,556 states generated (28,797,911 s/min), 2,177,190,856 distinct states found (1,513,336 ds/min), 114,628,087 states left on queue. +Progress(47) at 2024-11-07 08:30:13: 29,730,412,995 states generated (28,707,439 s/min), 2,178,512,609 distinct states found (1,321,753 ds/min), 114,087,841 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 08:31:13) +Progress(47) at 2024-11-07 08:31:13: 29,759,387,383 states generated (28,974,388 s/min), 2,180,040,685 distinct states found (1,528,076 ds/min), 113,747,306 states left on queue. +Progress(47) at 2024-11-07 08:32:13: 29,788,001,065 states generated (28,613,682 s/min), 2,181,419,698 distinct states found (1,379,013 ds/min), 113,276,454 states left on queue. +Progress(47) at 2024-11-07 08:33:13: 29,816,483,253 states generated (28,482,188 s/min), 2,182,794,509 distinct states found (1,374,811 ds/min), 112,764,181 states left on queue. +Progress(47) at 2024-11-07 08:34:13: 29,845,032,133 states generated (28,548,880 s/min), 2,184,180,577 distinct states found (1,386,068 ds/min), 112,275,854 states left on queue. +Progress(47) at 2024-11-07 08:35:13: 29,873,704,121 states generated (28,671,988 s/min), 2,185,610,616 distinct states found (1,430,039 ds/min), 111,765,886 states left on queue. +Progress(47) at 2024-11-07 08:36:13: 29,901,983,007 states generated (28,278,886 s/min), 2,186,742,865 distinct states found (1,132,249 ds/min), 111,037,502 states left on queue. +Progress(47) at 2024-11-07 08:37:13: 29,931,128,222 states generated (29,145,215 s/min), 2,188,247,053 distinct states found (1,504,188 ds/min), 110,610,871 states left on queue. +Progress(47) at 2024-11-07 08:38:13: 29,960,291,600 states generated (29,163,378 s/min), 2,189,791,380 distinct states found (1,544,327 ds/min), 110,219,347 states left on queue. +Progress(47) at 2024-11-07 08:39:13: 29,989,426,093 states generated (29,134,493 s/min), 2,191,523,686 distinct states found (1,732,306 ds/min), 109,988,090 states left on queue. +Progress(47) at 2024-11-07 08:40:13: 30,018,419,613 states generated (28,993,520 s/min), 2,192,983,724 distinct states found (1,460,038 ds/min), 109,567,153 states left on queue. +Progress(47) at 2024-11-07 08:41:13: 30,047,169,261 states generated (28,749,648 s/min), 2,194,485,325 distinct states found (1,501,601 ds/min), 109,159,610 states left on queue. +Progress(47) at 2024-11-07 08:42:13: 30,076,320,011 states generated (29,150,750 s/min), 2,196,261,852 distinct states found (1,776,527 ds/min), 108,952,775 states left on queue. +Progress(47) at 2024-11-07 08:43:13: 30,105,246,939 states generated (28,926,928 s/min), 2,197,745,917 distinct states found (1,484,065 ds/min), 108,533,801 states left on queue. +Progress(47) at 2024-11-07 08:44:13: 30,134,017,722 states generated (28,770,783 s/min), 2,199,274,846 distinct states found (1,528,929 ds/min), 108,138,210 states left on queue. +Progress(48) at 2024-11-07 08:45:13: 30,162,850,009 states generated (28,832,287 s/min), 2,200,818,695 distinct states found (1,543,849 ds/min), 107,749,686 states left on queue. +Progress(48) at 2024-11-07 08:46:13: 30,191,763,541 states generated (28,913,532 s/min), 2,202,269,881 distinct states found (1,451,186 ds/min), 107,274,074 states left on queue. +Progress(48) at 2024-11-07 08:47:13: 30,220,450,821 states generated (28,687,280 s/min), 2,203,579,369 distinct states found (1,309,488 ds/min), 106,693,506 states left on queue. +Progress(48) at 2024-11-07 08:48:13: 30,249,109,647 states generated (28,658,826 s/min), 2,204,980,828 distinct states found (1,401,459 ds/min), 106,171,815 states left on queue. +Progress(48) at 2024-11-07 08:49:13: 30,278,004,502 states generated (28,894,855 s/min), 2,206,546,641 distinct states found (1,565,813 ds/min), 105,800,017 states left on queue. +Progress(48) at 2024-11-07 08:50:13: 30,307,176,628 states generated (29,172,126 s/min), 2,208,173,395 distinct states found (1,626,754 ds/min), 105,492,735 states left on queue. +Progress(48) at 2024-11-07 08:51:13: 30,336,267,563 states generated (29,090,935 s/min), 2,209,629,083 distinct states found (1,455,688 ds/min), 105,046,561 states left on queue. +Progress(48) at 2024-11-07 08:52:13: 30,365,237,300 states generated (28,969,737 s/min), 2,211,221,126 distinct states found (1,592,043 ds/min), 104,707,696 states left on queue. +Progress(48) at 2024-11-07 08:53:13: 30,394,270,909 states generated (29,033,609 s/min), 2,212,948,437 distinct states found (1,727,311 ds/min), 104,490,104 states left on queue. +Progress(48) at 2024-11-07 08:54:13: 30,423,140,115 states generated (28,869,206 s/min), 2,214,640,116 distinct states found (1,691,679 ds/min), 104,243,061 states left on queue. +Progress(48) at 2024-11-07 08:55:13: 30,452,062,605 states generated (28,922,490 s/min), 2,216,327,939 distinct states found (1,687,823 ds/min), 103,983,745 states left on queue. +Progress(48) at 2024-11-07 08:56:13: 30,481,071,056 states generated (29,008,451 s/min), 2,217,983,905 distinct states found (1,655,966 ds/min), 103,702,586 states left on queue. +Progress(48) at 2024-11-07 08:57:13: 30,509,808,031 states generated (28,736,975 s/min), 2,219,662,593 distinct states found (1,678,688 ds/min), 103,423,522 states left on queue. +Progress(48) at 2024-11-07 08:58:13: 30,538,616,862 states generated (28,808,831 s/min), 2,221,288,821 distinct states found (1,626,228 ds/min), 103,098,334 states left on queue. +Progress(48) at 2024-11-07 08:59:13: 30,567,539,949 states generated (28,923,087 s/min), 2,222,969,669 distinct states found (1,680,848 ds/min), 102,811,145 states left on queue. +Progress(48) at 2024-11-07 09:00:13: 30,596,220,572 states generated (28,680,623 s/min), 2,224,451,086 distinct states found (1,481,417 ds/min), 102,320,643 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 09:01:14) +Progress(48) at 2024-11-07 09:01:14: 30,625,254,005 states generated (29,033,433 s/min), 2,225,971,213 distinct states found (1,520,127 ds/min), 101,895,678 states left on queue. +Progress(48) at 2024-11-07 09:02:14: 30,654,316,875 states generated (29,062,870 s/min), 2,227,776,007 distinct states found (1,804,794 ds/min), 101,720,925 states left on queue. +Progress(48) at 2024-11-07 09:03:14: 30,683,368,837 states generated (29,051,962 s/min), 2,229,520,592 distinct states found (1,744,585 ds/min), 101,516,049 states left on queue. +Progress(48) at 2024-11-07 09:04:14: 30,712,221,770 states generated (28,852,933 s/min), 2,231,006,576 distinct states found (1,485,984 ds/min), 101,059,951 states left on queue. +Progress(48) at 2024-11-07 09:05:14: 30,740,916,958 states generated (28,695,188 s/min), 2,232,634,565 distinct states found (1,627,989 ds/min), 100,742,863 states left on queue. +Progress(48) at 2024-11-07 09:06:14: 30,769,477,527 states generated (28,560,569 s/min), 2,234,099,495 distinct states found (1,464,930 ds/min), 100,237,731 states left on queue. +Progress(48) at 2024-11-07 09:07:14: 30,798,306,365 states generated (28,828,838 s/min), 2,235,757,510 distinct states found (1,658,015 ds/min), 99,936,798 states left on queue. +Progress(48) at 2024-11-07 09:08:14: 30,827,145,014 states generated (28,838,649 s/min), 2,237,323,374 distinct states found (1,565,864 ds/min), 99,542,928 states left on queue. +Progress(48) at 2024-11-07 09:09:14: 30,855,967,384 states generated (28,822,370 s/min), 2,238,712,445 distinct states found (1,389,071 ds/min), 98,994,892 states left on queue. +Progress(48) at 2024-11-07 09:10:14: 30,884,757,904 states generated (28,790,520 s/min), 2,240,211,537 distinct states found (1,499,092 ds/min), 98,555,003 states left on queue. +Progress(48) at 2024-11-07 09:11:14: 30,913,436,301 states generated (28,678,397 s/min), 2,241,549,402 distinct states found (1,337,865 ds/min), 97,972,368 states left on queue. +Progress(48) at 2024-11-07 09:12:14: 30,942,398,628 states generated (28,962,327 s/min), 2,242,894,478 distinct states found (1,345,076 ds/min), 97,450,191 states left on queue. +Progress(48) at 2024-11-07 09:13:14: 30,971,150,912 states generated (28,752,284 s/min), 2,244,149,533 distinct states found (1,255,055 ds/min), 96,915,440 states left on queue. +Progress(48) at 2024-11-07 09:14:14: 31,000,226,695 states generated (29,075,783 s/min), 2,245,486,253 distinct states found (1,336,720 ds/min), 96,453,711 states left on queue. +Progress(48) at 2024-11-07 09:15:14: 31,029,410,660 states generated (29,183,965 s/min), 2,247,033,348 distinct states found (1,547,095 ds/min), 96,134,910 states left on queue. +Progress(48) at 2024-11-07 09:16:14: 31,058,657,395 states generated (29,246,735 s/min), 2,248,447,081 distinct states found (1,413,733 ds/min), 95,701,875 states left on queue. +Progress(48) at 2024-11-07 09:17:14: 31,087,368,874 states generated (28,711,479 s/min), 2,249,703,997 distinct states found (1,256,916 ds/min), 95,112,797 states left on queue. +Progress(48) at 2024-11-07 09:18:14: 31,115,905,907 states generated (28,537,033 s/min), 2,250,949,093 distinct states found (1,245,096 ds/min), 94,499,889 states left on queue. +Progress(48) at 2024-11-07 09:19:14: 31,144,578,992 states generated (28,673,085 s/min), 2,252,226,995 distinct states found (1,277,902 ds/min), 93,927,098 states left on queue. +Progress(48) at 2024-11-07 09:20:14: 31,173,557,966 states generated (28,978,974 s/min), 2,253,602,196 distinct states found (1,375,201 ds/min), 93,561,559 states left on queue. +Progress(48) at 2024-11-07 09:21:14: 31,202,521,307 states generated (28,963,341 s/min), 2,255,224,149 distinct states found (1,621,953 ds/min), 93,337,000 states left on queue. +Progress(48) at 2024-11-07 09:22:14: 31,231,451,884 states generated (28,930,577 s/min), 2,256,879,564 distinct states found (1,655,415 ds/min), 93,119,996 states left on queue. +Progress(48) at 2024-11-07 09:23:14: 31,260,174,245 states generated (28,722,361 s/min), 2,258,206,514 distinct states found (1,326,950 ds/min), 92,610,216 states left on queue. +Progress(48) at 2024-11-07 09:24:14: 31,289,091,475 states generated (28,917,230 s/min), 2,259,564,810 distinct states found (1,358,296 ds/min), 92,123,452 states left on queue. +Progress(48) at 2024-11-07 09:25:14: 31,317,753,943 states generated (28,662,468 s/min), 2,260,868,559 distinct states found (1,303,749 ds/min), 91,550,997 states left on queue. +Progress(48) at 2024-11-07 09:26:14: 31,346,435,672 states generated (28,681,729 s/min), 2,262,197,433 distinct states found (1,328,874 ds/min), 91,002,731 states left on queue. +Progress(48) at 2024-11-07 09:27:14: 31,375,074,275 states generated (28,638,603 s/min), 2,263,549,308 distinct states found (1,351,875 ds/min), 90,479,028 states left on queue. +Progress(48) at 2024-11-07 09:28:14: 31,403,896,903 states generated (28,822,628 s/min), 2,264,999,048 distinct states found (1,449,740 ds/min), 90,030,284 states left on queue. +Progress(48) at 2024-11-07 09:29:14: 31,432,772,052 states generated (28,875,149 s/min), 2,266,431,878 distinct states found (1,432,830 ds/min), 89,580,165 states left on queue. +Progress(48) at 2024-11-07 09:30:14: 31,461,382,905 states generated (28,610,853 s/min), 2,267,701,315 distinct states found (1,269,437 ds/min), 89,008,135 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 09:31:15) +Progress(48) at 2024-11-07 09:31:15: 31,490,350,002 states generated (28,967,097 s/min), 2,269,120,991 distinct states found (1,419,676 ds/min), 88,574,899 states left on queue. +Progress(48) at 2024-11-07 09:32:15: 31,518,738,286 states generated (28,388,284 s/min), 2,270,333,667 distinct states found (1,212,676 ds/min), 87,950,800 states left on queue. +Progress(48) at 2024-11-07 09:33:15: 31,547,227,429 states generated (28,489,143 s/min), 2,271,632,491 distinct states found (1,298,824 ds/min), 87,379,110 states left on queue. +Progress(48) at 2024-11-07 09:34:15: 31,575,696,846 states generated (28,469,417 s/min), 2,272,873,166 distinct states found (1,240,675 ds/min), 86,717,955 states left on queue. +Progress(48) at 2024-11-07 09:35:15: 31,604,509,248 states generated (28,812,402 s/min), 2,274,166,128 distinct states found (1,292,962 ds/min), 86,122,414 states left on queue. +Progress(48) at 2024-11-07 09:36:15: 31,633,623,894 states generated (29,114,646 s/min), 2,275,690,739 distinct states found (1,524,611 ds/min), 85,718,820 states left on queue. +Progress(48) at 2024-11-07 09:37:15: 31,662,734,164 states generated (29,110,270 s/min), 2,277,282,041 distinct states found (1,591,302 ds/min), 85,389,121 states left on queue. +Progress(48) at 2024-11-07 09:38:15: 31,691,488,753 states generated (28,754,589 s/min), 2,278,666,982 distinct states found (1,384,941 ds/min), 84,903,119 states left on queue. +Progress(48) at 2024-11-07 09:39:15: 31,720,428,706 states generated (28,939,953 s/min), 2,280,231,311 distinct states found (1,564,329 ds/min), 84,529,794 states left on queue. +Progress(48) at 2024-11-07 09:40:15: 31,749,336,886 states generated (28,908,180 s/min), 2,281,688,218 distinct states found (1,456,907 ds/min), 84,091,511 states left on queue. +Progress(48) at 2024-11-07 09:41:15: 31,778,054,342 states generated (28,717,456 s/min), 2,283,102,693 distinct states found (1,414,475 ds/min), 83,605,316 states left on queue. +Progress(49) at 2024-11-07 09:42:15: 31,806,874,604 states generated (28,820,262 s/min), 2,284,525,902 distinct states found (1,423,209 ds/min), 83,115,134 states left on queue. +Progress(49) at 2024-11-07 09:43:15: 31,835,557,645 states generated (28,683,041 s/min), 2,285,776,893 distinct states found (1,250,991 ds/min), 82,491,419 states left on queue. +Progress(49) at 2024-11-07 09:44:15: 31,864,075,450 states generated (28,517,805 s/min), 2,287,028,991 distinct states found (1,252,098 ds/min), 81,847,819 states left on queue. +Progress(49) at 2024-11-07 09:45:15: 31,892,999,186 states generated (28,923,736 s/min), 2,288,552,140 distinct states found (1,523,149 ds/min), 81,459,937 states left on queue. +Progress(49) at 2024-11-07 09:46:15: 31,922,276,996 states generated (29,277,810 s/min), 2,290,137,668 distinct states found (1,585,528 ds/min), 81,115,285 states left on queue. +Progress(49) at 2024-11-07 09:47:15: 31,951,109,751 states generated (28,832,755 s/min), 2,291,477,001 distinct states found (1,339,333 ds/min), 80,582,606 states left on queue. +Progress(49) at 2024-11-07 09:48:15: 31,980,103,122 states generated (28,993,371 s/min), 2,293,149,633 distinct states found (1,672,632 ds/min), 80,321,900 states left on queue. +Progress(49) at 2024-11-07 09:49:15: 32,008,927,227 states generated (28,824,105 s/min), 2,294,737,299 distinct states found (1,587,666 ds/min), 79,988,982 states left on queue. +Progress(49) at 2024-11-07 09:50:15: 32,037,912,405 states generated (28,985,178 s/min), 2,296,369,269 distinct states found (1,631,970 ds/min), 79,688,340 states left on queue. +Progress(49) at 2024-11-07 09:51:15: 32,066,650,871 states generated (28,738,466 s/min), 2,297,881,682 distinct states found (1,512,413 ds/min), 79,285,058 states left on queue. +Progress(49) at 2024-11-07 09:52:15: 32,095,474,869 states generated (28,823,998 s/min), 2,299,386,856 distinct states found (1,505,174 ds/min), 78,860,285 states left on queue. +Progress(49) at 2024-11-07 09:53:15: 32,124,254,306 states generated (28,779,437 s/min), 2,300,974,245 distinct states found (1,587,389 ds/min), 78,501,509 states left on queue. +Progress(49) at 2024-11-07 09:54:15: 32,152,874,934 states generated (28,620,628 s/min), 2,302,313,494 distinct states found (1,339,249 ds/min), 77,908,264 states left on queue. +Progress(49) at 2024-11-07 09:55:15: 32,181,625,656 states generated (28,750,722 s/min), 2,303,719,911 distinct states found (1,406,417 ds/min), 77,409,147 states left on queue. +Progress(49) at 2024-11-07 09:56:15: 32,210,690,682 states generated (29,065,026 s/min), 2,305,458,559 distinct states found (1,738,648 ds/min), 77,178,015 states left on queue. +Progress(49) at 2024-11-07 09:57:15: 32,239,586,160 states generated (28,895,478 s/min), 2,307,003,156 distinct states found (1,544,597 ds/min), 76,805,818 states left on queue. +Progress(49) at 2024-11-07 09:58:15: 32,268,327,819 states generated (28,741,659 s/min), 2,308,436,891 distinct states found (1,433,735 ds/min), 76,324,212 states left on queue. +Progress(49) at 2024-11-07 09:59:15: 32,296,829,379 states generated (28,501,560 s/min), 2,309,831,948 distinct states found (1,395,057 ds/min), 75,779,735 states left on queue. +Progress(49) at 2024-11-07 10:00:15: 32,325,628,397 states generated (28,799,018 s/min), 2,311,380,882 distinct states found (1,548,934 ds/min), 75,395,162 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 10:01:15) +Progress(49) at 2024-11-07 10:01:15: 32,354,681,149 states generated (29,052,752 s/min), 2,312,867,979 distinct states found (1,487,097 ds/min), 74,928,503 states left on queue. +Progress(49) at 2024-11-07 10:02:15: 32,383,406,034 states generated (28,724,885 s/min), 2,314,202,265 distinct states found (1,334,286 ds/min), 74,352,680 states left on queue. +Progress(49) at 2024-11-07 10:03:15: 32,411,997,317 states generated (28,591,283 s/min), 2,315,435,082 distinct states found (1,232,817 ds/min), 73,700,708 states left on queue. +Progress(49) at 2024-11-07 10:04:15: 32,440,769,297 states generated (28,771,980 s/min), 2,316,687,791 distinct states found (1,252,709 ds/min), 73,114,003 states left on queue. +Progress(49) at 2024-11-07 10:05:15: 32,469,733,062 states generated (28,963,765 s/min), 2,317,885,762 distinct states found (1,197,971 ds/min), 72,558,372 states left on queue. +Progress(49) at 2024-11-07 10:06:15: 32,498,863,740 states generated (29,130,678 s/min), 2,319,353,511 distinct states found (1,467,749 ds/min), 72,186,248 states left on queue. +Progress(49) at 2024-11-07 10:07:15: 32,527,902,407 states generated (29,038,667 s/min), 2,320,635,445 distinct states found (1,281,934 ds/min), 71,639,893 states left on queue. +Progress(49) at 2024-11-07 10:08:15: 32,556,361,400 states generated (28,458,993 s/min), 2,321,793,726 distinct states found (1,158,281 ds/min), 70,954,333 states left on queue. +Progress(49) at 2024-11-07 10:09:15: 32,585,056,251 states generated (28,694,851 s/min), 2,323,009,155 distinct states found (1,215,429 ds/min), 70,362,671 states left on queue. +Progress(49) at 2024-11-07 10:10:15: 32,613,972,815 states generated (28,916,564 s/min), 2,324,321,084 distinct states found (1,311,929 ds/min), 69,935,186 states left on queue. +Progress(49) at 2024-11-07 10:11:15: 32,642,963,038 states generated (28,990,223 s/min), 2,325,997,874 distinct states found (1,676,790 ds/min), 69,730,871 states left on queue. +Progress(49) at 2024-11-07 10:12:15: 32,671,642,762 states generated (28,679,724 s/min), 2,327,294,217 distinct states found (1,296,343 ds/min), 69,221,413 states left on queue. +Progress(49) at 2024-11-07 10:13:15: 32,700,429,296 states generated (28,786,534 s/min), 2,328,535,742 distinct states found (1,241,525 ds/min), 68,635,066 states left on queue. +Progress(49) at 2024-11-07 10:14:15: 32,729,076,182 states generated (28,646,886 s/min), 2,329,760,071 distinct states found (1,224,329 ds/min), 67,997,735 states left on queue. +Progress(49) at 2024-11-07 10:15:15: 32,757,631,787 states generated (28,555,605 s/min), 2,331,002,517 distinct states found (1,242,446 ds/min), 67,379,374 states left on queue. +Progress(49) at 2024-11-07 10:16:15: 32,786,472,553 states generated (28,840,766 s/min), 2,332,364,440 distinct states found (1,361,923 ds/min), 66,856,953 states left on queue. +Progress(49) at 2024-11-07 10:17:15: 32,815,068,782 states generated (28,596,229 s/min), 2,333,629,799 distinct states found (1,265,359 ds/min), 66,266,973 states left on queue. +Progress(49) at 2024-11-07 10:18:15: 32,843,671,035 states generated (28,602,253 s/min), 2,334,875,787 distinct states found (1,245,988 ds/min), 65,714,901 states left on queue. +Progress(49) at 2024-11-07 10:19:15: 32,872,127,728 states generated (28,456,693 s/min), 2,336,030,334 distinct states found (1,154,547 ds/min), 65,023,805 states left on queue. +Progress(49) at 2024-11-07 10:20:15: 32,900,582,167 states generated (28,454,439 s/min), 2,337,180,611 distinct states found (1,150,277 ds/min), 64,304,348 states left on queue. +Progress(49) at 2024-11-07 10:21:15: 32,929,545,972 states generated (28,963,805 s/min), 2,338,488,833 distinct states found (1,308,222 ds/min), 63,715,470 states left on queue. +Progress(49) at 2024-11-07 10:22:15: 32,958,603,673 states generated (29,057,701 s/min), 2,339,992,330 distinct states found (1,503,497 ds/min), 63,307,968 states left on queue. +Progress(49) at 2024-11-07 10:23:15: 32,987,442,078 states generated (28,838,405 s/min), 2,341,335,966 distinct states found (1,343,636 ds/min), 62,792,292 states left on queue. +Progress(49) at 2024-11-07 10:24:15: 33,016,381,018 states generated (28,938,940 s/min), 2,342,828,482 distinct states found (1,492,516 ds/min), 62,365,394 states left on queue. +Progress(49) at 2024-11-07 10:25:15: 33,045,061,128 states generated (28,680,110 s/min), 2,344,118,515 distinct states found (1,290,033 ds/min), 61,789,542 states left on queue. +Progress(49) at 2024-11-07 10:26:15: 33,073,888,592 states generated (28,827,464 s/min), 2,345,475,829 distinct states found (1,357,314 ds/min), 61,253,128 states left on queue. +Progress(50) at 2024-11-07 10:27:15: 33,102,491,050 states generated (28,602,458 s/min), 2,346,652,625 distinct states found (1,176,796 ds/min), 60,570,177 states left on queue. +Progress(50) at 2024-11-07 10:28:15: 33,131,166,035 states generated (28,674,985 s/min), 2,347,941,873 distinct states found (1,289,248 ds/min), 59,969,815 states left on queue. +Progress(50) at 2024-11-07 10:29:15: 33,160,270,838 states generated (29,104,803 s/min), 2,349,441,004 distinct states found (1,499,131 ds/min), 59,570,847 states left on queue. +Progress(50) at 2024-11-07 10:30:15: 33,189,149,869 states generated (28,879,031 s/min), 2,350,812,706 distinct states found (1,371,702 ds/min), 59,068,202 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 10:31:16) +Progress(50) at 2024-11-07 10:31:16: 33,218,286,121 states generated (29,136,252 s/min), 2,352,357,375 distinct states found (1,544,669 ds/min), 58,692,343 states left on queue. +Progress(50) at 2024-11-07 10:32:16: 33,246,927,616 states generated (28,641,495 s/min), 2,353,796,993 distinct states found (1,439,618 ds/min), 58,245,674 states left on queue. +Progress(50) at 2024-11-07 10:33:16: 33,275,692,609 states generated (28,764,993 s/min), 2,355,282,278 distinct states found (1,485,285 ds/min), 57,825,713 states left on queue. +Progress(50) at 2024-11-07 10:34:16: 33,304,267,545 states generated (28,574,936 s/min), 2,356,681,270 distinct states found (1,398,992 ds/min), 57,325,849 states left on queue. +Progress(50) at 2024-11-07 10:35:16: 33,332,888,163 states generated (28,620,618 s/min), 2,358,099,683 distinct states found (1,418,413 ds/min), 56,833,993 states left on queue. +Progress(50) at 2024-11-07 10:36:16: 33,361,236,042 states generated (28,347,879 s/min), 2,359,281,358 distinct states found (1,181,675 ds/min), 56,126,890 states left on queue. +Progress(50) at 2024-11-07 10:37:16: 33,390,140,655 states generated (28,904,613 s/min), 2,360,868,517 distinct states found (1,587,159 ds/min), 55,791,859 states left on queue. +Progress(50) at 2024-11-07 10:38:16: 33,418,998,816 states generated (28,858,161 s/min), 2,362,363,780 distinct states found (1,495,263 ds/min), 55,385,255 states left on queue. +Progress(50) at 2024-11-07 10:39:16: 33,447,612,810 states generated (28,613,994 s/min), 2,363,728,858 distinct states found (1,365,078 ds/min), 54,854,942 states left on queue. +Progress(50) at 2024-11-07 10:40:16: 33,476,162,070 states generated (28,549,260 s/min), 2,365,099,267 distinct states found (1,370,409 ds/min), 54,312,039 states left on queue. +Progress(50) at 2024-11-07 10:41:16: 33,504,811,505 states generated (28,649,435 s/min), 2,366,473,549 distinct states found (1,374,282 ds/min), 53,784,809 states left on queue. +Progress(50) at 2024-11-07 10:42:16: 33,533,403,252 states generated (28,591,747 s/min), 2,367,734,253 distinct states found (1,260,704 ds/min), 53,158,819 states left on queue. +Progress(50) at 2024-11-07 10:43:16: 33,561,952,889 states generated (28,549,637 s/min), 2,368,855,124 distinct states found (1,120,871 ds/min), 52,441,471 states left on queue. +Progress(50) at 2024-11-07 10:44:16: 33,590,825,690 states generated (28,872,801 s/min), 2,370,054,403 distinct states found (1,199,279 ds/min), 51,878,202 states left on queue. +Progress(50) at 2024-11-07 10:45:16: 33,619,895,477 states generated (29,069,787 s/min), 2,371,355,035 distinct states found (1,300,632 ds/min), 51,382,836 states left on queue. +Progress(50) at 2024-11-07 10:46:16: 33,648,391,719 states generated (28,496,242 s/min), 2,372,441,699 distinct states found (1,086,664 ds/min), 50,647,071 states left on queue. +Progress(50) at 2024-11-07 10:47:16: 33,677,074,147 states generated (28,682,428 s/min), 2,373,600,507 distinct states found (1,158,808 ds/min), 50,052,421 states left on queue. +Progress(50) at 2024-11-07 10:48:16: 33,705,980,713 states generated (28,906,566 s/min), 2,375,050,402 distinct states found (1,449,895 ds/min), 49,692,912 states left on queue. +Progress(50) at 2024-11-07 10:49:16: 33,734,700,309 states generated (28,719,596 s/min), 2,376,355,805 distinct states found (1,305,403 ds/min), 49,202,990 states left on queue. +Progress(50) at 2024-11-07 10:50:16: 33,763,294,505 states generated (28,594,196 s/min), 2,377,489,014 distinct states found (1,133,209 ds/min), 48,526,991 states left on queue. +Progress(50) at 2024-11-07 10:51:16: 33,791,781,835 states generated (28,487,330 s/min), 2,378,610,114 distinct states found (1,121,100 ds/min), 47,806,234 states left on queue. +Progress(50) at 2024-11-07 10:52:16: 33,820,496,936 states generated (28,715,101 s/min), 2,379,861,294 distinct states found (1,251,180 ds/min), 47,194,112 states left on queue. +Progress(50) at 2024-11-07 10:53:16: 33,848,955,580 states generated (28,458,644 s/min), 2,381,018,247 distinct states found (1,156,953 ds/min), 46,544,595 states left on queue. +Progress(50) at 2024-11-07 10:54:16: 33,877,358,985 states generated (28,403,405 s/min), 2,382,084,162 distinct states found (1,065,915 ds/min), 45,797,353 states left on queue. +Progress(50) at 2024-11-07 10:55:16: 33,905,938,026 states generated (28,579,041 s/min), 2,383,237,725 distinct states found (1,153,563 ds/min), 45,079,182 states left on queue. +Progress(50) at 2024-11-07 10:56:16: 33,934,925,952 states generated (28,987,926 s/min), 2,384,648,770 distinct states found (1,411,045 ds/min), 44,602,865 states left on queue. +Progress(50) at 2024-11-07 10:57:16: 33,963,625,658 states generated (28,699,706 s/min), 2,385,892,826 distinct states found (1,244,056 ds/min), 44,000,281 states left on queue. +Progress(50) at 2024-11-07 10:58:16: 33,992,548,128 states generated (28,922,470 s/min), 2,387,290,030 distinct states found (1,397,204 ds/min), 43,514,140 states left on queue. +Progress(51) at 2024-11-07 10:59:16: 34,021,202,960 states generated (28,654,832 s/min), 2,388,511,227 distinct states found (1,221,197 ds/min), 42,867,785 states left on queue. +Progress(51) at 2024-11-07 11:00:16: 34,049,640,853 states generated (28,437,893 s/min), 2,389,565,989 distinct states found (1,054,762 ds/min), 42,084,713 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 11:01:17) +Progress(51) at 2024-11-07 11:01:17: 34,079,102,421 states generated (29,461,568 s/min), 2,391,039,395 distinct states found (1,473,406 ds/min), 41,644,463 states left on queue. +Progress(51) at 2024-11-07 11:02:17: 34,107,932,294 states generated (28,829,873 s/min), 2,392,415,920 distinct states found (1,376,525 ds/min), 41,153,999 states left on queue. +Progress(51) at 2024-11-07 11:03:17: 34,136,619,823 states generated (28,687,529 s/min), 2,393,784,341 distinct states found (1,368,421 ds/min), 40,648,398 states left on queue. +Progress(51) at 2024-11-07 11:04:17: 34,165,416,573 states generated (28,796,750 s/min), 2,395,186,568 distinct states found (1,402,227 ds/min), 40,162,223 states left on queue. +Progress(51) at 2024-11-07 11:05:17: 34,193,934,145 states generated (28,517,572 s/min), 2,396,461,207 distinct states found (1,274,639 ds/min), 39,558,749 states left on queue. +Progress(51) at 2024-11-07 11:06:17: 34,222,437,146 states generated (28,503,001 s/min), 2,397,667,005 distinct states found (1,205,798 ds/min), 38,877,170 states left on queue. +Progress(51) at 2024-11-07 11:07:17: 34,251,162,633 states generated (28,725,487 s/min), 2,399,047,586 distinct states found (1,380,581 ds/min), 38,366,536 states left on queue. +Progress(51) at 2024-11-07 11:08:17: 34,280,005,309 states generated (28,842,676 s/min), 2,400,476,715 distinct states found (1,429,129 ds/min), 37,912,093 states left on queue. +Progress(51) at 2024-11-07 11:09:17: 34,308,388,681 states generated (28,383,372 s/min), 2,401,648,509 distinct states found (1,171,794 ds/min), 37,215,479 states left on queue. +Progress(51) at 2024-11-07 11:10:17: 34,337,086,557 states generated (28,697,876 s/min), 2,403,035,913 distinct states found (1,387,404 ds/min), 36,712,331 states left on queue. +Progress(51) at 2024-11-07 11:11:17: 34,365,565,315 states generated (28,478,758 s/min), 2,404,187,792 distinct states found (1,151,879 ds/min), 36,008,223 states left on queue. +Progress(51) at 2024-11-07 11:12:17: 34,394,280,845 states generated (28,715,530 s/min), 2,405,264,161 distinct states found (1,076,369 ds/min), 35,318,651 states left on queue. +Progress(51) at 2024-11-07 11:13:17: 34,423,292,173 states generated (29,011,328 s/min), 2,406,461,030 distinct states found (1,196,869 ds/min), 34,731,310 states left on queue. +Progress(51) at 2024-11-07 11:14:17: 34,451,717,631 states generated (28,425,458 s/min), 2,407,470,263 distinct states found (1,009,233 ds/min), 33,977,845 states left on queue. +Progress(51) at 2024-11-07 11:15:17: 34,480,582,848 states generated (28,865,217 s/min), 2,408,844,472 distinct states found (1,374,209 ds/min), 33,563,385 states left on queue. +Progress(51) at 2024-11-07 11:16:17: 34,509,255,375 states generated (28,672,527 s/min), 2,409,992,223 distinct states found (1,147,751 ds/min), 32,948,371 states left on queue. +Progress(51) at 2024-11-07 11:17:17: 34,537,627,156 states generated (28,371,781 s/min), 2,411,007,744 distinct states found (1,015,521 ds/min), 32,138,450 states left on queue. +Progress(51) at 2024-11-07 11:18:17: 34,566,104,650 states generated (28,477,494 s/min), 2,412,094,834 distinct states found (1,087,090 ds/min), 31,405,790 states left on queue. +Progress(51) at 2024-11-07 11:19:17: 34,594,468,421 states generated (28,363,771 s/min), 2,413,136,514 distinct states found (1,041,680 ds/min), 30,631,648 states left on queue. +Progress(51) at 2024-11-07 11:20:17: 34,623,282,746 states generated (28,814,325 s/min), 2,414,376,756 distinct states found (1,240,242 ds/min), 30,011,457 states left on queue. +Progress(51) at 2024-11-07 11:21:17: 34,652,013,328 states generated (28,730,582 s/min), 2,415,631,977 distinct states found (1,255,221 ds/min), 29,420,035 states left on queue. +Progress(51) at 2024-11-07 11:22:17: 34,680,708,001 states generated (28,694,673 s/min), 2,416,841,149 distinct states found (1,209,172 ds/min), 28,780,239 states left on queue. +Progress(52) at 2024-11-07 11:23:17: 34,709,197,697 states generated (28,489,696 s/min), 2,417,931,157 distinct states found (1,090,008 ds/min), 28,033,256 states left on queue. +Progress(52) at 2024-11-07 11:24:17: 34,738,057,742 states generated (28,860,045 s/min), 2,419,214,866 distinct states found (1,283,709 ds/min), 27,476,210 states left on queue. +Progress(52) at 2024-11-07 11:25:17: 34,766,795,719 states generated (28,737,977 s/min), 2,420,575,203 distinct states found (1,360,337 ds/min), 26,973,510 states left on queue. +Progress(52) at 2024-11-07 11:26:17: 34,795,409,801 states generated (28,614,082 s/min), 2,421,852,170 distinct states found (1,276,967 ds/min), 26,383,152 states left on queue. +Progress(52) at 2024-11-07 11:27:17: 34,823,871,413 states generated (28,461,612 s/min), 2,423,018,118 distinct states found (1,165,948 ds/min), 25,687,358 states left on queue. +Progress(52) at 2024-11-07 11:28:17: 34,852,452,267 states generated (28,580,854 s/min), 2,424,258,491 distinct states found (1,240,373 ds/min), 25,061,677 states left on queue. +Progress(52) at 2024-11-07 11:29:17: 34,881,109,110 states generated (28,656,843 s/min), 2,425,536,450 distinct states found (1,277,959 ds/min), 24,485,682 states left on queue. +Progress(52) at 2024-11-07 11:30:17: 34,909,638,357 states generated (28,529,247 s/min), 2,426,766,241 distinct states found (1,229,791 ds/min), 23,851,800 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 11:31:18) +Progress(52) at 2024-11-07 11:31:18: 34,938,217,205 states generated (28,578,848 s/min), 2,427,804,784 distinct states found (1,038,543 ds/min), 23,061,400 states left on queue. +Progress(52) at 2024-11-07 11:32:18: 34,967,089,391 states generated (28,872,186 s/min), 2,428,907,251 distinct states found (1,102,467 ds/min), 22,421,037 states left on queue. +Progress(52) at 2024-11-07 11:33:18: 34,995,531,710 states generated (28,442,319 s/min), 2,429,963,142 distinct states found (1,055,891 ds/min), 21,740,235 states left on queue. +Progress(52) at 2024-11-07 11:34:18: 35,024,141,172 states generated (28,609,462 s/min), 2,431,122,150 distinct states found (1,159,008 ds/min), 21,149,288 states left on queue. +Progress(52) at 2024-11-07 11:35:18: 35,052,351,960 states generated (28,210,788 s/min), 2,432,077,858 distinct states found (955,708 ds/min), 20,295,072 states left on queue. +Progress(52) at 2024-11-07 11:36:18: 35,080,654,028 states generated (28,302,068 s/min), 2,433,061,991 distinct states found (984,133 ds/min), 19,478,746 states left on queue. +Progress(52) at 2024-11-07 11:37:18: 35,109,293,099 states generated (28,639,071 s/min), 2,434,258,110 distinct states found (1,196,119 ds/min), 18,850,062 states left on queue. +Progress(53) at 2024-11-07 11:38:18: 35,137,874,307 states generated (28,581,208 s/min), 2,435,408,538 distinct states found (1,150,428 ds/min), 18,171,042 states left on queue. +Progress(53) at 2024-11-07 11:39:18: 35,166,493,712 states generated (28,619,405 s/min), 2,436,567,034 distinct states found (1,158,496 ds/min), 17,510,811 states left on queue. +Progress(53) at 2024-11-07 11:40:18: 35,195,076,188 states generated (28,582,476 s/min), 2,437,810,887 distinct states found (1,243,853 ds/min), 16,916,098 states left on queue. +Progress(53) at 2024-11-07 11:41:18: 35,223,492,769 states generated (28,416,581 s/min), 2,438,939,934 distinct states found (1,129,047 ds/min), 16,200,301 states left on queue. +Progress(53) at 2024-11-07 11:42:18: 35,252,026,035 states generated (28,533,266 s/min), 2,440,130,151 distinct states found (1,190,217 ds/min), 15,545,447 states left on queue. +Progress(53) at 2024-11-07 11:43:18: 35,280,482,465 states generated (28,456,430 s/min), 2,441,297,027 distinct states found (1,166,876 ds/min), 14,879,990 states left on queue. +Progress(53) at 2024-11-07 11:44:18: 35,308,940,796 states generated (28,458,331 s/min), 2,442,317,453 distinct states found (1,020,426 ds/min), 14,116,803 states left on queue. +Progress(53) at 2024-11-07 11:45:18: 35,337,597,306 states generated (28,656,510 s/min), 2,443,328,791 distinct states found (1,011,338 ds/min), 13,403,307 states left on queue. +Progress(53) at 2024-11-07 11:46:18: 35,366,058,165 states generated (28,460,859 s/min), 2,444,336,498 distinct states found (1,007,707 ds/min), 12,657,418 states left on queue. +Progress(53) at 2024-11-07 11:47:18: 35,394,499,327 states generated (28,441,162 s/min), 2,445,346,072 distinct states found (1,009,574 ds/min), 11,856,670 states left on queue. +Progress(53) at 2024-11-07 11:48:18: 35,423,058,448 states generated (28,559,121 s/min), 2,446,449,527 distinct states found (1,103,455 ds/min), 11,150,850 states left on queue. +Progress(54) at 2024-11-07 11:49:18: 35,451,714,950 states generated (28,656,502 s/min), 2,447,608,246 distinct states found (1,158,719 ds/min), 10,497,489 states left on queue. +Progress(54) at 2024-11-07 11:50:18: 35,480,075,027 states generated (28,360,077 s/min), 2,448,668,413 distinct states found (1,060,167 ds/min), 9,734,924 states left on queue. +Progress(54) at 2024-11-07 11:51:18: 35,508,544,241 states generated (28,469,214 s/min), 2,449,793,995 distinct states found (1,125,582 ds/min), 9,041,108 states left on queue. +Progress(54) at 2024-11-07 11:52:18: 35,537,058,894 states generated (28,514,653 s/min), 2,450,835,560 distinct states found (1,041,565 ds/min), 8,304,357 states left on queue. +Progress(54) at 2024-11-07 11:53:18: 35,565,617,770 states generated (28,558,876 s/min), 2,451,805,307 distinct states found (969,747 ds/min), 7,554,593 states left on queue. +Progress(54) at 2024-11-07 11:54:18: 35,594,096,319 states generated (28,478,549 s/min), 2,452,829,286 distinct states found (1,023,979 ds/min), 6,777,854 states left on queue. +Progress(55) at 2024-11-07 11:55:18: 35,622,658,049 states generated (28,561,730 s/min), 2,453,911,213 distinct states found (1,081,927 ds/min), 6,063,348 states left on queue. +Progress(55) at 2024-11-07 11:56:18: 35,651,019,108 states generated (28,361,059 s/min), 2,454,944,844 distinct states found (1,033,631 ds/min), 5,290,297 states left on queue. +Progress(55) at 2024-11-07 11:57:18: 35,679,577,103 states generated (28,557,995 s/min), 2,455,941,484 distinct states found (996,640 ds/min), 4,540,257 states left on queue. +Progress(55) at 2024-11-07 11:58:18: 35,708,050,230 states generated (28,473,127 s/min), 2,456,911,566 distinct states found (970,082 ds/min), 3,737,722 states left on queue. +Progress(55) at 2024-11-07 11:59:18: 35,736,484,911 states generated (28,434,681 s/min), 2,457,942,176 distinct states found (1,030,610 ds/min), 2,980,348 states left on queue. +Progress(56) at 2024-11-07 12:00:18: 35,765,029,620 states generated (28,544,709 s/min), 2,458,911,346 distinct states found (969,170 ds/min), 2,201,353 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 12:01:18) +Progress(57) at 2024-11-07 12:01:18: 35,793,733,161 states generated (28,703,541 s/min), 2,459,897,228 distinct states found (985,882 ds/min), 1,411,705 states left on queue. +Progress(58) at 2024-11-07 12:02:18: 35,822,110,432 states generated (28,377,271 s/min), 2,460,820,961 distinct states found (923,733 ds/min), 587,430 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 4.5 + based on the actual fingerprints: val = .25 +35840434685 states generated, 2461362509 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 67. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 8 and the 95th percentile is 2). +Finished in 20h 32min at (2024-11-07 12:03:02) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log new file mode 100644 index 0000000000..c43d52302b --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log @@ -0,0 +1,89 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 90 and seed 2164066158568118414 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 30788] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-13824636513165485309/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-13824636513165485309/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-13824636513165485309/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-13824636513165485309/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-13824636513165485309/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-13824636513165485309/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-13824636513165485309/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 12:09:33) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 12:09:36. +Progress(16) at 2024-11-06 12:09:39: 405,675 states generated (405,675 s/min), 18,042 distinct states found (18,042 ds/min), 7,612 states left on queue. +Progress(23) at 2024-11-06 12:10:39: 12,449,257 states generated (12,043,582 s/min), 467,293 distinct states found (449,251 ds/min), 161,057 states left on queue. +Progress(25) at 2024-11-06 12:11:39: 24,461,332 states generated (12,012,075 s/min), 861,011 distinct states found (393,718 ds/min), 267,072 states left on queue. +Progress(26) at 2024-11-06 12:12:39: 36,440,377 states generated (11,979,045 s/min), 1,234,052 distinct states found (373,041 ds/min), 355,372 states left on queue. +Progress(26) at 2024-11-06 12:13:39: 48,327,873 states generated (11,887,496 s/min), 1,583,736 distinct states found (349,684 ds/min), 425,209 states left on queue. +Progress(27) at 2024-11-06 12:14:39: 60,246,136 states generated (11,918,263 s/min), 1,933,499 distinct states found (349,763 ds/min), 494,269 states left on queue. +Progress(28) at 2024-11-06 12:15:39: 71,977,716 states generated (11,731,580 s/min), 2,265,302 distinct states found (331,803 ds/min), 553,777 states left on queue. +Progress(28) at 2024-11-06 12:16:39: 83,644,537 states generated (11,666,821 s/min), 2,575,451 distinct states found (310,149 ds/min), 594,142 states left on queue. +Progress(29) at 2024-11-06 12:17:39: 95,287,089 states generated (11,642,552 s/min), 2,888,793 distinct states found (313,342 ds/min), 639,273 states left on queue. +Progress(29) at 2024-11-06 12:18:39: 107,000,972 states generated (11,713,883 s/min), 3,194,255 distinct states found (305,462 ds/min), 673,353 states left on queue. +Progress(29) at 2024-11-06 12:19:39: 118,305,248 states generated (11,304,276 s/min), 3,467,775 distinct states found (273,520 ds/min), 692,915 states left on queue. +Progress(29) at 2024-11-06 12:20:39: 129,954,327 states generated (11,649,079 s/min), 3,763,186 distinct states found (295,411 ds/min), 720,349 states left on queue. +Progress(29) at 2024-11-06 12:21:39: 141,251,359 states generated (11,297,032 s/min), 4,020,407 distinct states found (257,221 ds/min), 724,036 states left on queue. +Progress(30) at 2024-11-06 12:22:39: 152,551,873 states generated (11,300,514 s/min), 4,284,278 distinct states found (263,871 ds/min), 733,726 states left on queue. +Progress(30) at 2024-11-06 12:23:39: 164,324,788 states generated (11,772,915 s/min), 4,569,569 distinct states found (285,291 ds/min), 746,476 states left on queue. +Progress(30) at 2024-11-06 12:24:39: 175,121,317 states generated (10,796,529 s/min), 4,779,505 distinct states found (209,936 ds/min), 723,070 states left on queue. +Progress(31) at 2024-11-06 12:25:39: 186,238,236 states generated (11,116,919 s/min), 5,016,034 distinct states found (236,529 ds/min), 712,944 states left on queue. +Progress(31) at 2024-11-06 12:26:39: 197,884,578 states generated (11,646,342 s/min), 5,276,094 distinct states found (260,060 ds/min), 705,471 states left on queue. +Progress(31) at 2024-11-06 12:27:39: 208,535,096 states generated (10,650,518 s/min), 5,463,450 distinct states found (187,356 ds/min), 665,661 states left on queue. +Progress(32) at 2024-11-06 12:28:39: 219,424,829 states generated (10,889,733 s/min), 5,673,673 distinct states found (210,223 ds/min), 637,975 states left on queue. +Progress(32) at 2024-11-06 12:29:39: 230,906,372 states generated (11,481,543 s/min), 5,903,516 distinct states found (229,843 ds/min), 606,255 states left on queue. +Progress(33) at 2024-11-06 12:30:39: 241,261,887 states generated (10,355,515 s/min), 6,065,731 distinct states found (162,215 ds/min), 552,728 states left on queue. +Progress(33) at 2024-11-06 12:31:39: 252,028,921 states generated (10,767,034 s/min), 6,255,487 distinct states found (189,756 ds/min), 509,620 states left on queue. +Progress(33) at 2024-11-06 12:32:39: 262,856,171 states generated (10,827,250 s/min), 6,431,063 distinct states found (175,576 ds/min), 448,834 states left on queue. +Progress(34) at 2024-11-06 12:33:39: 273,211,882 states generated (10,355,711 s/min), 6,586,644 distinct states found (155,581 ds/min), 386,905 states left on queue. +Progress(34) at 2024-11-06 12:34:39: 283,843,415 states generated (10,631,533 s/min), 6,743,916 distinct states found (157,272 ds/min), 315,135 states left on queue. +Progress(35) at 2024-11-06 12:35:39: 293,931,115 states generated (10,087,700 s/min), 6,878,405 distinct states found (134,489 ds/min), 241,126 states left on queue. +Progress(36) at 2024-11-06 12:36:39: 303,903,441 states generated (9,972,326 s/min), 6,996,394 distinct states found (117,989 ds/min), 152,775 states left on queue. +Progress(37) at 2024-11-06 12:37:39: 313,501,886 states generated (9,598,445 s/min), 7,093,031 distinct states found (96,637 ds/min), 54,009 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 1.2E-4 + based on the actual fingerprints: val = 2.1E-6 +318172398 states generated, 7127950 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 44. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3). +Finished in 28min 43s at (2024-11-06 12:38:16) From a0cd64c4d3b1cf9738f8ec1b3a5bf433e42cc03d Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 2 Dec 2024 18:24:48 +0100 Subject: [PATCH 29/65] Bump OTel, tracing, reqwest crates (#9970) --- Cargo.lock | 158 ++++++++++++++++++++++++++--------------------------- Cargo.toml | 16 +++--- 2 files changed, 86 insertions(+), 88 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ce27a7d45..ba02e3b11d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -185,7 +185,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "synstructure", ] @@ -197,7 +197,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -256,7 +256,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -267,7 +267,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -969,7 +969,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1198,7 +1198,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1615,7 +1615,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1626,7 +1626,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1749,7 +1749,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1769,7 +1769,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1792,7 +1792,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1815,7 +1815,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1947,7 +1947,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1980,7 +1980,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2234,7 +2234,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2337,7 +2337,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -3142,7 +3142,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -3515,9 +3515,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.24.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" +checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" dependencies = [ "futures-core", "futures-sink", @@ -3529,9 +3529,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.13.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab" +checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99" dependencies = [ "async-trait", "bytes", @@ -3542,9 +3542,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.17.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727" +checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd" dependencies = [ "async-trait", "futures-core", @@ -3560,9 +3560,9 @@ dependencies = [ [[package]] name = "opentelemetry-proto" -version = "0.7.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" +checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -3572,15 +3572,15 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.16.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05" +checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09" [[package]] name = "opentelemetry_sdk" -version = "0.24.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" +checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" dependencies = [ "async-trait", "futures-channel", @@ -3954,7 +3954,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4056,7 +4056,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4334,7 +4334,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4348,9 +4348,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -4424,7 +4424,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.52", + "syn 2.0.90", "tempfile", ] @@ -4438,7 +4438,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4992,9 +4992,9 @@ dependencies = [ [[package]] name = "reqwest-middleware" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" +checksum = "d1ccd3b55e711f91a9885a2fa6fbbb2e39db1776420b062efc058c6410f7e5e3" dependencies = [ "anyhow", "async-trait", @@ -5007,13 +5007,12 @@ dependencies = [ [[package]] name = "reqwest-retry" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" +checksum = "29c73e4195a6bfbcb174b790d9b3407ab90646976c55de58a6515da25d851178" dependencies = [ "anyhow", "async-trait", - "chrono", "futures", "getrandom 0.2.11", "http 1.1.0", @@ -5022,6 +5021,7 @@ dependencies = [ "reqwest 0.12.4", "reqwest-middleware", "retry-policies", + "thiserror", "tokio", "tracing", "wasm-timer", @@ -5029,9 +5029,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45" +checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2" dependencies = [ "anyhow", "async-trait", @@ -5047,12 +5047,10 @@ dependencies = [ [[package]] name = "retry-policies" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" +checksum = "5875471e6cab2871bc150ecb8c727db5113c9338cc3354dc5ee3425b6aa40a1c" dependencies = [ - "anyhow", - "chrono", "rand 0.8.5", ] @@ -5176,7 +5174,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.52", + "syn 2.0.90", "unicode-ident", ] @@ -5684,7 +5682,7 @@ checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -5766,7 +5764,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6139,7 +6137,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6190,9 +6188,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.52" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -6222,7 +6220,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6300,27 +6298,27 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6494,7 +6492,7 @@ checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6719,7 +6717,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6756,9 +6754,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "log", "pin-project-lite", @@ -6779,20 +6777,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", @@ -6821,9 +6819,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" +checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" dependencies = [ "js-sys", "once_cell", @@ -6839,9 +6837,9 @@ dependencies = [ [[package]] name = "tracing-serde" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" dependencies = [ "serde", "tracing-core", @@ -6849,9 +6847,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", "once_cell", @@ -7258,7 +7256,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "wasm-bindgen-shared", ] @@ -7292,7 +7290,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -7669,7 +7667,7 @@ dependencies = [ "smallvec", "spki 0.7.3", "subtle", - "syn 2.0.52", + "syn 2.0.90", "sync_wrapper 0.1.2", "tikv-jemalloc-sys", "time", @@ -7769,7 +7767,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -7790,7 +7788,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 64c384f17a..036dc01057 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,10 +127,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.24" -opentelemetry_sdk = "0.24" -opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.16" +opentelemetry = "0.26" +opentelemetry_sdk = "0.26" +opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.26" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -144,9 +144,9 @@ rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] } -reqwest-middleware = "0.3.0" -reqwest-retry = "0.5" +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] } +reqwest-middleware = "0.4" +reqwest-retry = "0.7" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -191,7 +191,7 @@ tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2" -tracing-opentelemetry = "0.25" +tracing-opentelemetry = "0.27" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } From 32ba9811f979f0ae31ee489202304becd728d511 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 2 Dec 2024 17:54:32 +0000 Subject: [PATCH 30/65] feat(proxy): emit JWT auth method and JWT issuer in parquet logs (#9971) Fix the HTTP AuthMethod to accomodate the JWT authorization method. Introduces the JWT issuer as an additional field in the parquet logs --- proxy/src/auth/backend/jwt.rs | 10 +++-- proxy/src/context/mod.rs | 9 +++++ proxy/src/context/parquet.rs | 53 +++++++++++++++------------ proxy/src/serverless/backend.rs | 4 ++ proxy/src/serverless/sql_over_http.rs | 3 -- 5 files changed, 49 insertions(+), 30 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 517d4fd34b..a258090b15 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -350,6 +350,13 @@ impl JwkCacheEntryLock { let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; let header = serde_json::from_slice::>(&header)?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; + + if let Some(iss) = &payload.issuer { + ctx.set_jwt_issuer(iss.as_ref().to_owned()); + } + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; @@ -388,9 +395,6 @@ impl JwkCacheEntryLock { key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; - let payload = serde_json::from_slice::>(&payloadb)?; - tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 4a063a5faa..a9fb513d3c 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -57,6 +57,7 @@ struct RequestContextInner { application: Option, error_kind: Option, pub(crate) auth_method: Option, + jwt_issuer: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, pg_options: Option, @@ -79,6 +80,7 @@ pub(crate) enum AuthMethod { ScramSha256, ScramSha256Plus, Cleartext, + Jwt, } impl Clone for RequestContext { @@ -100,6 +102,7 @@ impl Clone for RequestContext { application: inner.application.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), + jwt_issuer: inner.jwt_issuer.clone(), success: inner.success, rejected: inner.rejected, cold_start_info: inner.cold_start_info, @@ -148,6 +151,7 @@ impl RequestContext { application: None, error_kind: None, auth_method: None, + jwt_issuer: None, success: false, rejected: None, cold_start_info: ColdStartInfo::Unknown, @@ -246,6 +250,11 @@ impl RequestContext { this.auth_method = Some(auth_method); } + pub(crate) fn set_jwt_issuer(&self, jwt_issuer: String) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.jwt_issuer = Some(jwt_issuer); + } + pub fn has_private_peer_addr(&self) -> bool { self.0 .try_lock() diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index b375eb886e..3105d08526 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -87,6 +87,8 @@ pub(crate) struct RequestData { branch: Option, pg_options: Option, auth_method: Option<&'static str>, + jwt_issuer: Option, + error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass @@ -138,7 +140,9 @@ impl From<&RequestContextInner> for RequestData { super::AuthMethod::ScramSha256 => "scram_sha_256", super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", + super::AuthMethod::Jwt => "jwt", }), + jwt_issuer: value.jwt_issuer.clone(), protocol: value.protocol.as_str(), region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), @@ -519,6 +523,7 @@ mod tests { branch: Some(hex::encode(rng.gen::<[u8; 16]>())), pg_options: None, auth_method: None, + jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, @@ -599,15 +604,15 @@ mod tests { assert_eq!( file_stats, [ - (1312632, 3, 6000), - (1312621, 3, 6000), - (1312680, 3, 6000), - (1312637, 3, 6000), - (1312773, 3, 6000), - (1312610, 3, 6000), - (1312404, 3, 6000), - (1312639, 3, 6000), - (437848, 1, 2000) + (1313105, 3, 6000), + (1313094, 3, 6000), + (1313153, 3, 6000), + (1313110, 3, 6000), + (1313246, 3, 6000), + (1313083, 3, 6000), + (1312877, 3, 6000), + (1313112, 3, 6000), + (438020, 1, 2000) ] ); @@ -639,11 +644,11 @@ mod tests { assert_eq!( file_stats, [ - (1203465, 5, 10000), - (1203189, 5, 10000), - (1203490, 5, 10000), - (1203475, 5, 10000), - (1203729, 5, 10000) + (1204324, 5, 10000), + (1204048, 5, 10000), + (1204349, 5, 10000), + (1204334, 5, 10000), + (1204588, 5, 10000) ] ); @@ -668,15 +673,15 @@ mod tests { assert_eq!( file_stats, [ - (1312632, 3, 6000), - (1312621, 3, 6000), - (1312680, 3, 6000), - (1312637, 3, 6000), - (1312773, 3, 6000), - (1312610, 3, 6000), - (1312404, 3, 6000), - (1312639, 3, 6000), - (437848, 1, 2000) + (1313105, 3, 6000), + (1313094, 3, 6000), + (1313153, 3, 6000), + (1313110, 3, 6000), + (1313246, 3, 6000), + (1313083, 3, 6000), + (1312877, 3, 6000), + (1313112, 3, 6000), + (438020, 1, 2000) ] ); @@ -713,7 +718,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)] + [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 75909f3358..57846a4c2c 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -53,6 +53,8 @@ impl PoolingBackend { user_info: &ComputeUserInfo, password: &[u8], ) -> Result { + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; @@ -115,6 +117,8 @@ impl PoolingBackend { user_info: &ComputeUserInfo, jwt: String, ) -> Result { + ctx.set_auth_method(crate::context::AuthMethod::Jwt); + match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { self.config diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index afd93d02f0..a0ca7cc60d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -139,9 +139,6 @@ fn get_conn_info( headers: &HeaderMap, tls: Option<&TlsConfig>, ) -> Result { - // HTTP only uses cleartext (for now and likely always) - ctx.set_auth_method(crate::context::AuthMethod::Cleartext); - let connection_string = headers .get(&CONN_STRING) .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? From 84b48211185c818a8a788d412484035d561beaa4 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 2 Dec 2024 12:06:19 -0600 Subject: [PATCH 31/65] Stop changing the value of neon.extension_server_port at runtime (#9972) On reconfigure, we no longer passed a port for the extension server which caused us to not write out the neon.extension_server_port line. Thus, Postgres thought we were setting the port to the default value of 0. PGC_POSTMASTER GUCs cannot be set at runtime, which causes the following log messages: > LOG: parameter "neon.extension_server_port" cannot be changed without restarting the server > LOG: configuration file "/var/db/postgres/compute/pgdata/postgresql.conf" contains errors; unaffected changes were applied Fixes: https://github.com/neondatabase/neon/issues/9945 Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 9 ++------- compute_tools/src/compute.rs | 19 +++++++------------ compute_tools/src/config.rs | 6 ++---- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b178d7abd6..e73ccd908e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -335,6 +335,7 @@ fn wait_spec( pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), pgversion: get_pg_version_string(pgbin), + http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -389,7 +390,6 @@ fn wait_spec( Ok(WaitSpecResult { compute, - http_port, resize_swap_on_bind, set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), }) @@ -397,8 +397,6 @@ fn wait_spec( struct WaitSpecResult { compute: Arc, - // passed through from ProcessCliResult - http_port: u16, resize_swap_on_bind: bool, set_disk_quota_for_fs: Option, } @@ -408,7 +406,6 @@ fn start_postgres( #[allow(unused_variables)] matches: &clap::ArgMatches, WaitSpecResult { compute, - http_port, resize_swap_on_bind, set_disk_quota_for_fs, }: WaitSpecResult, @@ -481,12 +478,10 @@ fn start_postgres( } } - let extension_server_port: u16 = http_port; - // Start Postgres let mut pg = None; if !prestartup_failed { - pg = match compute.start_compute(extension_server_port) { + pg = match compute.start_compute() { Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:#}", err); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index da1caf1a9b..0d1e6d680f 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -79,6 +79,8 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, + /// The port that the compute's HTTP server listens on + pub http_port: u16, /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. /// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -611,11 +613,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. #[instrument(skip_all)] - pub fn prepare_pgdata( - &self, - compute_state: &ComputeState, - extension_server_port: u16, - ) -> Result<()> { + pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); @@ -625,7 +623,7 @@ impl ComputeNode { config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - Some(extension_server_port), + self.http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -1243,7 +1241,7 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, None)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; // TODO(ololobus): We need a concurrency during reconfiguration as well, // but DB is already running and used by user. We can easily get out of @@ -1284,10 +1282,7 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute( - &self, - extension_server_port: u16, - ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -1362,7 +1357,7 @@ impl ComputeNode { info!("{:?}", remote_ext_metrics); } - self.prepare_pgdata(&compute_state, extension_server_port)?; + self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d65fe73194..b257c8a68f 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -37,7 +37,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { pub fn write_postgres_conf( path: &Path, spec: &ComputeSpec, - extension_server_port: Option, + extension_server_port: u16, ) -> Result<()> { // File::create() destroys the file content if it exists. let mut file = File::create(path)?; @@ -127,9 +127,7 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } - if let Some(port) = extension_server_port { - writeln!(file, "neon.extension_server_port={}", port)?; - } + writeln!(file, "neon.extension_server_port={}", extension_server_port)?; // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. From c4987b0b13b9c311ea0f74169b90071323e14cff Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 2 Dec 2024 19:46:06 +0100 Subject: [PATCH 32/65] fix(testing): Use 1 MB shared_buffers even with LFC (#9969) ## Problem After enabling LFC in tests and lowering `shared_buffers` we started having more problems with `test_pg_regress`. ## Summary of changes Set `shared_buffers` to 1MB to both exercise getPage requests/LFC, and still have enough room for Postgres to operate. Everything smaller might be not enough for Postgres under load, and can cause errors like 'no unpinned buffers available'. See Konstantin's comment [1] as well. Fixes #9956 [1]: https://github.com/neondatabase/neon/issues/9956#issuecomment-2511608097 --- control_plane/src/endpoint.rs | 4 ++++ test_runner/fixtures/neon_fixtures.py | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 71514daa7c..1ca6dc43c4 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -310,6 +310,10 @@ impl Endpoint { conf.append("wal_log_hints", "off"); conf.append("max_replication_slots", "10"); conf.append("hot_standby", "on"); + // Set to 1MB to both exercise getPage requests/LFC, and still have enough room for + // Postgres to operate. Everything smaller might be not enough for Postgres under load, + // and can cause errors like 'no unpinned buffers available', see + // conf.append("shared_buffers", "1MB"); conf.append("fsync", "off"); conf.append("max_connections", "100"); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5709a3b82b..f55f06bebc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3801,13 +3801,11 @@ class Endpoint(PgProtocol, LogUtils): assert size_to_bytes(size) >= size_to_bytes( "1MB" ), "LFC size cannot be set less than 1MB" - # shared_buffers = 512kB to make postgres use LFC intensively - # neon.max_file_cache_size and neon.file_cache size limit are - # set to 1MB because small LFC is better for testing (helps to find more problems) lfc_path_escaped = str(lfc_path).replace("'", "''") config_lines = [ - "shared_buffers = 512kB", f"neon.file_cache_path = '{lfc_path_escaped}'", + # neon.max_file_cache_size and neon.file_cache size limits are + # set to 1MB because small LFC is better for testing (helps to find more problems) "neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB", ] + config_lines From 4a488b3e24af43289a6da6b0640ebcf7c83881f3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 08:59:38 +0000 Subject: [PATCH 33/65] storcon: use proper schedule context during node delete (#9958) ## Problem I was touching `test_storage_controller_node_deletion` because for AZ scheduling work I was adding a change to the storage controller (kick secondaries during optimisation) that made a FIXME in this test defunct. While looking at it I also realized that we can easily fix the way node deletion currently doesn't use a proper ScheduleContext, using the iterator type recently added for that purpose. ## Summary of changes - A testing-only behavior in storage controller where if a secondary location isn't yet ready during optimisation, it will be actively polled. - Remove workaround in `test_storage_controller_node_deletion` that previously was needed because optimisation would get stuck on cold secondaries. - Update node deletion code to use a `TenantShardContextIterator` and thereby a proper ScheduleContext --- storage_controller/src/service.rs | 112 ++++++++++++++---- test_runner/regress/test_sharding.py | 7 ++ .../regress/test_storage_controller.py | 8 +- 3 files changed, 96 insertions(+), 31 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 631fdb4923..52c9c4710d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5158,34 +5158,38 @@ impl Service { *nodes = Arc::new(nodes_mut); } - for (tenant_shard_id, shard) in tenants { - if shard.deref_node(node_id) { - // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise - // it won't properly do anti-affinity. - let mut schedule_context = ScheduleContext::default(); + for (_tenant_id, mut schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + { + for shard in shards { + if shard.deref_node(node_id) { + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!( + "Refusing to delete node, shard {} can't be rescheduled: {e}", + shard.tenant_shard_id + ); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {} away from node during deletion", + shard.tenant_shard_id + ) + } - if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { - // TODO: implement force flag to remove a node even if we can't reschedule - // a tenant - tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); - return Err(e.into()); - } else { - tracing::info!( - "Rescheduled shard {tenant_shard_id} away from node during deletion" - ) + self.maybe_reconcile_shard(shard, nodes); } - self.maybe_reconcile_shard(shard, nodes); + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); } - - // Here we remove an existing observed location for the node we're removing, and it will - // not be re-added by a reconciler's completion because we filter out removed nodes in - // process_result. - // - // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that - // means any reconciles we spawned will know about the node we're deleting, enabling them - // to do live migrations if it's still online. - shard.observed.locations.remove(&node_id); } scheduler.node_remove(node_id); @@ -6279,6 +6283,14 @@ impl Service { > DOWNLOAD_FRESHNESS_THRESHOLD { tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + + #[cfg(feature = "testing")] + if progress.heatmap_mtime.is_none() { + // No heatmap might mean the attached location has never uploaded one, or that + // the secondary download hasn't happened yet. This is relatively unusual in the field, + // but fairly common in tests. + self.kick_secondary_download(tenant_shard_id).await; + } } else { // Location looks ready: proceed tracing::info!( @@ -6293,6 +6305,58 @@ impl Service { validated_work } + /// Some aspects of scheduling optimisation wait for secondary locations to be warm. This + /// happens on multi-minute timescales in the field, which is fine because optimisation is meant + /// to be a lazy background thing. However, when testing, it is not practical to wait around, so + /// we have this helper to move things along faster. + #[cfg(feature = "testing")] + async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { + let (attached_node, secondary_node) = { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return; + }; + let (Some(attached), Some(secondary)) = ( + shard.intent.get_attached(), + shard.intent.get_secondary().first(), + ) else { + return; + }; + ( + locked.nodes.get(attached).unwrap().clone(), + locked.nodes.get(secondary).unwrap().clone(), + ) + }; + + // Make remote API calls to upload + download heatmaps: we ignore errors because this is just + // a 'kick' to let scheduling optimisation run more promptly. + attached_node + .with_client_retries( + |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + } + /// Look for shards which are oversized and in need of splitting async fn autosplit_tenants(self: &Arc) { let Some(split_threshold) = self.config.split_threshold else { diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index c86ba0d4ea..30abf91d3a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -519,6 +519,13 @@ def test_sharding_split_smoke( # We will have 2 shards per pageserver once done (including secondaries) neon_env_builder.num_pageservers = split_shard_count + # Two AZs + def assign_az(ps_cfg): + az = f"az-{(ps_cfg['id'] - 1) % 2}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given # in number of 8KiB pages. diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index e93e251b4f..685af5caaf 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2253,12 +2253,7 @@ def test_storage_controller_node_deletion( assert victim.id not in shard["node_secondary"] # Reconciles running during deletion should all complete - # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting - # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 - # test that hasn't uploaded any heatmaps for secondaries. - # In the interim, just do a reconcile_all to enable the consistency check. - # env.storage_controller.reconcile_until_idle() - env.storage_controller.reconcile_all() + env.storage_controller.reconcile_until_idle() # Controller should pass its own consistency checks env.storage_controller.consistency_check() @@ -2267,7 +2262,6 @@ def test_storage_controller_node_deletion( env.storage_controller.stop() env.storage_controller.start() assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] - env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. env.storage_controller.consistency_check() From 8963ac85f9695f529ded4104d2e34e29bed38b7b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 11:55:13 +0100 Subject: [PATCH 34/65] storcon_cli tenant-describe: include tenant-wide information in output (#9899) Before this PR, the storcon_cli didn't have a way to show the tenant-wide information of the TenantDescribeResponse. Sadly, the `Serialize` impl for the tenant config doesn't skip on `None`, so, the output becomes a bit bloated. Maybe we can use `skip_serializing_if(Option::is_none)` in the future. => https://github.com/neondatabase/neon/issues/9983 --- control_plane/storcon_cli/src/main.rs | 16 ++++++++++++++-- test_runner/regress/test_storage_controller.py | 4 ++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b7f38c6286..e879424532 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -560,14 +560,26 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::TenantDescribe { tenant_id } => { - let describe_response = storcon_client + let TenantDescribeResponse { + tenant_id, + shards, + stripe_size, + policy, + config, + } = storcon_client .dispatch::<(), TenantDescribeResponse>( Method::GET, format!("control/v1/tenant/{tenant_id}"), None, ) .await?; - let shards = describe_response.shards; + println!("Tenant {tenant_id}"); + let mut table = comfy_table::Table::new(); + table.add_row(["Policy", &format!("{:?}", policy)]); + table.add_row(["Stripe size", &format!("{:?}", stripe_size)]); + table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]); + println!("{table}"); + println!("Shards:"); let mut table = comfy_table::Table::new(); table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); for shard in shards { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 685af5caaf..244893a616 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1747,8 +1747,8 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # Describe a tenant tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) - assert len(tenant_lines) == 3 + shard_count * 2 - assert str(env.initial_tenant) in tenant_lines[3] + assert len(tenant_lines) >= 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[0] # Pause changes on a tenant storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) From dd76f1eeeee5eac0f7a606fc136f20df2e9218e1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 12:03:23 +0100 Subject: [PATCH 35/65] page_service: batching observability & include throttled time in smgr metrics (#9870) This PR - fixes smgr metrics https://github.com/neondatabase/neon/issues/9925 - adds an additional startup log line logging the current batching config - adds a histogram of batch sizes global and per-tenant - adds a metric exposing the current batching config The issue described #9925 is that before this PR, request latency was only observed *after* batching. This means that smgr latency metrics (most importantly getpage latency) don't account for - `wait_lsn` time - time spent waiting for batch to fill up / the executor stage to pick up the batch. The fix is to use a per-request batching timer, like we did before the initial batching PR. We funnel those timers through the entire request lifecycle. I noticed that even before the initial batching changes, we weren't accounting for the time spent writing & flushing the response to the wire. This PR drive-by fixes that deficiency by dropping the timers at the very end of processing the batch, i.e., after the `pgb.flush()` call. I was **unable to maintain the behavior that we deduct time-spent-in-throttle from various latency metrics. The reason is that we're using a *single* counter in `RequestContext` to track micros spent in throttle. But there are *N* metrics timers in the batch, one per request. As a consequence, the practice of consuming the counter in the drop handler of each timer no longer works because all but the first timer will encounter error `close() called on closed state`. A failed attempt to maintain the current behavior can be found in https://github.com/neondatabase/neon/pull/9951. So, this PR remvoes the deduction behavior from all metrics. I started a discussion on Slack about it the implications this has for our internal SLO calculation: https://neondb.slack.com/archives/C033RQ5SPDH/p1732910861704029 # Refs - fixes https://github.com/neondatabase/neon/issues/9925 - sub-issue https://github.com/neondatabase/neon/issues/9377 - epic: https://github.com/neondatabase/neon/issues/9376 --- pageserver/src/bin/pageserver.rs | 3 +- pageserver/src/context.rs | 5 - pageserver/src/context/optional_counter.rs | 101 ------ pageserver/src/metrics.rs | 249 +++++++------- pageserver/src/page_service.rs | 309 +++++++++++------- pageserver/src/pgdatadir_mapping.rs | 18 +- pageserver/src/tenant/throttle.rs | 17 +- pageserver/src/tenant/timeline.rs | 7 +- test_runner/fixtures/metrics.py | 1 + .../pageserver/test_page_service_batching.py | 28 +- .../test_pageserver_getpage_throttle.py | 31 +- 11 files changed, 373 insertions(+), 396 deletions(-) delete mode 100644 pageserver/src/context/optional_counter.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a8c2c2e992..31f4370855 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -127,6 +127,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.page_service_pipelining, "starting with page service pipelining config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. @@ -302,7 +303,7 @@ fn start_pageserver( pageserver::metrics::tokio_epoll_uring::Collector::new(), )) .unwrap(); - pageserver::preinitialize_metrics(); + pageserver::preinitialize_metrics(conf); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 7afcf52cf2..8f2177fe5b 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -91,8 +91,6 @@ use crate::task_mgr::TaskKind; -pub(crate) mod optional_counter; - // The main structure of this module, see module-level comment. #[derive(Debug)] pub struct RequestContext { @@ -100,7 +98,6 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, - pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -158,7 +155,6 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, - micros_spent_throttled: Default::default(), }, } } @@ -172,7 +168,6 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, - micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs deleted file mode 100644 index 100c649f18..0000000000 --- a/pageserver/src/context/optional_counter.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::{ - sync::atomic::{AtomicU32, Ordering}, - time::Duration, -}; - -#[derive(Debug)] -pub struct CounterU32 { - inner: AtomicU32, -} -impl Default for CounterU32 { - fn default() -> Self { - Self { - inner: AtomicU32::new(u32::MAX), - } - } -} -impl CounterU32 { - pub fn open(&self) -> Result<(), &'static str> { - match self - .inner - .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) - { - Ok(_) => Ok(()), - Err(_) => Err("open() called on clsoed state"), - } - } - pub fn close(&self) -> Result { - match self.inner.swap(u32::MAX, Ordering::Relaxed) { - u32::MAX => Err("close() called on closed state"), - x => Ok(x), - } - } - - pub fn add(&self, count: u32) -> Result<(), &'static str> { - if count == 0 { - return Ok(()); - } - let mut had_err = None; - self.inner - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { - u32::MAX => { - had_err = Some("add() called on closed state"); - None - } - x => { - let (new, overflowed) = x.overflowing_add(count); - if new == u32::MAX || overflowed { - had_err = Some("add() overflowed the counter"); - None - } else { - Some(new) - } - } - }) - .map_err(|_| had_err.expect("we set it whenever the function returns None")) - .map(|_| ()) - } -} - -#[derive(Default, Debug)] -pub struct MicroSecondsCounterU32 { - inner: CounterU32, -} - -impl MicroSecondsCounterU32 { - pub fn open(&self) -> Result<(), &'static str> { - self.inner.open() - } - pub fn add(&self, duration: Duration) -> Result<(), &'static str> { - match duration.as_micros().try_into() { - Ok(x) => self.inner.add(x), - Err(_) => Err("add(): duration conversion error"), - } - } - pub fn close_and_checked_sub_from(&self, from: Duration) -> Result { - let val = self.inner.close()?; - let val = Duration::from_micros(val as u64); - let subbed = match from.checked_sub(val) { - Some(v) => v, - None => return Err("Duration::checked_sub"), - }; - Ok(subbed) - } -} - -#[cfg(test)] -mod tests { - - use super::*; - - #[test] - fn test_basic() { - let counter = MicroSecondsCounterU32::default(); - counter.open().unwrap(); - counter.add(Duration::from_micros(23)).unwrap(); - let res = counter - .close_and_checked_sub_from(Duration::from_micros(42)) - .unwrap(); - assert_eq!(res, Duration::from_micros(42 - 23)); - } -} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 86be97587f..d04fae7627 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -7,6 +7,10 @@ use metrics::{ IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::config::{ + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + PageServiceProtocolPipelinedExecutionStrategy, +}; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; @@ -1216,50 +1220,21 @@ pub(crate) mod virtual_file_io_engine { }); } -struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - global_latency_histo: &'a Histogram, +pub(crate) struct SmgrOpTimer { + global_latency_histo: Histogram, // Optional because not all op types are tracked per-timeline - per_timeline_latency_histo: Option<&'a Histogram>, + per_timeline_latency_histo: Option, - ctx: &'c RequestContext, - start: std::time::Instant, - op: SmgrQueryType, - count: usize, + start: Instant, } -impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { +impl Drop for SmgrOpTimer { fn drop(&mut self) { - let elapsed = self.start.elapsed(); - let ex_throttled = self - .ctx - .micros_spent_throttled - .close_and_checked_sub_from(elapsed); - let ex_throttled = match ex_throttled { - Ok(res) => res, - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[self.op]; - rate_limit.call(|| { - warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); - }); - elapsed - } - }; - - for _ in 0..self.count { - self.global_latency_histo - .observe(ex_throttled.as_secs_f64()); - if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo { - per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64()); - } + let elapsed = self.start.elapsed().as_secs_f64(); + self.global_latency_histo.observe(elapsed); + if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { + per_timeline_getpage_histo.observe(elapsed); } } } @@ -1289,6 +1264,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { global_latency: [Histogram; SmgrQueryType::COUNT], per_timeline_getpage_started: IntCounter, per_timeline_getpage_latency: Histogram, + global_batch_size: Histogram, + per_timeline_batch_size: Histogram, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1381,6 +1358,76 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy> = Lazy::new(|| { + (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap()) + .map(|v| v.into()) + .collect() +}); + +static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_page_service_batch_size_global", + "Batch size of pageserver page service requests", + PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(), + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy> = Lazy::new(|| { + let mut buckets = Vec::new(); + for i in 0.. { + let bucket = 1 << i; + if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() { + break; + } + buckets.push(bucket.into()); + } + buckets +}); + +static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_page_service_batch_size", + "Batch size of pageserver page service requests", + &["tenant_id", "shard_id", "timeline_id"], + PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone() + ) + .expect("failed to define a metric") +}); + +pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_page_service_config_max_batch_size", + "Configured maximum batch size for the server-side batching functionality of page_service. \ + Labels expose more of the configuration parameters.", + &["mode", "execution"] + ) + .expect("failed to define a metric") +}); + +fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { + PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset(); + let (label_values, value) = match conf { + PageServicePipeliningConfig::Serial => (["serial", "-"], 1), + PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size, + execution, + }) => { + let mode = "pipelined"; + let execution = match execution { + PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => { + "concurrent-futures" + } + PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks", + }; + ([mode, execution], max_batch_size.get()) + } + }; + PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE + .with_label_values(&label_values) + .set(value.try_into().unwrap()); +} + impl SmgrQueryTimePerTimeline { pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -1416,78 +1463,51 @@ impl SmgrQueryTimePerTimeline { ]) .unwrap(); + let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone(); + let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + Self { global_started, global_latency, per_timeline_getpage_latency, per_timeline_getpage_started, + global_batch_size, + per_timeline_batch_size, } } - pub(crate) fn start_timer<'c: 'a, 'a>( - &'a self, - op: SmgrQueryType, - ctx: &'c RequestContext, - ) -> Option { - self.start_timer_many(op, 1, ctx) - } - pub(crate) fn start_timer_many<'c: 'a, 'a>( - &'a self, - op: SmgrQueryType, - count: usize, - ctx: &'c RequestContext, - ) -> Option { - let start = Instant::now(); - + pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { self.global_started[op as usize].inc(); - // We subtract time spent throttled from the observed latency. - match ctx.micros_spent_throttled.open() { - Ok(()) => (), - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[op]; - rate_limit.call(|| { - warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); - }); - } - } - let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { self.per_timeline_getpage_started.inc(); - Some(&self.per_timeline_getpage_latency) + Some(self.per_timeline_getpage_latency.clone()) } else { None }; - Some(GlobalAndPerTimelineHistogramTimer { - global_latency_histo: &self.global_latency[op as usize], + SmgrOpTimer { + global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, - ctx, - start, - op, - count, - }) + start: started_at, + } + } + + pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { + self.global_batch_size.observe(batch_size as f64); + self.per_timeline_batch_size.observe(batch_size as f64); } } #[cfg(test)] mod smgr_query_time_tests { + use std::time::Instant; + use pageserver_api::shard::TenantShardId; use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; - use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - }; - // Regression test, we used hard-coded string constants before using an enum. #[test] fn op_label_name() { @@ -1531,8 +1551,7 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); - let timer = metrics.start_timer(*op, &ctx); + let timer = metrics.start_smgr_op(*op, Instant::now()); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); @@ -1579,58 +1598,24 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| } }); -pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> { parent: &'a BasebackupQueryTime, - ctx: &'c RequestContext, start: std::time::Instant, } impl BasebackupQueryTime { - pub(crate) fn start_recording<'c: 'a, 'a>( - &'a self, - ctx: &'c RequestContext, - ) -> BasebackupQueryTimeOngoingRecording<'a, 'a> { + pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> { let start = Instant::now(); - match ctx.micros_spent_throttled.open() { - Ok(()) => (), - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); - }); - } - } BasebackupQueryTimeOngoingRecording { parent: self, - ctx, start, } } } -impl BasebackupQueryTimeOngoingRecording<'_, '_> { +impl BasebackupQueryTimeOngoingRecording<'_> { pub(crate) fn observe(self, res: &Result) { - let elapsed = self.start.elapsed(); - let ex_throttled = self - .ctx - .micros_spent_throttled - .close_and_checked_sub_from(elapsed); - let ex_throttled = match ex_throttled { - Ok(ex_throttled) => ex_throttled, - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); - }); - elapsed - } - }; + let elapsed = self.start.elapsed().as_secs_f64(); // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, @@ -1641,7 +1626,7 @@ impl BasebackupQueryTimeOngoingRecording<'_, '_> { } Err(_) => &self.parent.error, }; - metric.observe(ex_throttled.as_secs_f64()); + metric.observe(elapsed); } } @@ -2722,6 +2707,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } @@ -2747,10 +2737,12 @@ use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; +use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::Timeline; /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { @@ -3562,7 +3554,9 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { .set(u64::try_from(num_threads.get()).unwrap()); } -pub fn preinitialize_metrics() { +pub fn preinitialize_metrics(conf: &'static PageServerConf) { + set_page_service_config_max_batch_size(&conf.page_service_pipelining); + // Python tests need these and on some we do alerting. // // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of @@ -3630,6 +3624,7 @@ pub fn preinitialize_metrics() { &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + &PAGE_SERVICE_BATCH_SIZE_GLOBAL, ] .into_iter() .for_each(|h| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1917e7f5b7..64842aa5b8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -51,7 +51,7 @@ use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self}; +use crate::metrics::{self, SmgrOpTimer}; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -540,11 +540,13 @@ impl From for QueryError { enum BatchedFeMessage { Exists { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamNblocksRequest, }, @@ -552,15 +554,17 @@ enum BatchedFeMessage { span: Span, shard: timeline::handle::Handle, effective_request_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, }, DbSize { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamGetSlruSegmentRequest, }, @@ -632,6 +636,8 @@ impl PageServerHandler { msg = pgb.read_message() => { msg } }; + let received_at = Instant::now(); + let copy_data_bytes = match msg? { Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => { @@ -660,7 +666,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::Exists { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at); + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } } PagestreamFeMessage::Nblocks(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); @@ -668,7 +682,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::Nblocks { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at); + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } } PagestreamFeMessage::DbSize(req) => { let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); @@ -676,7 +698,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::DbSize { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at); + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } } PagestreamFeMessage::GetSlruSegment(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); @@ -684,7 +714,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::GetSlruSegment { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at); + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } } PagestreamFeMessage::GetPage(PagestreamGetPageRequest { request_lsn, @@ -728,6 +766,14 @@ impl PageServerHandler { return respond_error!(e.into()); } }; + + // It's important to start the timer before waiting for the LSN + // so that the _started counters are incremented before we do + // any serious waiting, e.g., for LSNs. + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at); + let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, request_lsn, @@ -747,7 +793,7 @@ impl PageServerHandler { span, shard, effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno)], + pages: smallvec::smallvec![(rel, blkno, timer)], } } }; @@ -832,88 +878,112 @@ impl PageServerHandler { IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { // invoke handler function - let (handler_results, span): (Vec>, _) = - match batch { - BatchedFeMessage::Exists { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![ - self.handle_get_rel_exists_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::Nblocks { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![ - self.handle_get_nblocks_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetPage { + let (handler_results, span): ( + Vec>, + _, + ) = match batch { + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![self + .handle_get_rel_exists_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_get_page_at_lsn_request_batched( - &shard, - effective_request_lsn, - pages, - ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![ - self.handle_db_size_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetSlruSegment { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![ - self.handle_get_slru_segment_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. - (vec![Err(error)], span) - } - }; + ) + } + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![self + .handle_get_nblocks_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &shard, + effective_request_lsn, + pages, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![self + .handle_db_size_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![self + .handle_get_slru_segment_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) + } + }; // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let mut timers: smallvec::SmallVec<[_; 1]> = + smallvec::SmallVec::with_capacity(handler_results.len()); for handler_result in handler_results { let response_msg = match handler_result { Err(e) => match &e { @@ -944,7 +1014,12 @@ impl PageServerHandler { }) } }, - Ok(response_msg) => response_msg, + Ok((response_msg, timer)) => { + // Extending the lifetime of the timers so observations on drop + // include the flush time. + timers.push(timer); + response_msg + } }; // marshal & transmit response message @@ -961,6 +1036,7 @@ impl PageServerHandler { res?; } } + drop(timers); Ok(()) } @@ -1423,10 +1499,6 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1453,10 +1525,6 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1483,10 +1551,6 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1512,26 +1576,41 @@ impl PageServerHandler { &mut self, timeline: &Timeline, effective_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, ctx: &RequestContext, - ) -> Vec> { + ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); - let _timer = timeline.query_metrics.start_timer_many( - metrics::SmgrQueryType::GetPageAtLsn, - pages.len(), - ctx, - ); - let pages = timeline - .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + timeline + .query_metrics + .observe_getpage_batch_start(requests.len()); + + let results = timeline + .get_rel_page_at_lsn_batched( + requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)), + effective_lsn, + ctx, + ) .await; + assert_eq!(results.len(), requests.len()); - Vec::from_iter(pages.into_iter().map(|page| { - page.map(|page| { - PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page }) - }) - .map_err(PageStreamError::from) - })) + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|((_, _, timer), res)| { + res.map(|page| { + ( + PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { + page, + }), + timer, + ) + }) + .map_err(PageStreamError::from) + }), + ) } #[instrument(skip_all, fields(shard_id))] @@ -1541,10 +1620,6 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -2045,7 +2120,7 @@ where COMPUTE_COMMANDS_COUNTERS .for_command(ComputeCommandKind::Basebackup) .inc(); - let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(); let res = async { self.handle_basebackup_request( pgb, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d48a1ba117..a00ec761e2 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -203,9 +203,13 @@ impl Timeline { ) -> Result { match version { Version::Lsn(effective_lsn) => { - let pages = smallvec::smallvec![(tag, blknum)]; + let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self - .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + .get_rel_page_at_lsn_batched( + pages.iter().map(|(tag, blknum)| (tag, blknum)), + effective_lsn, + ctx, + ) .await; assert_eq!(res.len(), 1); res.into_iter().next().unwrap() @@ -240,7 +244,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the ordering of `pages`. pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + pages: impl ExactSizeIterator, effective_lsn: Lsn, ctx: &RequestContext, ) -> Vec> { @@ -254,7 +258,7 @@ impl Timeline { let result_slots = result.spare_capacity_mut(); let mut keys_slots: BTreeMap> = BTreeMap::default(); - for (response_slot_idx, (tag, blknum)) in pages.into_iter().enumerate() { + for (response_slot_idx, (tag, blknum)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -265,7 +269,7 @@ impl Timeline { } let nblocks = match self - .get_rel_size(tag, Version::Lsn(effective_lsn), ctx) + .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx) .await { Ok(nblocks) => nblocks, @@ -276,7 +280,7 @@ impl Timeline { } }; - if blknum >= nblocks { + if *blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, effective_lsn, nblocks @@ -286,7 +290,7 @@ impl Timeline { continue; } - let key = rel_block_to_key(tag, blknum); + let key = rel_block_to_key(*tag, *blknum); let key_slots = keys_slots.entry(key).or_default(); key_slots.push(response_slot_idx); diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 6a80953901..7c4de55a47 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -2,14 +2,14 @@ use std::{ str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, - Arc, Mutex, + Arc, }, time::{Duration, Instant}, }; use arc_swap::ArcSwap; use enumset::EnumSet; -use tracing::{error, warn}; +use tracing::error; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -162,19 +162,6 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); - match ctx.micros_spent_throttled.add(wait_time) { - Ok(res) => res, - Err(error) => { - use once_cell::sync::Lazy; - use utils::rate_limit::RateLimit; - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.call(move || { - warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); - }); - } - } Some(wait_time) } else { None diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 730477a7f4..dc3f823f20 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1059,7 +1059,8 @@ impl Timeline { .map(|metric| (metric, Instant::now())); // start counting after throttle so that throttle time - // is always less than observation time + // is always less than observation time and we don't + // underflow when computing `ex_throttled` below. let throttled = self .timeline_get_throttle .throttle(ctx, key_count as usize) @@ -1138,7 +1139,9 @@ impl Timeline { .map(ScanLatencyOngoingRecording::start_recording); // start counting after throttle so that throttle time - // is always less than observation time + // is always less than observation time and we don't + // underflow when computing the `ex_throttled` value in + // `recording.observe(throttled)` below. let throttled = self .timeline_get_throttle // assume scan = 1 quota for now until we find a better way to process this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 3f90c233a6..ffdbd988a5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -173,6 +173,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), counter("pageserver_tenant_throttling_count"), + *histogram("pageserver_page_service_batch_size"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index c47a849fec..562094a059 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -167,18 +167,18 @@ def test_throughput( @dataclass class Metrics: time: float - pageserver_getpage_count: float - pageserver_vectored_get_count: float + pageserver_batch_size_histo_sum: float + pageserver_batch_size_histo_count: float compute_getpage_count: float pageserver_cpu_seconds_total: float def __sub__(self, other: "Metrics") -> "Metrics": return Metrics( time=self.time - other.time, - pageserver_getpage_count=self.pageserver_getpage_count - - other.pageserver_getpage_count, - pageserver_vectored_get_count=self.pageserver_vectored_get_count - - other.pageserver_vectored_get_count, + pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum + - other.pageserver_batch_size_histo_sum, + pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count + - other.pageserver_batch_size_histo_count, compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total - other.pageserver_cpu_seconds_total, @@ -187,8 +187,8 @@ def test_throughput( def normalize(self, by) -> "Metrics": return Metrics( time=self.time / by, - pageserver_getpage_count=self.pageserver_getpage_count / by, - pageserver_vectored_get_count=self.pageserver_vectored_get_count / by, + pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by, + pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by, compute_getpage_count=self.compute_getpage_count / by, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by, ) @@ -202,11 +202,11 @@ def test_throughput( pageserver_metrics = ps_http.get_metrics() return Metrics( time=time.time(), - pageserver_getpage_count=pageserver_metrics.query_one( - "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"} + pageserver_batch_size_histo_sum=pageserver_metrics.query_one( + "pageserver_page_service_batch_size_sum" ).value, - pageserver_vectored_get_count=pageserver_metrics.query_one( - "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"} + pageserver_batch_size_histo_count=pageserver_metrics.query_one( + "pageserver_page_service_batch_size_count" ).value, compute_getpage_count=compute_getpage_count, pageserver_cpu_seconds_total=pageserver_metrics.query_one( @@ -243,7 +243,7 @@ def test_throughput( # Sanity-checks on the collected data # # assert that getpage counts roughly match between compute and ps - assert metrics.pageserver_getpage_count == pytest.approx( + assert metrics.pageserver_batch_size_histo_sum == pytest.approx( metrics.compute_getpage_count, rel=0.01 ) @@ -256,7 +256,7 @@ def test_throughput( zenbenchmark.record( "perfmetric.batching_factor", - metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count, + metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count, unit="", report=MetricReport.HIGHER_IS_BETTER, ) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index ba6a1d9045..62aec50a9e 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -4,6 +4,7 @@ import copy import json import uuid +import pytest from anyio import Path from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log @@ -70,14 +71,21 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P log.info("warmup / make sure metrics are present") run_pagebench_at_max_speed_and_get_total_requests_completed(2) - metrics_query = { + smgr_metrics_query = { "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "smgr_query_type": "get_page_at_lsn", } - metric_name = "pageserver_smgr_query_seconds_sum" - smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + smgr_metric_name = "pageserver_smgr_query_seconds_sum" + throttle_metrics_query = { + "tenant_id": str(tenant_id), + } + throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total" + + smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_pre is not None + throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) + assert throttled_usecs_pre is not None marker = uuid.uuid4().hex ps_http.post_tracing_event("info", marker) @@ -108,14 +116,23 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P timeout=compaction_period, ) - log.info("validate that the metric doesn't include throttle wait time") - smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + log.info("the smgr metric includes throttle time") + smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None + throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) + assert throttled_usecs_post is not None actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre + actual_throttled_secs = actual_throttled_usecs / 1_000_000 assert ( - duration_secs >= 10 * actual_smgr_query_seconds - ), "smgr metrics should not include throttle wait time" + pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds + ), "smgr metrics include throttle wait time" + smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs + assert smgr_ex_throttle > 0 + assert ( + duration_secs > 10 * smgr_ex_throttle + ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" throttle_config_with_field_fair_set = { From 85b12ddd528c827a72f0928a0689edc44e377a06 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:25:29 +0100 Subject: [PATCH 36/65] Add support for the extensions test for Postgres v17 (#9748) ## Problem The extensions for Postgres v17 are ready but we do not test the extensions shipped with v17 ## Summary of changes Build the test image based on Postgres v17. Run the tests for v17. --------- Co-authored-by: Anastasia Lubennikova --- .github/workflows/build_and_test.yml | 15 +- compute/compute-node.Dockerfile | 42 ++--- ...hint_plan.patch => pg_hint_plan_v16.patch} | 0 compute/patches/pg_hint_plan_v17.patch | 174 ++++++++++++++++++ docker-compose/compute_wrapper/Dockerfile | 6 +- docker-compose/docker_compose_test.sh | 27 ++- 6 files changed, 219 insertions(+), 45 deletions(-) rename compute/patches/{pg_hint_plan.patch => pg_hint_plan_v16.patch} (100%) create mode 100644 compute/patches/pg_hint_plan_v17.patch diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9830c2a0c9..e9e111e7bd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -669,7 +669,7 @@ jobs: neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image - if: matrix.version.pg == 'v16' + if: matrix.version.pg >= 'v16' uses: docker/build-push-action@v6 with: context: . @@ -684,8 +684,7 @@ jobs: pull: true file: compute/compute-node.Dockerfile target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -708,7 +707,7 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -744,7 +743,7 @@ jobs: neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version.pg == 'v16' + if: matrix.version.pg >= 'v16' run: | docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ @@ -833,6 +832,7 @@ jobs: fail-fast: false matrix: arch: [ x64, arm64 ] + pg_version: [v16, v17] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} @@ -871,7 +871,10 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 20 - run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + env: + TAG: ${{needs.tag.outputs.build-tag}} + TEST_VERSION_ONLY: ${{ matrix.pg_version }} + run: ./docker-compose/docker_compose_test.sh - name: Print logs and clean up if: always() diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 222a0cb88b..bf6311bf2b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1367,15 +1367,12 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - mkdir /ext-src +RUN mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr COPY --from=plv8-build /plv8.tar.gz /ext-src/ -COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ +#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.patch /ext-src/ @@ -1395,7 +1392,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src -COPY compute/patches/pg_hint_plan.patch /ext-src +COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY compute/patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src @@ -1405,38 +1402,23 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src +#pg_anon is not supported yet for pg v17 so, don't fail if nothing found +COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src COPY compute/patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/ && for f in *.tar.gz; \ +RUN cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/rum-src && patch -p1 <../rum.patch -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - patch -p1 $SPEC_PATH/spec.json + fi PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " @@ -54,8 +61,7 @@ for pg_version in 14 15 16; do fi done - if [ $pg_version -ge 16 ] - then + if [ $pg_version -ge 16 ]; then echo Enabling trust connection docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " echo Adding postgres role @@ -68,10 +74,13 @@ for pg_version in 14 15 16; do # The test assumes that it is running on the same host with the postgres engine. # In our case it's not true, that's why we are copying files to the compute node TMPDIR=$(mktemp -d) - docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data - echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv - docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data + # Add support for pg_anon for pg_v16 + if [ $pg_version -ne 17 ]; then + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data + echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data rm -rf $TMPDIR + fi TMPDIR=$(mktemp -d) # The following block does the same for the pg_hintplan test docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data @@ -97,4 +106,8 @@ for pg_version in 14 15 16; do fi fi cleanup + # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option + if [ $pg_version -eq 17 ]; then + mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json + fi done From 0a2a84b766e78ae7288ac3b2624102776a28eea0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 12:35:59 +0100 Subject: [PATCH 37/65] safekeeper,pageserver: add heap profiling (#9778) ## Problem We don't have good observability for memory usage. This would be useful e.g. to debug OOM incidents or optimize performance or resource usage. We would also like to use continuous profiling with e.g. [Grafana Cloud Profiles](https://grafana.com/products/cloud/profiles-for-continuous-profiling/) (see https://github.com/neondatabase/cloud/issues/14888). This PR is intended as a proof of concept, to try it out in staging and drive further discussions about profiling more broadly. Touches https://github.com/neondatabase/neon/issues/9534. Touches https://github.com/neondatabase/cloud/issues/14888. Depends on #9779. Depends on #9780. ## Summary of changes Adds a HTTP route `/profile/heap` that takes a heap profile and returns it. Query parameters: * `format`: output format (`jemalloc` or `pprof`; default `pprof`). Unlike CPU profiles (see #9764), heap profiles are not symbolized and require the original binary to translate addresses to function names. To make this work with Grafana, we'll probably have to symbolize the process server-side -- this is left as future work, as is other output formats like SVG. Heap profiles don't work on macOS due to limitations in jemalloc. --- Cargo.lock | 89 ++++++++++++++++++++++++------- Cargo.toml | 3 +- libs/utils/Cargo.toml | 1 + libs/utils/src/http/endpoint.rs | 64 ++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 5 ++ pageserver/src/http/routes.rs | 8 +-- safekeeper/benches/receive_wal.rs | 6 +++ safekeeper/src/bin/safekeeper.rs | 5 ++ safekeeper/src/http/routes.rs | 7 ++- workspace_hack/Cargo.toml | 15 ++++-- 10 files changed, 175 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba02e3b11d..b2769e59f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -301,7 +301,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "http 0.2.9", "hyper 0.14.30", @@ -341,7 +341,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "http 0.2.9", "http-body 0.4.5", "once_cell", @@ -417,7 +417,7 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "hmac", "http 0.2.9", @@ -621,7 +621,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", @@ -2054,9 +2054,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "ff" @@ -2912,6 +2912,23 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jemalloc_pprof" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +dependencies = [ + "anyhow", + "libc", + "mappings", + "once_cell", + "pprof_util", + "tempfile", + "tikv-jemalloc-ctl", + "tokio", + "tracing", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3022,9 +3039,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "libloading" @@ -3044,9 +3061,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "linux-raw-sys" @@ -3079,6 +3096,19 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "mappings" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +dependencies = [ + "anyhow", + "libc", + "once_cell", + "pprof_util", + "tracing", +] + [[package]] name = "matchers" version = "0.1.0" @@ -3346,6 +3376,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" dependencies = [ + "num-bigint", "num-complex", "num-integer", "num-iter", @@ -3434,6 +3465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", + "num-bigint", "num-integer", "num-traits", ] @@ -3497,9 +3529,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "oorandom" @@ -4298,6 +4330,19 @@ dependencies = [ "thiserror", ] +[[package]] +name = "pprof_util" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +dependencies = [ + "anyhow", + "flate2", + "num", + "paste", + "prost", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -5220,14 +5265,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys 0.4.13", + "linux-raw-sys 0.4.14", "windows-sys 0.52.0", ] @@ -6251,13 +6296,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", - "fastrand 2.0.0", - "redox_syscall 0.4.1", + "fastrand 2.2.0", + "once_cell", "rustix", "windows-sys 0.52.0", ] @@ -7058,6 +7103,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.30", + "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -7644,8 +7690,12 @@ dependencies = [ "memchr", "nix 0.26.4", "nom", + "num", "num-bigint", + "num-complex", "num-integer", + "num-iter", + "num-rational", "num-traits", "once_cell", "parquet", @@ -7669,6 +7719,7 @@ dependencies = [ "subtle", "syn 2.0.90", "sync_wrapper 0.1.2", + "tikv-jemalloc-ctl", "tikv-jemalloc-sys", "time", "time-macros", diff --git a/Cargo.toml b/Cargo.toml index 036dc01057..91fa6a2607 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -115,6 +115,7 @@ indoc = "2" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" +jemalloc_pprof = "0.6" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" @@ -175,7 +176,7 @@ sync_wrapper = "0.1.2" tar = "0.4" test-context = "0.3" thiserror = "1.0" -tikv-jemallocator = { version = "0.6", features = ["stats"] } +tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 5648072a83..66500fb141 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -26,6 +26,7 @@ humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} +jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 6a85f0ddeb..d975b63677 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -10,6 +10,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; @@ -407,6 +408,69 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A } } +/// Generates heap profiles. +/// +/// This only works with jemalloc on Linux. +pub async fn profile_heap_handler(req: Request) -> Result, ApiError> { + enum Format { + Jemalloc, + Pprof, + } + + // Parameters. + let format = match get_query_param(&req, "format")?.as_deref() { + None => Format::Pprof, + Some("jemalloc") => Format::Jemalloc, + Some("pprof") => Format::Pprof, + Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), + }; + + // Obtain profiler handle. + let mut prof_ctl = jemalloc_pprof::PROF_CTL + .as_ref() + .ok_or(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + )))? + .lock() + .await; + if !prof_ctl.activated() { + return Err(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + ))); + } + + // Take and return the profile. + match format { + Format::Jemalloc => { + // NB: file is an open handle to a tempfile that's already deleted. + let file = tokio::task::spawn_blocking(move || prof_ctl.dump()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + let stream = ReaderStream::new(tokio::fs::File::from_std(file)); + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.dump\"") + .body(Body::wrap_stream(stream)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + + Format::Pprof => { + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .body(Body::from(data)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + } +} + pub fn add_request_id_middleware( ) -> Middleware { Middleware::pre(move |req| async move { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 31f4370855..8fe225c6aa 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,6 +53,11 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ceb1c3b012..e127871549 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -56,9 +56,9 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::profile_cpu_handler; -use utils::http::endpoint::prometheus_metrics_handler; -use utils::http::endpoint::request_span; +use utils::http::endpoint::{ + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, +}; use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -155,6 +155,7 @@ impl State { "/swagger.yml", "/metrics", "/profile/cpu", + "/profile/heap", ]; Ok(Self { conf, @@ -3203,6 +3204,7 @@ pub fn make_router( .data(state) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 8c4281cf52..313d945b94 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -24,9 +24,15 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; +/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. +/// This mirrors the configuration in bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + // Register benchmarks with Criterion. criterion_group!( name = benches; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3659bcd7e0..4dc7edef37 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,6 +51,11 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 28294abdb9..69b775fd76 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -14,7 +14,8 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{ - profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter, + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + ChannelWriter, }; use utils::http::request::parse_query_param; @@ -573,7 +574,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder let mut router = endpoint::make_router(); if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { - const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"]; + const ALLOWLIST_ROUTES: &[&str] = + &["/v1/status", "/metrics", "/profile/cpu", "profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { @@ -594,6 +596,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(auth) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index c0a3abc377..d19379aefd 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -55,12 +55,16 @@ log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nix = { version = "0.26" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } @@ -76,7 +80,8 @@ smallvec = { version = "1", default-features = false, features = ["const_new", " spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } -tikv-jemalloc-sys = { version = "0.6", features = ["stats"] } +tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] } +tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } @@ -111,14 +116,18 @@ libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] } proc-macro2 = { version = "1" } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } From 907e4aa3c4d670960ca17429f4b83aa04ff8ff45 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 15:33:31 +0100 Subject: [PATCH 38/65] test_runner: use immediate shutdown in `test_sharded_ingest` (#9984) ## Problem `test_sharded_ingest` ingests a lot of data, which can cause shutdown to be slow e.g. due to local "S3 uploads" or compactions. This can cause test flakes during teardown. Resolves #9740. ## Summary of changes Perform an immediate shutdown of the cluster. --- test_runner/performance/test_sharded_ingest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 4c21e799c8..94fd54bade 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -90,6 +90,7 @@ def test_sharded_ingest( # Start the endpoint. endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + # Ingest data and measure WAL volume and duration. with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -104,6 +105,8 @@ def test_sharded_ingest( wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Record metrics. wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) @@ -152,3 +155,7 @@ def test_sharded_ingest( log.info(f"WAL ingested by each pageserver {ingested_by_ps}") assert tenant_get_shards(env, tenant_id) == shards, "shards moved" + + # The pageservers can take a long time to shut down gracefully, presumably due to the upload + # queue or compactions or something. Just stop them immediately, we don't care. + env.stop(immediate=True) From 63cb8ce975c26121bf9f36601649096a775fbbbd Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 16:25:58 +0100 Subject: [PATCH 39/65] pageserver: only throttle pagestream requests & bring back throttling deduction for smgr latency metrics (#9962) ## Problem In the batching PR - https://github.com/neondatabase/neon/pull/9870 I stopped deducting the time-spent-in-throttle fro latency metrics, i.e., - smgr latency metrics (`SmgrOpTimer`) - basebackup latency (+scan latency, which I think is part of basebackup). The reason for stopping the deduction was that with the introduction of batching, the trick with tracking time-spent-in-throttle inside RequestContext and swap-replacing it from the `impl Drop for SmgrOpTimer` no longer worked with >1 requests in a batch. However, deducting time-spent-in-throttle is desirable because our internal latency SLO definition does not account for throttling. ## Summary of changes - Redefine throttling to be a page_service pagestream request throttle instead of a throttle for repository `Key` reads through `Timeline::get` / `Timeline::get_vectored`. - This means reads done by `basebackup` are no longer subject to any throttle. - The throttle applies after batching, before handling of the request. - Drive-by fix: make throttle sensitive to cancellation. - Rename metric label `kind` from `timeline_get` to `pagestream` to reflect the new scope of throttling. To avoid config format breakage, we leave the config field named `timeline_get_throttle` and ignore the `task_kinds` field. This will be cleaned up in a future PR. ## Trade-Offs Ideally, we would apply the throttle before reading a request off the connection, so that we queue the minimal amount of work inside the process. However, that's not possible because we need to do shard routing. The redefinition of the throttle to limit pagestream request rate instead of repository `Key` rate comes with several downsides: - We're no longer able to use the throttle mechanism for other other tasks, e.g. image layer creation. However, in practice, we never used that capability anyways. - We no longer throttle basebackup. --- libs/pageserver_api/src/models.rs | 58 ++++++++++- pageserver/src/metrics.rs | 95 ++++++++++++------- pageserver/src/page_service.rs | 45 ++++++++- pageserver/src/tenant.rs | 20 ++-- pageserver/src/tenant/tasks.rs | 6 +- pageserver/src/tenant/throttle.rs | 33 ++----- pageserver/src/tenant/timeline.rs | 54 ++--------- pageserver/src/tenant/timeline/delete.rs | 2 +- .../test_pageserver_getpage_throttle.py | 16 ++-- 9 files changed, 198 insertions(+), 131 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 42c5d10c05..5488f7b2c2 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -501,7 +501,9 @@ pub struct EvictionPolicyLayerAccessThreshold { #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct ThrottleConfig { - pub task_kinds: Vec, // TaskKind + /// See [`ThrottleConfigTaskKinds`] for why we do the serde `rename`. + #[serde(rename = "task_kinds")] + pub enabled: ThrottleConfigTaskKinds, pub initial: u32, #[serde(with = "humantime_serde")] pub refill_interval: Duration, @@ -509,10 +511,38 @@ pub struct ThrottleConfig { pub max: u32, } +/// Before +/// the throttle was a per `Timeline::get`/`Timeline::get_vectored` call. +/// The `task_kinds` field controlled which Pageserver "Task Kind"s +/// were subject to the throttle. +/// +/// After that PR, the throttle is applied at pagestream request level +/// and the `task_kinds` field does not apply since the only task kind +/// that us subject to the throttle is that of the page service. +/// +/// However, we don't want to make a breaking config change right now +/// because it means we have to migrate all the tenant configs. +/// This will be done in a future PR. +/// +/// In the meantime, we use emptiness / non-emptsiness of the `task_kinds` +/// field to determine if the throttle is enabled or not. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(transparent)] +pub struct ThrottleConfigTaskKinds(Vec); + +impl ThrottleConfigTaskKinds { + pub fn disabled() -> Self { + Self(vec![]) + } + pub fn is_enabled(&self) -> bool { + !self.0.is_empty() + } +} + impl ThrottleConfig { pub fn disabled() -> Self { Self { - task_kinds: vec![], // effectively disables the throttle + enabled: ThrottleConfigTaskKinds::disabled(), // other values don't matter with emtpy `task_kinds`. initial: 0, refill_interval: Duration::from_millis(1), @@ -526,6 +556,30 @@ impl ThrottleConfig { } } +#[cfg(test)] +mod throttle_config_tests { + use super::*; + + #[test] + fn test_disabled_is_disabled() { + let config = ThrottleConfig::disabled(); + assert!(!config.enabled.is_enabled()); + } + #[test] + fn test_enabled_backwards_compat() { + let input = serde_json::json!({ + "task_kinds": ["PageRequestHandler"], + "initial": 40000, + "refill_interval": "50ms", + "refill_amount": 1000, + "max": 40000, + "fair": true + }); + let config: ThrottleConfig = serde_json::from_value(input).unwrap(); + assert!(config.enabled.is_enabled()); + } +} + /// A flattened analog of a `pagesever::tenant::LocationMode`, which /// lists out all possible states (and the virtual "Detached" state) /// in a flat form rather than using rust-style enums. diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d04fae7627..998c15ccaf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -217,31 +217,16 @@ impl<'a> ScanLatencyOngoingRecording<'a> { ScanLatencyOngoingRecording { parent, start } } - pub(crate) fn observe(self, throttled: Option) { + pub(crate) fn observe(self) { let elapsed = self.start.elapsed(); - let ex_throttled = if let Some(throttled) = throttled { - elapsed.checked_sub(throttled) - } else { - Some(elapsed) - }; - if let Some(ex_throttled) = ex_throttled { - self.parent.observe(ex_throttled.as_secs_f64()); - } else { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!("error deducting time spent throttled; this message is logged at a global rate limit"); - }); - } + self.parent.observe(elapsed.as_secs_f64()); } } pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", - "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", + "Time spent in get_vectored.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -264,7 +249,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_scan_seconds", - "Time spent in scan, excluding time spent in timeline_get_throttle.", + "Time spent in scan.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -1227,11 +1212,44 @@ pub(crate) struct SmgrOpTimer { per_timeline_latency_histo: Option, start: Instant, + throttled: Duration, + op: SmgrQueryType, +} + +impl SmgrOpTimer { + pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { + let Some(throttle) = throttle else { + return; + }; + self.throttled += *throttle; + } } impl Drop for SmgrOpTimer { fn drop(&mut self) { - let elapsed = self.start.elapsed().as_secs_f64(); + let elapsed = self.start.elapsed(); + + let elapsed = match elapsed.checked_sub(self.throttled) { + Some(elapsed) => elapsed, + None => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); + }); + elapsed // un-throttled time, more info than just saturating to 0 + } + }; + + let elapsed = elapsed.as_secs_f64(); + self.global_latency_histo.observe(elapsed); if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { per_timeline_getpage_histo.observe(elapsed); @@ -1491,6 +1509,8 @@ impl SmgrQueryTimePerTimeline { global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, start: started_at, + op, + throttled: Duration::ZERO, } } @@ -3299,7 +3319,7 @@ pub(crate) mod tenant_throttling { use once_cell::sync::Lazy; use utils::shard::TenantShardId; - use crate::tenant::{self, throttle::Metric}; + use crate::tenant::{self}; struct GlobalAndPerTenantIntCounter { global: IntCounter, @@ -3318,7 +3338,7 @@ pub(crate) mod tenant_throttling { } } - pub(crate) struct TimelineGet { + pub(crate) struct Metrics { count_accounted_start: GlobalAndPerTenantIntCounter, count_accounted_finish: GlobalAndPerTenantIntCounter, wait_time: GlobalAndPerTenantIntCounter, @@ -3391,40 +3411,41 @@ pub(crate) mod tenant_throttling { .unwrap() }); - const KIND: &str = "timeline_get"; + const KINDS: &[&str] = &["pagestream"]; + pub type Pagestream = Metrics<0>; - impl TimelineGet { + impl Metrics { pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { let per_tenant_label_values = &[ - KIND, + KINDS[KIND], &tenant_shard_id.tenant_id.to_string(), &tenant_shard_id.shard_slug().to_string(), ]; - TimelineGet { + Metrics { count_accounted_start: { GlobalAndPerTenantIntCounter { - global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]), + global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_START_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_accounted_finish: { GlobalAndPerTenantIntCounter { - global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]), + global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT .with_label_values(per_tenant_label_values), } }, wait_time: { GlobalAndPerTenantIntCounter { - global: WAIT_USECS.with_label_values(&[KIND]), + global: WAIT_USECS.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_USECS_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_throttled: { GlobalAndPerTenantIntCounter { - global: WAIT_COUNT.with_label_values(&[KIND]), + global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_COUNT_PER_TENANT .with_label_values(per_tenant_label_values), } @@ -3447,15 +3468,17 @@ pub(crate) mod tenant_throttling { &WAIT_USECS_PER_TENANT, &WAIT_COUNT_PER_TENANT, ] { - let _ = m.remove_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]); + for kind in KINDS { + let _ = m.remove_label_values(&[ + kind, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]); + } } } - impl Metric for TimelineGet { + impl tenant::throttle::Metric for Metrics { #[inline(always)] fn accounting_start(&self) { self.count_accounted_start.inc(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 64842aa5b8..7026df9527 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -574,6 +574,41 @@ enum BatchedFeMessage { }, } +impl BatchedFeMessage { + async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> { + let (shard, tokens, timers) = match self { + BatchedFeMessage::Exists { shard, timer, .. } + | BatchedFeMessage::Nblocks { shard, timer, .. } + | BatchedFeMessage::DbSize { shard, timer, .. } + | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => { + ( + shard, + // 1 token is probably under-estimating because these + // request handlers typically do several Timeline::get calls. + 1, + itertools::Either::Left(std::iter::once(timer)), + ) + } + BatchedFeMessage::GetPage { shard, pages, .. } => ( + shard, + pages.len(), + itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)), + ), + BatchedFeMessage::RespondError { .. } => return Ok(()), + }; + let throttled = tokio::select! { + throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } + _ = cancel.cancelled() => { + return Err(QueryError::Shutdown); + } + }; + for timer in timers { + timer.deduct_throttle(&throttled); + } + Ok(()) + } +} + impl PageServerHandler { pub fn new( tenant_manager: Arc, @@ -1157,13 +1192,18 @@ impl PageServerHandler { Ok(msg) => msg, Err(e) => break e, }; - let msg = match msg { + let mut msg = match msg { Some(msg) => msg, None => { debug!("pagestream subprotocol end observed"); return ((pgb_reader, timeline_handles), Ok(())); } }; + + if let Err(cancelled) = msg.throttle(&self.cancel).await { + break cancelled; + } + let err = self .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) .await; @@ -1321,12 +1361,13 @@ impl PageServerHandler { return Ok(()); } }; - let batch = match batch { + let mut batch = match batch { Ok(batch) => batch, Err(e) => { return Err(e); } }; + batch.throttle(&self.cancel).await?; self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) .await?; } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cd0690bb1a..ada5c4a977 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -357,8 +357,8 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. - pub(crate) timeline_get_throttle: - Arc>, + pub(crate) pagestream_throttle: + Arc>, /// An ongoing timeline detach concurrency limiter. /// @@ -1678,7 +1678,7 @@ impl Tenant { remote_metadata, TimelineResources { remote_client, - timeline_get_throttle: self.timeline_get_throttle.clone(), + pagestream_throttle: self.pagestream_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), }, LoadTimelineCause::Attach, @@ -3835,7 +3835,7 @@ impl Tenant { } } - fn get_timeline_get_throttle_config( + fn get_pagestream_throttle_config( psconf: &'static PageServerConf, overrides: &TenantConfOpt, ) -> throttle::Config { @@ -3846,8 +3846,8 @@ impl Tenant { } pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { - let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); - self.timeline_get_throttle.reconfigure(conf) + let conf = Self::get_pagestream_throttle_config(self.conf, new_conf); + self.pagestream_throttle.reconfigure(conf) } /// Helper function to create a new Timeline struct. @@ -4009,9 +4009,9 @@ impl Tenant { attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), cancel: CancellationToken::default(), gate: Gate::default(), - timeline_get_throttle: Arc::new(throttle::Throttle::new( - Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), - crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id), + pagestream_throttle: Arc::new(throttle::Throttle::new( + Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), + crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id), )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), @@ -4909,7 +4909,7 @@ impl Tenant { fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { TimelineResources { remote_client: self.build_timeline_remote_client(timeline_id), - timeline_get_throttle: self.timeline_get_throttle.clone(), + pagestream_throttle: self.pagestream_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 16dac10dca..0118a5ce5f 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -471,14 +471,14 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. // Or just spawn another background loop for this throttle, it's not like it's super costly. - info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { let now = Instant::now(); let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats(); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); if count_throttled == 0 { return; } - let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let allowed_rps = tenant.pagestream_throttle.steady_rps(); let delta = now - prev; info!( n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 7c4de55a47..54c0e59daa 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,5 +1,4 @@ use std::{ - str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, Arc, @@ -8,12 +7,8 @@ use std::{ }; use arc_swap::ArcSwap; -use enumset::EnumSet; -use tracing::error; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; -use crate::{context::RequestContext, task_mgr::TaskKind}; - /// Throttle for `async` functions. /// /// Runtime reconfigurable. @@ -35,7 +30,7 @@ pub struct Throttle { } pub struct Inner { - task_kinds: EnumSet, + enabled: bool, rate_limiter: Arc, } @@ -79,26 +74,12 @@ where } fn new_inner(config: Config) -> Inner { let Config { - task_kinds, + enabled, initial, refill_interval, refill_amount, max, } = config; - let task_kinds: EnumSet = task_kinds - .iter() - .filter_map(|s| match TaskKind::from_str(s) { - Ok(v) => Some(v), - Err(e) => { - // TODO: avoid this failure mode - error!( - "cannot parse task kind, ignoring for rate limiting {}", - utils::error::report_compact_sources(&e) - ); - None - } - }) - .collect(); // steady rate, we expect `refill_amount` requests per `refill_interval`. // dividing gives us the rps. @@ -112,7 +93,7 @@ where let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens)); Inner { - task_kinds, + enabled: enabled.is_enabled(), rate_limiter: Arc::new(rate_limiter), } } @@ -141,11 +122,13 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { + pub async fn throttle(&self, key_count: usize) -> Option { let inner = self.inner.load_full(); // clones the `Inner` Arc - if !inner.task_kinds.contains(ctx.task_kind()) { + + if !inner.enabled { return None; - }; + } + let start = std::time::Instant::now(); self.metric.accounting_start(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index dc3f823f20..1414bef0a5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -208,8 +208,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub timeline_get_throttle: - Arc>, + pub pagestream_throttle: + Arc>, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -411,9 +411,9 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, - /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. - timeline_get_throttle: - Arc>, + /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. + pub(crate) pagestream_throttle: + Arc>, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, @@ -949,7 +949,7 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// - /// This method enforces [`Self::timeline_get_throttle`] internally. + /// This method enforces [`Self::pagestream_throttle`] internally. /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what @@ -977,8 +977,6 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - self.timeline_get_throttle.throttle(ctx, 1).await; - let keyspace = KeySpace { ranges: vec![key..key.next()], }; @@ -1058,14 +1056,6 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(|metric| (metric, Instant::now())); - // start counting after throttle so that throttle time - // is always less than observation time and we don't - // underflow when computing `ex_throttled` below. - let throttled = self - .timeline_get_throttle - .throttle(ctx, key_count as usize) - .await; - let res = self .get_vectored_impl( keyspace.clone(), @@ -1077,23 +1067,7 @@ impl Timeline { if let Some((metric, start)) = start { let elapsed = start.elapsed(); - let ex_throttled = if let Some(throttled) = throttled { - elapsed.checked_sub(throttled) - } else { - Some(elapsed) - }; - - if let Some(ex_throttled) = ex_throttled { - metric.observe(ex_throttled.as_secs_f64()); - } else { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!("error deducting time spent throttled; this message is logged at a global rate limit"); - }); - } + metric.observe(elapsed.as_secs_f64()); } res @@ -1138,16 +1112,6 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(ScanLatencyOngoingRecording::start_recording); - // start counting after throttle so that throttle time - // is always less than observation time and we don't - // underflow when computing the `ex_throttled` value in - // `recording.observe(throttled)` below. - let throttled = self - .timeline_get_throttle - // assume scan = 1 quota for now until we find a better way to process this - .throttle(ctx, 1) - .await; - let vectored_res = self .get_vectored_impl( keyspace.clone(), @@ -1158,7 +1122,7 @@ impl Timeline { .await; if let Some(recording) = start { - recording.observe(throttled); + recording.observe(); } vectored_res @@ -2374,7 +2338,7 @@ impl Timeline { standby_horizon: AtomicLsn::new(0), - timeline_get_throttle: resources.timeline_get_throttle, + pagestream_throttle: resources.pagestream_throttle, aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 67fc710c44..47a93b19d2 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -298,7 +298,7 @@ impl DeleteTimelineFlow { None, // Ancestor is not needed for deletion. TimelineResources { remote_client, - timeline_get_throttle: tenant.timeline_get_throttle.clone(), + pagestream_throttle: tenant.pagestream_throttle.clone(), l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 62aec50a9e..6d0661f068 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -33,7 +33,9 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P conf={ "compaction_period": f"{compaction_period}s", "timeline_get_throttle": { - "task_kinds": ["PageRequestHandler"], + "task_kinds": [ + "PageRequestHandler" + ], # any non-empty array will do here https://github.com/neondatabase/neon/pull/9962 "initial": 0, "refill_interval": "100ms", "refill_amount": int(rate_limit_rps / 10), @@ -116,7 +118,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P timeout=compaction_period, ) - log.info("the smgr metric includes throttle time") smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) @@ -125,13 +126,14 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre actual_throttled_secs = actual_throttled_usecs / 1_000_000 + log.info("validate that the metric doesn't include throttle wait time") assert ( - pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds - ), "smgr metrics include throttle wait time" - smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs - assert smgr_ex_throttle > 0 + duration_secs >= 10 * actual_smgr_query_seconds + ), "smgr metrics should not include throttle wait time" + + log.info("validate that the throttling wait time metrics is correct") assert ( - duration_secs > 10 * smgr_ex_throttle + pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" From 69c0d61c5c319a4f2a8fb0d3613af5020ca03c43 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 16:55:00 +0000 Subject: [PATCH 40/65] storcon: in shard splits, inherit parent's AZ (#9946) ## Problem Sharded tenants should be run in a single AZ for best performance, so that computes have AZ-local latency to all the shards. Part of https://github.com/neondatabase/neon/issues/8264 ## Summary of changes - When we split a tenant, instead of updating each shard's preferred AZ to wherever it is scheduled, propagate the preferred AZ from the parent. - Drop the check in `test_shard_preferred_azs` that asserts shards end up in their preferred AZ: this will not be true again until the optimize_attachment logic is updated to make this so. The existing check wasn't testing anything about scheduling, it was just asserting that we set preferred AZ in a way that matches the way things happen to be scheduled at time of split. --- storage_controller/src/service.rs | 68 ++++++------------- .../regress/test_storage_controller.py | 6 +- 2 files changed, 24 insertions(+), 50 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 52c9c4710d..741d3dc2b4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -44,12 +44,12 @@ use futures::{stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use pageserver_api::{ controller_api::{ - MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, + TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, + TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, @@ -468,6 +468,7 @@ struct ShardSplitParams { policy: PlacementPolicy, config: TenantConfig, shard_ident: ShardIdentity, + preferred_az_id: Option, } // When preparing for a shard split, we may either choose to proceed with the split, @@ -4103,7 +4104,7 @@ impl Service { for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); - let (pageserver, generation, policy, parent_ident, config) = { + let (pageserver, generation, policy, parent_ident, config, preferred_az) = { let mut old_state = tenants .remove(&parent_id) .expect("It was present, we just split it"); @@ -4122,6 +4123,7 @@ impl Service { old_state.policy.clone(), old_state.shard, old_state.config.clone(), + old_state.preferred_az().cloned(), ) }; @@ -4154,6 +4156,9 @@ impl Service { }; child_state.generation = Some(generation); child_state.config = config.clone(); + if let Some(preferred_az) = &preferred_az { + child_state.set_preferred_az(preferred_az.clone()); + } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -4346,6 +4351,7 @@ impl Service { let mut policy = None; let mut config = None; let mut shard_ident = None; + let mut preferred_az_id = None; // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -4404,6 +4410,9 @@ impl Service { if config.is_none() { config = Some(shard.config.clone()); } + if preferred_az_id.is_none() { + preferred_az_id = shard.preferred_az().cloned(); + } if tenant_shard_id.shard_count.count() == split_req.new_shard_count { tracing::info!( @@ -4474,6 +4483,7 @@ impl Service { policy, config, shard_ident, + preferred_az_id, }))) } @@ -4496,6 +4506,7 @@ impl Service { policy, config, shard_ident, + preferred_az_id, } = *params; // Drop any secondary locations: pageservers do not support splitting these, and in any case the @@ -4569,7 +4580,7 @@ impl Service { // Scheduling policies and preferred AZ do not carry through to children scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), - preferred_az_id: None, + preferred_az_id: preferred_az_id.as_ref().map(|az| az.0.clone()), }); } @@ -4689,47 +4700,6 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); - // Now that we have scheduled the child shards, attempt to set their preferred AZ - // to that of the pageserver they've been attached on. - let preferred_azs = { - let locked = self.inner.read().unwrap(); - child_locations - .iter() - .filter_map(|(tid, node_id, _stripe_size)| { - let az_id = locked - .nodes - .get(node_id) - .map(|n| n.get_availability_zone_id().clone())?; - - Some((*tid, az_id)) - }) - .collect::>() - }; - - let updated = self - .persistence - .set_tenant_shard_preferred_azs(preferred_azs) - .await - .map_err(|err| { - ApiError::InternalServerError(anyhow::anyhow!( - "Failed to persist preferred az ids: {err}" - )) - }); - - match updated { - Ok(updated) => { - let mut locked = self.inner.write().unwrap(); - for (tid, az_id) in updated { - if let Some(shard) = locked.tenants.get_mut(&tid) { - shard.set_preferred_az(az_id); - } - } - } - Err(err) => { - tracing::warn!("Failed to persist preferred AZs after split: {err}"); - } - } - // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 244893a616..f878116d53 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3057,7 +3057,11 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): for shard in shards: attached_to = shard["node_attached"] expected_az = env.get_pageserver(attached_to).az_id - assert shard["preferred_az_id"] == expected_az + + # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed + # in putting the tenant shards in the preferred AZ. + # To be fixed in https://github.com/neondatabase/neon/pull/9916 + # assert shard["preferred_az_id"] == expected_az @run_only_on_default_postgres("Postgres version makes no difference here") From 41bb9c528009ff24ffe223cc2372722ee3054601 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 17:22:49 +0000 Subject: [PATCH 41/65] pageserver: only store SLRUs & aux files on shard zero (#9786) ## Problem Since https://github.com/neondatabase/neon/pull/9423 the non-zero shards no longer need SLRU content in order to do GC. This data is now redundant on shards >0. One release cycle after merging that PR, we may merge this one, which also stops writing those pages to shards > 0, reaping the efficiency benefit. Closes: https://github.com/neondatabase/neon/issues/7512 Closes: https://github.com/neondatabase/neon/issues/9641 ## Summary of changes - Avoid storing SLRUs on non-zero shards - Bonus: avoid storing aux files on non-zero shards --- libs/pageserver_api/src/key.rs | 5 ++ libs/pageserver_api/src/shard.rs | 34 +++++++++--- libs/wal_decoder/src/decoder.rs | 54 ++++++++++-------- pageserver/src/import_datadir.rs | 18 ++++-- pageserver/src/pgdatadir_mapping.rs | 55 ++++++++++++------- .../src/tenant/timeline/import_pgdata/flow.rs | 49 +++++++---------- pageserver/src/walingest.rs | 4 ++ 7 files changed, 134 insertions(+), 85 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 523d143381..37dff6fe46 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -770,6 +770,11 @@ impl Key { && self.field6 == 1 } + #[inline(always)] + pub fn is_aux_file_key(&self) -> bool { + self.field1 == AUX_KEY_PREFIX + } + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. #[inline(always)] pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e83cf4c855..a5c94a82c1 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -170,19 +170,37 @@ impl ShardIdentity { } } + /// Return true if the key should be stored on all shards, not just one. + fn is_key_global(&self, key: &Key) -> bool { + if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { + // Special keys that are only stored on shard 0 + false + } else if key.is_rel_block_key() { + // Ordinary relation blocks are distributed across shards + false + } else if key.is_rel_size_key() { + // All shards maintain rel size keys (although only shard 0 is responsible for + // keeping it strictly accurate, other shards just reflect the highest block they've ingested) + true + } else { + // For everything else, we assume it must be kept everywhere, because ingest code + // might assume this -- this covers functionality where the ingest code has + // not (yet) been made fully shard aware. + true + } + } + /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { - if key_is_shard0(key) { - // Q: Why can't we dispose of shard0 content if we're not shard 0? - // A1: because the WAL ingestion logic currently ingests some shard 0 - // content on all shards, even though it's only read on shard 0. If we - // dropped it, then subsequent WAL ingest to these keys would encounter - // an error. - // A2: because key_is_shard0 also covers relation size keys, which are written - // on all shards even though they're only maintained accurately on shard 0. + if self.count < ShardCount(2) { + // Fast path: unsharded tenant doesn't dispose of anything + return false; + } + + if self.is_key_global(key) { false } else { !self.is_key_local(key) diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 36c4b19266..aa50c62911 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -112,30 +112,38 @@ impl MetadataRecord { }; // Next, filter the metadata record by shard. - - // Route VM page updates to the shards that own them. VM pages are stored in the VM fork - // of the main relation. These are sharded and managed just like regular relation pages. - // See: https://github.com/neondatabase/neon/issues/9855 - if let Some( - MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) - | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), - ) = metadata_record - { - let is_local_vm_page = |heap_blk| { - let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); - shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) - }; - // Send the old and new VM page updates to their respective shards. - clear_vm_bits.old_heap_blkno = clear_vm_bits - .old_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - clear_vm_bits.new_heap_blkno = clear_vm_bits - .new_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - // If neither VM page belongs to this shard, discard the record. - if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() { - metadata_record = None + match metadata_record { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), + ) => { + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. + clear_vm_bits.old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + clear_vm_bits.new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. + if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() + { + metadata_record = None + } } + Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { + // Filter LogicalMessage records (AUX files) to only be stored on shard zero + if !shard.is_shard_zero() { + metadata_record = None; + } + } + _ => {} } Ok(metadata_record) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 06c4553e1c..c061714010 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -575,18 +575,24 @@ async fn import_file( } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported clog slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported clog slru"); + } } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported multixact offsets slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported multixact offsets slru"); + } } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported multixact members slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported multixact members slru"); + } } else if file_path.starts_with("pg_twophase") { let bytes = read_all_bytes(reader).await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a00ec761e2..255bd01e25 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -530,6 +530,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; @@ -552,6 +553,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn, ctx).await } @@ -564,6 +566,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_segment_size_to_key(kind, segno); let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) @@ -577,6 +580,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); // fetch directory listing let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; @@ -1047,26 +1051,28 @@ impl Timeline { } // Iterate SLRUs next - for kind in [ - SlruKind::Clog, - SlruKind::MultiXactMembers, - SlruKind::MultiXactOffsets, - ] { - let slrudir_key = slru_dir_to_key(kind); - result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn, ctx).await?; - let dir = SlruSegmentDirectory::des(&buf)?; - let mut segments: Vec = dir.segments.iter().cloned().collect(); - segments.sort_unstable(); - for segno in segments { - let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn, ctx).await?; - let segsize = buf.get_u32_le(); + if self.tenant_shard_id.is_shard_zero() { + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactMembers, + SlruKind::MultiXactOffsets, + ] { + let slrudir_key = slru_dir_to_key(kind); + result.add_key(slrudir_key); + let buf = self.get(slrudir_key, lsn, ctx).await?; + let dir = SlruSegmentDirectory::des(&buf)?; + let mut segments: Vec = dir.segments.iter().cloned().collect(); + segments.sort_unstable(); + for segno in segments { + let segsize_key = slru_segment_size_to_key(kind, segno); + let mut buf = self.get(segsize_key, lsn, ctx).await?; + let segsize = buf.get_u32_le(); - result.add_range( - slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), - ); - result.add_key(segsize_key); + result.add_range( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), + ); + result.add_key(segsize_key); + } } } @@ -1468,6 +1474,10 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> anyhow::Result<()> { + if !self.tline.tenant_shard_id.is_shard_zero() { + return Ok(()); + } + self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), @@ -1501,6 +1511,8 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1542,6 +1554,7 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1853,6 +1866,8 @@ impl<'a> DatadirModification<'a> { nblocks: BlockNumber, ctx: &RequestContext, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + // Add it to the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; @@ -1885,6 +1900,8 @@ impl<'a> DatadirModification<'a> { segno: u32, nblocks: BlockNumber, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index cbd4168c06..4388072606 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -129,22 +129,23 @@ impl Flow { } // Import SLRUs - - // pg_xact (01:00 keyspace) - self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + if self.timeline.tenant_shard_id.is_shard_zero() { + // pg_xact (01:00 keyspace) + self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + .await?; + // pg_multixact/members (01:01 keyspace) + self.import_slru( + SlruKind::MultiXactMembers, + &self.storage.pgdata().join("pg_multixact/members"), + ) .await?; - // pg_multixact/members (01:01 keyspace) - self.import_slru( - SlruKind::MultiXactMembers, - &self.storage.pgdata().join("pg_multixact/members"), - ) - .await?; - // pg_multixact/offsets (01:02 keyspace) - self.import_slru( - SlruKind::MultiXactOffsets, - &self.storage.pgdata().join("pg_multixact/offsets"), - ) - .await?; + // pg_multixact/offsets (01:02 keyspace) + self.import_slru( + SlruKind::MultiXactOffsets, + &self.storage.pgdata().join("pg_multixact/offsets"), + ) + .await?; + } // Import pg_twophase. // TODO: as empty @@ -302,6 +303,8 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { + assert!(self.timeline.tenant_shard_id.is_shard_zero()); + let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments .into_iter() @@ -337,7 +340,6 @@ impl Flow { debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment"); self.tasks .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new( - *self.timeline.get_shard_identity(), start_key..end_key, &p, self.storage.clone(), @@ -631,21 +633,14 @@ impl ImportTask for ImportRelBlocksTask { } struct ImportSlruBlocksTask { - shard_identity: ShardIdentity, key_range: Range, path: RemotePath, storage: RemoteStorageWrapper, } impl ImportSlruBlocksTask { - fn new( - shard_identity: ShardIdentity, - key_range: Range, - path: &RemotePath, - storage: RemoteStorageWrapper, - ) -> Self { + fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { - shard_identity, key_range, path: path.clone(), storage, @@ -673,17 +668,13 @@ impl ImportTask for ImportSlruBlocksTask { let mut file_offset = 0; while blknum < end_blk { let key = slru_block_to_key(kind, segno, blknum); - assert!( - !self.shard_identity.is_key_disposable(&key), - "SLRU keys need to go into every shard" - ); let buf = &buf[file_offset..(file_offset + 8192)]; file_offset += 8192; layer_writer .put_image(key, Bytes::copy_from_slice(buf), ctx) .await?; - blknum += 1; nimages += 1; + blknum += 1; } Ok(nimages) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d568da596a..93ae88936f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1392,6 +1392,10 @@ impl WalIngest { img: Bytes, ctx: &RequestContext, ) -> Result<()> { + if !self.shard.is_shard_zero() { + return Ok(()); + } + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; From f7d5322e8b428ea4277e2bdf572612e4733d193a Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 18:36:37 +0000 Subject: [PATCH 42/65] pageserver: more detailed logs when calling re-attach (#9996) ## Problem We saw a peculiar case where a pageserver apparently got a 0-tenant response to `/re-attach` but we couldn't see the request landing on a storage controller. It was hard to confirm retrospectively that the pageserver was configured properly at the moment it sent the request. ## Summary of changes - Log the URL to which we are sending the request - Log the NodeId and metadata that we sent --- libs/pageserver_api/src/controller_api.rs | 4 ++-- pageserver/src/controller_upcall_client.rs | 12 +++++++++--- pageserver/src/tenant/mgr.rs | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 0ea30ce54f..9a5ebc95bd 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -48,7 +48,7 @@ pub struct TenantCreateResponse { pub shards: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct NodeRegisterRequest { pub node_id: NodeId, @@ -75,7 +75,7 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)] pub struct AvailabilityZone(pub String); impl Display for AvailabilityZone { diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 73fc6dc3ab..d41bfd9021 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -115,6 +115,10 @@ impl ControllerUpcallClient { Ok(res) } + + pub(crate) fn base_url(&self) -> &Url { + &self.base_url + } } impl ControlPlaneGenerationsApi for ControllerUpcallClient { @@ -191,13 +195,15 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { let request = ReAttachRequest { node_id: self.node_id, - register, + register: register.clone(), }; let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( - "Received re-attach response with {} tenants", - response.tenants.len() + "Received re-attach response with {} tenants (node {}, register: {:?})", + response.tenants.len(), + self.node_id, + register, ); failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index eb8191e43e..45481c4ed4 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -347,7 +347,7 @@ async fn init_load_generations( ); emergency_generations(tenant_confs) } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) { - info!("Calling control plane API to re-attach tenants"); + info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { Ok(tenants) => tenants From 60241127e29cee199fc75fe5ddc15b6729d96aa7 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 3 Dec 2024 18:39:23 +0000 Subject: [PATCH 43/65] chore(proxy): remove postgres config parser and md5 support (#9990) Keeping the `mock` postgres cplane adaptor using "stock" tokio-postgres allows us to remove a lot of dead weight from our actual postgres connection logic. --- Cargo.lock | 2 +- libs/proxy/postgres-protocol2/Cargo.toml | 1 - .../src/authentication/mod.rs | 35 -- .../postgres-protocol2/src/message/backend.rs | 8 +- .../postgres-protocol2/src/password/mod.rs | 18 - .../postgres-protocol2/src/password/test.rs | 8 - libs/proxy/tokio-postgres2/src/config.rs | 465 +----------------- libs/proxy/tokio-postgres2/src/connect_raw.rs | 19 +- libs/proxy/tokio-postgres2/src/error/mod.rs | 6 - libs/proxy/tokio-postgres2/src/lib.rs | 20 - proxy/Cargo.toml | 6 +- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/console_redirect.rs | 2 +- proxy/src/auth/backend/mod.rs | 2 +- proxy/src/auth/flow.rs | 2 +- proxy/src/cancellation.rs | 6 +- proxy/src/compute.rs | 26 +- proxy/src/control_plane/client/mock.rs | 3 +- proxy/src/control_plane/client/neon.rs | 2 +- proxy/src/error.rs | 2 +- proxy/src/postgres_rustls/mod.rs | 6 +- proxy/src/proxy/retry.rs | 16 +- proxy/src/proxy/tests/mitm.rs | 22 +- proxy/src/proxy/tests/mod.rs | 20 +- proxy/src/serverless/backend.rs | 18 +- proxy/src/serverless/conn_pool.rs | 6 +- proxy/src/serverless/conn_pool_lib.rs | 4 +- proxy/src/serverless/json.rs | 6 +- proxy/src/serverless/local_conn_pool.rs | 10 +- proxy/src/serverless/sql_over_http.rs | 18 +- 30 files changed, 96 insertions(+), 665 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b2769e59f0..5b80ec5e93 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4209,7 +4209,6 @@ dependencies = [ "bytes", "fallible-iterator", "hmac", - "md-5", "memchr", "rand 0.8.5", "sha2", @@ -4612,6 +4611,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", + "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite", diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index 284a632954..f71c1599c7 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -10,7 +10,6 @@ byteorder.workspace = true bytes.workspace = true fallible-iterator.workspace = true hmac.workspace = true -md-5 = "0.10" memchr = "2.0" rand.workspace = true sha2.workspace = true diff --git a/libs/proxy/postgres-protocol2/src/authentication/mod.rs b/libs/proxy/postgres-protocol2/src/authentication/mod.rs index 71afa4b9b6..0bdc177143 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/mod.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/mod.rs @@ -1,37 +1,2 @@ //! Authentication protocol support. -use md5::{Digest, Md5}; - pub mod sasl; - -/// Hashes authentication information in a way suitable for use in response -/// to an `AuthenticationMd5Password` message. -/// -/// The resulting string should be sent back to the database in a -/// `PasswordMessage` message. -#[inline] -pub fn md5_hash(username: &[u8], password: &[u8], salt: [u8; 4]) -> String { - let mut md5 = Md5::new(); - md5.update(password); - md5.update(username); - let output = md5.finalize_reset(); - md5.update(format!("{:x}", output)); - md5.update(salt); - format!("md5{:x}", md5.finalize()) -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn md5() { - let username = b"md5_user"; - let password = b"password"; - let salt = [0x2a, 0x3d, 0x8f, 0xe0]; - - assert_eq!( - md5_hash(username, password, salt), - "md562af4dd09bbb41884907a838a3233294" - ); - } -} diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 33d77fc252..097964f9c1 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -79,7 +79,7 @@ pub enum Message { AuthenticationCleartextPassword, AuthenticationGss, AuthenticationKerberosV5, - AuthenticationMd5Password(AuthenticationMd5PasswordBody), + AuthenticationMd5Password, AuthenticationOk, AuthenticationScmCredential, AuthenticationSspi, @@ -191,11 +191,7 @@ impl Message { 0 => Message::AuthenticationOk, 2 => Message::AuthenticationKerberosV5, 3 => Message::AuthenticationCleartextPassword, - 5 => { - let mut salt = [0; 4]; - buf.read_exact(&mut salt)?; - Message::AuthenticationMd5Password(AuthenticationMd5PasswordBody { salt }) - } + 5 => Message::AuthenticationMd5Password, 6 => Message::AuthenticationScmCredential, 7 => Message::AuthenticationGss, 8 => Message::AuthenticationGssContinue, diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index e669e80f3f..38eb31dfcf 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -8,7 +8,6 @@ use crate::authentication::sasl; use hmac::{Hmac, Mac}; -use md5::Md5; use rand::RngCore; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; @@ -88,20 +87,3 @@ pub(crate) async fn scram_sha_256_salt( base64::encode(server_key) ) } - -/// **Not recommended, as MD5 is not considered to be secure.** -/// -/// Hash password using MD5 with the username as the salt. -/// -/// The client may assume the returned string doesn't contain any -/// special characters that would require escaping. -pub fn md5(password: &[u8], username: &str) -> String { - // salt password with username - let mut salted_password = Vec::from(password); - salted_password.extend_from_slice(username.as_bytes()); - - let mut hash = Md5::new(); - hash.update(&salted_password); - let digest = hash.finalize(); - format!("md5{:x}", digest) -} diff --git a/libs/proxy/postgres-protocol2/src/password/test.rs b/libs/proxy/postgres-protocol2/src/password/test.rs index c9d340f09d..0692c07adb 100644 --- a/libs/proxy/postgres-protocol2/src/password/test.rs +++ b/libs/proxy/postgres-protocol2/src/password/test.rs @@ -9,11 +9,3 @@ async fn test_encrypt_scram_sha_256() { "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA=" ); } - -#[test] -fn test_encrypt_md5() { - assert_eq!( - password::md5(b"secret", "foo"), - "md54ab2c5d00339c4b2a4e921d2dc4edec7" - ); -} diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 26124b38ef..5dad835c3b 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -6,11 +6,9 @@ use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; -use std::borrow::Cow; +use std::fmt; use std::str; -use std::str::FromStr; use std::time::Duration; -use std::{error, fmt, iter, mem}; use tokio::io::{AsyncRead, AsyncWrite}; pub use postgres_protocol2::authentication::sasl::ScramKeys; @@ -380,99 +378,6 @@ impl Config { self.max_backend_message_size } - fn param(&mut self, key: &str, value: &str) -> Result<(), Error> { - match key { - "user" => { - self.user(value); - } - "password" => { - self.password(value); - } - "dbname" => { - self.dbname(value); - } - "options" => { - self.options(value); - } - "application_name" => { - self.application_name(value); - } - "sslmode" => { - let mode = match value { - "disable" => SslMode::Disable, - "prefer" => SslMode::Prefer, - "require" => SslMode::Require, - _ => return Err(Error::config_parse(Box::new(InvalidValue("sslmode")))), - }; - self.ssl_mode(mode); - } - "host" => { - for host in value.split(',') { - self.host(host); - } - } - "port" => { - for port in value.split(',') { - let port = if port.is_empty() { - 5432 - } else { - port.parse() - .map_err(|_| Error::config_parse(Box::new(InvalidValue("port"))))? - }; - self.port(port); - } - } - "connect_timeout" => { - let timeout = value - .parse::() - .map_err(|_| Error::config_parse(Box::new(InvalidValue("connect_timeout"))))?; - if timeout > 0 { - self.connect_timeout(Duration::from_secs(timeout as u64)); - } - } - "target_session_attrs" => { - let target_session_attrs = match value { - "any" => TargetSessionAttrs::Any, - "read-write" => TargetSessionAttrs::ReadWrite, - _ => { - return Err(Error::config_parse(Box::new(InvalidValue( - "target_session_attrs", - )))); - } - }; - self.target_session_attrs(target_session_attrs); - } - "channel_binding" => { - let channel_binding = match value { - "disable" => ChannelBinding::Disable, - "prefer" => ChannelBinding::Prefer, - "require" => ChannelBinding::Require, - _ => { - return Err(Error::config_parse(Box::new(InvalidValue( - "channel_binding", - )))) - } - }; - self.channel_binding(channel_binding); - } - "max_backend_message_size" => { - let limit = value.parse::().map_err(|_| { - Error::config_parse(Box::new(InvalidValue("max_backend_message_size"))) - })?; - if limit > 0 { - self.max_backend_message_size(limit); - } - } - key => { - return Err(Error::config_parse(Box::new(UnknownOption( - key.to_string(), - )))); - } - } - - Ok(()) - } - /// Opens a connection to a PostgreSQL database. /// /// Requires the `runtime` Cargo feature (enabled by default). @@ -499,17 +404,6 @@ impl Config { } } -impl FromStr for Config { - type Err = Error; - - fn from_str(s: &str) -> Result { - match UrlParser::parse(s)? { - Some(config) => Ok(config), - None => Parser::parse(s), - } - } -} - // Omit password from debug output impl fmt::Debug for Config { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -536,360 +430,3 @@ impl fmt::Debug for Config { .finish() } } - -#[derive(Debug)] -struct UnknownOption(String); - -impl fmt::Display for UnknownOption { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(fmt, "unknown option `{}`", self.0) - } -} - -impl error::Error for UnknownOption {} - -#[derive(Debug)] -struct InvalidValue(&'static str); - -impl fmt::Display for InvalidValue { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(fmt, "invalid value for option `{}`", self.0) - } -} - -impl error::Error for InvalidValue {} - -struct Parser<'a> { - s: &'a str, - it: iter::Peekable>, -} - -impl<'a> Parser<'a> { - fn parse(s: &'a str) -> Result { - let mut parser = Parser { - s, - it: s.char_indices().peekable(), - }; - - let mut config = Config::new(); - - while let Some((key, value)) = parser.parameter()? { - config.param(key, &value)?; - } - - Ok(config) - } - - fn skip_ws(&mut self) { - self.take_while(char::is_whitespace); - } - - fn take_while(&mut self, f: F) -> &'a str - where - F: Fn(char) -> bool, - { - let start = match self.it.peek() { - Some(&(i, _)) => i, - None => return "", - }; - - loop { - match self.it.peek() { - Some(&(_, c)) if f(c) => { - self.it.next(); - } - Some(&(i, _)) => return &self.s[start..i], - None => return &self.s[start..], - } - } - } - - fn eat(&mut self, target: char) -> Result<(), Error> { - match self.it.next() { - Some((_, c)) if c == target => Ok(()), - Some((i, c)) => { - let m = format!( - "unexpected character at byte {}: expected `{}` but got `{}`", - i, target, c - ); - Err(Error::config_parse(m.into())) - } - None => Err(Error::config_parse("unexpected EOF".into())), - } - } - - fn eat_if(&mut self, target: char) -> bool { - match self.it.peek() { - Some(&(_, c)) if c == target => { - self.it.next(); - true - } - _ => false, - } - } - - fn keyword(&mut self) -> Option<&'a str> { - let s = self.take_while(|c| match c { - c if c.is_whitespace() => false, - '=' => false, - _ => true, - }); - - if s.is_empty() { - None - } else { - Some(s) - } - } - - fn value(&mut self) -> Result { - let value = if self.eat_if('\'') { - let value = self.quoted_value()?; - self.eat('\'')?; - value - } else { - self.simple_value()? - }; - - Ok(value) - } - - fn simple_value(&mut self) -> Result { - let mut value = String::new(); - - while let Some(&(_, c)) = self.it.peek() { - if c.is_whitespace() { - break; - } - - self.it.next(); - if c == '\\' { - if let Some((_, c2)) = self.it.next() { - value.push(c2); - } - } else { - value.push(c); - } - } - - if value.is_empty() { - return Err(Error::config_parse("unexpected EOF".into())); - } - - Ok(value) - } - - fn quoted_value(&mut self) -> Result { - let mut value = String::new(); - - while let Some(&(_, c)) = self.it.peek() { - if c == '\'' { - return Ok(value); - } - - self.it.next(); - if c == '\\' { - if let Some((_, c2)) = self.it.next() { - value.push(c2); - } - } else { - value.push(c); - } - } - - Err(Error::config_parse( - "unterminated quoted connection parameter value".into(), - )) - } - - fn parameter(&mut self) -> Result, Error> { - self.skip_ws(); - let keyword = match self.keyword() { - Some(keyword) => keyword, - None => return Ok(None), - }; - self.skip_ws(); - self.eat('=')?; - self.skip_ws(); - let value = self.value()?; - - Ok(Some((keyword, value))) - } -} - -// This is a pretty sloppy "URL" parser, but it matches the behavior of libpq, where things really aren't very strict -struct UrlParser<'a> { - s: &'a str, - config: Config, -} - -impl<'a> UrlParser<'a> { - fn parse(s: &'a str) -> Result, Error> { - let s = match Self::remove_url_prefix(s) { - Some(s) => s, - None => return Ok(None), - }; - - let mut parser = UrlParser { - s, - config: Config::new(), - }; - - parser.parse_credentials()?; - parser.parse_host()?; - parser.parse_path()?; - parser.parse_params()?; - - Ok(Some(parser.config)) - } - - fn remove_url_prefix(s: &str) -> Option<&str> { - for prefix in &["postgres://", "postgresql://"] { - if let Some(stripped) = s.strip_prefix(prefix) { - return Some(stripped); - } - } - - None - } - - fn take_until(&mut self, end: &[char]) -> Option<&'a str> { - match self.s.find(end) { - Some(pos) => { - let (head, tail) = self.s.split_at(pos); - self.s = tail; - Some(head) - } - None => None, - } - } - - fn take_all(&mut self) -> &'a str { - mem::take(&mut self.s) - } - - fn eat_byte(&mut self) { - self.s = &self.s[1..]; - } - - fn parse_credentials(&mut self) -> Result<(), Error> { - let creds = match self.take_until(&['@']) { - Some(creds) => creds, - None => return Ok(()), - }; - self.eat_byte(); - - let mut it = creds.splitn(2, ':'); - let user = self.decode(it.next().unwrap())?; - self.config.user(&user); - - if let Some(password) = it.next() { - let password = Cow::from(percent_encoding::percent_decode(password.as_bytes())); - self.config.password(password); - } - - Ok(()) - } - - fn parse_host(&mut self) -> Result<(), Error> { - let host = match self.take_until(&['/', '?']) { - Some(host) => host, - None => self.take_all(), - }; - - if host.is_empty() { - return Ok(()); - } - - for chunk in host.split(',') { - let (host, port) = if chunk.starts_with('[') { - let idx = match chunk.find(']') { - Some(idx) => idx, - None => return Err(Error::config_parse(InvalidValue("host").into())), - }; - - let host = &chunk[1..idx]; - let remaining = &chunk[idx + 1..]; - let port = if let Some(port) = remaining.strip_prefix(':') { - Some(port) - } else if remaining.is_empty() { - None - } else { - return Err(Error::config_parse(InvalidValue("host").into())); - }; - - (host, port) - } else { - let mut it = chunk.splitn(2, ':'); - (it.next().unwrap(), it.next()) - }; - - self.host_param(host)?; - let port = self.decode(port.unwrap_or("5432"))?; - self.config.param("port", &port)?; - } - - Ok(()) - } - - fn parse_path(&mut self) -> Result<(), Error> { - if !self.s.starts_with('/') { - return Ok(()); - } - self.eat_byte(); - - let dbname = match self.take_until(&['?']) { - Some(dbname) => dbname, - None => self.take_all(), - }; - - if !dbname.is_empty() { - self.config.dbname(&self.decode(dbname)?); - } - - Ok(()) - } - - fn parse_params(&mut self) -> Result<(), Error> { - if !self.s.starts_with('?') { - return Ok(()); - } - self.eat_byte(); - - while !self.s.is_empty() { - let key = match self.take_until(&['=']) { - Some(key) => self.decode(key)?, - None => return Err(Error::config_parse("unterminated parameter".into())), - }; - self.eat_byte(); - - let value = match self.take_until(&['&']) { - Some(value) => { - self.eat_byte(); - value - } - None => self.take_all(), - }; - - if key == "host" { - self.host_param(value)?; - } else { - let value = self.decode(value)?; - self.config.param(&key, &value)?; - } - } - - Ok(()) - } - - fn host_param(&mut self, s: &str) -> Result<(), Error> { - let s = self.decode(s)?; - self.config.param("host", &s) - } - - fn decode(&self, s: &'a str) -> Result, Error> { - percent_encoding::percent_decode(s.as_bytes()) - .decode_utf8() - .map_err(|e| Error::config_parse(e.into())) - } -} diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 9c6f1a2552..390f133002 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -7,7 +7,6 @@ use crate::Error; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; -use postgres_protocol2::authentication; use postgres_protocol2::authentication::sasl; use postgres_protocol2::authentication::sasl::ScramSha256; use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; @@ -174,25 +173,11 @@ where authenticate_password(stream, pass).await?; } - Some(Message::AuthenticationMd5Password(body)) => { - can_skip_channel_binding(config)?; - - let user = config - .user - .as_ref() - .ok_or_else(|| Error::config("user missing".into()))?; - let pass = config - .password - .as_ref() - .ok_or_else(|| Error::config("password missing".into()))?; - - let output = authentication::md5_hash(user.as_bytes(), pass, body.salt()); - authenticate_password(stream, output.as_bytes()).await?; - } Some(Message::AuthenticationSasl(body)) => { authenticate_sasl(stream, body, config).await?; } - Some(Message::AuthenticationKerberosV5) + Some(Message::AuthenticationMd5Password) + | Some(Message::AuthenticationKerberosV5) | Some(Message::AuthenticationScmCredential) | Some(Message::AuthenticationGss) | Some(Message::AuthenticationSspi) => { diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 6514322250..922c348525 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -349,7 +349,6 @@ enum Kind { Parse, Encode, Authentication, - ConfigParse, Config, Connect, Timeout, @@ -386,7 +385,6 @@ impl fmt::Display for Error { Kind::Parse => fmt.write_str("error parsing response from server")?, Kind::Encode => fmt.write_str("error encoding message to server")?, Kind::Authentication => fmt.write_str("authentication error")?, - Kind::ConfigParse => fmt.write_str("invalid connection string")?, Kind::Config => fmt.write_str("invalid configuration")?, Kind::Connect => fmt.write_str("error connecting to server")?, Kind::Timeout => fmt.write_str("timeout waiting for server")?, @@ -482,10 +480,6 @@ impl Error { Error::new(Kind::Authentication, Some(e)) } - pub(crate) fn config_parse(e: Box) -> Error { - Error::new(Kind::ConfigParse, Some(e)) - } - pub(crate) fn config(e: Box) -> Error { Error::new(Kind::Config, Some(e)) } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 57c639a7de..901ed0c96c 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -13,14 +13,12 @@ pub use crate::query::RowStream; pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; -use crate::tls::MakeTlsConnect; pub use crate::tls::NoTls; pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; use postgres_protocol2::message::backend::ReadyForQueryBody; -use tokio::net::TcpStream; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] @@ -72,24 +70,6 @@ mod transaction; mod transaction_builder; pub mod types; -/// A convenience function which parses a connection string and connects to the database. -/// -/// See the documentation for [`Config`] for details on the connection string format. -/// -/// Requires the `runtime` Cargo feature (enabled by default). -/// -/// [`Config`]: config/struct.Config.html -pub async fn connect( - config: &str, - tls: T, -) -> Result<(Client, Connection), Error> -where - T: MakeTlsConnect, -{ - let config = config.parse::()?; - config.connect(tls).await -} - /// An asynchronous notification. #[derive(Clone, Debug)] pub struct Notification { diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f5934c8a89..2f63ee3acc 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -6,7 +6,7 @@ license.workspace = true [features] default = [] -testing = [] +testing = ["dep:tokio-postgres"] [dependencies] ahash.workspace = true @@ -55,6 +55,7 @@ parquet.workspace = true parquet_derive.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true +postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true prometheus.workspace = true @@ -81,7 +82,7 @@ subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tokio-postgres = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } +tokio-postgres = { workspace = true, optional = true } tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } @@ -119,3 +120,4 @@ rcgen.workspace = true rstest.workspace = true walkdir.workspace = true rand_distr = "0.4" +tokio-postgres.workspace = true diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 491b272ac4..5e494dfdd6 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -66,7 +66,7 @@ pub(super) async fn authenticate( Ok(ComputeCredentials { info: creds, - keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256( + keys: ComputeCredentialKeys::AuthKeys(postgres_client::config::AuthKeys::ScramSha256( scram_keys, )), }) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index bf7a1cb070..494564de05 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,8 +1,8 @@ use async_trait::async_trait; +use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7e1b26a11a..84a572dcf9 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -11,8 +11,8 @@ pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; +use postgres_client::config::AuthKeys; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::config::AuthKeys; use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9c6ce151cb..60d1962d7f 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -227,7 +227,7 @@ pub(crate) async fn validate_password_and_exchange( }; Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys( - tokio_postgres::config::AuthKeys::ScramSha256(keys), + postgres_client::config::AuthKeys::ScramSha256(keys), ))) } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 91e198bf88..bcb0ef40bd 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,11 +3,11 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; +use postgres_client::{CancelToken, NoTls}; use pq_proto::CancelKeyData; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; -use tokio_postgres::{CancelToken, NoTls}; use tracing::{debug, info}; use uuid::Uuid; @@ -44,7 +44,7 @@ pub(crate) enum CancelError { IO(#[from] std::io::Error), #[error("{0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("rate limit exceeded")] RateLimit, @@ -70,7 +70,7 @@ impl ReportableError for CancelError { impl CancellationHandler

{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. pub(crate) fn get_session(self: Arc) -> Session

{ - // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't + // HACK: We'd rather get the real backend_pid but postgres_client doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the // actual backend_pid, but backend_pid is not used for anything diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b689b97a21..06bc71c559 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,6 +6,8 @@ use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; +use postgres_client::tls::MakeTlsConnect; +use postgres_client::{CancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; @@ -13,8 +15,6 @@ use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; -use tokio_postgres::tls::MakeTlsConnect; -use tokio_postgres::{CancelToken, RawConnection}; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -34,9 +34,9 @@ pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] pub(crate) enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, - /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such. + /// `postgres_client::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), @@ -99,13 +99,13 @@ impl ReportableError for ConnectionError { } /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. -pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>; +pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>; /// A config for establishing a connection to compute node. -/// Eventually, `tokio_postgres` will be replaced with something better. +/// Eventually, `postgres_client` will be replaced with something better. /// Newtype allows us to implement methods on top of it. #[derive(Clone, Default)] -pub(crate) struct ConnCfg(Box); +pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { @@ -126,7 +126,7 @@ impl ConnCfg { pub(crate) fn get_host(&self) -> Result { match self.0.get_hosts() { - [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + [postgres_client::config::Host::Tcp(s)] => Ok(s.into()), // we should not have multiple address or unix addresses. _ => Err(WakeComputeError::BadComputeAddress( "invalid compute address".into(), @@ -160,7 +160,7 @@ impl ConnCfg { // TODO: This is especially ugly... if let Some(replication) = params.get("replication") { - use tokio_postgres::config::ReplicationMode; + use postgres_client::config::ReplicationMode; match replication { "true" | "on" | "yes" | "1" => { self.replication_mode(ReplicationMode::Physical); @@ -182,7 +182,7 @@ impl ConnCfg { } impl std::ops::Deref for ConnCfg { - type Target = tokio_postgres::Config; + type Target = postgres_client::Config; fn deref(&self) -> &Self::Target { &self.0 @@ -199,7 +199,7 @@ impl std::ops::DerefMut for ConnCfg { impl ConnCfg { /// Establish a raw TCP connection to the compute node. async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { - use tokio_postgres::config::Host; + use postgres_client::config::Host; // wrap TcpStream::connect with timeout let connect_with_timeout = |host, port| { @@ -224,7 +224,7 @@ impl ConnCfg { }) }; - // We can't reuse connection establishing logic from `tokio_postgres` here, + // We can't reuse connection establishing logic from `postgres_client` here, // because it has no means for extracting the underlying socket which we // require for our business. let mut connection_error = None; @@ -272,7 +272,7 @@ type RustlsStream = > pub(crate) struct PostgresConnection { /// Socket connected to a compute node. pub(crate) stream: - tokio_postgres::maybe_tls_stream::MaybeTlsStream, + postgres_client::maybe_tls_stream::MaybeTlsStream, /// PostgreSQL connection parameters. pub(crate) params: std::collections::HashMap, /// Query cancellation token. diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 9537d717a1..4d55f96ca1 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -5,7 +5,6 @@ use std::sync::Arc; use futures::TryFutureExt; use thiserror::Error; -use tokio_postgres::config::SslMode; use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; @@ -165,7 +164,7 @@ impl MockControlPlane { config .host(self.endpoint.host_str().unwrap_or("localhost")) .port(self.endpoint.port().unwrap_or(5432)) - .ssl_mode(SslMode::Disable); + .ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 2cad981d01..5a78ec9d32 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -6,8 +6,8 @@ use std::time::Duration; use ::http::header::AUTHORIZATION; use ::http::HeaderName; use futures::TryFutureExt; +use postgres_client::config::SslMode; use tokio::time::Instant; -use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute}; diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 2221aac407..6a379499dc 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -84,7 +84,7 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } -impl ReportableError for tokio_postgres::error::Error { +impl ReportableError for postgres_client::error::Error { fn get_error_kind(&self) -> ErrorKind { if self.as_db_error().is_some() { ErrorKind::Postgres diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/postgres_rustls/mod.rs index 31e7915e89..5ef20991c3 100644 --- a/proxy/src/postgres_rustls/mod.rs +++ b/proxy/src/postgres_rustls/mod.rs @@ -1,10 +1,10 @@ use std::convert::TryFrom; use std::sync::Arc; +use postgres_client::tls::MakeTlsConnect; use rustls::pki_types::ServerName; use rustls::ClientConfig; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::tls::MakeTlsConnect; mod private { use std::future::Future; @@ -12,9 +12,9 @@ mod private { use std::pin::Pin; use std::task::{Context, Poll}; + use postgres_client::tls::{ChannelBinding, TlsConnect}; use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; - use tokio_postgres::tls::{ChannelBinding, TlsConnect}; use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; @@ -59,7 +59,7 @@ mod private { pub struct RustlsStream(TlsStream); - impl tokio_postgres::tls::TlsStream for RustlsStream + impl postgres_client::tls::TlsStream for RustlsStream where S: AsyncRead + AsyncWrite + Unpin, { diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index d3f0c3e7d4..42d1491782 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -31,9 +31,9 @@ impl CouldRetry for io::Error { } } -impl CouldRetry for tokio_postgres::error::DbError { +impl CouldRetry for postgres_client::error::DbError { fn could_retry(&self) -> bool { - use tokio_postgres::error::SqlState; + use postgres_client::error::SqlState; matches!( self.code(), &SqlState::CONNECTION_FAILURE @@ -43,9 +43,9 @@ impl CouldRetry for tokio_postgres::error::DbError { ) } } -impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { +impl ShouldRetryWakeCompute for postgres_client::error::DbError { fn should_retry_wake_compute(&self) -> bool { - use tokio_postgres::error::SqlState; + use postgres_client::error::SqlState; // Here are errors that happens after the user successfully authenticated to the database. // TODO: there are pgbouncer errors that should be retried, but they are not listed here. !matches!( @@ -61,21 +61,21 @@ impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { } } -impl CouldRetry for tokio_postgres::Error { +impl CouldRetry for postgres_client::Error { fn could_retry(&self) -> bool { if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { io::Error::could_retry(io_err) } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::could_retry(db_err) + postgres_client::error::DbError::could_retry(db_err) } else { false } } } -impl ShouldRetryWakeCompute for tokio_postgres::Error { +impl ShouldRetryWakeCompute for postgres_client::Error { fn should_retry_wake_compute(&self) -> bool { if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::should_retry_wake_compute(db_err) + postgres_client::error::DbError::should_retry_wake_compute(db_err) } else { // likely an IO error. Possible the compute has shutdown and the // cache is stale. diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index fe211adfeb..ef351f3b54 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -8,9 +8,9 @@ use std::fmt::Debug; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; +use postgres_client::tls::TlsConnect; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; use super::*; @@ -158,8 +158,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _client_err = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + let _client_err = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") @@ -175,7 +175,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { connect_failure( Intercept::None, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -185,7 +185,7 @@ async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { connect_failure( Intercept::Methods, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -195,7 +195,7 @@ async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Result<()> { connect_failure( Intercept::SASLResponse, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -205,7 +205,7 @@ async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Resul async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { connect_failure( Intercept::None, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } @@ -215,7 +215,7 @@ async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { connect_failure( Intercept::Methods, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } @@ -225,14 +225,14 @@ async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { async fn scram_auth_require_channel_binding_intercept_response() -> anyhow::Result<()> { connect_failure( Intercept::SASLResponse, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } async fn connect_failure( intercept: Intercept, - channel_binding: tokio_postgres::config::ChannelBinding, + channel_binding: postgres_client::config::ChannelBinding, ) -> anyhow::Result<()> { let (server, client, client_config, server_config) = proxy_mitm(intercept).await; let proxy = tokio::spawn(dummy_proxy( @@ -241,7 +241,7 @@ async fn connect_failure( Scram::new("password").await?, )); - let _client_err = tokio_postgres::Config::new() + let _client_err = postgres_client::Config::new() .channel_binding(channel_binding) .user("user") .dbname("db") diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 15be6c9724..c8b742b3ff 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -7,13 +7,13 @@ use std::time::Duration; use anyhow::{bail, Context}; use async_trait::async_trait; use http::StatusCode; +use postgres_client::config::SslMode; +use postgres_client::tls::{MakeTlsConnect, NoTls}; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; use tokio::io::DuplexStream; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -204,7 +204,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let client_err = tokio_postgres::Config::new() + let client_err = postgres_client::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let _conn = tokio_postgres::Config::new() + let _conn = postgres_client::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let _conn = tokio_postgres::Config::new() + let _conn = postgres_client::Config::new() .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,8 +296,8 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let _conn = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Require) + let _conn = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Require) .user("user") .dbname("db") .password(password) @@ -320,8 +320,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _conn = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + let _conn = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") @@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .map(char::from) .collect(); - let _client_err = tokio_postgres::Config::new() + let _client_err = postgres_client::Config::new() .user("user") .dbname("db") .password(&password) // no password will match the mocked secret diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 57846a4c2c..8c7931907d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -37,9 +37,9 @@ use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc>>, - pub(crate) local_pool: Arc>, + pub(crate) local_pool: Arc>, pub(crate) pool: - Arc>>, + Arc>>, pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -170,7 +170,7 @@ impl PoolingBackend { conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { let maybe_client = if force_new { debug!("pool: pool is disabled"); None @@ -256,7 +256,7 @@ impl PoolingBackend { &self, ctx: &RequestContext, conn_info: ConnInfo, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? { return Ok(client); } @@ -315,7 +315,7 @@ impl PoolingBackend { )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + let (client, connection) = config.connect(postgres_client::NoTls).await?; drop(pause); let pid = client.get_process_id(); @@ -360,7 +360,7 @@ pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), #[error("could not connection to postgres in compute")] - PostgresConnectionError(#[from] tokio_postgres::Error), + PostgresConnectionError(#[from] postgres_client::Error), #[error("could not connection to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] @@ -479,7 +479,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError { } struct TokioMechanism { - pool: Arc>>, + pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -489,7 +489,7 @@ struct TokioMechanism { #[async_trait] impl ConnectMechanism for TokioMechanism { - type Connection = Client; + type Connection = Client; type ConnectError = HttpConnError; type Error = HttpConnError; @@ -509,7 +509,7 @@ impl ConnectMechanism for TokioMechanism { .connect_timeout(timeout); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let res = config.connect(tokio_postgres::NoTls).await; + let res = config.connect(postgres_client::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c302eac568..cac5a173cb 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -5,11 +5,11 @@ use std::task::{ready, Poll}; use futures::future::poll_fn; use futures::Future; +use postgres_client::tls::NoTlsStream; +use postgres_client::AsyncMessage; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; -use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; #[cfg(test)] @@ -58,7 +58,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: postgres_client::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index fe1d2563bc..2a46c8f9c5 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -7,8 +7,8 @@ use std::time::Duration; use dashmap::DashMap; use parking_lot::RwLock; +use postgres_client::ReadyForQueryStatus; use rand::Rng; -use tokio_postgres::ReadyForQueryStatus; use tracing::{debug, info, Span}; use super::backend::HttpConnError; @@ -683,7 +683,7 @@ pub(crate) trait ClientInnerExt: Sync + Send + 'static { fn get_process_id(&self) -> i32; } -impl ClientInnerExt for tokio_postgres::Client { +impl ClientInnerExt for postgres_client::Client { fn is_closed(&self) -> bool { self.is_closed() } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 569e2da571..25b25c66d3 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,6 +1,6 @@ +use postgres_client::types::{Kind, Type}; +use postgres_client::Row; use serde_json::{Map, Value}; -use tokio_postgres::types::{Kind, Type}; -use tokio_postgres::Row; // // Convert json non-string types to strings, so that they can be passed to Postgres @@ -61,7 +61,7 @@ fn json_array_to_pg_array(value: &Value) -> Option { #[derive(Debug, thiserror::Error)] pub(crate) enum JsonConversionError { #[error("internal error compute returned invalid data: {0}")] - AsTextError(tokio_postgres::Error), + AsTextError(postgres_client::Error), #[error("parse int error: {0}")] ParseIntError(#[from] std::num::ParseIntError), #[error("parse float error: {0}")] diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index db9ac49dae..b84cde9e25 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -22,13 +22,13 @@ use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; +use postgres_client::tls::NoTlsStream; +use postgres_client::types::ToSql; +use postgres_client::AsyncMessage; use serde_json::value::RawValue; use signature::Signer; use tokio::net::TcpStream; use tokio::time::Instant; -use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::types::ToSql; -use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, info_span, warn, Instrument}; @@ -164,7 +164,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: postgres_client::Connection, key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, @@ -280,7 +280,7 @@ pub(crate) fn poll_client( ) } -impl ClientInnerCommon { +impl ClientInnerCommon { pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { if let ClientDataEnum::Local(local_data) = &mut self.data { local_data.jti += 1; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index a0ca7cc60d..5e85f5ec40 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -11,12 +11,12 @@ use http_body_util::{BodyExt, Full}; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use postgres_client::error::{DbError, ErrorPosition, SqlState}; +use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time::{self, Instant}; -use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; -use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; @@ -361,7 +361,7 @@ pub(crate) enum SqlOverHttpError { #[error("invalid isolation level")] InvalidIsolationLevel, #[error("{0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("{0}")] JsonConversion(#[from] JsonConversionError), #[error("{0}")] @@ -986,7 +986,7 @@ async fn query_to_json( // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. - let mut rows: Vec = Vec::new(); + let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; *current_size += row.body_len(); @@ -1063,13 +1063,13 @@ async fn query_to_json( } enum Client { - Remote(conn_pool_lib::Client), - Local(conn_pool_lib::Client), + Remote(conn_pool_lib::Client), + Local(conn_pool_lib::Client), } enum Discard<'a> { - Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), - Local(conn_pool_lib::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, postgres_client::Client>), + Local(conn_pool_lib::Discard<'a, postgres_client::Client>), } impl Client { @@ -1080,7 +1080,7 @@ impl Client { } } - fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { + fn inner(&mut self) -> (&mut postgres_client::Client, Discard<'_>) { match self { Client::Remote(client) => { let (c, d) = client.inner(); From 4157eaf4c56ed066f31512a82ea0a54689ba6648 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 19:47:17 +0100 Subject: [PATCH 44/65] pageserver: respond to multiple shutdown signals (#9982) ## Problem The Pageserver signal handler would only respond to a single signal and initiate shutdown. Subsequent signals were ignored. This meant that a `SIGQUIT` sent after a `SIGTERM` had no effect (e.g. in the case of a slow or stalled shutdown). The `test_runner` uses this to force shutdown if graceful shutdown is slow. Touches #9740. ## Summary of changes Keep responding to signals after the initial shutdown signal has been received. Arguably, the `test_runner` should also use `SIGKILL` rather than `SIGQUIT` in this case, but it seems reasonable to respond to `SIGQUIT` regardless. --- pageserver/src/bin/pageserver.rs | 76 +++++++++++++++++++------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 8fe225c6aa..567a69da3b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -636,45 +636,59 @@ fn start_pageserver( tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? }); - let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); - // All started up! Now just sit and wait for shutdown signal. + BACKGROUND_RUNTIME.block_on(async move { + let signal_token = CancellationToken::new(); + let signal_cancel = signal_token.child_token(); - { - BACKGROUND_RUNTIME.block_on(async move { + // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals + // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See: + // https://github.com/neondatabase/neon/issues/9740. + tokio::spawn(async move { let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); - let signal = tokio::select! { - _ = sigquit.recv() => { - info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); - std::process::exit(111); + + loop { + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); + std::process::exit(111); + } + _ = sigint.recv() => "SIGINT", + _ = sigterm.recv() => "SIGTERM", + }; + + if !signal_token.is_cancelled() { + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode."); + signal_token.cancel(); + } else { + info!("Got signal {signal}. Already shutting down."); } - _ = sigint.recv() => { "SIGINT" }, - _ = sigterm.recv() => { "SIGTERM" }, - }; + } + }); - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + // Wait for cancellation signal and shut down the pageserver. + // + // This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't + // reach very far, and `task_mgr` is used instead. The plan is to change that over time. + signal_cancel.cancelled().await; - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - pageserver::shutdown_pageserver( - http_endpoint_listener, - page_service, - consumption_metrics_tasks, - disk_usage_eviction_task, - &tenant_manager, - background_purges, - deletion_queue.clone(), - secondary_controller_tasks, - 0, - ) - .await; - unreachable!() - }) - } + shutdown_pageserver.cancel(); + pageserver::shutdown_pageserver( + http_endpoint_listener, + page_service, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; + unreachable!(); + }) } async fn create_remote_storage_client( From 5519e4261206ec5ba7caf243e176b948e2605ca5 Mon Sep 17 00:00:00 2001 From: Alexey Immoreev Date: Tue, 3 Dec 2024 22:59:44 +0400 Subject: [PATCH 45/65] Improvement: add console redirect timeout warning (#9985) ## Problem There is no information on session being cancelled in 2 minutes at the moment ## Summary of changes The timeout being logged for the user --- proxy/src/auth/backend/console_redirect.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 494564de05..619c7b4ef1 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -49,13 +49,19 @@ impl ReportableError for ConsoleRedirectError { } } -fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { +fn hello_message( + redirect_uri: &reqwest::Url, + session_id: &str, + duration: std::time::Duration, +) -> String { + let formatted_duration = humantime::format_duration(duration).to_string(); format!( concat![ "Welcome to Neon!\n", - "Authenticate by visiting:\n", + "Authenticate by visiting (will expire in {duration}):\n", " {redirect_uri}{session_id}\n\n", ], + duration = formatted_duration, redirect_uri = redirect_uri, session_id = session_id, ) @@ -118,7 +124,11 @@ async fn authenticate( }; let span = info_span!("console_redirect", psql_session_id = &psql_session_id); - let greeting = hello_message(link_uri, &psql_session_id); + let greeting = hello_message( + link_uri, + &psql_session_id, + auth_config.console_redirect_confirmation_timeout, + ); // Give user a URL to spawn a new database. info!(parent: &span, "sending the auth URL to the user"); From fbc8c36983122801ee9f12772c3062dda22ee915 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 3 Dec 2024 20:00:14 +0000 Subject: [PATCH 46/65] chore(proxy): enforce single host+port (#9995) proxy doesn't ever provide multiple hosts/ports, so this code adds a lot of complexity of error handling for no good reason. (stacked on #9990) --- libs/proxy/tokio-postgres2/src/config.rs | 41 ++++----------- libs/proxy/tokio-postgres2/src/connect.rs | 42 ++++----------- proxy/src/auth/backend/console_redirect.rs | 8 +-- proxy/src/auth/backend/local.rs | 7 +-- proxy/src/compute.rs | 59 ++++++---------------- proxy/src/control_plane/client/mock.rs | 10 ++-- proxy/src/control_plane/client/neon.rs | 4 +- proxy/src/proxy/connect_compute.rs | 2 +- proxy/src/proxy/tests/mitm.rs | 4 +- proxy/src/proxy/tests/mod.rs | 14 ++--- proxy/src/serverless/backend.rs | 10 ++-- 11 files changed, 59 insertions(+), 142 deletions(-) diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 5dad835c3b..fd10ef6f20 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -146,6 +146,9 @@ pub enum AuthKeys { /// ``` #[derive(Clone, PartialEq, Eq)] pub struct Config { + pub(crate) host: Host, + pub(crate) port: u16, + pub(crate) user: Option, pub(crate) password: Option>, pub(crate) auth_keys: Option>, @@ -153,8 +156,6 @@ pub struct Config { pub(crate) options: Option, pub(crate) application_name: Option, pub(crate) ssl_mode: SslMode, - pub(crate) host: Vec, - pub(crate) port: Vec, pub(crate) connect_timeout: Option, pub(crate) target_session_attrs: TargetSessionAttrs, pub(crate) channel_binding: ChannelBinding, @@ -162,16 +163,12 @@ pub struct Config { pub(crate) max_backend_message_size: Option, } -impl Default for Config { - fn default() -> Config { - Config::new() - } -} - impl Config { /// Creates a new configuration. - pub fn new() -> Config { + pub fn new(host: String, port: u16) -> Config { Config { + host: Host::Tcp(host), + port, user: None, password: None, auth_keys: None, @@ -179,8 +176,6 @@ impl Config { options: None, application_name: None, ssl_mode: SslMode::Prefer, - host: vec![], - port: vec![], connect_timeout: None, target_session_attrs: TargetSessionAttrs::Any, channel_binding: ChannelBinding::Prefer, @@ -283,32 +278,14 @@ impl Config { self.ssl_mode } - /// Adds a host to the configuration. - /// - /// Multiple hosts can be specified by calling this method multiple times, and each will be tried in order. - pub fn host(&mut self, host: &str) -> &mut Config { - self.host.push(Host::Tcp(host.to_string())); - self - } - /// Gets the hosts that have been added to the configuration with `host`. - pub fn get_hosts(&self) -> &[Host] { + pub fn get_host(&self) -> &Host { &self.host } - /// Adds a port to the configuration. - /// - /// Multiple ports can be specified by calling this method multiple times. There must either be no ports, in which - /// case the default of 5432 is used, a single port, in which it is used for all hosts, or the same number of ports - /// as hosts. - pub fn port(&mut self, port: u16) -> &mut Config { - self.port.push(port); - self - } - /// Gets the ports that have been added to the configuration with `port`. - pub fn get_ports(&self) -> &[u16] { - &self.port + pub fn get_port(&self) -> u16 { + self.port } /// Sets the timeout applied to socket-level connection attempts. diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 98067d91f9..75a58e6eac 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -19,38 +19,18 @@ pub async fn connect( where T: MakeTlsConnect, { - if config.host.is_empty() { - return Err(Error::config("host missing".into())); + let hostname = match &config.host { + Host::Tcp(host) => host.as_str(), + }; + + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; + + match connect_once(&config.host, config.port, tls, config).await { + Ok((client, connection)) => Ok((client, connection)), + Err(e) => Err(e), } - - if config.port.len() > 1 && config.port.len() != config.host.len() { - return Err(Error::config("invalid number of ports".into())); - } - - let mut error = None; - for (i, host) in config.host.iter().enumerate() { - let port = config - .port - .get(i) - .or_else(|| config.port.first()) - .copied() - .unwrap_or(5432); - - let hostname = match host { - Host::Tcp(host) => host.as_str(), - }; - - let tls = tls - .make_tls_connect(hostname) - .map_err(|e| Error::tls(e.into()))?; - - match connect_once(host, port, tls, config).await { - Ok((client, connection)) => return Ok((client, connection)), - Err(e) => error = Some(e), - } - } - - Err(error.unwrap()) } async fn connect_once( diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 619c7b4ef1..575d60be85 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -161,12 +161,8 @@ async fn authenticate( // This config should be self-contained, because we won't // take username or dbname from client's startup message. - let mut config = compute::ConnCfg::new(); - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); + let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); + config.dbname(&db_info.dbname).user(&db_info.user); ctx.set_dbname(db_info.dbname.into()); ctx.set_user(db_info.user.into()); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 32e0f53615..d4273fb521 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -29,12 +29,7 @@ impl LocalBackend { api: http::Endpoint::new(compute_ctl, http::new_client()), }, node_info: NodeInfo { - config: { - let mut cfg = ConnCfg::new(); - cfg.host(&postgres_addr.ip().to_string()); - cfg.port(postgres_addr.port()); - cfg - }, + config: ConnCfg::new(postgres_addr.ip().to_string(), postgres_addr.port()), // TODO(conrad): make this better reflect compute info rather than endpoint info. aux: MetricsAuxInfo { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 06bc71c559..ab0ff4b795 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -104,13 +104,13 @@ pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `postgres_client` will be replaced with something better. /// Newtype allows us to implement methods on top of it. -#[derive(Clone, Default)] +#[derive(Clone)] pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { - pub(crate) fn new() -> Self { - Self::default() + pub(crate) fn new(host: String, port: u16) -> Self { + Self(Box::new(postgres_client::Config::new(host, port))) } /// Reuse password or auth keys from the other config. @@ -124,13 +124,9 @@ impl ConnCfg { } } - pub(crate) fn get_host(&self) -> Result { - match self.0.get_hosts() { - [postgres_client::config::Host::Tcp(s)] => Ok(s.into()), - // we should not have multiple address or unix addresses. - _ => Err(WakeComputeError::BadComputeAddress( - "invalid compute address".into(), - )), + pub(crate) fn get_host(&self) -> Host { + match self.0.get_host() { + postgres_client::config::Host::Tcp(s) => s.into(), } } @@ -227,43 +223,20 @@ impl ConnCfg { // We can't reuse connection establishing logic from `postgres_client` here, // because it has no means for extracting the underlying socket which we // require for our business. - let mut connection_error = None; - let ports = self.0.get_ports(); - let hosts = self.0.get_hosts(); - // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array - if ports.len() > 1 && ports.len() != hosts.len() { - return Err(io::Error::new( - io::ErrorKind::Other, - format!( - "bad compute config, \ - ports and hosts entries' count does not match: {:?}", - self.0 - ), - )); - } + let port = self.0.get_port(); + let host = self.0.get_host(); - for (i, host) in hosts.iter().enumerate() { - let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); - let host = match host { - Host::Tcp(host) => host.as_str(), - }; + let host = match host { + Host::Tcp(host) => host.as_str(), + }; - match connect_once(host, *port).await { - Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)), - Err(err) => { - // We can't throw an error here, as there might be more hosts to try. - warn!("couldn't connect to compute node at {host}:{port}: {err}"); - connection_error = Some(err); - } + match connect_once(host, port).await { + Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), + Err(err) => { + warn!("couldn't connect to compute node at {host}:{port}: {err}"); + Err(err) } } - - Err(connection_error.unwrap_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - format!("bad compute config: {:?}", self.0), - ) - })) } } diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 4d55f96ca1..eaf692ab27 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -160,11 +160,11 @@ impl MockControlPlane { } async fn do_wake_compute(&self) -> Result { - let mut config = compute::ConnCfg::new(); - config - .host(self.endpoint.host_str().unwrap_or("localhost")) - .port(self.endpoint.port().unwrap_or(5432)) - .ssl_mode(postgres_client::config::SslMode::Disable); + let mut config = compute::ConnCfg::new( + self.endpoint.host_str().unwrap_or("localhost").to_owned(), + self.endpoint.port().unwrap_or(5432), + ); + config.ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 5a78ec9d32..5c204ae1d7 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -241,8 +241,8 @@ impl NeonControlPlaneClient { // Don't set anything but host and port! This config will be cached. // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). - let mut config = compute::ConnCfg::new(); - config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. let node = NodeInfo { config, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 2e759b0894..585dce7bae 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -86,7 +86,7 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &control_plane::CachedNodeInfo, timeout: time::Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; permit.release_result(node_info.connect(ctx, timeout).await) } diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index ef351f3b54..d72331c7bf 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -158,7 +158,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") @@ -241,7 +241,7 @@ async fn connect_failure( Scram::new("password").await?, )); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(channel_binding) .user("user") .dbname("db") diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index c8b742b3ff..53345431e3 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -204,7 +204,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let client_err = postgres_client::Config::new() + let client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,7 +296,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Require) .user("user") .dbname("db") @@ -320,7 +320,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") @@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .map(char::from) .collect(); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("user") .dbname("db") .password(&password) // no password will match the mocked secret @@ -546,7 +546,7 @@ impl TestControlPlaneClient for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { - config: compute::ConnCfg::new(), + config: compute::ConnCfg::new("test".to_owned(), 5432), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8c7931907d..55d2e47fd3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -499,7 +499,7 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let mut config = (*node_info.config).clone(); @@ -549,16 +549,12 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let port = *node_info.config.get_ports().first().ok_or_else(|| { - HttpConnError::WakeCompute(WakeComputeError::BadComputeAddress( - "local-proxy port missing on compute address".into(), - )) - })?; + let port = node_info.config.get_port(); let res = connect_http2(&host, port, timeout).await; drop(pause); let (client, connection) = permit.release_result(res)?; From 718645e56cc2a1527c28620fe02701267cd96b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 3 Dec 2024 21:39:10 +0100 Subject: [PATCH 47/65] Support tenant manifests in the scrubber (#9942) Support tenant manifests in the storage scrubber: * list the manifests, order them by generation * delete all manifests except for the two most recent generations * for the latest manifest: try parsing it. I've tested this patch by running the against a staging bucket and it successfully deleted stuff (and avoided deleting the latest two generations). In follow-up work, we might want to also check some invariants of the manifest, as mentioned in #8088. Part of #9386 Part of #8088 --------- Co-authored-by: Christian Schwarz --- .../src/tenant/remote_timeline_client.rs | 4 +- .../tenant/remote_timeline_client/manifest.rs | 2 +- storage_scrubber/src/checks.rs | 135 ++++++++- .../src/pageserver_physical_gc.rs | 258 ++++++++++++++---- test_runner/regress/test_timeline_archive.py | 114 ++++++++ 5 files changed, 459 insertions(+), 54 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 007bd3eef0..4bb1bbf3cf 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2564,9 +2564,9 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } /// Given the key of a tenant manifest, parse out the generation number -pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { +pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { static RE: OnceLock = OnceLock::new(); - let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap()); + let re = RE.get_or_init(|| Regex::new(r".*tenant-manifest-([0-9a-f]{8}).json").unwrap()); re.captures(path.get_path().as_str()) .and_then(|c| c.get(1)) .and_then(|m| Generation::parse_suffix(m.as_str())) diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index c4382cb648..2029847a12 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -43,7 +43,7 @@ impl TenantManifest { offloaded_timelines: vec![], } } - pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result { + pub fn from_json_bytes(bytes: &[u8]) -> Result { serde_json::from_slice::(bytes) } diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 8d855d263c..1b4ff01a17 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -4,17 +4,21 @@ use itertools::Itertools; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; use pageserver_api::shard::ShardIndex; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; @@ -527,3 +531,132 @@ async fn list_timeline_blobs_impl( unknown_keys, })) } + +pub(crate) struct RemoteTenantManifestInfo { + pub(crate) latest_generation: Option, + pub(crate) manifests: Vec<(Generation, ListingObject)>, +} + +pub(crate) enum ListTenantManifestResult { + WithErrors { + errors: Vec<(String, String)>, + #[allow(dead_code)] + unknown_keys: Vec, + }, + NoErrors(RemoteTenantManifestInfo), +} + +/// Lists the tenant manifests in remote storage and parses the latest one, returning a [`ListTenantManifestResult`] object. +pub(crate) async fn list_tenant_manifests( + remote_client: &GenericRemoteStorage, + tenant_id: TenantShardId, + root_target: &RootTarget, +) -> anyhow::Result { + let mut errors = Vec::new(); + let mut unknown_keys = Vec::new(); + + let mut tenant_root_target = root_target.tenant_root(&tenant_id); + let original_prefix = tenant_root_target.prefix_in_bucket.clone(); + const TENANT_MANIFEST_STEM: &str = "tenant-manifest"; + tenant_root_target.prefix_in_bucket += TENANT_MANIFEST_STEM; + tenant_root_target.delimiter = String::new(); + + let mut manifests: Vec<(Generation, ListingObject)> = Vec::new(); + + let prefix_str = &original_prefix + .strip_prefix("/") + .unwrap_or(&original_prefix); + + let mut stream = std::pin::pin!(stream_listing(remote_client, &tenant_root_target)); + 'outer: while let Some(obj) = stream.next().await { + let (key, Some(obj)) = obj? else { + panic!("ListingObject not specified"); + }; + + 'err: { + // TODO a let chain would be nicer here. + let Some(name) = key.object_name() else { + break 'err; + }; + if !name.starts_with(TENANT_MANIFEST_STEM) { + break 'err; + } + let Some(generation) = parse_remote_tenant_manifest_path(key.clone()) else { + break 'err; + }; + tracing::debug!("tenant manifest {key}"); + manifests.push((generation, obj)); + continue 'outer; + } + tracing::info!("Listed an unknown key: {key}"); + unknown_keys.push(obj); + } + + if manifests.is_empty() { + tracing::debug!("No manifest for timeline."); + + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + if !unknown_keys.is_empty() { + errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string())); + + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + + // Find the manifest with the highest generation + let (latest_generation, latest_listing_object) = manifests + .iter() + .max_by_key(|i| i.0) + .map(|(g, obj)| (*g, obj.clone())) + .unwrap(); + + let manifest_bytes = + match download_object_with_retries(remote_client, &latest_listing_object.key).await { + Ok(bytes) => bytes, + Err(e) => { + // It is possible that the tenant gets deleted in-between we list the objects + // and we download the manifest file. + errors.push(( + latest_listing_object.key.get_path().as_str().to_owned(), + format!("failed to download tenant-manifest.json: {e}"), + )); + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + }; + + match TenantManifest::from_json_bytes(&manifest_bytes) { + Ok(_manifest) => { + return Ok(ListTenantManifestResult::NoErrors( + RemoteTenantManifestInfo { + latest_generation: Some(latest_generation), + manifests, + }, + )); + } + Err(parse_error) => errors.push(( + latest_listing_object.key.get_path().as_str().to_owned(), + format!("tenant-manifest.json body parsing error: {parse_error}"), + )), + } + + if errors.is_empty() { + errors.push(( + (*prefix_str).to_owned(), + "Unexpected: no errors did not lead to a successfully parsed blob return".to_string(), + )); + } + + Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }) +} diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 1e69ddbf15..20cb9c3633 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -2,12 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::checks::{ + list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, +}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; -use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; @@ -25,6 +29,7 @@ use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, + tenant_manifests_deleted: usize, remote_storage_errors: usize, controller_api_errors: usize, ancestor_layers_deleted: usize, @@ -34,12 +39,14 @@ impl GcSummary { fn merge(&mut self, other: Self) { let Self { indices_deleted, + tenant_manifests_deleted, remote_storage_errors, ancestor_layers_deleted, controller_api_errors, } = other; self.indices_deleted += indices_deleted; + self.tenant_manifests_deleted += tenant_manifests_deleted; self.remote_storage_errors += remote_storage_errors; self.ancestor_layers_deleted += ancestor_layers_deleted; self.controller_api_errors += controller_api_errors; @@ -352,6 +359,69 @@ async fn maybe_delete_index( } } +async fn maybe_delete_tenant_manifest( + remote_client: &GenericRemoteStorage, + min_age: &Duration, + latest_gen: Generation, + obj: &ListingObject, + mode: GcMode, + summary: &mut GcSummary, +) { + // Validation: we will only delete things that parse cleanly + let basename = obj.key.get_path().file_name().unwrap(); + let Some(candidate_generation) = + parse_remote_tenant_manifest_path(RemotePath::from_string(basename).unwrap()) + else { + // A strange key: we will not delete this because we don't understand it. + tracing::warn!("Bad index key"); + return; + }; + + // Validation: we will only delete manifests more than one generation old, and in fact we + // should never be called with such recent generations. + if candidate_generation >= latest_gen { + tracing::warn!("Deletion candidate is >= latest generation, this is a bug!"); + return; + } else if candidate_generation.next() == latest_gen { + tracing::warn!("Deletion candidate is >= latest generation - 1, this is a bug!"); + return; + } + + if !is_old_enough(min_age, obj, summary) { + return; + } + + if matches!(mode, GcMode::DryRun) { + tracing::info!("Dry run: would delete this key"); + return; + } + + // All validations passed: erase the object + let cancel = CancellationToken::new(); + match backoff::retry( + || remote_client.delete(&obj.key, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "maybe_delete_tenant_manifest", + &cancel, + ) + .await + { + None => { + unreachable!("Using a dummy cancellation token"); + } + Some(Ok(_)) => { + tracing::info!("Successfully deleted tenant manifest"); + summary.tenant_manifests_deleted += 1; + } + Some(Err(e)) => { + tracing::warn!("Failed to delete tenant manifest: {e}"); + summary.remote_storage_errors += 1; + } + } +} + #[allow(clippy::too_many_arguments)] async fn gc_ancestor( remote_client: &GenericRemoteStorage, @@ -451,13 +521,100 @@ async fn gc_ancestor( Ok(()) } +async fn gc_tenant_manifests( + remote_client: &GenericRemoteStorage, + min_age: Duration, + target: &RootTarget, + mode: GcMode, + tenant_shard_id: TenantShardId, +) -> anyhow::Result { + let mut gc_summary = GcSummary::default(); + match list_tenant_manifests(remote_client, tenant_shard_id, target).await? { + ListTenantManifestResult::WithErrors { + errors, + unknown_keys: _, + } => { + for (_key, error) in errors { + tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}"); + } + } + ListTenantManifestResult::NoErrors(mut manifest_info) => { + let Some(latest_gen) = manifest_info.latest_generation else { + return Ok(gc_summary); + }; + manifest_info + .manifests + .sort_by_key(|(generation, _obj)| *generation); + // skip the two latest generations (they don't neccessarily have to be 1 apart from each other) + let candidates = manifest_info.manifests.iter().rev().skip(2); + for (_generation, key) in candidates { + maybe_delete_tenant_manifest( + remote_client, + &min_age, + latest_gen, + key, + mode, + &mut gc_summary, + ) + .instrument( + info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key), + ) + .await; + } + } + } + Ok(gc_summary) +} + +async fn gc_timeline( + remote_client: &GenericRemoteStorage, + min_age: &Duration, + target: &RootTarget, + mode: GcMode, + ttid: TenantShardTimelineId, + accumulator: &Arc>, +) -> anyhow::Result { + let mut summary = GcSummary::default(); + let data = list_timeline_blobs(remote_client, ttid, target).await?; + + let (index_part, latest_gen, candidates) = match &data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } => (index_part, *index_part_generation, data.unused_index_keys), + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. + return Ok(summary); + } + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); + return Ok(summary); + } + }; + + accumulator.lock().unwrap().update(ttid, index_part); + + for key in candidates { + maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) + .await; + } + + Ok(summary) +} + /// Physical garbage collection: removing unused S3 objects. /// /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level /// (keys, layers). This type of garbage collection is about removing: /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between /// uploading a layer and uploading an index) -/// - Index objects from historic generations +/// - Index objects and tenant manifests from historic generations /// /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and /// make sure that object listings don't get slowed down by large numbers of garbage objects. @@ -470,6 +627,7 @@ pub async fn pageserver_physical_gc( ) -> anyhow::Result { let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let remote_client = Arc::new(remote_client); let tenants = if tenant_shard_ids.is_empty() { futures::future::Either::Left(stream_tenants(&remote_client, &target)) } else { @@ -484,59 +642,59 @@ pub async fn pageserver_physical_gc( let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); - let timelines = timelines.try_buffered(CONCURRENCY); - let timelines = timelines.try_flatten(); - - // Generate a stream of S3TimelineBlobData - async fn gc_timeline( - remote_client: &GenericRemoteStorage, - min_age: &Duration, - target: &RootTarget, - mode: GcMode, - ttid: TenantShardTimelineId, - accumulator: &Arc>, - ) -> anyhow::Result { - let mut summary = GcSummary::default(); - let data = list_timeline_blobs(remote_client, ttid, target).await?; - - let (index_part, latest_gen, candidates) = match &data.blob_data { - BlobDataParseResult::Parsed { - index_part, - index_part_generation, - s3_layers: _s3_layers, - } => (index_part, *index_part_generation, data.unused_index_keys), - BlobDataParseResult::Relic => { - // Post-deletion tenant location: don't try and GC it. - return Ok(summary); - } - BlobDataParseResult::Incorrect { - errors, - s3_layers: _, - } => { - // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); - return Ok(summary); - } - }; - - accumulator.lock().unwrap().update(ttid, index_part); - - for key in candidates { - maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) - .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) - .await; - } - - Ok(summary) + enum GcSummaryOrContent { + Content(T), + GcSummary(GcSummary), } + let timelines = tenants.map_ok(|tenant_shard_id| { + let target_ref = ⌖ + let remote_client_ref = &remote_client; + async move { + let summaries_from_manifests = match gc_tenant_manifests( + remote_client_ref, + min_age, + target_ref, + mode, + tenant_shard_id, + ) + .await + { + Ok(gc_summary) => vec![Ok(GcSummaryOrContent::::GcSummary( + gc_summary, + ))], + Err(e) => { + tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}"); + Vec::new() + } + }; + stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id) + .await + .map(|stream| { + stream + .map_ok(GcSummaryOrContent::Content) + .chain(futures::stream::iter(summaries_from_manifests.into_iter())) + }) + } + }); + let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + let timelines = timelines.try_flatten(); let mut summary = GcSummary::default(); // Drain futures for per-shard GC, populating accumulator as a side effect { - let timelines = timelines.map_ok(|ttid| { - gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator) + let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid { + GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline( + &remote_client, + &min_age, + &target, + mode, + ttid, + &accumulator, + )), + GcSummaryOrContent::GcSummary(gc_summary) => { + futures::future::Either::Right(futures::future::ok(gc_summary)) + } }); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 5a1e493bbe..e808dd1396 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -835,3 +835,117 @@ def test_timeline_retain_lsn( with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint: sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") assert sum == pre_branch_sum + + +def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): + """ + Test for scrubber deleting old generations of manifests + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024 ** 2}", + } + ) + + # Create a branch and archive it + child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id) + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,512)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2") + last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def child_offloaded(): + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) + assert timeline_offloaded_api(child_timeline_id) + + wait_until(child_offloaded) + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + # Reboot the pageserver a bunch of times, do unoffloads, offloads + for i in range(5): + env.pageserver.stop() + env.pageserver.start() + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + assert not timeline_offloaded_api(child_timeline_id) + + if i % 2 == 0: + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2") + assert sum == sum_again + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + wait_until(child_offloaded) + + # + # Now ensure that scrubber runs will clean up old generations' manifests. + # + + # Sleep some amount larger than min_age_secs + time.sleep(3) + + # Ensure that min_age_secs has a deletion impeding effect + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["tenant_manifests_deleted"] == 0 + + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] > 0 + assert gc_summary["tenant_manifests_deleted"] > 0 From c719be64741a6459293d68f42d2b8fb783985948 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 23:07:03 +0100 Subject: [PATCH 48/65] tests & benchmarks: unify the way we customize the default tenant config (#9992) Before this PR, some override callbacks used `.default()`, others used `.setdefault()`. As of this PR, all callbacks use `.setdefault()` which I think is least prone to failure. Aligning on a single way will set the right example for future tests that need such customization. The `test_pageserver_getpage_throttle.py` technically is a change in behavior: before, it replaced the `tenant_config` field, now it just configures the throttle. This is what I believe is intended anyway. --- test_runner/performance/test_branch_creation.py | 3 +-- test_runner/regress/test_disk_usage_eviction.py | 3 +-- test_runner/regress/test_pageserver_getpage_throttle.py | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 3ce27d6cd3..cf2212d447 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -142,10 +142,9 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier def patch_default_tenant_config(config): - tenant_config = config.get("tenant_config", {}) + tenant_config = config.setdefault("tenant_config", {}) tenant_config["compaction_period"] = "3s" tenant_config["gc_period"] = "0s" - config["tenant_config"] = tenant_config env.pageserver.edit_config_toml(patch_default_tenant_config) env.pageserver.start( diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 05956b5b93..954db914b9 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -62,9 +62,8 @@ def test_min_resident_size_override_handling( if config_level_override is not None: def set_min_resident_size(config): - tenant_config = config.get("tenant_config", {}) + tenant_config = config.setdefault("tenant_config", {}) tenant_config["min_resident_size_override"] = config_level_override - config["tenant_config"] = tenant_config env.pageserver.edit_config_toml(set_min_resident_size) env.pageserver.stop() diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 6d0661f068..9644ebe3e2 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -183,7 +183,8 @@ def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( """ def set_tenant_config(ps_cfg): - ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set} + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set neon_env_builder.pageserver_config_override = set_tenant_config env = neon_env_builder.init_start() From 73ccc2b08cb6ca69370733dd4eba520414943dc7 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 3 Dec 2024 22:46:18 +0000 Subject: [PATCH 49/65] test_page_service_batching: fix non-numeric metrics (#9998) ## Problem ``` 2024-12-03T15:42:46.5978335Z + poetry run python /__w/neon/neon/scripts/ingest_perf_test_result.py --ingest /__w/neon/neon/test_runner/perf-report-local 2024-12-03T15:42:49.5325077Z Traceback (most recent call last): 2024-12-03T15:42:49.5325603Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 165, in 2024-12-03T15:42:49.5326029Z main() 2024-12-03T15:42:49.5326316Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 155, in main 2024-12-03T15:42:49.5326739Z ingested = ingest_perf_test_result(cur, item, recorded_at_timestamp) 2024-12-03T15:42:49.5327488Z ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-12-03T15:42:49.5327914Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 99, in ingest_perf_test_result 2024-12-03T15:42:49.5328321Z psycopg2.extras.execute_values( 2024-12-03T15:42:49.5328940Z File "/github/home/.cache/pypoetry/virtualenvs/non-package-mode-_pxWMzVK-py3.11/lib/python3.11/site-packages/psycopg2/extras.py", line 1299, in execute_values 2024-12-03T15:42:49.5335618Z cur.execute(b''.join(parts)) 2024-12-03T15:42:49.5335967Z psycopg2.errors.InvalidTextRepresentation: invalid input syntax for type numeric: "concurrent-futures" 2024-12-03T15:42:49.5336287Z LINE 57: 'concurrent-futures', 2024-12-03T15:42:49.5336462Z ^ ``` ## Summary of changes - `test_page_service_batching`: save non-numeric params as `labels` - Add a runtime check that `metric_value` is NUMERIC --- test_runner/fixtures/benchmark_fixture.py | 10 ++++++++++ .../pageserver/test_page_service_batching.py | 11 ++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index bb8e75902e..fa3747c08f 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -266,6 +266,16 @@ class NeonBenchmarker: name = f"{self.PROPERTY_PREFIX}_{metric_name}" if labels is None: labels = {} + + # Sometimes mypy can't catch non-numeric values, + # so adding a check here + try: + float(metric_value) + except ValueError as e: + raise ValueError( + f"`metric_value` (`{metric_value}`) must be a NUMERIC-friendly data type" + ) from e + self.property_recorder( name, { diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index 562094a059..2c27368001 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -116,21 +116,18 @@ def test_throughput( # name is not a metric, we just use it to identify the test easily in the `test_...[...]`` notation } ) - params.update( - { - f"pipelining_config.{k}": (v, {}) - for k, v in dataclasses.asdict(pipelining_config).items() - } - ) + # For storing configuration as a metric, insert a fake 0 with labels with actual data + params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})}) log.info("params: %s", params) for param, (value, kwargs) in params.items(): zenbenchmark.record( param, - metric_value=value, + metric_value=float(value), unit=kwargs.pop("unit", ""), report=MetricReport.TEST_PARAM, + labels=kwargs.pop("labels", None), **kwargs, ) From 16163fb8508a4383410d5f8589961fce6c4e3aa2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 4 Dec 2024 01:07:49 +0100 Subject: [PATCH 50/65] page_service: enable batching in Rust & Python Tests + Python benchmarks (#9993) This is the first step towards batching rollout. Refs - rollout plan: https://github.com/neondatabase/cloud/issues/20620 - task https://github.com/neondatabase/neon/issues/9377 - uber-epic: https://github.com/neondatabase/neon/issues/9376 --- libs/pageserver_api/src/config.rs | 9 ++++++++- test_runner/fixtures/neon_fixtures.py | 11 +++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index e49d15ba87..09cfbc55fd 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -442,7 +442,14 @@ impl Default for ConfigToml { tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, - page_service_pipelining: PageServicePipeliningConfig::Serial, + page_service_pipelining: if !cfg!(test) { + PageServicePipeliningConfig::Serial + } else { + PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size: NonZeroUsize::new(32).unwrap(), + execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, + }) + }, } } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f55f06bebc..9c579373e8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1095,6 +1095,17 @@ class NeonEnv: # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, } + + # Batching (https://github.com/neondatabase/neon/issues/9377): + # enable batching by default in tests and benchmarks. + # Compat tests are exempt because old versions fail to parse the new config. + if not config.compatibility_neon_binpath: + ps_cfg["page_service_pipelining"] = { + "mode": "pipelined", + "execution": "concurrent-futures", + "max_batch_size": 32, + } + if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: From 1c9bbf1a92ec89fa6cff269cdeaf2c89408e1a35 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 4 Dec 2024 09:25:29 +0000 Subject: [PATCH 51/65] storcon: return an error for drain attempts while paused (#9997) ## Problem We currently allow drain operations to proceed while the node policy is paused. ## Summary of changes Return a precondition failed error in such cases. The orchestrator is updated in https://github.com/neondatabase/infra/pull/2544 to skip drain and fills if the pageserver is paused. Closes: https://github.com/neondatabase/neon/issues/9907 --- storage_controller/src/service.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 741d3dc2b4..92ec58cb4d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5681,7 +5681,7 @@ impl Service { } match node_policy { - NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => { + NodeSchedulingPolicy::Active => { self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining)) .await?; From 709b8cd37180e81919a2ac7159e17c7a15becdc2 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 4 Dec 2024 12:07:22 +0100 Subject: [PATCH 52/65] optimize parms for ingest bench (#9999) ## Problem we tried different parallelism settings for ingest bench ## Summary of changes the following settings seem optimal after merging - SK side Wal filtering - batched getpages Settings: - effective_io_concurrency 100 - concurrency limit 200 (different from Prod!) - jobs 4, maintenance workers 7 - 10 GB chunk size --- .github/workflows/ingest_benchmark.yml | 1 + .../performance/test_perf_ingest_using_pgcopydb.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 1033dc6489..a5810e91a4 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -26,6 +26,7 @@ concurrency: jobs: ingest: strategy: + fail-fast: false # allow other variants to continue even if one fails matrix: target_project: [new_empty_project, large_existing_project] permissions: diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index 37f2e9db50..2f4574ba88 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -60,13 +60,13 @@ def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path): "--no-acl", "--skip-db-properties", "--table-jobs", - "8", + "4", "--index-jobs", - "8", + "4", "--restore-jobs", - "8", + "4", "--split-tables-larger-than", - "5GB", + "10GB", "--skip-extensions", "--use-copy-binary", "--filters", @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=16", + "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables env = os.environ.copy() From 33790d14a320cb5e2fd8b1cd6ba2886b79b67127 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 4 Dec 2024 12:37:24 +0100 Subject: [PATCH 53/65] fix parsing human time output like "50m37s" (#10001) ## Problem In ingest_benchmark.yml workflow we use pgcopydb tool to migrate project. pgcopydb logs human time. Our parsing of the human time doesn't work for times like "50m37s". [Example workflow](https://github.com/neondatabase/neon/actions/runs/12145539948/job/33867418065#step:10:479) contains "57m45s" but we [reported](https://github.com/neondatabase/neon/actions/runs/12145539948/job/33867418065#step:10:500) only the seconds part: 45.000 s ## Summary of changes add a regex pattern for Minute/Second combination --- test_runner/performance/test_perf_ingest_using_pgcopydb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index 2f4574ba88..f0a0c1f5a2 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -184,7 +184,7 @@ def parse_log_and_report_metrics( for metric_name, pattern in metric_patterns.items(): if pattern.search(line): # Extract duration and convert it to seconds - duration_match = re.search(r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s", line) + duration_match = re.search(r"\d+h\d+m|\d+m\d+s|\d+s|\d+ms|\d+\.\d+s", line) if duration_match: duration_str = duration_match.group(0) parts = re.findall(r"\d+[a-zA-Z]+", duration_str) From 13285c2a5e5ad450775d9f40c0fee3a6c62da61f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 4 Dec 2024 13:53:52 +0100 Subject: [PATCH 54/65] pageserver: return proper status code for heatmap_upload errors (#9991) ## Problem During deploys, we see a lot of 500 errors due to heapmap uploads for inactive tenants. These should be 503s instead. Resolves #9574. ## Summary of changes Make the secondary tenant scheduler use `ApiError` rather than `anyhow::Error`, to propagate the tenant error and convert it to an appropriate status code. --- pageserver/src/http/routes.rs | 28 ++++++++++++---- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/mgr.rs | 5 ++- pageserver/src/tenant/secondary.rs | 33 +++++++++++++++---- pageserver/src/tenant/secondary/downloader.rs | 13 ++++---- .../src/tenant/secondary/heatmap_uploader.rs | 10 +++--- pageserver/src/tenant/secondary/scheduler.rs | 4 +-- 7 files changed, 68 insertions(+), 27 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e127871549..e04f1460a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -279,7 +279,10 @@ impl From for ApiError { impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { - GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()), + GetTenantError::ShardNotFound(tid) => { + ApiError::NotFound(anyhow!("tenant {tid}").into()) + } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? // Because we must be careful to never return 404 for a tenant if it does @@ -387,6 +390,16 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError { + use crate::tenant::secondary::SecondaryTenantError; + match ste { + SecondaryTenantError::GetTenant(gte) => gte.into(), + SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown, + } + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, @@ -1047,9 +1060,11 @@ async fn timeline_delete_handler( match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't // want to treat missing tenants as 404, to avoid ambiguity with successful deletions. - GetTenantError::NotFound(_) => ApiError::PreconditionFailed( - "Requested tenant is missing".to_string().into_boxed_str(), - ), + GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => { + ApiError::PreconditionFailed( + "Requested tenant is missing".to_string().into_boxed_str(), + ) + } e => e.into(), } })?; @@ -2462,8 +2477,7 @@ async fn secondary_upload_handler( state .secondary_controller .upload_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -2578,7 +2592,7 @@ async fn secondary_download_handler( // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. - Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + Ok(Err(e)) => return Err(e.into()), // A timeout is not an error: we have started the download, we're just not done // yet. The caller will get a response body indicating status. Err(_) => StatusCode::ACCEPTED, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ada5c4a977..5a9e398586 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3422,7 +3422,7 @@ impl Tenant { r.map_err( |_e: tokio::sync::watch::error::RecvError| // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) + GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id)) )? } Err(TimeoutCancellableError::Cancelled) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 45481c4ed4..e8b0d1d4dd 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -894,7 +894,7 @@ impl TenantManager { Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { - Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) + Err(GetTenantError::ShardNotFound(tenant_shard_id)) } } } @@ -2258,6 +2258,9 @@ pub(crate) enum GetTenantError { #[error("Tenant {0} not found")] NotFound(TenantId), + #[error("Tenant {0} not found")] + ShardNotFound(TenantShardId), + #[error("Tenant {0} is not active")] NotActive(TenantShardId), diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 3df89a928c..4bc208331b 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -22,6 +22,7 @@ use super::{ mgr::TenantManager, span::debug_assert_current_span_has_tenant_id, storage_layer::LayerName, + GetTenantError, }; use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; @@ -66,7 +67,21 @@ struct CommandRequest { } struct CommandResponse { - result: anyhow::Result<()>, + result: Result<(), SecondaryTenantError>, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum SecondaryTenantError { + #[error("{0}")] + GetTenant(GetTenantError), + #[error("shutting down")] + ShuttingDown, +} + +impl From for SecondaryTenantError { + fn from(gte: GetTenantError) -> Self { + Self::GetTenant(gte) + } } // Whereas [`Tenant`] represents an attached tenant, this type represents the work @@ -285,7 +300,7 @@ impl SecondaryController { &self, queue: &tokio::sync::mpsc::Sender>, payload: T, - ) -> anyhow::Result<()> { + ) -> Result<(), SecondaryTenantError> { let (response_tx, response_rx) = tokio::sync::oneshot::channel(); queue @@ -294,20 +309,26 @@ impl SecondaryController { response_tx, }) .await - .map_err(|_| anyhow::anyhow!("Receiver shut down"))?; + .map_err(|_| SecondaryTenantError::ShuttingDown)?; let response = response_rx .await - .map_err(|_| anyhow::anyhow!("Request dropped"))?; + .map_err(|_| SecondaryTenantError::ShuttingDown)?; response.result } - pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + pub(crate) async fn upload_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), SecondaryTenantError> { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } - pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + pub(crate) async fn download_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), SecondaryTenantError> { self.dispatch( &self.download_req_tx, DownloadCommand::Download(tenant_shard_id), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8d771dc405..701e4cf04b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -35,7 +35,7 @@ use super::{ self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, }, - SecondaryTenant, + GetTenantError, SecondaryTenant, SecondaryTenantError, }; use crate::tenant::{ @@ -470,15 +470,16 @@ impl JobGenerator anyhow::Result { + fn on_command( + &mut self, + command: DownloadCommand, + ) -> Result { let tenant_shard_id = command.get_tenant_shard_id(); let tenant = self .tenant_manager - .get_secondary_tenant_shard(*tenant_shard_id); - let Some(tenant) = tenant else { - return Err(anyhow::anyhow!("Not found or not in Secondary mode")); - }; + .get_secondary_tenant_shard(*tenant_shard_id) + .ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?; Ok(PendingDownload { target_time: None, diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index e680fd705b..c5e5e04945 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -28,7 +28,7 @@ use super::{ self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, }, - CommandRequest, UploadCommand, + CommandRequest, SecondaryTenantError, UploadCommand, }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; @@ -279,7 +279,10 @@ impl JobGenerator }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) } - fn on_command(&mut self, command: UploadCommand) -> anyhow::Result { + fn on_command( + &mut self, + command: UploadCommand, + ) -> Result { let tenant_shard_id = command.get_tenant_shard_id(); tracing::info!( @@ -287,8 +290,7 @@ impl JobGenerator "Starting heatmap write on command"); let tenant = self .tenant_manager - .get_attached_tenant_shard(*tenant_shard_id) - .map_err(|e| anyhow::anyhow!(e))?; + .get_attached_tenant_shard(*tenant_shard_id)?; if !tenant.is_active() { return Err(GetTenantError::NotActive(*tenant_shard_id).into()); } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 28cf2125df..e963c722b9 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -12,7 +12,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::{completion::Barrier, yielding_loop::yielding_loop}; -use super::{CommandRequest, CommandResponse}; +use super::{CommandRequest, CommandResponse, SecondaryTenantError}; /// Scheduling interval is the time between calls to JobGenerator::schedule. /// When we schedule jobs, the job generator may provide a hint of its preferred @@ -112,7 +112,7 @@ where /// Called when a command is received. A job will be spawned immediately if the return /// value is Some, ignoring concurrency limits and the pending queue. - fn on_command(&mut self, cmd: CMD) -> anyhow::Result; + fn on_command(&mut self, cmd: CMD) -> Result; } /// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling From 6359342ffbc5b712350491f4f89e08d18626c786 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 4 Dec 2024 13:58:31 +0100 Subject: [PATCH 55/65] Assign /libs/proxy/ to proxy team (#10003) --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 21b0e7c51f..f41462c98b 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage +/libs/proxy/ @neondatabase/proxy /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/storage /libs/vm_monitor/ @neondatabase/autoscaling From cab498c7876db989c9bacb62cc94fe29c10b143a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 12:58:35 +0000 Subject: [PATCH 56/65] feat(proxy): add option to forward startup params (#9979) (stacked on #9990 and #9995) Partially fixes #1287 with a custom option field to enable the fixed behaviour. This allows us to gradually roll out the fix without silently changing the observed behaviour for our customers. related to https://github.com/neondatabase/cloud/issues/15284 --- Cargo.lock | 4 +- Cargo.toml | 2 +- libs/pq_proto/src/lib.rs | 2 +- .../src/authentication/sasl.rs | 4 +- .../src/message/frontend.rs | 28 ++- libs/proxy/tokio-postgres2/src/codec.rs | 13 +- libs/proxy/tokio-postgres2/src/config.rs | 193 +++--------------- libs/proxy/tokio-postgres2/src/connect.rs | 49 +---- libs/proxy/tokio-postgres2/src/connect_raw.rs | 31 +-- proxy/src/cancellation.rs | 11 +- proxy/src/compute.rs | 96 ++++----- proxy/src/console_redirect_proxy.rs | 1 + proxy/src/proxy/connect_compute.rs | 4 +- proxy/src/proxy/mod.rs | 40 +++- proxy/src/proxy/tests/mitm.rs | 8 +- proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/serverless/backend.rs | 11 +- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_proxy.py | 19 ++ 19 files changed, 180 insertions(+), 340 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b80ec5e93..38158b7aec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1031,9 +1031,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.5.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 91fa6a2607..a35823e0c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ bindgen = "0.70" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" -bytes = "1.0" +bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 43dfbc22a4..94714359a3 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -100,7 +100,7 @@ impl StartupMessageParamsBuilder { #[derive(Debug, Clone, Default)] pub struct StartupMessageParams { - params: Bytes, + pub params: Bytes, } impl StartupMessageParams { diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index 19aa3c1e9a..f2200a40ce 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -117,7 +117,7 @@ enum Credentials { /// A regular password as a vector of bytes. Password(Vec), /// A precomputed pair of keys. - Keys(Box>), + Keys(ScramKeys), } enum State { @@ -176,7 +176,7 @@ impl ScramSha256 { /// Constructs a new instance which will use the provided key pair for authentication. pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 { - let password = Credentials::Keys(keys.into()); + let password = Credentials::Keys(keys); ScramSha256::new_inner(password, channel_binding, nonce()) } diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index 5d0a8ff8c8..bc6168f337 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -255,22 +255,34 @@ pub fn ssl_request(buf: &mut BytesMut) { } #[inline] -pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()> -where - I: IntoIterator, -{ +pub fn startup_message(parameters: &StartupMessageParams, buf: &mut BytesMut) -> io::Result<()> { write_body(buf, |buf| { // postgres protocol version 3.0(196608) in bigger-endian buf.put_i32(0x00_03_00_00); - for (key, value) in parameters { - write_cstr(key.as_bytes(), buf)?; - write_cstr(value.as_bytes(), buf)?; - } + buf.put_slice(¶meters.params); buf.put_u8(0); Ok(()) }) } +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct StartupMessageParams { + pub params: BytesMut, +} + +impl StartupMessageParams { + /// Set parameter's value by its name. + pub fn insert(&mut self, name: &str, value: &str) { + if name.contains('\0') || value.contains('\0') { + panic!("startup parameter name or value contained a null") + } + self.params.put_slice(name.as_bytes()); + self.params.put_u8(0); + self.params.put_slice(value.as_bytes()); + self.params.put_u8(0); + } +} + #[inline] pub fn sync(buf: &mut BytesMut) { buf.put_u8(b'S'); diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index 7412db785b..0ec46198ce 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -35,9 +35,7 @@ impl FallibleIterator for BackendMessages { } } -pub struct PostgresCodec { - pub max_message_size: Option, -} +pub struct PostgresCodec; impl Encoder for PostgresCodec { type Error = io::Error; @@ -66,15 +64,6 @@ impl Decoder for PostgresCodec { break; } - if let Some(max) = self.max_message_size { - if len > max { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "message too large", - )); - } - } - match header.tag() { backend::NOTICE_RESPONSE_TAG | backend::NOTIFICATION_RESPONSE_TAG diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index fd10ef6f20..11a361a81b 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -6,6 +6,7 @@ use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; +use postgres_protocol2::message::frontend::StartupMessageParams; use std::fmt; use std::str; use std::time::Duration; @@ -14,16 +15,6 @@ use tokio::io::{AsyncRead, AsyncWrite}; pub use postgres_protocol2::authentication::sasl::ScramKeys; use tokio::net::TcpStream; -/// Properties required of a session. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -#[non_exhaustive] -pub enum TargetSessionAttrs { - /// No special properties are required. - Any, - /// The session must allow writes. - ReadWrite, -} - /// TLS configuration. #[derive(Debug, Copy, Clone, PartialEq, Eq)] #[non_exhaustive] @@ -73,94 +64,20 @@ pub enum AuthKeys { } /// Connection configuration. -/// -/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats: -/// -/// # Key-Value -/// -/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain -/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped. -/// -/// ## Keys -/// -/// * `user` - The username to authenticate with. Required. -/// * `password` - The password to authenticate with. -/// * `dbname` - The name of the database to connect to. Defaults to the username. -/// * `options` - Command line options used to configure the server. -/// * `application_name` - Sets the `application_name` parameter on the server. -/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used -/// if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`. -/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the -/// path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts -/// can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting -/// with the `connect` method. -/// * `port` - The port to connect to. Multiple ports can be specified, separated by commas. The number of ports must be -/// either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if -/// omitted or the empty string. -/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames -/// can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout. -/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that -/// the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server -/// in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`. -/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel -/// binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise. -/// If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`. -/// -/// ## Examples -/// -/// ```not_rust -/// host=localhost user=postgres connect_timeout=10 keepalives=0 -/// ``` -/// -/// ```not_rust -/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces' -/// ``` -/// -/// ```not_rust -/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write -/// ``` -/// -/// # Url -/// -/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional, -/// and the format accepts query parameters for all of the key-value pairs described in the section above. Multiple -/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded, -/// as the path component of the URL specifies the database name. -/// -/// ## Examples -/// -/// ```not_rust -/// postgresql://user@localhost -/// ``` -/// -/// ```not_rust -/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10 -/// ``` -/// -/// ```not_rust -/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write -/// ``` -/// -/// ```not_rust -/// postgresql:///mydb?user=user&host=/var/lib/postgresql -/// ``` #[derive(Clone, PartialEq, Eq)] pub struct Config { pub(crate) host: Host, pub(crate) port: u16, - pub(crate) user: Option, pub(crate) password: Option>, pub(crate) auth_keys: Option>, - pub(crate) dbname: Option, - pub(crate) options: Option, - pub(crate) application_name: Option, pub(crate) ssl_mode: SslMode, pub(crate) connect_timeout: Option, - pub(crate) target_session_attrs: TargetSessionAttrs, pub(crate) channel_binding: ChannelBinding, - pub(crate) replication_mode: Option, - pub(crate) max_backend_message_size: Option, + pub(crate) server_params: StartupMessageParams, + + database: bool, + username: bool, } impl Config { @@ -169,18 +86,15 @@ impl Config { Config { host: Host::Tcp(host), port, - user: None, password: None, auth_keys: None, - dbname: None, - options: None, - application_name: None, ssl_mode: SslMode::Prefer, connect_timeout: None, - target_session_attrs: TargetSessionAttrs::Any, channel_binding: ChannelBinding::Prefer, - replication_mode: None, - max_backend_message_size: None, + server_params: StartupMessageParams::default(), + + database: false, + username: false, } } @@ -188,14 +102,13 @@ impl Config { /// /// Required. pub fn user(&mut self, user: &str) -> &mut Config { - self.user = Some(user.to_string()); - self + self.set_param("user", user) } /// Gets the user to authenticate with, if one has been configured with /// the `user` method. - pub fn get_user(&self) -> Option<&str> { - self.user.as_deref() + pub fn user_is_set(&self) -> bool { + self.username } /// Sets the password to authenticate with. @@ -231,40 +144,26 @@ impl Config { /// /// Defaults to the user. pub fn dbname(&mut self, dbname: &str) -> &mut Config { - self.dbname = Some(dbname.to_string()); - self + self.set_param("database", dbname) } /// Gets the name of the database to connect to, if one has been configured /// with the `dbname` method. - pub fn get_dbname(&self) -> Option<&str> { - self.dbname.as_deref() + pub fn db_is_set(&self) -> bool { + self.database } - /// Sets command line options used to configure the server. - pub fn options(&mut self, options: &str) -> &mut Config { - self.options = Some(options.to_string()); + pub fn set_param(&mut self, name: &str, value: &str) -> &mut Config { + if name == "database" { + self.database = true; + } else if name == "user" { + self.username = true; + } + + self.server_params.insert(name, value); self } - /// Gets the command line options used to configure the server, if the - /// options have been set with the `options` method. - pub fn get_options(&self) -> Option<&str> { - self.options.as_deref() - } - - /// Sets the value of the `application_name` runtime parameter. - pub fn application_name(&mut self, application_name: &str) -> &mut Config { - self.application_name = Some(application_name.to_string()); - self - } - - /// Gets the value of the `application_name` runtime parameter, if it has - /// been set with the `application_name` method. - pub fn get_application_name(&self) -> Option<&str> { - self.application_name.as_deref() - } - /// Sets the SSL configuration. /// /// Defaults to `prefer`. @@ -303,23 +202,6 @@ impl Config { self.connect_timeout.as_ref() } - /// Sets the requirements of the session. - /// - /// This can be used to connect to the primary server in a clustered database rather than one of the read-only - /// secondary servers. Defaults to `Any`. - pub fn target_session_attrs( - &mut self, - target_session_attrs: TargetSessionAttrs, - ) -> &mut Config { - self.target_session_attrs = target_session_attrs; - self - } - - /// Gets the requirements of the session. - pub fn get_target_session_attrs(&self) -> TargetSessionAttrs { - self.target_session_attrs - } - /// Sets the channel binding behavior. /// /// Defaults to `prefer`. @@ -333,28 +215,6 @@ impl Config { self.channel_binding } - /// Set replication mode. - pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config { - self.replication_mode = Some(replication_mode); - self - } - - /// Get replication mode. - pub fn get_replication_mode(&self) -> Option { - self.replication_mode - } - - /// Set limit for backend messages size. - pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config { - self.max_backend_message_size = Some(max_backend_message_size); - self - } - - /// Get limit for backend messages size. - pub fn get_max_backend_message_size(&self) -> Option { - self.max_backend_message_size - } - /// Opens a connection to a PostgreSQL database. /// /// Requires the `runtime` Cargo feature (enabled by default). @@ -392,18 +252,13 @@ impl fmt::Debug for Config { } f.debug_struct("Config") - .field("user", &self.user) .field("password", &self.password.as_ref().map(|_| Redaction {})) - .field("dbname", &self.dbname) - .field("options", &self.options) - .field("application_name", &self.application_name) .field("ssl_mode", &self.ssl_mode) .field("host", &self.host) .field("port", &self.port) .field("connect_timeout", &self.connect_timeout) - .field("target_session_attrs", &self.target_session_attrs) .field("channel_binding", &self.channel_binding) - .field("replication", &self.replication_mode) + .field("server_params", &self.server_params) .finish() } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 75a58e6eac..e0cb69748d 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,14 +1,11 @@ use crate::client::SocketConfig; use crate::codec::BackendMessage; -use crate::config::{Host, TargetSessionAttrs}; +use crate::config::Host; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; -use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage}; -use futures_util::{future, pin_mut, Future, FutureExt, Stream}; +use crate::{Client, Config, Connection, Error, RawConnection}; use postgres_protocol2::message::backend::Message; -use std::io; -use std::task::Poll; use tokio::net::TcpStream; use tokio::sync::mpsc; @@ -72,47 +69,7 @@ where .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) .collect(); - let mut connection = Connection::new(stream, delayed, parameters, receiver); - - if let TargetSessionAttrs::ReadWrite = config.target_session_attrs { - let rows = client.simple_query_raw("SHOW transaction_read_only"); - pin_mut!(rows); - - let rows = future::poll_fn(|cx| { - if connection.poll_unpin(cx)?.is_ready() { - return Poll::Ready(Err(Error::closed())); - } - - rows.as_mut().poll(cx) - }) - .await?; - pin_mut!(rows); - - loop { - let next = future::poll_fn(|cx| { - if connection.poll_unpin(cx)?.is_ready() { - return Poll::Ready(Some(Err(Error::closed()))); - } - - rows.as_mut().poll_next(cx) - }); - - match next.await.transpose()? { - Some(SimpleQueryMessage::Row(row)) => { - if row.try_get(0)? == Some("on") { - return Err(Error::connect(io::Error::new( - io::ErrorKind::PermissionDenied, - "database does not allow writes", - ))); - } else { - break; - } - } - Some(_) => {} - None => return Err(Error::unexpected_message()), - } - } - } + let connection = Connection::new(stream, delayed, parameters, receiver); Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 390f133002..66db85e07d 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -1,5 +1,5 @@ use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::config::{self, AuthKeys, Config, ReplicationMode}; +use crate::config::{self, AuthKeys, Config}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; @@ -96,12 +96,7 @@ where let stream = connect_tls(stream, config.ssl_mode, tls).await?; let mut stream = StartupStream { - inner: Framed::new( - stream, - PostgresCodec { - max_message_size: config.max_backend_message_size, - }, - ), + inner: Framed::new(stream, PostgresCodec), buf: BackendMessages::empty(), delayed_notice: Vec::new(), }; @@ -124,28 +119,8 @@ where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, { - let mut params = vec![("client_encoding", "UTF8")]; - if let Some(user) = &config.user { - params.push(("user", &**user)); - } - if let Some(dbname) = &config.dbname { - params.push(("database", &**dbname)); - } - if let Some(options) = &config.options { - params.push(("options", &**options)); - } - if let Some(application_name) = &config.application_name { - params.push(("application_name", &**application_name)); - } - if let Some(replication_mode) = &config.replication_mode { - match replication_mode { - ReplicationMode::Physical => params.push(("replication", "true")), - ReplicationMode::Logical => params.push(("replication", "database")), - } - } - let mut buf = BytesMut::new(); - frontend::startup_message(params, &mut buf).map_err(Error::encode)?; + frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?; stream .send(FrontendMessage::Raw(buf.freeze())) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index bcb0ef40bd..7bc5587a25 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -70,11 +70,12 @@ impl ReportableError for CancelError { impl CancellationHandler

{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. pub(crate) fn get_session(self: Arc) -> Session

{ - // HACK: We'd rather get the real backend_pid but postgres_client doesn't - // expose it and we don't want to do another roundtrip to query - // for it. The client will be able to notice that this is not the - // actual backend_pid, but backend_pid is not used for anything - // so it doesn't matter. + // we intentionally generate a random "backend pid" and "secret key" here. + // we use the corresponding u64 as an identifier for the + // actual endpoint+pid+secret for postgres/pgbouncer. + // + // if we forwarded the backend_pid from postgres to the client, there would be a lot + // of overlap between our computes as most pids are small (~100). let key = loop { let key = rand::random(); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ab0ff4b795..4113b5bb80 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -131,49 +131,37 @@ impl ConnCfg { } /// Apply startup message params to the connection config. - pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { - // Only set `user` if it's not present in the config. - // Console redirect auth flow takes username from the console's response. - if let (None, Some(user)) = (self.get_user(), params.get("user")) { - self.user(user); + pub(crate) fn set_startup_params( + &mut self, + params: &StartupMessageParams, + arbitrary_params: bool, + ) { + if !arbitrary_params { + self.set_param("client_encoding", "UTF8"); } - - // Only set `dbname` if it's not present in the config. - // Console redirect auth flow takes dbname from the console's response. - if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { - self.dbname(dbname); - } - - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. - if let Some(options) = filtered_options(params) { - self.options(&options); - } - - if let Some(app_name) = params.get("application_name") { - self.application_name(app_name); - } - - // TODO: This is especially ugly... - if let Some(replication) = params.get("replication") { - use postgres_client::config::ReplicationMode; - match replication { - "true" | "on" | "yes" | "1" => { - self.replication_mode(ReplicationMode::Physical); + for (k, v) in params.iter() { + match k { + // Only set `user` if it's not present in the config. + // Console redirect auth flow takes username from the console's response. + "user" if self.user_is_set() => continue, + "database" if self.db_is_set() => continue, + "options" => { + if let Some(options) = filtered_options(v) { + self.set_param(k, &options); + } } - "database" => { - self.replication_mode(ReplicationMode::Logical); + "user" | "database" | "application_name" | "replication" => { + self.set_param(k, v); } - _other => {} + + // if we allow arbitrary params, then we forward them through. + // this is a flag for a period of backwards compatibility + k if arbitrary_params => { + self.set_param(k, v); + } + _ => {} } } - - // TODO: extend the list of the forwarded startup parameters. - // Currently, tokio-postgres doesn't allow us to pass - // arbitrary parameters, but the ones above are a good start. - // - // This and the reverse params problem can be better addressed - // in a bespoke connection machinery (a new library for that sake). } } @@ -347,10 +335,9 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. -fn filtered_options(params: &StartupMessageParams) -> Option { +fn filtered_options(options: &str) -> Option { #[allow(unstable_name_collisions)] - let options: String = params - .options_raw()? + let options: String = StartupMessageParams::parse_options_raw(options) .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -427,27 +414,24 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. - let params = StartupMessageParams::new([("options", "")]); - assert_eq!(filtered_options(¶ms), None); + let params = ""; + assert_eq!(filtered_options(params), None); // It's likely that clients will only use options to specify endpoint/project. - let params = StartupMessageParams::new([("options", "project=foo")]); - assert_eq!(filtered_options(¶ms), None); + let params = "project=foo"; + assert_eq!(filtered_options(params), None); // Same, because unescaped whitespaces are no-op. - let params = StartupMessageParams::new([("options", " project=foo ")]); - assert_eq!(filtered_options(¶ms).as_deref(), None); + let params = " project=foo "; + assert_eq!(filtered_options(params).as_deref(), None); - let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); + let params = r"\ project=foo \ "; + assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); - let params = StartupMessageParams::new([("options", "project = foo")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); - let params = StartupMessageParams::new([( - "options", - "project = foo neon_endpoint_type:read_write neon_lsn:0/2", - )]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2 neon_proxy_params_compat:true"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 8f78df1964..7db1179eea 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -206,6 +206,7 @@ pub(crate) async fn handle_client( let mut node = connect_to_compute( ctx, &TcpMechanism { + params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, }, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 585dce7bae..a3027abd7c 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -66,6 +66,8 @@ pub(crate) trait ComputeConnectBackend { } pub(crate) struct TcpMechanism<'a> { + pub(crate) params_compat: bool, + /// KV-dictionary with PostgreSQL connection params. pub(crate) params: &'a StartupMessageParams, @@ -92,7 +94,7 @@ impl ConnectMechanism for TcpMechanism<'_> { } fn update_connect_config(&self, config: &mut compute::ConnCfg) { - config.set_startup_params(self.params); + config.set_startup_params(self.params, self.params_compat); } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index af97fb3d71..f74eb5940f 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -338,9 +338,17 @@ pub(crate) async fn handle_client( } }; + let params_compat = match &user_info { + auth::Backend::ControlPlane(_, info) => { + info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some() + } + auth::Backend::Local(_) => false, + }; + let mut node = connect_to_compute( ctx, &TcpMechanism { + params_compat, params: ¶ms, locks: &config.connect_compute_locks, }, @@ -409,19 +417,47 @@ pub(crate) async fn prepare_client_connection

( pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { + // proxy options: + + /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute. + const PARAMS_COMPAT: &str = "proxy_params_compat"; + + // cplane options: + + /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN. + const LSN: &str = "lsn"; + + /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write. + const ENDPOINT_TYPE: &str = "endpoint_type"; + pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params .options_raw() .map(Self::parse_from_iter) .unwrap_or_default() } + pub(crate) fn parse_options_raw(options: &str) -> Self { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub(crate) fn get(&self, key: &str) -> Option { + self.0 + .iter() + .find_map(|(k, v)| (k == key).then_some(v)) + .cloned() + } + pub(crate) fn is_ephemeral(&self) -> bool { - // Currently, neon endpoint options are all reserved for ephemeral endpoints. - !self.0.is_empty() + self.0.iter().any(|(k, _)| match &**k { + // This is not a cplane option, we know it does not create ephemeral computes. + Self::PARAMS_COMPAT => false, + Self::LSN => true, + Self::ENDPOINT_TYPE => true, + // err on the side of caution. any cplane options we don't know about + // might lead to ephemeral computes. + _ => true, + }) } fn parse_from_iter<'a>(options: impl Iterator) -> Self { diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index d72331c7bf..59c9ac27b8 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -55,7 +55,13 @@ async fn proxy_mitm( // give the end_server the startup parameters let mut buf = BytesMut::new(); - frontend::startup_message(startup.iter(), &mut buf).unwrap(); + frontend::startup_message( + &postgres_protocol::message::frontend::StartupMessageParams { + params: startup.params.into(), + }, + &mut buf, + ) + .unwrap(); end_server.send(buf.freeze()).await.unwrap(); // proxy messages between end_client and end_server diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 53345431e3..911b349416 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -252,7 +252,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") - .options("project=generic-project-name") + .set_param("options", "project=generic-project-name") .ssl_mode(SslMode::Prefer) .connect_raw(server, NoTls) .await?; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 55d2e47fd3..251aa47084 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -309,10 +309,13 @@ impl PoolingBackend { .config .user(&conn_info.user_info.user) .dbname(&conn_info.dbname) - .options(&format!( - "-c pg_session_jwt.jwk={}", - serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") - )); + .set_param( + "options", + &format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + ), + ); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(postgres_client::NoTls).await?; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9c579373e8..60c4a23936 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -269,7 +269,7 @@ class PgProtocol: for match in re.finditer(r"-c(\w*)=(\w*)", options): key = match.group(1) val = match.group(2) - if "server_options" in conn_options: + if "server_settings" in conn_options: conn_options["server_settings"].update({key: val}) else: conn_options["server_settings"] = {key: val} diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 5a01d90d85..d8df2efc78 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -5,6 +5,7 @@ import json import subprocess import time import urllib.parse +from contextlib import closing from typing import TYPE_CHECKING import psycopg2 @@ -131,6 +132,24 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str): assert out[0][0] == " str" +@pytest.mark.asyncio +async def test_proxy_arbitrary_params(static_proxy: NeonProxy): + with closing( + await static_proxy.connect_async(server_settings={"IntervalStyle": "iso_8601"}) + ) as conn: + out = await conn.fetchval("select to_json('0 seconds'::interval)") + assert out == '"00:00:00"' + + options = "neon_proxy_params_compat:true" + with closing( + await static_proxy.connect_async( + server_settings={"IntervalStyle": "iso_8601", "options": options} + ) + ) as conn: + out = await conn.fetchval("select to_json('0 seconds'::interval)") + assert out == '"PT0S"' + + def test_auth_errors(static_proxy: NeonProxy): """ Check that we throw very specific errors in some unsuccessful auth scenarios. From bbb050459bc0b9ee99fee937498c77fa6e70edfc Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 4 Dec 2024 14:05:31 +0100 Subject: [PATCH 57/65] feat(compute): Set default application_name for pgbouncer connections (#9973) ## Problem When client specifies `application_name`, pgbouncer propagates it to the Postgres. Yet, if client doesn't do it, we have hard time figuring out who opens a lot of Postgres connections (including the `cloud_admin` ones). See this investigation as an example: https://neondb.slack.com/archives/C0836R0RZ0D ## Summary of changes I haven't found this documented, but it looks like pgbouncer accepts standard Postgres connstring parameters in the connstring in the `[databases]` section, so put the default `application_name=pgbouncer` there. That way, we will always see who opens Postgres connections. I did tests, and if client specifies a `application_name`, pgbouncer overrides this default, so it only works if it's not specified or set to blank `&application_name=` in the connection string. This is the last place we could potentially open some Postgres connections without `application_name`. Everything else should be either of two: 1. Direct client connections without `application_name`, but these should be strictly non-`cloud_admin` ones 2. Some ad-hoc internal connections, so if we see spikes of unidentified `cloud_admin` connections, we will need to investigate it again. Fixes neondatabase/cloud#20948 --- compute/etc/pgbouncer.ini | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index cb994f961c..abcd165636 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -1,5 +1,9 @@ [databases] -*=host=localhost port=5432 auth_user=cloud_admin +;; pgbouncer propagates application_name (if it's specified) to the server, but some +;; clients don't set it. We set default application_name=pgbouncer to make it +;; easier to identify pgbouncer connections in Postgres. If client sets +;; application_name, it will be used instead. +*=host=localhost port=5432 auth_user=cloud_admin application_name=pgbouncer [pgbouncer] listen_port=6432 listen_addr=0.0.0.0 From 8c6b41daf5333b09664542e235e64fa51e232fbe Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 4 Dec 2024 14:05:53 +0100 Subject: [PATCH 58/65] Display reqwest error source (#10004) ## Problem Reqwest errors don't include details about the inner source error. This means that we get opaque errors like: ``` receive body: error sending request for url (http://localhost:9898/v1/location_config) ``` Instead of the more helpful: ``` receive body: error sending request for url (http://localhost:9898/v1/location_config): operation timed out ``` Touches #9801. ## Summary of changes Include the source error for `reqwest::Error` wherever it's displayed. --- control_plane/src/safekeeper.rs | 3 ++- pageserver/client/src/mgmt_api.rs | 6 +++--- pageserver/src/consumption_metrics/upload.rs | 7 ++++++- safekeeper/src/http/client.rs | 3 ++- storage_controller/src/compute_hook.rs | 3 ++- storage_controller/src/peer_client.rs | 11 ++++++++--- storage_scrubber/src/cloud_admin_api.rs | 14 ++++++++++---- 7 files changed, 33 insertions(+), 14 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 7a019bce88..f0c3722925 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -5,6 +5,7 @@ //! ```text //! .neon/safekeepers/ //! ``` +use std::error::Error as _; use std::future::Future; use std::io::Write; use std::path::PathBuf; @@ -26,7 +27,7 @@ use crate::{ #[derive(Error, Debug)] pub enum SafekeeperHttpError { - #[error("Reqwest error: {0}")] + #[error("request error: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Transport(#[from] reqwest::Error), #[error("Error: {0}")] diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4d76c66905..c3a1ef8140 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::{collections::HashMap, error::Error as _}; use bytes::Bytes; use detach_ancestor::AncestorDetached; @@ -25,10 +25,10 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { - #[error("send request: {0}")] + #[error("send request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] SendRequest(reqwest::Error), - #[error("receive body: {0}")] + #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), #[error("receive error body: {0}")] diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 1cb4e917c0..448bf47525 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,3 +1,4 @@ +use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; @@ -350,7 +351,11 @@ impl std::fmt::Display for UploadError { match self { Rejected(code) => write!(f, "server rejected the metrics with {code}"), - Reqwest(e) => write!(f, "request failed: {e}"), + Reqwest(e) => write!( + f, + "request failed: {e}{}", + e.source().map(|e| format!(": {e}")).unwrap_or_default() + ), Cancelled => write!(f, "cancelled"), } } diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs index c56f7880d4..a166fc1ab9 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -8,6 +8,7 @@ //! etc. use reqwest::{IntoUrl, Method, StatusCode}; +use std::error::Error as _; use utils::{ http::error::HttpErrorBody, id::{NodeId, TenantId, TimelineId}, @@ -26,7 +27,7 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { /// Failed to receive body (reqwest error). - #[error("receive body: {0}")] + #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), /// Status is not ok, but failed to parse body as `HttpErrorBody`. diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index b63a322b87..2b2ece3f02 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::error::Error as _; use std::sync::Arc; use std::{collections::HashMap, time::Duration}; @@ -172,7 +173,7 @@ struct ComputeHookNotifyRequest { #[derive(thiserror::Error, Debug)] pub(crate) enum NotifyError { // Request was not send successfully, e.g. transport error - #[error("Sending request: {0}")] + #[error("Sending request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Request(#[from] reqwest::Error), // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. #[error("Control plane tenant busy")] diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 3f8520fe55..ee4eb55294 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,7 +1,9 @@ use crate::tenant_shard::ObservedState; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; +use std::error::Error as _; +use std::time::Duration; use tokio_util::sync::CancellationToken; use hyper::Uri; @@ -17,11 +19,14 @@ pub(crate) struct PeerClient { #[derive(thiserror::Error, Debug)] pub(crate) enum StorageControllerPeerError { - #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + #[error( + "failed to deserialize error response with status code {0} at {1}: {2}{}", + .2.source().map(|e| format!(": {e}")).unwrap_or_default() + )] DeserializationError(StatusCode, Url, reqwest::Error), #[error("storage controller peer API error ({0}): {1}")] ApiError(StatusCode, String), - #[error("failed to send HTTP request: {0}")] + #[error("failed to send HTTP request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] SendError(reqwest::Error), #[error("Cancelled")] Cancelled, diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index c9a62cd256..b1dfe3a53f 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -1,3 +1,5 @@ +use std::error::Error as _; + use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; @@ -30,14 +32,18 @@ impl std::fmt::Display for Error { match &self.kind { ErrorKind::RequestSend(e) => write!( f, - "Failed to send a request. Context: {}, error: {}", - self.context, e + "Failed to send a request. Context: {}, error: {}{}", + self.context, + e, + e.source().map(|e| format!(": {e}")).unwrap_or_default() ), ErrorKind::BodyRead(e) => { write!( f, - "Failed to read a request body. Context: {}, error: {}", - self.context, e + "Failed to read a request body. Context: {}, error: {}{}", + self.context, + e, + e.source().map(|e| format!(": {e}")).unwrap_or_default() ) } ErrorKind::ResponseStatus(status) => { From d550e3f6263d988166c84ac62358abdd7e01f948 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:10:00 +0100 Subject: [PATCH 59/65] Create a branch for compute release (#9637) ## Problem We practice a manual release flow for the compute module. This will allow automation of the compute release process. ## Summary of changes The workflow was modified to make a compute release automatically on the branch release-compute. ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .../actions/allure-report-generate/action.yml | 3 ++- .../actions/allure-report-store/action.yml | 3 ++- .github/workflows/_create-release-pr.yml | 2 +- .github/workflows/build_and_test.yml | 23 +++++++++++-------- .github/workflows/release.yml | 23 ++++++++++++++++--- .github/workflows/trigger-e2e-tests.yml | 2 ++ 6 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d1d09223db..d6219c31b4 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -43,7 +43,8 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ + [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 9c376f420a..3c83656c89 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -23,7 +23,8 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ + [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index cc6994397f..3c130c8229 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -21,7 +21,7 @@ defaults: shell: bash -euo pipefail {0} jobs: - create-storage-release-branch: + create-release-branch: runs-on: ubuntu-22.04 permissions: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e9e111e7bd..cb966f292e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -6,6 +6,7 @@ on: - main - release - release-proxy + - release-compute pull_request: defaults: @@ -70,8 +71,10 @@ jobs: echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT fi shell: bash @@ -513,7 +516,7 @@ jobs: }) trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }} + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} needs: [ check-permissions, promote-images, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -934,7 +937,7 @@ jobs: neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Configure AWS-prod credentials - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -943,12 +946,12 @@ jobs: - name: Login to prod ECR uses: docker/login-action@v3 - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' with: registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - name: Copy all images to prod ECR - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' run: | for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ @@ -968,7 +971,7 @@ jobs: tenant_id: ${{ vars.AZURE_TENANT_ID }} push-to-acr-prod: - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' needs: [ tag, promote-images ] uses: ./.github/workflows/_push-to-acr.yml with: @@ -1056,7 +1059,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` - if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled() + if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -1105,13 +1108,15 @@ jobs: -f deployProxyAuthBroker=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}} else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'" exit 1 fi - name: Create git tag - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 11f010b6d4..f0273b977f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,6 +15,10 @@ on: type: boolean description: 'Create Proxy release PR' required: false + create-compute-release-branch: + type: boolean + description: 'Create Compute release PR' + required: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -25,20 +29,20 @@ defaults: jobs: create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} + if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }} permissions: contents: write uses: ./.github/workflows/_create-release-pr.yml with: - component-name: 'Storage & Compute' + component-name: 'Storage' release-branch: 'release' secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} + if: ${{ github.event.schedule == '0 6 * * THU' || inputs.create-proxy-release-branch }} permissions: contents: write @@ -49,3 +53,16 @@ jobs: release-branch: 'release-proxy' secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + + create-compute-release-branch: + if: inputs.create-compute-release-branch + + permissions: + contents: write + + uses: ./.github/workflows/_create-release-pr.yml + with: + component-name: 'Compute' + release-branch: 'release-compute' + secrets: + ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 1e7264c55a..70c2e8549f 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -51,6 +51,8 @@ jobs: echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') From 088eb72dd7a3d104cf854bf87be30ba922b52c90 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 4 Dec 2024 15:04:04 +0000 Subject: [PATCH 60/65] tests: make storcon scale test AZ-aware (#9952) ## Problem We have a scale test for the storage controller which also acts as a good stress test for scheduling stability. However, it created nodes with no AZs set. ## Summary of changes - Bump node count to 6 and set AZs on them. This is a precursor to other AZ-related PRs, to make sure any new code that's landed is getting scale tested in an AZ-aware environment. --- test_runner/performance/test_storage_controller_scale.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 142bd3d669..49f41483ec 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -72,7 +72,7 @@ def test_storage_controller_many_tenants( we don't fall over for a thousand shards. """ - neon_env_builder.num_pageservers = 5 + neon_env_builder.num_pageservers = 6 neon_env_builder.storage_controller_config = { # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to @@ -84,6 +84,11 @@ def test_storage_controller_many_tenants( compute_reconfigure_listener.control_plane_compute_hook_api ) + AZS = ["alpha", "bravo", "charlie"] + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( + {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + ) + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) From 7b0e3db868ff251f5b3f043c93ea8d59fb8e929e Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:54:56 -0500 Subject: [PATCH 61/65] pageserver: make `BufferedWriter` do double-buffering (#9693) Closes #9387. ## Problem `BufferedWriter` cannot proceed while the owned buffer is flushing to disk. We want to implement double buffering so that the flush can happen in the background. See #9387. ## Summary of changes - Maintain two owned buffers in `BufferedWriter`. - The writer is in charge of copying the data into owned, aligned buffer, once full, submit it to the flush task. - The flush background task is in charge of flushing the owned buffer to disk, and returned the buffer to the writer for reuse. - The writer and the flush background task communicate through a bi-directional channel. For in-memory layer, we also need to be able to read from the buffered writer in `get_values_reconstruct_data`. To handle this case, we did the following - Use replace `VirtualFile::write_all` with `VirtualFile::write_all_at`, and use `Arc` to share it between writer and background task. - leverage `IoBufferMut::freeze` to get a cheaply clonable `IoBuffer`, one clone will be submitted to the channel, the other clone will be saved within the writer to serve reads. When we want to reuse the buffer, we can invoke `IoBuffer::into_mut`, which gives us back the mutable aligned buffer. - InMemoryLayer reads is now aware of the maybe_flushed part of the buffer. **Caveat** - We removed the owned version of write, because this interface does not work well with buffer alignment. The result is that without direct IO enabled, [`download_object`](https://github.com/neondatabase/neon/blob/a439d57050dafd603d24e001215213eb5246a029/pageserver/src/tenant/remote_timeline_client/download.rs#L243) does one more memcpy than before this PR due to the switch to use `_borrowed` version of the write. - "Bypass aligned part of write" could be implemented later to avoid large amount of memcpy. **Testing** - use an oneshot channel based control mechanism to make flush behavior deterministic in test. - test reading from `EphemeralFile` when the last submitted buffer is not flushed, in-progress, and done flushing to disk. ## Performance We see performance improvement for small values, and regression on big values, likely due to being CPU bound + disk write latency. [Results](https://www.notion.so/neondatabase/Benchmarking-New-BufferedWriter-11-20-2024-143f189e0047805ba99acda89f984d51?pvs=4) ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Signed-off-by: Yuchen Liang Co-authored-by: Christian Schwarz --- libs/utils/src/sync.rs | 1 + libs/utils/src/sync/duplex.rs | 1 + libs/utils/src/sync/duplex/mpsc.rs | 36 ++ pageserver/benches/bench_ingest.rs | 4 +- pageserver/src/tenant/ephemeral_file.rs | 286 +++++++++----- .../src/tenant/remote_timeline_client.rs | 2 + .../tenant/remote_timeline_client/download.rs | 44 ++- pageserver/src/tenant/secondary/downloader.rs | 1 + .../tenant/storage_layer/inmemory_layer.rs | 5 +- pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/timeline.rs | 3 +- .../src/tenant/timeline/layer_manager.rs | 14 +- pageserver/src/virtual_file.rs | 26 +- .../aligned_buffer/alignment.rs | 4 +- .../owned_buffers_io/aligned_buffer/buffer.rs | 18 +- .../aligned_buffer/buffer_mut.rs | 43 ++- .../owned_buffers_io/io_buf_aligned.rs | 10 +- .../owned_buffers_io/io_buf_ext.rs | 14 + .../util/size_tracking_writer.rs | 50 --- .../virtual_file/owned_buffers_io/write.rs | 358 +++++++++--------- .../owned_buffers_io/write/flush.rs | 314 +++++++++++++++ 21 files changed, 846 insertions(+), 389 deletions(-) create mode 100644 libs/utils/src/sync/duplex.rs create mode 100644 libs/utils/src/sync/duplex/mpsc.rs delete mode 100644 pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/write/flush.rs diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs index 7aa26e24bc..280637de8f 100644 --- a/libs/utils/src/sync.rs +++ b/libs/utils/src/sync.rs @@ -1,5 +1,6 @@ pub mod heavier_once_cell; +pub mod duplex; pub mod gate; pub mod spsc_fold; diff --git a/libs/utils/src/sync/duplex.rs b/libs/utils/src/sync/duplex.rs new file mode 100644 index 0000000000..fac79297a0 --- /dev/null +++ b/libs/utils/src/sync/duplex.rs @@ -0,0 +1 @@ +pub mod mpsc; diff --git a/libs/utils/src/sync/duplex/mpsc.rs b/libs/utils/src/sync/duplex/mpsc.rs new file mode 100644 index 0000000000..56b4e6d2b3 --- /dev/null +++ b/libs/utils/src/sync/duplex/mpsc.rs @@ -0,0 +1,36 @@ +use tokio::sync::mpsc; + +/// A bi-directional channel. +pub struct Duplex { + pub tx: mpsc::Sender, + pub rx: mpsc::Receiver, +} + +/// Creates a bi-directional channel. +/// +/// The channel will buffer up to the provided number of messages. Once the buffer is full, +/// attempts to send new messages will wait until a message is received from the channel. +/// The provided buffer capacity must be at least 1. +pub fn channel(buffer: usize) -> (Duplex, Duplex) { + let (tx_a, rx_a) = mpsc::channel::(buffer); + let (tx_b, rx_b) = mpsc::channel::(buffer); + + (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a }) +} + +impl Duplex { + /// Sends a value, waiting until there is capacity. + /// + /// A successful send occurs when it is determined that the other end of the channel has not hung up already. + pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { + self.tx.send(x).await + } + + /// Receives the next value for this receiver. + /// + /// This method returns `None` if the channel has been closed and there are + /// no remaining messages in the channel's buffer. + pub async fn recv(&mut self) -> Option { + self.rx.recv().await + } +} diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index caacd365b3..b67a9cc479 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -62,10 +62,8 @@ async fn ingest( let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); let gate = utils::sync::gate::Gate::default(); - let entered = gate.enter().unwrap(); - let layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; + let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?; let data = Value::Image(Bytes::from(vec![0u8; put_size])); let data_ser_size = data.serialized_size().unwrap() as usize; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index de0abab4c0..aaec8a4c31 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -8,10 +8,8 @@ use crate::page_cache; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; -use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::owned_buffers_io::write::Buffer; use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use num_traits::Num; use pageserver_api::shard::TenantShardId; @@ -20,6 +18,7 @@ use tracing::error; use std::io; use std::sync::atomic::AtomicU64; +use std::sync::Arc; use utils::id::TimelineId; pub struct EphemeralFile { @@ -27,10 +26,7 @@ pub struct EphemeralFile { _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, bytes_written: u64, - buffered_writer: owned_buffers_io::write::BufferedWriter< - BytesMut, - size_tracking_writer::Writer, - >, + buffered_writer: owned_buffers_io::write::BufferedWriter, /// Gate guard is held on as long as we need to do operations in the path (delete on drop) _gate_guard: utils::sync::gate::GateGuard, } @@ -42,9 +38,9 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, - ) -> Result { + ) -> anyhow::Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed); @@ -55,15 +51,17 @@ impl EphemeralFile { "ephemeral-{filename_disambiguator}" ))); - let file = VirtualFile::open_with_options( - &filename, - virtual_file::OpenOptions::new() - .read(true) - .write(true) - .create(true), - ctx, - ) - .await?; + let file = Arc::new( + VirtualFile::open_with_options_v2( + &filename, + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), + ctx, + ) + .await?, + ); let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore @@ -73,10 +71,12 @@ impl EphemeralFile { page_cache_file_id, bytes_written: 0, buffered_writer: owned_buffers_io::write::BufferedWriter::new( - size_tracking_writer::Writer::new(file), - BytesMut::with_capacity(TAIL_SZ), + file, + || IoBufferMut::with_capacity(TAIL_SZ), + gate.enter()?, + ctx, ), - _gate_guard: gate_guard, + _gate_guard: gate.enter()?, }) } } @@ -85,7 +85,7 @@ impl Drop for EphemeralFile { fn drop(&mut self) { // unlink the file // we are clear to do this, because we have entered a gate - let path = self.buffered_writer.as_inner().as_inner().path(); + let path = self.buffered_writer.as_inner().path(); let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { @@ -132,6 +132,18 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> std::io::Result { + let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?; + if let Some(control) = control { + control.release().await; + } + Ok(pos) + } + + async fn write_raw_controlled( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result<(u64, Option)> { let pos = self.bytes_written; let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { @@ -145,9 +157,9 @@ impl EphemeralFile { })?; // Write the payload - let nwritten = self + let (nwritten, control) = self .buffered_writer - .write_buffered_borrowed(srcbuf, ctx) + .write_buffered_borrowed_controlled(srcbuf, ctx) .await?; assert_eq!( nwritten, @@ -157,7 +169,7 @@ impl EphemeralFile { self.bytes_written = new_bytes_written; - Ok(pos) + Ok((pos, control)) } } @@ -168,11 +180,12 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst: tokio_epoll_uring::Slice, ctx: &'a RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { - let file_size_tracking_writer = self.buffered_writer.as_inner(); - let flushed_offset = file_size_tracking_writer.bytes_written(); + let submitted_offset = self.buffered_writer.bytes_submitted(); - let buffer = self.buffered_writer.inspect_buffer(); - let buffered = &buffer[0..buffer.pending()]; + let mutable = self.buffered_writer.inspect_mutable(); + let mutable = &mutable[0..mutable.pending()]; + + let maybe_flushed = self.buffered_writer.inspect_maybe_flushed(); let dst_cap = dst.bytes_total().into_u64(); let end = { @@ -197,11 +210,42 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral } } } - let written_range = Range(start, std::cmp::min(end, flushed_offset)); - let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + let (written_range, maybe_flushed_range) = { + if maybe_flushed.is_some() { + // [ written ][ maybe_flushed ][ mutable ] + // <- TAIL_SZ -><- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++????????????????> + ( + Range( + start, + std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)), + ), + Range( + std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)), + std::cmp::min(end, submitted_offset), + ), + ) + } else { + // [ written ][ mutable ] + // <- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++++++++++++++++++> + ( + Range(start, std::cmp::min(end, submitted_offset)), + // zero len + Range(submitted_offset, u64::MIN), + ) + } + }; + + let mutable_range = Range(std::cmp::max(start, submitted_offset), end); let dst = if written_range.len() > 0 { - let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let file: &VirtualFile = self.buffered_writer.as_inner(); let bounds = dst.bounds(); let slice = file .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) @@ -211,19 +255,21 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; - let dst = if buffered_range.len() > 0 { - let offset_in_buffer = buffered_range + let dst = if maybe_flushed_range.len() > 0 { + let offset_in_buffer = maybe_flushed_range .0 - .checked_sub(flushed_offset) + .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64)) .unwrap() .into_usize(); - let to_copy = - &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + // Checked previously the buffer is Some. + let maybe_flushed = maybe_flushed.unwrap(); + let to_copy = &maybe_flushed + [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())]; let bounds = dst.bounds(); let mut view = dst.slice({ let start = written_range.len().into_usize(); let end = start - .checked_add(buffered_range.len().into_usize()) + .checked_add(maybe_flushed_range.len().into_usize()) .unwrap(); start..end }); @@ -234,6 +280,28 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; + let dst = if mutable_range.len() > 0 { + let offset_in_buffer = mutable_range + .0 + .checked_sub(submitted_offset) + .unwrap() + .into_usize(); + let to_copy = + &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = + written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); + let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs Ok((dst, (end - start).into_usize())) @@ -295,7 +363,7 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) .await .unwrap(); @@ -326,14 +394,15 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); - let write_nbytes = cap + cap / 2; + let write_nbytes = cap * 2 + cap / 2; let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) @@ -341,30 +410,39 @@ mod tests { .collect(); let mut value_offsets = Vec::new(); - for i in 0..write_nbytes { - let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap(); + for range in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + { + let off = file.write_raw(&content[range], &ctx).await.unwrap(); value_offsets.push(off); } - assert!(file.len() as usize == write_nbytes); - for i in 0..write_nbytes { - assert_eq!(value_offsets[i], i.into_u64()); - let buf = IoBufferMut::with_capacity(1); + assert_eq!(file.len() as usize, write_nbytes); + for (i, range) in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + .enumerate() + { + assert_eq!(value_offsets[i], range.start.into_u64()); + let buf = IoBufferMut::with_capacity(range.len()); let (buf_slice, nread) = file - .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) + .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx) .await .unwrap(); let buf = buf_slice.into_inner(); - assert_eq!(nread, 1); - assert_eq!(&buf, &content[i..i + 1]); + assert_eq!(nread, range.len()); + assert_eq!(&buf, &content[range]); } - let file_contents = - std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap(); - assert_eq!(file_contents, &content[0..cap]); + let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); + assert!(file_contents == content[0..cap * 2]); - let buffer_contents = file.buffered_writer.inspect_buffer(); - assert_eq!(buffer_contents, &content[cap..write_nbytes]); + let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); + assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]); + + let mutable_buffer_contents = file.buffered_writer.inspect_mutable(); + assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); } #[tokio::test] @@ -373,16 +451,16 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + // mutable buffer and maybe_flushed buffer each has `cap` bytes. + let cap = file.buffered_writer.inspect_mutable().capacity(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); file.write_raw(&content, &ctx).await.unwrap(); @@ -390,23 +468,21 @@ mod tests { // assert the state is as this test expects it to be assert_eq!( &file.load_to_io_buf(&ctx).await.unwrap(), - &content[0..cap + cap / 2] + &content[0..cap * 2 + cap / 2] ); - let md = file - .buffered_writer - .as_inner() - .as_inner() - .path() - .metadata() - .unwrap(); + let md = file.buffered_writer.as_inner().path().metadata().unwrap(); assert_eq!( md.len(), - cap.into_u64(), - "buffered writer does one write if we write 1.5x buffer capacity" + 2 * cap.into_u64(), + "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" ); assert_eq!( - &file.buffered_writer.inspect_buffer()[0..cap / 2], - &content[cap..cap + cap / 2] + &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap], + &content[cap..cap * 2] + ); + assert_eq!( + &file.buffered_writer.inspect_mutable()[0..cap / 2], + &content[cap * 2..cap * 2 + cap / 2] ); } @@ -422,19 +498,19 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); - - let cap = file.buffered_writer.inspect_buffer().capacity(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); - file.write_raw(&content, &ctx).await.unwrap(); + let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap(); let test_read = |start: usize, len: usize| { let file = &file; @@ -454,16 +530,38 @@ mod tests { } }; + let test_read_all_offset_combinations = || { + async move { + test_read(align, align).await; + // border onto edge of file + test_read(cap - align, align).await; + // read across file and buffer + test_read(cap - align, 2 * align).await; + // stay from start of maybe flushed buffer + test_read(cap, align).await; + // completely within maybe flushed buffer + test_read(cap + align, align).await; + // border onto edge of maybe flushed buffer. + test_read(cap * 2 - align, align).await; + // read across maybe flushed and mutable buffer + test_read(cap * 2 - align, 2 * align).await; + // read across three segments + test_read(cap - align, cap + 2 * align).await; + // completely within mutable buffer + test_read(cap * 2 + align, align).await; + } + }; + // completely within the file range - assert!(20 < cap, "test assumption"); - test_read(10, 10).await; - // border onto edge of file - test_read(cap - 10, 10).await; - // read across file and buffer - test_read(cap - 10, 20).await; - // stay from start of buffer - test_read(cap, 10).await; - // completely within buffer - test_read(cap + 10, 10).await; + assert!(align < cap, "test assumption"); + assert!(cap % align == 0); + + // test reads at different flush stages. + let not_started = control.unwrap().into_not_started(); + test_read_all_offset_combinations().await; + let in_progress = not_started.ready_to_flush(); + test_read_all_offset_combinations().await; + in_progress.wait_until_flush_is_done().await; + test_read_all_offset_combinations().await; } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4bb1bbf3cf..89b935947d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -681,6 +681,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -700,6 +701,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, local_path, + gate, cancel, ctx, ) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 739615be9c..c5ae466f3a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; +use std::sync::Arc; use std::time::SystemTime; use anyhow::{anyhow, Context}; @@ -26,9 +27,7 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -#[cfg_attr(target_os = "macos", allow(unused_imports))] -use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{ DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, @@ -60,6 +59,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -88,7 +88,9 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, + || async { + download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await + }, &format!("download {remote_path:?}"), cancel, ) @@ -148,6 +150,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -205,13 +208,16 @@ async fn download_object<'a>( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; - use bytes::BytesMut; + use crate::virtual_file::owned_buffers_io; async { - let destination_file = VirtualFile::create(dst_path, ctx) - .await - .with_context(|| format!("create a destination file for layer '{dst_path}'")) - .map_err(DownloadError::Other)?; + let destination_file = Arc::new( + VirtualFile::create(dst_path, ctx) + .await + .with_context(|| { + format!("create a destination file for layer '{dst_path}'") + }) + .map_err(DownloadError::Other)?, + ); let mut download = storage .download(src_path, &DownloadOpts::default(), cancel) @@ -219,14 +225,16 @@ async fn download_object<'a>( pausable_failpoint!("before-downloading-layer-stream-pausable"); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + destination_file, + || IoBufferMut::with_capacity(super::BUFFER_SIZE), + gate.enter().map_err(|_| DownloadError::Cancelled)?, + ctx, + ); + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { - let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::::new( - size_tracking, - BytesMut::with_capacity(super::BUFFER_SIZE), - ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { @@ -234,10 +242,10 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - buffered.write_buffered(chunk.slice_len(), ctx).await?; + buffered.write_buffered_borrowed(&chunk, ctx).await?; } - let size_tracking = buffered.flush_and_into_inner(ctx).await?; - Ok(size_tracking.into_inner()) + let inner = buffered.flush_and_into_inner(ctx).await?; + Ok(inner) } .await?; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 701e4cf04b..395e34e404 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1183,6 +1183,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &layer.metadata, &local_path, + &self.secondary_state.gate, &self.secondary_state.cancel, ctx, ) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index af6112d535..71e53da20f 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -555,13 +555,12 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = - EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a9f1189b41..8933e8ceb1 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1149,6 +1149,7 @@ impl LayerInner { &self.desc.layer_name(), &self.metadata(), &self.path, + &timeline.gate, &timeline.cancel, ctx, ) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1414bef0a5..fc741826ab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3455,7 +3455,6 @@ impl Timeline { ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; - let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; let last_record_lsn = self.get_last_record_lsn(); ensure!( @@ -3472,7 +3471,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - gate_guard, + &self.gate, ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 4293a44dca..3888e7f86a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -182,7 +182,7 @@ impl OpenLayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> anyhow::Result> { ensure!(lsn.is_aligned()); @@ -212,15 +212,9 @@ impl OpenLayerManager { lsn ); - let new_layer = InMemoryLayer::create( - conf, - timeline_id, - tenant_shard_id, - start_lsn, - gate_guard, - ctx, - ) - .await?; + let new_layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b9f8c7ea20..8a7f4a4bf5 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; -use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; +use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; @@ -63,9 +63,6 @@ pub(crate) mod owned_buffers_io { pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; - pub(crate) mod util { - pub(crate) mod size_tracking_writer; - } } #[derive(Debug)] @@ -221,7 +218,7 @@ impl VirtualFile { self.inner.read_exact_at_page(page, offset, ctx).await } - pub async fn write_all_at( + pub async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1325,14 +1322,14 @@ impl Drop for VirtualFileInner { } impl OwnedAsyncWriter for VirtualFile { - #[inline(always)] - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; - res.map(move |v| (v, buf)) + ) -> std::io::Result> { + let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await; + res.map(|_| buf) } } @@ -1451,7 +1448,7 @@ mod tests { } } } - async fn write_all_at( + async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1594,6 +1591,7 @@ mod tests { &ctx, ) .await?; + file_a .write_all(b"foobar".to_vec().slice_len(), &ctx) .await?; @@ -1652,10 +1650,10 @@ mod tests { ) .await?; file_b - .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) .await?; file_b - .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) .await?; assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs index 933b78a13b..6b9992643f 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs @@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static { } /// Alignment at compile time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct ConstAlign; impl Alignment for ConstAlign { @@ -14,7 +14,7 @@ impl Alignment for ConstAlign { } /// Alignment at run time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct RuntimeAlign { align: usize, } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 2fba6d699b..a5c26cd746 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -3,9 +3,10 @@ use std::{ sync::Arc, }; -use super::{alignment::Alignment, raw::RawAlignedBuffer}; +use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. +#[derive(Clone, Debug)] pub struct AlignedBuffer { /// Shared raw buffer. raw: Arc>, @@ -86,6 +87,13 @@ impl AlignedBuffer { range: begin..end, } } + + /// Returns the mutable aligned buffer, if the immutable aligned buffer + /// has exactly one strong reference. Otherwise returns `None`. + pub fn into_mut(self) -> Option> { + let raw = Arc::into_inner(self.raw)?; + Some(AlignedBufferMut::from_raw(raw)) + } } impl Deref for AlignedBuffer { @@ -108,6 +116,14 @@ impl PartialEq<[u8]> for AlignedBuffer { } } +impl From<&[u8; N]> for AlignedBuffer> { + fn from(value: &[u8; N]) -> Self { + let mut buf = AlignedBufferMut::with_capacity(N); + buf.extend_from_slice(value); + buf.freeze() + } +} + /// SAFETY: the underlying buffer references a stable memory region. unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { fn stable_ptr(&self) -> *const u8 { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index b3675d1aea..d2f5e206bb 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,4 +1,7 @@ -use std::ops::{Deref, DerefMut}; +use std::{ + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; use super::{ alignment::{Alignment, ConstAlign}, @@ -46,6 +49,11 @@ impl AlignedBufferMut> { } impl AlignedBufferMut { + /// Constructs a mutable aligned buffer from raw. + pub(super) fn from_raw(raw: RawAlignedBuffer) -> Self { + AlignedBufferMut { raw } + } + /// Returns the total number of bytes the buffer can hold. #[inline] pub fn capacity(&self) -> usize { @@ -128,6 +136,39 @@ impl AlignedBufferMut { let len = self.len(); AlignedBuffer::from_raw(self.raw, 0..len) } + + /// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed. + #[inline] + pub fn extend_from_slice(&mut self, extend: &[u8]) { + let cnt = extend.len(); + self.reserve(cnt); + + // SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy. + unsafe { + let dst = self.spare_capacity_mut(); + // Reserved above + debug_assert!(dst.len() >= cnt); + + core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt); + } + // SAFETY: We do have at least `cnt` bytes remaining before advance. + unsafe { + bytes::BufMut::advance_mut(self, cnt); + } + } + + /// Returns the remaining spare capacity of the vector as a slice of `MaybeUninit`. + #[inline] + fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { + // SAFETY: we guarantees that the `Self::capacity()` bytes from + // `Self::as_mut_ptr()` are allocated. + unsafe { + let ptr = self.as_mut_ptr().add(self.len()); + let len = self.capacity() - self.len(); + + core::slice::from_raw_parts_mut(ptr.cast(), len) + } + } } impl Deref for AlignedBufferMut { diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index dba695196e..4ea6b17744 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -1,9 +1,15 @@ -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::{IoBuf, IoBufMut}; -use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; +use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf}; +/// A marker trait for a mutable aligned buffer type. pub trait IoBufAlignedMut: IoBufMut {} +/// A marker trait for an aligned buffer type. +pub trait IoBufAligned: IoBuf {} + impl IoBufAlignedMut for IoBufferMut {} +impl IoBufAligned for IoBuffer {} + impl IoBufAlignedMut for PageWriteGuardBuf {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index c3940cf6ce..525f447b6d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -5,6 +5,8 @@ use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use super::write::CheapCloneForRead; + /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// /// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, @@ -43,6 +45,18 @@ where } } +impl CheapCloneForRead for FullSlice +where + B: IoBuf + CheapCloneForRead, +{ + fn cheap_clone(&self) -> Self { + let bounds = self.slice.bounds(); + let clone = self.slice.get_ref().cheap_clone(); + let slice = clone.slice(bounds); + Self { slice } + } +} + pub(crate) trait IoBufExt { /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. fn slice_len(self) -> FullSlice diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs deleted file mode 100644 index efcb61ba65..0000000000 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, -}; -use tokio_epoll_uring::IoBuf; - -pub struct Writer { - dst: W, - bytes_amount: u64, -} - -impl Writer { - pub fn new(dst: W) -> Self { - Self { - dst, - bytes_amount: 0, - } - } - - pub fn bytes_written(&self) -> u64 { - self.bytes_amount - } - - pub fn as_inner(&self) -> &W { - &self.dst - } - - /// Returns the wrapped `VirtualFile` object as well as the number - /// of bytes that were written to it through this object. - #[cfg_attr(target_os = "macos", allow(dead_code))] - pub fn into_inner(self) -> (u64, W) { - (self.bytes_amount, self.dst) - } -} - -impl OwnedAsyncWriter for Writer -where - W: OwnedAsyncWriter, -{ - #[inline(always)] - async fn write_all( - &mut self, - buf: FullSlice, - ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; - self.bytes_amount += u64::try_from(nwritten).unwrap(); - Ok((nwritten, buf)) - } -} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 568cf62e56..20bf878123 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,55 +1,88 @@ -use bytes::BytesMut; +mod flush; +use std::sync::Arc; + +use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::context::RequestContext; +use crate::{ + context::RequestContext, + virtual_file::{IoBuffer, IoBufferMut}, +}; -use super::io_buf_ext::{FullSlice, IoBufExt}; +use super::{ + io_buf_aligned::IoBufAligned, + io_buf_ext::{FullSlice, IoBufExt}, +}; + +pub(crate) use flush::FlushControl; + +pub(crate) trait CheapCloneForRead { + /// Returns a cheap clone of the buffer. + fn cheap_clone(&self) -> Self; +} + +impl CheapCloneForRead for IoBuffer { + fn cheap_clone(&self) -> Self { + // Cheap clone over an `Arc`. + self.clone() + } +} /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. +/// The owned buffers need to be aligned due to Direct IO requirements. pub trait OwnedAsyncWriter { - async fn write_all( - &mut self, + fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)>; + ) -> impl std::future::Future>> + Send; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch /// small writes into larger writes of size [`Buffer::cap`]. -/// -/// # Passthrough Of Large Writers -/// -/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] -/// cause the internal buffer to be flushed prematurely so that the large -/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. -/// -/// This pass-through is generally beneficial for throughput, but if -/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, -/// unlimited large writes may cause latency or fairness issues. -/// -/// In such cases, a different implementation that always buffers in memory -/// may be preferable. -pub struct BufferedWriter { - writer: W, +// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput, +// since we would avoid copying majority of the data into the internal buffer. +pub struct BufferedWriter { + writer: Arc, /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever /// /// In these exceptional cases, it's `None`. - buf: Option, + mutable: Option, + /// A handle to the background flush task for writting data to disk. + flush_handle: FlushHandle, + /// The number of bytes submitted to the background task. + bytes_submitted: u64, } impl BufferedWriter where - B: Buffer + Send, - Buf: IoBuf + Send, - W: OwnedAsyncWriter, + B: Buffer + Send + 'static, + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { - pub fn new(writer: W, buf: B) -> Self { + /// Creates a new buffered writer. + /// + /// The `buf_new` function provides a way to initialize the owned buffers used by this writer. + pub fn new( + writer: Arc, + buf_new: impl Fn() -> B, + gate_guard: utils::sync::gate::GateGuard, + ctx: &RequestContext, + ) -> Self { Self { - writer, - buf: Some(buf), + writer: writer.clone(), + mutable: Some(buf_new()), + flush_handle: FlushHandle::spawn_new( + writer, + buf_new(), + gate_guard, + ctx.attached_child(), + ), + bytes_submitted: 0, } } @@ -57,87 +90,70 @@ where &self.writer } + /// Returns the number of bytes submitted to the background flush task. + pub fn bytes_submitted(&self) -> u64 { + self.bytes_submitted + } + /// Panics if used after any of the write paths returned an error - pub fn inspect_buffer(&self) -> &B { - self.buf() + pub fn inspect_mutable(&self) -> &B { + self.mutable() + } + + /// Gets a reference to the maybe flushed read-only buffer. + /// Returns `None` if the writer has not submitted any flush request. + pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice> { + self.flush_handle.maybe_flushed.as_ref() } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + pub async fn flush_and_into_inner( + mut self, + ctx: &RequestContext, + ) -> std::io::Result<(u64, Arc)> { self.flush(ctx).await?; - let Self { buf, writer } = self; + let Self { + mutable: buf, + writer, + mut flush_handle, + bytes_submitted: bytes_amount, + } = self; + flush_handle.shutdown().await?; assert!(buf.is_some()); - Ok(writer) + Ok((bytes_amount, writer)) } + /// Gets a reference to the mutable in-memory buffer. #[inline(always)] - fn buf(&self) -> &B { - self.buf + fn mutable(&self) -> &B { + self.mutable .as_ref() .expect("must not use after we returned an error") } - /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. - #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered( + pub async fn write_buffered_borrowed( &mut self, - chunk: FullSlice, + chunk: &[u8], ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let chunk = chunk.into_raw_slice(); - - let chunk_len = chunk.len(); - // avoid memcpy for the middle of the chunk - if chunk.len() >= self.buf().cap() { - self.flush(ctx).await?; - // do a big write, bypassing `buf` - assert_eq!( - self.buf - .as_ref() - .expect("must not use after an error") - .pending(), - 0 - ); - let (nwritten, chunk) = self - .writer - .write_all(FullSlice::must_new(chunk), ctx) - .await?; - assert_eq!(nwritten, chunk_len); - return Ok((nwritten, chunk)); + ) -> std::io::Result { + let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?; + if let Some(control) = control { + control.release().await; } - // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < self.buf().cap()); - let mut slice = &chunk[..]; - while !slice.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); - let need = buf.cap() - buf.pending(); - let have = slice.len(); - let n = std::cmp::min(need, have); - buf.extend_from_slice(&slice[..n]); - slice = &slice[n..]; - if buf.pending() >= buf.cap() { - assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; - } - } - assert!(slice.is_empty(), "by now we should have drained the chunk"); - Ok((chunk_len, FullSlice::must_new(chunk))) + Ok(len) } - /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. - /// - /// It is less performant because we always have to copy the borrowed data into the internal buffer - /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant - /// for large writes. - pub async fn write_buffered_borrowed( + /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior. + pub(crate) async fn write_buffered_borrowed_controlled( &mut self, mut chunk: &[u8], ctx: &RequestContext, - ) -> std::io::Result { + ) -> std::io::Result<(usize, Option)> { let chunk_len = chunk.len(); + let mut control: Option = None; while !chunk.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); + let buf = self.mutable.as_mut().expect("must not use after an error"); let need = buf.cap() - buf.pending(); let have = chunk.len(); let n = std::cmp::min(need, have); @@ -145,26 +161,27 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; + if let Some(control) = control.take() { + control.release().await; + } + control = self.flush(ctx).await?; } } - Ok(chunk_len) + Ok((chunk_len, control)) } - async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { - let buf = self.buf.take().expect("must not use after an error"); + #[must_use = "caller must explcitly check the flush control"] + async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result> { + let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { - self.buf = Some(buf); - return Ok(()); + self.mutable = Some(buf); + return Ok(None); } - let slice = buf.flush(); - let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; - assert_eq!(nwritten, buf_len); - self.buf = Some(Buffer::reuse_after_flush( - slice.into_raw_slice().into_inner(), - )); - Ok(()) + let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?; + self.bytes_submitted += u64::try_from(buf_len).unwrap(); + self.mutable = Some(recycled); + Ok(Some(flush_control)) } } @@ -192,64 +209,77 @@ pub trait Buffer { fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; } -impl Buffer for BytesMut { - type IoBuf = BytesMut; +impl Buffer for IoBufferMut { + type IoBuf = IoBuffer; - #[inline(always)] fn cap(&self) -> usize { self.capacity() } fn extend_from_slice(&mut self, other: &[u8]) { - BytesMut::extend_from_slice(self, other) + if self.len() + other.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + + IoBufferMut::extend_from_slice(self, other); } - #[inline(always)] fn pending(&self) -> usize { self.len() } - fn flush(self) -> FullSlice { - self.slice_len() + fn flush(self) -> FullSlice { + self.freeze().slice_len() } - fn reuse_after_flush(mut iobuf: BytesMut) -> Self { - iobuf.clear(); - iobuf - } -} - -impl OwnedAsyncWriter for Vec { - async fn write_all( - &mut self, - buf: FullSlice, - _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.extend_from_slice(&buf[..]); - Ok((buf.len(), buf)) + /// Caller should make sure that `iobuf` only have one strong reference before invoking this method. + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let mut recycled = iobuf + .into_mut() + .expect("buffer should only have one strong reference"); + recycled.clear(); + recycled } } #[cfg(test)] mod tests { - use bytes::BytesMut; + use std::sync::Mutex; use super::*; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; - #[derive(Default)] + #[derive(Default, Debug)] struct RecorderWriter { - writes: Vec>, + /// record bytes and write offsets. + writes: Mutex, u64)>>, } + + impl RecorderWriter { + /// Gets recorded bytes and write offsets. + fn get_writes(&self) -> Vec> { + self.writes + .lock() + .unwrap() + .iter() + .map(|(buf, _)| buf.clone()) + .collect() + } + } + impl OwnedAsyncWriter for RecorderWriter { - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.writes.push(Vec::from(&buf[..])); - Ok((buf.len(), buf)) + ) -> std::io::Result> { + self.writes + .lock() + .unwrap() + .push((Vec::from(&buf[..]), offset)); + Ok(buf) } } @@ -257,71 +287,21 @@ mod tests { RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) } - macro_rules! write { - ($writer:ident, $data:literal) => {{ - $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) - .await?; - }}; - } - #[tokio::test] - async fn test_buffered_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"a"); - write!(writer, b"b"); - write!(writer, b"c"); - write!(writer, b"d"); - write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"abc"); - write!(writer, b"de"); - write!(writer, b""); - write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"a"); - write!(writer, b"bc"); - write!(writer, b"d"); - write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> { let ctx = test_ctx(); let ctx = &ctx; - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + let recorder = Arc::new(RecorderWriter::default()); + let gate = utils::sync::gate::Gate::default(); + let mut writer = BufferedWriter::<_, RecorderWriter>::new( + recorder, + || IoBufferMut::with_capacity(2), + gate.enter()?, + ctx, + ); writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"", ctx).await?; writer.write_buffered_borrowed(b"d", ctx).await?; writer.write_buffered_borrowed(b"e", ctx).await?; writer.write_buffered_borrowed(b"fg", ctx).await?; @@ -329,9 +309,9 @@ mod tests { writer.write_buffered_borrowed(b"j", ctx).await?; writer.write_buffered_borrowed(b"klmno", ctx).await?; - let recorder = writer.flush_and_into_inner(ctx).await?; + let (_, recorder) = writer.flush_and_into_inner(ctx).await?; assert_eq!( - recorder.writes, + recorder.get_writes(), { let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; expect diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs new file mode 100644 index 0000000000..9ce8b311bb --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -0,0 +1,314 @@ +use std::sync::Arc; + +use utils::sync::duplex; + +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, +}; + +use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; + +/// A handle to the flush task. +pub struct FlushHandle { + inner: Option>, + /// Immutable buffer for serving tail reads. + /// `None` if no flush request has been submitted. + pub(super) maybe_flushed: Option>, +} + +pub struct FlushHandleInner { + /// A bi-directional channel that sends (buffer, offset) for writes, + /// and receives recyled buffer. + channel: duplex::mpsc::Duplex, FullSlice>, + /// Join handle for the background flush task. + join_handle: tokio::task::JoinHandle>>, +} + +struct FlushRequest { + slice: FullSlice, + offset: u64, + #[cfg(test)] + ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>, + #[cfg(test)] + done_flush_tx: tokio::sync::oneshot::Sender<()>, +} + +/// Constructs a request and a control object for a new flush operation. +#[cfg(not(test))] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let request = FlushRequest { slice, offset }; + let control = FlushControl::untracked(); + + (request, control) +} + +/// Constructs a request and a control object for a new flush operation. +#[cfg(test)] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel(); + let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel(); + let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx); + + let request = FlushRequest { + slice, + offset, + ready_to_flush_rx, + done_flush_tx, + }; + (request, control) +} + +/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior. +#[cfg(test)] +pub(crate) struct FlushControl { + not_started: FlushNotStarted, +} + +#[cfg(not(test))] +pub(crate) struct FlushControl; + +impl FlushControl { + #[cfg(test)] + fn not_started( + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, + ) -> Self { + FlushControl { + not_started: FlushNotStarted { + ready_to_flush_tx, + done_flush_rx, + }, + } + } + + #[cfg(not(test))] + fn untracked() -> Self { + FlushControl + } + + /// In tests, turn flush control into a not started state. + #[cfg(test)] + pub(crate) fn into_not_started(self) -> FlushNotStarted { + self.not_started + } + + /// Release control to the submitted buffer. + /// + /// In `cfg(test)` environment, the buffer is guranteed to be flushed to disk after [`FlushControl::release`] is finishes execution. + pub async fn release(self) { + #[cfg(test)] + { + self.not_started + .ready_to_flush() + .wait_until_flush_is_done() + .await; + } + } +} + +impl FlushHandle +where + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, +{ + /// Spawns a new background flush task and obtains a handle. + /// + /// Note: The background task so we do not need to explicitly maintain a queue of buffers. + pub fn spawn_new( + file: Arc, + buf: B, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self + where + B: Buffer + Send + 'static, + { + // It is fine to buffer up to only 1 message. We only 1 message in-flight at a time. + let (front, back) = duplex::mpsc::channel(1); + + let join_handle = tokio::spawn(async move { + FlushBackgroundTask::new(back, file, gate_guard, ctx) + .run(buf.flush()) + .await + }); + + FlushHandle { + inner: Some(FlushHandleInner { + channel: front, + join_handle, + }), + maybe_flushed: None, + } + } + + /// Submits a buffer to be flushed in the background task. + /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. + /// If `save_buf_for_read` is true, then we save the buffer in `Self::maybe_flushed`, otherwise + /// clear `maybe_flushed`. + pub async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)> + where + B: Buffer + Send + 'static, + { + let slice = buf.flush(); + + // Saves a buffer for read while flushing. This also removes reference to the old buffer. + self.maybe_flushed = Some(slice.cheap_clone()); + + let (request, flush_control) = new_flush_op(slice, offset); + + // Submits the buffer to the background task. + let submit = self.inner_mut().channel.send(request).await; + if submit.is_err() { + return self.handle_error().await; + } + + // Wait for an available buffer from the background flush task. + // This is the BACKPRESSURE mechanism: if the flush task can't keep up, + // then the write path will eventually wait for it here. + let Some(recycled) = self.inner_mut().channel.recv().await else { + return self.handle_error().await; + }; + + // The only other place that could hold a reference to the recycled buffer + // is in `Self::maybe_flushed`, but we have already replace it with the new buffer. + let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner()); + Ok((recycled, flush_control)) + } + + async fn handle_error(&mut self) -> std::io::Result { + Err(self + .shutdown() + .await + .expect_err("flush task only disconnects duplex if it exits with an error")) + } + + /// Cleans up the channel, join the flush task. + pub async fn shutdown(&mut self) -> std::io::Result> { + let handle = self + .inner + .take() + .expect("must not use after we returned an error"); + drop(handle.channel.tx); + handle.join_handle.await.unwrap() + } + + /// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`. + /// This only happens if the handle is used after an error. + fn inner_mut(&mut self) -> &mut FlushHandleInner { + self.inner + .as_mut() + .expect("must not use after we returned an error") + } +} + +/// A background task for flushing data to disk. +pub struct FlushBackgroundTask { + /// A bi-directional channel that receives (buffer, offset) for writes, + /// and send back recycled buffer. + channel: duplex::mpsc::Duplex, FlushRequest>, + /// A writter for persisting data to disk. + writer: Arc, + ctx: RequestContext, + /// Prevent timeline from shuting down until the flush background task finishes flushing all remaining buffers to disk. + _gate_guard: utils::sync::gate::GateGuard, +} + +impl FlushBackgroundTask +where + Buf: IoBufAligned + Send + Sync, + W: OwnedAsyncWriter + Sync + 'static, +{ + /// Creates a new background flush task. + fn new( + channel: duplex::mpsc::Duplex, FlushRequest>, + file: Arc, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self { + FlushBackgroundTask { + channel, + writer: file, + _gate_guard: gate_guard, + ctx, + } + } + + /// Runs the background flush task. + /// The passed in slice is immediately sent back to the flush handle through the duplex channel. + async fn run(mut self, slice: FullSlice) -> std::io::Result> { + // Sends the extra buffer back to the handle. + self.channel.send(slice).await.map_err(|_| { + std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") + })?; + + // Exit condition: channel is closed and there is no remaining buffer to be flushed + while let Some(request) = self.channel.recv().await { + #[cfg(test)] + { + // In test, wait for control to signal that we are ready to flush. + if request.ready_to_flush_rx.await.is_err() { + tracing::debug!("control dropped"); + } + } + + // Write slice to disk at `offset`. + let slice = self + .writer + .write_all_at(request.slice, request.offset, &self.ctx) + .await?; + + #[cfg(test)] + { + // In test, tell control we are done flushing buffer. + if request.done_flush_tx.send(()).is_err() { + tracing::debug!("control dropped"); + } + } + + // Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer. + if self.channel.send(slice).await.is_err() { + // Although channel is closed. Still need to finish flushing the remaining buffers. + continue; + } + } + + Ok(self.writer) + } +} + +#[cfg(test)] +pub(crate) struct FlushNotStarted { + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushInProgress { + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushDone; + +#[cfg(test)] +impl FlushNotStarted { + /// Signals the background task the buffer is ready to flush to disk. + pub fn ready_to_flush(self) -> FlushInProgress { + self.ready_to_flush_tx + .send(()) + .map(|_| FlushInProgress { + done_flush_rx: self.done_flush_rx, + }) + .unwrap() + } +} + +#[cfg(test)] +impl FlushInProgress { + /// Waits until background flush is done. + pub async fn wait_until_flush_is_done(self) -> FlushDone { + self.done_flush_rx.await.unwrap(); + FlushDone + } +} From d0289299457dc6d1caece326553af47aa976e80a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 17:42:17 +0000 Subject: [PATCH 62/65] chore: update clap (#10009) This updates clap to use a new version of anstream --- Cargo.lock | 62 +++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 38158b7aec..de8785f87e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,16 +84,16 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", + "is_terminal_polyfill", "utf8parse", ] @@ -123,12 +123,12 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -1167,35 +1167,33 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.0" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" +checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.0" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" +checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1" dependencies = [ "anstream", "anstyle", - "bitflags 1.3.2", "clap_lex", - "strsim", + "strsim 0.11.1", ] [[package]] name = "clap_derive" -version = "4.3.0" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -1203,9 +1201,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "colorchoice" @@ -1614,7 +1612,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.10.0", "syn 2.0.90", ] @@ -1812,7 +1810,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" dependencies = [ "darling", "either", - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -2465,12 +2463,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -2888,6 +2880,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -3169,7 +3167,7 @@ version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -4458,7 +4456,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", - "heck 0.5.0", + "heck", "itertools 0.12.1", "log", "multimap", @@ -6166,6 +6164,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.3" @@ -6178,7 +6182,7 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "rustversion", From ecdad5e6d5b614812b000acb24a99d5d7d2e56f4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 21:07:44 +0000 Subject: [PATCH 63/65] chore: update rust-postgres (#10002) Like #9931 but without rebasing upstream just yet, to try and minimise the differences. Removes all proxy-specific commits from the rust-postgres fork, now that proxy no longer depends on them. Merging upstream changes to come later. --- Cargo.lock | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de8785f87e..62f06d45bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4169,7 +4169,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -4182,7 +4182,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "base64 0.20.0", "byteorder", @@ -4195,7 +4195,6 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", - "tokio", ] [[package]] @@ -4217,7 +4216,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -6547,7 +6546,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "async-trait", "byteorder", From a83bd4e81c82680b4d74add46c6096b3ed2d9ce4 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 4 Dec 2024 21:16:09 -0500 Subject: [PATCH 64/65] pageserver: fix buffered-writer on macos build (#10019) ## Problem In https://github.com/neondatabase/neon/pull/9693, we forgot to check macos build. The [CI run](https://github.com/neondatabase/neon/actions/runs/12164541897/job/33926455468) on main showed that macos build failed with unused variables and dead code. ## Summary of changes - add `allow(dead_code)` and `allow(unused_variables)` to the relevant code that is not used on macos. Signed-off-by: Yuchen Liang --- pageserver/src/tenant/remote_timeline_client/download.rs | 7 ++++--- pageserver/src/virtual_file/owned_buffers_io/write.rs | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c5ae466f3a..d15f161fb6 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,7 +6,6 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; -use std::sync::Arc; use std::time::SystemTime; use anyhow::{anyhow, Context}; @@ -27,7 +26,7 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{ DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, @@ -150,7 +149,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, - gate: &utils::sync::gate::Gate, + #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -209,6 +208,8 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io; + use crate::virtual_file::IoBufferMut; + use std::sync::Arc; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 20bf878123..7299d83703 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -132,6 +132,7 @@ where .expect("must not use after we returned an error") } + #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered_borrowed( &mut self, chunk: &[u8], From 0fd211537baed6178d1d6be2928493cb02346c20 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 5 Dec 2024 07:30:38 +0200 Subject: [PATCH 65/65] proxy: Present new auth backend cplane_proxy_v1 (#10012) Implement a new auth backend based on the current Neon backend to switch to the new Proxy V1 cplane API. Implements [#21048](https://github.com/neondatabase/cloud/issues/21048) --- proxy/src/auth/backend/mod.rs | 4 + proxy/src/bin/proxy.rs | 99 +++- .../control_plane/client/cplane_proxy_v1.rs | 514 ++++++++++++++++++ proxy/src/control_plane/client/mod.rs | 7 + proxy/src/control_plane/client/neon.rs | 2 +- proxy/src/control_plane/messages.rs | 10 + 6 files changed, 634 insertions(+), 2 deletions(-) create mode 100644 proxy/src/control_plane/client/cplane_proxy_v1.rs diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 84a572dcf9..1bad7b3086 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -70,6 +70,10 @@ impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { + ControlPlaneClient::ProxyV1(endpoint) => fmt + .debug_tuple("ControlPlane::ProxyV1") + .field(&endpoint.url()) + .finish(), ControlPlaneClient::Neon(endpoint) => fmt .debug_tuple("ControlPlane::Neon") .field(&endpoint.url()) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index c929b97d78..99144acef0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -46,6 +46,9 @@ enum AuthBackendType { #[value(name("console"), alias("cplane"))] ControlPlane, + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + #[value(name("link"), alias("control-redirect"))] ConsoleRedirect, @@ -518,6 +521,39 @@ async fn main() -> anyhow::Result<()> { .instrument(span), ); } + } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } } } @@ -662,6 +698,65 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); + + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -697,13 +792,15 @@ fn build_auth_backend( )?)); tokio::spawn(locks.garbage_collect_worker()); - let url = args.auth_endpoint.parse()?; + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(url, http::new_client()); let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + let api = control_plane::client::neon::NeonControlPlaneClient::new( endpoint, args.control_plane_token.clone(), diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs new file mode 100644 index 0000000000..e33a37f643 --- /dev/null +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -0,0 +1,514 @@ +//! Production console backend. + +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; +use futures::TryFutureExt; +use postgres_client::config::SslMode; +use tokio::time::Instant; +use tracing::{debug, info, info_span, warn, Instrument}; + +use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestContext; +use crate::control_plane::caches::ApiCaches; +use crate::control_plane::errors::{ + ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::control_plane::{ + AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, +}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, http, scram}; + +const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); + +#[derive(Clone)] +pub struct NeonControlPlaneClient { + endpoint: http::Endpoint, + pub caches: &'static ApiCaches, + pub(crate) locks: &'static ApiLocks, + pub(crate) wake_compute_endpoint_rate_limiter: Arc, + // put in a shared ref so we don't copy secrets all over in memory + jwt: Arc, +} + +impl NeonControlPlaneClient { + /// Construct an API object containing the auth parameters. + pub fn new( + endpoint: http::Endpoint, + jwt: Arc, + caches: &'static ApiCaches, + locks: &'static ApiLocks, + wake_compute_endpoint_rate_limiter: Arc, + ) -> Self { + Self { + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + jwt, + } + } + + pub(crate) fn url(&self) -> &str { + self.endpoint.url().as_str() + } + + async fn do_get_auth_info( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let request = self + .endpoint + .get_path("get_endpoint_access_control") + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ("role", user_info.user.as_str()), + ]) + .build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = match parse_body::(response).await { + Ok(body) => body, + // Error 404 is special: it's ok not to have a secret. + // TODO(anna): retry + Err(e) => { + return if e.get_reason().is_not_found() { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. + Ok(AuthInfo::default()) + } else { + Err(e.into()) + }; + } + }; + + // Ivan: don't know where it will be used, so I leave it here + let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + + let secret = if body.role_secret.is_empty() { + None + } else { + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthSecret::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + Some(secret) + }; + let allowed_ips = body.allowed_ips.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); + Ok(AuthInfo { + secret, + allowed_ips, + project_id: body.project_id, + }) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_auth_info")) + .await + } + + async fn do_get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &endpoint.normalize()) + { + return Err(GetEndpointJwksError::EndpointNotFound); + } + let request_id = ctx.session_id().to_string(); + async { + let request = self + .endpoint + .get_with_url(|url| { + url.path_segments_mut() + .push("endpoints") + .push(endpoint.as_str()) + .push("jwks"); + }) + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + + let body = parse_body::(response).await?; + + let rules = body + .jwks + .into_iter() + .map(|jwks| AuthRule { + id: jwks.id, + jwks_url: jwks.jwks_url, + audience: jwks.jwt_audience, + role_names: jwks.role_names, + }) + .collect(); + + Ok(rules) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_endpoint_jwks")) + .await + } + + async fn do_wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let mut request_builder = self + .endpoint + .get_path("wake_compute") + .header("X-Request-ID", &request_id) + .header("Authorization", format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ]); + + let options = user_info.options.to_deep_object(); + if !options.is_empty() { + request_builder = request_builder.query(&options); + } + + let request = request_builder.build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = parse_body::(response).await?; + + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&body.address) { + None => return Err(WakeComputeError::BadComputeAddress(body.address)), + Some(x) => x, + }; + + // Don't set anything but host and port! This config will be cached. + // We'll set username and such later using the startup message. + // TODO: add more type safety (in progress). + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + + let node = NodeInfo { + config, + aux: body.aux, + allow_self_signed_compute: false, + }; + + Ok(node) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_wake_compute")) + .await + } +} + +impl super::ControlPlaneApi for NeonControlPlaneClient { + #[tracing::instrument(skip_all)] + async fn get_role_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + let user = &user_info.user; + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { + return Ok(role_secret); + } + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_ips), + ); + ctx.set_project_id(project_id); + } + // When we just got a secret, we don't need to invalidate it. + Ok(Cached::new_uncached(auth_info.secret)) + } + + async fn get_allowed_ips_and_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); + return Ok((allowed_ips, None)); + } + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let user = &user_info.user; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) + } + + #[tracing::instrument(skip_all)] + async fn get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + self.do_get_endpoint_jwks(ctx, endpoint).await + } + + #[tracing::instrument(skip_all)] + async fn wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let key = user_info.endpoint_cache_key(); + + macro_rules! check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + + // Every time we do a wakeup http request, the compute node will stay up + // for some time (highly depends on the console's scale-to-zero policy); + // The connection info remains the same during that period of time, + // which means that we might cache it to reduce the load and latency. + check_cache!(); + + let permit = self.locks.get_permit(&key).await?; + + // after getting back a permit - it's possible the cache was filled + // double check + if permit.should_check_cache() { + // TODO: if there is something in the cache, mark the permit as success. + check_cache!(); + } + + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); + + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; + + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); + + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + } + + // at this point, we should only have quota errors. + debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(err.clone()), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))) + } + err => return Err(err), + }, + } + } +} + +/// Parse http response body, taking status code into account. +async fn parse_body serde::Deserialize<'a>>( + response: http::Response, +) -> Result { + let status = response.status(); + if status.is_success() { + // We shouldn't log raw body because it may contain secrets. + info!("request succeeded, processing the body"); + return Ok(response.json().await?); + } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. + info!("response_error plaintext: {:?}", s); + + // Don't throw an error here because it's not as important + // as the fact that the request itself has failed. + let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { + warn!("failed to parse error body: {e}"); + ControlPlaneErrorMessage { + error: "reason unclear (malformed error message)".into(), + http_status_code: status, + status: None, + } + }); + body.http_status_code = status; + + warn!("console responded with an error ({status}): {body:?}"); + Err(ControlPlaneError::Message(Box::new(body))) +} + +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port_v4() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index f8f74372f0..7ef5a9c9fd 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -1,3 +1,4 @@ +pub mod cplane_proxy_v1; #[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; @@ -27,6 +28,8 @@ use crate::types::EndpointId; #[non_exhaustive] #[derive(Clone)] pub enum ControlPlaneClient { + /// New Proxy V1 control plane API + ProxyV1(cplane_proxy_v1::NeonControlPlaneClient), /// Current Management API (V2). Neon(neon::NeonControlPlaneClient), /// Local mock control plane. @@ -45,6 +48,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await, Self::Neon(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, @@ -61,6 +65,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { + Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, @@ -75,6 +80,7 @@ impl ControlPlaneApi for ControlPlaneClient { endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError> { match self { + Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await, Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, @@ -89,6 +95,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await, Self::Neon(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 5c204ae1d7..bf62c0d6ab 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -1,4 +1,4 @@ -//! Production console backend. +//! Stale console backend, remove after migrating to Proxy V1 API (#15245). use std::sync::Arc; use std::time::Duration; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8762ba874b..2662ab85f9 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -230,6 +230,16 @@ pub(crate) struct GetRoleSecret { pub(crate) project_id: Option, } +/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. +/// Returned by the `/get_endpoint_access_control` API method. +#[derive(Deserialize)] +pub(crate) struct GetEndpointAccessControl { + pub(crate) role_secret: Box, + pub(crate) allowed_ips: Option>, + pub(crate) project_id: Option, + pub(crate) allowed_vpc_endpoint_ids: Option>, +} + // Manually implement debug to omit sensitive info. impl fmt::Debug for GetRoleSecret { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {