Compare commits

...

79 Commits

Author SHA1 Message Date
Bojan Serafimov
c431a305aa WIP 2022-11-16 11:40:39 -05:00
Alexander Bayandin
2b728bc69e test_forward_compatibility: fix path to pg_distrib_dir (#2826)
Set correct `pg_distrib_dir` in `pageserver.toml` and in neon_local
`config`.

`test_forward_compatibility` shows flakiness during `neon_local pg
start`, so hopefully, the patch will help.

```
2022-11-15 16:07:34.091 GMT [13338] LOG:  starting with zenith basebackup at LSN 0/A6A9310, prev 0/0
2022-11-15 16:07:34.091 GMT [13338] FATAL:  cannot start in read-write mode from this base backup
2022-11-15 16:07:34.091 GMT [13337] LOG:  startup process (PID 13338) exited with exit code 1
```
2022-11-16 15:14:36 +00:00
Kirill Bulatov
5184685ced Revert "Introduce aws-sdk-rust as rusoto S3 replacement (#2802)" (#2837)
Despite tests working, on staging the library started to fail with the
following error:

```
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]: 2022-11-16T11:53:37.191211Z  INFO init_tenant_mgr:local_tenant_timeline_files: Collected files for 16 tenants
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]: thread 'main' panicked at 'A connector was not available. Either set a custom connector or enable the `rustls` and `native-tls` crate featu>
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]: stack backtrace:
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    0: rust_begin_unwind
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/std/src/panicking.rs:584:5
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    1: core::panicking::panic_fmt
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/panicking.rs:142:14
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    2: core::panicking::panic_display
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/panicking.rs:72:5
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    3: core::panicking::panic_str
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/panicking.rs:56:5
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    4: core::option::expect_failed
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/option.rs:1854:5
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    5: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    6: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    7: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    8: <aws_types::credentials::provider::future::ProvideCredentials as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:    9: <tracing::instrument::Instrumented<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   10: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   11: <aws_types::credentials::provider::future::ProvideCredentials as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   12: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   13: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   14: <aws_smithy_http_tower::map_request::MapRequestFuture<F,E> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   15: <core::pin::Pin<P> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/future.rs:124:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   16: <aws_smithy_http_tower::parse_response::ParseResponseService<InnerService,ResponseHandler,RetryPolicy> as tower_service::Service<aws_>
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/aws-smithy-http-tower-0.51.0/src/parse_response.rs:109:34
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   17: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   18: <tracing::instrument::Instrumented<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tracing-0.1.37/src/instrument.rs:272:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   19: <core::pin::Pin<P> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/future.rs:124:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   20: <aws_smithy_client::timeout::TimeoutServiceFuture<InnerFuture> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/aws-smithy-client-0.51.0/src/timeout.rs:189:70
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   21: <tower::retry::future::ResponseFuture<P,S,Request> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tower-0.4.13/src/retry/future.rs:77:41
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   22: <aws_smithy_client::timeout::TimeoutServiceFuture<InnerFuture> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/aws-smithy-client-0.51.0/src/timeout.rs:189:70
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   23: aws_smithy_client::Client<C,M,R>::call_raw::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/aws-smithy-client-0.51.0/src/lib.rs:227:56
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   24: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   25: aws_smithy_client::Client<C,M,R>::call::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/aws-smithy-client-0.51.0/src/lib.rs:184:29
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   26: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   27: aws_sdk_s3::client::fluent_builders::GetObject::send::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/aws-sdk-s3-0.21.0/src/client.rs:7735:40
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   28: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   29: remote_storage::s3_bucket::S3Bucket::download_object::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at libs/remote_storage/src/s3_bucket.rs:205:20
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   30: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   31: <remote_storage::s3_bucket::S3Bucket as remote_storage::RemoteStorage>::download::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at libs/remote_storage/src/s3_bucket.rs:399:11
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   32: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   33: <core::pin::Pin<P> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/future.rs:124:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   34: remote_storage::GenericRemoteStorage::download_storage_object::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at libs/remote_storage/src/lib.rs:264:55
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   35: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   36: pageserver::storage_sync::download::download_index_part::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at pageserver/src/storage_sync/download.rs:148:57
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   37: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   38: pageserver::storage_sync::download::download_index_parts::{{closure}}::{{closure}}::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at pageserver/src/storage_sync/download.rs:77:75
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   39: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   40: <futures_util::stream::futures_unordered::FuturesUnordered<Fut> as futures_core::stream::Stream>::poll_next
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/futures-util-0.3.24/src/stream/futures_unordered/mod.rs:514:17
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   41: futures_util::stream::stream::StreamExt::poll_next_unpin
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/futures-util-0.3.24/src/stream/stream/mod.rs:1626:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   42: <futures_util::stream::stream::next::Next<St> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/futures-util-0.3.24/src/stream/stream/next.rs:32:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   43: pageserver::storage_sync::download::download_index_parts::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at pageserver/src/storage_sync/download.rs:80:69
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   44: <core::future::from_generator::GenFuture<T> as core::future::future::Future>::poll
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/future/mod.rs:91:19
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   45: tokio::park:🧵:CachedParkThread::block_on::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.21.1/src/park/thread.rs:267:54
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   46: tokio::coop::with_budget::{{closure}}
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.21.1/src/coop.rs:102:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   47: std:🧵:local::LocalKey<T>::try_with
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/std/src/thread/local.rs:445:16
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   48: std:🧵:local::LocalKey<T>::with
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/std/src/thread/local.rs:421:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   49: tokio::coop::with_budget
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.21.1/src/coop.rs:95:5
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   50: tokio::coop::budget
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.21.1/src/coop.rs:72:5
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   51: tokio::park:🧵:CachedParkThread::block_on
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.21.1/src/park/thread.rs:267:31
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   52: tokio::runtime::enter::Enter::block_on
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.21.1/src/runtime/enter.rs:152:13
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   53: tokio::runtime::scheduler::multi_thread::MultiThread::block_on
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.21.1/src/runtime/scheduler/multi_thread/mod.rs:79:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   54: tokio::runtime::Runtime::block_on
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /home/nonroot/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.21.1/src/runtime/mod.rs:492:44
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   55: pageserver::storage_sync::spawn_storage_sync_task
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at pageserver/src/storage_sync.rs:656:34
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   56: pageserver::tenant_mgr::init_tenant_mgr
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at pageserver/src/tenant_mgr.rs:88:13
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   57: pageserver::start_pageserver
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at pageserver/src/bin/pageserver.rs:269:9
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   58: pageserver::main
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at pageserver/src/bin/pageserver.rs:103:5
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:   59: core::ops::function::FnOnce::call_once
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]:              at /rustc/e092d0b6b43f2de967af0887873151bb1c0b18d3/library/core/src/ops/function.rs:248:5
Nov 16 11:53:37 pageserver-0.us-east-2.aws.neon.build pageserver[481974]: note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.
```

Feels like better testing on the env is needed later, maybe more e2e
tests have to be written (albeit we have download tests, so something
else happens here, tls issues?)
2022-11-16 15:10:36 +00:00
Heikki Linnakangas
9ae4da4f31 Silence test failure caused by race condition between GC and detach.
Thanks to the race condition, GC sometimes fails with "no such file or
directory" error, if the tenant is detached concurrently. That's a
known issue, but it didn't cause test failures until we started to
check for unexpected ERRORs in the log in commit 46d30bf054. We should
fix the race condition, of course, but until we do, let's silence the
failures.
2022-11-16 15:50:49 +02:00
Sergey Melnikov
aca221ac8b Switch old staging to new etcd (#2834) 2022-11-16 16:54:55 +04:00
Heikki Linnakangas
d013a2b227 Make test_gc_cutoff test more robust.
Previously, if the failpoint was not reached for some reason, the test
would only fail because it would reach the 5 minute timeout we have on
all python tests. That's very subtle. Make it fail explicitly, if the
failpoint is not hit on each iteration of the loop.

Extracted from a larger PR, see
https://github.com/neondatabase/neon/pull/2785/files#r1022765794
2022-11-16 13:24:02 +02:00
Heikki Linnakangas
3f93c6c6f0 Improve checks for broken tenants in test_broken_timeline.py
- Refactor the code a little bit, removing the silly for-loop over a
  single element.

- Make it more clear in log messages that the errors are expectd

- Check for a more precise error message "Failed to load delta layer"
  instead of just "extracting base backup failed".
2022-11-16 13:16:00 +02:00
Rory de Zoete
53267969d7 Preparation for ARM runners (#2751)
Need to make the runner tag more specific else we inadvertently might
run workloads on the wrong arch

Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
Co-authored-by: Rory de Zoete <rdezoete@RorysMacStudio.fritz.box>
2022-11-16 11:28:57 +01:00
Andrés
c4b417ecdb Introduce aws-sdk-rust as rusoto S3 replacement (#2802)
- `aws-smithy-http`: Needed because of `SdkBody` see
https://github.com/awslabs/smithy-rs/issues/1759
- `aws-types`: Needed because of `SharedCredentialsProvider`, the
recommended way from aws is something like
`aws_config::from_env().region("us-east-1").load().await` but that is
problematic because of:

- `sync -> async ` in the creation of S3Client and i don't want to
change the signature of any method in this class.
- We do not need the four default steps in
https://github.com/awslabs/aws-sdk-rust/blob/main/sdk/aws-config/src/default_provider/credentials.rs#L235

- `Hyper`: Similar to what's currently doing Rusoto in
https://github.com/rusoto/rusoto/blob/master/rusoto/signature/src/signature.rs#L59
to stream the body, see also
https://github.com/awslabs/aws-sdk-rust/discussions/361

Co-authored-by: andres <andres.rodriguez@outlook.es>
2022-11-16 11:28:37 +02:00
Joonas Koivunen
1d105727cb perf: simple walredo bench (#2816)
adds a simple walredo bench to allow some comparison of the walredo
throughput.

Cc: #1339, #2778
2022-11-16 11:13:56 +02:00
Heikki Linnakangas
4787a744c2 Add documentation page about error handling and logging. (#2681)
Add a page to the internal documentation, on how we do error
handling and logging.
2022-11-16 10:38:03 +02:00
Sergey Melnikov
ac3ccac56c Add zenith-1-ps-4 and zenith-1-ps-5 (#2815) 2022-11-16 11:25:24 +04:00
Alexander Bayandin
638af96c51 postgres-v15: fix expected results for regress tests (#2822)
Fix expected output for regress tests for Postgres 15.

Required for https://github.com/neondatabase/neon/pull/2809
2022-11-15 22:32:12 +00:00
Kirill Bulatov
1e21ca1afe Trim whitespaces off Lsn strings when parsing (#2827) 2022-11-15 22:39:44 +02:00
Heikki Linnakangas
46d30bf054 Check for errors in pageserver log after each test.
If there are any unexpected ERRORs or WARNs in pageserver.log after test
finishes, fail the test. This requires whitelisting the errors that *are*
expected in each test, and there's also a few common errors that are
printed by most tests, which are whitelisted in the fixture itself.

With this, we don't need the special abort() call in testing mode, when
compaction or GC fails. Those failures will print ERRORs to the logs,
which will be picked up by this new mechanisms.

A bunch of errors are currently whitelisted that we probably shouldn't
be emitting in the first place, but fixing those is out of scope for this
commit, so I just left FIXME comments on them.
2022-11-15 18:47:28 +02:00
Heikki Linnakangas
d0105cea1f Avoid errors when removing a timeline that's still active 2022-11-15 18:47:28 +02:00
Heikki Linnakangas
e44e4a699b Downgrade log message, if client terminates COPY during basebackup import
It's more or less expected from pageserver's point of view. Change the
error kind to ConnectionReset, so that it gets logged at INFO level
instead of ERROR.
2022-11-15 18:47:28 +02:00
Heikki Linnakangas
223834a420 Fix confusion between Postgres and pageserver connection string in test.
We passed the pageserver's libpq endpoint URL as the 'compute_ctl
--connstr' argument, but that was bogus: the --connstr URL is supposed
to be the URL to the *Postgres* instance that compute_ctl launches and
monitors, not to the pageserver. compute_ctl does need the pageserver
URL too, but it is read from the cluster spec JSON, not --connstr.

That was pretty confusing, as you got a lot of "unknown command"
errors in the pageserver log, when compute_tools tries to run regular
SQL commands on the pageserver. The test still passed, however, as it
doesn't require the SQL commands to succeed. But to make this less
confusing, use an invalid hostname instead, so that the queries will
fail to even connect.
2022-11-15 18:47:28 +02:00
MMeent
01778e37cc Address issues in the pagestore prefetch mechanism: (#2790)
- Update vendored PostgreSQL to address prefetch issues
 - Make flushed state explicit in PrefetchState
 - Move flush logic into prefetch_wait_for, where possible
 - Clean up some prefetch state handling code in the various code
elements handling state transitions.
 - Fix a race condition in neon_read_at_lsn where a hash entry pointer
was used after the hash table was updated. This could result in
incorrect state transitions and assertion failures after disconnects
during prefetch_wait_for in that neon_read_at_lsn.
 
Fixes #2780
2022-11-15 15:12:38 +01:00
Alexander Bayandin
03190a2161 GitHub Actions: Do not create Allure report for cancelled jobs (#2813)
If a workflow is cancelled, do not delay its finishing by creating an allure
report.
2022-11-15 10:27:59 +00:00
Kirill Bulatov
f87017c04d Omit dependencies' debug info (#2803)
Based on https://neondb.slack.com/archives/C0277TKAJCA/p1668079753506749

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
2022-11-14 12:44:41 +00:00
andres
c11cbf0f5c fix test_compare_child_and_root_pgbench_perf to do a fair comparison 2022-11-13 21:03:54 +02:00
Heikki Linnakangas
f30ef00439 Stop building the legacy "compute-node" docker image.
Before we had separate images for v14 and v15, the compute node image
was called just "neondatabase/compute-node". It has been superseded by
the "neondatabase/compute-node-v14" and "neondatabase/compute-node-v15"
images. The old image is not used by the cloud console build or tests
anymore.
2022-11-12 20:48:10 +02:00
Heikki Linnakangas
dbe5b52494 Avoid some vector-growing overhead.
I saw this in 'perf' profile of a sequential scan:

> -   31.93%     0.21%  compute request  pageserver         [.] <pageserver::walredo::PostgresRedoManager as pageserver::walredo::WalRedoManager>::request_redo
>    - 31.72% <pageserver::walredo::PostgresRedoManager as pageserver::walredo::WalRedoManager>::request_redo
>       - 31.26% pageserver::walredo::PostgresRedoManager::apply_batch_postgres
>          + 7.64% <std::process::ChildStdin as std::io::Write>::write
>          + 6.17% nix::poll::poll
>          + 3.58% <std::process::ChildStderr as std::io::Read>::read
>          + 2.96% std::sync::condvar::Condvar::notify_one
>          + 2.48% std::sys::unix::locks::futex::Condvar::wait
>          + 2.19% alloc::raw_vec::RawVec<T,A>::reserve::do_reserve_and_handle
>          + 1.14% std::sys::unix::locks::futex::Mutex::lock_contended
>            0.67% __rust_alloc_zeroed
>            0.62% __stpcpy_ssse3
>            0.56% std::sys::unix::locks::futex::Mutex::wake

Note the 'do_reserve_handle' overhead. That's caused by having to grow
the buffer used to construct the WAL redo request. This commit
eliminates that overhead. It's only about 2% of the overall CPU usage,
but every little helps.

Also reuse the temp buffer when reading records from a DeltaLayer, and
call Vec::reserve to avoid growing a buffer when reading a blob across
pages. I saw a reduction from 2% to 1% of CPU spent in
do_reserve_and_handle in that codepath, but that's such a small change
that it could be just noise. Seems like it shouldn't hurt though.
2022-11-12 18:52:25 +02:00
Heikki Linnakangas
4131a6efae Remove unused Dockerfile.compute-node.legacy.
The cloud end-to-end tests use the docker images built by the neon PR
now, and don't need this legacy Dockerfile anymore.
2022-11-12 18:51:51 +02:00
Kirill Bulatov
03695261fc Test storage Docker images (#2767)
Closes https://github.com/neondatabase/neon/issues/2697
Example:
https://github.com/neondatabase/neon/actions/runs/3416774593/jobs/5688394855

Adds a set of tests on the storage Docker images before they are pushed
to the public registries:
* tests that pageserver binary has the correct version string (other
binaries are built with the same library, so it should be enough to test
one)
* tests that the compose file set-up works and all components are able
to start and perform a single SQL query (CREATE TABLE)
2022-11-11 19:42:26 +02:00
bojanserafimov
7fd88fab59 Trace read requests (#2762) 2022-11-10 16:43:04 -05:00
bojanserafimov
7edc098c40 Add perf test instructions (#2777) 2022-11-10 16:05:57 -05:00
Vadim Kharitonov
8421218152 Change the branch name for V14 as it does for V15 2022-11-10 17:41:36 +01:00
Alexander Bayandin
d5b7832c21 Fix test_wal_backpressure tests (#2792)
Fix expected return type for `fetchone `:

```
AssertionError: assert False
 +  where False = isinstance((Decimal('56048'), '55 kB', '0/1CF52D8', '0/1CE77E8'), list)
```
2022-11-10 16:15:04 +00:00
Arthur Petukhovsky
c6072d38c2 Remove debug logs in should_walsender_stop (#2791) 2022-11-10 15:49:00 +00:00
Alexander Bayandin
175779c0ef GitHub Actions: fix non-parallel benchmarks on CI (#2787)
Fix non-parallel pytest run by setting `--dist=loadgroup` only for
pytest command with xdist enabled (`-n` is set)
2022-11-10 12:51:47 +00:00
Christian Schwarz
8654e95fae walredo: fix zombie processes ([postgres] <defunct>)
This change wraps the std::process:Child that we spawn for WAL redo
into a type that ensures that we try to SIGKILL + waitpid() on it.

If there is no explicit call to kill_and_wait(), the Drop implementation
will spawns a task that does it in the BACKGROUND_RUNTIME.
That's an ugly hack but I think it's better than doing kill+wait
synchronously from Drop, since I think the general assumption in the
Rust ecosystem is that Drop doesn't block.
Especially since the drop sites can be _any_ place that drops the last
Arc<PostgresRedoManager>, e.g., compaction or GC.

The benefit of having the new type over just adding a Drop impl to
PostgresRedoProcess is that we can construct it earlier than the full
PostgresRedoProcess in PostgresRedoProcess::launch().
That allows us to correctly kill+wait the child if there is an error in
PostgresRedoProcess::launch() after spawning it.

I also took a stab at a regression test. I manually verified
that it fails before the fix to walredo.rs.

fixes https://github.com/neondatabase/neon/issues/2761
closes https://github.com/neondatabase/neon/pull/2776
2022-11-10 12:50:50 +01:00
Vadim Kharitonov
f720dd735e Stricter mypy linters for test_runner/fixtures/* 2022-11-10 12:47:27 +01:00
Alexander Bayandin
c4f9f1dc6d Add data format forward compatibility tests (#2766)
Add `test_forward_compatibility`, which checks if it's going to
be possible to roll back a release to the previous version.
The test uses artifacts (Neon & Postgres binaries) from the previous
release to start Neon on the repo created by the current version. It
performs exactly the same checks as `test_backward_compatibility` does.

Single `ALLOW_BREAKING_CHANGES` env var got replaced by
`ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE` &
`ALLOW_FORWARD_COMPATIBILITY_BREAKAGE` and can be set by `backward
compatibility breakage` and `forward compatibility breakage` labels
respectively.
2022-11-10 09:06:34 +00:00
Kirill Bulatov
4a10e1b066 Pass pushed storage Docker tag to e2e jobs 2022-11-10 08:50:42 +02:00
Vadim Kharitonov
b55466045e Introduce codeowners 2022-11-09 11:43:10 +01:00
Heikki Linnakangas
e999f66b01 Use a cached WaitEventSet instead of WaitLatchOrSocket.
When we repeatedly wait for the same events, it's faster to create the
event set once and reuse it. While testing with a sequential scan test
case, I saw WaitLatchOrSocket consuming a lot of CPU:

> -   40.52%     0.14%  postgres  postgres           [.] WaitLatchOrSocket
>    - 40.38% WaitLatchOrSocket
>       + 17.83% AddWaitEventToSet
>       + 9.47% close@plt
>       + 8.29% CreateWaitEventSet
>       + 4.57% WaitEventSetWait

This eliminates most of that overhead.
2022-11-08 19:45:14 +02:00
andres
1cf257bc4a feedback 2022-11-08 20:15:54 +04:00
andres
40164bd589 Use latestMsgReceivedAt in walproposer 2022-11-08 20:15:54 +04:00
Christian Schwarz
c3a470a29b walredo process management: handle every error on the kill() and drop path
If we're not calling kill() before dropping the PostgresRedoProcess, we
currently leak it.
That's most likely the root cause for #2761.
This patch
1. adds an error log message for that case and
2. adds error handling for all errors on the kill() path. If we're a
`testing` build, we panic. Otherwise, we log an error and leak the
process.

The error handling changes (2) are necessary to conclusively state that
the root cause for #2761 is indeed (1). If we didn't have them, the root
cause could be missing error handling instead.

To make the log messages useful, I've added tracing::instrument
attributes that log the tenant_id and PID. That helps mapping back the
PID of `defunct` processes to pageserver log messages. Note that a
defunct process's `/proc/$PID/` directory isn't very useful. We have
left little more than its PID.

Once we have validated the root cause, we'll find a fix, but that's
still an ongoing discussion.

refs https://github.com/neondatabase/neon/issues/2761
closes https://github.com/neondatabase/neon/pull/2769
2022-11-08 14:03:13 +01:00
Alexander Bayandin
c1a76eb0e5 test_runner: replace global variables with fixtures (#2754)
This PR replaces the following global variables in the test framework
with fixtures to make tests more configurable. I mainly need this for
the forward compatibility tests (draft in
https://github.com/neondatabase/neon/pull/2766).

```
base_dir
neon_binpath 
pg_distrib_dir
top_output_dir
default_pg_version (this one got replaced with a fixture named pg_version)
```

Also, this PR adds more `Path` type where the code implies it.
2022-11-07 18:39:51 +00:00
MMeent
d5b6471fa9 Update prefetch mechanism: (#2687)
Prefetch requests and responses are stored in a ringbuffer instead of a
queue, which means we can utilize prefetches of many relations
concurrently -- page reads of un-prefetched relations now don't imply
dropping prefetches.

In a future iteration, this may detect sequential scans based on the
read behavior of sequential scans, and will dynamically prefetch buffers
for such relations as needed. Right now, it still depends on explicit
prefetch requests from PostgreSQL.

The main improvement here is that we now have a buffer for prefetched
pages of 128 entries with random access. Before, we had a similarly sized
cache, but this cache did not allow for random access, which resulted in
dropped entries when multiple systems used the prefetching subsystem
concurrently.

See also: #2544
2022-11-07 18:13:24 +02:00
Joonas Koivunen
548d472b12 fix: logical size query at before initdb_lsn (#2755)
With more realistic selection of gc_horizon in tests there is an
immediate failure with trying to query logical size with lsn <
initdb_lsn. Fixes that, adds illustration gathered from clarity of
explaining this tenant size calculation to more people.

Cc: #2748, #2599.
2022-11-07 12:03:57 +02:00
Dmitry Rodionov
99e745a760 review adjustments 2022-11-06 13:42:18 +02:00
Dmitry Rodionov
15d970f731 decrease diff by moving check_checkpoint_distance back
Co-authored-by: Christian Schwarz <me@cschwarz.com>
2022-11-06 13:42:18 +02:00
Heikki Linnakangas
7b7f84f1b4 Refactor layer flushing task
Extracted from https://github.com/neondatabase/neon/pull/2595
2022-11-06 13:42:18 +02:00
Alexander Bayandin
bc40a5595f test_runner: update cryptography (#2753)
Bump `cryptography` package from 37.0.4 to 38.0.3 to fix
https://github.com/neondatabase/neon/security/dependabot/6 (rather just
in case).

Ref https://www.openssl.org/news/secadv/20221101.txt
2022-11-04 21:29:51 +00:00
Heikki Linnakangas
07b3ba5ce3 Bump postgres submodules, for some cosmetic cleanups. 2022-11-04 20:30:13 +02:00
Dmitry Ivanov
c38f38dab7 Move pq_proto to its own crate 2022-11-03 22:56:04 +03:00
bojanserafimov
71d268c7c4 Write message serialization test (#2746) 2022-11-03 14:24:15 +00:00
Joonas Koivunen
cf68963b18 Add initial tenant sizing model and a http route to query it (#2714)
Tenant size information is gathered by using existing parts of
`Tenant::gc_iteration` which are now separated as
`Tenant::refresh_gc_info`. `Tenant::refresh_gc_info` collects branch
points, and invokes `Timeline::update_gc_info`; nothing was supposed to
be changed there. The gathered branch points (through Timeline's
`GcInfo::retain_lsns`), `GcInfo::horizon_cutoff`, and
`GcInfo::pitr_cutoff` are used to build up a Vec of updates fed into the
`libs/tenant_size_model` to calculate the history size.

The gathered information is now exposed using `GET
/v1/tenant/{tenant_id}/size`, which which will respond with the actual
calculated size. Initially the idea was to have this delivered as tenant
background task and exported via metric, but it might be too
computationally expensive to run it periodically as we don't yet know if
the returned values are any good.

Adds one new metric:
- pageserver_storage_operations_seconds with label `logical_size`
    - separating from original `init_logical_size`

Adds a pageserver wide configuration variable:
- `concurrent_tenant_size_logical_size_queries` with default 1

This leaves a lot of TODO's, tracked on issue #2748.
2022-11-03 12:39:19 +00:00
Arseny Sher
63221e4b42 Fix sk->ps walsender shutdown on sk side on caughtup.
This will fix many threads issue, but code around awfully still wants
improvement.

https://github.com/neondatabase/neon/issues/2722
2022-11-03 16:20:55 +04:00
bojanserafimov
d7eeb73f6f Impl serialize for pagestream FeMessage (#2741) 2022-11-02 23:44:07 -04:00
Joonas Koivunen
5112142997 fix: use different port for temporary postgres (#2743)
`test_tenant_relocation` ends up starting a temporary postgres instance with a fixed port. the change makes the port configurable at scripts/export_import_between_pageservers.py and uses that in test_tenant_relocation.
2022-11-02 18:37:48 +00:00
bojanserafimov
a0a74868a4 Fix clippy (#2742) 2022-11-02 12:30:09 -04:00
Christian Schwarz
b154992510 timeline_list_handler: avoid spawn_blocking
As per https://github.com/neondatabase/neon/issues/2731#issuecomment-1299335813

refs https://github.com/neondatabase/neon/issues/2731
2022-11-02 16:22:58 +01:00
Christian Schwarz
a86a38c96e README: fix instructions on how to run tests
The `make debug` target doesn't exist, and I can't find it in the Git
history.
2022-11-02 16:22:58 +01:00
Christian Schwarz
590f894db8 tenant_status: remove unnecessary spawn_blocking
The spawn_blocking is pointless in this cases: get_tenant is not
expected to block for any meaningful amount of time. There are
get_tenant calls in most other functions in the file too, and they don't
bother with spawn_blocking. Let's remove the spawn_blocking from
tenant_status, too, to be consistent.

fixes https://github.com/neondatabase/neon/issues/2731
2022-11-02 16:22:58 +01:00
Alexander Bayandin
0a0595b98d test_backward_compatibility: assign random port to compute (#2738) 2022-11-02 15:22:38 +00:00
Dmitry Rodionov
e56d11c8e1 fix style if possible (cannot really split long lines in mermaid) 2022-11-02 17:15:49 +02:00
Dmitry Rodionov
ccdc3188ed update according to discussion and comments 2022-11-02 17:15:49 +02:00
Dmitry Rodionov
67401cbdb8 pageserver s3 coordination 2022-11-02 17:15:49 +02:00
Kirill Bulatov
d42700280f Remove daemonize from storage components (#2677)
Move daemonization logic into `control_plane`.
Storage binaries now only crate a lockfile to avoid concurrent services running in the same directory.
2022-11-02 02:26:37 +02:00
Kirill Bulatov
6df4d5c911 Bump rustc to 1.62.1 (#2728)
Changelog: https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1621-2022-07-19
2022-11-02 01:21:33 +02:00
Dmitry Rodionov
32d14403bd remove wrong is_active filter for timelines in compaction/gc
Gc needs to know about all branch points, not only ones for
timelines that are active at the moment of gc. If timeline
is inactive then we wont know about branch point. In this
case gc can delete data that is needed by child timeline.

For compaction it is less severe. Delaying compaction can
cause an effect on performance. So it is still better to run
it. There is a logic to exit it quickly if there is nothing
to compact
2022-11-01 18:07:08 +02:00
Dmitry Ivanov
0df3467146 Refactoring: replace utils::connstring with Url-based APIs 2022-11-01 18:17:36 +03:00
Dmitry Rodionov
c64a121aa8 do not nest wal_connection_manager span inside parent one 2022-11-01 15:08:23 +02:00
Heikki Linnakangas
22cc8760b9 Move walredo process code under pgxn in the main 'neon' repository.
- Refactor the way the WalProposerMain function is called when started
  with --sync-safekeepers. The postgres binary now explicitly loads
  the 'neon.so' library and calls the WalProposerMain in it. This is
  simpler than the global function callback "hook" we previously used.

- Move the WAL redo process code to a new library, neon_walredo.so,
  and use the same mechanism as for --sync-safekeepers to call the
  WalRedoMain function, when launched with --walredo argument.

- Also move the seccomp code to neon_walredo.so library. I kept the
  configure check in the postgres side for now, though.
2022-10-31 01:11:50 +01:00
Arseny Sher
596d622a82 Fix test_prepare_snapshot.
It should checkpoint pageserver after waiting for all data arrival, not before.
2022-10-28 22:12:31 +04:00
Sergey Melnikov
7481fb082c Fix bugs in #2713 (#2716) 2022-10-28 14:12:49 +00:00
Arseny Sher
1eb9bd052a Bump vendor/postgres-v15 to fix XLP_FIRST_IS_CONTRECORD issue.
ref https://github.com/neondatabase/cloud/issues/2688
2022-10-28 16:45:11 +03:00
Sergey Melnikov
59a3ca4ec6 Deploy proxy to new prod regions (#2713)
* Refactor proxy deploy

* Test new prod deploy

* Remove assume role

* Add new values

* Add all regions
2022-10-28 16:25:28 +03:00
Sergey Melnikov
e86a9105a4 Deploy storage to new prod regions (#2709) 2022-10-28 10:17:27 +00:00
Stas Kelvich
d3c8749da5 Build compute postgres with openssl support
The main reason for that change is that Postgres 15 requires OpenSSL
for `pgcrypto` to work. Also not a bad idea to have SSL-enabled
Postgres in general.
2022-10-28 10:39:22 +03:00
Alexander Bayandin
128dc8d405 Nightly Benchmarks: fix workflow (#2708) 2022-10-27 19:26:10 +03:00
Alexander Bayandin
0cbae6e8f3 test_backward_compatibility: friendlier error message (#2707) 2022-10-27 15:54:49 +00:00
Alexander Stanovoy
78e412b84b The fix of #2650. (#2686)
* Wrappers and drop implementations for image and delta layer writers.
* Two regression tests for the image and delta layer files.
2022-10-27 14:02:55 +00:00
Rory de Zoete
6dbf202e0d Update crane copy target (#2704)
Co-authored-by: Rory de Zoete <rdezoete@Rorys-Mac-Studio.fritz.box>
2022-10-27 16:00:40 +02:00
178 changed files with 8609 additions and 2457 deletions

View File

@@ -55,6 +55,22 @@ runs:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: /tmp/neon
- name: Download Neon binaries for the previous release
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: /tmp/neon-previous
prefix: latest
- name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v3
@@ -73,22 +89,18 @@ runs:
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Download compatibility snapshot for Postgres 14
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest
- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin
COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -111,7 +123,12 @@ runs:
exit 1
fi
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
# -n4 uses four processes to run tests via pytest-xdist
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
# to the same worker to make @pytest.mark.order work with xdist
EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS"
fi
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
@@ -146,9 +163,9 @@ runs:
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
#
mkdir -p $TEST_OUTPUT/allure/results
"${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
@@ -168,12 +185,12 @@ runs:
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
# The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
# The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
prefix: latest
- name: Create Allure report
if: always()
if: success() || failure()
uses: ./.github/actions/allure-report
with:
action: store

View File

@@ -0,0 +1,35 @@
storage:
vars:
bucket_name: neon-prod-storage-ap-southeast-1
bucket_region: ap-southeast-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.ap-southeast-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: ap-southeast-1
ansible_aws_ssm_bucket_name: neon-prod-storage-ap-southeast-1
console_region_id: aws-ap-southeast-1
children:
pageservers:
hosts:
pageserver-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-064de8ea28bdb495b
pageserver-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0b180defcaeeb6b93
safekeepers:
hosts:
safekeeper-0.ap-southeast-1.aws.neon.tech:
ansible_host: i-0d6f1dc5161eef894
safekeeper-1.ap-southeast-1.aws.neon.tech:
ansible_host: i-0e338adda8eb2d19f
safekeeper-2.ap-southeast-1.aws.neon.tech:
ansible_host: i-04fb63634e4679eb9

View File

@@ -0,0 +1,35 @@
storage:
vars:
bucket_name: neon-prod-storage-eu-central-1
bucket_region: eu-central-1
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.eu-central-1.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: eu-central-1
ansible_aws_ssm_bucket_name: neon-prod-storage-eu-central-1
console_region_id: aws-eu-central-1
children:
pageservers:
hosts:
pageserver-0.eu-central-1.aws.neon.tech:
ansible_host: i-0cd8d316ecbb715be
pageserver-1.eu-central-1.aws.neon.tech:
ansible_host: i-090044ed3d383fef0
safekeepers:
hosts:
safekeeper-0.eu-central-1.aws.neon.tech:
ansible_host: i-0b238612d2318a050
safekeeper-1.eu-central-1.aws.neon.tech:
ansible_host: i-07b9c45e5c2637cd4
safekeeper-2.eu-central-1.aws.neon.tech:
ansible_host: i-020257302c3c93d88

View File

@@ -0,0 +1,36 @@
storage:
vars:
bucket_name: neon-prod-storage-us-east-2
bucket_region: us-east-2
console_mgmt_base_url: http://console-release.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.tech:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-prod-storage-us-east-2
console_region_id: aws-us-east-2
children:
pageservers:
hosts:
pageserver-0.us-east-2.aws.neon.tech:
ansible_host: i-062227ba7f119eb8c
pageserver-1.us-east-2.aws.neon.tech:
ansible_host: i-0b3ec0afab5968938
safekeepers:
hosts:
safekeeper-0.us-east-2.aws.neon.tech:
ansible_host: i-0e94224750c57d346
safekeeper-1.us-east-2.aws.neon.tech:
ansible_host: i-06d113fb73bfddeb0
safekeeper-2.us-east-2.aws.neon.tech:
ansible_host: i-09f66c8e04afff2e8

View File

@@ -22,6 +22,10 @@ storage:
console_region_id: aws-us-west-2
zenith-1-ps-3:
console_region_id: aws-us-west-2
zenith-1-ps-4:
console_region_id: aws-us-west-2
zenith-1-ps-5:
console_region_id: aws-us-west-2
safekeepers:
hosts:

View File

@@ -1,3 +1,2 @@
ansible_connection: aws_ssm
ansible_aws_ssm_bucket_name: neon-dev-bucket
ansible_python_interpreter: /usr/bin/python3

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: zenith-us-stage-etcd.local:2379
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:

View File

@@ -14,6 +14,7 @@ storage:
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: us-east-2
ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2
console_region_id: aws-us-east-2
children:

View File

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.ap-southeast-1.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: ap-southeast-1
zenith_region_slug: ap-southeast-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: ap-southeast-1.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.eu-central-1.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: eu-central-1
zenith_region_slug: eu-central-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-central-1.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -0,0 +1,31 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.us-east-2.aws.neon.tech"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: prod
zenith_region: us-east-2
zenith_region_slug: us-east-2
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: us-east-2.aws.neon.tech
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -164,7 +164,7 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
options: --init
@@ -265,7 +265,7 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
if: always()
if: success() || failure()
uses: ./.github/actions/allure-report
with:
action: generate

View File

@@ -18,8 +18,8 @@ env:
jobs:
tag:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
@@ -46,7 +46,7 @@ jobs:
id: build-tag
build-neon:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -127,8 +127,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-
- name: Cache postgres v14 build
id: cache_pg_14
@@ -236,7 +236,7 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -268,34 +268,8 @@ jobs:
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
upload-latest-artifacts:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests ]
if: github.ref_name == 'main'
steps:
- name: Copy Neon artifact to the latest directory
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/${{ github.run_id }}
run: |
for build_type in debug release; do
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
done
benchmarks:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -326,12 +300,12 @@ jobs:
# while coverage is currently collected for the debug ones
merge-allure-report:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: always()
if: success() || failure()
strategy:
fail-fast: false
matrix:
@@ -364,7 +338,7 @@ jobs:
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
coverage-report:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -389,7 +363,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
key: v10-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -441,15 +415,19 @@ jobs:
shell: bash -euxo pipefail {0}
trigger-e2e-tests:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ build-neon ]
needs: [ push-docker-hub, tag ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/cloud"
@@ -475,12 +453,14 @@ jobs:
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\"
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
}
}"
neon-image:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
@@ -498,7 +478,7 @@ jobs:
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
compute-tools-image:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
@@ -512,28 +492,8 @@ jobs:
- name: Kaniko build compute tools
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
compute-node-image:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
- name: Kaniko build compute node with extensions v14 (compatibility)
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
compute-node-image-v14:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
@@ -549,9 +509,8 @@ jobs:
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
compute-node-image-v15:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
@@ -567,18 +526,58 @@ jobs:
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
runs-on: [ self-hosted, dev, x64 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
# Regular pageserver version string looks like
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
# Bad versions might loop like:
# Neon page server git-env:local failpoints: true, features: ["testing"]
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
run: |
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
echo "Pageserver version string: $pageserver_version"
if ! echo "$pageserver_version" | grep -qv 'git-env:local' ; then
echo "Pageserver version should not be the default Dockerfile one"
exit 1
fi
if ! echo "$pageserver_version" | grep -qv '"testing"' ; then
echo "Pageserver version should have no testing feature enabled"
exit 1
fi
- name: Verify docker-compose example
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
- name: Print logs and clean up
if: always()
run: |
docker compose -f ./docker-compose/docker-compose.yml logs || 0
docker compose -f ./docker-compose/docker-compose.yml down
promote-images:
runs-on: dev
needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag, test-images ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
fail-fast: false
matrix:
# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
steps:
- name: Promote image to latest
@@ -587,7 +586,7 @@ jobs:
aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
push-docker-hub:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
needs: [ promote-images, tag ]
container: golang:1.19-bullseye
@@ -608,9 +607,6 @@ jobs:
- name: Pull compute tools image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
- name: Pull compute node image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
@@ -625,11 +621,10 @@ jobs:
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node-v15:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
- name: Configure Docker Hub login
run: |
@@ -643,9 +638,6 @@ jobs:
- name: Push compute tools image to Docker Hub
run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
- name: Push compute node image to Docker Hub
run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
- name: Push compute node v14 image to Docker Hub
run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
@@ -662,7 +654,6 @@ jobs:
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
@@ -745,7 +736,7 @@ jobs:
rm -f neon_install.tar.gz .neon_current_version
deploy-new:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -756,9 +747,9 @@ jobs:
defaults:
run:
shell: bash
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
strategy:
matrix:
target_region: [ us-east-2 ]
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -781,11 +772,51 @@ jobs:
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.us-east-2.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ us-east-2, eu-central-1, ap-southeast-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_PRODUCTION_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -827,7 +858,7 @@ jobs:
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-new:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -837,6 +868,11 @@ jobs:
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -847,15 +883,52 @@ jobs:
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
promote-compatibility-test-snapshot:
runs-on: dev
deploy-proxy-prod-new:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include:
- target_region: us-east-2
target_cluster: prod-us-east-2-delta
- target_region: eu-central-1
target_cluster: prod-eu-central-1-gamma
- target_region: ap-southeast-1
target_cluster: prod-ap-southeast-1-epsilon
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure environment
run: |
helm repo add neondatabase https://neondatabase.github.io/helm-charts
aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }}
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
promote-compatibility-data:
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -868,9 +941,24 @@ jobs:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest
run: |
# Update compatibility snapshot for the release
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done
# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done

View File

@@ -106,7 +106,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
key: v6-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
- name: Run cargo clippy
run: ./run_clippy.sh
@@ -115,7 +115,7 @@ jobs:
run: cargo build --locked --all --all-targets
check-rust-dependencies:
runs-on: dev
runs-on: [ self-hosted, dev, x64 ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

2
.gitmodules vendored
View File

@@ -1,7 +1,7 @@
[submodule "vendor/postgres-v14"]
path = vendor/postgres-v14
url = https://github.com/neondatabase/postgres.git
branch = main
branch = REL_14_STABLE_neon
[submodule "vendor/postgres-v15"]
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git

10
CODEOWNERS Normal file
View File

@@ -0,0 +1,10 @@
/compute_tools/ @neondatabase/control-plane
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/control-plane
/safekeeper/ @neondatabase/safekeepers

55
Cargo.lock generated
View File

@@ -317,12 +317,6 @@ dependencies = [
"generic-array",
]
[[package]]
name = "boxfnonce"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426"
[[package]]
name = "bstr"
version = "1.0.1"
@@ -600,6 +594,7 @@ dependencies = [
"tar",
"thiserror",
"toml",
"url",
"utils",
"workspace_hack",
]
@@ -849,16 +844,6 @@ dependencies = [
"syn",
]
[[package]]
name = "daemonize"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70c24513e34f53b640819f0ac9f705b673fcf4006d7aab8778bee72ebfc89815"
dependencies = [
"boxfnonce",
"libc",
]
[[package]]
name = "darling"
version = "0.14.1"
@@ -2140,7 +2125,6 @@ dependencies = [
"crc32c",
"criterion",
"crossbeam-utils",
"daemonize",
"etcd_broker",
"fail",
"futures",
@@ -2161,6 +2145,7 @@ dependencies = [
"postgres-types",
"postgres_ffi",
"pprof",
"pq_proto",
"rand",
"regex",
"remote_storage",
@@ -2173,6 +2158,7 @@ dependencies = [
"svg_fmt",
"tar",
"tempfile",
"tenant_size_model",
"thiserror",
"tokio",
"tokio-postgres",
@@ -2190,6 +2176,7 @@ name = "pageserver_api"
version = "0.1.0"
dependencies = [
"anyhow",
"byteorder",
"bytes",
"const_format",
"postgres_ffi",
@@ -2452,6 +2439,21 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
[[package]]
name = "pq_proto"
version = "0.1.0"
dependencies = [
"anyhow",
"bytes",
"pin-project-lite",
"postgres-protocol",
"rand",
"serde",
"tokio",
"tracing",
"workspace_hack",
]
[[package]]
name = "prettyplease"
version = "0.1.21"
@@ -2584,6 +2586,7 @@ dependencies = [
"once_cell",
"parking_lot 0.12.1",
"pin-project-lite",
"pq_proto",
"rand",
"rcgen",
"reqwest",
@@ -3087,7 +3090,6 @@ dependencies = [
"clap 4.0.15",
"const_format",
"crc32c",
"daemonize",
"etcd_broker",
"fs2",
"git-version",
@@ -3095,11 +3097,13 @@ dependencies = [
"humantime",
"hyper",
"metrics",
"nix 0.25.0",
"once_cell",
"parking_lot 0.12.1",
"postgres",
"postgres-protocol",
"postgres_ffi",
"pq_proto",
"regex",
"remote_storage",
"safekeeper_api",
@@ -3548,6 +3552,13 @@ dependencies = [
"winapi",
]
[[package]]
name = "tenant_size_model"
version = "0.1.0"
dependencies = [
"workspace_hack",
]
[[package]]
name = "termcolor"
version = "1.1.3"
@@ -4053,9 +4064,7 @@ dependencies = [
"metrics",
"nix 0.25.0",
"once_cell",
"pin-project-lite",
"postgres",
"postgres-protocol",
"pq_proto",
"rand",
"routerify",
"rustls",
@@ -4380,6 +4389,9 @@ dependencies = [
"crossbeam-utils",
"either",
"fail",
"futures-channel",
"futures-task",
"futures-util",
"hashbrown",
"indexmap",
"libc",
@@ -4393,6 +4405,7 @@ dependencies = [
"rand",
"regex",
"regex-syntax",
"reqwest",
"scopeguard",
"serde",
"stable_deref_trait",

View File

@@ -25,6 +25,10 @@ members = [
# Besides, debug info should not affect the performance.
debug = true
# disable debug symbols for all packages except this one to decrease binaries size
[profile.release.package."*"]
debug = false
[profile.release-line-debug]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only

View File

@@ -13,7 +13,7 @@ ARG TAG=pinned
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
#########################################################################################
#
@@ -24,7 +24,7 @@ RUN apt update && \
FROM build-deps AS pg-build
COPY vendor/postgres-v14 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers

View File

@@ -13,7 +13,7 @@ ARG TAG=pinned
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev
#########################################################################################
#
@@ -24,7 +24,7 @@ RUN apt update && \
FROM build-deps AS pg-build
COPY vendor/postgres-v15 postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers

View File

@@ -1,88 +0,0 @@
#
# Legacy version of the Dockerfile for the compute node.
# Used by e2e CI. Building Dockerfile.compute-node will take
# unreasonable ammount of time without v2 runners.
#
# TODO: remove once cloud repo CI is moved to v2 runners.
#
# Allow specifiyng different compute-tools tag and image repo, so we are
# able to use different images
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=compute-tools
ARG TAG=latest
#
# Image with pre-built tools
#
FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
# Only to get ready compute_ctl binary as deppendency
#
# Image with Postgres build deps
#
FROM debian:bullseye-slim AS build-deps
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev
#
# Image with built Postgres
#
FROM build-deps AS pg-build
# Add user postgres
RUN adduser postgres
RUN mkdir /pg && chown postgres:postgres /pg
# Copy source files
# version 14 is default for now
COPY ./vendor/postgres-v14 /pg/
COPY ./pgxn /pg/
# Build and install Postgres locally
RUN mkdir /pg/compute_build && cd /pg/compute_build && \
../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
# Install main binaries and contribs
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
# Install neon contrib
RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
USER postgres
WORKDIR /pg
#
# Final compute node image to be exported
#
FROM debian:bullseye-slim
# libreadline-dev is required to run psql
RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute
# Copy ready Postgres binaries
COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
# Copy binaries from compute-tools
COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
# XXX: temporary symlink for compatibility with old control-plane
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
# Add postgres shared objects to the search path
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -151,6 +151,11 @@ neon-pg-ext-v14: postgres-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v14"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v14 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v14/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v14
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v14 && \
@@ -163,6 +168,11 @@ neon-pg-ext-v15: postgres-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install)
+@echo "Compiling neon_walredo v15"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-walredo-v15 && \
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v15/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile install)
+@echo "Compiling neon_test_utils" v15
mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15
(cd $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-v15 && \

View File

@@ -223,10 +223,7 @@ Ensure your dependencies are installed as described [here](https://github.com/ne
```sh
git clone --recursive https://github.com/neondatabase/neon.git
# either:
CARGO_BUILD_FLAGS="--features=testing" make
# or:
make debug
./scripts/pytest
```

View File

@@ -65,7 +65,7 @@ impl GenericOption {
let name = match self.name.as_str() {
"safekeepers" => "neon.safekeepers",
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
"wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
"wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
it => it,
};

View File

@@ -4,20 +4,21 @@ version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0"
clap = "4.0"
comfy-table = "6.1"
git-version = "0.3.5"
tar = "0.4.38"
nix = "0.25"
once_cell = "1.13.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
regex = "1"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
toml = "0.5"
once_cell = "1.13.0"
regex = "1"
anyhow = "1.0"
tar = "0.4.38"
thiserror = "1"
nix = "0.25"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
toml = "0.5"
url = "2.2.2"
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
# instead, so that recompile times are better.

View File

@@ -0,0 +1,264 @@
//! Spawns and kills background processes that are needed by Neon CLI.
//! Applies common set-up such as log and pid files (if needed) to every process.
//!
//! Neon CLI does not run in background, so it needs to store the information about
//! spawned processes, which it does in this module.
//! We do that by storing the pid of the process in the "${process_name}.pid" file.
//! The pid file can be created by the process itself
//! (Neon storage binaries do that and also ensure that a lock is taken onto that file)
//! or we create such file after starting the process
//! (non-Neon binaries don't necessarily follow our pidfile conventions).
//! The pid stored in the file is later used to stop the service.
//!
//! See [`lock_file`] module for more info.
use std::ffi::OsStr;
use std::io::Write;
use std::path::Path;
use std::process::{Child, Command};
use std::time::Duration;
use std::{fs, io, thread};
use anyhow::{anyhow, bail, Context, Result};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use utils::lock_file;
const RETRIES: u32 = 15;
const RETRY_TIMEOUT_MILLIS: u64 = 500;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
pub enum InitialPidFile<'t> {
/// Create a pidfile, to allow future CLI invocations to manipulate the process.
Create(&'t Path),
/// The process will create the pidfile itself, need to wait for that event.
Expect(&'t Path),
}
/// Start a background child process using the parameters given.
pub fn start_process<F, S: AsRef<OsStr>>(
process_name: &str,
datadir: &Path,
command: &Path,
args: &[S],
initial_pid_file: InitialPidFile,
process_status_check: F,
) -> anyhow::Result<Child>
where
F: Fn() -> anyhow::Result<bool>,
{
let log_path = datadir.join(format!("{process_name}.log"));
let process_log_file = fs::OpenOptions::new()
.create(true)
.write(true)
.append(true)
.open(&log_path)
.with_context(|| {
format!("Could not open {process_name} log file {log_path:?} for writing")
})?;
let same_file_for_stderr = process_log_file.try_clone().with_context(|| {
format!("Could not reuse {process_name} log file {log_path:?} for writing stderr")
})?;
let mut command = Command::new(command);
let background_command = command
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
let mut spawned_process = filled_cmd.spawn().with_context(|| {
format!("Could not spawn {process_name}, see console output and log files for details.")
})?;
let pid = spawned_process.id();
let pid = Pid::from_raw(
i32::try_from(pid)
.with_context(|| format!("Subprocess {process_name} has invalid pid {pid}"))?,
);
let pid_file_to_check = match initial_pid_file {
InitialPidFile::Create(target_pid_file_path) => {
match lock_file::create_lock_file(target_pid_file_path, pid.to_string()) {
lock_file::LockCreationResult::Created { .. } => {
// We use "lock" file here only to create the pid file. The lock on the pidfile will be dropped as soon
// as this CLI invocation exits, so it's a bit useless, but doesn't any harm either.
}
lock_file::LockCreationResult::AlreadyLocked { .. } => {
anyhow::bail!("Cannot write pid file for {process_name} at path {target_pid_file_path:?}: file is already locked by another process")
}
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!(
"Failed to create pid file for {process_name} at path {target_pid_file_path:?}"
)))
}
}
None
}
InitialPidFile::Expect(pid_file_path) => Some(pid_file_path),
};
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check) {
Ok(true) => {
println!("\n{process_name} started, pid: {pid}");
return Ok(spawned_process);
}
Ok(false) => {
if retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("{process_name} has not started yet, retrying ({retries})...");
}
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
}
Err(e) => {
println!("{process_name} failed to start: {e:#}");
if let Err(e) = spawned_process.kill() {
println!("Could not stop {process_name} subprocess: {e:#}")
};
return Err(e);
}
}
}
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
if !pid_file.exists() {
println!("{process_name} is already stopped: no pid file {pid_file:?} is present");
return Ok(());
}
let pid = read_pidfile(pid_file)?;
let sig = if immediate {
print!("Stopping {process_name} with pid {pid} immediately..");
Signal::SIGQUIT
} else {
print!("Stopping {process_name} with pid {pid} gracefully..");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(()) => (),
Err(Errno::ESRCH) => {
println!(
"{process_name} with pid {pid} does not exist, but a pid file {pid_file:?} was found"
);
return Ok(());
}
Err(e) => anyhow::bail!("Failed to send signal to {process_name} with pid {pid}: {e}"),
}
// Wait until process is gone
for _ in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
if let Err(e) = fs::remove_file(pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
return Ok(());
}
Ok(false) => {
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1))
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
return Err(e);
}
}
}
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
filled_cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
filled_cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}
fn process_started<F>(
pid: Pid,
pid_file_to_check: Option<&Path>,
status_check: &F,
) -> anyhow::Result<bool>
where
F: Fn() -> anyhow::Result<bool>,
{
match status_check() {
Ok(true) => match pid_file_to_check {
Some(pid_file_path) => {
if pid_file_path.exists() {
let pid_in_file = read_pidfile(pid_file_path)?;
Ok(pid_in_file == pid)
} else {
Ok(false)
}
}
None => Ok(true),
},
Ok(false) => Ok(false),
Err(e) => anyhow::bail!("process failed to start: {e}"),
}
}
/// Read a PID file
///
/// We expect a file that contains a single integer.
fn read_pidfile(pidfile: &Path) -> Result<Pid> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {pidfile:?}"))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {pidfile:?}"))?;
if pid < 1 {
bail!("pidfile {pidfile:?} contained bad value '{pid}'");
}
Ok(Pid::from_raw(pid))
}
fn process_has_stopped(pid: Pid) -> anyhow::Result<bool> {
match kill(pid, None) {
// Process exists, keep waiting
Ok(_) => Ok(false),
// Process not found, we're done
Err(Errno::ESRCH) => Ok(true),
Err(err) => anyhow::bail!("Failed to send signal to process with pid {pid}: {err}"),
}
}

View File

@@ -9,8 +9,8 @@ use anyhow::{anyhow, bail, Context, Result};
use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env::{EtcdBroker, LocalEnv};
use control_plane::pageserver::PageServerNode;
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage::PageServerNode;
use control_plane::{etcd, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::{

View File

@@ -12,15 +12,14 @@ use std::time::Duration;
use anyhow::{Context, Result};
use utils::{
connstring::connection_host_port,
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
};
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
use crate::pageserver::PageServerNode;
use crate::postgresql_conf::PostgresConf;
use crate::storage::PageServerNode;
//
// ComputeControlPlane
@@ -300,7 +299,8 @@ impl PostgresNode {
// Configure the node to fetch pages from pageserver
let pageserver_connstr = {
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
let config = &self.pageserver.pg_connection_config;
let (host, port) = (config.host(), config.port());
// Set up authentication
//

View File

@@ -0,0 +1,57 @@
use url::Url;
#[derive(Debug)]
pub struct PgConnectionConfig {
url: Url,
}
impl PgConnectionConfig {
pub fn host(&self) -> &str {
self.url.host_str().expect("BUG: no host")
}
pub fn port(&self) -> u16 {
self.url.port().expect("BUG: no port")
}
/// Return a `<host>:<port>` string.
pub fn raw_address(&self) -> String {
format!("{}:{}", self.host(), self.port())
}
/// Connect using postgres protocol with TLS disabled.
pub fn connect_no_tls(&self) -> Result<postgres::Client, postgres::Error> {
postgres::Client::connect(self.url.as_str(), postgres::NoTls)
}
}
impl std::str::FromStr for PgConnectionConfig {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut url: Url = s.parse()?;
match url.scheme() {
"postgres" | "postgresql" => {}
other => anyhow::bail!("invalid scheme: {other}"),
}
// It's not a valid connection url if host is unavailable.
if url.host().is_none() {
anyhow::bail!(url::ParseError::EmptyHost);
}
// E.g. `postgres:bar`.
if url.cannot_be_a_base() {
anyhow::bail!("URL cannot be a base");
}
// Set the default PG port if it's missing.
if url.port().is_none() {
url.set_port(Some(5432))
.expect("BUG: couldn't set the default port");
}
Ok(Self { url })
}
}

View File

@@ -1,99 +1,75 @@
use std::{
fs,
path::PathBuf,
process::{Command, Stdio},
};
use std::{fs, path::PathBuf};
use anyhow::Context;
use nix::{
sys::signal::{kill, Signal},
unistd::Pid,
};
use crate::{local_env, read_pidfile};
use crate::{background_process, local_env};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
println!(
"Starting etcd broker using {}",
etcd_broker.etcd_binary_path.display()
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
);
let etcd_data_dir = env.base_data_dir.join("etcd");
fs::create_dir_all(&etcd_data_dir).with_context(|| {
format!(
"Failed to create etcd data dir: {}",
etcd_data_dir.display()
)
})?;
fs::create_dir_all(&etcd_data_dir)
.with_context(|| format!("Failed to create etcd data dir {etcd_data_dir:?}"))?;
let etcd_stdout_file =
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
format!(
"Failed to create etcd stout file in directory {}",
etcd_data_dir.display()
)
})?;
let etcd_stderr_file =
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
format!(
"Failed to create etcd stderr file in directory {}",
etcd_data_dir.display()
)
})?;
let client_urls = etcd_broker.comma_separated_endpoints();
let args = [
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
// etcd doesn't compact (vacuum) with default settings,
// enable it to prevent space exhaustion.
"--auto-compaction-mode=revision".to_string(),
"--auto-compaction-retention=1".to_string(),
];
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
.args(&[
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
// etcd doesn't compact (vacuum) with default settings,
// enable it to prevent space exhaustion.
"--auto-compaction-mode=revision".to_string(),
"--auto-compaction-retention=1".to_string(),
])
.stdout(Stdio::from(etcd_stdout_file))
.stderr(Stdio::from(etcd_stderr_file))
.spawn()
.context("Failed to spawn etcd subprocess")?;
let pid = etcd_process.id();
let pid_file_path = etcd_pid_file_path(env);
let etcd_pid_file_path = etcd_pid_file_path(env);
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
format!(
"Failed to create etcd pid file at {}",
etcd_pid_file_path.display()
)
})?;
let client = reqwest::blocking::Client::new();
background_process::start_process(
"etcd",
&etcd_data_dir,
&etcd_broker.etcd_binary_path,
&args,
background_process::InitialPidFile::Create(&pid_file_path),
|| {
for broker_endpoint in &etcd_broker.broker_endpoints {
let request = broker_endpoint
.join("health")
.with_context(|| {
format!(
"Failed to append /health path to broker endopint {}",
broker_endpoint
)
})
.and_then(|url| {
client.get(&url.to_string()).build().with_context(|| {
format!("Failed to construct request to etcd endpoint {url}")
})
})?;
if client.execute(request).is_ok() {
return Ok(true);
}
}
Ok(false)
},
)
.context("Failed to spawn etcd subprocess")?;
Ok(())
}
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_path = &env.etcd_broker.etcd_binary_path;
println!("Stopping etcd broker at {}", etcd_path.display());
let etcd_pid_file_path = etcd_pid_file_path(env);
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
format!(
"Failed to read etcd pid file at {}",
etcd_pid_file_path.display()
)
})?);
kill(pid, Signal::SIGTERM).with_context(|| {
format!(
"Failed to stop etcd with pid {pid} at {}",
etcd_pid_file_path.display()
)
})?;
Ok(())
background_process::stop_process(true, "etcd", &etcd_pid_file_path(env))
}
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {

View File

@@ -6,59 +6,12 @@
// Intended to be used in integration tests and in CLI tools for
// local installations.
//
use anyhow::{anyhow, bail, Context, Result};
use std::fs;
use std::path::Path;
use std::process::Command;
mod background_process;
pub mod compute;
pub mod connection;
pub mod etcd;
pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod storage;
/// Read a PID file
///
/// We expect a file that contains a single integer.
/// We return an i32 for compatibility with libc and nix.
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
if pid < 1 {
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
}
Ok(pid)
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
cmd.env(var, val);
}
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}

View File

@@ -226,12 +226,12 @@ impl LocalEnv {
}
}
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
Ok(self.neon_distrib_dir.join("pageserver"))
pub fn pageserver_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("pageserver")
}
pub fn safekeeper_bin(&self) -> anyhow::Result<PathBuf> {
Ok(self.neon_distrib_dir.join("safekeeper"))
pub fn safekeeper_bin(&self) -> PathBuf {
self.neon_distrib_dir.join("safekeeper")
}
pub fn pg_data_dirs_path(&self) -> PathBuf {

View File

@@ -1,33 +1,27 @@
use std::collections::HashMap;
use std::fs::File;
use std::fs::{self, File};
use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::Duration;
use std::{io, result, thread};
use std::process::Child;
use std::{io, result};
use crate::connection::PgConnectionConfig;
use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{
connstring::connection_address,
http::error::HttpErrorBody,
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
};
use crate::local_env::LocalEnv;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
use crate::{background_process, local_env::LocalEnv};
#[derive(Error, Debug)]
pub enum PageserverHttpError {
@@ -75,7 +69,7 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct PageServerNode {
pub pg_connection_config: Config,
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
@@ -101,7 +95,7 @@ impl PageServerNode {
}
/// Construct libpq connection string for connecting to the pageserver.
fn pageserver_connection_config(password: &str, listen_addr: &str) -> Config {
fn pageserver_connection_config(password: &str, listen_addr: &str) -> PgConnectionConfig {
format!("postgresql://no_user:{password}@{listen_addr}/no_db")
.parse()
.unwrap()
@@ -161,7 +155,15 @@ impl PageServerNode {
init_config_overrides.push("auth_validation_public_key_path='auth_public_key.pem'");
}
self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
let mut pageserver_process = self
.start_node(&init_config_overrides, &self.env.base_data_dir, true)
.with_context(|| {
format!(
"Failed to start a process for pageserver {}",
self.env.pageserver.id,
)
})?;
let init_result = self
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
.context("Failed to create initial tenant and timeline for pageserver");
@@ -171,7 +173,29 @@ impl PageServerNode {
}
Err(e) => eprintln!("{e:#}"),
}
self.stop(false)?;
match pageserver_process.kill() {
Err(e) => {
eprintln!(
"Failed to stop pageserver {} process with pid {}: {e:#}",
self.env.pageserver.id,
pageserver_process.id(),
)
}
Ok(()) => {
println!(
"Stopped pageserver {} process with pid {}",
self.env.pageserver.id,
pageserver_process.id(),
);
// cleanup after pageserver startup, since we do not call regular `stop_process` during init
let pid_file = self.pid_file();
if let Err(e) = fs::remove_file(&pid_file) {
if e.kind() != io::ErrorKind::NotFound {
eprintln!("Failed to remove pid file {pid_file:?} after stopping the process: {e:#}");
}
}
}
}
init_result
}
@@ -196,11 +220,14 @@ impl PageServerNode {
self.env.pageserver_data_dir()
}
pub fn pid_file(&self) -> PathBuf {
/// The pid file is created by the pageserver process, with its pid stored inside.
/// Other pageservers cannot lock the same file and overwrite it for as long as the current
/// pageserver runs. (Unless someone removes the file manually; never do that!)
fn pid_file(&self) -> PathBuf {
self.repo_path().join("pageserver.pid")
}
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
self.start_node(config_overrides, &self.repo_path(), false)
}
@@ -209,10 +236,10 @@ impl PageServerNode {
config_overrides: &[&str],
datadir: &Path,
update_config: bool,
) -> anyhow::Result<()> {
) -> anyhow::Result<Child> {
println!(
"Starting pageserver at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.pg_connection_config.raw_address(),
datadir.display()
);
io::stdout().flush()?;
@@ -220,10 +247,7 @@ impl PageServerNode {
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!(
"Datadir path '{}' cannot be represented as a unicode string",
datadir.display()
)
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
];
@@ -235,48 +259,18 @@ impl PageServerNode {
args.extend(["-c", config_override]);
}
let mut cmd = Command::new(self.env.pageserver_bin()?);
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
filled_cmd = fill_aws_secrets_vars(filled_cmd);
if !filled_cmd.status()?.success() {
bail!(
"Pageserver failed to start. See console output and '{}' for details.",
datadir.join("pageserver.log").display()
);
}
// It takes a while for the page server to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(()) => {
println!("\nPageserver started");
return Ok(());
}
Err(err) => {
match err {
PageserverHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("Pageserver not responding yet, err {err} retrying ({retries})...");
}
}
PageserverHttpError::Response(msg) => {
bail!("pageserver failed to start: {msg} ")
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("pageserver failed to start in {RETRIES} seconds");
background_process::start_process(
"pageserver",
datadir,
&self.env.pageserver_bin(),
&args,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(PageserverHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
}
///
@@ -288,69 +282,18 @@ impl PageServerNode {
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Pageserver is already stopped");
return Ok(());
}
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
let sig = if immediate {
print!("Stopping pageserver immediately..");
Signal::SIGQUIT
} else {
print!("Stopping pageserver gracefully..");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!("Pageserver with pid {pid} does not exist, but a PID file was found");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {pid}: {}",
err.desc()
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop pageserver with pid {pid}");
background_process::stop_process(immediate, "pageserver", &self.pid_file())
}
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
println!("Pageserver query: '{sql}'");
client.simple_query(sql).unwrap()
}
pub fn page_server_psql_client(&self) -> result::Result<postgres::Client, postgres::Error> {
self.pg_connection_config.connect(NoTls)
self.pg_connection_config.connect_no_tls()
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
@@ -419,6 +362,11 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -481,6 +429,11 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.get("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
})
.send()?
.error_from_body()?;
@@ -549,7 +502,7 @@ impl PageServerNode {
pg_wal: Option<(Lsn, PathBuf)>,
pg_version: u32,
) -> anyhow::Result<()> {
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
let mut client = self.pg_connection_config.connect_no_tls().unwrap();
// Init base reader
let (start_lsn, base_tarfile_path) = base;

View File

@@ -1,23 +1,21 @@
use std::io::Write;
use std::path::PathBuf;
use std::process::Command;
use std::process::Child;
use std::sync::Arc;
use std::time::Duration;
use std::{io, result, thread};
use std::{io, result};
use anyhow::bail;
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::Config;
use anyhow::Context;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use utils::{connstring::connection_address, http::error::HttpErrorBody, id::NodeId};
use utils::{http::error::HttpErrorBody, id::NodeId};
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::storage::PageServerNode;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
use crate::connection::PgConnectionConfig;
use crate::pageserver::PageServerNode;
use crate::{
background_process,
local_env::{LocalEnv, SafekeeperConf},
};
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {
@@ -63,7 +61,7 @@ pub struct SafekeeperNode {
pub conf: SafekeeperConf,
pub pg_connection_config: Config,
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: Client,
pub http_base_url: String,
@@ -87,15 +85,15 @@ impl SafekeeperNode {
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(port: u16) -> Config {
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
// TODO safekeeper authentication not implemented yet
format!("postgresql://no_user@127.0.0.1:{}/no_db", port)
format!("postgresql://no_user@127.0.0.1:{port}/no_db")
.parse()
.unwrap()
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
env.safekeeper_data_dir(&format!("sk{sk_id}"))
}
pub fn datadir_path(&self) -> PathBuf {
@@ -106,91 +104,78 @@ impl SafekeeperNode {
self.datadir_path().join("safekeeper.pid")
}
pub fn start(&self) -> anyhow::Result<()> {
pub fn start(&self) -> anyhow::Result<Child> {
print!(
"Starting safekeeper at '{}' in '{}'",
connection_address(&self.pg_connection_config),
self.pg_connection_config.raw_address(),
self.datadir_path().display()
);
io::stdout().flush().unwrap();
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id;
let datadir = self.datadir_path();
let mut cmd = Command::new(self.env.safekeeper_bin()?);
fill_rust_env_vars(
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
.args(&["--id", self.id.to_string().as_ref()])
.args(&["--listen-pg", &listen_pg])
.args(&["--listen-http", &listen_http])
.arg("--daemonize"),
);
let id_string = id.to_string();
let mut args = vec![
"-D",
datadir.to_str().with_context(|| {
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
})?,
"--id",
&id_string,
"--listen-pg",
&listen_pg,
"--listen-http",
&listen_http,
];
if !self.conf.sync {
cmd.arg("--no-sync");
args.push("--no-sync");
}
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
if !comma_separated_endpoints.is_empty() {
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
args.extend(["--broker-endpoints", &comma_separated_endpoints]);
}
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
cmd.args(&["--broker-etcd-prefix", prefix]);
args.extend(["--broker-etcd-prefix", prefix]);
}
let mut backup_threads = String::new();
if let Some(threads) = self.conf.backup_threads {
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
backup_threads = threads.to_string();
args.extend(["--backup-threads", &backup_threads]);
} else {
drop(backup_threads);
}
if let Some(ref remote_storage) = self.conf.remote_storage {
cmd.args(&["--remote-storage", remote_storage]);
args.extend(["--remote-storage", remote_storage]);
}
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
if self.conf.auth_enabled {
cmd.arg("--auth-validation-public-key-path");
// PathBuf is better be passed as is, not via `String`.
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
args.extend([
"--auth-validation-public-key-path",
key_path.to_str().with_context(|| {
format!("Key path {key_path:?} cannot be represented as a unicode string")
})?,
]);
}
fill_aws_secrets_vars(&mut cmd);
if !cmd.status()?.success() {
bail!(
"Safekeeper failed to start. See '{}' for details.",
self.datadir_path().join("safekeeper.log").display()
);
}
// It takes a while for the safekeeper to start up. Wait until it is
// open for business.
const RETRIES: i8 = 15;
for retries in 1..RETRIES {
match self.check_status() {
Ok(_) => {
println!("\nSafekeeper started");
return Ok(());
}
Err(err) => {
match err {
SafekeeperHttpError::Transport(err) => {
if err.is_connect() && retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!(
"Safekeeper not responding yet, err {} retrying ({})...",
err, retries
);
}
}
SafekeeperHttpError::Response(msg) => {
bail!("safekeeper failed to start: {} ", msg)
}
}
thread::sleep(Duration::from_secs(1));
}
}
}
bail!("safekeeper failed to start in {} seconds", RETRIES);
background_process::start_process(
&format!("safekeeper {id}"),
&datadir,
&self.env.safekeeper_bin(),
&args,
background_process::InitialPidFile::Expect(&self.pid_file()),
|| match self.check_status() {
Ok(()) => Ok(true),
Err(SafekeeperHttpError::Transport(_)) => Ok(false),
Err(e) => Err(anyhow::anyhow!("Failed to check node status: {e}")),
},
)
}
///
@@ -202,63 +187,11 @@ impl SafekeeperNode {
/// If the server is not running, returns success
///
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
let pid_file = self.pid_file();
if !pid_file.exists() {
println!("Safekeeper {} is already stopped", self.id);
return Ok(());
}
let pid = read_pidfile(&pid_file)?;
let pid = Pid::from_raw(pid);
let sig = if immediate {
print!("Stopping safekeeper {} immediately..", self.id);
Signal::SIGQUIT
} else {
print!("Stopping safekeeper {} gracefully..", self.id);
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
println!(
"Safekeeper with pid {} does not exist, but a PID file was found",
pid
);
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to safekeeper with pid {}: {}",
pid,
err.desc()
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop safekeeper with pid {}", pid);
background_process::stop_process(
immediate,
&format!("safekeeper {}", self.id),
&self.pid_file(),
)
}
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {

View File

@@ -0,0 +1,13 @@
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG COMPUTE_IMAGE=compute-node-v14
ARG TAG=latest
FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG
USER root
RUN apt-get update && \
apt-get install -y curl \
jq \
netcat
USER postgres

View File

@@ -2,6 +2,7 @@ version: '3'
services:
etcd:
restart: always
image: quay.io/coreos/etcd:v3.5.4
ports:
- 2379:2379
@@ -9,7 +10,7 @@ services:
environment:
# This signifficantly speeds up etcd and we anyway don't data persistency there.
ETCD_UNSAFE_NO_FSYNC: "1"
command:
command:
- "etcd"
- "--auto-compaction-mode=revision"
- "--auto-compaction-retention=1"
@@ -24,6 +25,7 @@ services:
- "--quota-backend-bytes=134217728" # 128 MB
minio:
restart: always
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
ports:
- 9000:9000
@@ -41,7 +43,7 @@ services:
entrypoint:
- "/bin/sh"
- "-c"
command:
command:
- "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
echo 'Waiting to start minio...' && sleep 1;
done;
@@ -51,7 +53,8 @@ services:
- minio
pageserver:
image: neondatabase/neon:${TAG:-latest}
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- BROKER_ENDPOINT='http://etcd:2379'
- AWS_ACCESS_KEY_ID=minio
@@ -77,7 +80,8 @@ services:
- minio_create_buckets
safekeeper1:
image: neondatabase/neon:${TAG:-latest}
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
- SAFEKEEPER_ID=1
@@ -106,7 +110,8 @@ services:
- minio_create_buckets
safekeeper2:
image: neondatabase/neon:${TAG:-latest}
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
- SAFEKEEPER_ID=2
@@ -135,7 +140,8 @@ services:
- minio_create_buckets
safekeeper3:
image: neondatabase/neon:${TAG:-latest}
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
- SAFEKEEPER_ID=3
@@ -164,18 +170,21 @@ services:
- minio_create_buckets
compute:
restart: always
build:
context: ./image/compute
context: ./compute_wrapper/
args:
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
- TAG=${TAG:-latest}
- http_proxy=$http_proxy
- https_proxy=$https_proxy
environment:
- PG_VERSION=${PG_VERSION:-14}
#- RUST_BACKTRACE=1
# Mount the test files directly, for faster editing cycle.
volumes:
- ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute/shell/:/shell/
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute_wrapper/shell/:/shell/
ports:
- 55433:55433 # pg protocol handler
- 3080:3080 # http endpoints

View File

@@ -0,0 +1,60 @@
#!/bin/bash
# A basic test to ensure Docker images are built correctly.
# Build a wrapper around the compute, start all services and runs a simple SQL query.
# Repeats the process for all currenly supported Postgres versions.
# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file
# Their defaults point at DockerHub `neondatabase/neon:latest` image.`,
# to verify custom image builds (e.g pre-published ones).
# XXX: Current does not work on M1 macs due to x86_64 Docker images compiled only, and no seccomp support in M1 Docker emulation layer.
set -eux -o pipefail
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml
COMPUTE_CONTAINER_NAME=docker-compose-compute-1
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"
cleanup() {
echo "show container information"
docker ps
docker compose -f $COMPOSE_FILE logs
echo "stop containers..."
docker compose -f $COMPOSE_FILE down
}
echo "clean up containers if exists"
cleanup
for pg_version in 14 15; do
echo "start containers (pg_version=$pg_version)."
PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d
echo "wait until the compute is ready. timeout after 60s. "
cnt=0
while sleep 1; do
# check timeout
cnt=`expr $cnt + 1`
if [ $cnt -gt 60 ]; then
echo "timeout before the compute is ready."
cleanup
exit 1
fi
# check if the compute is ready
set +o pipefail
result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l`
set -o pipefail
if [ $result -eq 1 ]; then
echo "OK. The compute is ready to connect."
echo "execute simple queries."
docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
cleanup
break
fi
done
done

View File

@@ -1,10 +0,0 @@
ARG COMPUTE_IMAGE=compute-node-v14:latest
FROM neondatabase/${COMPUTE_IMAGE}
USER root
RUN apt-get update && \
apt-get install -y curl \
jq \
netcat
USER postgres

View File

@@ -37,7 +37,7 @@
- [Source view](./sourcetree.md)
- [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging]()
- [Error handling and logging](./error-handling.md)
- [Testing]()
- [Unit testing]()
- [Integration testing]()

198
docs/error-handling.md Normal file
View File

@@ -0,0 +1,198 @@
# Error handling and logging
## Logging errors
The principle is that errors are logged when they are handled. If you
just propagate an error to the caller in a function, you don't need to
log it; the caller will. But if you consume an error in a function,
you *must* log it (if it needs to be logged at all).
For example:
```rust
fn read_motd_file() -> std::io::Result<String> {
let mut f = File::open("/etc/motd")?;
let mut result = String::new();
f.read_to_string(&mut result)?;
result
}
```
Opening or reading the file could fail, but there is no need to log
the error here. The function merely propagates the error to the
caller, and it is up to the caller to log the error or propagate it
further, if the failure is not expected. But if, for example, it is
normal that the "/etc/motd" file doesn't exist, the caller can choose
to silently ignore the error, or log it as an INFO or DEBUG level
message:
```rust
fn get_message_of_the_day() -> String {
// Get the motd from /etc/motd, or return the default proverb
match read_motd_file() {
Ok(motd) => motd,
Err(err) => {
// It's normal that /etc/motd doesn't exist, but if we fail to
// read it for some other reason, that's unexpected. The message
// of the day isn't very important though, so we just WARN and
// continue with the default in any case.
if err.kind() != std::io::ErrorKind::NotFound {
tracing::warn!("could not read \"/etc/motd\": {err:?}");
}
"An old error is always more popular than a new truth. - German proverb"
}
}
}
```
## Error types
We use the `anyhow` crate widely. It contains many convenient macros
like `bail!` and `ensure!` to construct and return errors, and to
propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
A downside of `anyhow::Error` is that the caller cannot distinguish
between different error cases. Most errors are propagated all the way
to the mgmt API handler function, or the main loop that handles a
connection with the compute node, and they are all handled the same
way: the error is logged and returned to the client as an HTTP or
libpq error.
But in some cases, we need to distinguish between errors and handle
them differently. For example, attaching a tenant to the pageserver
could fail either because the tenant has already been attached, or
because we could not load its metadata from cloud storage. The first
case is more or less expected. The console sends the Attach request to
the pageserver, and the pageserver completes the operation, but the
network connection might be lost before the console receives the
response. The console will retry the operation in that case, but the
tenant has already been attached. It is important that the pagserver
responds with the HTTP 403 Already Exists error in that case, rather
than a generic HTTP 500 Internal Server Error.
If you need to distinguish between different kinds of errors, create a
new `Error` type. The `thiserror` crate is useful for that. But in
most cases `anyhow::Error` is good enough.
## Panics
Depending on where a panic happens, it can cause the whole pageserver
or safekeeper to restart, or just a single tenant. In either case,
that is pretty bad and causes an outage. Avoid panics. Never use
`unwrap()` or other calls that might panic, to verify inputs from the
network or from disk.
It is acceptable to use functions that might panic, like `unwrap()`, if
it is obvious that it cannot panic. For example, if you have just
checked that a variable is not None, it is OK to call `unwrap()` on it,
but it is still preferable to use `expect("reason")` instead to explain
why the function cannot fail.
`assert!` and `panic!` are reserved for checking clear invariants and
very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
or `bail!` instead.
## Error levels
`tracing::Level` doesn't provide very clear guidelines on what the
different levels mean, or when to use which level. Here is how we use
them:
### Error
Examples:
- could not open file "foobar"
- invalid tenant id
Errors are not expected to happen during normal operation. Incorrect
inputs from client can cause ERRORs. For example, if a client tries to
call a mgmt API that doesn't exist, or if a compute node sends passes
an LSN that has already been garbage collected away.
These should *not* happen during normal operations. "Normal
operations" is not a very precise concept. But for example, disk
errors are not expected to happen when the system is working, so those
count as Errors. However, if a TCP connection to a compute node is
lost, that is not considered an Error, because it doesn't affect the
pageserver's or safekeeper's operation in any way, and happens fairly
frequently when compute nodes are shut down, or are killed abruptly
because of errors in the compute.
**Errors are monitored, and always need human investigation to determine
the cause.**
Whether something should be logged at ERROR, WARNING or INFO level can
depend on the callers and clients. For example, it might be unexpected
and a sign of a serious issue if the console calls the
"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
would be appropriate in that case. But if the console routinely calls
the API after deleting a timeline, to check if the deletion has
completed, then it would be totally normal and an INFO or DEBUG level
message would be more appropriate. If a message is logged as an ERROR,
but it in fact happens frequently in production and never requires any
action, it should probably be demoted to an INFO level message.
### Warn
Examples:
- could not remove temporary file "foobar.temp"
- unrecognized file "foobar" in timeline directory
Warnings are similar to Errors, in that they should not happen
when the system is operating normally. The difference between Error and
Warning is that an Error means that the operation failed, whereas Warning
means that something unexpected happened, but the operation continued anyway.
For example, if deleting a file fails because the file already didn't exist,
it should be logged as Warning.
> **Note:** The python regression tests, under `test_regress`, check the
> pageserver log after each test for any ERROR and WARN lines. If there are
> any ERRORs or WARNs that have not been explicitly listed in the test as
> allowed, the test is marked a failed. This is to catch unexpected errors
> e.g. in background operations, that don't cause immediate misbehaviour in
> the tested functionality.
### Info
Info level is used to log useful information when the system is
operating normally. Info level is appropriate e.g. for logging state
changes, background operations, and network connections.
Examples:
- "system is shutting down"
- "tenant was created"
- "retrying S3 upload"
### Debug & Trace
Debug and Trace level messages are not printed to the log in our normal
production configuration, but could be enabled for a specific server or
tenant, to aid debugging. (Although we don't actually have that
capability as of this writing).
## Context
We use logging "spans" to hold context information about the current
operation. Almost every operation happens on a particular tenant and
timeline, so we enter a span with the "tenant_id" and "timeline_id"
very early when processing an incoming API request, for example. All
background operations should also run in a span containing at least
those two fields, and any other parameters or information that might
be useful when debugging an error that might happen when performing
the operation.
TODO: Spans are not captured in the Error when it is created, but when
the error is logged. It would be more useful to capture them at Error
creation. We should consider using `tracing_error::SpanTrace` to do
that.
## Error message style
PostgreSQL has a style guide for writing error messages:
https://www.postgresql.org/docs/current/error-style-guide.html
Follow that guide when writing error messages in the PostgreSQL
extension. We don't follow it strictly in the pageserver and
safekeeper, but the advice in the PostgreSQL style guide is generally
good, and you can't go wrong by following it.

View File

@@ -0,0 +1,246 @@
# Coordinating access of multiple pageservers to the same s3 data
## Motivation
There are some blind spots around coordinating access of multiple pageservers
to the same s3 data. Currently this is applicable only to tenant relocation
case, but in the future we'll need to solve similar problems for
replica/standby pageservers.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Pageserver
## The problem
### Relocation
During relocation both pageservers can write to s3. This should be ok for all
data except the `index_part.json`. For index part it causes problems during
compaction/gc because they remove files from index/s3.
Imagine this case:
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
participant PS2
PS1->>S3: Uploads L1, L2 <br/> Index contains L1 L2
PS2->>S3: Attach called, sees L1, L2
PS1->>S3: Compaction comes <br/> Removes L1, adds L3
note over S3: Index now L2, L3
PS2->>S3: Uploads new layer L4 <br/> (added to previous view of the index)
note over S3: Index now L1, L2, L4
```
At this point it is not possible to restore from index, it contains L2 which
is no longer available in s3 and doesnt contain L3 added by compaction by the
first pageserver. So if any of the pageservers restart initial sync will fail
(or in on-demand world it will fail a bit later during page request from
missing layer)
### Standby pageserver
Another related case is standby pageserver. In this case second pageserver can
be used as a replica to scale reads and serve as a failover target in case
first one fails.
In this mode second pageserver needs to have the same picture of s3 files to
be able to load layers on-demand. To accomplish that second pageserver
cannot run gc/compaction jobs. Instead it needs to receive updates for index
contents. (There is no need to run walreceiver on the second pageserver then).
## Observations
- If both pageservers ingest wal then their layer set diverges, because layer
file generation is not deterministic
- If one of the pageservers does not ingest wal (and just picks up layer
updates) then it lags behind and cannot really answer queries in the same
pace as the primary one
- Can compaction help make layers deterministic? E g we do not upload level
zero layers and construction of higher levels should be deterministic.
This way we can guarantee that layer creation by timeout wont mess things up.
This way one pageserver uploads data and second one can just ingest it.
But we still need some form of election
## Solutions
### Manual orchestration
One possible solution for relocation case is to orchestrate background jobs
from outside. The oracle who runs migration can turn off background jobs on
PS1 before migration and then run migration -> enable them on PS2. The problem
comes if migration fails. In this case in order to resume background jobs
oracle needs to guarantee that PS2 doesnt run background jobs and if it doesnt
respond then PS1 is stuck unable to run compaction/gc. This cannot be solved
without human ensuring that no upload from PS2 can happen. In order to be able
to resolve this automatically CAS is required on S3 side so pageserver can
avoid overwriting index part if it is no longer the leading one
Note that flag that disables background jobs needs to be persistent, because
otherwise pageserver restart will clean it
### Avoid index_part.json
Index part consists of two parts, list of layers and metadata. List of layers
can be easily obtained by `ListObjects` S3 API method. But what to do with
metadata? Create metadata instance for each checkpoint and add some counter
to the file name?
Back to potentially long s3 ls.
### Coordination based approach
Do it like safekeepers chose leader for WAL upload. Ping each other and decide
based on some heuristics e g smallest node id. During relocation PS1 sends
"resign" ping message so others can start election without waiting for a timeout.
This still leaves metadata question open and non deterministic layers are a
problem as well
### Avoid metadata file
One way to eliminate metadata file is to store it in layer files under some
special key. This may resonate with intention to keep all relation sizes in
some special segment to avoid initial download during size calculation.
Maybe with that we can even store pre calculated value.
As a downside each checkpoint gets 512 bytes larger.
If we entirely avoid metadata file this opens up many approaches
* * *
During discussion it seems that we converged on the approach consisting of:
- index files stored per pageserver in the same timeline directory. With that
index file name starts to look like: `<pageserver_node_id>_index_part.json`.
In such set up there are no concurrent overwrites of index file by different
pageservers.
- For replica pageservers the solution would be for primary to broadcast index
changes to any followers with an ability to check index files in s3 and
restore the full state. To properly merge changes with index files we can use
a counter that is persisted in an index file, is incremented on every change
to it and passed along with broadcasted change. This way we can determine
whether we need to apply change to the index state or not.
- Responsibility for running background jobs is assigned externally. Pageserver
keeps locally persistent flag for each tenant that indicates whether this
pageserver is considered as primary one or not. TODO what happends if we
crash and cannot start for some extended period of time? Control plane can
assign ownership to some other pageserver. Pageserver needs some way to check
if its still the blessed one. Maybe by explicit request to control plane on
start.
Requirement for deterministic layer generation was considered overly strict
because of two reasons:
- It can limit possible optimizations e g when pageserver wants to reshuffle
some data locally and doesnt want to coordinate this
- The deterministic algorithm itself can change so during deployments for some
time there will be two different version running at the same time which can
cause non determinism
### External elections
The above case with lost state in this schema with externally managed
leadership is represented like this:
Note that here we keep objects list in the index file.
```mermaid
sequenceDiagram
autonumber
participant PS1
participant CP as Control Plane
participant S3
participant PS2
note over PS1,PS2: PS1 starts up and still a leader
PS1->>CP: Am I still the leader for Tenant X?
activate CP
CP->>PS1: Yes
deactivate CP
PS1->>S3: Fetch PS1 index.
note over PS1: Continue operations, start backround jobs
note over PS1,PS2: PS1 starts up and still and is not a leader anymore
PS1->>CP: Am I still the leader for Tenant X?
CP->>PS1: No
PS1->>PS2: Subscribe to index changes
PS1->>S3: Fetch PS1 and PS2 indexes
note over PS1: Combine index file to include layers <br> from both indexes to be able <br> to see newer files from leader (PS2)
note over PS1: Continue operations, do not start background jobs
```
### Internal elections
To manage leadership internally we can use broker to exchange pings so nodes
can decide on the leader roles. In case multiple pageservers are active leader
is the one with lowest node id.
Operations with internally managed elections:
```mermaid
sequenceDiagram
autonumber
participant PS1
participant S3
note over PS1: Starts up
note over PS1: Subscribes to changes, waits for two ping <br> timeouts to see if there is a leader
PS1->>S3: Fetch indexes from s3
alt there is a leader
note over PS1: do not start background jobs, <br> continue applying index updates
else there is no leader
note over PS1: start background jobs, <br> broadcast index changes
end
note over PS1,S3: Then the picture is similar to external elections <br> the difference is that follower can become a leader <br> if there are no pings after some timeout new leader gets elected
```
### Eviction
When two pageservers operate on a tenant for extended period of time follower
doesnt perform write operations in s3. When layer is evicted follower relies
on updates from primary to get info about layers it needs to cover range for
evicted layer.
Note that it wont match evicted layer exactly, so layers will overlap and
lookup code needs to correctly handle that.
### Relocation flow
Actions become:
- Attach tenant to new pageserver
- New pageserver becomes follower since previous one is still leading
- New pageserver starts replicating from safekeepers but does not upload layers
- Detach is called on the old one
- New pageserver becomes leader after it realizes that old one disappeared
### Index File
Using `s3 ls` on startup simplifies things, but we still need metadata, so we
need to fetch index files anyway. If they contain list of files we can combine
them and avoid costly `s3 ls`
### Remaining issues
- More than one remote consistent lsn for safekeepers to know
Anything else?
### Proposed solution
To recap. On meeting we converged on approach with external elections but I
think it will be overall harder to manage and will introduce a dependency on
control plane for pageserver. Using separate index files for each pageserver
consisting of log of operations and a metadata snapshot should be enough.
### What we need to get there?
- Change index file structure to contain log of changes instead of just the
file list
- Implement pinging/elections for pageservers

View File

@@ -52,6 +52,10 @@ PostgreSQL extension that implements storage manager API and network communicati
PostgreSQL extension that contains functions needed for testing and debugging.
`/pgxn/neon_walredo`:
Library to run Postgres as a "WAL redo process" in the pageserver.
`/safekeeper`:
The neon WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.

View File

@@ -9,6 +9,7 @@ serde_with = "2.0"
const_format = "0.2.21"
anyhow = { version = "1.0", features = ["backtrace"] }
bytes = "1.0.1"
byteorder = "1.4.3"
utils = { path = "../utils" }
postgres_ffi = { path = "../postgres_ffi" }

View File

@@ -1,5 +1,6 @@
use std::num::NonZeroU64;
use byteorder::{BigEndian, ReadBytesExt};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::{
@@ -9,7 +10,7 @@ use utils::{
use crate::reltag::RelTag;
use anyhow::bail;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use bytes::{BufMut, Bytes, BytesMut};
/// A state of a tenant in pageserver's memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -72,6 +73,7 @@ pub struct TenantCreateRequest {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
#[serde_as]
@@ -111,6 +113,7 @@ pub struct TenantConfigRequest {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
impl TenantConfigRequest {
@@ -129,6 +132,7 @@ impl TenantConfigRequest {
walreceiver_connect_timeout: None,
lagging_wal_timeout: None,
max_lsn_wal_lag: None,
trace_read_requests: None,
}
}
}
@@ -225,6 +229,7 @@ pub struct TimelineGcRequest {
}
// Wrapped in libpq CopyData
#[derive(PartialEq, Eq)]
pub enum PagestreamFeMessage {
Exists(PagestreamExistsRequest),
Nblocks(PagestreamNblocksRequest),
@@ -241,21 +246,21 @@ pub enum PagestreamBeMessage {
DbSize(PagestreamDbSizeResponse),
}
#[derive(Debug)]
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamExistsRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug)]
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamNblocksRequest {
pub latest: bool,
pub lsn: Lsn,
pub rel: RelTag,
}
#[derive(Debug)]
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamGetPageRequest {
pub latest: bool,
pub lsn: Lsn,
@@ -263,7 +268,7 @@ pub struct PagestreamGetPageRequest {
pub blkno: u32,
}
#[derive(Debug)]
#[derive(Debug, PartialEq, Eq)]
pub struct PagestreamDbSizeRequest {
pub latest: bool,
pub lsn: Lsn,
@@ -296,52 +301,98 @@ pub struct PagestreamDbSizeResponse {
}
impl PagestreamFeMessage {
pub fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
pub fn serialize(&self) -> Bytes {
let mut bytes = BytesMut::new();
match self {
Self::Exists(req) => {
bytes.put_u8(0);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
bytes.put_u8(req.rel.forknum);
}
Self::Nblocks(req) => {
bytes.put_u8(1);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
bytes.put_u8(req.rel.forknum);
}
Self::GetPage(req) => {
bytes.put_u8(2);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.rel.spcnode);
bytes.put_u32(req.rel.dbnode);
bytes.put_u32(req.rel.relnode);
bytes.put_u8(req.rel.forknum);
bytes.put_u32(req.blkno);
}
Self::DbSize(req) => {
bytes.put_u8(3);
bytes.put_u8(if req.latest { 1 } else { 0 });
bytes.put_u64(req.lsn.0);
bytes.put_u32(req.dbnode);
}
}
bytes.into()
}
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
// TODO these gets can fail
// these correspond to the NeonMessageTag enum in pagestore_client.h
//
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
let msg_tag = body.get_u8();
let msg_tag = body.read_u8()?;
match msg_tag {
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
})),
1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
})),
2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
rel: RelTag {
spcnode: body.get_u32(),
dbnode: body.get_u32(),
relnode: body.get_u32(),
forknum: body.get_u8(),
spcnode: body.read_u32::<BigEndian>()?,
dbnode: body.read_u32::<BigEndian>()?,
relnode: body.read_u32::<BigEndian>()?,
forknum: body.read_u8()?,
},
blkno: body.get_u32(),
blkno: body.read_u32::<BigEndian>()?,
})),
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: body.get_u8() != 0,
lsn: Lsn::from(body.get_u64()),
dbnode: body.get_u32(),
latest: body.read_u8()? != 0,
lsn: Lsn::from(body.read_u64::<BigEndian>()?),
dbnode: body.read_u32::<BigEndian>()?,
})),
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
_ => bail!("unknown smgr message tag: {:?}", msg_tag),
}
}
}
@@ -380,3 +431,58 @@ impl PagestreamBeMessage {
bytes.into()
}
}
#[cfg(test)]
mod tests {
use bytes::Buf;
use super::*;
#[test]
fn test_pagestream() {
// Test serialization/deserialization of PagestreamFeMessage
let messages = vec![
PagestreamFeMessage::Exists(PagestreamExistsRequest {
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
dbnode: 3,
relnode: 4,
},
}),
PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
latest: false,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
dbnode: 3,
relnode: 4,
},
}),
PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
latest: true,
lsn: Lsn(4),
rel: RelTag {
forknum: 1,
spcnode: 2,
dbnode: 3,
relnode: 4,
},
blkno: 7,
}),
PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
latest: true,
lsn: Lsn(4),
dbnode: 7,
}),
];
for msg in messages {
let bytes = msg.serialize();
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
assert!(msg == reconstructed);
}
}
}

16
libs/pq_proto/Cargo.toml Normal file
View File

@@ -0,0 +1,16 @@
[package]
name = "pq_proto"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0"
bytes = "1.0.1"
pin-project-lite = "0.2.7"
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
rand = "0.8.3"
serde = { version = "1.0", features = ["derive"] }
tokio = { version = "1.17", features = ["macros"] }
tracing = "0.1"
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -2,7 +2,9 @@
//! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
//! on message formats.
use crate::sync::{AsyncishRead, SyncFuture};
// Tools for calling certain async methods in sync contexts.
pub mod sync;
use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use postgres_protocol::PG_EPOCH;
@@ -16,6 +18,7 @@ use std::{
str,
time::{Duration, SystemTime},
};
use sync::{AsyncishRead, SyncFuture};
use tokio::io::AsyncReadExt;
use tracing::{trace, warn};
@@ -198,7 +201,7 @@ impl FeMessage {
///
/// ```
/// # use std::io;
/// # use utils::pq_proto::FeMessage;
/// # use pq_proto::FeMessage;
/// #
/// # fn process_message(msg: FeMessage) -> anyhow::Result<()> {
/// # Ok(())
@@ -302,6 +305,7 @@ impl FeStartupPacket {
Err(e) => return Err(e.into()),
};
#[allow(clippy::manual_range_contains)]
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
bail!("invalid message length");
}

View File

@@ -29,7 +29,7 @@ impl<S, T: Future> SyncFuture<S, T> {
/// Example:
///
/// ```
/// # use utils::sync::SyncFuture;
/// # use pq_proto::sync::SyncFuture;
/// # use std::future::Future;
/// # use tokio::io::AsyncReadExt;
/// #

3
libs/tenant_size_model/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
*.dot
*.png
*.svg

View File

@@ -0,0 +1,8 @@
[package]
name = "tenant_size_model"
version = "0.1.0"
edition = "2021"
publish = false
[dependencies]
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -0,0 +1,13 @@
all: 1.svg 2.svg 3.svg 4.svg 1.png 2.png 3.png 4.png
../../target/debug/tenant_size_model: Cargo.toml src/main.rs src/lib.rs
cargo build --bin tenant_size_model
%.svg: %.dot
dot -Tsvg $< > $@
%.png: %.dot
dot -Tpng $< > $@
%.dot: ../../target/debug/tenant_size_model
../../target/debug/tenant_size_model $* > $@

View File

@@ -0,0 +1,7 @@
# Logical size + WAL pricing
This is a simulator to calculate the tenant size in different scenarios,
using the "Logical size + WAL" method. Makefile produces diagrams used in a
private presentation:
https://docs.google.com/presentation/d/1OapE4k11xmcwMh7I7YvNWGC63yCRLh6udO9bXZ-fZmo/edit?usp=sharing

View File

@@ -0,0 +1,349 @@
use std::borrow::Cow;
use std::collections::HashMap;
/// Pricing model or history size builder.
///
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
/// type.
pub struct Storage<K: 'static> {
segments: Vec<Segment>,
/// Mapping from the branch name to the index of a segment describing it's latest state.
branches: HashMap<K, usize>,
}
/// Snapshot of a branch.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Segment {
/// Previous segment index into ['Storage::segments`], if any.
parent: Option<usize>,
/// Description of how did we get to this state.
///
/// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
/// modifying a branch directly.
pub op: Cow<'static, str>,
/// LSN before this state
start_lsn: u64,
/// LSN at this state
pub end_lsn: u64,
/// Logical size before this state
start_size: u64,
/// Logical size at this state
pub end_size: u64,
/// Indices to [`Storage::segments`]
///
/// FIXME: this could be an Option<usize>
children_after: Vec<usize>,
/// Determined by `retention_period` given to [`Storage::calculate`]
pub needed: bool,
}
//
//
//
//
// *-g--*---D--->
// /
// /
// / *---b----*-B--->
// / /
// / /
// -----*--e---*-----f----* C
// E \
// \
// *--a---*---A-->
//
// If A and B need to be retained, is it cheaper to store
// snapshot at C+a+b, or snapshots at A and B ?
//
// If D also needs to be retained, which is cheaper:
//
// 1. E+g+e+f+a+b
// 2. D+C+a+b
// 3. D+A+B
/// [`Segment`] which has had it's size calculated.
pub struct SegmentSize {
pub seg_id: usize,
pub method: SegmentMethod,
this_size: u64,
pub children: Vec<SegmentSize>,
}
impl SegmentSize {
fn total(&self) -> u64 {
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
}
pub fn total_children(&self) -> u64 {
if self.method == SnapshotAfter {
self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
} else {
self.children.iter().fold(0, |acc, x| acc + x.total())
}
}
}
/// Different methods to retain history from a particular state
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum SegmentMethod {
SnapshotAfter,
Wal,
WalNeeded,
Skipped,
}
use SegmentMethod::*;
impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
/// Creates a new storage with the given default branch name.
pub fn new(initial_branch: K) -> Storage<K> {
let init_segment = Segment {
op: "".into(),
needed: false,
parent: None,
start_lsn: 0,
end_lsn: 0,
start_size: 0,
end_size: 0,
children_after: Vec::new(),
};
Storage {
segments: vec![init_segment],
branches: HashMap::from([(initial_branch, 0)]),
}
}
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
pub fn modify_branch<Q: ?Sized>(
&mut self,
branch: &Q,
op: Cow<'static, str>,
lsn_bytes: u64,
size_bytes: i64,
) where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
let lastseg_id = *self.branches.get(branch).unwrap();
let newseg_id = self.segments.len();
let lastseg = &mut self.segments[lastseg_id];
let newseg = Segment {
op,
parent: Some(lastseg_id),
start_lsn: lastseg.end_lsn,
end_lsn: lastseg.end_lsn + lsn_bytes,
start_size: lastseg.end_size,
end_size: (lastseg.end_size as i64 + size_bytes) as u64,
children_after: Vec::new(),
needed: false,
};
lastseg.children_after.push(newseg_id);
self.segments.push(newseg);
*self.branches.get_mut(branch).expect("read already") = newseg_id;
}
pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
self.modify_branch(branch, "insert".into(), bytes, bytes as i64);
}
pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
self.modify_branch(branch, "update".into(), bytes, 0i64);
}
pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64)
where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64));
}
/// Panics if the parent branch cannot be found.
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
// Find the right segment
let branchseg_id = *self
.branches
.get(parent)
.expect("should had found the parent by key");
let _branchseg = &mut self.segments[branchseg_id];
// Create branch name for it
self.branches.insert(name, branchseg_id);
}
pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
// Phase 1: Mark all the segments that need to be retained
for (_branch, &last_seg_id) in self.branches.iter() {
let last_seg = &self.segments[last_seg_id];
let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
let mut seg_id = last_seg_id;
loop {
let seg = &mut self.segments[seg_id];
if seg.end_lsn < cutoff_lsn {
break;
}
seg.needed = true;
if let Some(prev_seg_id) = seg.parent {
seg_id = prev_seg_id;
} else {
break;
}
}
}
// Phase 2: For each oldest segment in a chain that needs to be retained,
// calculate if we should store snapshot or WAL
self.size_from_snapshot_later(0)
}
fn size_from_wal(&self, seg_id: usize) -> SegmentSize {
let seg = &self.segments[seg_id];
let this_size = seg.end_lsn - seg.start_lsn;
let mut children = Vec::new();
// try both ways
for &child_id in seg.children_after.iter() {
// try each child both ways
let child = &self.segments[child_id];
let p1 = self.size_from_wal(child_id);
let p = if !child.needed {
let p2 = self.size_from_snapshot_later(child_id);
if p1.total() < p2.total() {
p1
} else {
p2
}
} else {
p1
};
children.push(p);
}
SegmentSize {
seg_id,
method: if seg.needed { WalNeeded } else { Wal },
this_size,
children,
}
}
fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize {
// If this is needed, then it's time to do the snapshot and continue
// with wal method.
let seg = &self.segments[seg_id];
//eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
if seg.needed {
let mut children = Vec::new();
for &child_id in seg.children_after.iter() {
// try each child both ways
let child = &self.segments[child_id];
let p1 = self.size_from_wal(child_id);
let p = if !child.needed {
let p2 = self.size_from_snapshot_later(child_id);
if p1.total() < p2.total() {
p1
} else {
p2
}
} else {
p1
};
children.push(p);
}
SegmentSize {
seg_id,
method: WalNeeded,
this_size: seg.start_size,
children,
}
} else {
// If any of the direct children are "needed", need to be able to reconstruct here
let mut children_needed = false;
for &child in seg.children_after.iter() {
let seg = &self.segments[child];
if seg.needed {
children_needed = true;
break;
}
}
let method1 = if !children_needed {
let mut children = Vec::new();
for child in seg.children_after.iter() {
children.push(self.size_from_snapshot_later(*child));
}
Some(SegmentSize {
seg_id,
method: Skipped,
this_size: 0,
children,
})
} else {
None
};
// If this a junction, consider snapshotting here
let method2 = if children_needed || seg.children_after.len() >= 2 {
let mut children = Vec::new();
for child in seg.children_after.iter() {
children.push(self.size_from_wal(*child));
}
Some(SegmentSize {
seg_id,
method: SnapshotAfter,
this_size: seg.end_size,
children,
})
} else {
None
};
match (method1, method2) {
(None, None) => panic!(),
(Some(method), None) => method,
(None, Some(method)) => method,
(Some(method1), Some(method2)) => {
if method1.total() < method2.total() {
method1
} else {
method2
}
}
}
}
}
pub fn into_segments(self) -> Vec<Segment> {
self.segments
}
}

View File

@@ -0,0 +1,268 @@
//! Tenant size model testing ground.
//!
//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
//! into pngs.
use tenant_size_model::{Segment, SegmentSize, Storage};
// Main branch only. Some updates on it.
fn scenario_1() -> (Vec<Segment>, SegmentSize) {
// Create main branch
let mut storage = Storage::new("main");
// Bulk load 5 GB of data to it
storage.insert("main", 5_000);
// Stream of updates
for _ in 0..5 {
storage.update("main", 1_000);
}
let size = storage.calculate(1000);
(storage.into_segments(), size)
}
// Main branch only. Some updates on it.
fn scenario_2() -> (Vec<Segment>, SegmentSize) {
// Create main branch
let mut storage = Storage::new("main");
// Bulk load 5 GB of data to it
storage.insert("main", 5_000);
// Stream of updates
for _ in 0..5 {
storage.update("main", 1_000);
}
// Branch
storage.branch("main", "child");
storage.update("child", 1_000);
// More updates on parent
storage.update("main", 1_000);
let size = storage.calculate(1000);
(storage.into_segments(), size)
}
// Like 2, but more updates on main
fn scenario_3() -> (Vec<Segment>, SegmentSize) {
// Create main branch
let mut storage = Storage::new("main");
// Bulk load 5 GB of data to it
storage.insert("main", 5_000);
// Stream of updates
for _ in 0..5 {
storage.update("main", 1_000);
}
// Branch
storage.branch("main", "child");
storage.update("child", 1_000);
// More updates on parent
for _ in 0..5 {
storage.update("main", 1_000);
}
let size = storage.calculate(1000);
(storage.into_segments(), size)
}
// Diverged branches
fn scenario_4() -> (Vec<Segment>, SegmentSize) {
// Create main branch
let mut storage = Storage::new("main");
// Bulk load 5 GB of data to it
storage.insert("main", 5_000);
// Stream of updates
for _ in 0..5 {
storage.update("main", 1_000);
}
// Branch
storage.branch("main", "child");
storage.update("child", 1_000);
// More updates on parent
for _ in 0..8 {
storage.update("main", 1_000);
}
let size = storage.calculate(1000);
(storage.into_segments(), size)
}
fn scenario_5() -> (Vec<Segment>, SegmentSize) {
let mut storage = Storage::new("a");
storage.insert("a", 5000);
storage.branch("a", "b");
storage.update("b", 4000);
storage.update("a", 2000);
storage.branch("a", "c");
storage.insert("c", 4000);
storage.insert("a", 2000);
let size = storage.calculate(5000);
(storage.into_segments(), size)
}
fn scenario_6() -> (Vec<Segment>, SegmentSize) {
use std::borrow::Cow;
const NO_OP: Cow<'static, str> = Cow::Borrowed("");
let branches = [
Some(0x7ff1edab8182025f15ae33482edb590a_u128),
Some(0xb1719e044db05401a05a2ed588a3ad3f),
Some(0xb68d6691c895ad0a70809470020929ef),
];
// compared to other scenarios, this one uses bytes instead of kB
let mut storage = Storage::new(None);
storage.branch(&None, branches[0]); // at 0
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
storage.branch(&branches[0], branches[1]); // at 108951064
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
storage.branch(&branches[0], branches[2]); // at 283415424
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
let size = storage.calculate(100_000);
(storage.into_segments(), size)
}
fn main() {
let args: Vec<String> = std::env::args().collect();
let scenario = if args.len() < 2 { "1" } else { &args[1] };
let (segments, size) = match scenario {
"1" => scenario_1(),
"2" => scenario_2(),
"3" => scenario_3(),
"4" => scenario_4(),
"5" => scenario_5(),
"6" => scenario_6(),
other => {
eprintln!("invalid scenario {}", other);
std::process::exit(1);
}
};
graphviz_tree(&segments, &size);
}
fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
use tenant_size_model::SegmentMethod::*;
let seg_id = node.seg_id;
let seg = segments.get(seg_id).unwrap();
let lsn = seg.end_lsn;
let size = seg.end_size;
let method = node.method;
println!(" {{");
println!(" node [width=0.1 height=0.1 shape=oval]");
let tenant_size = node.total_children();
let penwidth = if seg.needed { 6 } else { 3 };
let x = match method {
SnapshotAfter =>
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
Wal =>
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
WalNeeded =>
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
Skipped =>
format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
};
println!(" \"seg{seg_id}\" [{x}]");
println!(" }}");
// Recurse. Much of the data is actually on the edge
for child in node.children.iter() {
let child_id = child.seg_id;
graphviz_recurse(segments, child);
let edge_color = match child.method {
SnapshotAfter => "gray",
Wal => "black",
WalNeeded => "black",
Skipped => "gray",
};
println!(" {{");
println!(" edge [] ");
print!(" \"seg{seg_id}\" -> \"seg{child_id}\" [");
print!("color={edge_color}");
if child.method == WalNeeded {
print!(" penwidth=6");
}
if child.method == Wal {
print!(" penwidth=3");
}
let next = segments.get(child_id).unwrap();
if next.op.is_empty() {
print!(
" label=\"{} / {}\"",
next.end_lsn - seg.end_lsn,
(next.end_size as i128 - seg.end_size as i128)
);
} else {
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
}
println!("]");
println!(" }}");
}
}
fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
println!("digraph G {{");
println!(" fontname=\"Helvetica,Arial,sans-serif\"");
println!(" node [fontname=\"Helvetica,Arial,sans-serif\"]");
println!(" edge [fontname=\"Helvetica,Arial,sans-serif\"]");
println!(" graph [center=1 rankdir=LR]");
println!(" edge [dir=none]");
graphviz_recurse(segments, tree);
println!("}}");
}
#[test]
fn scenarios_return_same_size() {
type ScenarioFn = fn() -> (Vec<Segment>, SegmentSize);
let truths: &[(u32, ScenarioFn, _)] = &[
(line!(), scenario_1, 8000),
(line!(), scenario_2, 9000),
(line!(), scenario_3, 13000),
(line!(), scenario_4, 16000),
(line!(), scenario_5, 17000),
(line!(), scenario_6, 333_792_000),
];
for (line, scenario, expected) in truths {
let (_, size) = scenario();
assert_eq!(*expected, size.total_children(), "scenario on line {line}");
}
}

View File

@@ -9,9 +9,6 @@ anyhow = "1.0"
bincode = "1.3"
bytes = "1.0.1"
hyper = { version = "0.14.7", features = ["full"] }
pin-project-lite = "0.2.7"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
routerify = "3"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
@@ -33,8 +30,8 @@ once_cell = "1.13.0"
strum = "0.24"
strum_macros = "0.24"
metrics = { path = "../metrics" }
pq_proto = { path = "../pq_proto" }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]

View File

@@ -1,52 +0,0 @@
use postgres::Config;
pub fn connection_host_port(config: &Config) -> (String, u16) {
assert_eq!(
config.get_hosts().len(),
1,
"only one pair of host and port is supported in connection string"
);
assert_eq!(
config.get_ports().len(),
1,
"only one pair of host and port is supported in connection string"
);
let host = match &config.get_hosts()[0] {
postgres::config::Host::Tcp(host) => host.as_ref(),
postgres::config::Host::Unix(host) => host.to_str().unwrap(),
};
(host.to_owned(), config.get_ports()[0])
}
pub fn connection_address(config: &Config) -> String {
let (host, port) = connection_host_port(config);
format!("{}:{}", host, port)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_connection_host_port() {
let config: Config = "postgresql://no_user@localhost:64000/no_db"
.parse()
.unwrap();
assert_eq!(
connection_host_port(&config),
("localhost".to_owned(), 64000)
);
}
#[test]
#[should_panic(expected = "only one pair of host and port is supported in connection string")]
fn test_connection_host_port_multiple_ports() {
let config: Config = "postgresql://no_user@localhost:64000,localhost:64001/no_db"
.parse()
.unwrap();
assert_eq!(
connection_host_port(&config),
("localhost".to_owned(), 64000)
);
}
}

View File

@@ -204,6 +204,17 @@ pub struct TenantId(Id);
id_newtype!(TenantId);
/// Neon Connection Id identifies long-lived connections (for example a pagestream
/// connection with the page_service). Is used for better logging and tracing
///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct ConnectionId(Id);
id_newtype!(ConnectionId);
// A pair uniquely identifying Neon instance.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TenantTimelineId {

View File

@@ -1,8 +1,6 @@
//! `utils` is intended to be a place to put code that is shared
//! between other crates in this repository.
#![allow(clippy::manual_range_contains)]
/// `Lsn` type implements common tasks on Log Sequence Numbers
pub mod lsn;
/// SeqWait allows waiting for a future sequence number to arrive
@@ -17,10 +15,6 @@ pub mod vec_map;
pub mod bin_ser;
pub mod postgres_backend;
pub mod postgres_backend_async;
pub mod pq_proto;
// dealing with connstring parsing and handy access to it's parts
pub mod connstring;
// helper functions for creating and fsyncing
pub mod crashsafe;
@@ -39,13 +33,12 @@ pub mod sock_split;
// common log initialisation routine
pub mod logging;
pub mod lock_file;
// Misc
pub mod accum;
pub mod shutdown;
// Tools for calling certain async methods in sync contexts
pub mod sync;
// Utility for binding TcpListeners with proper socket options.
pub mod tcp_listener;

View File

@@ -0,0 +1,81 @@
//! A module to create and read lock files. A lock file ensures that only one
//! process is running at a time, in a particular directory.
//!
//! File locking is done using [`fcntl::flock`], which means that holding the
//! lock on file only prevents acquiring another lock on it; all other
//! operations are still possible on files. Other process can still open, read,
//! write, or remove the file, for example.
//! If the file is removed while a process is holding a lock on it,
//! the process that holds the lock does not get any error or notification.
//! Furthermore, you can create a new file with the same name and lock the new file,
//! while the old process is still running.
//! Deleting the lock file while the locking process is still running is a bad idea!
use std::{fs, os::unix::prelude::AsRawFd, path::Path};
use anyhow::Context;
use nix::fcntl;
use crate::crashsafe;
pub enum LockCreationResult {
Created {
new_lock_contents: String,
file: fs::File,
},
AlreadyLocked {
existing_lock_contents: String,
},
CreationFailed(anyhow::Error),
}
/// Creates a lock file in the path given and writes the given contents into the file.
/// Note: The lock is automatically released when the file closed. You might want to use Box::leak to make sure it lives until the end of the program.
pub fn create_lock_file(lock_file_path: &Path, contents: String) -> LockCreationResult {
let lock_file = match fs::OpenOptions::new()
.create(true) // O_CREAT
.write(true)
.open(lock_file_path)
.context("Failed to open lock file")
{
Ok(file) => file,
Err(e) => return LockCreationResult::CreationFailed(e),
};
match fcntl::flock(
lock_file.as_raw_fd(),
fcntl::FlockArg::LockExclusiveNonblock,
) {
Ok(()) => {
match lock_file
.set_len(0)
.context("Failed to truncate lockfile")
.and_then(|()| {
fs::write(lock_file_path, &contents).with_context(|| {
format!("Failed to write '{contents}' contents into lockfile")
})
})
.and_then(|()| {
crashsafe::fsync_file_and_parent(lock_file_path)
.context("Failed to fsync lockfile")
}) {
Ok(()) => LockCreationResult::Created {
new_lock_contents: contents,
file: lock_file,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(nix::errno::Errno::EAGAIN) => {
match fs::read_to_string(lock_file_path).context("Failed to read lockfile contents") {
Ok(existing_lock_contents) => LockCreationResult::AlreadyLocked {
existing_lock_contents,
},
Err(e) => LockCreationResult::CreationFailed(e),
}
}
Err(e) => {
LockCreationResult::CreationFailed(anyhow::anyhow!("Failed to lock lockfile: {e}"))
}
}
}

View File

@@ -1,10 +1,6 @@
use std::{
fs::{File, OpenOptions},
path::Path,
str::FromStr,
};
use std::str::FromStr;
use anyhow::{Context, Result};
use anyhow::Context;
use strum_macros::{EnumString, EnumVariantNames};
#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)]
@@ -25,19 +21,8 @@ impl LogFormat {
})
}
}
pub fn init(
log_filename: impl AsRef<Path>,
daemonize: bool,
log_format: LogFormat,
) -> Result<File> {
// Don't open the same file for output multiple times;
// the different fds could overwrite each other's output.
let log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", log_filename.as_ref()))?;
pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
let default_filter_str = "info";
// We fall back to printing all spans at info-level or above if
@@ -45,50 +30,16 @@ pub fn init(
let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_filter_str));
let x: File = log_file.try_clone().unwrap();
let base_logger = tracing_subscriber::fmt()
.with_env_filter(env_filter)
.with_target(false)
.with_ansi(false)
.with_writer(move || -> Box<dyn std::io::Write> {
// we are cloning and returning log file in order to allow redirecting daemonized stdout and stderr to it
// if we do not use daemonization (e.g. in docker) it is better to log to stdout directly
// for example to be in line with docker log command which expects logs comimg from stdout
if daemonize {
Box::new(x.try_clone().unwrap())
} else {
Box::new(std::io::stdout())
}
});
.with_writer(std::io::stdout);
match log_format {
LogFormat::Json => base_logger.json().init(),
LogFormat::Plain => base_logger.init(),
}
Ok(log_file)
}
// #[cfg(test)]
// Due to global logger, can't run tests in same process.
// So until there's a non-global one, the tests are in ../tests/ as separate files.
#[macro_export(local_inner_macros)]
macro_rules! test_init_file_logger {
($log_level:expr, $log_format:expr) => {{
use std::str::FromStr;
std::env::set_var("RUST_LOG", $log_level);
let tmp_dir = tempfile::TempDir::new().unwrap();
let log_file_path = tmp_dir.path().join("logfile");
let log_format = $crate::logging::LogFormat::from_str($log_format).unwrap();
let _log_file = $crate::logging::init(&log_file_path, true, log_format).unwrap();
let log_file = std::fs::OpenOptions::new()
.read(true)
.open(&log_file_path)
.unwrap();
log_file
}};
Ok(())
}

View File

@@ -13,7 +13,7 @@ use crate::seqwait::MonotonicCounter;
pub const XLOG_BLCKSZ: u32 = 8192;
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Serialize, Deserialize)]
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Lsn(pub u64);
@@ -138,7 +138,7 @@ impl FromStr for Lsn {
///
/// If the input string is missing the '/' character, then use `Lsn::from_hex`
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut splitter = s.split('/');
let mut splitter = s.trim().split('/');
if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
{
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
@@ -270,6 +270,11 @@ mod tests {
);
assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
let expected_lsn = Lsn(0x3C490F8);
assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
}
#[test]

View File

@@ -3,10 +3,10 @@
//! implementation determining how to process the queries. Currently its API
//! is rather narrow, but we can extend it once required.
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use crate::sock_split::{BidiStream, ReadStream, WriteStream};
use anyhow::{bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use rand::Rng;
use serde::{Deserialize, Serialize};
use std::fmt;

View File

@@ -4,9 +4,9 @@
//! is rather narrow, but we can extend it once required.
use crate::postgres_backend::AuthType;
use crate::pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use anyhow::{bail, Context, Result};
use bytes::{Bytes, BytesMut};
use pq_proto::{BeMessage, BeParameterStatusMessage, FeMessage, FeStartupPacket};
use rand::Rng;
use std::future::Future;
use std::net::SocketAddr;

View File

@@ -1,36 +0,0 @@
// This could be in ../src/logging.rs but since the logger is global, these
// can't be run in threads of the same process
use std::fs::File;
use std::io::{BufRead, BufReader, Lines};
use tracing::*;
use utils::test_init_file_logger;
fn read_lines(file: File) -> Lines<BufReader<File>> {
BufReader::new(file).lines()
}
#[test]
fn test_json_format_has_message_and_custom_field() {
std::env::set_var("RUST_LOG", "info");
let log_file = test_init_file_logger!("info", "json");
let custom_field: &str = "hi";
trace!(custom = %custom_field, "test log message");
debug!(custom = %custom_field, "test log message");
info!(custom = %custom_field, "test log message");
warn!(custom = %custom_field, "test log message");
error!(custom = %custom_field, "test log message");
let lines = read_lines(log_file);
for line in lines {
let content = line.unwrap();
let json_object = serde_json::from_str::<serde_json::Value>(&content).unwrap();
assert_eq!(json_object["fields"]["custom"], "hi");
assert_eq!(json_object["fields"]["message"], "test log message");
assert_ne!(json_object["level"], "TRACE");
assert_ne!(json_object["level"], "DEBUG");
}
}

View File

@@ -1,36 +0,0 @@
// This could be in ../src/logging.rs but since the logger is global, these
// can't be run in threads of the same process
use std::fs::File;
use std::io::{BufRead, BufReader, Lines};
use tracing::*;
use utils::test_init_file_logger;
fn read_lines(file: File) -> Lines<BufReader<File>> {
BufReader::new(file).lines()
}
#[test]
fn test_plain_format_has_message_and_custom_field() {
std::env::set_var("RUST_LOG", "warn");
let log_file = test_init_file_logger!("warn", "plain");
let custom_field: &str = "hi";
trace!(custom = %custom_field, "test log message");
debug!(custom = %custom_field, "test log message");
info!(custom = %custom_field, "test log message");
warn!(custom = %custom_field, "test log message");
error!(custom = %custom_field, "test log message");
let lines = read_lines(log_file);
for line in lines {
let content = line.unwrap();
serde_json::from_str::<serde_json::Value>(&content).unwrap_err();
assert!(content.contains("custom=hi"));
assert!(content.contains("test log message"));
assert!(!content.contains("TRACE"));
assert!(!content.contains("DEBUG"));
assert!(!content.contains("INFO"));
}
}

View File

@@ -12,62 +12,61 @@ testing = ["fail/failpoints"]
profiling = ["pprof"]
[dependencies]
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
anyhow = { version = "1.0", features = ["backtrace"] }
async-stream = "0.3"
async-trait = "0.1"
chrono = "0.4.19"
rand = "0.8.3"
regex = "1.4.5"
bytes = "1.0.1"
byteorder = "1.4.3"
bytes = "1.0.1"
chrono = "0.4.19"
clap = { version = "4.0", features = ["string"] }
close_fds = "0.3.2"
const_format = "0.2.21"
crc32c = "0.6.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
futures = "0.3.13"
git-version = "0.3.5"
hex = "0.4.3"
humantime = "2.1.0"
humantime-serde = "1.1.1"
hyper = "0.14"
itertools = "0.10.3"
clap = { version = "4.0", features = ["string"] }
daemonize = "0.4.1"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
nix = "0.25"
num-traits = "0.2.15"
once_cell = "1.13.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
anyhow = { version = "1.0", features = ["backtrace"] }
crc32c = "0.6.0"
thiserror = "1.0"
tar = "0.4.33"
humantime = "2.1.0"
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
rand = "0.8.3"
regex = "1.4.5"
rstar = "0.9.3"
scopeguard = "1.1.0"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
humantime-serde = "1.1.1"
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
toml_edit = { version = "0.14", features = ["easy"] }
scopeguard = "1.1.0"
const_format = "0.2.21"
tracing = "0.1.36"
signal-hook = "0.3.10"
svg_fmt = "0.4.1"
tar = "0.4.33"
thiserror = "1.0"
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
toml_edit = { version = "0.14", features = ["easy"] }
tracing = "0.1.36"
url = "2"
nix = "0.25"
once_cell = "1.13.0"
crossbeam-utils = "0.8.5"
fail = "0.5.0"
git-version = "0.3.5"
rstar = "0.9.3"
num-traits = "0.2.15"
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
walkdir = "2.3.2"
pageserver_api = { path = "../libs/pageserver_api" }
postgres_ffi = { path = "../libs/postgres_ffi" }
etcd_broker = { path = "../libs/etcd_broker" }
metrics = { path = "../libs/metrics" }
utils = { path = "../libs/utils" }
pageserver_api = { path = "../libs/pageserver_api" }
postgres_ffi = { path = "../libs/postgres_ffi" }
pq_proto = { path = "../libs/pq_proto" }
remote_storage = { path = "../libs/remote_storage" }
tenant_size_model = { path = "../libs/tenant_size_model" }
utils = { path = "../libs/utils" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
close_fds = "0.3.2"
walkdir = "2.3.2"
svg_fmt = "0.4.1"
[dev-dependencies]
criterion = "0.4"
@@ -77,3 +76,7 @@ tempfile = "3.2"
[[bench]]
name = "bench_layer_map"
harness = false
[[bench]]
name = "bench_walredo"
harness = false

File diff suppressed because one or more lines are too long

View File

@@ -1,17 +1,14 @@
//! Main entry point for the Page Server executable.
use remote_storage::GenericRemoteStorage;
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use anyhow::{anyhow, Context};
use clap::{Arg, ArgAction, Command};
use fail::FailScenario;
use nix::unistd::Pid;
use tracing::*;
use anyhow::{anyhow, bail, Context, Result};
use clap::{Arg, ArgAction, Command};
use daemonize::Daemonize;
use fail::FailScenario;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
http, page_cache, page_service, profiling, task_mgr,
@@ -19,20 +16,22 @@ use pageserver::{
task_mgr::{
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
},
tenant_mgr, virtual_file, LOG_FILE_NAME,
tenant_mgr, virtual_file,
};
use remote_storage::GenericRemoteStorage;
use utils::{
auth::JwtAuth,
logging,
lock_file, logging,
postgres_backend::AuthType,
project_git_version,
shutdown::exit_now,
signals::{self, Signal},
tcp_listener,
};
project_git_version!(GIT_VERSION);
const PID_FILE_NAME: &str = "pageserver.pid";
const FEATURES: &[&str] = &[
#[cfg(feature = "testing")]
"testing",
@@ -65,6 +64,7 @@ fn main() -> anyhow::Result<()> {
let workdir = workdir
.canonicalize()
.with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
let cfg_file_path = workdir.join("pageserver.toml");
// Set CWD to workdir for non-daemon modes
@@ -75,8 +75,6 @@ fn main() -> anyhow::Result<()> {
)
})?;
let daemonize = arg_matches.get_flag("daemonize");
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
ControlFlow::Continue(conf) => conf,
ControlFlow::Break(()) => {
@@ -102,7 +100,7 @@ fn main() -> anyhow::Result<()> {
virtual_file::init(conf.max_file_descriptors);
page_cache::init(conf.page_cache_size);
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
start_pageserver(conf).context("Failed to start pageserver")?;
scenario.teardown();
Ok(())
@@ -197,12 +195,34 @@ fn initialize_config(
})
}
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
// Initialize logger
let log_file = logging::init(LOG_FILE_NAME, daemonize, conf.log_format)?;
fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
logging::init(conf.log_format)?;
info!("version: {}", version());
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {
new_lock_contents,
file,
} => {
info!("Created lock file at {lock_file_path:?} with contenst {new_lock_contents}");
file
}
lock_file::LockCreationResult::AlreadyLocked {
existing_lock_contents,
} => anyhow::bail!(
"Could not lock pid file; pageserver is already running in {:?} with PID {}",
conf.workdir,
existing_lock_contents
),
lock_file::LockCreationResult::CreationFailed(e) => {
return Err(e.context(format!("Failed to create lock file at {lock_file_path:?}")))
}
};
// ensure that the lock file is held even if the main thread of the process is panics
// we need to release the lock file only when the current process is gone
let _ = Box::leak(Box::new(lock_file));
// TODO: Check that it looks like a valid repository before going further
// bind sockets before daemonizing so we report errors early and do not return until we are listening
@@ -218,33 +238,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
);
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
// NB: Don't spawn any threads before daemonizing!
if daemonize {
info!("daemonizing...");
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file
.try_clone()
.with_context(|| format!("Failed to clone log file '{:?}'", log_file))?;
let stderr = log_file;
let daemonize = Daemonize::new()
.pid_file("pageserver.pid")
.working_directory(".")
.stdout(stdout)
.stderr(stderr);
// XXX: The parent process should exit abruptly right after
// it has spawned a child to prevent coverage machinery from
// dumping stats into a `profraw` file now owned by the child.
// Otherwise, the coverage data will be damaged.
match daemonize.exit_action(|| exit_now(0)).start() {
Ok(_) => info!("Success, daemonized"),
Err(err) => bail!("{err}. could not daemonize. bailing."),
}
}
let signals = signals::install_shutdown_handlers()?;
// start profiler (if enabled)
@@ -347,14 +340,6 @@ fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(version())
.arg(
Arg::new("daemonize")
.short('d')
.long("daemonize")
.action(ArgAction::SetTrue)
.help("Run in the background"),
)
.arg(
Arg::new("init")
.long("init")

View File

@@ -8,7 +8,9 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::RemoteStorageConfig;
use std::env;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::Duration;
@@ -48,6 +50,9 @@ pub mod defaults {
pub const DEFAULT_LOG_FORMAT: &str = "plain";
pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize =
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
///
/// Default built-in configuration file.
///
@@ -67,6 +72,9 @@ pub mod defaults {
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
#log_format = '{DEFAULT_LOG_FORMAT}'
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
# [tenant_config]
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -132,6 +140,9 @@ pub struct PageServerConf {
pub broker_endpoints: Vec<Url>,
pub log_format: LogFormat,
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
}
#[derive(Debug, Clone, PartialEq, Eq)]
@@ -200,6 +211,8 @@ struct PageServerConfigBuilder {
broker_endpoints: BuilderValue<Vec<Url>>,
log_format: BuilderValue<LogFormat>,
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
}
impl Default for PageServerConfigBuilder {
@@ -228,6 +241,8 @@ impl Default for PageServerConfigBuilder {
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
broker_endpoints: Set(Vec::new()),
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
}
}
}
@@ -304,6 +319,10 @@ impl PageServerConfigBuilder {
self.log_format = BuilderValue::Set(log_format)
}
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
}
pub fn build(self) -> anyhow::Result<PageServerConf> {
let broker_endpoints = self
.broker_endpoints
@@ -349,6 +368,11 @@ impl PageServerConfigBuilder {
.broker_etcd_prefix
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
concurrent_tenant_size_logical_size_queries: self
.concurrent_tenant_size_logical_size_queries
.ok_or(anyhow!(
"missing concurrent_tenant_size_logical_size_queries"
))?,
})
}
}
@@ -391,6 +415,22 @@ impl PageServerConf {
)
}
pub fn traces_path(&self) -> PathBuf {
self.workdir.join("traces")
}
pub fn trace_path(
&self,
tenant_id: &TenantId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> PathBuf {
self.traces_path()
.join(tenant_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -476,6 +516,12 @@ impl PageServerConf {
"log_format" => builder.log_format(
LogFormat::from_config(&parse_toml_string(key, item)?)?
),
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
let input = parse_toml_string(key, item)?;
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
ConfigurableSemaphore::new(permits)
}),
_ => bail!("unrecognized pageserver option '{key}'"),
}
}
@@ -568,8 +614,9 @@ impl PageServerConf {
PathBuf::from(format!("../tmp_check/test_{test_name}"))
}
#[cfg(test)]
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
PageServerConf {
id: NodeId(0),
wait_lsn_timeout: Duration::from_secs(60),
@@ -580,7 +627,7 @@ impl PageServerConf {
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
superuser: "cloud_admin".to_string(),
workdir: repo_dir,
pg_distrib_dir: PathBuf::new(),
pg_distrib_dir,
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,
@@ -589,6 +636,7 @@ impl PageServerConf {
broker_endpoints: Vec::new(),
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
}
}
}
@@ -654,6 +702,58 @@ fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
.collect()
}
/// Configurable semaphore permits setting.
///
/// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty
/// semaphore cannot be distinguished, leading any feature using these to await forever (or until
/// new permits are added).
#[derive(Debug, Clone)]
pub struct ConfigurableSemaphore {
initial_permits: NonZeroUsize,
inner: std::sync::Arc<tokio::sync::Semaphore>,
}
impl ConfigurableSemaphore {
pub const DEFAULT_INITIAL: NonZeroUsize = match NonZeroUsize::new(1) {
Some(x) => x,
None => panic!("const unwrap is not yet stable"),
};
/// Initializse using a non-zero amount of permits.
///
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
/// behave like [`futures::future::pending`], just waiting until new permits are added.
pub fn new(initial_permits: NonZeroUsize) -> Self {
ConfigurableSemaphore {
initial_permits,
inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
}
}
}
impl Default for ConfigurableSemaphore {
fn default() -> Self {
Self::new(Self::DEFAULT_INITIAL)
}
}
impl PartialEq for ConfigurableSemaphore {
fn eq(&self, other: &Self) -> bool {
// the number of permits can be increased at runtime, so we cannot really fulfill the
// PartialEq value equality otherwise
self.initial_permits == other.initial_permits
}
}
impl Eq for ConfigurableSemaphore {}
impl ConfigurableSemaphore {
pub fn inner(&self) -> &std::sync::Arc<tokio::sync::Semaphore> {
&self.inner
}
}
#[cfg(test)]
mod tests {
use std::{
@@ -725,6 +825,7 @@ log_format = 'json'
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
},
"Correct defaults should be used when no config values are provided"
);
@@ -770,6 +871,7 @@ log_format = 'json'
.expect("Failed to parse a valid broker endpoint URL")],
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
log_format: LogFormat::Json,
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
},
"Should be able to parse all basic config values correctly"
);

View File

@@ -354,6 +354,54 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/size:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
get:
description: |
Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
responses:
"200":
description: OK,
content:
application/json:
schema:
type: object
required:
- id
- size
properties:
id:
type: string
format: hex
size:
type: integer
description: |
Size metric in bytes.
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/:
parameters:
- name: tenant_id

View File

@@ -227,13 +227,10 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
let state = get_state(&request);
let timelines = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
let timelines = info_span!("timeline_list", tenant = %tenant_id).in_scope(|| {
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
Ok(tenant.list_timelines())
})
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
})?;
let mut response_data = Vec::with_capacity(timelines.len());
for timeline in timelines {
@@ -523,9 +520,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
check_permission(&request, Some(tenant_id))?;
// if tenant is in progress of downloading it can be absent in global tenant map
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
.await
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
let tenant = tenant_mgr::get_tenant(tenant_id, false);
let state = get_state(&request);
let remote_index = &state.remote_index;
@@ -571,6 +566,44 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
)
}
async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::InternalServerError)?;
// this can be long operation, it currently is not backed by any request coalescing or similar
let inputs = tenant
.gather_size_inputs()
.await
.map_err(ApiError::InternalServerError)?;
let size = inputs.calculate().map_err(ApiError::InternalServerError)?;
/// Private response type with the additional "unstable" `inputs` field.
///
/// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
/// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
#[serde_with::serde_as]
#[derive(serde::Serialize)]
struct TenantHistorySize {
#[serde_as(as = "serde_with::DisplayFromStr")]
id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes.
size: u64,
inputs: crate::tenant::size::ModelInputs,
}
json_response(
StatusCode::OK,
TenantHistorySize {
id: tenant_id,
size,
inputs,
},
)
}
// Helper function to standardize the error messages we produce on bad durations
//
// Intended to be used with anyhow's `with_context`, e.g.:
@@ -585,6 +618,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
check_permission(&request, None)?;
let request_data: TenantCreateRequest = json_request(&mut request).await?;
println!("tenant create: {:?}", request_data.trace_read_requests);
let remote_index = get_state(&request).remote_index.clone();
let mut tenant_conf = TenantConfOpt::default();
@@ -626,6 +660,9 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -713,6 +750,9 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -792,14 +832,14 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let _span_guard =
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
.await
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
.map_err(ApiError::InternalServerError)?;
@@ -835,6 +875,7 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
.map_err(ApiError::NotFound)?;
timeline
.checkpoint(CheckpointConfig::Forced)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
@@ -898,6 +939,7 @@ pub fn make_router(
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)
.get("/v1/tenant/:tenant_id", tenant_status)
.get("/v1/tenant/:tenant_id/size", tenant_size_handler)
.put("/v1/tenant/config", tenant_config_handler)
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)

View File

@@ -15,6 +15,7 @@ pub mod tenant;
pub mod tenant_config;
pub mod tenant_mgr;
pub mod tenant_tasks;
pub mod trace;
pub mod virtual_file;
pub mod walingest;
pub mod walreceiver;
@@ -43,8 +44,6 @@ pub const DEFAULT_PG_VERSION: u32 = 14;
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
pub const LOG_FILE_NAME: &str = "pageserver.log";
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
/// Config for the Repository checkpointer
@@ -81,7 +80,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
// There should be nothing left, but let's be sure
task_mgr::shutdown_tasks(None, None, None).await;
info!("Shut down successfully completed");
std::process::exit(exit_code);
}

View File

@@ -31,6 +31,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
"compact",
"create images",
"init logical size",
"logical size",
"load layer map",
"gc",
];
@@ -365,6 +366,7 @@ pub struct TimelineMetrics {
pub compact_time_histo: Histogram,
pub create_images_time_histo: Histogram,
pub init_logical_size_histo: Histogram,
pub logical_size_histo: Histogram,
pub load_layer_map_histo: Histogram,
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
@@ -397,6 +399,9 @@ impl TimelineMetrics {
let init_logical_size_histo = STORAGE_TIME
.get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
.unwrap();
let logical_size_histo = STORAGE_TIME
.get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id])
.unwrap();
let load_layer_map_histo = STORAGE_TIME
.get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
.unwrap();
@@ -428,6 +433,7 @@ impl TimelineMetrics {
compact_time_histo,
create_images_time_histo,
init_logical_size_histo,
logical_size_histo,
load_layer_map_histo,
last_record_gauge,
wait_lsn_time_histo,

View File

@@ -10,6 +10,7 @@
//
use anyhow::{bail, ensure, Context, Result};
use bytes::Buf;
use bytes::Bytes;
use futures::{Stream, StreamExt};
use pageserver_api::models::{
@@ -18,21 +19,23 @@ use pageserver_api::models::{
PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
PagestreamNblocksRequest, PagestreamNblocksResponse,
};
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
use std::io;
use std::net::TcpListener;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
use tokio::pin;
use tokio_util::io::StreamReader;
use tokio_util::io::SyncIoBridge;
use tracing::*;
use utils::id::ConnectionId;
use utils::{
auth::{self, Claims, JwtAuth, Scope},
id::{TenantId, TimelineId},
lsn::Lsn,
postgres_backend::AuthType,
postgres_backend_async::{self, PostgresBackend},
pq_proto::{BeMessage, FeMessage, RowDescriptor},
simple_rcu::RcuReadGuard,
};
@@ -45,6 +48,7 @@ use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
use crate::tenant_mgr;
use crate::trace::Tracer;
use crate::CheckpointConfig;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -72,6 +76,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
FeMessage::CopyData(bytes) => bytes,
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = format!("client terminated connection with Terminate message during COPY");
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {:?}", m);
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
@@ -83,10 +93,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
yield copy_data_bytes;
}
Ok(None) => {
let msg = "client closed connection";
let msg = "client closed connection during COPY";
pgb.write_message(&BeMessage::ErrorResponse(msg))?;
pgb.flush().await?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(e) => {
Err(io::Error::new(io::ErrorKind::Other, e))?;
@@ -267,6 +277,18 @@ impl PageServerHandler {
// so there is no need to reset the association
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Make request tracer if needed
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path = tenant
.conf
.trace_path(&tenant_id, &timeline_id, &connection_id);
Some(Tracer::new(path))
} else {
None
};
// Check that the timeline exists
let timeline = get_local_timeline(tenant_id, timeline_id)?;
@@ -299,7 +321,12 @@ impl PageServerHandler {
trace!("query: {copy_data_bytes:?}");
let neon_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
// Trace request if needed
if let Some(t) = tracer.as_mut() {
t.trace(&copy_data_bytes)
}
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
let response = match neon_fe_msg {
PagestreamFeMessage::Exists(req) => {
@@ -366,14 +393,12 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
// import_basebackup_from_tar() is not async, mainly because the Tar crate
// it uses is not async. So we need to jump through some hoops:
// - convert the input from client connection to a synchronous Read
// - use block_in_place()
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
timeline.initialize()?;
let copyin_stream = copyin_stream(pgb);
pin!(copyin_stream);
timeline
.import_basebackup_from_tar(&mut copyin_stream, base_lsn)
.await?;
// Drain the rest of the Copy data
let mut bytes_after_tar = 0;
@@ -438,7 +463,7 @@ impl PageServerHandler {
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.checkpoint(CheckpointConfig::Flush)?;
timeline.checkpoint(CheckpointConfig::Flush).await?;
info!("done");
Ok(())

View File

@@ -12,8 +12,12 @@
//!
use anyhow::{bail, Context};
use bytes::Bytes;
use futures::Stream;
use pageserver_api::models::TimelineState;
use tokio::sync::watch;
use tokio_util::io::StreamReader;
use tokio_util::io::SyncIoBridge;
use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
@@ -29,6 +33,7 @@ use std::io::Write;
use std::ops::Bound::Included;
use std::path::Path;
use std::path::PathBuf;
use std::pin::Pin;
use std::process::Command;
use std::process::Stdio;
use std::sync::Arc;
@@ -72,6 +77,8 @@ pub mod storage_layer;
mod timeline;
pub mod size;
use storage_layer::Layer;
pub use timeline::Timeline;
@@ -120,6 +127,9 @@ pub struct Tenant {
/// Makes every timeline to backup their files to remote storage.
upload_layers: bool,
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
}
/// A timeline with some of its files on disk, being initialized.
@@ -132,7 +142,7 @@ pub struct Tenant {
pub struct UninitializedTimeline<'t> {
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Timeline, TimelineUninitMark)>,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
}
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
@@ -164,7 +174,6 @@ impl UninitializedTimeline<'_> {
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
})?;
let new_timeline = Arc::new(new_timeline);
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
// TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least
@@ -192,6 +201,9 @@ impl UninitializedTimeline<'_> {
})?;
new_timeline.set_state(TimelineState::Active);
v.insert(Arc::clone(&new_timeline));
new_timeline.maybe_spawn_flush_loop();
new_timeline.launch_wal_receiver();
}
}
@@ -200,20 +212,28 @@ impl UninitializedTimeline<'_> {
}
/// Prepares timeline data by loading it from the basebackup archive.
pub fn import_basebackup_from_tar(
&self,
reader: impl std::io::Read,
pub async fn import_basebackup_from_tar(
self,
mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
base_lsn: Lsn,
) -> anyhow::Result<()> {
) -> anyhow::Result<Arc<Timeline>> {
let raw_timeline = self.raw_timeline()?;
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context(
|| {
format!(
"Failed to import basebackup for timeline {}/{}",
self.owning_tenant.tenant_id, self.timeline_id
)
},
)?;
// import_basebackup_from_tar() is not async, mainly because the Tar crate
// it uses is not async. So we need to jump through some hoops:
// - convert the input from client connection to a synchronous Read
// - use block_in_place()
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| {
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
.context("Failed to import basebackup")
})?;
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
// We want to run proper checkpoint before we mark timeline as available to outside world
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
raw_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
bail!("failpoint before-checkpoint-new-timeline");
@@ -221,16 +241,15 @@ impl UninitializedTimeline<'_> {
raw_timeline
.checkpoint(CheckpointConfig::Flush)
.with_context(|| {
format!(
"Failed to checkpoint after basebackup import for timeline {}/{}",
self.owning_tenant.tenant_id, self.timeline_id
)
})?;
Ok(())
.await
.context("Failed to checkpoint after basebackup import")?;
let timeline = self.initialize()?;
Ok(timeline)
}
fn raw_timeline(&self) -> anyhow::Result<&Timeline> {
fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
Ok(&self
.raw_timeline
.as_ref()
@@ -465,7 +484,7 @@ impl Tenant {
self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
}
None => self.bootstrap_timeline(new_timeline_id, pg_version)?,
None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
};
// Have added new timeline into the tenant, now its background tasks are needed.
@@ -483,7 +502,7 @@ impl Tenant {
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
/// to make tests more deterministic.
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
pub fn gc_iteration(
pub async fn gc_iteration(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
@@ -499,11 +518,13 @@ impl Tenant {
.map(|x| x.to_string())
.unwrap_or_else(|| "-".to_string());
STORAGE_TIME
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
.observe_closure_duration(|| {
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
})
{
let _timer = STORAGE_TIME
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
.start_timer();
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
.await
}
}
/// Perform one compaction iteration.
@@ -523,7 +544,6 @@ impl Tenant {
let timelines = self.timelines.lock().unwrap();
let timelines_to_compact = timelines
.iter()
.filter(|(_, timeline)| timeline.is_active())
.map(|(timeline_id, timeline)| (*timeline_id, timeline.clone()))
.collect::<Vec<_>>();
drop(timelines);
@@ -540,23 +560,24 @@ impl Tenant {
///
/// Used at graceful shutdown.
///
pub fn checkpoint(&self) -> anyhow::Result<()> {
pub async fn checkpoint(&self) -> anyhow::Result<()> {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// checkpoints. We don't want to block everything else while the
// checkpoint runs.
let timelines = self.timelines.lock().unwrap();
let timelines_to_checkpoint = timelines
.iter()
.map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline)))
.collect::<Vec<_>>();
drop(timelines);
let timelines_to_checkpoint = {
let timelines = self.timelines.lock().unwrap();
timelines
.iter()
.map(|(id, timeline)| (*id, Arc::clone(timeline)))
.collect::<Vec<_>>()
};
for (timeline_id, timeline) in &timelines_to_checkpoint {
let _entered =
info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id)
.entered();
timeline.checkpoint(CheckpointConfig::Flush)?;
for (id, timeline) in &timelines_to_checkpoint {
timeline
.checkpoint(CheckpointConfig::Flush)
.instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
.await?;
}
Ok(())
@@ -785,6 +806,13 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
}
pub fn get_trace_read_requests(&self) -> bool {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.trace_read_requests
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
}
pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
self.tenant_conf.write().unwrap().update(&new_tenant_conf);
}
@@ -835,6 +863,7 @@ impl Tenant {
remote_index,
upload_layers,
state,
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
}
}
@@ -956,8 +985,9 @@ impl Tenant {
// +-----baz-------->
//
//
// 1. Grab 'gc_cs' mutex to prevent new timelines from being created
// 2. Scan all timelines, and on each timeline, make note of the
// 1. Grab 'gc_cs' mutex to prevent new timelines from being created while Timeline's
// `gc_infos` are being refreshed
// 2. Scan collected timelines, and on each timeline, make note of the
// all the points where other timelines have been branched off.
// We will refrain from removing page versions at those LSNs.
// 3. For each timeline, scan all layer files on the timeline.
@@ -968,7 +998,7 @@ impl Tenant {
// - if a relation has a non-incremental persistent layer on a child branch, then we
// don't need to keep that in the parent anymore. But currently
// we do.
fn gc_iteration_internal(
async fn gc_iteration_internal(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
@@ -978,6 +1008,68 @@ impl Tenant {
let mut totals: GcResult = Default::default();
let now = Instant::now();
let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
// Perform GC for each timeline.
//
// Note that we don't hold the GC lock here because we don't want
// to delay the branch creation task, which requires the GC lock.
// A timeline GC iteration can be slow because it may need to wait for
// compaction (both require `layer_removal_cs` lock),
// but the GC iteration can run concurrently with branch creation.
//
// See comments in [`Tenant::branch_timeline`] for more information
// about why branch creation task can run concurrently with timeline's GC iteration.
for timeline in gc_timelines {
if task_mgr::is_shutdown_requested() {
// We were requested to shut down. Stop and return with the progress we
// made.
break;
}
// If requested, force flush all in-memory layers to disk first,
// so that they too can be garbage collected. That's
// used in tests, so we want as deterministic results as possible.
if checkpoint_before_gc {
timeline.checkpoint(CheckpointConfig::Forced).await?;
info!(
"timeline {} checkpoint_before_gc done",
timeline.timeline_id
);
}
let result = timeline.gc()?;
totals += result;
}
totals.elapsed = now.elapsed();
Ok(totals)
}
/// Refreshes the Timeline::gc_info for all timelines, returning the
/// vector of timelines which have [`Timeline::get_last_record_lsn`] past
/// [`Tenant::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
pub fn refresh_gc_info(&self) -> anyhow::Result<Vec<Arc<Timeline>>> {
// since this method can now be called at different rates than the configured gc loop, it
// might be that these configuration values get applied faster than what it was previously,
// since these were only read from the gc task.
let horizon = self.get_gc_horizon();
let pitr = self.get_pitr_interval();
// refresh all timelines
let target_timeline_id = None;
self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)
}
fn refresh_gc_info_internal(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
pitr: Duration,
) -> anyhow::Result<Vec<Arc<Timeline>>> {
// grab mutex to prevent new timelines from being created here.
let gc_cs = self.gc_cs.lock().unwrap();
@@ -995,11 +1087,7 @@ impl Tenant {
timelines
.iter()
.filter(|(_, timeline)| timeline.is_active())
.map(|(timeline_id, timeline_entry)| {
// This is unresolved question for now, how to do gc in presence of remote timelines
// especially when this is combined with branching.
// Somewhat related: https://github.com/neondatabase/neon/issues/999
if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() {
// If target_timeline is specified, we only need to know branchpoints of its children
if let Some(timeline_id) = target_timeline_id {
@@ -1053,41 +1141,7 @@ impl Tenant {
}
}
drop(gc_cs);
// Perform GC for each timeline.
//
// Note that we don't hold the GC lock here because we don't want
// to delay the branch creation task, which requires the GC lock.
// A timeline GC iteration can be slow because it may need to wait for
// compaction (both require `layer_removal_cs` lock),
// but the GC iteration can run concurrently with branch creation.
//
// See comments in [`Tenant::branch_timeline`] for more information
// about why branch creation task can run concurrently with timeline's GC iteration.
for timeline in gc_timelines {
if task_mgr::is_shutdown_requested() {
// We were requested to shut down. Stop and return with the progress we
// made.
break;
}
// If requested, force flush all in-memory layers to disk first,
// so that they too can be garbage collected. That's
// used in tests, so we want as deterministic results as possible.
if checkpoint_before_gc {
timeline.checkpoint(CheckpointConfig::Forced)?;
info!(
"timeline {} checkpoint_before_gc done",
timeline.timeline_id
);
}
let result = timeline.gc()?;
totals += result;
}
totals.elapsed = now.elapsed();
Ok(totals)
Ok(gc_timelines)
}
/// Branch an existing timeline
@@ -1191,14 +1245,15 @@ impl Tenant {
/// - run initdb to init temporary instance and get bootstrap data
/// - after initialization complete, remove the temp dir.
fn bootstrap_timeline(
async fn bootstrap_timeline(
&self,
timeline_id: TimelineId,
pg_version: u32,
) -> anyhow::Result<Arc<Timeline>> {
let timelines = self.timelines.lock().unwrap();
let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?;
drop(timelines);
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(timeline_id, &timelines)?
};
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
// temporary directory for basebackup files for the given timeline.
let initdb_path = path_with_suffix_extension(
@@ -1248,25 +1303,35 @@ impl Tenant {
let tenant_id = raw_timeline.owning_tenant.tenant_id;
let unfinished_timeline = raw_timeline.raw_timeline()?;
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
pgdata_path,
pgdata_lsn,
)
tokio::task::block_in_place(|| {
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
pgdata_path,
pgdata_lsn,
)
})
.with_context(|| {
format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
})?;
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
// We want to run proper checkpoint before we mark timeline as available to outside world
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
unfinished_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
anyhow::bail!("failpoint before-checkpoint-new-timeline");
});
unfinished_timeline
.checkpoint(CheckpointConfig::Forced)
.checkpoint(CheckpointConfig::Forced).await
.with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
let mut timelines = self.timelines.lock().unwrap();
let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?;
drop(timelines);
let timeline = {
let mut timelines = self.timelines.lock().unwrap();
raw_timeline.initialize_with_lock(&mut timelines, false)?
};
info!(
"created root timeline {} timeline.lsn {}",
@@ -1306,7 +1371,7 @@ impl Tenant {
Ok(UninitializedTimeline {
owning_tenant: self,
timeline_id: new_timeline_id,
raw_timeline: Some((new_timeline, uninit_mark)),
raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
})
}
Err(e) => {
@@ -1425,7 +1490,7 @@ impl Tenant {
let timeline = UninitializedTimeline {
owning_tenant: self,
timeline_id,
raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
};
match timeline.initialize_with_lock(&mut timelines_accessor, true) {
Ok(initialized_timeline) => {
@@ -1446,6 +1511,25 @@ impl Tenant {
Ok(())
}
/// Gathers inputs from all of the timelines to produce a sizing model input.
///
/// Future is cancellation safe. Only one calculation can be running at once per tenant.
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
pub async fn gather_size_inputs(&self) -> anyhow::Result<size::ModelInputs> {
let logical_sizes_at_once = self
.conf
.concurrent_tenant_size_logical_size_queries
.inner();
// TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
// are for testing/experimenting, we tolerate this.
//
// See more for on the issue #2748 condenced out of the initial PR review.
let mut shared_cache = self.cached_logical_sizes.lock().await;
size::gather_inputs(self, logical_sizes_at_once, &mut *shared_cache).await
}
}
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
@@ -1589,6 +1673,7 @@ pub mod harness {
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
trace_read_requests: Some(tenant_conf.trace_read_requests),
}
}
}
@@ -1860,7 +1945,7 @@ mod tests {
Ok(())
}
fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
let mut lsn = start_lsn;
#[allow(non_snake_case)]
{
@@ -1881,7 +1966,7 @@ mod tests {
writer.finish_write(lsn);
lsn += 0x10;
}
tline.checkpoint(CheckpointConfig::Forced)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
{
let writer = tline.writer();
writer.put(
@@ -1898,24 +1983,26 @@ mod tests {
)?;
writer.finish_write(lsn);
}
tline.checkpoint(CheckpointConfig::Forced)
tline.checkpoint(CheckpointConfig::Forced).await
}
#[test]
fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
#[tokio::test]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
let tenant =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
// FIXME: this doesn't actually remove any layer currently, given how the checkpointing
// and compaction works. But it does set the 'cutoff' point so that the cross check
// below should fail.
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
tenant
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
.await?;
// try to branch at lsn 25, should fail because we already garbage collected the data
match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
@@ -1960,14 +2047,14 @@ mod tests {
/*
// FIXME: This currently fails to error out. Calling GC doesn't currently
// remove the old value, we'd need to work a little harder
#[test]
fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
#[tokio::test]
async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
let repo =
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
@@ -1980,43 +2067,47 @@ mod tests {
}
*/
#[test]
fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
#[tokio::test]
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
let tenant =
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
tenant
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
.await?;
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
Ok(())
}
#[test]
fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
#[tokio::test]
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
let tenant =
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
// run gc on parent
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
tenant
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
.await?;
// Check that the data is still accessible on the branch.
assert_eq!(
@@ -2027,8 +2118,8 @@ mod tests {
Ok(())
}
#[test]
fn timeline_load() -> anyhow::Result<()> {
#[tokio::test]
async fn timeline_load() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load";
let harness = TenantHarness::create(TEST_NAME)?;
{
@@ -2036,8 +2127,8 @@ mod tests {
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x8000))?;
tline.checkpoint(CheckpointConfig::Forced)?;
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
tline.checkpoint(CheckpointConfig::Forced).await?;
}
let tenant = harness.load();
@@ -2048,8 +2139,8 @@ mod tests {
Ok(())
}
#[test]
fn timeline_load_with_ancestor() -> anyhow::Result<()> {
#[tokio::test]
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load_with_ancestor";
let harness = TenantHarness::create(TEST_NAME)?;
// create two timelines
@@ -2059,8 +2150,8 @@ mod tests {
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
tline.checkpoint(CheckpointConfig::Forced)?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
@@ -2068,8 +2159,8 @@ mod tests {
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
tline.checkpoint(CheckpointConfig::Forced)?;
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
tline.checkpoint(CheckpointConfig::Forced).await?;
}
// check that both of them are initially unloaded
@@ -2129,8 +2220,8 @@ mod tests {
Ok(())
}
#[test]
fn test_images() -> anyhow::Result<()> {
#[tokio::test]
async fn test_images() -> anyhow::Result<()> {
let tenant = TenantHarness::create("test_images")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2141,7 +2232,7 @@ mod tests {
writer.finish_write(Lsn(0x10));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.compact()?;
let writer = tline.writer();
@@ -2149,7 +2240,7 @@ mod tests {
writer.finish_write(Lsn(0x20));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.compact()?;
let writer = tline.writer();
@@ -2157,7 +2248,7 @@ mod tests {
writer.finish_write(Lsn(0x30));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.compact()?;
let writer = tline.writer();
@@ -2165,7 +2256,7 @@ mod tests {
writer.finish_write(Lsn(0x40));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.compact()?;
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
@@ -2181,8 +2272,8 @@ mod tests {
// Insert 1000 key-value pairs with increasing keys, checkpoint,
// repeat 50 times.
//
#[test]
fn test_bulk_insert() -> anyhow::Result<()> {
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
let tenant = TenantHarness::create("test_bulk_insert")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2215,7 +2306,7 @@ mod tests {
let cutoff = tline.get_last_record_lsn();
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.compact()?;
tline.gc()?;
}
@@ -2223,8 +2314,8 @@ mod tests {
Ok(())
}
#[test]
fn test_random_updates() -> anyhow::Result<()> {
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
let tenant = TenantHarness::create("test_random_updates")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2287,7 +2378,7 @@ mod tests {
println!("checkpointing {}", lsn);
let cutoff = tline.get_last_record_lsn();
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.compact()?;
tline.gc()?;
}
@@ -2295,8 +2386,8 @@ mod tests {
Ok(())
}
#[test]
fn test_traverse_branches() -> anyhow::Result<()> {
#[tokio::test]
async fn test_traverse_branches() -> anyhow::Result<()> {
let tenant = TenantHarness::create("test_traverse_branches")?.load();
let mut tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2368,7 +2459,7 @@ mod tests {
println!("checkpointing {}", lsn);
let cutoff = tline.get_last_record_lsn();
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.compact()?;
tline.gc()?;
}

View File

@@ -74,6 +74,7 @@ where
};
dstbuf.clear();
dstbuf.reserve(len);
// Read the payload
let mut remain = len;

View File

@@ -260,8 +260,9 @@ impl Layer for DeltaLayer {
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let mut cursor = file.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
let buf = cursor.read_blob(pos).with_context(|| {
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
format!(
"Failed to read blob from virtual file {}",
file.file.path.display()
@@ -610,9 +611,9 @@ impl DeltaLayer {
///
/// 3. Call `finish`.
///
pub struct DeltaLayerWriter {
struct DeltaLayerWriterInner {
conf: &'static PageServerConf,
path: PathBuf,
pub path: PathBuf,
timeline_id: TimelineId,
tenant_id: TenantId,
@@ -624,17 +625,17 @@ pub struct DeltaLayerWriter {
blob_writer: WriteBlobWriter<BufWriter<VirtualFile>>,
}
impl DeltaLayerWriter {
impl DeltaLayerWriterInner {
///
/// Start building a new delta layer.
///
pub fn new(
fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> Result<DeltaLayerWriter> {
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename. We don't know
// the end key yet, so we cannot form the final filename yet. We will
// rename it when we're done.
@@ -653,7 +654,7 @@ impl DeltaLayerWriter {
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
Ok(DeltaLayerWriter {
Ok(Self {
conf,
path,
timeline_id,
@@ -670,17 +671,17 @@ impl DeltaLayerWriter {
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
}
pub fn put_value_bytes(
fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> Result<()> {
) -> anyhow::Result<()> {
assert!(self.lsn_range.start <= lsn);
let off = self.blob_writer.write_blob(val)?;
@@ -693,14 +694,14 @@ impl DeltaLayerWriter {
Ok(())
}
pub fn size(&self) -> u64 {
fn size(&self) -> u64 {
self.blob_writer.size() + self.tree.borrow_writer().size()
}
///
/// Finish writing the delta layer.
///
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -768,6 +769,102 @@ impl DeltaLayerWriter {
}
}
/// A builder object for constructing a new delta layer.
///
/// Usage:
///
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_value` for every page
/// version to store in the layer.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to drop before `finish` is actually called. So this
/// could lead to odd temporary files in the directory, exhausting file system.
/// This structure wraps `DeltaLayerWriterInner` and also contains `Drop`
/// implementation that cleans up the temporary file in failure. It's not
/// possible to do this directly in `DeltaLayerWriterInner` since `finish` moves
/// out some fields, making it impossible to implement `Drop`.
///
#[must_use]
pub struct DeltaLayerWriter {
inner: Option<DeltaLayerWriterInner>,
}
impl DeltaLayerWriter {
///
/// Start building a new delta layer.
///
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(DeltaLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_start,
lsn_range,
)?),
})
}
///
/// Append a key-value pair to the file.
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val)
}
pub fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
val: &[u8],
will_init: bool,
) -> anyhow::Result<()> {
self.inner
.as_mut()
.unwrap()
.put_value_bytes(key, lsn, val, will_init)
}
pub fn size(&self) -> u64 {
self.inner.as_ref().unwrap().size()
}
///
/// Finish writing the delta layer.
///
pub fn finish(mut self, key_end: Key) -> anyhow::Result<DeltaLayer> {
self.inner.take().unwrap().finish(key_end)
}
}
impl Drop for DeltaLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
match inner.blob_writer.into_inner().into_inner() {
Ok(vfile) => vfile.remove(),
Err(err) => warn!(
"error while flushing buffer of image layer temporary file: {}",
err
),
}
}
}
}
///
/// Iterator over all key-value pairse stored in a delta layer
///

View File

@@ -411,7 +411,7 @@ impl ImageLayer {
///
/// 3. Call `finish`.
///
pub struct ImageLayerWriter {
struct ImageLayerWriterInner {
conf: &'static PageServerConf,
path: PathBuf,
timeline_id: TimelineId,
@@ -423,14 +423,17 @@ pub struct ImageLayerWriter {
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
impl ImageLayerWriter {
pub fn new(
impl ImageLayerWriterInner {
///
/// Start building a new image layer.
///
fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
) -> anyhow::Result<Self> {
// Create the file initially with a temporary filename.
// We'll atomically rename it to the final name when we're done.
let path = ImageLayer::temp_path_for(
@@ -455,7 +458,7 @@ impl ImageLayerWriter {
let block_buf = BlockBuf::new();
let tree_builder = DiskBtreeBuilder::new(block_buf);
let writer = ImageLayerWriter {
let writer = Self {
conf,
path,
timeline_id,
@@ -474,7 +477,7 @@ impl ImageLayerWriter {
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img)?;
@@ -485,7 +488,10 @@ impl ImageLayerWriter {
Ok(())
}
pub fn finish(self) -> anyhow::Result<ImageLayer> {
///
/// Finish writing the image layer.
///
fn finish(self) -> anyhow::Result<ImageLayer> {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
@@ -552,3 +558,76 @@ impl ImageLayerWriter {
Ok(layer)
}
}
/// A builder object for constructing a new image layer.
///
/// Usage:
///
/// 1. Create the ImageLayerWriter by calling ImageLayerWriter::new(...)
///
/// 2. Write the contents by calling `put_page_image` for every key-value
/// pair in the key range.
///
/// 3. Call `finish`.
///
/// # Note
///
/// As described in https://github.com/neondatabase/neon/issues/2650, it's
/// possible for the writer to drop before `finish` is actually called. So this
/// could lead to odd temporary files in the directory, exhausting file system.
/// This structure wraps `ImageLayerWriterInner` and also contains `Drop`
/// implementation that cleans up the temporary file in failure. It's not
/// possible to do this directly in `ImageLayerWriterInner` since `finish` moves
/// out some fields, making it impossible to implement `Drop`.
///
#[must_use]
pub struct ImageLayerWriter {
inner: Option<ImageLayerWriterInner>,
}
impl ImageLayerWriter {
///
/// Start building a new image layer.
///
pub fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(ImageLayerWriterInner::new(
conf,
timeline_id,
tenant_id,
key_range,
lsn,
)?),
})
}
///
/// Write next value to the file.
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img)
}
///
/// Finish writing the image layer.
///
pub fn finish(mut self) -> anyhow::Result<ImageLayer> {
self.inner.take().unwrap().finish()
}
}
impl Drop for ImageLayerWriter {
fn drop(&mut self) {
if let Some(inner) = self.inner.take() {
inner.blob_writer.into_inner().remove();
}
}
}

View File

@@ -0,0 +1,475 @@
use std::cmp;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use anyhow::Context;
use tokio::sync::Semaphore;
use super::Tenant;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use tracing::*;
/// Inputs to the actual tenant sizing model
///
/// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to
/// be a transferrable format between execution environments and developer.
#[serde_with::serde_as]
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct ModelInputs {
updates: Vec<Update>,
retention_period: u64,
#[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
timeline_inputs: HashMap<TimelineId, TimelineInputs>,
}
/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
/// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
#[serde_with::serde_as]
#[derive(Debug, serde::Serialize, serde::Deserialize)]
struct TimelineInputs {
#[serde_as(as = "serde_with::DisplayFromStr")]
last_record: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
latest_gc_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
horizon_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
pitr_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
next_gc_cutoff: Lsn,
}
/// Gathers the inputs for the tenant sizing model.
///
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
/// is updated on-demand, during the start of this calculation and separate from the
/// [`Timeline::latest_gc_cutoff`].
///
/// For timelines in general:
///
/// ```ignore
/// 0-----|---------|----|------------| · · · · · |·> lsn
/// initdb_lsn branchpoints* next_gc_cutoff latest
/// ```
///
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
/// tenant size will be zero.
pub(super) async fn gather_inputs(
tenant: &Tenant,
limit: &Arc<Semaphore>,
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
) -> anyhow::Result<ModelInputs> {
// with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
// our advantage with `?` error handling.
let mut joinset = tokio::task::JoinSet::new();
let timelines = tenant
.refresh_gc_info()
.context("Failed to refresh gc_info before gathering inputs")?;
if timelines.is_empty() {
// All timelines are below tenant's gc_horizon; alternative would be to use
// Tenant::list_timelines but then those gc_info's would not be updated yet, possibly
// missing GcInfo::retain_lsns or having obsolete values for cutoff's.
return Ok(ModelInputs {
updates: vec![],
retention_period: 0,
timeline_inputs: HashMap::new(),
});
}
// record the used/inserted cache keys here, to remove extras not to start leaking
// after initial run the cache should be quite stable, but live timelines will eventually
// require new lsns to be inspected.
let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
let mut updates = Vec::new();
// record the per timline values used to determine `retention_period`
let mut timeline_inputs = HashMap::with_capacity(timelines.len());
// used to determine the `retention_period` for the size model
let mut max_cutoff_distance = None;
// this will probably conflict with on-demand downloaded layers, or at least force them all
// to be downloaded
for timeline in timelines {
let last_record_lsn = timeline.get_last_record_lsn();
let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
// there's a race between the update (holding tenant.gc_lock) and this read but it
// might not be an issue, because it's not for Timeline::gc
let gc_info = timeline.gc_info.read().unwrap();
// similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
// new gc run, which we have no control over. however differently from `Timeline::gc`
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
// actually removing files.
let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
// the minimum where we should find the next_gc_cutoff for our calculations.
//
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
// want to query any logical size before initdb_lsn.
let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
Some((next_gc_cutoff, LsnKind::GcCutOff))
} else {
None
};
// this assumes there are no other lsns than the branchpoints
let lsns = gc_info
.retain_lsns
.iter()
.inspect(|&&lsn| {
trace!(
timeline_id=%timeline.timeline_id,
"retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
lsn < timeline.get_ancestor_lsn()
)
})
.filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
.copied()
.map(|lsn| (lsn, LsnKind::BranchPoint))
.chain(maybe_cutoff)
.collect::<Vec<_>>();
(
lsns,
gc_info.horizon_cutoff,
gc_info.pitr_cutoff,
next_gc_cutoff,
)
};
// update this to have a retention_period later for the tenant_size_model
// tenant_size_model compares this to the last segments start_lsn
if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
match max_cutoff_distance.as_mut() {
Some(max) => {
*max = std::cmp::max(*max, cutoff_distance);
}
_ => {
max_cutoff_distance = Some(cutoff_distance);
}
}
}
// all timelines branch from something, because it might be impossible to pinpoint
// which is the tenant_size_model's "default" branch.
updates.push(Update {
lsn: timeline.get_ancestor_lsn(),
command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
timeline_id: timeline.timeline_id,
});
for (lsn, _kind) in &interesting_lsns {
if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
updates.push(Update {
lsn: *lsn,
timeline_id: timeline.timeline_id,
command: Command::Update(*size),
});
needed_cache.insert((timeline.timeline_id, *lsn));
} else {
let timeline = Arc::clone(&timeline);
let parallel_size_calcs = Arc::clone(limit);
joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn));
}
}
timeline_inputs.insert(
timeline.timeline_id,
TimelineInputs {
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
horizon_cutoff,
pitr_cutoff,
next_gc_cutoff,
},
);
}
let mut have_any_error = false;
while let Some(res) = joinset.join_next().await {
// each of these come with Result<Result<_, JoinError>, JoinError>
// because of spawn + spawn_blocking
let res = res.and_then(|inner| inner);
match res {
Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size))) => {
debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
logical_size_cache.insert((timeline.timeline_id, lsn), size);
needed_cache.insert((timeline.timeline_id, lsn));
updates.push(Update {
lsn,
timeline_id: timeline.timeline_id,
command: Command::Update(size),
});
}
Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error))) => {
warn!(
timeline_id=%timeline.timeline_id,
"failed to calculate logical size at {lsn}: {error:#}"
);
have_any_error = true;
}
Err(join_error) if join_error.is_cancelled() => {
unreachable!("we are not cancelling any of the futures, nor should be");
}
Err(join_error) => {
// cannot really do anything, as this panic is likely a bug
error!("logical size query panicked: {join_error:#}");
have_any_error = true;
}
}
}
// prune any keys not needed anymore; we record every used key and added key.
logical_size_cache.retain(|key, _| needed_cache.contains(key));
if have_any_error {
// we cannot complete this round, because we are missing data.
// we have however cached all we were able to request calculation on.
anyhow::bail!("failed to calculate some logical_sizes");
}
// the data gathered to updates is per lsn, regardless of the branch, so we can use it to
// our advantage, not requiring a sorted container or graph walk.
//
// for branch points, which come as multiple updates at the same LSN, the Command::Update
// is needed before a branch is made out of that branch Command::BranchFrom. this is
// handled by the variant order in `Command`.
updates.sort_unstable();
let retention_period = match max_cutoff_distance {
Some(max) => max.0,
None => {
anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
}
};
Ok(ModelInputs {
updates,
retention_period,
timeline_inputs,
})
}
impl ModelInputs {
pub fn calculate(&self) -> anyhow::Result<u64> {
// Option<TimelineId> is used for "naming" the branches because it is assumed to be
// impossible to always determine the a one main branch.
let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
// tracking these not to require modifying the current implementation of the size model,
// which works in relative LSNs and sizes.
let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
for update in &self.updates {
let Update {
lsn,
command: op,
timeline_id,
} = update;
match op {
Command::Update(sz) => {
let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
anyhow::anyhow!(
"ordering-mismatch: there must had been a previous state for {timeline_id}"
)
})?;
let lsn_bytes = {
let Lsn(now) = lsn;
let Lsn(prev) = latest.0;
debug_assert!(prev <= *now, "self.updates should had been sorted");
now - prev
};
let size_diff =
i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
format!("size difference i64 overflow for {timeline_id}")
})?;
storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
*latest = (*lsn, *sz);
}
Command::BranchFrom(parent) => {
storage.branch(parent, Some(*timeline_id));
let size = parent
.as_ref()
.and_then(|id| last_state.get(id))
.map(|x| x.1)
.unwrap_or(0);
last_state.insert(*timeline_id, (*lsn, size));
}
}
}
Ok(storage.calculate(self.retention_period).total_children())
}
}
/// Single size model update.
///
/// Sizing model works with relative increments over latest branch state.
/// Updates are absolute, so additional state needs to be tracked when applying.
#[serde_with::serde_as]
#[derive(
Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
)]
struct Update {
#[serde_as(as = "serde_with::DisplayFromStr")]
lsn: utils::lsn::Lsn,
command: Command,
#[serde_as(as = "serde_with::DisplayFromStr")]
timeline_id: TimelineId,
}
#[serde_with::serde_as]
#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "snake_case")]
enum Command {
Update(u64),
BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
}
impl std::fmt::Debug for Command {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
// linebreaks
match self {
Self::Update(arg0) => write!(f, "Update({arg0})"),
Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
}
}
}
#[derive(Debug, Clone, Copy)]
enum LsnKind {
BranchPoint,
GcCutOff,
}
/// Newtype around the tuple that carries the timeline at lsn logical size calculation.
struct TimelineAtLsnSizeResult(
Arc<crate::tenant::Timeline>,
utils::lsn::Lsn,
anyhow::Result<u64>,
);
#[instrument(skip_all, fields(timeline_id=%timeline.timeline_id, lsn=%lsn))]
async fn calculate_logical_size(
limit: Arc<tokio::sync::Semaphore>,
timeline: Arc<crate::tenant::Timeline>,
lsn: utils::lsn::Lsn,
) -> Result<TimelineAtLsnSizeResult, tokio::task::JoinError> {
let permit = tokio::sync::Semaphore::acquire_owned(limit)
.await
.expect("global semaphore should not had been closed");
tokio::task::spawn_blocking(move || {
let _permit = permit;
let size_res = timeline.calculate_logical_size(lsn);
TimelineAtLsnSizeResult(timeline, lsn, size_res)
})
.await
}
#[test]
fn updates_sort() {
use std::str::FromStr;
use utils::id::TimelineId;
use utils::lsn::Lsn;
let ids = [
TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
];
// try through all permutations
let ids = [
[&ids[0], &ids[1], &ids[2]],
[&ids[0], &ids[2], &ids[1]],
[&ids[1], &ids[0], &ids[2]],
[&ids[1], &ids[2], &ids[0]],
[&ids[2], &ids[0], &ids[1]],
[&ids[2], &ids[1], &ids[0]],
];
for ids in ids {
// apply a fixture which uses a permutation of ids
let commands = [
Update {
lsn: Lsn(0),
command: Command::BranchFrom(None),
timeline_id: *ids[0],
},
Update {
lsn: Lsn::from_str("0/67E7618").unwrap(),
command: Command::Update(43696128),
timeline_id: *ids[0],
},
Update {
lsn: Lsn::from_str("0/67E7618").unwrap(),
command: Command::BranchFrom(Some(*ids[0])),
timeline_id: *ids[1],
},
Update {
lsn: Lsn::from_str("0/76BE4F0").unwrap(),
command: Command::Update(41844736),
timeline_id: *ids[1],
},
Update {
lsn: Lsn::from_str("0/10E49380").unwrap(),
command: Command::Update(42164224),
timeline_id: *ids[0],
},
Update {
lsn: Lsn::from_str("0/10E49380").unwrap(),
command: Command::BranchFrom(Some(*ids[0])),
timeline_id: *ids[2],
},
Update {
lsn: Lsn::from_str("0/11D74910").unwrap(),
command: Command::Update(42172416),
timeline_id: *ids[2],
},
Update {
lsn: Lsn::from_str("0/12051E98").unwrap(),
command: Command::Update(42196992),
timeline_id: *ids[0],
},
];
let mut sorted = commands;
// these must sort in the same order, regardless of how the ids sort
// which is why the timeline_id is the last field
sorted.sort_unstable();
assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
}
}
#[test]
fn verify_size_for_multiple_branches() {
// this is generated from integration test test_tenant_size_with_multiple_branches, but this way
// it has the stable lsn's
let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#;
let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
assert_eq!(inputs.calculate().unwrap(), 36_409_872);
}

View File

@@ -16,7 +16,7 @@ use std::fs;
use std::ops::{Deref, Range};
use std::path::PathBuf;
use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::{Duration, Instant, SystemTime};
use crate::tenant::{
@@ -121,8 +121,16 @@ pub struct Timeline {
/// to avoid deadlock.
write_lock: Mutex<()>,
/// Used to ensure that there is only task performing flushing at a time
layer_flush_lock: Mutex<()>,
/// Used to avoid multiple `flush_loop` tasks running
flush_loop_started: Mutex<bool>,
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
/// The value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -272,6 +280,11 @@ impl LogicalSize {
self.size_added_after_initial
.fetch_add(delta, AtomicOrdering::SeqCst);
}
/// Returns the initialized (already calculated) value, if any.
fn initialized_size(&self) -> Option<u64> {
self.initial_logical_size.get().copied()
}
}
pub struct WalReceiverInfo {
@@ -461,15 +474,16 @@ impl Timeline {
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
#[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
match cconf {
CheckpointConfig::Flush => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers(true)
self.flush_frozen_layers_and_wait().await
}
CheckpointConfig::Forced => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers(true)?;
self.flush_frozen_layers_and_wait().await?;
self.compact()
}
}
@@ -619,24 +633,8 @@ impl Timeline {
self.last_freeze_at.store(last_lsn);
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
// Launch a task to flush the frozen layer to disk, unless
// a task was already running. (If the task was running
// at the time that we froze the layer, it must've seen the
// the layer we just froze before it exited; see comments
// in flush_frozen_layers())
if let Ok(guard) = self.layer_flush_lock.try_lock() {
drop(guard);
let self_clone = Arc::clone(self);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
async move { self_clone.flush_frozen_layers(false) },
);
}
// Wake up the layer flusher
self.flush_frozen_layers();
}
}
Ok(())
@@ -727,6 +725,9 @@ impl Timeline {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(TimelineState::Suspended);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let mut result = Timeline {
conf,
tenant_conf,
@@ -754,8 +755,12 @@ impl Timeline {
upload_layers: AtomicBool::new(upload_layers),
flush_loop_started: Mutex::new(false),
layer_flush_start_tx,
layer_flush_done_tx,
write_lock: Mutex::new(()),
layer_flush_lock: Mutex::new(()),
layer_removal_cs: Mutex::new(()),
gc_info: RwLock::new(GcInfo {
@@ -788,6 +793,33 @@ impl Timeline {
result
}
pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
let mut flush_loop_started = self.flush_loop_started.lock().unwrap();
if *flush_loop_started {
info!(
"skipping attempt to start flush_loop twice {}/{}",
self.tenant_id, self.timeline_id
);
return;
}
let layer_flush_start_rx = self.layer_flush_start_tx.subscribe();
let self_clone = Arc::clone(self);
info!("spawning flush loop");
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
async move { self_clone.flush_loop(layer_flush_start_rx).await; Ok(()) }
.instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id))
);
*flush_loop_started = true;
}
pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
if !is_etcd_client_initialized() {
if cfg!(test) {
@@ -979,9 +1011,26 @@ impl Timeline {
/// Calculate the logical size of the database at the latest LSN.
///
/// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
info!("Calculating logical size for timeline {}", self.timeline_id);
let timer = self.metrics.init_logical_size_histo.start_timer();
pub fn calculate_logical_size(&self, up_to_lsn: Lsn) -> anyhow::Result<u64> {
info!(
"Calculating logical size for timeline {} at {}",
self.timeline_id, up_to_lsn
);
let timer = if up_to_lsn == self.initdb_lsn {
if let Some(size) = self.current_logical_size.initialized_size() {
if size != 0 {
// non-zero size means that the size has already been calculated by this method
// after startup. if the logical size is for a new timeline without layers the
// size will be zero, and we cannot use that, or this caching strategy until
// pageserver restart.
return Ok(size);
}
}
self.metrics.init_logical_size_histo.start_timer()
} else {
self.metrics.logical_size_histo.start_timer()
};
let logical_size = self.get_current_logical_size_non_incremental(up_to_lsn)?;
debug!("calculated logical size: {logical_size}");
timer.stop_and_record();
@@ -1267,53 +1316,94 @@ impl Timeline {
drop(layers);
}
/// Flush all frozen layers to disk.
///
/// Only one task at a time can be doing layer-flushing for a
/// given timeline. If 'wait' is true, and another task is
/// currently doing the flushing, this function will wait for it
/// to finish. If 'wait' is false, this function will return
/// immediately instead.
fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
let flush_lock_guard = if wait {
self.layer_flush_lock.lock().unwrap()
} else {
match self.layer_flush_lock.try_lock() {
Ok(guard) => guard,
Err(TryLockError::WouldBlock) => return Ok(()),
Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
}
};
let timer = self.metrics.flush_time_histo.start_timer();
/// Layer flusher task's main loop.
async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>) {
info!("started flush loop");
loop {
let layers = self.layers.read().unwrap();
if let Some(frozen_layer) = layers.frozen_layers.front() {
let frozen_layer = Arc::clone(frozen_layer);
drop(layers); // to allow concurrent reads and writes
self.flush_frozen_layer(frozen_layer)?;
} else {
// Drop the 'layer_flush_lock' *before* 'layers'. That
// way, if you freeze a layer, and then call
// flush_frozen_layers(false), it is guaranteed that
// if another thread was busy flushing layers and the
// call therefore returns immediately, the other
// thread will have seen the newly-frozen layer and
// will flush that too (assuming no errors).
drop(flush_lock_guard);
drop(layers);
break;
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
info!("shutting down layer flush task");
break;
},
_ = layer_flush_start_rx.changed() => {}
}
trace!("waking up");
let timer = self.metrics.flush_time_histo.start_timer();
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
let layer_to_flush = {
let layers = self.layers.read().unwrap();
layers.frozen_layers.front().cloned()
// drop 'layers' lock to allow concurrent reads and writes
};
if let Some(layer_to_flush) = layer_to_flush {
if let Err(err) = self.flush_frozen_layer(layer_to_flush).await {
error!("could not flush frozen layer: {err:?}");
break Err(err);
}
continue;
} else {
break Ok(());
}
};
// Notify any listeners that we're done
let _ = self
.layer_flush_done_tx
.send_replace((flush_counter, result));
timer.stop_and_record();
}
}
async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
// Remember the new value, so that when we listen for the flush
// to finish, we know when the flush that we initiated has
// finished, instead of some other flush that was started earlier.
let mut my_flush_request = 0;
if !&*self.flush_loop_started.lock().unwrap() {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running")
}
timer.stop_and_record();
self.layer_flush_start_tx.send_modify(|counter| {
my_flush_request = *counter + 1;
*counter = my_flush_request;
});
Ok(())
loop {
{
let (last_result_counter, last_result) = &*rx.borrow();
if *last_result_counter >= my_flush_request {
if let Err(_err) = last_result {
// We already logged the original error in
// flush_loop. We cannot propagate it to the caller
// here, because it might not be Cloneable
anyhow::bail!(
"Could not flush frozen layer. Request id: {}",
my_flush_request
);
} else {
return Ok(());
}
}
}
trace!("waiting for flush to complete");
rx.changed().await?;
trace!("done")
}
}
fn flush_frozen_layers(&self) {
self.layer_flush_start_tx.send_modify(|val| *val += 1);
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
#[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))]
async fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
// As a special case, when we have just imported an image into the repository,
// instead of writing out a L0 delta layer, we directly write out image layer
// files instead. This is possible as long as *all* the data imported into the
@@ -1541,6 +1631,10 @@ impl Timeline {
lsn,
)?;
fail_point!("image-layer-writer-fail-before-finish", |_| {
anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
});
for range in &partition.ranges {
let mut key = range.start;
while key < range.end {
@@ -1835,6 +1929,11 @@ impl Timeline {
},
)?);
}
fail_point!("delta-layer-writer-fail-before-finish", |_| {
anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
});
writer.as_mut().unwrap().put_value(key, lsn, value)?;
prev_key = Some(key);
}
@@ -2234,13 +2333,10 @@ impl Timeline {
let last_rec_lsn = data.records.last().unwrap().0;
let img = self.walredo_mgr.request_redo(
key,
request_lsn,
base_img,
data.records,
self.pg_version,
)?;
let img = self
.walredo_mgr
.request_redo(key, request_lsn, base_img, data.records, self.pg_version)
.context("Failed to reconstruct a page image:")?;
if img.len() == page_cache::PAGE_SZ {
let cache = page_cache::get();

View File

@@ -82,6 +82,7 @@ pub struct TenantConf {
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
/// to avoid eager reconnects.
pub max_lsn_wal_lag: NonZeroU64,
pub trace_read_requests: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -105,6 +106,7 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
pub lagging_wal_timeout: Option<Duration>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
impl TenantConfOpt {
@@ -138,6 +140,9 @@ impl TenantConfOpt {
.lagging_wal_timeout
.unwrap_or(global_conf.lagging_wal_timeout),
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
trace_read_requests: self
.trace_read_requests
.unwrap_or(global_conf.trace_read_requests),
}
}
@@ -207,10 +212,10 @@ impl TenantConf {
.expect("cannot parse default walreceiver lagging wal timeout"),
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.expect("cannot parse default max walreceiver Lsn wal lag"),
trace_read_requests: false,
}
}
#[cfg(test)]
pub fn dummy_conf() -> Self {
TenantConf {
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
@@ -232,6 +237,7 @@ impl TenantConf {
.unwrap(),
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.unwrap(),
trace_read_requests: false,
}
}
}

View File

@@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() {
let tenant_id = tenant.tenant_id();
debug!("shutdown tenant {tenant_id}");
if let Err(err) = tenant.checkpoint() {
if let Err(err) = tenant.checkpoint().await {
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
}
}

View File

@@ -72,8 +72,6 @@ async fn compaction_loop(tenant_id: TenantId) {
if let Err(e) = tenant.compaction_iteration() {
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
}
// Sleep
@@ -119,12 +117,10 @@ async fn gc_loop(tenant_id: TenantId) {
let gc_horizon = tenant.get_gc_horizon();
let mut sleep_duration = gc_period;
if gc_horizon > 0 {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
{
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
}
}

36
pageserver/src/trace.rs Normal file
View File

@@ -0,0 +1,36 @@
use bytes::Bytes;
use std::{
fs::{create_dir_all, File},
io::{BufWriter, Write},
path::PathBuf,
};
pub struct Tracer {
writer: BufWriter<File>,
}
impl Drop for Tracer {
fn drop(&mut self) {
self.flush()
}
}
impl Tracer {
pub fn new(path: PathBuf) -> Self {
let parent = path.parent().expect("failed to parse parent path");
create_dir_all(parent).expect("failed to create trace dir");
let file = File::create(path).expect("failed to create trace file");
Tracer {
writer: BufWriter::new(file),
}
}
pub fn trace(&mut self, msg: &Bytes) {
self.writer.write_all(msg).expect("failed to write trace");
}
pub fn flush(&mut self) {
self.writer.flush().expect("failed to flush trace file");
}
}

View File

@@ -319,6 +319,12 @@ impl VirtualFile {
Ok(result)
}
pub fn remove(self) {
let path = self.path.clone();
drop(self);
std::fs::remove_file(path).expect("failed to remove the virtual file");
}
}
impl Drop for VirtualFile {

View File

@@ -155,22 +155,19 @@ impl<E: Clone> TaskHandle<E> {
/// Aborts current task, waiting for it to finish.
pub async fn shutdown(self) {
match self.join_handle {
Some(jh) => {
self.cancellation.send(()).ok();
match jh.await {
Ok(Ok(())) => debug!("Shutdown success"),
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
Err(join_error) => {
if join_error.is_cancelled() {
error!("Shutdown task was cancelled");
} else {
error!("Shutdown task join error: {join_error}")
}
if let Some(jh) = self.join_handle {
self.cancellation.send(()).ok();
match jh.await {
Ok(Ok(())) => debug!("Shutdown success"),
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
Err(join_error) => {
if join_error.is_cancelled() {
error!("Shutdown task was cancelled");
} else {
error!("Shutdown task join error: {join_error}")
}
}
}
None => {}
}
}
}

View File

@@ -93,7 +93,7 @@ pub fn spawn_connection_manager_task(
}
}
.instrument(
info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
),
);
}
@@ -836,15 +836,20 @@ fn wal_stream_connection_string(
listen_pg_addr_str: &str,
) -> anyhow::Result<String> {
let sk_connstr = format!("postgresql://no_user@{listen_pg_addr_str}/no_db");
let me_conf = sk_connstr
.parse::<postgres::config::Config>()
.with_context(|| {
format!("Failed to parse pageserver connection string '{sk_connstr}' as a postgres one")
})?;
let (host, port) = utils::connstring::connection_host_port(&me_conf);
Ok(format!(
"host={host} port={port} options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
))
sk_connstr
.parse()
.context("bad url")
.and_then(|url: url::Url| {
let host = url.host_str().context("host is missing")?;
let port = url.port().unwrap_or(5432); // default PG port
Ok(format!(
"host={host} \
port={port} \
options='-c timeline_id={timeline_id} tenant_id={tenant_id}'"
))
})
.with_context(|| format!("Failed to parse pageserver connection URL '{sk_connstr}'"))
}
#[cfg(test)]
@@ -892,7 +897,7 @@ mod tests {
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("no commit_lsn".to_string()),
safekeeper_connstr: Some("no_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -909,7 +914,7 @@ mod tests {
remote_consistent_lsn: None,
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("no commit_lsn".to_string()),
safekeeper_connstr: Some("no_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -1005,7 +1010,7 @@ mod tests {
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("not advanced Lsn".to_string()),
safekeeper_connstr: Some("not_advanced_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -1023,7 +1028,7 @@ mod tests {
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("not enough advanced Lsn".to_string()),
safekeeper_connstr: Some("not_enough_advanced_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -1093,7 +1098,7 @@ mod tests {
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("smaller commit_lsn".to_string()),
safekeeper_connstr: Some("smaller_commit_lsn".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -1283,7 +1288,7 @@ mod tests {
peer_horizon_lsn: None,
local_start_lsn: None,
safekeeper_connstr: Some("advanced by Lsn safekeeper".to_string()),
safekeeper_connstr: Some("advanced_by_lsn_safekeeper".to_string()),
},
etcd_version: 0,
latest_update: now,
@@ -1307,7 +1312,7 @@ mod tests {
);
assert!(over_threshcurrent_candidate
.wal_source_connstr
.contains("advanced by Lsn safekeeper"));
.contains("advanced_by_lsn_safekeeper"));
Ok(())
}

View File

@@ -31,8 +31,8 @@ use crate::{
walrecord::DecodedWALRecord,
};
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::id::TenantTimelineId;
use utils::{lsn::Lsn, pq_proto::ReplicationFeedback};
use pq_proto::ReplicationFeedback;
use utils::{id::TenantTimelineId, lsn::Lsn};
/// Status of the connection.
#[derive(Debug, Clone)]

View File

@@ -10,7 +10,7 @@
//! process. Then we get the page image back. Communication with the
//! postgres process happens via stdin/stdout
//!
//! See src/backend/tcop/zenith_wal_redo.c for the other side of
//! See pgxn/neon_walredo/walredoproc.c for the other side of
//! this communication.
//!
//! The Postgres process is assumed to be secure against malicious WAL
@@ -22,10 +22,10 @@ use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use nix::poll::*;
use serde::Serialize;
use std::fs;
use std::fs::OpenOptions;
use std::io::prelude::*;
use std::io::{Error, ErrorKind};
use std::ops::{Deref, DerefMut};
use std::os::unix::io::AsRawFd;
use std::os::unix::prelude::CommandExt;
use std::path::PathBuf;
@@ -34,6 +34,7 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
use std::sync::Mutex;
use std::time::Duration;
use std::time::Instant;
use std::{fs, io};
use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
@@ -44,6 +45,7 @@ use crate::metrics::{
};
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::repository::Key;
use crate::task_mgr::BACKGROUND_RUNTIME;
use crate::walrecord::NeonWalRecord;
use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -208,6 +210,16 @@ impl PostgresRedoManager {
}
}
/// Launch process pre-emptively. Should not be needed except for benchmarking.
pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> {
let inner = self.process.get_mut().unwrap();
if inner.is_none() {
let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
*inner = Some(p);
}
Ok(())
}
///
/// Process one request for WAL redo using wal-redo postgres
///
@@ -229,7 +241,7 @@ impl PostgresRedoManager {
// launch the WAL redo process on first use
if process_guard.is_none() {
let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
*process_guard = Some(p);
}
let process = process_guard.as_mut().unwrap();
@@ -579,7 +591,8 @@ impl<C: CommandExt> CloseFileDescriptors for C {
/// Handle to the Postgres WAL redo process
///
struct PostgresRedoProcess {
child: Child,
tenant_id: TenantId,
child: NoLeakChild,
stdin: ChildStdin,
stdout: ChildStdout,
stderr: ChildStderr,
@@ -589,16 +602,17 @@ impl PostgresRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
fn launch(
conf: &PageServerConf,
tenant_id: &TenantId,
tenant_id: TenantId,
pg_version: u32,
) -> Result<PostgresRedoProcess, Error> {
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
// just create one with constant name. That fails if you try to launch more than
// one WAL redo manager concurrently.
let datadir = path_with_suffix_extension(
conf.tenant_path(tenant_id).join("wal-redo-datadir"),
conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
TEMP_FILE_SUFFIX,
);
@@ -644,18 +658,16 @@ impl PostgresRedoProcess {
),
));
} else {
// Limit shared cache for wal-redo-postres
// Limit shared cache for wal-redo-postgres
let mut config = OpenOptions::new()
.append(true)
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
config.write_all(b"shared_buffers=128kB\n")?;
config.write_all(b"fsync=off\n")?;
config.write_all(b"shared_preload_libraries=neon\n")?;
config.write_all(b"neon.wal_redo=on\n")?;
}
// Start postgres itself
let mut child = Command::new(pg_bin_dir_path.join("postgres"))
let child = Command::new(pg_bin_dir_path.join("postgres"))
.arg("--wal-redo")
.stdin(Stdio::piped())
.stderr(Stdio::piped())
@@ -664,20 +676,17 @@ impl PostgresRedoProcess {
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
.env("PGDATA", &datadir)
// The redo process is not trusted, so it runs in seccomp mode
// (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't
// inherit any file descriptors from the pageserver that would allow
// an attacker to do bad things.
// The redo process is not trusted, and runs in seccomp mode that
// doesn't allow it to open any files. We have to also make sure it
// doesn't inherit any file descriptors from the pageserver, that
// would allow an attacker to read any files that happen to be open
// in the pageserver.
//
// The Rust standard library makes sure to mark any file descriptors with
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
//
// One example is the pidfile of the daemonize library, which doesn't
// currently mark file descriptors as close-on-exec. Either way, we
// want to be on the safe side and prevent accidental regression.
.close_fds()
.spawn()
.spawn_no_leak_child()
.map_err(|e| {
Error::new(
e.kind(),
@@ -685,20 +694,33 @@ impl PostgresRedoProcess {
)
})?;
info!(
"launched WAL redo postgres process on {}",
datadir.display()
);
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait();
});
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
set_nonblock(stdin.as_raw_fd())?;
set_nonblock(stdout.as_raw_fd())?;
set_nonblock(stderr.as_raw_fd())?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
set_nonblock_or_log_err!(stderr)?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
Ok(PostgresRedoProcess {
tenant_id,
child,
stdin,
stdout,
@@ -706,18 +728,16 @@ impl PostgresRedoProcess {
})
}
fn kill(mut self) {
let _ = self.child.kill();
if let Ok(exit_status) = self.child.wait() {
error!("wal-redo-postgres exited with code {}", exit_status);
}
drop(self);
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
fn kill(self) {
self.child.kill_and_wait();
}
//
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
fn apply_wal_records(
&mut self,
tag: BufferTag,
@@ -730,7 +750,11 @@ impl PostgresRedoProcess {
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
let mut writebuf: Vec<u8> = Vec::new();
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// by some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
build_begin_redo_for_block_msg(tag, &mut writebuf);
if let Some(img) = base_img {
build_push_page_msg(tag, &img, &mut writebuf);
@@ -843,8 +867,101 @@ impl PostgresRedoProcess {
}
}
/// Wrapper type around `std::process::Child` which guarantees that the child
/// will be killed and waited-for by this process before being dropped.
struct NoLeakChild {
child: Option<Child>,
}
impl Deref for NoLeakChild {
type Target = Child;
fn deref(&self) -> &Self::Target {
self.child.as_ref().expect("must not use from drop")
}
}
impl DerefMut for NoLeakChild {
fn deref_mut(&mut self) -> &mut Self::Target {
self.child.as_mut().expect("must not use from drop")
}
}
impl NoLeakChild {
fn spawn(command: &mut Command) -> io::Result<Self> {
let child = command.spawn()?;
Ok(NoLeakChild { child: Some(child) })
}
fn kill_and_wait(mut self) {
let child = match self.child.take() {
Some(child) => child,
None => return,
};
Self::kill_and_wait_impl(child);
}
#[instrument(skip_all, fields(pid=child.id()))]
fn kill_and_wait_impl(mut child: Child) {
let res = child.kill();
if let Err(e) = res {
// This branch is very unlikely because:
// - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
// - This is the only place that calls .kill()
// - We consume `self`, so, .kill() can't be called twice.
// - If the process exited by itself or was killed by someone else,
// .kill() will still succeed because we haven't wait()'ed yet.
//
// So, if we arrive here, we have really no idea what happened,
// whether the PID stored in self.child is still valid, etc.
// If this function were fallible, we'd return an error, but
// since it isn't, all we can do is log an error and proceed
// with the wait().
error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
}
match child.wait() {
Ok(exit_status) => {
// log at error level since .kill() is something we only do on errors ATM
error!(exit_status = %exit_status, "wait successful");
}
Err(e) => {
error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
}
}
}
}
impl Drop for NoLeakChild {
fn drop(&mut self) {
let child = match self.child.take() {
Some(child) => child,
None => return,
};
// Offload the kill+wait of the child process into the background.
// If someone stops the runtime, we'll leak the child process.
// We can ignore that case because we only stop the runtime on pageserver exit.
BACKGROUND_RUNTIME.spawn(async move {
tokio::task::spawn_blocking(move || {
Self::kill_and_wait_impl(child);
})
.await
});
}
}
trait NoLeakChildCommandExt {
fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild>;
}
impl NoLeakChildCommandExt for Command {
fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild> {
NoLeakChild::spawn(self)
}
}
// Functions for constructing messages to send to the postgres WAL redo
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
// process. See pgxn/neon_walredo/walredoproc.c for
// explanation of the protocol.
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {

View File

@@ -4,7 +4,6 @@
MODULE_big = neon
OBJS = \
$(WIN32RES) \
inmem_smgr.o \
libpagestore.o \
libpqwalproposer.o \
pagestore_smgr.o \

View File

@@ -40,8 +40,22 @@
bool connected = false;
PGconn *pageserver_conn = NULL;
/*
* WaitEventSet containing:
* - WL_SOCKET_READABLE on pageserver_conn,
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *pageserver_conn_wes = NULL;
char *page_server_connstring_raw;
int n_unflushed_requests = 0;
int flush_every_n_requests = 8;
int readahead_buffer_size = 128;
static void pageserver_flush(void);
static void
pageserver_connect()
{
@@ -58,6 +72,7 @@ pageserver_connect()
PQfinish(pageserver_conn);
pageserver_conn = NULL;
ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "could not establish connection to pageserver"),
@@ -73,22 +88,26 @@ pageserver_connect()
neon_log(ERROR, "could not send pagestream command to pageserver");
}
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
while (PQisBusy(pageserver_conn))
{
int wc;
WaitEvent event;
/* Sleep until there's something to do */
wc = WaitLatchOrSocket(MyLatch,
WL_LATCH_SET | WL_SOCKET_READABLE |
WL_EXIT_ON_PM_DEATH,
PQsocket(pageserver_conn),
-1L, PG_WAIT_EXTENSION);
wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (wc & WL_SOCKET_READABLE)
if (event.events & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(pageserver_conn))
{
@@ -96,6 +115,7 @@ pageserver_connect()
PQfinish(pageserver_conn);
pageserver_conn = NULL;
FreeWaitEventSet(pageserver_conn_wes);
neon_log(ERROR, "could not complete handshake with pageserver: %s",
msg);
@@ -112,33 +132,30 @@ pageserver_connect()
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
*/
static int
call_PQgetCopyData(PGconn *conn, char **buffer)
call_PQgetCopyData(char **buffer)
{
int ret;
retry:
ret = PQgetCopyData(conn, buffer, 1 /* async */ );
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
if (ret == 0)
{
int wc;
WaitEvent event;
/* Sleep until there's something to do */
wc = WaitLatchOrSocket(MyLatch,
WL_LATCH_SET | WL_SOCKET_READABLE |
WL_EXIT_ON_PM_DEATH,
PQsocket(conn),
-1L, PG_WAIT_EXTENSION);
wc = WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (wc & WL_SOCKET_READABLE)
if (event.events & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(conn))
if (!PQconsumeInput(pageserver_conn))
neon_log(ERROR, "could not get response from pageserver: %s",
PQerrorMessage(conn));
PQerrorMessage(pageserver_conn));
}
goto retry;
@@ -164,7 +181,11 @@ pageserver_disconnect(void)
PQfinish(pageserver_conn);
pageserver_conn = NULL;
connected = false;
prefetch_on_ps_disconnect();
}
if (pageserver_conn_wes != NULL)
FreeWaitEventSet(pageserver_conn_wes);
}
static void
@@ -174,11 +195,7 @@ pageserver_send(NeonRequest * request)
/* If the connection was lost for some reason, reconnect */
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
{
PQfinish(pageserver_conn);
pageserver_conn = NULL;
connected = false;
}
pageserver_disconnect();
if (!connected)
pageserver_connect();
@@ -202,6 +219,11 @@ pageserver_send(NeonRequest * request)
}
pfree(req_buff.data);
n_unflushed_requests++;
if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests)
pageserver_flush();
if (message_level_is_interesting(PageStoreTrace))
{
char *msg = nm_to_string((NeonMessage *) request);
@@ -220,7 +242,7 @@ pageserver_receive(void)
PG_TRY();
{
/* read response */
resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
resp_buff.len = call_PQgetCopyData(&resp_buff.data);
resp_buff.cursor = 0;
if (resp_buff.len < 0)
@@ -255,25 +277,21 @@ pageserver_receive(void)
static void
pageserver_flush(void)
{
if (PQflush(pageserver_conn))
if (!connected)
{
neon_log(WARNING, "Tried to flush while disconnected");
}
else if (PQflush(pageserver_conn))
{
char *msg = PQerrorMessage(pageserver_conn);
pageserver_disconnect();
neon_log(ERROR, "failed to flush page requests: %s", msg);
}
}
static NeonResponse *
pageserver_call(NeonRequest * request)
{
pageserver_send(request);
pageserver_flush();
return pageserver_receive();
n_unflushed_requests = 0;
}
page_server_api api = {
.request = pageserver_call,
.send = pageserver_send,
.flush = pageserver_flush,
.receive = pageserver_receive
@@ -419,15 +437,6 @@ pg_init_libpagestore(void)
0, /* no flags required */
check_neon_id, NULL, NULL);
DefineCustomBoolVariable("neon.wal_redo",
"start in wal-redo mode",
NULL,
&wal_redo,
false,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.max_cluster_size",
"cluster size limit",
NULL,
@@ -436,6 +445,27 @@ pg_init_libpagestore(void)
PGC_SIGHUP,
GUC_UNIT_MB,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.flush_output_after",
"Flush the output buffer after every N unflushed requests",
NULL,
&flush_every_n_requests,
8, -1, INT_MAX,
PGC_USERSET,
0, /* no flags required */
NULL, NULL, NULL);
DefineCustomIntVariable("neon.readahead_buffer_size",
"number of prefetches to buffer",
"This buffer is used to store prefetched data; so "
"it is important that this buffer is at least as "
"large as the configured value of all tablespaces' "
"effective_io_concurrency and maintenance_io_concurrency, "
"your sessions' values of these, and the value for "
"seqscan_prefetch_buffers.",
&readahead_buffer_size,
128, 16, 1024,
PGC_USERSET,
0, /* no flags required */
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
relsize_hash_init();
@@ -452,13 +482,7 @@ pg_init_libpagestore(void)
neon_timeline_walproposer = neon_timeline;
neon_tenant_walproposer = neon_tenant;
if (wal_redo)
{
neon_log(PageStoreTrace, "set inmem_smgr hook");
smgr_hook = smgr_inmem;
smgr_init_hook = smgr_init_inmem;
}
else if (page_server_connstring && page_server_connstring[0])
if (page_server_connstring && page_server_connstring[0])
{
neon_log(PageStoreTrace, "set neon_smgr hook");
smgr_hook = smgr_neon;

View File

@@ -115,6 +115,8 @@ typedef struct
char page[FLEXIBLE_ARRAY_MEMBER];
} NeonGetPageResponse;
#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))
typedef struct
{
NeonMessageTag tag;
@@ -138,15 +140,20 @@ extern char *nm_to_string(NeonMessage * msg);
typedef struct
{
NeonResponse *(*request) (NeonRequest * request);
void (*send) (NeonRequest * request);
NeonResponse *(*receive) (void);
void (*flush) (void);
} page_server_api;
extern void prefetch_on_ps_disconnect(void);
extern page_server_api * page_server;
extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern bool seqscan_prefetch_enabled;
extern int seqscan_prefetch_distance;
extern char *neon_timeline;
extern char *neon_tenant;
extern bool wal_redo;
@@ -154,10 +161,7 @@ extern int32 max_cluster_size;
extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
extern void smgr_init_neon(void);
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
extern void smgr_init_inmem(void);
extern void smgr_shutdown_inmem(void);
extern void readahead_buffer_resize(int newsize, void *extra);
/* Neon storage manager functionality */
@@ -171,7 +175,6 @@ extern void neon_extend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void neon_reset_prefetch(SMgrRelation reln);
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
@@ -188,29 +191,6 @@ extern void neon_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void neon_immedsync(SMgrRelation reln, ForkNumber forknum);
/* neon wal-redo storage manager functionality */
extern void inmem_init(void);
extern void inmem_open(SMgrRelation reln);
extern void inmem_close(SMgrRelation reln, ForkNumber forknum);
extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void inmem_extend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern void inmem_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
/* utils for neon relsize cache */
extern void relsize_hash_init(void);
extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size);

View File

@@ -49,22 +49,20 @@
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "access/xlog_internal.h"
#include "catalog/pg_class.h"
#include "pagestore_client.h"
#include "pagestore_client.h"
#include "storage/smgr.h"
#include "access/xlogdefs.h"
#include "catalog/pg_class.h"
#include "common/hashfn.h"
#include "pagestore_client.h"
#include "postmaster/interrupt.h"
#include "postmaster/autovacuum.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/relfilenode.h"
#include "storage/buf_internals.h"
#include "storage/smgr.h"
#include "storage/md.h"
#include "fmgr.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "catalog/pg_tablespace_d.h"
#include "postmaster/autovacuum.h"
#if PG_VERSION_NUM >= 150000
#include "access/xlogutils.h"
@@ -99,7 +97,6 @@ char *page_server_connstring;
/*with substituted password*/
char *neon_timeline;
char *neon_tenant;
bool wal_redo = false;
int32 max_cluster_size;
/* unlogged relation build states */
@@ -114,48 +111,646 @@ typedef enum
static SMgrRelation unlogged_build_rel = NULL;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
/*
* Prefetch implementation:
*
* Prefetch is performed locally by each backend.
* There can be up to MAX_PREFETCH_REQUESTS registered using smgr_prefetch
* before smgr_read. All this requests are appended to primary smgr_read request.
* It is assumed that pages will be requested in prefetch order.
* Reading of prefetch responses is delayed until them are actually needed (smgr_read).
* It make it possible to parallelize processing and receiving of prefetched pages.
* In case of prefetch miss or any other SMGR request other than smgr_read,
* all prefetch responses has to be consumed.
*
* There can be up to readahead_buffer_size active IO requests registered at
* any time. Requests using smgr_prefetch are sent to the pageserver, but we
* don't wait on the response. Requests using smgr_read are either read from
* the buffer, or (if that's not possible) we wait on the response to arrive -
* this also will allow us to receive other prefetched pages.
* Each request is immediately written to the output buffer of the pageserver
* connection, but may not be flushed if smgr_prefetch is used: pageserver
* flushes sent requests on manual flush, or every neon.flush_output_after
* unflushed requests; which is not necessarily always and all the time.
*
* Once we have received a response, this value will be stored in the response
* buffer, indexed in a hash table. This allows us to retain our buffered
* prefetch responses even when we have cache misses.
*
* Reading of prefetch responses is delayed until them are actually needed
* (smgr_read). In case of prefetch miss or any other SMGR request other than
* smgr_read, all prefetch responses in the pipeline will need to be read from
* the connection; the responses are stored for later use.
*
* NOTE: The current implementation of the prefetch system implements a ring
* buffer of up to readahead_buffer_size requests. If there are more _read and
* _prefetch requests between the initial _prefetch and the _read of a buffer,
* the prefetch request will have been dropped from this prefetch buffer, and
* your prefetch was wasted.
*/
#define MAX_PREFETCH_REQUESTS 128
/*
* State machine:
*
* not in hash : in hash
* :
* UNUSED ------> REQUESTED --> RECEIVED
* ^ : | |
* | : v |
* | : TAG_UNUSED |
* | : | |
* +----------------+------------+
* :
*/
typedef enum PrefetchStatus {
PRFS_UNUSED = 0, /* unused slot */
PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not
* necessarily flushed.
* all fields except response valid */
PRFS_RECEIVED, /* all fields valid */
PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */
} PrefetchStatus;
BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS];
BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS];
int n_prefetch_requests;
int n_prefetch_responses;
int n_prefetched_buffers;
int n_prefetch_hits;
int n_prefetch_misses;
XLogRecPtr prefetch_lsn;
typedef struct PrefetchRequest {
BufferTag buftag; /* must be first entry in the struct */
XLogRecPtr effective_request_lsn;
NeonResponse *response; /* may be null */
PrefetchStatus status;
uint64 my_ring_index;
} PrefetchRequest;
/* prefetch buffer lookup hash table */
typedef struct PrfHashEntry {
PrefetchRequest *slot;
uint32 status;
uint32 hash;
} PrfHashEntry;
#define SH_PREFIX prfh
#define SH_ELEMENT_TYPE PrfHashEntry
#define SH_KEY_TYPE PrefetchRequest *
#define SH_KEY slot
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->hash)
#define SH_HASH_KEY(tb, key) hash_bytes( \
((const unsigned char *) &(key)->buftag), \
sizeof(BufferTag) \
)
#define SH_EQUAL(tb, a, b) (BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
/*
* PrefetchState maintains the state of (prefetch) getPage@LSN requests.
* It maintains a (ring) buffer of in-flight requests and responses.
*
* We maintain several indexes into the ring buffer:
* ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
*
* ring_unused points to the first unused slot of the buffer
* ring_receive is the next request that is to be received
* ring_last is the oldest received entry in the buffer
*
* Apart from being an entry in the ring buffer of prefetch requests, each
* PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
*/
typedef struct PrefetchState {
MemoryContext bufctx; /* context for prf_buffer[].response allocations */
MemoryContext errctx; /* context for prf_buffer[].response allocations */
MemoryContext hashctx; /* context for prf_buffer */
/* buffer indexes */
uint64 ring_unused; /* first unused slot */
uint64 ring_flush; /* next request to flush */
uint64 ring_receive; /* next slot that is to receive a response */
uint64 ring_last; /* min slot with a response value */
/* metrics / statistics */
int n_responses_buffered; /* count of PS responses not yet in buffers */
int n_requests_inflight; /* count of PS requests considered in flight */
int n_unused; /* count of buffers < unused, > last, that are also unused */
/* the buffers */
prfh_hash *prf_hash;
PrefetchRequest prf_buffer[]; /* prefetch buffers */
} PrefetchState;
PrefetchState *MyPState;
#define GetPrfSlot(ring_index) ( \
( \
AssertMacro((ring_index) < MyPState->ring_unused && \
(ring_index) >= MyPState->ring_last), \
&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
) \
)
int n_prefetch_hits = 0;
int n_prefetch_misses = 0;
int n_prefetch_missed_caches = 0;
int n_prefetch_dupes = 0;
XLogRecPtr prefetch_lsn = 0;
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
static void prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
static void prefetch_wait_for(uint64 ring_index);
static void prefetch_cleanup(void);
static inline void prefetch_set_unused(uint64 ring_index);
static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
ForkNumber forknum, BlockNumber blkno);
void
readahead_buffer_resize(int newsize, void *extra)
{
uint64 end,
nfree = newsize;
PrefetchState *newPState;
Size newprfs_size = offsetof(PrefetchState, prf_buffer) + (
sizeof(PrefetchRequest) * readahead_buffer_size
);
/* don't try to re-initialize if we haven't initialized yet */
if (MyPState == NULL)
return;
/*
* Make sure that we don't lose track of active prefetch requests by
* ensuring we have received all but the last n requests (n = newsize).
*/
if (MyPState->n_requests_inflight > newsize)
prefetch_wait_for(MyPState->ring_unused - newsize);
/* construct the new PrefetchState, and copy over the memory contexts */
newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
newPState->bufctx = MyPState->bufctx;
newPState->errctx = MyPState->errctx;
newPState->hashctx = MyPState->hashctx;
newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
newPState->n_unused = newsize;
newPState->n_requests_inflight = 0;
newPState->n_responses_buffered = 0;
newPState->ring_last = newsize;
newPState->ring_unused = newsize;
newPState->ring_receive = newsize;
newPState->ring_flush = newsize;
/*
* Copy over the prefetches.
*
* We populate the prefetch array from the end; to retain the most recent
* prefetches, but this has the benefit of only needing to do one iteration
* on the dataset, and trivial compaction.
*/
for (end = MyPState->ring_unused - 1;
end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
end -= 1)
{
PrefetchRequest *slot = GetPrfSlot(end);
PrefetchRequest *newslot;
bool found;
if (slot->status == PRFS_UNUSED)
continue;
nfree -= 1;
newslot = &newPState->prf_buffer[nfree];
*newslot = *slot;
newslot->my_ring_index = nfree;
prfh_insert(newPState->prf_hash, newslot, &found);
Assert(!found);
switch (newslot->status)
{
case PRFS_UNUSED:
pg_unreachable();
case PRFS_REQUESTED:
newPState->n_requests_inflight += 1;
newPState->ring_receive -= 1;
newPState->ring_last -= 1;
break;
case PRFS_RECEIVED:
newPState->n_responses_buffered += 1;
newPState->ring_last -= 1;
break;
case PRFS_TAG_REMAINS:
newPState->ring_last -= 1;
break;
}
newPState->n_unused -= 1;
}
for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
{
prefetch_set_unused(end);
}
prfh_destroy(MyPState->prf_hash);
pfree(MyPState);
MyPState = newPState;
}
/*
* Make sure that there are no responses still in the buffer.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static void
consume_prefetch_responses(void)
{
for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++)
{
NeonResponse *resp = page_server->receive();
if (MyPState->ring_receive < MyPState->ring_unused)
prefetch_wait_for(MyPState->ring_unused - 1);
}
pfree(resp);
static void
prefetch_cleanup(void)
{
uint64 ring_index;
PrefetchRequest *slot;
while (MyPState->ring_last < MyPState->ring_receive) {
ring_index = MyPState->ring_last;
slot = GetPrfSlot(ring_index);
if (slot->status == PRFS_UNUSED)
MyPState->ring_last += 1;
else
break;
}
n_prefetched_buffers = 0;
n_prefetch_responses = 0;
}
/*
* Wait for slot of ring_index to have received its response.
* The caller is responsible for making sure the request buffer is flushed.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static void
prefetch_wait_for(uint64 ring_index)
{
PrefetchRequest *entry;
if (MyPState->ring_flush <= ring_index &&
MyPState->ring_unused > MyPState->ring_flush)
{
page_server->flush();
MyPState->ring_flush = MyPState->ring_unused;
}
Assert(MyPState->ring_unused > ring_index);
while (MyPState->ring_receive <= ring_index)
{
entry = GetPrfSlot(MyPState->ring_receive);
Assert(entry->status == PRFS_REQUESTED);
prefetch_read(entry);
}
}
/*
* Read the response of a prefetch request into its slot.
*
* The caller is responsible for making sure that the request for this buffer
* was flushed to the PageServer.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static void
prefetch_read(PrefetchRequest *slot)
{
NeonResponse *response;
MemoryContext old;
Assert(slot->status == PRFS_REQUESTED);
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_receive);
old = MemoryContextSwitchTo(MyPState->errctx);
response = (NeonResponse *) page_server->receive();
MemoryContextSwitchTo(old);
/* update prefetch state */
MyPState->n_responses_buffered += 1;
MyPState->n_requests_inflight -= 1;
MyPState->ring_receive += 1;
/* update slot state */
slot->status = PRFS_RECEIVED;
slot->response = response;
}
/*
* Disconnect hook - drop prefetches when the connection drops
*
* If we don't remove the failed prefetches, we'd be serving incorrect
* data to the smgr.
*/
void
prefetch_on_ps_disconnect(void)
{
MyPState->ring_flush = MyPState->ring_unused;
while (MyPState->ring_receive < MyPState->ring_unused)
{
PrefetchRequest *slot;
uint64 ring_index = MyPState->ring_receive;
slot = GetPrfSlot(ring_index);
Assert(slot->status == PRFS_REQUESTED);
Assert(slot->my_ring_index == ring_index);
/* clean up the request */
slot->status = PRFS_TAG_REMAINS;
MyPState->n_requests_inflight -= 1;
MyPState->ring_receive += 1;
prefetch_set_unused(ring_index);
}
}
/*
* prefetch_set_unused() - clear a received prefetch slot
*
* The slot at ring_index must be a current member of the ring buffer,
* and may not be in the PRFS_REQUESTED state.
*
* NOTE: this function will update MyPState->pfs_hash; which invalidates any
* active pointers into the hash table.
*/
static inline void
prefetch_set_unused(uint64 ring_index)
{
PrefetchRequest *slot = GetPrfSlot(ring_index);
if (ring_index < MyPState->ring_last)
return; /* Should already be unused */
Assert(MyPState->ring_unused > ring_index);
if (slot->status == PRFS_UNUSED)
return;
Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
if (slot->status == PRFS_RECEIVED)
{
pfree(slot->response);
slot->response = NULL;
MyPState->n_responses_buffered -= 1;
MyPState->n_unused += 1;
}
else
{
Assert(slot->response == NULL);
}
prfh_delete(MyPState->prf_hash, slot);
/* clear all fields */
MemSet(slot, 0, sizeof(PrefetchRequest));
slot->status = PRFS_UNUSED;
/* run cleanup if we're holding back ring_last */
if (MyPState->ring_last == ring_index)
prefetch_cleanup();
}
static void
prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
{
bool found;
NeonGetPageRequest request = {
.req.tag = T_NeonGetPageRequest,
.req.latest = false,
.req.lsn = 0,
.rnode = slot->buftag.rnode,
.forknum = slot->buftag.forkNum,
.blkno = slot->buftag.blockNum,
};
if (force_lsn && force_latest)
{
request.req.lsn = *force_lsn;
request.req.latest = *force_latest;
slot->effective_request_lsn = *force_lsn;
}
else
{
XLogRecPtr lsn = neon_get_request_lsn(
&request.req.latest,
slot->buftag.rnode,
slot->buftag.forkNum,
slot->buftag.blockNum
);
/*
* Note: effective_request_lsn is potentially higher than the requested
* LSN, but still correct:
*
* We know there are no changes between the actual requested LSN and
* the value of effective_request_lsn: If there were, the page would
* have been in cache and evicted between those LSN values, which
* then would have had to result in a larger request LSN for this page.
*
* It is possible that a concurrent backend loads the page, modifies
* it and then evicts it again, but the LSN of that eviction cannot be
* smaller than the current WAL insert/redo pointer, which is already
* larger than this prefetch_lsn. So in any case, that would
* invalidate this cache.
*
* The best LSN to use for effective_request_lsn would be
* XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
*/
request.req.lsn = lsn;
prefetch_lsn = Max(prefetch_lsn, lsn);
slot->effective_request_lsn = prefetch_lsn;
}
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
page_server->send((NeonRequest *) &request);
/* update prefetch state */
MyPState->n_requests_inflight += 1;
MyPState->n_unused -= 1;
MyPState->ring_unused += 1;
/* update slot state */
slot->status = PRFS_REQUESTED;
prfh_insert(MyPState->prf_hash, slot, &found);
Assert(!found);
}
/*
* prefetch_register_buffer() - register and prefetch buffer
*
* Register that we may want the contents of BufferTag in the near future.
*
* If force_latest and force_lsn are not NULL, those values are sent to the
* pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
* to fill in these values manually.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static uint64
prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
{
uint64 ring_index;
PrefetchRequest req;
PrefetchRequest *slot;
PrfHashEntry *entry;
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
req.buftag = tag;
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
if (entry != NULL)
{
slot = entry->slot;
ring_index = slot->my_ring_index;
Assert(slot == GetPrfSlot(ring_index));
Assert(slot->status != PRFS_UNUSED);
Assert(MyPState->ring_last <= ring_index &&
ring_index < MyPState->ring_unused);
Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
/*
* If we want a specific lsn, we do not accept requests that were made
* with a potentially different LSN.
*/
if (force_latest && force_lsn)
{
/* if we want the latest version, any effective_request_lsn < request lsn is OK */
if (*force_latest)
{
if (*force_lsn > slot->effective_request_lsn)
{
prefetch_wait_for(ring_index);
prefetch_set_unused(ring_index);
entry = NULL;
}
}
/* if we don't want the latest version, only accept requests with the exact same LSN */
else
{
if (*force_lsn != slot->effective_request_lsn)
{
prefetch_wait_for(ring_index);
prefetch_set_unused(ring_index);
entry = NULL;
}
}
}
/*
* We received a prefetch for a page that was recently read and
* removed from the buffers. Remove that request from the buffers.
*/
else if (slot->status == PRFS_TAG_REMAINS)
{
prefetch_set_unused(ring_index);
entry = NULL;
}
else
{
/* The buffered request is good enough, return that index */
n_prefetch_dupes++;
return ring_index;
}
}
/*
* If the prefetch queue is full, we need to make room by clearing the
* oldest slot. If the oldest slot holds a buffer that was already
* received, we can just throw it away; we fetched the page unnecessarily
* in that case. If the oldest slot holds a request that we haven't
* received a response for yet, we have to wait for the response to that
* before we can continue. We might not have even flushed the request to
* the pageserver yet, it might be just sitting in the output buffer. In
* that case, we flush it and wait for the response. (We could decide not
* to send it, but it's hard to abort when the request is already in the
* output buffer, and 'not sending' a prefetch request kind of goes
* against the principles of prefetching)
*/
if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
{
uint64 cleanup_index = MyPState->ring_last;
slot = GetPrfSlot(cleanup_index);
Assert(slot->status != PRFS_UNUSED);
/* We have the slot for ring_last, so that must still be in progress */
switch (slot->status)
{
case PRFS_REQUESTED:
Assert(MyPState->ring_receive == cleanup_index);
prefetch_wait_for(cleanup_index);
prefetch_set_unused(cleanup_index);
break;
case PRFS_RECEIVED:
case PRFS_TAG_REMAINS:
prefetch_set_unused(cleanup_index);
break;
default:
pg_unreachable();
}
}
/*
* The next buffer pointed to by `ring_unused` is now definitely empty,
* so we can insert the new request to it.
*/
ring_index = MyPState->ring_unused;
slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
Assert(MyPState->ring_last <= ring_index);
Assert(slot->status == PRFS_UNUSED);
/*
* We must update the slot data before insertion, because the hash
* function reads the buffer tag from the slot.
*/
slot->buftag = tag;
slot->my_ring_index = ring_index;
prefetch_do_request(slot, force_latest, force_lsn);
Assert(slot->status == PRFS_REQUESTED);
Assert(MyPState->ring_last <= ring_index &&
ring_index < MyPState->ring_unused);
if (flush_every_n_requests > 0 &&
MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
{
page_server->flush();
MyPState->ring_flush = MyPState->ring_unused;
}
return ring_index;
}
static NeonResponse *
page_server_request(void const *req)
{
page_server->send((NeonRequest *) req);
page_server->flush();
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
return page_server->request((NeonRequest *) req);
return page_server->receive();
}
@@ -269,12 +864,15 @@ nm_unpack_response(StringInfo s)
case T_NeonGetPageResponse:
{
NeonGetPageResponse *msg_resp = palloc0(offsetof(NeonGetPageResponse, page) + BLCKSZ);
NeonGetPageResponse *msg_resp;
msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
msg_resp->tag = tag;
/* XXX: should be varlena */
memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
pq_getmsgend(s);
Assert(msg_resp->tag == T_NeonGetPageResponse);
resp = (NeonResponse *) msg_resp;
break;
@@ -618,7 +1216,33 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
void
neon_init(void)
{
/* noop */
Size prfs_size;
if (MyPState != NULL)
return;
prfs_size = offsetof(PrefetchState, prf_buffer) + (
sizeof(PrefetchRequest) * readahead_buffer_size
);
MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
MyPState->n_unused = readahead_buffer_size;
MyPState->bufctx = SlabContextCreate(TopMemoryContext,
"NeonSMGR/prefetch",
SLAB_DEFAULT_BLOCK_SIZE * 17,
PS_GETPAGERESPONSE_SIZE);
MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/errors",
ALLOCSET_DEFAULT_SIZES);
MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/prefetch",
ALLOCSET_DEFAULT_SIZES);
MyPState->prf_hash = prfh_create(MyPState->hashctx,
readahead_buffer_size, NULL);
#ifdef DEBUG_COMPARE_LOCAL
mdinit();
#endif
@@ -1005,27 +1629,17 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
}
/*
* neon_reset_prefetch() -- reoe all previously rgistered prefeth requests
*/
void
neon_reset_prefetch(SMgrRelation reln)
{
n_prefetch_requests = 0;
}
/*
* neon_prefetch() -- Initiate asynchronous read of the specified block of a relation
*/
bool
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
switch (reln->smgr_relpersistence)
{
case 0:
/* probably shouldn't happen, but ignore it */
break;
case 0: /* probably shouldn't happen, but ignore it */
case RELPERSISTENCE_PERMANENT:
break;
@@ -1037,14 +1651,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (n_prefetch_requests < MAX_PREFETCH_REQUESTS)
{
prefetch_requests[n_prefetch_requests].rnode = reln->smgr_rnode.node;
prefetch_requests[n_prefetch_requests].forkNum = forknum;
prefetch_requests[n_prefetch_requests].blockNum = blocknum;
n_prefetch_requests += 1;
return true;
}
BufferTag tag = (BufferTag) {
.rnode = reln->smgr_rnode.node,
.forkNum = forknum,
.blockNum = blocknum
};
ring_index = prefetch_register_buffer(tag, NULL, NULL);
Assert(ring_index < MyPState->ring_unused &&
MyPState->ring_last <= ring_index);
return false;
}
@@ -1095,81 +1712,70 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, char *buffer)
{
NeonResponse *resp;
int i;
BufferTag buftag;
uint64 ring_index;
PrfHashEntry *entry;
PrefetchRequest *slot;
buftag = (BufferTag) {
.rnode = rnode,
.forkNum = forkNum,
.blockNum = blkno,
};
/*
* Try to find prefetched page. It is assumed that pages will be requested
* in the same order as them are prefetched, but some other backend may
* load page in shared buffers, so some prefetch responses should be
* skipped.
* Try to find prefetched page in the list of received pages.
*/
for (i = n_prefetched_buffers; i < n_prefetch_responses; i++)
{
resp = page_server->receive();
if (resp->tag == T_NeonGetPageResponse &&
RelFileNodeEquals(prefetch_responses[i].rnode, rnode) &&
prefetch_responses[i].forkNum == forkNum &&
prefetch_responses[i].blockNum == blkno)
{
char *page = ((NeonGetPageResponse *) resp)->page;
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
if (entry != NULL)
{
slot = entry->slot;
if (slot->effective_request_lsn >= request_lsn)
{
ring_index = slot->my_ring_index;
n_prefetch_hits += 1;
}
else /* the current prefetch LSN is not large enough, so drop the prefetch */
{
/*
* Check if prefetched page is still relevant. If it is updated by
* some other backend, then it should not be requested from smgr
* unless it is evicted from shared buffers. In the last case
* last_evicted_lsn should be updated and request_lsn should be
* greater than prefetch_lsn. Maximum with page LSN is used
* because page returned by page server may have LSN either
* greater either smaller than requested.
* We can't drop cache for not-yet-received requested items. It is
* unlikely this happens, but it can happen if prefetch distance is
* large enough and a backend didn't consume all prefetch requests.
*/
if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn)
if (slot->status == PRFS_REQUESTED)
{
n_prefetched_buffers = i + 1;
n_prefetch_hits += 1;
n_prefetch_requests = 0;
memcpy(buffer, page, BLCKSZ);
pfree(resp);
return;
prefetch_wait_for(slot->my_ring_index);
}
/* drop caches */
prefetch_set_unused(slot->my_ring_index);
n_prefetch_missed_caches += 1;
/* make it look like a prefetch cache miss */
entry = NULL;
}
pfree(resp);
}
n_prefetched_buffers = 0;
n_prefetch_responses = 0;
n_prefetch_misses += 1;
{
NeonGetPageRequest request = {
.req.tag = T_NeonGetPageRequest,
.req.latest = request_latest,
.req.lsn = request_lsn,
.rnode = rnode,
.forknum = forkNum,
.blkno = blkno
};
if (n_prefetch_requests > 0)
{
/* Combine all prefetch requests with primary request */
page_server->send((NeonRequest *) & request);
for (i = 0; i < n_prefetch_requests; i++)
{
request.rnode = prefetch_requests[i].rnode;
request.forknum = prefetch_requests[i].forkNum;
request.blkno = prefetch_requests[i].blockNum;
prefetch_responses[i] = prefetch_requests[i];
page_server->send((NeonRequest *) & request);
}
page_server->flush();
n_prefetch_responses = n_prefetch_requests;
n_prefetch_requests = 0;
prefetch_lsn = request_lsn;
resp = page_server->receive();
}
else
{
resp = page_server->request((NeonRequest *) & request);
}
if (entry == NULL)
{
n_prefetch_misses += 1;
ring_index = prefetch_register_buffer(buftag, &request_latest,
&request_lsn);
slot = GetPrfSlot(ring_index);
}
Assert(slot->my_ring_index == ring_index);
Assert(MyPState->ring_last <= ring_index &&
MyPState->ring_unused > ring_index);
Assert(slot->status != PRFS_UNUSED);
Assert(GetPrfSlot(ring_index) == slot);
prefetch_wait_for(ring_index);
Assert(slot->status == PRFS_RECEIVED);
resp = slot->response;
switch (resp->tag)
{
case T_NeonGetPageResponse:
@@ -1189,12 +1795,13 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
pfree(resp);
/* buffer was used, clean up for later reuse */
prefetch_set_unused(ring_index);
prefetch_cleanup();
}
/*
@@ -1816,7 +2423,6 @@ static const struct f_smgr neon_smgr =
.smgr_unlink = neon_unlink,
.smgr_extend = neon_extend,
.smgr_prefetch = neon_prefetch,
.smgr_reset_prefetch = neon_reset_prefetch,
.smgr_read = neon_read,
.smgr_write = neon_write,
.smgr_writeback = neon_writeback,

View File

@@ -43,6 +43,7 @@
#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif
#include "storage/fd.h"
#include "storage/latch.h"
#include "miscadmin.h"
#include "pgstat.h"
@@ -69,11 +70,12 @@
#include "neon.h"
#include "walproposer.h"
#include "walproposer_utils.h"
#include "replication/walpropshim.h"
static bool syncSafekeepers = false;
char *wal_acceptors_list;
int wal_acceptor_reconnect_timeout;
int wal_acceptor_connect_timeout;
int wal_acceptor_connection_timeout;
bool am_wal_proposer;
char *neon_timeline_walproposer = NULL;
@@ -117,8 +119,8 @@ static TimestampTz last_reconnect_attempt;
static WalproposerShmemState * walprop_shared;
/* Prototypes for private functions */
static void WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId);
static void WalProposerStartImpl(void);
static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
static void WalProposerStart(void);
static void WalProposerLoop(void);
static void InitEventSet(void);
static void UpdateEventSet(Safekeeper *sk, uint32 events);
@@ -186,9 +188,56 @@ pg_init_walproposer(void)
ProcessInterruptsCallback = backpressure_throttling_impl;
WalProposerRegister();
}
WalProposerInit = &WalProposerInitImpl;
WalProposerStart = &WalProposerStartImpl;
/*
* Entry point for `postgres --sync-safekeepers`.
*/
void
WalProposerSync(int argc, char *argv[])
{
struct stat stat_buf;
syncSafekeepers = true;
#if PG_VERSION_NUM < 150000
ThisTimeLineID = 1;
#endif
/*
* Initialize postmaster_alive_fds as WaitEventSet checks them.
*
* Copied from InitPostmasterDeathWatchHandle()
*/
if (pipe(postmaster_alive_fds) < 0)
ereport(FATAL,
(errcode_for_file_access(),
errmsg_internal("could not create pipe to monitor postmaster death: %m")));
if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFL, O_NONBLOCK) == -1)
ereport(FATAL,
(errcode_for_socket_access(),
errmsg_internal("could not set postmaster death monitoring pipe to nonblocking mode: %m")));
ChangeToDataDir();
/* Create pg_wal directory, if it doesn't exist */
if (stat(XLOGDIR, &stat_buf) != 0)
{
ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", XLOGDIR)));
if (MakePGDirectory(XLOGDIR) < 0)
{
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create directory \"%s\": %m",
XLOGDIR)));
exit(1);
}
}
WalProposerInit(0, 0);
BackgroundWorkerUnblockSignals();
WalProposerStart();
}
static void
@@ -217,9 +266,9 @@ nwp_register_gucs(void)
DefineCustomIntVariable(
"neon.safekeeper_connect_timeout",
"Timeout after which give up connection attempt to safekeeper.",
"Timeout for connection establishement and it's maintenance against safekeeper",
NULL,
&wal_acceptor_connect_timeout,
&wal_acceptor_connection_timeout,
5000, 0, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_MS,
@@ -368,7 +417,9 @@ WalProposerPoll(void)
ResetLatch(MyLatch);
break;
}
if (rc == 0) /* timeout expired: poll state */
now = GetCurrentTimestamp();
if (rc == 0 || TimeToReconnect(now) <= 0) /* timeout expired: poll state */
{
TimestampTz now;
@@ -389,13 +440,11 @@ WalProposerPoll(void)
{
Safekeeper *sk = &safekeeper[i];
if ((sk->state == SS_CONNECTING_WRITE ||
sk->state == SS_CONNECTING_READ) &&
TimestampDifferenceExceeds(sk->startedConnAt, now,
wal_acceptor_connect_timeout))
if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
wal_acceptor_connection_timeout))
{
elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms",
sk->host, sk->port, wal_acceptor_connect_timeout);
elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
ShutdownConnection(sk);
}
}
@@ -429,7 +478,7 @@ WalProposerRegister(void)
}
static void
WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId)
{
char *host;
char *sep;
@@ -508,7 +557,7 @@ WalProposerInitImpl(XLogRecPtr flushRecPtr, uint64 systemId)
}
static void
WalProposerStartImpl(void)
WalProposerStart(void)
{
/* Initiate connections to all safekeeper nodes */
@@ -711,7 +760,7 @@ ResetConnection(Safekeeper *sk)
elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
sk->state = SS_CONNECTING_WRITE;
sk->startedConnAt = GetCurrentTimestamp();
sk->latestMsgReceivedAt = GetCurrentTimestamp();
sock = walprop_socket(sk->conn);
sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
@@ -869,7 +918,7 @@ HandleConnectionEvent(Safekeeper *sk)
case WP_CONN_POLLING_OK:
elog(LOG, "connected with node %s:%s", sk->host,
sk->port);
sk->latestMsgReceivedAt = GetCurrentTimestamp();
/*
* We have to pick some event to update event set. We'll
* eventually need the socket to be readable, so we go with that.
@@ -2255,7 +2304,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
ResetConnection(sk);
return false;
}
sk->latestMsgReceivedAt = GetCurrentTimestamp();
switch (tag)
{
case 'g':

View File

@@ -30,7 +30,7 @@
extern char *wal_acceptors_list;
extern int wal_acceptor_reconnect_timeout;
extern int wal_acceptor_connect_timeout;
extern int wal_acceptor_connection_timeout;
extern bool am_wal_proposer;
struct WalProposerConn; /* Defined in libpqwalproposer */
@@ -371,7 +371,7 @@ typedef struct Safekeeper
int eventPos; /* position in wait event set. Equal to -1 if*
* no event */
SafekeeperState state; /* safekeeper state machine state */
TimestampTz startedConnAt; /* when connection attempt started */
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
AcceptorGreeting greetResponse; /* acceptor greeting */
VoteResponse voteResponse; /* the vote */
AppendResponse appendResponse; /* feedback for master */

View File

@@ -0,0 +1,22 @@
# pgxs/neon_walredo/Makefile
MODULE_big = neon_walredo
OBJS = \
$(WIN32RES) \
inmem_smgr.o \
walredoproc.o \
# This really should be guarded by $(with_libseccomp), but I couldn't
# make that work with pgxs. So we always compile it, but its contents
# are wrapped in #ifdef HAVE_LIBSECCOMP instead.
OBJS += seccomp.o
PGFILEDESC = "neon_walredo - helper process that runs in Neon pageserver"
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
ifeq ($(with_libseccomp),yes)
SHLIB_LINK += -lseccomp
endif

View File

@@ -3,9 +3,8 @@
* inmem_smgr.c
*
* This is an implementation of the SMGR interface, used in the WAL redo
* process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
* storage, the pages that are written out are kept in a small number of
* in-memory buffers.
* process. It has no persistent storage, the pages that are written out
* are kept in a small number of in-memory buffers.
*
* Normally, replaying a WAL record only needs to access a handful of
* buffers, which fit in the normal buffer cache, so this is just for
@@ -15,15 +14,11 @@
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* contrib/neon/inmem_smgr.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/xlog.h"
#include "pagestore_client.h"
#include "storage/block.h"
#include "storage/buf_internals.h"
#include "storage/relfilenode.h"
@@ -33,6 +28,8 @@
#include "access/xlogutils.h"
#endif
#include "inmem_smgr.h"
/* Size of the in-memory smgr */
#define MAX_PAGES 64
@@ -59,10 +56,34 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
return -1;
}
/* neon wal-redo storage manager functionality */
static void inmem_init(void);
static void inmem_open(SMgrRelation reln);
static void inmem_close(SMgrRelation reln, ForkNumber forknum);
static void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
static bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
static void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
static void inmem_extend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
static bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
static void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
static void inmem_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
static void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
static BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
static void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
static void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
/*
* inmem_init() -- Initialize private state
*/
void
static void
inmem_init(void)
{
used_pages = 0;
@@ -71,7 +92,7 @@ inmem_init(void)
/*
* inmem_exists() -- Does the physical file exist?
*/
bool
static bool
inmem_exists(SMgrRelation reln, ForkNumber forknum)
{
for (int i = 0; i < used_pages; i++)
@@ -90,7 +111,7 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
*
* If isRedo is true, it's okay for the relation to exist already.
*/
void
static void
inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
{
}
@@ -98,7 +119,7 @@ inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
/*
* inmem_unlink() -- Unlink a relation.
*/
void
static void
inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
{
}
@@ -112,7 +133,7 @@ inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
* EOF). Note that we assume writing a block beyond current EOF
* causes intervening file space to become filled with zeroes.
*/
void
static void
inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *buffer, bool skipFsync)
{
@@ -123,7 +144,7 @@ inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
/*
* inmem_open() -- Initialize newly-opened relation.
*/
void
static void
inmem_open(SMgrRelation reln)
{
}
@@ -131,7 +152,7 @@ inmem_open(SMgrRelation reln)
/*
* inmem_close() -- Close the specified relation, if it isn't closed already.
*/
void
static void
inmem_close(SMgrRelation reln, ForkNumber forknum)
{
}
@@ -139,7 +160,7 @@ inmem_close(SMgrRelation reln, ForkNumber forknum)
/*
* inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
*/
bool
static bool
inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
return true;
@@ -148,7 +169,7 @@ inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
/*
* inmem_writeback() -- Tell the kernel to write pages back to storage.
*/
void
static void
inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks)
{
@@ -157,7 +178,7 @@ inmem_writeback(SMgrRelation reln, ForkNumber forknum,
/*
* inmem_read() -- Read the specified block from a relation.
*/
void
static void
inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *buffer)
{
@@ -177,7 +198,7 @@ inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
* relation (ie, those before the current EOF). To extend a relation,
* use mdextend().
*/
void
static void
inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
@@ -224,7 +245,7 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
/*
* inmem_nblocks() -- Get the number of blocks stored in a relation.
*/
BlockNumber
static BlockNumber
inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
{
/*
@@ -243,7 +264,7 @@ inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
/*
* inmem_truncate() -- Truncate relation to specified number of blocks.
*/
void
static void
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
}
@@ -251,7 +272,7 @@ inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
/*
* inmem_immedsync() -- Immediately sync a relation to stable storage.
*/
void
static void
inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
{
}

View File

@@ -0,0 +1,17 @@
/*-------------------------------------------------------------------------
*
* inmem_smgr.h
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef INMEM_SMGR_H
#define INMEM_SMGR_H
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
extern void smgr_init_inmem(void);
#endif /* INMEM_SMGR_H */

View File

@@ -0,0 +1,22 @@
#ifndef NEON_SECCOMP_H
#define NEON_SECCOMP_H
#include <seccomp.h>
typedef struct {
int psr_syscall; /* syscall number */
uint32 psr_action; /* libseccomp action, e.g. SCMP_ACT_ALLOW */
} PgSeccompRule;
#define PG_SCMP(syscall, action) \
(PgSeccompRule) { \
.psr_syscall = SCMP_SYS(syscall), \
.psr_action = (action), \
}
#define PG_SCMP_ALLOW(syscall) \
PG_SCMP(syscall, SCMP_ACT_ALLOW)
extern void seccomp_load_rules(PgSeccompRule *syscalls, int count);
#endif /* NEON_SECCOMP_H */

Some files were not shown because too many files have changed in this diff Show More