mirror of
https://github.com/neondatabase/neon.git
synced 2026-07-01 11:10:37 +00:00
Compare commits
18 Commits
perf-summa
...
test-prefe
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c431a305aa | ||
|
|
2b728bc69e | ||
|
|
5184685ced | ||
|
|
9ae4da4f31 | ||
|
|
aca221ac8b | ||
|
|
d013a2b227 | ||
|
|
3f93c6c6f0 | ||
|
|
53267969d7 | ||
|
|
c4b417ecdb | ||
|
|
1d105727cb | ||
|
|
4787a744c2 | ||
|
|
ac3ccac56c | ||
|
|
638af96c51 | ||
|
|
1e21ca1afe | ||
|
|
46d30bf054 | ||
|
|
d0105cea1f | ||
|
|
e44e4a699b | ||
|
|
223834a420 |
4
.github/ansible/production.hosts.yaml
vendored
4
.github/ansible/production.hosts.yaml
vendored
@@ -22,6 +22,10 @@ storage:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-3:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-4:
|
||||
console_region_id: aws-us-west-2
|
||||
zenith-1-ps-5:
|
||||
console_region_id: aws-us-west-2
|
||||
|
||||
safekeepers:
|
||||
hosts:
|
||||
|
||||
2
.github/ansible/staging.hosts.yaml
vendored
2
.github/ansible/staging.hosts.yaml
vendored
@@ -3,7 +3,7 @@ storage:
|
||||
bucket_name: zenith-staging-storage-us-east-1
|
||||
bucket_region: us-east-1
|
||||
console_mgmt_base_url: http://console-staging.local
|
||||
etcd_endpoints: zenith-us-stage-etcd.local:2379
|
||||
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
remote_storage:
|
||||
|
||||
2
.github/workflows/benchmarking.yml
vendored
2
.github/workflows/benchmarking.yml
vendored
@@ -164,7 +164,7 @@ jobs:
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
|
||||
options: --init
|
||||
|
||||
38
.github/workflows/build_and_test.yml
vendored
38
.github/workflows/build_and_test.yml
vendored
@@ -18,8 +18,8 @@ env:
|
||||
|
||||
jobs:
|
||||
tag:
|
||||
runs-on: dev
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
outputs:
|
||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||
|
||||
@@ -46,7 +46,7 @@ jobs:
|
||||
id: build-tag
|
||||
|
||||
build-neon:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
@@ -236,7 +236,7 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
regress-tests:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
@@ -269,7 +269,7 @@ jobs:
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
benchmarks:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
@@ -300,7 +300,7 @@ jobs:
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
merge-allure-report:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
@@ -338,7 +338,7 @@ jobs:
|
||||
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
|
||||
|
||||
coverage-report:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
@@ -415,7 +415,7 @@ jobs:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
trigger-e2e-tests:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
options: --init
|
||||
@@ -460,7 +460,7 @@ jobs:
|
||||
}"
|
||||
|
||||
neon-image:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs: [ tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
|
||||
@@ -478,7 +478,7 @@ jobs:
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs: [ tag ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
|
||||
@@ -493,7 +493,7 @@ jobs:
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
compute-node-image-v14:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
needs: [ tag ]
|
||||
steps:
|
||||
@@ -510,7 +510,7 @@ jobs:
|
||||
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
compute-node-image-v15:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: gcr.io/kaniko-project/executor:v1.9.0-debug
|
||||
needs: [ tag ]
|
||||
steps:
|
||||
@@ -528,7 +528,7 @@ jobs:
|
||||
|
||||
test-images:
|
||||
needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -570,7 +570,7 @@ jobs:
|
||||
docker compose -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
promote-images:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs: [ tag, test-images ]
|
||||
if: github.event_name != 'workflow_dispatch'
|
||||
container: amazon/aws-cli
|
||||
@@ -586,7 +586,7 @@ jobs:
|
||||
aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
|
||||
|
||||
push-docker-hub:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
needs: [ promote-images, tag ]
|
||||
container: golang:1.19-bullseye
|
||||
|
||||
@@ -736,7 +736,7 @@ jobs:
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-new:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||
@@ -816,7 +816,7 @@ jobs:
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-proxy:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
@@ -858,7 +858,7 @@ jobs:
|
||||
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
deploy-proxy-new:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
@@ -928,7 +928,7 @@ jobs:
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
|
||||
promote-compatibility-data:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
2
.github/workflows/codestyle.yml
vendored
2
.github/workflows/codestyle.yml
vendored
@@ -115,7 +115,7 @@ jobs:
|
||||
run: cargo build --locked --all --all-targets
|
||||
|
||||
check-rust-dependencies:
|
||||
runs-on: dev
|
||||
runs-on: [ self-hosted, dev, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
|
||||
- [Source view](./sourcetree.md)
|
||||
- [docker.md](./docker.md) — Docker images and building pipeline.
|
||||
- [Error handling and logging]()
|
||||
- [Error handling and logging](./error-handling.md)
|
||||
- [Testing]()
|
||||
- [Unit testing]()
|
||||
- [Integration testing]()
|
||||
|
||||
198
docs/error-handling.md
Normal file
198
docs/error-handling.md
Normal file
@@ -0,0 +1,198 @@
|
||||
# Error handling and logging
|
||||
|
||||
## Logging errors
|
||||
|
||||
The principle is that errors are logged when they are handled. If you
|
||||
just propagate an error to the caller in a function, you don't need to
|
||||
log it; the caller will. But if you consume an error in a function,
|
||||
you *must* log it (if it needs to be logged at all).
|
||||
|
||||
For example:
|
||||
|
||||
```rust
|
||||
fn read_motd_file() -> std::io::Result<String> {
|
||||
let mut f = File::open("/etc/motd")?;
|
||||
let mut result = String::new();
|
||||
f.read_to_string(&mut result)?;
|
||||
result
|
||||
}
|
||||
```
|
||||
|
||||
Opening or reading the file could fail, but there is no need to log
|
||||
the error here. The function merely propagates the error to the
|
||||
caller, and it is up to the caller to log the error or propagate it
|
||||
further, if the failure is not expected. But if, for example, it is
|
||||
normal that the "/etc/motd" file doesn't exist, the caller can choose
|
||||
to silently ignore the error, or log it as an INFO or DEBUG level
|
||||
message:
|
||||
|
||||
```rust
|
||||
fn get_message_of_the_day() -> String {
|
||||
// Get the motd from /etc/motd, or return the default proverb
|
||||
match read_motd_file() {
|
||||
Ok(motd) => motd,
|
||||
Err(err) => {
|
||||
// It's normal that /etc/motd doesn't exist, but if we fail to
|
||||
// read it for some other reason, that's unexpected. The message
|
||||
// of the day isn't very important though, so we just WARN and
|
||||
// continue with the default in any case.
|
||||
if err.kind() != std::io::ErrorKind::NotFound {
|
||||
tracing::warn!("could not read \"/etc/motd\": {err:?}");
|
||||
}
|
||||
"An old error is always more popular than a new truth. - German proverb"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Error types
|
||||
|
||||
We use the `anyhow` crate widely. It contains many convenient macros
|
||||
like `bail!` and `ensure!` to construct and return errors, and to
|
||||
propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
|
||||
|
||||
A downside of `anyhow::Error` is that the caller cannot distinguish
|
||||
between different error cases. Most errors are propagated all the way
|
||||
to the mgmt API handler function, or the main loop that handles a
|
||||
connection with the compute node, and they are all handled the same
|
||||
way: the error is logged and returned to the client as an HTTP or
|
||||
libpq error.
|
||||
|
||||
But in some cases, we need to distinguish between errors and handle
|
||||
them differently. For example, attaching a tenant to the pageserver
|
||||
could fail either because the tenant has already been attached, or
|
||||
because we could not load its metadata from cloud storage. The first
|
||||
case is more or less expected. The console sends the Attach request to
|
||||
the pageserver, and the pageserver completes the operation, but the
|
||||
network connection might be lost before the console receives the
|
||||
response. The console will retry the operation in that case, but the
|
||||
tenant has already been attached. It is important that the pageserver
|
||||
responds with the HTTP 409 Conflict error in that case, rather
|
||||
than a generic HTTP 500 Internal Server Error.
|
||||
|
||||
If you need to distinguish between different kinds of errors, create a
|
||||
new `Error` type. The `thiserror` crate is useful for that. But in
|
||||
most cases `anyhow::Error` is good enough.
|
||||
|
||||
## Panics
|
||||
|
||||
Depending on where a panic happens, it can cause the whole pageserver
|
||||
or safekeeper to restart, or just a single tenant. In either case,
|
||||
that is pretty bad and causes an outage. Avoid panics. Never use
|
||||
`unwrap()` or other calls that might panic, to verify inputs from the
|
||||
network or from disk.
|
||||
|
||||
It is acceptable to use functions that might panic, like `unwrap()`, if
|
||||
it is obvious that it cannot panic. For example, if you have just
|
||||
checked that a variable is not None, it is OK to call `unwrap()` on it,
|
||||
but it is still preferable to use `expect("reason")` instead to explain
|
||||
why the function cannot fail.
|
||||
|
||||
`assert!` and `panic!` are reserved for checking clear invariants and
|
||||
very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
|
||||
or `bail!` instead.
|
||||
|
||||
## Error levels
|
||||
|
||||
`tracing::Level` doesn't provide very clear guidelines on what the
|
||||
different levels mean, or when to use which level. Here is how we use
|
||||
them:
|
||||
|
||||
### Error
|
||||
|
||||
Examples:
|
||||
- could not open file "foobar"
|
||||
- invalid tenant id
|
||||
|
||||
Errors are not expected to happen during normal operation. Incorrect
|
||||
inputs from client can cause ERRORs. For example, if a client tries to
|
||||
call a mgmt API that doesn't exist, or if a compute node passes
|
||||
an LSN that has already been garbage collected away.
|
||||
|
||||
These should *not* happen during normal operations. "Normal
|
||||
operations" is not a very precise concept. But for example, disk
|
||||
errors are not expected to happen when the system is working, so those
|
||||
count as Errors. However, if a TCP connection to a compute node is
|
||||
lost, that is not considered an Error, because it doesn't affect the
|
||||
pageserver's or safekeeper's operation in any way, and happens fairly
|
||||
frequently when compute nodes are shut down, or are killed abruptly
|
||||
because of errors in the compute.
|
||||
|
||||
**Errors are monitored, and always need human investigation to determine
|
||||
the cause.**
|
||||
|
||||
Whether something should be logged at ERROR, WARNING or INFO level can
|
||||
depend on the callers and clients. For example, it might be unexpected
|
||||
and a sign of a serious issue if the console calls the
|
||||
"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
|
||||
would be appropriate in that case. But if the console routinely calls
|
||||
the API after deleting a timeline, to check if the deletion has
|
||||
completed, then it would be totally normal and an INFO or DEBUG level
|
||||
message would be more appropriate. If a message is logged as an ERROR,
|
||||
but it in fact happens frequently in production and never requires any
|
||||
action, it should probably be demoted to an INFO level message.
|
||||
|
||||
### Warn
|
||||
|
||||
Examples:
|
||||
- could not remove temporary file "foobar.temp"
|
||||
- unrecognized file "foobar" in timeline directory
|
||||
|
||||
Warnings are similar to Errors, in that they should not happen
|
||||
when the system is operating normally. The difference between Error and
|
||||
Warning is that an Error means that the operation failed, whereas Warning
|
||||
means that something unexpected happened, but the operation continued anyway.
|
||||
For example, if deleting a file fails because the file did not exist in the first place,
|
||||
it should be logged as Warning.
|
||||
|
||||
> **Note:** The python regression tests, under `test_regress`, check the
|
||||
> pageserver log after each test for any ERROR and WARN lines. If there are
|
||||
> any ERRORs or WARNs that have not been explicitly listed in the test as
|
||||
> allowed, the test is marked as failed. This is to catch unexpected errors
|
||||
> e.g. in background operations, that don't cause immediate misbehaviour in
|
||||
> the tested functionality.
|
||||
|
||||
### Info
|
||||
|
||||
Info level is used to log useful information when the system is
|
||||
operating normally. Info level is appropriate e.g. for logging state
|
||||
changes, background operations, and network connections.
|
||||
|
||||
Examples:
|
||||
- "system is shutting down"
|
||||
- "tenant was created"
|
||||
- "retrying S3 upload"
|
||||
|
||||
### Debug & Trace
|
||||
|
||||
Debug and Trace level messages are not printed to the log in our normal
|
||||
production configuration, but could be enabled for a specific server or
|
||||
tenant, to aid debugging. (Although we don't actually have that
|
||||
capability as of this writing).
|
||||
|
||||
## Context
|
||||
|
||||
We use logging "spans" to hold context information about the current
|
||||
operation. Almost every operation happens on a particular tenant and
|
||||
timeline, so we enter a span with the "tenant_id" and "timeline_id"
|
||||
very early when processing an incoming API request, for example. All
|
||||
background operations should also run in a span containing at least
|
||||
those two fields, and any other parameters or information that might
|
||||
be useful when debugging an error that might happen when performing
|
||||
the operation.
|
||||
|
||||
TODO: Spans are not captured in the Error when it is created, but when
|
||||
the error is logged. It would be more useful to capture them at Error
|
||||
creation. We should consider using `tracing_error::SpanTrace` to do
|
||||
that.
|
||||
|
||||
## Error message style
|
||||
|
||||
PostgreSQL has a style guide for writing error messages:
|
||||
|
||||
https://www.postgresql.org/docs/current/error-style-guide.html
|
||||
|
||||
Follow that guide when writing error messages in the PostgreSQL
|
||||
extension. We don't follow it strictly in the pageserver and
|
||||
safekeeper, but the advice in the PostgreSQL style guide is generally
|
||||
good, and you can't go wrong by following it.
|
||||
@@ -138,7 +138,7 @@ impl FromStr for Lsn {
|
||||
///
|
||||
/// If the input string is missing the '/' character, then use `Lsn::from_hex`
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut splitter = s.split('/');
|
||||
let mut splitter = s.trim().split('/');
|
||||
if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
|
||||
{
|
||||
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
|
||||
@@ -270,6 +270,11 @@ mod tests {
|
||||
);
|
||||
assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
|
||||
assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
|
||||
|
||||
let expected_lsn = Lsn(0x3C490F8);
|
||||
assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
|
||||
assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
|
||||
assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -76,3 +76,7 @@ tempfile = "3.2"
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "bench_walredo"
|
||||
harness = false
|
||||
|
||||
453
pageserver/benches/bench_walredo.rs
Normal file
453
pageserver/benches/bench_walredo.rs
Normal file
File diff suppressed because one or more lines are too long
@@ -614,8 +614,9 @@ impl PageServerConf {
|
||||
PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
|
||||
let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
|
||||
|
||||
PageServerConf {
|
||||
id: NodeId(0),
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
@@ -626,7 +627,7 @@ impl PageServerConf {
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
superuser: "cloud_admin".to_string(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: PathBuf::new(),
|
||||
pg_distrib_dir,
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
|
||||
@@ -76,6 +76,12 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
FeMessage::CopyDone => { break },
|
||||
FeMessage::Sync => continue,
|
||||
FeMessage::Terminate => {
|
||||
let msg = format!("client terminated connection with Terminate message during COPY");
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
break;
|
||||
}
|
||||
m => {
|
||||
let msg = format!("unexpected message {:?}", m);
|
||||
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
|
||||
@@ -87,10 +93,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
|
||||
yield copy_data_bytes;
|
||||
}
|
||||
Ok(None) => {
|
||||
let msg = "client closed connection";
|
||||
let msg = "client closed connection during COPY";
|
||||
pgb.write_message(&BeMessage::ErrorResponse(msg))?;
|
||||
pgb.flush().await?;
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg))?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
}
|
||||
Err(e) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, e))?;
|
||||
|
||||
@@ -216,7 +216,6 @@ impl TenantConf {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn dummy_conf() -> Self {
|
||||
TenantConf {
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
|
||||
@@ -72,8 +72,6 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
if let Err(e) = tenant.compaction_iteration() {
|
||||
sleep_duration = wait_duration;
|
||||
error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
|
||||
#[cfg(feature = "testing")]
|
||||
std::process::abort();
|
||||
}
|
||||
|
||||
// Sleep
|
||||
@@ -123,8 +121,6 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
{
|
||||
sleep_duration = wait_duration;
|
||||
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
|
||||
#[cfg(feature = "testing")]
|
||||
std::process::abort();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -210,6 +210,16 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch process pre-emptively. Should not be needed except for benchmarking.
|
||||
pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> {
|
||||
let inner = self.process.get_mut().unwrap();
|
||||
if inner.is_none() {
|
||||
let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
|
||||
*inner = Some(p);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
|
||||
@@ -440,68 +440,35 @@ def pytest_terminal_summary(
|
||||
|
||||
terminalreporter.section("Benchmark results", "-")
|
||||
|
||||
# TODO group by test report
|
||||
reports = {
|
||||
report.head_line: report
|
||||
for report in terminalreporter.stats.get("passed", [])
|
||||
}
|
||||
result = []
|
||||
for test_report in terminalreporter.stats.get("passed", []):
|
||||
result_entry = []
|
||||
|
||||
results = []
|
||||
for name, report in reports.items():
|
||||
# terminalreporter.write(f"{name}", green=True)
|
||||
# terminalreporter.line("")
|
||||
if "[neon" in name:
|
||||
vanilla_report = reports.get(name.replace("[neon", "[vanilla"))
|
||||
if vanilla_report:
|
||||
for key, prop in report.user_properties:
|
||||
if prop["unit"] == "s":
|
||||
neon_value = prop["value"]
|
||||
vanilla_value = dict(vanilla_report.user_properties)[key]["value"]
|
||||
try:
|
||||
ratio = float(neon_value) / vanilla_value
|
||||
except ZeroDivisionError:
|
||||
ratio = 99999
|
||||
for _, recorded_property in test_report.user_properties:
|
||||
terminalreporter.write(
|
||||
"{}.{}: ".format(test_report.head_line, recorded_property["name"])
|
||||
)
|
||||
unit = recorded_property["unit"]
|
||||
value = recorded_property["value"]
|
||||
if unit == "MB":
|
||||
terminalreporter.write("{0:,.0f}".format(value), green=True)
|
||||
elif unit in ("s", "ms") and isinstance(value, float):
|
||||
terminalreporter.write("{0:,.3f}".format(value), green=True)
|
||||
elif isinstance(value, float):
|
||||
terminalreporter.write("{0:,.4f}".format(value), green=True)
|
||||
else:
|
||||
terminalreporter.write(str(value), green=True)
|
||||
terminalreporter.line(" {}".format(unit))
|
||||
|
||||
results.append((ratio, name.replace("[neon", "[neon/vanilla"), prop["name"]))
|
||||
result_entry.append(recorded_property)
|
||||
|
||||
results.sort(reverse=True)
|
||||
for ratio, test, prop in results:
|
||||
terminalreporter.write("{}.{}: ".format(test, prop))
|
||||
terminalreporter.write("{0:,.3f}".format(ratio), green=True)
|
||||
terminalreporter.line("")
|
||||
|
||||
# result = []
|
||||
# for test_report in terminalreporter.stats.get("passed", []):
|
||||
# result_entry = []
|
||||
|
||||
# durations = [
|
||||
# prop
|
||||
# for _, prop in test_report.user_properties
|
||||
# if prop["unit"] == "s"
|
||||
# ]
|
||||
|
||||
# for _, recorded_property in test_report.user_properties:
|
||||
# terminalreporter.write("{}.{}: ".format(test_report.head_line,
|
||||
# recorded_property["name"]))
|
||||
# unit = recorded_property["unit"]
|
||||
# value = recorded_property["value"]
|
||||
# if unit == "MB":
|
||||
# terminalreporter.write("{0:,.0f}".format(value), green=True)
|
||||
# elif unit in ("s", "ms") and isinstance(value, float):
|
||||
# terminalreporter.write("{0:,.3f}".format(value), green=True)
|
||||
# elif isinstance(value, float):
|
||||
# terminalreporter.write("{0:,.4f}".format(value), green=True)
|
||||
# else:
|
||||
# terminalreporter.write(str(value), green=True)
|
||||
# terminalreporter.line(" {}".format(unit))
|
||||
|
||||
# result_entry.append(recorded_property)
|
||||
|
||||
# result.append({
|
||||
# "suit": test_report.nodeid,
|
||||
# "total_duration": test_report.duration,
|
||||
# "data": result_entry,
|
||||
# })
|
||||
result.append(
|
||||
{
|
||||
"suit": test_report.nodeid,
|
||||
"total_duration": test_report.duration,
|
||||
"data": result_entry,
|
||||
}
|
||||
)
|
||||
|
||||
out_dir = config.getoption("out_dir")
|
||||
if out_dir is None:
|
||||
|
||||
@@ -784,6 +784,8 @@ class NeonEnvBuilder:
|
||||
|
||||
self.cleanup_remote_storage()
|
||||
|
||||
self.env.pageserver.assert_no_errors()
|
||||
|
||||
|
||||
class NeonEnv:
|
||||
"""
|
||||
@@ -1723,6 +1725,43 @@ class NeonPageserver(PgProtocol):
|
||||
self.config_override = config_override
|
||||
self.version = env.get_pageserver_version()
|
||||
|
||||
# After a test finishes, we will scrape the log to see if there are any
|
||||
# unexpected error messages. If your test expects an error, add it to
|
||||
# 'allowed_errors' in the test with something like:
|
||||
#
|
||||
# env.pageserver.allowed_errors.append(".*could not open garage door.*")
|
||||
#
|
||||
# The entries in the list are regular expressions.
|
||||
self.allowed_errors = [
|
||||
# All tests print these, when starting up or shutting down
|
||||
".*wal receiver task finished with an error: walreceiver connection handling failure.*",
|
||||
".*Shutdown task error: walreceiver connection handling failure.*",
|
||||
".*Etcd client error: grpc request error: status: Unavailable.*",
|
||||
".*query handler for .* failed: Connection reset by peer.*",
|
||||
".*serving compute connection task.*exited with error: Broken pipe.*",
|
||||
".*Connection aborted: error communicating with the server: Broken pipe.*",
|
||||
".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*",
|
||||
".*Connection aborted: error communicating with the server: Connection reset by peer.*",
|
||||
".*kill_and_wait_impl.*: wait successful.*",
|
||||
".*end streaming to Some.*",
|
||||
# safekeeper connection can fail with this, in the window between timeline creation
|
||||
# and streaming start
|
||||
".*Failed to process query for timeline .*: state uninitialized, no data to read.*",
|
||||
# Tests related to authentication and authorization print these
|
||||
".*Error processing HTTP request: Forbidden",
|
||||
# intentional failpoints
|
||||
".*failpoint ",
|
||||
# FIXME: there is a race condition between GC and detach, see
|
||||
# https://github.com/neondatabase/neon/issues/2442
|
||||
".*could not remove ephemeral file.*No such file or directory.*",
|
||||
# FIXME: These need investigation
|
||||
".*gc_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
|
||||
".*compaction_loop.*Failed to get a tenant .* Tenant .* not found in the local state.*",
|
||||
".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*",
|
||||
".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*",
|
||||
".*Removing intermediate uninit mark file.*",
|
||||
]
|
||||
|
||||
def start(self, overrides: Tuple[str, ...] = ()) -> "NeonPageserver":
|
||||
"""
|
||||
Start the page server.
|
||||
@@ -1771,6 +1810,26 @@ class NeonPageserver(PgProtocol):
|
||||
is_testing_enabled_or_skip=self.is_testing_enabled_or_skip,
|
||||
)
|
||||
|
||||
def assert_no_errors(self):
|
||||
logfile = open(os.path.join(self.env.repo_dir, "pageserver.log"), "r")
|
||||
|
||||
error_or_warn = re.compile("ERROR|WARN")
|
||||
errors = []
|
||||
while True:
|
||||
line = logfile.readline()
|
||||
if not line:
|
||||
break
|
||||
|
||||
if error_or_warn.search(line):
|
||||
# It's an ERROR or WARN. Is it in the allow-list?
|
||||
for a in self.allowed_errors:
|
||||
if re.match(a, line):
|
||||
break
|
||||
else:
|
||||
errors.append(line)
|
||||
|
||||
assert not errors
|
||||
|
||||
|
||||
def append_pageserver_param_overrides(
|
||||
params_to_update: List[str],
|
||||
@@ -2661,8 +2720,6 @@ def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[P
|
||||
|
||||
yield test_dir
|
||||
|
||||
shutil.rmtree(test_dir)
|
||||
|
||||
allure_attach_from_dir(test_dir)
|
||||
|
||||
|
||||
|
||||
@@ -25,6 +25,11 @@ def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers:
|
||||
|
||||
with closing(env.pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
|
||||
if True:
|
||||
cur.execute("set enable_seqscan_prefetch = on;")
|
||||
cur.execute("set seqscan_prefetch_buffers = 10;")
|
||||
|
||||
cur.execute("create table t (i integer);")
|
||||
cur.execute(f"insert into t values (generate_series(1,{rows}));")
|
||||
|
||||
|
||||
@@ -116,6 +116,13 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
pageserver_http_client = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*invalid branch start lsn: less than latest GC cutoff.*",
|
||||
".*invalid branch start lsn: less than planned GC cutoff.*",
|
||||
]
|
||||
)
|
||||
|
||||
# Disable background GC but set the `pitr_interval` to be small, so GC can delete something
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
|
||||
@@ -13,6 +13,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*invalid branch start lsn.*")
|
||||
env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
env.neon_cli.create_branch("test_branch_behind")
|
||||
pgmain = env.postgres.create_start("test_branch_behind")
|
||||
|
||||
@@ -11,10 +11,17 @@ from fixtures.types import TenantId, TimelineId
|
||||
# Test restarting page server, while safekeeper and compute node keep
|
||||
# running.
|
||||
def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
# One safekeeper is enough for this test.
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*No timelines to attach received.*",
|
||||
".*Failed to process timeline dir contents.*",
|
||||
".*Failed to load delta layer.*",
|
||||
".*Timeline .* was not found.*",
|
||||
]
|
||||
)
|
||||
|
||||
tenant_timelines: List[Tuple[TenantId, TimelineId, Postgres]] = []
|
||||
|
||||
for n in range(4):
|
||||
@@ -72,23 +79,24 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder):
|
||||
# First timeline would not get loaded into pageserver due to corrupt metadata file
|
||||
with pytest.raises(Exception, match=f"Timeline {tenant1}/{timeline1} was not found") as err:
|
||||
pg1.start()
|
||||
log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
|
||||
log.info(
|
||||
f"As expected, compute startup failed eagerly for timeline with corrupt metadata: {err}"
|
||||
)
|
||||
|
||||
# Second timeline has no ancestors, only the metadata file and no layer files
|
||||
# We don't have the remote storage enabled, which means timeline is in an incorrect state,
|
||||
# it's not loaded at all
|
||||
with pytest.raises(Exception, match=f"Timeline {tenant2}/{timeline2} was not found") as err:
|
||||
pg2.start()
|
||||
log.info(f"compute startup failed eagerly for timeline with corrupt metadata: {err}")
|
||||
log.info(f"As expected, compute startup failed for timeline with missing layers: {err}")
|
||||
|
||||
# Yet other timelines will fail when their layers will be queried during basebackup: we don't check layer file contents on startup, when loading the timeline
|
||||
for n in range(3, 4):
|
||||
(bad_tenant, bad_timeline, pg) = tenant_timelines[n]
|
||||
with pytest.raises(Exception, match="extracting base backup failed") as err:
|
||||
pg.start()
|
||||
log.info(
|
||||
f"compute startup failed lazily for timeline {bad_tenant}/{bad_timeline} with corrupt layers, during basebackup preparation: {err}"
|
||||
)
|
||||
# Third timeline will also fail during basebackup, because the layer file is corrupt.
|
||||
# (We don't check layer file contents on startup, when loading the timeline)
|
||||
with pytest.raises(Exception, match="Failed to load delta layer") as err:
|
||||
pg3.start()
|
||||
log.info(
|
||||
f"As expected, compute startup failed for timeline {tenant3}/{timeline3} with corrupt layers: {err}"
|
||||
)
|
||||
|
||||
|
||||
def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv):
|
||||
@@ -111,6 +119,13 @@ def test_timeline_init_break_before_checkpoint(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
|
||||
".*Timeline got dropped without initializing, cleaning its files.*",
|
||||
]
|
||||
)
|
||||
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
|
||||
timelines_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines"
|
||||
|
||||
@@ -2,7 +2,7 @@ import os
|
||||
import shutil
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Any, Optional
|
||||
|
||||
import pytest
|
||||
import toml # TODO: replace with tomllib for Python >= 3.11
|
||||
@@ -50,6 +50,12 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()])
|
||||
pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()])
|
||||
pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"])
|
||||
@@ -154,6 +160,7 @@ def test_forward_compatibility(
|
||||
from_dir=compatibility_snapshot_dir,
|
||||
to_dir=test_output_dir / "compatibility_snapshot",
|
||||
port_distributor=port_distributor,
|
||||
pg_distrib_dir=compatibility_postgres_distrib_dir,
|
||||
)
|
||||
|
||||
breaking_changes_allowed = (
|
||||
@@ -183,7 +190,12 @@ def test_forward_compatibility(
|
||||
), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
|
||||
|
||||
|
||||
def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistributor):
|
||||
def prepare_snapshot(
|
||||
from_dir: Path,
|
||||
to_dir: Path,
|
||||
port_distributor: PortDistributor,
|
||||
pg_distrib_dir: Optional[Path] = None,
|
||||
):
|
||||
assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist"
|
||||
assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory"
|
||||
assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql"
|
||||
@@ -208,7 +220,7 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
|
||||
# Update paths and ports in config files
|
||||
pageserver_toml = repo_dir / "pageserver.toml"
|
||||
pageserver_config = toml.load(pageserver_toml)
|
||||
pageserver_config["remote_storage"]["local_path"] = repo_dir / "local_fs_remote_storage"
|
||||
pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage")
|
||||
pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port(
|
||||
pageserver_config["listen_http_addr"]
|
||||
)
|
||||
@@ -219,6 +231,9 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
|
||||
port_distributor.replace_with_new_port(ep) for ep in pageserver_config["broker_endpoints"]
|
||||
]
|
||||
|
||||
if pg_distrib_dir:
|
||||
pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir)
|
||||
|
||||
with pageserver_toml.open("w") as f:
|
||||
toml.dump(pageserver_config, f)
|
||||
|
||||
@@ -238,7 +253,10 @@ def prepare_snapshot(from_dir: Path, to_dir: Path, port_distributor: PortDistrib
|
||||
sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"])
|
||||
sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"])
|
||||
|
||||
with (snapshot_config_toml).open("w") as f:
|
||||
if pg_distrib_dir:
|
||||
snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir)
|
||||
|
||||
with snapshot_config_toml.open("w") as f:
|
||||
toml.dump(snapshot_config, f)
|
||||
|
||||
# Ensure that snapshot doesn't contain references to the original path
|
||||
|
||||
@@ -179,7 +179,16 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
# run compute_ctl and wait for 10s
|
||||
try:
|
||||
ctl.raw_cli(
|
||||
["--connstr", ps_connstr, "--pgdata", pgdata, "--spec", spec, "--pgbin", pg_bin_path],
|
||||
[
|
||||
"--connstr",
|
||||
"postgres://invalid/",
|
||||
"--pgdata",
|
||||
pgdata,
|
||||
"--spec",
|
||||
spec,
|
||||
"--pgbin",
|
||||
pg_bin_path,
|
||||
],
|
||||
timeout=10,
|
||||
)
|
||||
except TimeoutExpired as exc:
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
|
||||
|
||||
|
||||
@@ -9,6 +10,11 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
|
||||
# test anyway, so it doesn't need any special attention here.
|
||||
def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# These warnings are expected, when the pageserver is restarted abruptly
|
||||
env.pageserver.allowed_errors.append(".*found future image layer.*")
|
||||
env.pageserver.allowed_errors.append(".*found future delta layer.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
|
||||
@@ -30,10 +36,9 @@ def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
|
||||
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
|
||||
|
||||
for i in range(5):
|
||||
try:
|
||||
for _ in range(5):
|
||||
with pytest.raises(Exception):
|
||||
pg_bin.run_capture(["pgbench", "-N", "-c5", "-T100", "-Mprepared", connstr])
|
||||
except Exception:
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
pageserver_http.configure_failpoints(("after-timeline-gc-removed-layers", "exit"))
|
||||
|
||||
@@ -76,6 +76,26 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.http_client().tenant_create(tenant)
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*error importing base backup .*",
|
||||
".*Timeline got dropped without initializing, cleaning its files.*",
|
||||
".*Removing intermediate uninit mark file.*",
|
||||
".*InternalServerError.*timeline not found.*",
|
||||
".*InternalServerError.*Tenant .* not found.*",
|
||||
".*InternalServerError.*Timeline .* not found.*",
|
||||
".*InternalServerError.*Cannot delete timeline which has child timelines.*",
|
||||
]
|
||||
)
|
||||
|
||||
# FIXME: we should clean up pageserver to not print this
|
||||
env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*")
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
def import_tar(base, wal):
|
||||
env.neon_cli.raw_cli(
|
||||
[
|
||||
@@ -122,6 +142,11 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu
|
||||
neon_env_builder.enable_local_fs_remote_storage()
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
timeline = env.neon_cli.create_branch("test_import_from_pageserver_small")
|
||||
pg = env.postgres.create_start("test_import_from_pageserver_small")
|
||||
|
||||
|
||||
@@ -67,6 +67,10 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
|
||||
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# These warnings are expected, when the pageserver is restarted abruptly
|
||||
env.pageserver.allowed_errors.append(".*found future image layer.*")
|
||||
env.pageserver.allowed_errors.append(".*found future delta layer.*")
|
||||
|
||||
# Use a tiny checkpoint distance, to create a lot of layers quickly.
|
||||
# That allows us to stress the compaction and layer flushing logic more.
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
|
||||
@@ -143,6 +143,8 @@ def test_read_validation_neg(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
env.neon_cli.create_branch("test_read_validation_neg", "empty")
|
||||
|
||||
env.pageserver.allowed_errors.append(".*invalid LSN\\(0\\) in request.*")
|
||||
|
||||
pg = env.postgres.create_start("test_read_validation_neg")
|
||||
log.info("postgres is running on 'test_read_validation_neg' branch")
|
||||
|
||||
|
||||
@@ -17,6 +17,8 @@ def test_readonly_node(neon_simple_env: NeonEnv):
|
||||
pgmain = env.postgres.create_start("test_readonly_node")
|
||||
log.info("postgres is running on 'test_readonly_node' branch")
|
||||
|
||||
env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*")
|
||||
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
|
||||
@@ -17,6 +17,10 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
neon_env_builder.start()
|
||||
|
||||
# These warnings are expected, when the pageserver is restarted abruptly
|
||||
env.pageserver.allowed_errors.append(".*found future delta layer.*")
|
||||
env.pageserver.allowed_errors.append(".*found future image layer.*")
|
||||
|
||||
# Create a branch for us
|
||||
env.neon_cli.create_branch("test_pageserver_recovery", "main")
|
||||
|
||||
|
||||
@@ -56,6 +56,17 @@ def test_remote_storage_backup_and_restore(
|
||||
|
||||
##### First start, insert secret data and upload it to the remote storage
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
|
||||
env.pageserver.allowed_errors.append(".*Tenant download is already in progress.*")
|
||||
env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*")
|
||||
env.pageserver.allowed_errors.append(".*No metadata file found in the timeline directory.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
|
||||
@@ -20,6 +20,11 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found in the local state")
|
||||
# FIXME: we have a race condition between GC and detach. GC might fail with this
|
||||
# error. Similar to https://github.com/neondatabase/neon/issues/2671
|
||||
env.pageserver.allowed_errors.append(".*InternalServerError\\(No such file or directory.*")
|
||||
|
||||
# first check for non existing tenant
|
||||
tenant_id = TenantId.generate()
|
||||
with pytest.raises(
|
||||
@@ -28,6 +33,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
):
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
|
||||
# the error will be printed to the log too
|
||||
env.pageserver.allowed_errors.append(".*Tenant not found for id.*")
|
||||
|
||||
# create new tenant
|
||||
tenant_id, timeline_id = env.neon_cli.create_tenant()
|
||||
|
||||
@@ -50,6 +58,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
bogus_timeline_id = TimelineId.generate()
|
||||
pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0)
|
||||
|
||||
# the error will be printed to the log too
|
||||
env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*")
|
||||
|
||||
# try to concurrently run gc and detach
|
||||
gc_thread = Thread(target=lambda: do_gc_target(pageserver_http, tenant_id, timeline_id))
|
||||
gc_thread.start()
|
||||
|
||||
@@ -259,6 +259,11 @@ def test_tenant_relocation(
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
# create folder for remote storage mock
|
||||
remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage"
|
||||
|
||||
|
||||
@@ -25,6 +25,13 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
|
||||
)
|
||||
initial_tenant_dirs = [d for d in tenants_dir.iterdir()]
|
||||
|
||||
neon_simple_env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*Failed to create directory structure for tenant .*, cleaning tmp data.*",
|
||||
".*Failed to fsync removed temporary tenant directory .*",
|
||||
]
|
||||
)
|
||||
|
||||
pageserver_http = neon_simple_env.pageserver.http_client()
|
||||
pageserver_http.configure_failpoints(("tenant-creation-before-tmp-rename", "return"))
|
||||
with pytest.raises(Exception, match="tenant-creation-before-tmp-rename"):
|
||||
@@ -206,6 +213,13 @@ def test_pageserver_with_empty_tenants(
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*Tenant .* has no timelines directory.*")
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
tenant_without_timelines_dir = env.initial_tenant
|
||||
|
||||
@@ -66,6 +66,11 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Rem
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
tenants_pgs: List[Tuple[TenantId, Postgres]] = []
|
||||
|
||||
for _ in range(1, 5):
|
||||
@@ -117,6 +122,13 @@ def test_tenants_attached_after_download(
|
||||
|
||||
##### First start, insert secret data and upload it to the remote storage
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# FIXME: Are these expected?
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
@@ -209,6 +221,16 @@ def test_tenant_upgrades_index_json_from_v0(
|
||||
# launch pageserver, populate the default tenants timeline, wait for it to be uploaded,
|
||||
# then go ahead and modify the "remote" version as if it was downgraded, needing upgrade
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# FIXME: Are these expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*Failed to get local tenant state: Tenant .* not found in the local state.*"
|
||||
)
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
@@ -315,6 +337,20 @@ def test_tenant_redownloads_truncated_file_on_startup(
|
||||
)
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*Redownloading locally existing .* due to size mismatch.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*Downloaded layer exists already but layer file metadata mismatches.*"
|
||||
)
|
||||
|
||||
# FIXME: Are these expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*No timelines to attach received.*")
|
||||
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
pg = env.postgres.create_start("main")
|
||||
|
||||
|
||||
@@ -7,6 +7,11 @@ from fixtures.utils import wait_until
|
||||
def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
env.pageserver.allowed_errors.append(".*Timeline .* was not found.*")
|
||||
env.pageserver.allowed_errors.append(".*timeline not found.*")
|
||||
env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*")
|
||||
env.pageserver.allowed_errors.append(".*Tenant .* not found in the local state.*")
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
# first try to delete non existing timeline
|
||||
|
||||
@@ -263,6 +263,12 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.neon_cli.create_branch("test_broker", "main")
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
pg = env.postgres.create_start("test_broker")
|
||||
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||
|
||||
@@ -306,6 +312,11 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
neon_env_builder.auth_enabled = auth_enabled
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# FIXME: Is this expected?
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
|
||||
env.neon_cli.create_branch("test_safekeepers_wal_removal")
|
||||
pg = env.postgres.create_start("test_safekeepers_wal_removal")
|
||||
|
||||
@@ -538,6 +549,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
|
||||
)
|
||||
|
||||
pg.stop_and_destroy()
|
||||
ps_cli.timeline_delete(tenant_id, timeline_id)
|
||||
|
||||
# Also delete and manually create timeline on safekeepers -- this tests
|
||||
# scenario of manual recovery on different set of safekeepers.
|
||||
@@ -562,7 +574,6 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
|
||||
shutil.copy(f_partial_saved, f_partial_path)
|
||||
|
||||
# recreate timeline on pageserver from scratch
|
||||
ps_cli.timeline_delete(tenant_id, timeline_id)
|
||||
ps_cli.timeline_create(tenant_id, timeline_id)
|
||||
|
||||
wait_lsn_timeout = 60 * 3
|
||||
@@ -1081,6 +1092,14 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
|
||||
neon_env_builder.auth_enabled = auth_enabled
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# FIXME: are these expected?
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*Failed to process query for timeline .*: Timeline .* was not found in global map.*",
|
||||
".*end streaming to Some.*",
|
||||
]
|
||||
)
|
||||
|
||||
# Create two tenants: one will be deleted, other should be preserved.
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id_1 = env.neon_cli.create_branch("br1") # Active, delete explicitly
|
||||
|
||||
@@ -22,6 +22,8 @@ def assert_child_processes(pageserver_pid, wal_redo_present=False, defunct_prese
|
||||
# as a zombie process.
|
||||
def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
# We intentionally test for a non-existent tenant.
|
||||
env.pageserver.allowed_errors.append(".*Tenant not found.*")
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
pagserver_pid = int((env.repo_dir / "pageserver.pid").read_text())
|
||||
|
||||
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: e9e0fd5947...1bf5e3f53c
Reference in New Issue
Block a user