Compare commits

..

13 Commits

Author SHA1 Message Date
Arpad Müller
a9e5da9613 wip 2023-09-01 00:19:47 +02:00
Arpad Müller
a5acfdaa5a Use write_and_fsync in save_metadata 2023-09-01 00:19:47 +02:00
Arpad Müller
70052ae1ca Make save_metadata async fn 2023-09-01 00:19:47 +02:00
Arpad Müller
fe69dd9a40 Add write_and_fsync function 2023-09-01 00:19:47 +02:00
Arpad Müller
930eccfcaa Make write_blob and the things it calls async fn 2023-09-01 00:19:47 +02:00
Arpad Müller
29c2381fa5 Move generics on trait into macro
Monomorphization is basically like macro expansion, it just happens at a
later compiler phase.
2023-09-01 00:19:47 +02:00
Arpad Müller
7839cda66a Remove bounds 2023-09-01 00:19:47 +02:00
Arpad Müller
d565df25d6 Make write_all_at async 2023-09-01 00:19:47 +02:00
Christian Schwarz
92b7d7f466 FileBlockReader::fill_buffer make it obvious that we need to switch to async API 2023-09-01 00:19:47 +02:00
Arpad Müller
cfabd8b598 Remove Read impl that was only used in one place 2023-09-01 00:19:47 +02:00
Arpad Müller
4432094443 Move used FileExt functions to inherent impls 2023-09-01 00:19:47 +02:00
Christian Schwarz
83babcce30 FileBlockReader<File> is never used 2023-09-01 00:19:47 +02:00
Arpad Müller
735e20112a Move VirtualFile::seek to inherent function 2023-09-01 00:19:47 +02:00
39 changed files with 863 additions and 816 deletions

View File

@@ -899,7 +899,7 @@ jobs:
- name: Cleanup ECR folder
run: rm -rf ~/.ecr
trigger-custom-extensions-build:
build-private-extensions:
runs-on: [ self-hosted, gen3, small ]
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
@@ -908,7 +908,8 @@ jobs:
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha || github.sha }}
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/build-custom-extensions"
curl -f -X POST \
@@ -938,53 +939,10 @@ jobs:
}
}"
wait-for-extensions-build:
runs-on: ubuntu-latest
needs: [ trigger-custom-extensions-build ]
steps:
- name: Wait for extension build to finish
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
TIMEOUT=600 # 10 minutes, currently it takes ~2-3 minutes
INTERVAL=15 # try each N seconds
last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
for ((i=0; i <= $TIMEOUT; i+=$INTERVAL)); do
sleep $INTERVAL
# Get statuses for the latest commit in the PR / branch
gh api \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha || github.sha }}" > statuses.json
# Get the latest status for the "build-and-upload-extensions" context
last_status=$(jq --raw-output '[.[] | select(.context == "build-and-upload-extensions")] | sort_by(.created_at)[-1].state' statuses.json)
if [ "${last_status}" = "pending" ]; then
# Extension build is still in progress.
continue
elif [ "${last_status}" = "success" ]; then
# Extension build is successful.
exit 0
else
# Status is neither "pending" nor "success", exit the loop and fail the job.
break
fi
done
# Extension build failed, print `statuses.json` for debugging and fail the job.
jq '.' statuses.json
echo >&2 "Status of extension build is '${last_status}' != 'success'"
exit 1
deploy:
runs-on: [ self-hosted, gen3, small ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
needs: [ promote-images, tag, regress-tests, wait-for-extensions-build ]
needs: [ promote-images, tag, regress-tests ]
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
steps:
- name: Fix git ownership

View File

@@ -1,6 +1,4 @@
use std::convert::Infallible;
use std::net::IpAddr;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
use std::sync::Arc;
use std::thread;
@@ -300,9 +298,7 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(port: u16, state: Arc<ComputeNode>) {
// this usually binds to both IPv4 and IPv6 on linux
// see e.g. https://github.com/rust-lang/rust/pull/34440
let addr = SocketAddr::new(IpAddr::from(Ipv6Addr::UNSPECIFIED), port);
let addr = SocketAddr::from(([0, 0, 0, 0], port));
let make_service = make_service_fn(move |_conn| {
let state = state.clone();

View File

@@ -6,4 +6,4 @@ pub const DEFAULT_LOG_LEVEL: &str = "info";
// https://www.postgresql.org/docs/15/auth-password.html
//
// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles.
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\tall\t\tmd5";
pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5";

View File

@@ -1,281 +0,0 @@
# Crash-Consistent Layer Map Updates By Leveraging `index_part.json`
* Created on: Aug 23, 2023
* Author: Christian Schwarz
## Summary
This RFC describes a simple scheme to make layer map updates crash consistent by leveraging the `index_part.json` in remote storage.
Without such a mechanism, crashes can induce certain edge cases in which broadly held assumptions about system invariants don't hold.
## Motivation
### Background
We can currently easily make complex, atomic updates to the layer map by means of an RwLock.
If we crash or restart pageserver, we reconstruct the layer map from:
1. local timeline directory contents
2. remote `index_part.json` contents.
The function that is responsible for this is called `Timeline::load_layer_map()`.
The reconciliation process's behavior is the following:
* local-only files will become part of the layer map as local-only layers and rescheduled for upload
* For a file name that, by its name, is present locally and in the remote `index_part.json`, but where the local file has a different size (future: checksum) than the remote file, we will delete the local file and leave the remote file as a `RemoteLayer` in the layer map.
### The Problem
There are are cases where we need to make an atomic update to the layer map that involves **more than one layer**.
The best example is compaction, where we need to insert the L1 layers generated from the L0 layers, and remove the L0 layers.
As stated above, making the update to the layer map in atomic way is trivial.
But, there is no system call API to make an atomic update to a directory that involves more than one file rename and deletion.
Currently, we issue the system calls one by one and hope we don't crash.
What happens if we crash and restart in the middle of that system call sequence?
We will reconstruct the layer map according to the reconciliation process, taking as input whatever transitory state the timeline directory ended up in.
We cannot roll back or complete the timeline directory update during which we crashed, because we keep no record of the changes we plan to make.
### Problem's Implications For Compaction
The implications of the above are primarily problematic for compaction.
Specifically, the part of it that compacts L0 layers into L1 layers.
Remember that compaction takes a set of L0 layers and reshuffles the delta records in them into L1 layer files.
Once the L1 layer files are written to disk, it atomically removes the L0 layers from the layer map and adds the L1 layers to the layer map.
It then deletes the L0 layers locally, and schedules an upload of the L1 layers and and updated index part.
If we crash before deleting L0s, but after writing out L1s, the next compaction after restart will re-digest the L0s and produce new L1s.
This means the compaction after restart will **overwrite** the previously written L1s.
Currently we also schedule an S3 upload of the overwritten L1.
If the compaction algorithm doesn't change between the two compaction runs, is deterministic, and uses the same set of L0s as input, then the second run will produce identical L1s and the overwrites will go unnoticed.
*However*:
1. the file size of the overwritten L1s may not be identical, and
2. the bit pattern of the overwritten L1s may not be identical, and,
3. in the future, we may want to make the compaction code non-determinstic, influenced by past access patterns, or otherwise change it, resulting in L1 overwrites with a different set of delta records than before the overwrite
The items above are a problem for the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919) because it assumes that layer files in S3 are only ever deleted, but never replaced (overPUTted).
For example, if an unresponsive node A becomes active again after control plane has relocated the tenant to a new node B, the node A may overwrite some L1s.
But node B based its world view on the version of node A's `index_part.json` from _before_ the overwrite.
That earlier `index_part.json`` contained the file size of the pre-overwrite L1.
If the overwritten L1 has a different file size, node B will refuse to read data from the overwritten L1.
Effectively, the data in the L1 has become inaccessible to node B.
If node B already uploaded an index part itself, all subsequent attachments will use node B's index part, and run into the same probem.
If we ever introduce checksums instead of checking just the file size, then a mismatching bit pattern (2) will cause similar problems.
In case of (1) and (2), where we know that the logical content of the layers is still the same, we can recover by manually patching the `index_part.json` of the new node to the overwritten L1's file size / checksum.
But if (3) ever happens, the logical content may be different, and, we could have truly lost data.
Given the above considerations, we should avoid making correctness of split-brain protection dependent on overwrites preserving _logical_ layer file contents.
**It is a much cleaner separation of concerns to require that layer files are truly immutable in S3, i.e., PUT once and then only DELETEd, never overwritten (overPUTted).**
## Design
Instead of reconciling a layer map from local timeline directory contents and remote index part, this RFC proposes to view the remote index part as authoritative during timeline load.
Local layer files will be recognized if they match what's listed in remote index part, and removed otherwise.
During **timeline load**, the only thing that matters is the remote index part content.
Essentially, timeline load becomes much like attach, except we don't need to prefix-list the remote timelines.
The local timeline dir's `metadata` file does not matter.
The layer files in the local timeline dir are seen as a nice-to-have cache of layer files that are in the remote index part.
Any layer files in the local timeline dir that aren't in the remote index part are removed during startup.
The `Timeline::load_layer_map()` no longer "merges" local timeline dir contents with the remote index part.
Instead, it treats the remote index part as the authoritative layer map.
If the local timeline dir contains a layer that is in the remote index part, that's nice, and we'll re-use it if file size (and in the future, check sum) match what's stated in the index part.
If it doesn't match, we remove the file from the local timeline dir.
After load, **at runtime**, nothing changes compared to what we did before this RFC.
The procedure for single- and multi-object changes is reproduced here for reference:
* For any new layers that the change adds:
* Write them to a temporary location.
* While holding layer map lock:
* Move them to the final location.
* Insert into layer map.
* Make the S3 changes.
We won't reproduce the remote timeline client method calls here because these are subject to change.
Instead we reproduce the sequence of s3 changes that must result for a given single-/multi-object change:
* PUT layer files inserted by the change.
* PUT an index part that has insertions and deletions of the change.
* DELETE the layer files that are deleted by the change.
Note that it is safe for the DELETE to be deferred arbitrarily.
* If it never happens, we leak the object, but, that's not a correctness concern.
* As of #4938, we don't schedule the remote timeline client operation for deletion immediately, but, only when we drop the `LayerInner`.
* With the [split-brain protection RFC](https://github.com/neondatabase/neon/pull/4919), the deletions will be written to deletion queue for processing when it's safe to do so (see the RFC for details).
## How This Solves The Problem
If we crash before we've finished the S3 changes, then timeline load will reset layer map to the state that's in the S3 index part.
The S3 change sequence above is obviously crash-consistent.
If we crash before the index part PUT, then we leak the inserted layer files to S3.
If we crash after the index part PUT, we leak the to-be-DELETEd layer files to S3.
Leaking is fine, it's a pre-existing condition and not addressed in this RFC.
Multi-object changes that previously created and removed files in timeline dir are now atomic because the layer map updates are atomic and crash consistent:
* atomic layer map update at runtime, currently by using an RwLock in write mode
* atomic `index_part.json` update in S3, as per guarantee that S3 PUT is atomic
* local timeline dir state:
* irrelevant for layer map content => irrelevant for atomic updates / crash consistency
* if we crash after index part PUT, local layer files will be used, so, no on-demand downloads neede for them
* if we crash before index part PUT, local layer files will be deleted
## Trade-Offs
### Fundamental
If we crash before finishing the index part PUT, we lose all the work that hasn't reached the S3 `index_part.json`:
* wal ingest: we lose not-yet-uploaded L0s; load on the **safekeepers** + work for pageserver
* compaction: we lose the entire compaction iteration work; need to re-do it again
* gc: no change to what we have today
If the work is still deemed necessary after restart, the restarted restarted pageserver will re-do this work.
The amount of work to be re-do is capped to the lag of S3 changes to the local changes.
Assuming upload queue allows for unlimited queue depth (that's what it does today), this means:
* on-demand downloads that were needed to do the work: are likely still present, not lost
* wal ingest: currently unbounded
* L0 => L1 compaction: CPU time proportional to `O(sum(L0 size))` and upload work proportional to `O()`
* Compaction threshold is 10 L0s and each L0 can be up to 256M in size. Target size for L1 is 128M.
* In practive, most L0s are tiny due to 10minute `DEFAULT_CHECKPOINT_TIMEOUT`.
* image layer generation: CPU time `O(sum(input data))` + upload work `O(sum(new image layer size))`
* I have no intuition how expensive / long-running it is in reality.
* gc: `update_gc_info`` work (not substantial, AFAIK)
To limit the amount of lost upload work, and ingest work, we can limit the upload queue depth (see suggestions in the next sub-section).
However, to limit the amount of lost CPU work, we would need a way to make make the compaction/image-layer-generation algorithms interruptible & resumable.
We aren't there yet, the need for it is tracked by ([#4580](https://github.com/neondatabase/neon/issues/4580)).
However, this RFC is not constraining the design space either.
### Practical
#### Pageserver Restarts
Pageserver crashes are very rare ; it would likely be acceptable to re-do the lost work in that case.
However, regular pageserver restart happen frequently, e.g., during weekly deploys.
In general, pageserver restart faces the problem of tenants that "take too long" to shut down.
They are a problem because other tenants that shut down quickly are unavailble while we wait for the slow tenants to shut down.
We currently allot 10 seconds for graceful shutdown until we SIGKILL the pageserver process (as per `pageserver.service` unit file).
A longer budget would expose tenants that are done early to a longer downtime.
A short budget would risk throwing away more work that'd have to be re-done after restart.
In the context of this RFC, killing the process would mean losing the work that hasn't made it to S3.
We can mitigate this problem as follows:
0. initially, by accepting that we need to do the work again
1. short-term, introducing measures to cap the amount of in-flight work:
- cap upload queue length, use backpressure to slow down compaction
- disabling compaction/image-layer-generation X minutes before `systemctl restart pageserver`
- introducing a read-only shutdown state for tenants that are fast to shut down;
that state would be equivalent to the state of a tenant in hot standby / readonly mode.
2. mid term, by not restarting pageserver in place, but using [*seamless tenant migration*](https://github.com/neondatabase/neon/pull/5029) to drain a pageserver's tenants before we restart it.
#### `disk_consistent_lsn` can go backwards
`disk_consistent_lsn` can go backwards across restarts if we crash before we've finished the index part PUT.
Nobody should care about it, because the only thing that matters is `remote_consistent_lsn`.
Compute certainly doesn't care about `disk_consistent_lsn`.
## Side-Effects Of This Design
* local `metadata` is basically reduced to a cache of which timelines exist for this tenant; i.e., we can avoid a `ListObjects` requests for a tenant's timelines during tenant load.
## Limitations
Multi-object changes that span multiple timelines aren't covered by this RFC.
That's fine because we currently don't need them, as evidenced by the absence
of a Pageserver operation that holds multiple timelines' layer map lock at a time.
## Impacted components
Primarily pageservers.
Safekeepers will experience more load when we need to re-ingest WAL because we've thrown away work.
No changes to safekeepers are needed.
## Alternatives considered
### Alternative 1: WAL
We could have a local WAL for timeline dir changes, as proposed here https://github.com/neondatabase/neon/issues/4418 and partially implemented here https://github.com/neondatabase/neon/pull/4422 .
The WAL would be used to
1. make multi-object changes atomic
2. replace `reconcile_with_remote()` reconciliation: scheduling of layer upload would be part of WAL replay.
The WAL is appealing in a local-first world, but, it's much more complex than the design described above:
* New on-disk state to get right.
* Forward- and backward-compatibility development costs in the future.
### Alternative 2: Flow Everything Through `index_part.json`
We could have gone to the other extreme and **only** update the layer map whenever we've PUT `index_part.json`.
I.e., layer map would always be the last-persisted S3 state.
That's axiomatically beautiful, not least because it fully separates the layer file production and consumption path (=> [layer file spreading proposal](https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843?pvs=4)).
And it might make hot standbys / read-only pageservers less of a special case in the future.
But, I have some uncertainties with regard to WAL ingestion, because it needs to be able to do some reads for the logical size feedback to safekeepers.
And it's silly that we wouldn't be able to use the results of compaction or image layer generation before we're done with the upload.
Lastly, a temporarily clogged-up upload queue (e.g. S3 is down) shouldn't immediately render ingestion unavailable.
### Alternative 3: Sequence Numbers For Layers
Instead of what's proposed in this RFC, we could use unique numbers to identify layer files:
```
# before
tenants/$tenant/timelines/$timeline/$key_and_lsn_range
# after
tenants/$tenant/timelines/$timeline/$layer_file_id-$key_and_lsn_range
```
To guarantee uniqueness, the unqiue number is a sequence number, stored in `index_part.json`.
This alternative does not solve atomic layer map updates.
In our crash-during-compaction scenario above, the compaction run after the crash will not overwrite the L1s, but write/PUT new files with new sequence numbers.
In fact, this alternative makes it worse because the data is now duplicated in the not-overwritten and overwritten L1 layer files.
We'd need to write a deduplication pass that checks if perfectly overlapping layers have identical contents.
However, this alternative is appealing because it systematically prevents overwrites at a lower level than this RFC.
So, this alternative is sufficient for the needs of the split-brain safety RFC (immutable layer files locally and in S3).
But it doesn't solve the problems with crash-during-compaction outlined earlier in this RFC, and in fact, makes it much more accute.
The proposed design in this RFC addresses both.
So, if this alternative sounds appealing, we should implement the proposal in this RFC first, then implement this alternative on top.
That way, we avoid a phase where the crash-during-compaction problem is accute.
## Related issues
- https://github.com/neondatabase/neon/issues/4749
- https://github.com/neondatabase/neon/issues/4418
- https://github.com/neondatabase/neon/pull/4422
- https://github.com/neondatabase/neon/issues/5077
- https://github.com/neondatabase/neon/issues/4088
- (re)resolutions:
- https://github.com/neondatabase/neon/pull/4696
- https://github.com/neondatabase/neon/pull/4094
- https://neondb.slack.com/archives/C033QLM5P7D/p1682519017949719
Note that the test case introduced in https://github.com/neondatabase/neon/pull/4696/files#diff-13114949d1deb49ae394405d4c49558adad91150ba8a34004133653a8a5aeb76 will produce L1s with the same logical content, but, as outlined in the last paragraph of the _Problem Statement_ section above, we don't want to make that assumption in order to fix the problem.
## Implementation Plan
1. Remove support for `remote_storage=None`, because we now rely on the existence of an index part.
- The nasty part here is to fix all the tests that fiddle with the local timeline directory.
Possibly they are just irrelevant with this change, but, each case will require inspection.
2. Implement the design above.
- Initially, ship without the mitigations for restart and accept we will do some work twice.
- Measure the impact and implement one of the mitigations.

View File

@@ -68,7 +68,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
},
)
.await?;
let cursor = BlockCursor::new_fileblockreader(&file);
let cursor = BlockCursor::new_fileblockreader_virtual(&file);
for (k, v) in all {
let value = cursor.read_blob(v.pos()).await?;
println!("key:{} value_len:{}", k, value.len());

View File

@@ -32,7 +32,9 @@ use std::fmt::Debug;
use std::fmt::Display;
use std::fs;
use std::fs::File;
use std::fs::OpenOptions;
use std::io;
use std::io::Write;
use std::ops::Bound::Included;
use std::path::Path;
use std::path::PathBuf;
@@ -113,6 +115,7 @@ pub mod block_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_map;
pub mod manifest;
mod span;
pub mod metadata;
@@ -192,7 +195,7 @@ pub struct Tenant {
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: Option<GenericRemoteStorage>,
remote_storage: Option<GenericRemoteStorage>,
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
@@ -404,6 +407,7 @@ impl Tenant {
remote_startup_data: Option<RemoteStartupData>,
local_metadata: Option<TimelineMetadata>,
ancestor: Option<Arc<Timeline>>,
first_save: bool,
init_order: Option<&InitializationOrder>,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
@@ -437,9 +441,15 @@ impl Tenant {
// Save the metadata file to local disk.
if !picked_local {
save_metadata(self.conf, &tenant_id, &timeline_id, up_to_date_metadata)
.await
.context("save_metadata")?;
save_metadata(
self.conf,
&tenant_id,
&timeline_id,
up_to_date_metadata,
first_save,
)
.await
.context("save_metadata")?;
}
let index_part = remote_startup_data.as_ref().map(|x| &x.index_part);
@@ -824,6 +834,7 @@ impl Tenant {
}),
local_metadata,
ancestor,
true,
None,
ctx,
)
@@ -1376,6 +1387,7 @@ impl Tenant {
remote_startup_data,
Some(local_metadata),
ancestor,
false,
init_order,
ctx,
)
@@ -1509,15 +1521,6 @@ impl Tenant {
tline.maybe_spawn_flush_loop();
tline.freeze_and_flush().await.context("freeze_and_flush")?;
// Make sure the freeze_and_flush reaches remote storage.
tline
.remote_client
.as_ref()
.unwrap()
.wait_completion()
.await
.unwrap();
let tl = uninit_tl.finish_creation()?;
// The non-test code would call tl.activate() here.
tl.set_state(TimelineState::Active);
@@ -1694,6 +1697,65 @@ impl Tenant {
Ok(())
}
/// Flush all in-memory data to disk and remote storage, if any.
///
/// Used at graceful shutdown.
async fn freeze_and_flush_on_shutdown(&self) {
let mut js = tokio::task::JoinSet::new();
// execute on each timeline on the JoinSet, join after.
let per_timeline = |timeline_id: TimelineId, timeline: Arc<Timeline>| {
async move {
debug_assert_current_span_has_tenant_and_timeline_id();
match timeline.freeze_and_flush().await {
Ok(()) => {}
Err(e) => {
warn!("failed to freeze and flush: {e:#}");
return;
}
}
let res = if let Some(client) = timeline.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.wait_completion().await
} else {
Ok(())
};
if let Err(e) = res {
warn!("failed to await for frozen and flushed uploads: {e:#}");
}
}
.instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id))
};
{
let timelines = self.timelines.lock().unwrap();
timelines
.iter()
.map(|(id, tl)| (*id, Arc::clone(tl)))
.for_each(|(timeline_id, timeline)| {
js.spawn(per_timeline(timeline_id, timeline));
})
};
while let Some(res) = js.join_next().await {
match res {
Ok(()) => {}
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
Err(je) if je.is_panic() => { /* logged already */ }
Err(je) => warn!("unexpected JoinError: {je:?}"),
}
}
}
pub fn current_state(&self) -> TenantState {
self.state.borrow().clone()
}
@@ -1824,22 +1886,19 @@ impl Tenant {
}
};
let mut js = tokio::task::JoinSet::new();
{
let timelines = self.timelines.lock().unwrap();
timelines.values().for_each(|timeline| {
let timeline = Arc::clone(timeline);
let span = Span::current();
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
})
};
while let Some(res) = js.join_next().await {
match res {
Ok(()) => {}
Err(je) if je.is_cancelled() => unreachable!("no cancelling used"),
Err(je) if je.is_panic() => { /* logged already */ }
Err(je) => warn!("unexpected JoinError: {je:?}"),
}
if freeze_and_flush {
// walreceiver has already began to shutdown with TenantState::Stopping, but we need to
// await for them to stop.
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_id),
None,
)
.await;
// this will wait for uploads to complete; in the past, it was done outside tenant
// shutdown in pageserver::shutdown_pageserver.
self.freeze_and_flush_on_shutdown().await;
}
// shutdown all tenant and timeline tasks: gc, compaction, page service
@@ -2366,37 +2425,72 @@ impl Tenant {
Ok(tenant_conf)
}
#[tracing::instrument(skip_all, fields(%tenant_id))]
pub(super) async fn persist_tenant_config(
pub(super) fn persist_tenant_config(
tenant_id: &TenantId,
target_config_path: &Path,
tenant_conf: TenantConfOpt,
creating_tenant: bool,
) -> anyhow::Result<()> {
// imitate a try-block with a closure
info!("persisting tenantconf to {}", target_config_path.display());
let _enter = info_span!("saving tenantconf").entered();
let mut conf_content = r#"# This file contains a specific per-tenant's config.
// imitate a try-block with a closure
let do_persist = |target_config_path: &Path| -> anyhow::Result<()> {
let target_config_parent = target_config_path.parent().with_context(|| {
format!(
"Config path does not have a parent: {}",
target_config_path.display()
)
})?;
info!("persisting tenantconf to {}", target_config_path.display());
let mut conf_content = r#"# This file contains a specific per-tenant's config.
# It is read in case of pageserver restart.
[tenant_config]
"#
.to_string();
.to_string();
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
// Convert the config to a toml file.
conf_content += &toml_edit::ser::to_string(&tenant_conf)?;
let conf_content = conf_content.as_bytes();
let mut target_config_file = VirtualFile::open_with_options(
target_config_path,
OpenOptions::new()
.truncate(true) // This needed for overwriting with small config files
.write(true)
.create_new(creating_tenant)
// when creating a new tenant, first_save will be true and `.create(true)` will be
// ignored (per rust std docs).
//
// later when updating the config of created tenant, or persisting config for the
// first time for attached tenant, the `.create(true)` is used.
.create(true),
)?;
let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
.await
.with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})?;
Ok(())
target_config_file
.write(conf_content.as_bytes())
.context("write toml bytes into file")
.and_then(|_| target_config_file.sync_all().context("fsync config file"))
.context("write config file")?;
// fsync the parent directory to ensure the directory entry is durable.
// before this was done conditionally on creating_tenant, but these management actions are rare
// enough to just fsync it always.
crashsafe::fsync(target_config_parent)?;
// XXX we're not fsyncing the parent dir, need to do that in case `creating_tenant`
Ok(())
};
// this function is called from creating the tenant and updating the tenant config, which
// would otherwise share this context, so keep it here in one place.
do_persist(target_config_path).with_context(|| {
format!(
"write tenant {tenant_id} config to {}",
target_config_path.display()
)
})
}
//
@@ -2930,9 +3024,15 @@ impl Tenant {
anyhow::bail!("failpoint after-timeline-uninit-mark-creation");
});
save_metadata(self.conf, &self.tenant_id, new_timeline_id, new_metadata)
.await
.context("Failed to create timeline metadata")?;
save_metadata(
self.conf,
&self.tenant_id,
new_timeline_id,
new_metadata,
true,
)
.await
.context("Failed to create timeline metadata")?;
Ok(())
}
@@ -3079,7 +3179,7 @@ pub(crate) enum CreateTenantFilesMode {
Attach,
}
pub(crate) async fn create_tenant_files(
pub(crate) fn create_tenant_files(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: &TenantId,
@@ -3115,8 +3215,7 @@ pub(crate) async fn create_tenant_files(
mode,
&temporary_tenant_dir,
&target_tenant_directory,
)
.await;
);
if creation_result.is_err() {
error!("Failed to create directory structure for tenant {tenant_id}, cleaning tmp data");
@@ -3134,7 +3233,7 @@ pub(crate) async fn create_tenant_files(
Ok(target_tenant_directory)
}
async fn try_create_target_tenant_dir(
fn try_create_target_tenant_dir(
conf: &'static PageServerConf,
tenant_conf: TenantConfOpt,
tenant_id: &TenantId,
@@ -3173,7 +3272,7 @@ async fn try_create_target_tenant_dir(
)
.with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?;
Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf, true)?;
crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
format!(
@@ -3378,8 +3477,6 @@ pub mod harness {
pub tenant_conf: TenantConf,
pub tenant_id: TenantId,
pub generation: Generation,
remote_storage: GenericRemoteStorage,
pub remote_fs_dir: PathBuf,
}
static LOG_HANDLE: OnceCell<()> = OnceCell::new();
@@ -3417,39 +3514,29 @@ pub mod harness {
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
let remote_fs_dir = conf.workdir.join("localfs");
std::fs::create_dir_all(&remote_fs_dir).unwrap();
let config = RemoteStorageConfig {
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
// TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
Ok(Self {
conf,
tenant_conf,
tenant_id,
generation: Generation::new(0xdeadbeef),
remote_storage,
remote_fs_dir,
})
}
pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
(
self.try_load(&ctx)
self.try_load(&ctx, None)
.await
.expect("failed to load test tenant"),
ctx,
)
}
pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result<Arc<Tenant>> {
pub async fn try_load(
&self,
ctx: &RequestContext,
remote_storage: Option<remote_storage::GenericRemoteStorage>,
) -> anyhow::Result<Arc<Tenant>> {
let walredo_mgr = Arc::new(TestRedoManager);
let tenant = Arc::new(Tenant::new(
@@ -3459,7 +3546,7 @@ pub mod harness {
walredo_mgr,
self.tenant_id,
self.generation,
Some(self.remote_storage.clone()),
remote_storage,
));
tenant
.load(None, ctx)
@@ -3930,13 +4017,6 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x7000), DEFAULT_PG_VERSION, &ctx)
.await?;
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
}
let (tenant, _ctx) = harness.load().await;
@@ -3970,14 +4050,6 @@ mod tests {
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
}
// check that both of them are initially unloaded
@@ -4030,13 +4102,6 @@ mod tests {
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await?;
drop(tline);
// so that all uploads finish & we can call harness.try_load() below again
tenant
.shutdown(Default::default(), true)
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_id))
.await
.ok()
.unwrap();
drop(tenant);
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
@@ -4048,7 +4113,11 @@ mod tests {
metadata_bytes[8] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;
let err = harness.try_load(&ctx).await.err().expect("should fail");
let err = harness
.try_load(&ctx, None)
.await
.err()
.expect("should fail");
// get all the stack with all .context, not only the last one
let message = format!("{err:#}");
let expected = "failed to load metadata";
@@ -4504,11 +4573,6 @@ mod tests {
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
// Keeps uninit mark in place
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown(false)
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
.await;
std::mem::forget(tline);
}

View File

@@ -123,33 +123,38 @@ impl<W> WriteBlobWriter<W> {
}
}
impl<W> BlobWriter for WriteBlobWriter<W>
where
W: std::io::Write,
{
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
let offset = self.offset;
macro_rules! write_blob_impl {
(WriteBlobWriter<$ty:ty>) => {
impl WriteBlobWriter<$ty> {
pub async fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
use std::io::Write;
let offset = self.offset;
if srcbuf.len() < 128 {
// Short blob. Write a 1-byte length header
let len_buf = srcbuf.len() as u8;
self.inner.write_all(&[len_buf])?;
self.offset += 1;
} else {
// Write a 4-byte length header
if srcbuf.len() > 0x7fff_ffff {
return Err(Error::new(
ErrorKind::Other,
format!("blob too large ({} bytes)", srcbuf.len()),
));
if srcbuf.len() < 128 {
// Short blob. Write a 1-byte length header
let len_buf = srcbuf.len() as u8;
self.inner.write_all(&[len_buf])?;
self.offset += 1;
} else {
// Write a 4-byte length header
if srcbuf.len() > 0x7fff_ffff {
return Err(Error::new(
ErrorKind::Other,
format!("blob too large ({} bytes)", srcbuf.len()),
));
}
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
len_buf[0] |= 0x80;
self.inner.write_all(&len_buf)?;
self.offset += 4;
}
self.inner.write_all(srcbuf)?;
self.offset += srcbuf.len() as u64;
Ok(offset)
}
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
len_buf[0] |= 0x80;
self.inner.write_all(&len_buf)?;
self.offset += 4;
}
self.inner.write_all(srcbuf)?;
self.offset += srcbuf.len() as u64;
Ok(offset)
}
};
}
write_blob_impl!(WriteBlobWriter<crate::tenant::io::BufWriter<crate::virtual_file::VirtualFile>>);
write_blob_impl!(WriteBlobWriter<crate::virtual_file::VirtualFile>);

View File

@@ -71,7 +71,7 @@ impl<'a> Deref for BlockLease<'a> {
///
/// Unlike traits, we also support the read function to be async though.
pub(crate) enum BlockReaderRef<'a> {
FileBlockReaderVirtual(&'a FileBlockReader),
FileBlockReaderVirtual(&'a FileBlockReader<VirtualFile>),
EphemeralFile(&'a EphemeralFile),
Adapter(Adapter<&'a DeltaLayerInner>),
#[cfg(test)]
@@ -101,7 +101,7 @@ impl<'a> BlockReaderRef<'a> {
///
/// ```no_run
/// # use pageserver::tenant::block_io::{BlockReader, FileBlockReader};
/// # let reader: FileBlockReader = unimplemented!("stub");
/// # let reader: FileBlockReader<std::fs::File> = unimplemented!("stub");
/// let cursor = reader.block_cursor();
/// let buf = cursor.read_blk(1);
/// // do stuff with 'buf'
@@ -118,7 +118,7 @@ impl<'a> BlockCursor<'a> {
BlockCursor { reader }
}
// Needed by cli
pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self {
pub fn new_fileblockreader_virtual(reader: &'a FileBlockReader<VirtualFile>) -> Self {
BlockCursor {
reader: BlockReaderRef::FileBlockReaderVirtual(reader),
}
@@ -139,14 +139,14 @@ impl<'a> BlockCursor<'a> {
///
/// The file is assumed to be immutable. This doesn't provide any functions
/// for modifying the file, nor for invalidating the cache if it is modified.
pub struct FileBlockReader {
pub file: VirtualFile,
pub struct FileBlockReader<F> {
pub file: F,
/// Unique ID of this file, used as key in the page cache.
file_id: page_cache::FileId,
}
impl FileBlockReader {
impl FileBlockReader<VirtualFile> {
pub fn new(file: VirtualFile) -> Self {
let file_id = page_cache::next_file_id();
@@ -156,9 +156,7 @@ impl FileBlockReader {
/// Read a page from the underlying file into given buffer.
async fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), std::io::Error> {
assert!(buf.len() == PAGE_SZ);
self.file
.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
.await
self.file.read_exact_at(buf, blkno as u64 * PAGE_SZ as u64)
}
/// Read a block.
///
@@ -191,7 +189,7 @@ impl FileBlockReader {
}
}
impl BlockReader for FileBlockReader {
impl BlockReader for FileBlockReader<VirtualFile> {
fn block_cursor(&self) -> BlockCursor<'_> {
BlockCursor::new(BlockReaderRef::FileBlockReaderVirtual(self))
}

View File

@@ -87,8 +87,7 @@ impl EphemeralFile {
let buf: &mut [u8] = write_guard.deref_mut();
debug_assert_eq!(buf.len(), PAGE_SZ);
self.file
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
.await?;
.read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)?;
write_guard.mark_valid();
// Swap for read lock

View File

@@ -0,0 +1,326 @@
//! This module contains the encoding and decoding of the local manifest file.
//!
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
//! records the state of the storage engine. It contains a snapshot of the
//! state and all operations proceeding that snapshot. The file begins with a
//! header recording MANIFEST version number. After that, it contains a snapshot.
//! The snapshot is followed by a list of operations. Each operation is a list
//! of records. Each record is either an addition or a removal of a layer.
//!
//! With MANIFEST, we can:
//!
//! 1. recover state quickly by reading the file, potentially boosting the
//! startup speed.
//! 2. ensure all operations are atomic and avoid corruption, solving issues
//! like redundant image layer and preparing us for future compaction
//! strategies.
//!
//! There is also a format for storing all layer files on S3, called
//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which
//! records all operations as logs, and therefore we can easily replay the
//! operations when recovering from crash, while ensuring those operations
//! are atomic upon restart.
//!
//! Currently, this is not used in the system. Future refactors will ensure
//! the storage state will be recorded in this file, and the system can be
//! recovered from this file. This is tracked in
//! <https://github.com/neondatabase/neon/issues/4418>
use std::io::{self, Write};
use crate::virtual_file::VirtualFile;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::crc32c;
use serde::{Deserialize, Serialize};
use tracing::log::warn;
use utils::lsn::Lsn;
use super::storage_layer::PersistentLayerDesc;
pub struct Manifest {
file: VirtualFile,
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct Snapshot {
pub layers: Vec<PersistentLayerDesc>,
}
/// serde by default encode this in tagged enum, and therefore it will be something
/// like `{ "AddLayer": { ... } }`.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Record {
AddLayer(PersistentLayerDesc),
RemoveLayer(PersistentLayerDesc),
}
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
const MANIFEST_VERSION: u64 = 1;
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub struct ManifestHeader {
magic_number: u64,
version: u64,
}
const MANIFEST_HEADER_LEN: usize = 16;
impl ManifestHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
buf.put_u64(self.magic_number);
buf.put_u64(self.version);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
Self {
magic_number: buf.get_u64(),
version: buf.get_u64(),
}
}
}
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
pub enum Operation {
/// A snapshot of the current state.
///
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
Snapshot(Snapshot, Lsn),
/// An atomic operation that changes the state.
///
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
/// This will only change when new L0 is flushed to the disk.
Operation(Vec<Record>, Lsn),
}
struct RecordHeader {
size: u32,
checksum: u32,
}
const RECORD_HEADER_LEN: usize = 8;
impl RecordHeader {
fn encode(&self) -> BytesMut {
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
buf.put_u32(self.size);
buf.put_u32(self.checksum);
buf
}
fn decode(mut buf: &[u8]) -> Self {
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
Self {
size: buf.get_u32(),
checksum: buf.get_u32(),
}
}
}
#[derive(Debug, thiserror::Error)]
pub enum ManifestLoadError {
#[error("manifest header is corrupted")]
CorruptedManifestHeader,
#[error("unsupported manifest version: got {0}, expected {1}")]
UnsupportedVersion(u64, u64),
#[error("error when decoding record: {0}")]
DecodeRecord(serde_json::Error),
#[error("I/O error: {0}")]
Io(io::Error),
}
#[must_use = "Should check if the manifest is partially corrupted"]
pub struct ManifestPartiallyCorrupted(bool);
impl Manifest {
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
let mut manifest = Self { file };
manifest.append_manifest_header(ManifestHeader {
magic_number: MANIFEST_MAGIC_NUMBER,
version: MANIFEST_VERSION,
})?;
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
Ok(manifest)
}
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
/// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
/// backup the current one.
pub async fn load(
file: VirtualFile,
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
let mut buf = vec![];
file.read_exact_at(&mut buf, 0)
.map_err(ManifestLoadError::Io)?;
// Read manifest header
let mut buf = Bytes::from(buf);
if buf.remaining() < MANIFEST_HEADER_LEN {
return Err(ManifestLoadError::CorruptedManifestHeader);
}
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
buf.advance(MANIFEST_HEADER_LEN);
if header.version != MANIFEST_VERSION {
return Err(ManifestLoadError::UnsupportedVersion(
header.version,
MANIFEST_VERSION,
));
}
// Read operations
let mut operations = Vec::new();
let corrupted = loop {
if buf.remaining() == 0 {
break false;
}
if buf.remaining() < RECORD_HEADER_LEN {
warn!("incomplete header when decoding manifest, could be corrupted");
break true;
}
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
let size = size as usize;
buf.advance(RECORD_HEADER_LEN);
if buf.remaining() < size {
warn!("incomplete data when decoding manifest, could be corrupted");
break true;
}
let data = &buf[..size];
if crc32c(data) != checksum {
warn!("checksum mismatch when decoding manifest, could be corrupted");
break true;
}
// if the following decode fails, we cannot use the manifest or safely ignore any record.
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
buf.advance(size);
};
Ok((
Self { file },
operations,
ManifestPartiallyCorrupted(corrupted),
))
}
fn append_data(&mut self, data: &[u8]) -> Result<()> {
if data.len() >= u32::MAX as usize {
panic!("data too large");
}
let header = RecordHeader {
size: data.len() as u32,
checksum: crc32c(data),
};
let header = header.encode();
self.file.write_all(&header)?;
self.file.write_all(data)?;
self.file.sync_all()?;
Ok(())
}
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
let encoded = header.encode();
self.file.write_all(&encoded)?;
Ok(())
}
/// Add an operation to the manifest. The operation will be appended to the end of the file,
/// and the file will fsync.
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
let encoded = Vec::from(serde_json::to_string(&operation)?);
self.append_data(&encoded)
}
}
#[cfg(test)]
mod tests {
use std::fs::OpenOptions;
use crate::repository::Key;
use super::*;
#[tokio::test]
async fn test_read_manifest() {
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
std::fs::create_dir_all(&testdir).unwrap();
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
// Write a manifest with a snapshot and some operations
let snapshot = Snapshot {
layers: vec![layer1, layer2],
};
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
manifest
.append_operation(Operation::Operation(
vec![Record::AddLayer(layer3.clone())],
Lsn::from(1),
))
.unwrap();
drop(manifest);
// Open the second time and write
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (mut manifest, operations, corrupted) = Manifest::load(file).await.unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 2);
assert_eq!(
&operations[0],
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
);
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
manifest
.append_operation(Operation::Operation(
vec![
Record::RemoveLayer(layer3.clone()),
Record::AddLayer(layer4.clone()),
],
Lsn::from(2),
))
.unwrap();
drop(manifest);
// Open the third time and verify
let file = VirtualFile::open_with_options(
&testdir.join("MANIFEST"),
OpenOptions::new()
.read(true)
.write(true)
.create_new(false)
.truncate(false),
)
.unwrap();
let (_manifest, operations, corrupted) = Manifest::load(file).await.unwrap();
assert!(!corrupted.0);
assert_eq!(operations.len(), 3);
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
assert_eq!(
&operations[1],
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
);
assert_eq!(
&operations[2],
&Operation::Operation(
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
Lsn::from(2)
)
);
}
}

View File

@@ -8,13 +8,14 @@
//!
//! [`remote_timeline_client`]: super::remote_timeline_client
use std::io::{self};
use std::fs::{File, OpenOptions};
use std::io;
use anyhow::{ensure, Context};
use serde::{de::Error, Deserialize, Serialize, Serializer};
use thiserror::Error;
use tracing::info_span;
use utils::bin_ser::SerializeError;
use utils::crashsafe::path_with_suffix_extension;
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
@@ -23,7 +24,6 @@ use utils::{
use crate::config::PageServerConf;
use crate::virtual_file::VirtualFile;
use crate::TEMP_FILE_SUFFIX;
/// Use special format number to enable backward compatibility.
const METADATA_FORMAT_VERSION: u16 = 4;
@@ -255,19 +255,35 @@ impl Serialize for TimelineMetadata {
}
/// Save timeline metadata to file
#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
pub async fn save_metadata(
conf: &'static PageServerConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
data: &TimelineMetadata,
first_save: bool,
) -> anyhow::Result<()> {
let _enter = info_span!("saving metadata").entered();
let path = conf.metadata_path(tenant_id, timeline_id);
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
.await
.context("write metadata")?;
// use OpenOptions to ensure file presence is consistent with first_save
let mut file = VirtualFile::open_with_options(
&path,
OpenOptions::new().write(true).create_new(first_save),
)
.context("open_with_options")?;
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
file.write_and_fsync(&metadata_bytes)?;
// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(
path.parent()
.expect("Metadata should always have a parent dir"),
)?;
timeline_dir.sync_all()?;
}
Ok(())
}

View File

@@ -22,9 +22,8 @@ use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::TenantConfOpt;
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;
use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
@@ -61,29 +60,6 @@ impl TenantsMap {
}
}
/// This is "safe" in that that it won't leave behind a partially deleted directory
/// at the original path, because we rename with TEMP_FILE_SUFFIX before starting deleting
/// the contents.
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Path>) -> std::io::Result<()> {
let parent = path
.as_ref()
.parent()
// It is invalid to call this function with a relative path. Tenant directories
// should always have a parent.
.ok_or(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
"Path must be absolute",
))?;
let tmp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
fs::rename(&path, &tmp_path).await?;
fs::File::open(parent).await?.sync_all().await?;
fs::remove_dir_all(tmp_path).await
}
static TENANTS: Lazy<RwLock<TenantsMap>> = Lazy::new(|| RwLock::new(TenantsMap::Initializing));
/// Initialize repositories with locally available timelines.
@@ -116,8 +92,6 @@ pub async fn init_tenant_mgr(
"Found temporary tenant directory, removing: {}",
tenant_dir_path.display()
);
// No need to use safe_remove_tenant_dir_all because this is already
// a temporary path
if let Err(e) = fs::remove_dir_all(&tenant_dir_path).await {
error!(
"Failed to remove temporary directory '{}': {:?}",
@@ -387,11 +361,11 @@ pub async fn create_tenant(
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
tenant_map_insert(tenant_id, || {
// We're holding the tenants lock in write mode while doing local IO.
// If this section ever becomes contentious, introduce a new `TenantState::Creating`
// and do the work in that state.
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create).await?;
let tenant_directory = super::create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Create)?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
@@ -430,8 +404,7 @@ pub async fn set_new_tenant_config(
let tenant = get_tenant(tenant_id, true).await?;
let tenant_config_path = conf.tenant_config_path(&tenant_id);
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf)
.await
Tenant::persist_tenant_config(&tenant_id, &tenant_config_path, new_tenant_conf, false)
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
Ok(())
@@ -517,7 +490,7 @@ async fn detach_tenant0(
) -> Result<(), TenantStateError> {
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
safe_remove_tenant_dir_all(&local_tenant_directory)
fs::remove_dir_all(&local_tenant_directory)
.await
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} removal")
@@ -552,7 +525,7 @@ pub async fn load_tenant(
remote_storage: Option<GenericRemoteStorage>,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
tenant_map_insert(tenant_id, || {
let tenant_path = conf.tenant_path(&tenant_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
if tenant_ignore_mark.exists() {
@@ -633,8 +606,8 @@ pub async fn attach_tenant(
remote_storage: GenericRemoteStorage,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
tenant_map_insert(tenant_id, || async {
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach).await?;
tenant_map_insert(tenant_id, || {
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
@@ -682,13 +655,12 @@ pub enum TenantMapInsertError {
///
/// NB: the closure should return quickly because the current implementation of tenants map
/// serializes access through an `RwLock`.
async fn tenant_map_insert<F, R>(
async fn tenant_map_insert<F>(
tenant_id: TenantId,
insert_fn: F,
) -> Result<Arc<Tenant>, TenantMapInsertError>
where
F: FnOnce() -> R,
R: std::future::Future<Output = anyhow::Result<Arc<Tenant>>>,
F: FnOnce() -> anyhow::Result<Arc<Tenant>>,
{
let mut guard = TENANTS.write().await;
let m = match &mut *guard {
@@ -701,7 +673,7 @@ where
tenant_id,
e.get().current_state(),
)),
hash_map::Entry::Vacant(v) => match insert_fn().await {
hash_map::Entry::Vacant(v) => match insert_fn() {
Ok(tenant) => {
v.insert(tenant.clone());
Ok(tenant)

View File

@@ -342,12 +342,7 @@ impl RemoteTimelineClient {
) -> RemoteTimelineClient {
RemoteTimelineClient {
conf,
runtime: if cfg!(test) {
// remote_timeline_client.rs tests rely on current-thread runtime
tokio::runtime::Handle::current()
} else {
BACKGROUND_RUNTIME.handle().clone()
},
runtime: BACKGROUND_RUNTIME.handle().to_owned(),
tenant_id,
timeline_id,
generation,
@@ -1468,8 +1463,11 @@ mod tests {
},
DEFAULT_PG_VERSION,
};
use std::{collections::HashSet, path::Path};
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
use std::{
collections::HashSet,
path::{Path, PathBuf},
};
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1526,6 +1524,8 @@ mod tests {
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
tenant_ctx: RequestContext,
remote_fs_dir: PathBuf,
client: Arc<RemoteTimelineClient>,
}
impl TestSetup {
@@ -1535,15 +1535,52 @@ mod tests {
let harness = TenantHarness::create(test_name)?;
let (tenant, ctx) = harness.load().await;
// create an empty timeline directory
let timeline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
.await?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
let storage_config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
)
.unwrap(),
max_sync_errors: std::num::NonZeroU32::new(
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
)
.unwrap(),
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
};
let generation = Generation::new(0xdeadbeef);
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
let client = Arc::new(RemoteTimelineClient {
conf: harness.conf,
runtime: tokio::runtime::Handle::current(),
tenant_id: harness.tenant_id,
timeline_id: TIMELINE_ID,
generation,
storage_impl: storage,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&harness.tenant_id,
&TIMELINE_ID,
)),
});
Ok(Self {
harness,
tenant,
timeline,
tenant_ctx: ctx,
remote_fs_dir,
client,
})
}
}
@@ -1568,37 +1605,26 @@ mod tests {
let TestSetup {
harness,
tenant: _tenant,
timeline,
timeline: _timeline,
tenant_ctx: _tenant_ctx,
remote_fs_dir,
client,
} = TestSetup::new("upload_scheduling").await.unwrap();
let client = timeline.remote_client.as_ref().unwrap();
// Download back the index.json, and check that the list of files is correct
let initial_index_part = match client.download_index_file().await.unwrap() {
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
};
let initial_layers = initial_index_part
.layer_metadata
.keys()
.map(|f| f.to_owned())
.collect::<HashSet<LayerFileName>>();
let initial_layer = {
assert!(initial_layers.len() == 1);
initial_layers.into_iter().next().unwrap()
};
let timeline_path = harness.timeline_path(&TIMELINE_ID);
println!("workdir: {}", harness.conf.workdir.display());
let remote_timeline_dir = harness
.remote_fs_dir
.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
let remote_timeline_dir =
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
let generation = harness.generation;
let metadata = dummy_metadata(Lsn(0x10));
client
.init_upload_queue_for_empty_remote(&metadata)
.unwrap();
let generation = Generation::new(0xdeadbeef);
// Create a couple of dummy files, schedule upload for them
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1679,7 +1705,6 @@ mod tests {
.map(|f| f.to_owned())
.collect(),
&[
&initial_layer.file_name(),
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
],
@@ -1709,7 +1734,6 @@ mod tests {
}
assert_remote_files(
&[
&initial_layer.file_name(),
&layer_file_name_1.file_name(),
&layer_file_name_2.file_name(),
"index_part.json",
@@ -1723,7 +1747,6 @@ mod tests {
assert_remote_files(
&[
&initial_layer.file_name(),
&layer_file_name_2.file_name(),
&layer_file_name_3.file_name(),
"index_part.json",
@@ -1740,10 +1763,16 @@ mod tests {
let TestSetup {
harness,
tenant: _tenant,
timeline,
timeline: _timeline,
client,
..
} = TestSetup::new("metrics").await.unwrap();
let client = timeline.remote_client.as_ref().unwrap();
let metadata = dummy_metadata(Lsn(0x10));
client
.init_upload_queue_for_empty_remote(&metadata)
.unwrap();
let timeline_path = harness.timeline_path(&TIMELINE_ID);
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
@@ -1754,20 +1783,11 @@ mod tests {
)
.unwrap();
#[derive(Debug, PartialEq, Clone, Copy)]
#[derive(Debug, PartialEq)]
struct BytesStartedFinished {
started: Option<usize>,
finished: Option<usize>,
}
impl std::ops::Add for BytesStartedFinished {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self {
started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
}
}
}
let get_bytes_started_stopped = || {
let started = client
.metrics
@@ -1784,38 +1804,47 @@ mod tests {
};
// Test
tracing::info!("now doing actual test");
let actual_a = get_bytes_started_stopped();
let generation = Generation::new(0xdeadbeef);
let init = get_bytes_started_stopped();
client
.schedule_layer_file_upload(
&layer_file_name_1,
&LayerFileMetadata::new(content_1.len() as u64, harness.generation),
&LayerFileMetadata::new(content_1.len() as u64, generation),
)
.unwrap();
let actual_b = get_bytes_started_stopped();
let pre = get_bytes_started_stopped();
client.wait_completion().await.unwrap();
let actual_c = get_bytes_started_stopped();
let post = get_bytes_started_stopped();
// Validate
let expected_b = actual_a
+ BytesStartedFinished {
assert_eq!(
init,
BytesStartedFinished {
started: None,
finished: None
}
);
assert_eq!(
pre,
BytesStartedFinished {
started: Some(content_1.len()),
// assert that the _finished metric is created eagerly so that subtractions work on first sample
finished: Some(0),
};
assert_eq!(actual_b, expected_b);
let expected_c = actual_a
+ BytesStartedFinished {
}
);
assert_eq!(
post,
BytesStartedFinished {
started: Some(content_1.len()),
finished: Some(content_1.len()),
};
assert_eq!(actual_c, expected_c);
finished: Some(content_1.len())
}
);
}
}

View File

@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::blob_io::WriteBlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -219,7 +219,7 @@ pub struct DeltaLayerInner {
index_root_blk: u32,
/// Reader object for reading blocks from the file.
file: FileBlockReader,
file: FileBlockReader<VirtualFile>,
}
impl AsRef<DeltaLayerInner> for DeltaLayerInner {
@@ -632,11 +632,12 @@ impl DeltaLayerWriterInner {
///
/// The values must be appended in key, lsn order.
///
fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.put_value_bytes(key, lsn, &Value::ser(&val)?, val.will_init())
.await
}
fn put_value_bytes(
async fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
@@ -645,7 +646,7 @@ impl DeltaLayerWriterInner {
) -> anyhow::Result<()> {
assert!(self.lsn_range.start <= lsn);
let off = self.blob_writer.write_blob(val)?;
let off = self.blob_writer.write_blob(val).await?;
let blob_ref = BlobRef::new(off, will_init);
@@ -797,11 +798,11 @@ impl DeltaLayerWriter {
///
/// The values must be appended in key, lsn order.
///
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val)
pub async fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_value(key, lsn, val).await
}
pub fn put_value_bytes(
pub async fn put_value_bytes(
&mut self,
key: Key,
lsn: Lsn,
@@ -812,6 +813,7 @@ impl DeltaLayerWriter {
.as_mut()
.unwrap()
.put_value_bytes(key, lsn, val, will_init)
.await
}
pub fn size(&self) -> u64 {

View File

@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::page_cache::PAGE_SZ;
use crate::repository::{Key, KEY_SIZE};
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
use crate::tenant::blob_io::WriteBlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::storage_layer::{
@@ -155,7 +155,7 @@ pub struct ImageLayerInner {
lsn: Lsn,
/// Reader object for reading blocks from the file.
file: FileBlockReader,
file: FileBlockReader<VirtualFile>,
}
impl std::fmt::Debug for ImageLayerInner {
@@ -569,9 +569,9 @@ impl ImageLayerWriterInner {
///
/// The page versions must be appended in blknum order.
///
fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let off = self.blob_writer.write_blob(img)?;
let off = self.blob_writer.write_blob(img).await?;
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
@@ -710,8 +710,8 @@ impl ImageLayerWriter {
///
/// The page versions must be appended in blknum order.
///
pub fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img)
pub async fn put_image(&mut self, key: Key, img: &[u8]) -> anyhow::Result<()> {
self.inner.as_mut().unwrap().put_image(key, img).await
}
///

View File

@@ -348,7 +348,9 @@ impl InMemoryLayer {
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf).await?;
let will_init = Value::des(&buf)?.will_init();
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
delta_layer_writer
.put_value_bytes(key, *lsn, &buf, will_init)
.await?;
}
}

View File

@@ -90,7 +90,6 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::debug_assert_current_span_has_tenant_and_timeline_id;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::RemoteTimelineClient;
use super::storage_layer::{
@@ -934,48 +933,6 @@ impl Timeline {
self.launch_eviction_task(background_jobs_can_start);
}
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
debug_assert_current_span_has_tenant_and_timeline_id();
// prevent writes to the InMemoryLayer
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_id),
Some(self.timeline_id),
)
.await;
// now all writers to InMemory layer are gone, do the final flush if requested
if freeze_and_flush {
match self.freeze_and_flush().await {
Ok(()) => {}
Err(e) => {
warn!("failed to freeze and flush: {e:#}");
return; // TODO: should probably drain remote timeline client anyways?
}
}
// drain the upload queue
let res = if let Some(client) = self.remote_client.as_ref() {
// if we did not wait for completion here, it might be our shutdown process
// didn't wait for remote uploads to complete at all, as new tasks can forever
// be spawned.
//
// what is problematic is the shutting down of RemoteTimelineClient, because
// obviously it does not make sense to stop while we wait for it, but what
// about corner cases like s3 suddenly hanging up?
client.wait_completion().await
} else {
Ok(())
};
if let Err(e) = res {
warn!("failed to await for frozen and flushed uploads: {e:#}");
}
}
}
pub fn set_state(&self, new_state: TimelineState) {
match (self.current_state(), new_state) {
(equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
@@ -2828,9 +2785,15 @@ impl Timeline {
x.unwrap()
));
save_metadata(self.conf, &self.tenant_id, &self.timeline_id, &metadata)
.await
.context("save_metadata")?;
save_metadata(
self.conf,
&self.tenant_id,
&self.timeline_id,
&metadata,
false,
)
.await
.context("save_metadata")?;
if let Some(remote_client) = &self.remote_client {
for (path, layer_metadata) in layer_paths_to_upload {
@@ -3069,7 +3032,7 @@ impl Timeline {
}
}
};
image_layer_writer.put_image(key, &img)?;
image_layer_writer.put_image(key, &img).await?;
key = key.next();
}
}
@@ -3655,7 +3618,7 @@ impl Timeline {
)))
});
writer.as_mut().unwrap().put_value(key, lsn, value)?;
writer.as_mut().unwrap().put_value(key, lsn, value).await?;
prev_key = Some(key);
}
if let Some(writer) = writer {
@@ -4782,8 +4745,22 @@ mod tests {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let ctx = any_context();
let tenant = harness.try_load(&ctx).await.unwrap();
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
@@ -4833,8 +4810,22 @@ mod tests {
async fn layer_eviction_aba_fails() {
let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
let remote_storage = {
// this is never used for anything, because of how the create_test_timeline works, but
// it is with us in spirit and a Some.
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
let path = harness.conf.workdir.join("localfs");
std::fs::create_dir_all(&path).unwrap();
let config = RemoteStorageConfig {
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
storage: RemoteStorageKind::LocalFs(path),
};
GenericRemoteStorage::from_config(&config).unwrap()
};
let ctx = any_context();
let tenant = harness.try_load(&ctx).await.unwrap();
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await

View File

@@ -172,41 +172,6 @@ impl OpenFiles {
}
}
#[derive(Debug, thiserror::Error)]
pub enum CrashsafeOverwriteError {
#[error("final path has no parent dir")]
FinalPathHasNoParentDir,
#[error("remove tempfile: {0}")]
RemovePreviousTempfile(#[source] std::io::Error),
#[error("create tempfile: {0}")]
CreateTempfile(#[source] std::io::Error),
#[error("write tempfile: {0}")]
WriteContents(#[source] std::io::Error),
#[error("sync tempfile: {0}")]
SyncTempfile(#[source] std::io::Error),
#[error("rename tempfile to final path: {0}")]
RenameTempfileToFinalPath(#[source] std::io::Error),
#[error("open final path parent dir: {0}")]
OpenFinalPathParentDir(#[source] std::io::Error),
#[error("sync final path parent dir: {0}")]
SyncFinalPathParentDir(#[source] std::io::Error),
}
impl CrashsafeOverwriteError {
/// Returns true iff the new contents are durably stored.
pub fn are_new_contents_durable(&self) -> bool {
match self {
Self::FinalPathHasNoParentDir => false,
Self::RemovePreviousTempfile(_) => false,
Self::CreateTempfile(_) => false,
Self::WriteContents(_) => false,
Self::SyncTempfile(_) => false,
Self::RenameTempfileToFinalPath(_) => false,
Self::OpenFinalPathParentDir(_) => false,
Self::SyncFinalPathParentDir(_) => true,
}
}
}
impl VirtualFile {
/// Open a file in read-only mode. Like File::open.
pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
@@ -271,56 +236,6 @@ impl VirtualFile {
Ok(vfile)
}
/// Writes a file to the specified `final_path` in a crash safe fasion
///
/// The file is first written to the specified tmp_path, and in a second
/// step, the tmp path is renamed to the final path. As renames are
/// atomic, a crash during the write operation will never leave behind a
/// partially written file.
pub async fn crashsafe_overwrite(
final_path: &Path,
tmp_path: &Path,
content: &[u8],
) -> Result<(), CrashsafeOverwriteError> {
let Some(final_path_parent) = final_path.parent() else {
return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
};
match std::fs::remove_file(tmp_path) {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
}
let mut file = Self::open_with_options(
tmp_path,
OpenOptions::new()
.write(true)
// Use `create_new` so that, if we race with ourselves or something else,
// we bail out instead of causing damage.
.create_new(true),
)
.map_err(CrashsafeOverwriteError::CreateTempfile)?;
file.write_all(content)
.map_err(CrashsafeOverwriteError::WriteContents)?;
file.sync_all()
.map_err(CrashsafeOverwriteError::SyncTempfile)?;
drop(file); // before the rename, that's important!
// renames are atomic
std::fs::rename(tmp_path, final_path)
.map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
// Only open final path parent dirfd now, so that this operation only
// ever holds one VirtualFile fd at a time. That's important because
// the current `find_victim_slot` impl might pick the same slot for both
// VirtualFile., and it eventually does a blocking write lock instead of
// try_lock.
let final_parent_dirfd =
Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
.map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
final_parent_dirfd
.sync_all()
.map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
Ok(())
}
/// Call File::sync_all() on the underlying File.
pub fn sync_all(&self) -> Result<(), Error> {
self.with_file("fsync", |file| file.sync_all())?
@@ -432,26 +347,10 @@ impl VirtualFile {
Ok(self.pos)
}
#[cfg(test)]
async fn read_to_end(&mut self, buf: &mut Vec<u8>) -> Result<(), Error> {
loop {
let mut tmp = [0; 128];
match self.read_at(&mut tmp, self.pos).await {
Ok(0) => return Ok(()),
Ok(n) => {
self.pos += n as u64;
buf.extend_from_slice(&tmp[..n]);
}
Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
}
// Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135
pub async fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
pub fn read_exact_at(&self, mut buf: &mut [u8], mut offset: u64) -> Result<(), Error> {
while !buf.is_empty() {
match self.read_at(buf, offset).await {
match self.read_at(buf, offset) {
Ok(0) => {
return Err(Error::new(
std::io::ErrorKind::UnexpectedEof,
@@ -490,7 +389,21 @@ impl VirtualFile {
Ok(())
}
pub async fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
/// Write the given buffer (which has to be below the kernel's internal page size) and fsync
///
/// This ensures some level of atomicity (not a good one, but it's the best we have).
pub fn write_and_fsync(&mut self, buf: &[u8]) -> Result<(), Error> {
if self.write(buf)? != buf.len() {
return Err(Error::new(
std::io::ErrorKind::Other,
"Could not write all the bytes in a single call",
));
}
self.sync_all()?;
Ok(())
}
pub fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let result = self.with_file("read", |file| file.read_at(buf, offset))?;
if let Ok(size) = result {
STORAGE_IO_SIZE
@@ -600,6 +513,7 @@ mod tests {
use rand::thread_rng;
use rand::Rng;
use std::sync::Arc;
use std::thread;
enum MaybeVirtualFile {
VirtualFile(VirtualFile),
@@ -607,9 +521,9 @@ mod tests {
}
impl MaybeVirtualFile {
async fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> Result<(), Error> {
match self {
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset).await,
MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset),
MaybeVirtualFile::File(file) => file.read_exact_at(buf, offset),
}
}
@@ -638,22 +552,17 @@ mod tests {
use std::io::Read;
let mut buf = String::new();
match self {
MaybeVirtualFile::VirtualFile(file) => {
let mut buf = Vec::new();
file.read_to_end(&mut buf).await?;
return Ok(String::from_utf8(buf).unwrap());
}
MaybeVirtualFile::File(file) => {
file.read_to_string(&mut buf)?;
}
MaybeVirtualFile::VirtualFile(file) => file.read_to_string(&mut buf)?,
MaybeVirtualFile::File(file) => file.read_to_string(&mut buf)?,
}
Ok(buf)
}
// Helper function to slurp a portion of a file into a string
async fn read_string_at(&mut self, pos: u64, len: usize) -> Result<String, Error> {
let mut buf = vec![0; len];
self.read_exact_at(&mut buf, pos).await?;
let mut buf = Vec::new();
buf.resize(len, 0);
self.read_exact_at(&mut buf, pos)?;
Ok(String::from_utf8(buf).unwrap())
}
}
@@ -697,13 +606,13 @@ mod tests {
file_a.write_all(b"foobar").await?;
// cannot read from a file opened in write-only mode
let _ = file_a.read_string().await.unwrap_err();
assert!(file_a.read_string().await.is_err());
// Close the file and re-open for reading
let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
// cannot write to a file opened in read-only mode
let _ = file_a.write_all(b"bar").await.unwrap_err();
assert!(file_a.write_all(b"bar").await.is_err());
// Try simple read
assert_eq!("foobar", file_a.read_string().await?);
@@ -807,22 +716,28 @@ mod tests {
let files = Arc::new(files);
// Launch many threads, and use the virtual files concurrently in random order.
let rt = tokio::runtime::Builder::new_multi_thread()
.worker_threads(THREADS)
.thread_name("test_vfile_concurrency thread")
.build()
.unwrap();
for _threadno in 0..THREADS {
let mut threads = Vec::new();
for threadno in 0..THREADS {
let builder =
thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno));
let files = files.clone();
rt.spawn(async move {
let mut buf = [0u8; SIZE];
let mut rng = rand::rngs::OsRng;
for _ in 1..1000 {
let f = &files[rng.gen_range(0..files.len())];
f.read_exact_at(&mut buf, 0).await.unwrap();
assert!(buf == SAMPLE);
}
});
let thread = builder
.spawn(move || {
let mut buf = [0u8; SIZE];
let mut rng = rand::thread_rng();
for _ in 1..1000 {
let f = &files[rng.gen_range(0..files.len())];
f.read_exact_at(&mut buf, 0).unwrap();
assert!(buf == SAMPLE);
}
})
.unwrap();
threads.push(thread);
}
for thread in threads {
thread.join().unwrap();
}
Ok(())

View File

@@ -64,7 +64,7 @@ pub mod errors {
}
http::StatusCode::LOCKED => {
// Status 423: project might be in maintenance mode (or bad state), or quotas exceeded.
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contact our support")
format!("{REQUEST_FAILED}: endpoint is temporary unavailable. check your quotas and/or contract our support")
}
_ => REQUEST_FAILED.to_owned(),
},

View File

@@ -8,7 +8,6 @@ use super::{
use crate::{auth::ClientCredentials, compute, http, scram};
use async_trait::async_trait;
use futures::TryFutureExt;
use std::net::SocketAddr;
use tokio::time::Instant;
use tokio_postgres::config::SslMode;
use tracing::{error, info, info_span, warn, Instrument};
@@ -118,7 +117,7 @@ impl Api {
// We'll set username and such later using the startup message.
// TODO: add more type safety (in progress).
let mut config = compute::ConnCfg::new();
config.host(&host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
let node = NodeInfo {
config,
@@ -195,9 +194,9 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
Err(ApiError::Console { status, text })
}
fn parse_host_port(input: &str) -> Option<(String, u16)> {
let parsed: SocketAddr = input.parse().ok()?;
Some((parsed.ip().to_string(), parsed.port()))
fn parse_host_port(input: &str) -> Option<(&str, u16)> {
let (host, port) = input.split_once(':')?;
Some((host, port.parse().ok()?))
}
#[cfg(test)]

View File

@@ -414,7 +414,6 @@ class NeonEnvBuilder:
neon_binpath: Path,
pg_distrib_dir: Path,
pg_version: PgVersion,
test_name: str,
remote_storage: Optional[RemoteStorage] = None,
remote_storage_users: RemoteStorageUsers = RemoteStorageUsers.PAGESERVER,
pageserver_config_override: Optional[str] = None,
@@ -456,11 +455,6 @@ class NeonEnvBuilder:
self.initial_tenant = initial_tenant or TenantId.generate()
self.initial_timeline = initial_timeline or TimelineId.generate()
assert test_name.startswith(
"test_"
), "Unexpectedly instantiated from outside a test function"
self.test_name = test_name
def init_configs(self) -> NeonEnv:
# Cannot create more than one environment from one builder
assert self.env is None, "environment already initialized"
@@ -492,24 +486,23 @@ class NeonEnvBuilder:
def enable_remote_storage(
self,
remote_storage_kind: RemoteStorageKind,
test_name: str,
force_enable: bool = True,
enable_remote_extensions: bool = False,
):
bucket_name = re.sub(r"[_\[\]]", "-", self.test_name)[:63]
if remote_storage_kind == RemoteStorageKind.NOOP:
return
elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
self.enable_local_fs_remote_storage(force_enable=force_enable)
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
self.enable_mock_s3_remote_storage(
bucket_name=bucket_name,
bucket_name=test_name,
force_enable=force_enable,
enable_remote_extensions=enable_remote_extensions,
)
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
self.enable_real_s3_remote_storage(
test_name=bucket_name,
test_name=test_name,
force_enable=force_enable,
enable_remote_extensions=enable_remote_extensions,
)
@@ -956,7 +949,6 @@ def _shared_simple_env(
pg_version=pg_version,
run_id=run_id,
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
test_name=request.node.name,
) as builder:
env = builder.init_start()
@@ -992,7 +984,6 @@ def neon_env_builder(
pg_version: PgVersion,
default_broker: NeonBroker,
run_id: uuid.UUID,
request: FixtureRequest,
) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
@@ -1021,7 +1012,6 @@ def neon_env_builder(
broker=default_broker,
run_id=run_id,
preserve_database_files=pytestconfig.getoption("--preserve-database-files"),
test_name=request.node.name,
) as builder:
yield builder

View File

@@ -16,6 +16,7 @@ from fixtures.utils import wait_until
def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv:
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
test_name="test_attach_tenant_config",
)
env = neon_env_builder.init_start()
@@ -38,6 +39,7 @@ class NegativeTests:
def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, None, None]:
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
test_name="test_attach_tenant_config",
)
env = neon_env_builder.init_start()
assert isinstance(env.remote_storage, LocalFsStorage)

View File

@@ -135,7 +135,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
log.info(f"setting up eviction_env for test {request.node.name}")
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
# initial tenant will not be present on this pageserver
env = neon_env_builder.init_configs()

View File

@@ -90,6 +90,7 @@ def test_remote_extensions(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_extensions",
enable_remote_extensions=True,
)
env = neon_env_builder.init_start()
@@ -156,6 +157,7 @@ def test_remote_library(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_library",
enable_remote_extensions=True,
)
env = neon_env_builder.init_start()
@@ -216,6 +218,7 @@ def test_multiple_extensions_one_archive(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.REAL_S3,
test_name="test_multiple_extensions_one_archive",
enable_remote_extensions=True,
)
env = neon_env_builder.init_start()
@@ -263,6 +266,7 @@ def test_extension_download_after_restart(
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
test_name="test_extension_download_after_restart",
enable_remote_extensions=True,
)
env = neon_env_builder.init_start()

View File

@@ -102,6 +102,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_gc_index_upload",
)
env = neon_env_builder.init_start()

View File

@@ -21,6 +21,7 @@ def test_basic_eviction(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_download_remote_layers_api",
)
env = neon_env_builder.init_start(
@@ -156,6 +157,7 @@ def test_basic_eviction(
def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
test_name="test_gc_of_remote_layers",
)
env = neon_env_builder.init_start()

View File

@@ -96,6 +96,7 @@ def test_metric_collection(
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_metric_collection",
)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")

View File

@@ -54,6 +54,7 @@ def test_ondemand_download_large_rel(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ondemand_download_large_rel",
)
# thinking about using a shared environment? the test assumes that global
@@ -156,6 +157,7 @@ def test_ondemand_download_timetravel(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ondemand_download_timetravel",
)
# thinking about using a shared environment? the test assumes that global
@@ -317,6 +319,7 @@ def test_download_remote_layers_api(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_download_remote_layers_api",
)
##### First start, insert data and upload it to the remote storage
@@ -478,6 +481,7 @@ def test_compaction_downloads_on_demand_without_image_creation(
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_compaction_downloads_on_demand_without_image_creation",
)
conf = {
@@ -565,6 +569,7 @@ def test_compaction_downloads_on_demand_with_image_creation(
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_compaction_downloads_on_demand",
)
conf = {
@@ -665,6 +670,7 @@ def test_ondemand_download_failure_to_replace(
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ondemand_download_failure_to_replace",
)
# disable gc and compaction via default tenant config because config is lost while detaching

View File

@@ -62,6 +62,7 @@ def test_remote_storage_backup_and_restore(
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_storage_backup_and_restore",
)
# Exercise retry code path by making all uploads and downloads fail for the
@@ -224,6 +225,7 @@ def test_remote_storage_upload_queue_retries(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_storage_upload_queue_retries",
)
env = neon_env_builder.init_start()
@@ -379,6 +381,7 @@ def test_remote_timeline_client_calls_started_metric(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_remote_timeline_client_metrics",
)
# thinking about using a shared environment? the test assumes that global
@@ -521,6 +524,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_timeline_deletion_with_files_stuck_in_upload_queue",
)
env = neon_env_builder.init_start(
@@ -638,6 +642,7 @@ def test_empty_branch_remote_storage_upload(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_empty_branch_remote_storage_upload",
)
env = neon_env_builder.init_start()
@@ -689,6 +694,7 @@ def test_empty_branch_remote_storage_upload_on_restart(
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_empty_branch_remote_storage_upload_on_restart",
)
env = neon_env_builder.init_start()
@@ -786,6 +792,7 @@ def test_compaction_delete_before_upload(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_compaction_delete_before_upload",
)
env = neon_env_builder.init_start(

View File

@@ -294,6 +294,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
test_name="test_creating_tenant_conf_after_attach",
)
env = neon_env_builder.init_start()
@@ -338,6 +339,7 @@ def test_live_reconfig_get_evictions_low_residence_duration_metric_threshold(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.LOCAL_FS,
test_name="test_live_reconfig_get_evictions_low_residence_duration_metric_threshold",
)
env = neon_env_builder.init_start()

View File

@@ -43,6 +43,7 @@ def test_tenant_delete_smoke(
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_tenant_delete_smoke",
)
env = neon_env_builder.init_start()
@@ -176,7 +177,9 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
if simulate_failures:
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
neon_env_builder.enable_remote_storage(remote_storage_kind)
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
@@ -189,7 +192,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# We may leave some upload tasks in the queue. They're likely deletes.
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
# So by ignoring these instead of waiting for empty upload queue
@@ -297,6 +300,7 @@ def test_tenant_delete_is_resumed_on_attach(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_deleted_tenant_ignored_on_attach",
)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
@@ -334,7 +338,7 @@ def test_tenant_delete_is_resumed_on_attach(
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',

View File

@@ -46,6 +46,7 @@ def test_tenant_reattach(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_tenant_reattach",
)
# Exercise retry code path by making all uploads and downloads fail for the
@@ -230,6 +231,7 @@ def test_tenant_reattach_while_busy(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_tenant_reattach_while_busy",
)
env = neon_env_builder.init_start()
@@ -451,6 +453,7 @@ def test_detach_while_attaching(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_detach_while_attaching",
)
##### First start, insert secret data and upload it to the remote storage
@@ -534,6 +537,7 @@ def test_ignored_tenant_reattach(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ignored_tenant_reattach",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
@@ -605,6 +609,7 @@ def test_ignored_tenant_download_missing_layers(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ignored_tenant_download_and_attach",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
@@ -670,6 +675,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ignored_tenant_stays_broken_without_metadata",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
@@ -682,7 +688,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: (Broken|Stopping).*"
f".*Tenant {tenant_id} will not become active\\. Current state: Broken.*"
)
# ignore the tenant and remove its metadata
@@ -713,6 +719,7 @@ def test_load_attach_negatives(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_load_attach_negatives",
)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
@@ -757,6 +764,7 @@ def test_ignore_while_attaching(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_ignore_while_attaching",
)
env = neon_env_builder.init_start()
@@ -860,6 +868,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_metrics_while_ignoring_broken_tenant_and_reloading",
)
env = neon_env_builder.init_start()

View File

@@ -526,6 +526,7 @@ def test_emergency_relocate_with_branches_slow_replay(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_emergency_relocate_with_branches_slow_replay",
)
env = neon_env_builder.init_start()
@@ -682,6 +683,7 @@ def test_emergency_relocate_with_branches_createdb(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_emergency_relocate_with_branches_createdb",
)
env = neon_env_builder.init_start()

View File

@@ -244,6 +244,7 @@ def test_pageserver_metrics_removed_after_detach(
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_pageserver_metrics_removed_after_detach",
)
neon_env_builder.num_safekeepers = 3
@@ -304,6 +305,7 @@ def test_pageserver_with_empty_tenants(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_pageserver_with_empty_tenants",
)
env = neon_env_builder.init_start()

View File

@@ -64,6 +64,7 @@ async def all_tenants_workload(env: NeonEnv, tenants_endpoints):
def test_tenants_many(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_tenants_many",
)
env = neon_env_builder.init_start()
@@ -116,6 +117,7 @@ def test_tenants_attached_after_download(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="remote_storage_kind",
)
data_id = 1
@@ -230,6 +232,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
# since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it.
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_tenant_redownloads_truncated_file_on_startup",
)
env = neon_env_builder.init_start()

View File

@@ -16,12 +16,13 @@ from pytest_httpserver import HTTPServer
def test_threshold_based_eviction(
request,
httpserver: HTTPServer,
httpserver_listen_address,
pg_bin: PgBin,
neon_env_builder: NeonEnvBuilder,
):
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS)
neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}")
# Start with metrics collection enabled, so that the eviction task
# imitates its accesses. We'll use a non-existent endpoint to make it fail.

View File

@@ -191,7 +191,9 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
8. Retry or restart without the failpoint and check the result.
"""
neon_env_builder.enable_remote_storage(remote_storage_kind)
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
)
env = neon_env_builder.init_start(
initial_tenant_conf={
@@ -229,7 +231,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
# It appears when we stopped flush loop during deletion and then pageserver is stopped
env.pageserver.allowed_errors.append(
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
)
# This happens when we fail before scheduling background operation.
# Timeline is left in stopping state and retry tries to stop it again.
@@ -348,6 +350,7 @@ def test_timeline_resurrection_on_attach(
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_timeline_resurrection_on_attach",
)
##### First start, insert data and upload it to the remote storage
@@ -435,6 +438,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
test_name="test_timeline_delete_fail_before_local_delete",
)
env = neon_env_builder.init_start()
@@ -445,7 +449,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
)
# this happens, because the stuck timeline is visible to shutdown
env.pageserver.allowed_errors.append(
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited"
)
ps_http = env.pageserver.http_client()
@@ -554,6 +558,7 @@ def test_concurrent_timeline_delete_stuck_on(
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
test_name=f"concurrent_timeline_delete_stuck_on_{stuck_failpoint}",
)
env = neon_env_builder.init_start()
@@ -631,6 +636,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
"""
neon_env_builder.enable_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
test_name="test_delete_timeline_client_hangup",
)
env = neon_env_builder.init_start()
@@ -700,6 +706,7 @@ def test_timeline_delete_works_for_remote_smoke(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_timeline_delete_works_for_remote_smoke",
)
env = neon_env_builder.init_start()
@@ -773,7 +780,7 @@ def test_delete_orphaned_objects(
pg_bin: PgBin,
):
remote_storage_kind = RemoteStorageKind.LOCAL_FS
neon_env_builder.enable_remote_storage(remote_storage_kind)
neon_env_builder.enable_remote_storage(remote_storage_kind, "test_delete_orphaned_objects")
env = neon_env_builder.init_start(
initial_tenant_conf={
@@ -837,6 +844,7 @@ def test_timeline_delete_resumed_on_attach(
):
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_deleted_tenant_ignored_on_attach",
)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
@@ -873,7 +881,7 @@ def test_timeline_delete_resumed_on_attach(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
# Polling after attach may fail with this

View File

@@ -306,7 +306,9 @@ def test_timeline_physical_size_init(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(remote_storage_kind)
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)
env = neon_env_builder.init_start()
@@ -347,7 +349,9 @@ def test_timeline_physical_size_post_checkpoint(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(remote_storage_kind)
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)
env = neon_env_builder.init_start()
@@ -378,7 +382,9 @@ def test_timeline_physical_size_post_compaction(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(remote_storage_kind)
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)
# Disable background compaction as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
@@ -431,7 +437,9 @@ def test_timeline_physical_size_post_gc(
neon_env_builder: NeonEnvBuilder, remote_storage_kind: Optional[RemoteStorageKind]
):
if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(remote_storage_kind)
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)
# Disable background compaction and GC as we don't want it to happen after `get_physical_size` request
# and before checking the expected size on disk, which makes the assertion failed
@@ -564,7 +572,9 @@ def test_tenant_physical_size(
random.seed(100)
if remote_storage_kind is not None:
neon_env_builder.enable_remote_storage(remote_storage_kind)
neon_env_builder.enable_remote_storage(
remote_storage_kind, "test_timeline_physical_size_init"
)
env = neon_env_builder.init_start()

View File

@@ -439,6 +439,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Remot
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_safekeepers_wal_backup",
)
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER
@@ -490,6 +491,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
neon_env_builder.enable_remote_storage(
remote_storage_kind=remote_storage_kind,
test_name="test_s3_wal_replay",
)
neon_env_builder.remote_storage_users = RemoteStorageUsers.SAFEKEEPER