Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-16 00:50:36 +00:00

Compare commits: compaction...upd-pr-tem (1 commit)

| Author | SHA1 | Date |
|---|---|---|
|  | 5153e02f5a |  |

File: .github/pull_request_template.md (vendored), 11 lines changed
@@ -1,14 +1,3 @@
## Problem

## Summary of changes

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above checklist
File: .github/workflows/release.yml (vendored), 2 lines changed
@@ -2,7 +2,7 @@ name: Create Release Branch

on:
  schedule:
    - cron: '0 7 * * 5'
    - cron: '0 7 * * 2'
  workflow_dispatch:

jobs:
File: Cargo.lock (generated), 82 lines changed
@@ -1609,6 +1609,16 @@ dependencies = [
 "subtle",
]

[[package]]
name = "ctor"
version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d2301688392eb071b0bf1a37be05c469d3cc4dbbd95df672fe28ab021e6a096"
dependencies = [
 "quote",
 "syn 1.0.109",
]

[[package]]
name = "ctr"
version = "0.6.0"

@@ -2704,10 +2714,11 @@ dependencies = [

[[package]]
name = "log"
version = "0.4.20"
version = "0.4.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
dependencies = [
 "cfg-if",
 "value-bag",
]

@@ -3243,7 +3254,6 @@ dependencies = [
 "num_cpus",
 "once_cell",
 "pageserver_api",
 "pageserver_compaction",
 "pin-project-lite",
 "postgres",
 "postgres-protocol",

@@ -3302,52 +3312,6 @@ dependencies = [
 "workspace_hack",
]

[[package]]
name = "pageserver_compaction"
version = "0.1.0"
dependencies = [
 "anyhow",
 "async-compression",
 "async-stream",
 "async-trait",
 "byteorder",
 "bytes",
 "chrono",
 "clap",
 "const_format",
 "consumption_metrics",
 "criterion",
 "crossbeam-utils",
 "either",
 "fail",
 "flate2",
 "futures",
 "git-version",
 "hex",
 "hex-literal",
 "humantime",
 "humantime-serde",
 "itertools",
 "metrics",
 "once_cell",
 "pin-project-lite",
 "rand 0.8.5",
 "smallvec",
 "svg_fmt",
 "sync_wrapper",
 "thiserror",
 "tokio",
 "tokio-io-timeout",
 "tokio-util",
 "tracing",
 "tracing-error",
 "tracing-subscriber",
 "url",
 "utils",
 "walkdir",
 "workspace_hack",
]

[[package]]
name = "parking"
version = "2.1.1"

@@ -3597,7 +3561,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
 "bytes",
 "fallible-iterator",

@@ -3610,7 +3574,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
 "native-tls",
 "tokio",

@@ -3621,7 +3585,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
 "base64 0.20.0",
 "byteorder",

@@ -3639,7 +3603,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
 "bytes",
 "fallible-iterator",

@@ -4466,7 +4430,6 @@ dependencies = [
 "itertools",
 "pageserver",
 "rand 0.8.5",
 "remote_storage",
 "reqwest",
 "serde",
 "serde_json",

@@ -4525,7 +4488,6 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
 "tokio-stream",
 "toml_edit",
 "tracing",
 "url",

@@ -5445,7 +5407,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
dependencies = [
 "async-trait",
 "byteorder",

@@ -6049,9 +6011,13 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"

[[package]]
name = "value-bag"
version = "1.4.2"
version = "1.0.0-alpha.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a72e1902dde2bd6441347de2b70b7f5d59bf157c6c62f0c44572607a1d55bbe"
checksum = "2209b78d1249f7e6f3293657c9779fe31ced465df091bbd433a1cf88e916ec55"
dependencies = [
 "ctor",
 "version_check",
]

[[package]]
name = "vcpkg"
File: Cargo.toml, 14 lines changed
@@ -4,7 +4,6 @@ members = [
    "compute_tools",
    "control_plane",
    "pageserver",
    "pageserver/compaction",
    "pageserver/ctl",
    "proxy",
    "safekeeper",

@@ -162,11 +161,11 @@ env_logger = "0.10"
log = "0.4"

## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending

@@ -176,7 +175,6 @@ compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
metrics = { version = "0.1", path = "./libs/metrics/" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }

@@ -204,7 +202,7 @@ tonic-build = "0.9"

# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }

################# Binary contents sections
@@ -278,26 +278,32 @@ fn main() -> Result<()> {
    if #[cfg(target_os = "linux")] {
        use std::env;
        use tokio_util::sync::CancellationToken;
        let vm_monitor_addr = matches
            .get_one::<String>("vm-monitor-addr")
            .expect("--vm-monitor-addr should always be set because it has a default arg");
        use tracing::warn;
        let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
        let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
        let cgroup = matches.get_one::<String>("cgroup");
        let file_cache_on_disk = matches.get_flag("file-cache-on-disk");

        // Only make a runtime if we need to.
        // Note: it seems like you can make a runtime in an inner scope and
        // if you start a task in it it won't be dropped. However, make it
        // in the outermost scope just to be safe.
        let rt = if env::var_os("AUTOSCALING").is_some() {
            Some(
        let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
            (None, None) => None,
            (None, Some(_)) => {
                warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
                None
            }
            (Some(_), None) => {
                panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
            }
            (Some(_), Some(_)) => Some(
                tokio::runtime::Builder::new_multi_thread()
                    .worker_threads(4)
                    .enable_all()
                    .build()
                    .expect("failed to create tokio runtime for monitor")
            )
        } else {
            None
                    .expect("failed to create tokio runtime for monitor"),
            ),
        };

        // This token is used internally by the monitor to clean up all threads

@@ -308,7 +314,8 @@ fn main() -> Result<()> {
        Box::leak(Box::new(vm_monitor::Args {
            cgroup: cgroup.cloned(),
            pgconnstr: file_cache_connstr.cloned(),
            addr: vm_monitor_addr.clone(),
            addr: vm_monitor_addr.cloned().unwrap(),
            file_cache_on_disk,
        })),
        token.clone(),
    ))

@@ -480,8 +487,6 @@ fn cli() -> clap::Command {
            .value_name("FILECACHE_CONNSTR"),
        )
        .arg(
            // DEPRECATED, NO LONGER DOES ANYTHING.
            // See https://github.com/neondatabase/cloud/issues/7516
            Arg::new("file-cache-on-disk")
                .long("file-cache-on-disk")
                .action(clap::ArgAction::SetTrue),
@@ -24,7 +24,7 @@ fn do_control_plane_request(
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
    let resp = reqwest::blocking::Client::new()
        .get(uri)
        .header("Authorization", format!("Bearer {}", jwt))
        .header("Authorization", jwt)
        .send()
        .map_err(|e| {
            (

@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
    base_uri: &str,
    compute_id: &str,
) -> Result<Option<ComputeSpec>> {
    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
    let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
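To make the path change concrete, the new format string expands as follows; a minimal sketch with made-up values (the base URI and compute ID below are illustrative, not from this change):

```rust
let base_uri = "http://127.0.0.1:1234";
let compute_id = "compute-quiet-sun-123456";
// Same format string as the new `cp_uri` above.
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
assert_eq!(
    cp_uri,
    "http://127.0.0.1:1234/management/api/v2/computes/compute-quiet-sun-123456/spec"
);
```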
@@ -12,7 +12,6 @@ use hyper::{Body, Request, Response};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};

@@ -222,25 +221,8 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
        generation: 0,
    });

    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
    if attach_req.node_id.is_some() {
        tenant_state.generation += 1;
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            ps_id = %attaching_pageserver,
            generation = %tenant_state.generation,
            "issuing",
        );
    } else if let Some(ps_id) = tenant_state.pageserver {
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            %ps_id,
            generation = %tenant_state.generation,
            "dropping",
        );
    } else {
        tracing::info!(
            tenant_id = %attach_req.tenant_id,
            "no-op: tenant already has no pageserver");
    }
    tenant_state.pageserver = attach_req.node_id;
    let generation = tenant_state.generation;

@@ -258,9 +240,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
    endpoint::make_router()
        .data(Arc::new(State::new(persistent_state)))
        .post("/re-attach", |r| request_span(r, handle_re_attach))
        .post("/validate", |r| request_span(r, handle_validate))
        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
        .post("/re-attach", handle_re_attach)
        .post("/validate", handle_validate)
        .post("/attach-hook", handle_attach_hook)
}

#[tokio::main]
@@ -345,11 +345,6 @@ impl PageServerNode {
                .remove("compaction_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
            compaction_algorithm: settings
                .remove("compaction_algorithm")
                .map(serde_json::from_str)
                .transpose()
                .context("Failed to parse 'compaction_algorithm' json")?,
            gc_horizon: settings
                .remove("gc_horizon")
                .map(|x| x.parse::<u64>())

@@ -445,11 +440,6 @@ impl PageServerNode {
                .map(|x| x.parse::<usize>())
                .transpose()
                .context("Failed to parse 'compaction_threshold' as an integer")?,
            compaction_algorithm: settings
                .remove("compactin_algorithm")
                .map(serde_json::from_str)
                .transpose()
                .context("Failed to parse 'compaction_algorithm' json")?,
            gc_horizon: settings
                .remove("gc_horizon")
                .map(|x| x.parse::<u64>())
@@ -1,108 +0,0 @@
# Updating Postgres

## Minor Versions

When upgrading to a new minor version of Postgres, please follow these steps:

_Example: 15.4 is the new minor version to upgrade to from 15.3._

1. Clone the Neon Postgres repository if you have not done so already.

   ```shell
   git clone git@github.com:neondatabase/postgres.git
   ```

1. Add the Postgres upstream remote.

   ```shell
   git remote add upstream https://git.postgresql.org/git/postgresql.git
   ```

1. Create a new branch based on the stable branch you are updating.

   ```shell
   git checkout -b my-branch REL_15_STABLE_neon
   ```

1. Tag the last commit on the stable branch you are updating.

   ```shell
   git tag REL_15_3_neon
   ```

1. Push the new tag to the Neon Postgres repository.

   ```shell
   git push origin REL_15_3_neon
   ```

1. Find the release tags you're looking for. They are of the form `REL_X_Y`.

1. Rebase the branch you created on the tag and resolve any conflicts.

   ```shell
   git fetch upstream REL_15_4
   git rebase REL_15_4
   ```

1. Run the Postgres test suite to make sure our commits have not affected
   Postgres in a negative way.

   ```shell
   make check
   # OR
   meson test -C builddir
   ```

1. Push your branch to the Neon Postgres repository.

   ```shell
   git push origin my-branch
   ```

1. Clone the Neon repository if you have not done so already.

   ```shell
   git clone git@github.com:neondatabase/neon.git
   ```

1. Create a new branch.

1. Change the `revisions.json` file to point at the HEAD of your Postgres
   branch.

1. Update the Git submodule.

   ```shell
   git submodule set-branch --branch my-branch vendor/postgres-v15
   git submodule update --remote vendor/postgres-v15
   ```

1. Run the Neon test suite to make sure that Neon is still good to go on this
   minor Postgres release.

   ```shell
   ./scripts/pytest -k pg15
   ```

1. Commit your changes.

1. Create a pull request, and wait for CI to go green.

1. Force push the rebased Postgres branches into the Neon Postgres repository.

   ```shell
   git push --force origin my-branch:REL_15_STABLE_neon
   ```

   It may require disabling various branch protections.

1. Update your Neon PR to point at the branches.

   ```shell
   git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
   git commit --amend --no-edit
   git push --force origin
   ```

1. Merge the pull request after getting approval(s) and CI completion.
@@ -227,8 +227,6 @@ pub struct TenantConfig {
    pub compaction_target_size: Option<u64>,
    pub compaction_period: Option<String>,
    pub compaction_threshold: Option<usize>,
    // defer parsing compaction_algorithm, like eviction_policy
    pub compaction_algorithm: Option<serde_json::Value>,
    pub gc_horizon: Option<u64>,
    pub gc_period: Option<String>,
    pub image_creation_threshold: Option<usize>,

@@ -326,7 +324,6 @@ impl TenantConfigRequest {
            compaction_target_size: None,
            compaction_period: None,
            compaction_threshold: None,
            compaction_algorithm: None,
            gc_horizon: None,
            gc_period: None,
            image_creation_threshold: None,
@@ -14,7 +14,6 @@ macro_rules! xlog_utils_test {
    ($version:ident) => {
        #[path = "."]
        mod $version {
            #[allow(unused_imports)]
            pub use postgres_ffi::$version::wal_craft_test_export::*;
            #[allow(clippy::duplicate_mod)]
            #[cfg(test)]
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
///
/// See docs/rfcs/025-generation-numbers.md for detail on how generation
/// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
pub enum Generation {
    // Generations with this magic value will not add a suffix to S3 keys, and will not
    // be included in persisted index_part.json. This value is only to be used
@@ -14,11 +14,6 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;

use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "libmetrics_metric_handler_requests_total",

@@ -151,89 +146,94 @@ impl Drop for RequestCancelled {
    }
}

/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
pub struct ChannelWriter {
    buffer: BytesMut,
    pub tx: mpsc::Sender<std::io::Result<Bytes>>,
    written: usize,
}

impl ChannelWriter {
    pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
        assert_ne!(buf_len, 0);
        ChannelWriter {
            // split about half off the buffer from the start, because we flush depending on
            // capacity. first flush will come sooner than without this, but now resizes will
            // have better chance of picking up the "other" half. not guaranteed of course.
            buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
            tx,
            written: 0,
        }
    }

    pub fn flush0(&mut self) -> std::io::Result<usize> {
        let n = self.buffer.len();
        if n == 0 {
            return Ok(0);
        }

        tracing::trace!(n, "flushing");
        let ready = self.buffer.split().freeze();

        // not ideal to call from blocking code to block_on, but we are sure that this
        // operation does not spawn_blocking other tasks
        let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
            self.tx.send(Ok(ready)).await.map_err(|_| ())?;

            // throttle sending to allow reuse of our buffer in `write`.
            self.tx.reserve().await.map_err(|_| ())?;

            // now the response task has picked up the buffer and hopefully started
            // sending it to the client.
            Ok(())
        });
        if res.is_err() {
            return Err(std::io::ErrorKind::BrokenPipe.into());
        }
        self.written += n;
        Ok(n)
    }

    pub fn flushed_bytes(&self) -> usize {
        self.written
    }
}

impl std::io::Write for ChannelWriter {
    fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
        let remaining = self.buffer.capacity() - self.buffer.len();

        let out_of_space = remaining < buf.len();

        let original_len = buf.len();

        if out_of_space {
            let can_still_fit = buf.len() - remaining;
            self.buffer.extend_from_slice(&buf[..can_still_fit]);
            buf = &buf[can_still_fit..];
            self.flush0()?;
        }

        // assume that this will often under normal operation just move the pointer back to the
        // beginning of allocation, because previous split off parts are already sent and
        // dropped.
        self.buffer.extend_from_slice(buf);
        Ok(original_len)
    }

    fn flush(&mut self) -> std::io::Result<()> {
        self.flush0().map(|_| ())
    }
}

async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    use bytes::{Bytes, BytesMut};
    use std::io::Write as _;
    use tokio::sync::mpsc;
    use tokio_stream::wrappers::ReceiverStream;

    SERVE_METRICS_COUNT.inc();

    /// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
    struct ChannelWriter {
        buffer: BytesMut,
        tx: mpsc::Sender<std::io::Result<Bytes>>,
        written: usize,
    }

    impl ChannelWriter {
        fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
            assert_ne!(buf_len, 0);
            ChannelWriter {
                // split about half off the buffer from the start, because we flush depending on
                // capacity. first flush will come sooner than without this, but now resizes will
                // have better chance of picking up the "other" half. not guaranteed of course.
                buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
                tx,
                written: 0,
            }
        }

        fn flush0(&mut self) -> std::io::Result<usize> {
            let n = self.buffer.len();
            if n == 0 {
                return Ok(0);
            }

            tracing::trace!(n, "flushing");
            let ready = self.buffer.split().freeze();

            // not ideal to call from blocking code to block_on, but we are sure that this
            // operation does not spawn_blocking other tasks
            let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
                self.tx.send(Ok(ready)).await.map_err(|_| ())?;

                // throttle sending to allow reuse of our buffer in `write`.
                self.tx.reserve().await.map_err(|_| ())?;

                // now the response task has picked up the buffer and hopefully started
                // sending it to the client.
                Ok(())
            });
            if res.is_err() {
                return Err(std::io::ErrorKind::BrokenPipe.into());
            }
            self.written += n;
            Ok(n)
        }

        fn flushed_bytes(&self) -> usize {
            self.written
        }
    }

    impl std::io::Write for ChannelWriter {
        fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
            let remaining = self.buffer.capacity() - self.buffer.len();

            let out_of_space = remaining < buf.len();

            let original_len = buf.len();

            if out_of_space {
                let can_still_fit = buf.len() - remaining;
                self.buffer.extend_from_slice(&buf[..can_still_fit]);
                buf = &buf[can_still_fit..];
                self.flush0()?;
            }

            // assume that this will often under normal operation just move the pointer back to the
            // beginning of allocation, because previous split off parts are already sent and
            // dropped.
            self.buffer.extend_from_slice(buf);
            Ok(original_len)
        }

        fn flush(&mut self) -> std::io::Result<()> {
            self.flush0().map(|_| ())
        }
    }

    let started_at = std::time::Instant::now();

    let (tx, rx) = mpsc::channel(1);
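For orientation, a minimal sketch of how a writer of this shape is driven end to end (an assumed wiring for illustration, not code from this change; the handler in the hunk above is the authoritative usage): a blocking task writes and flushes into the channel while the async side turns the received chunks into a streaming body.

```rust
use bytes::Bytes;
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

async fn stream_response() -> hyper::Body {
    let (tx, rx) = mpsc::channel::<std::io::Result<Bytes>>(1);
    // Writing must happen on a blocking thread: flush0() calls
    // Handle::current().block_on(), which would panic on an async worker
    // thread but is allowed on spawn_blocking threads, since those still
    // carry the runtime context.
    tokio::task::spawn_blocking(move || {
        let mut writer = ChannelWriter::new(64 * 1024, tx);
        writer.write_all(b"payload produced incrementally...")?;
        writer.flush()?;
        Ok::<_, std::io::Error>(())
    });
    // Every flushed buffer arrives as one Bytes frame of the response body.
    hyper::Body::wrap_stream(ReceiverStream::new(rx))
}
```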
@@ -1,7 +1,4 @@
use std::sync::{
    atomic::{AtomicUsize, Ordering},
    Arc, Mutex, MutexGuard,
};
use std::sync::{Arc, Mutex, MutexGuard};
use tokio::sync::Semaphore;

/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of

@@ -13,7 +10,6 @@ use tokio::sync::Semaphore;
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
pub struct OnceCell<T> {
    inner: Mutex<Inner<T>>,
    initializers: AtomicUsize,
}

impl<T> Default for OnceCell<T> {

@@ -21,7 +17,6 @@ impl<T> Default for OnceCell<T> {
    fn default() -> Self {
        Self {
            inner: Default::default(),
            initializers: AtomicUsize::new(0),
        }
    }
}

@@ -54,7 +49,6 @@ impl<T> OnceCell<T> {
            init_semaphore: Arc::new(sem),
            value: Some(value),
        }),
        initializers: AtomicUsize::new(0),
    }
}

@@ -66,8 +60,8 @@ impl<T> OnceCell<T> {
    /// Initialization is panic-safe and cancellation-safe.
    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
    where
        F: FnOnce(InitPermit) -> Fut,
        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
        F: FnOnce() -> Fut,
        Fut: std::future::Future<Output = Result<T, E>>,
    {
        let sem = {
            let guard = self.inner.lock().unwrap();

@@ -77,61 +71,29 @@ impl<T> OnceCell<T> {
            guard.init_semaphore.clone()
        };

        let permit = {
            // increment the count for the duration of queued
            let _guard = CountWaitingInitializers::start(self);
            sem.acquire_owned().await
        };
        let permit = sem.acquire_owned().await;
        if permit.is_err() {
            let guard = self.inner.lock().unwrap();
            assert!(
                guard.value.is_some(),
                "semaphore got closed, must be initialized"
            );
            return Ok(Guard(guard));
        } else {
            // now we try
            let value = factory().await?;

        match permit {
            Ok(permit) => {
                let permit = InitPermit(permit);
                let (value, _permit) = factory(permit).await?;

                let guard = self.inner.lock().unwrap();

                Ok(Self::set0(value, guard))
            }
            Err(_closed) => {
                let guard = self.inner.lock().unwrap();
                assert!(
                    guard.value.is_some(),
                    "semaphore got closed, must be initialized"
                );
                return Ok(Guard(guard));
            }
            let mut guard = self.inner.lock().unwrap();
            assert!(
                guard.value.is_none(),
                "we won permit, must not be initialized"
            );
            guard.value = Some(value);
            guard.init_semaphore.close();
            Ok(Guard(guard))
        }
    }

    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
    /// # Panics
    ///
    /// If the inner has already been initialized.
    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
        let guard = self.inner.lock().unwrap();

        // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
        // give more permits right now.
        if guard.init_semaphore.try_acquire().is_ok() {
            drop(guard);
            panic!("permit is of wrong origin");
        }

        Self::set0(value, guard)
    }

    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
        if guard.value.is_some() {
            drop(guard);
            unreachable!("we won permit, must not be initialized");
        }
        guard.value = Some(value);
        guard.init_semaphore.close();
        Guard(guard)
    }

    /// Returns a guard to an existing initialized value, if any.
    pub fn get(&self) -> Option<Guard<'_, T>> {
        let guard = self.inner.lock().unwrap();

@@ -141,28 +103,6 @@ impl<T> OnceCell<T> {
            None
        }
    }

    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
    pub fn initializer_count(&self) -> usize {
        self.initializers.load(Ordering::Relaxed)
    }
}

/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
/// initializing task for example at the end of initialization.
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);

impl<'a, T> CountWaitingInitializers<'a, T> {
    fn start(target: &'a OnceCell<T>) -> Self {
        target.initializers.fetch_add(1, Ordering::Relaxed);
        CountWaitingInitializers(target)
    }
}

impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
    fn drop(&mut self) {
        self.0.initializers.fetch_sub(1, Ordering::Relaxed);
    }
}

/// Uninteresting guard object to allow short-lived access to inspect or clone the held,

@@ -195,7 +135,7 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
    pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
        let mut swapped = Inner::default();
        let permit = swapped
            .init_semaphore

@@ -205,14 +145,11 @@ impl<'a, T> Guard<'a, T> {
        std::mem::swap(&mut *self.0, &mut swapped);
        swapped
            .value
            .map(|v| (v, InitPermit(permit)))
            .map(|v| (v, permit))
            .expect("guard is not created unless value has been initialized")
    }
}

/// Type held by OnceCell (de)initializing task.
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);

#[cfg(test)]
mod tests {
    use super::*;

@@ -248,11 +185,11 @@ mod tests {
        barrier.wait().await;
        let won = {
            let g = cell
                .get_or_init(|permit| {
                .get_or_init(|| {
                    counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                    async {
                        counters.future_polled.fetch_add(1, Ordering::Relaxed);
                        Ok::<_, Infallible>((i, permit))
                        Ok::<_, Infallible>(i)
                    }
                })
                .await

@@ -306,7 +243,7 @@ mod tests {
        deinitialization_started.wait().await;

        let started_at = tokio::time::Instant::now();
        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
        cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
            .await
            .unwrap();

@@ -321,32 +258,18 @@ mod tests {
        assert_eq!(*cell.get().unwrap(), reinit);
    }

    #[test]
    fn reinit_with_deinit_permit() {
        let cell = Arc::new(OnceCell::new(42));

        let (mol, permit) = cell.get().unwrap().take_and_deinit();
        cell.set(5, permit);
        assert_eq!(*cell.get().unwrap(), 5);

        let (five, permit) = cell.get().unwrap().take_and_deinit();
        assert_eq!(5, five);
        cell.set(mol, permit);
        assert_eq!(*cell.get().unwrap(), 42);
    }

    #[tokio::test]
    async fn initialization_attemptable_until_ok() {
        let cell = OnceCell::default();

        for _ in 0..10 {
            cell.get_or_init(|_permit| async { Err("whatever error") })
            cell.get_or_init(|| async { Err("whatever error") })
                .await
                .unwrap_err();
        }

        let g = cell
            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
            .get_or_init(|| async { Ok::<_, Infallible>("finally success") })
            .await
            .unwrap();
        assert_eq!(*g, "finally success");

@@ -358,11 +281,11 @@ mod tests {

        let barrier = tokio::sync::Barrier::new(2);

        let initializer = cell.get_or_init(|permit| async {
        let initializer = cell.get_or_init(|| async {
            barrier.wait().await;
            futures::future::pending::<()>().await;

            Ok::<_, Infallible>(("never reached", permit))
            Ok::<_, Infallible>("never reached")
        });

        tokio::select! {

@@ -375,7 +298,7 @@ mod tests {
        assert!(cell.get().is_none());

        let g = cell
            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
            .get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
            .await
            .unwrap();
        assert_eq!(*g, "now initialized");
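The permit-passing shape of `get_or_init` seen on one side of these hunks is used like this; a minimal sketch mirroring the tests above (the value and error types are illustrative):

```rust
use std::convert::Infallible;

async fn demo(cell: &OnceCell<u32>) {
    // The factory receives an InitPermit and must return it together with the
    // value, proving the value was produced while holding the init permit.
    let guard = cell
        .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
        .await
        .unwrap();
    assert_eq!(*guard, 42);
}
```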
@@ -21,6 +21,11 @@ pub struct FileCacheState {

#[derive(Debug)]
pub struct FileCacheConfig {
    /// Whether the file cache is *actually* stored in memory (e.g. by writing to
    /// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
    /// memory available for the cgroup.
    pub(crate) in_memory: bool,

    /// The size of the file cache, in terms of the size of the resource it consumes
    /// (currently: only memory)
    ///

@@ -54,9 +59,22 @@ pub struct FileCacheConfig {
    spread_factor: f64,
}

impl Default for FileCacheConfig {
    fn default() -> Self {
impl FileCacheConfig {
    pub fn default_in_memory() -> Self {
        Self {
            in_memory: true,
            // 75 %
            resource_multiplier: 0.75,
            // 640 MiB; (512 + 128)
            min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
            // ensure any increase in file cache size is split 90-10 with 10% to other memory
            spread_factor: 0.1,
        }
    }

    pub fn default_on_disk() -> Self {
        Self {
            in_memory: false,
            resource_multiplier: 0.75,
            // 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
            // memory, the kernel will just evict from its page cache, rather than e.g. killing

@@ -65,9 +83,7 @@ impl Default for FileCacheConfig {
            spread_factor: 0.1,
        }
    }
}

impl FileCacheConfig {
    /// Make sure fields of the config are consistent.
    pub fn validate(&self) -> anyhow::Result<()> {
        // Single field validity
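The `in_memory` flag exists purely for accounting; a sketch of the intended effect (an editorial illustration consistent with the runner hunks further down; the function name is made up):

```rust
// Mirrors the runner change below: the expected cache size is always `size`,
// but it only contributes to *disk* accounting when the cache is on disk.
fn expected_sizes(config: &FileCacheConfig, size: u64) -> (u64, u64) {
    // (expected_file_cache_size, expected_file_cache_disk_size)
    match config.in_memory {
        true => (size, 0),
        false => (size, size),
    }
}
```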
@@ -39,6 +39,16 @@ pub struct Args {
    #[arg(short, long)]
    pub pgconnstr: Option<String>,

    /// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
    /// kernel's page cache), and therefore should not count against available memory.
    //
    // NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
    // than a roundabout way, via whether it's on disk), but in order to be backwards compatible
    // during the switch away from an in-memory file cache, we had to default to the previous
    // behavior.
    #[arg(long)]
    pub file_cache_on_disk: bool,

    /// The address we should listen on for connection requests. For the
    /// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
    #[arg(short, long)]
@@ -156,7 +156,10 @@ impl Runner {
        // memory limits.
        if let Some(connstr) = &args.pgconnstr {
            info!("initializing file cache");
            let config = FileCacheConfig::default();
            let config = match args.file_cache_on_disk {
                true => FileCacheConfig::default_on_disk(),
                false => FileCacheConfig::default_in_memory(),
            };

            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
                .await

@@ -184,7 +187,10 @@ impl Runner {
                info!("file cache size actually got set to {actual_size}")
            }

            file_cache_disk_size = actual_size;
            if args.file_cache_on_disk {
                file_cache_disk_size = actual_size;
            }

            state.filecache = Some(file_cache);
        }

@@ -233,11 +239,17 @@ impl Runner {

        let requested_mem = target.mem;
        let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
        let expected_file_cache_size = self
        let (expected_file_cache_size, expected_file_cache_disk_size) = self
            .filecache
            .as_ref()
            .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
            .unwrap_or(0);
            .map(|file_cache| {
                let size = file_cache.config.calculate_cache_size(usable_system_memory);
                match file_cache.config.in_memory {
                    true => (size, 0),
                    false => (size, size),
                }
            })
            .unwrap_or((0, 0));
        if let Some(cgroup) = &self.cgroup {
            let (last_time, last_history) = *cgroup.watcher.borrow();

@@ -261,7 +273,7 @@ impl Runner {

            let new_threshold = self
                .config
                .cgroup_threshold(usable_system_memory, expected_file_cache_size);
                .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);

            let current = last_history.avg_non_reclaimable;

@@ -288,10 +300,13 @@ impl Runner {
                .set_file_cache_size(expected_file_cache_size)
                .await
                .context("failed to set file cache size")?;
            file_cache_disk_size = actual_usage;
            if !file_cache.config.in_memory {
                file_cache_disk_size = actual_usage;
            }
            let message = format!(
                "set file cache size to {} MiB",
                "set file cache size to {} MiB (in memory = {})",
                bytes_to_mebibytes(actual_usage),
                file_cache.config.in_memory,
            );
            info!("downscale: {message}");
            status.push(message);

@@ -342,7 +357,9 @@ impl Runner {
                .set_file_cache_size(expected_usage)
                .await
                .context("failed to set file cache size")?;
            file_cache_disk_size = actual_usage;
            if !file_cache.config.in_memory {
                file_cache_disk_size = actual_usage;
            }

            if actual_usage != expected_usage {
                warn!(
@@ -68,7 +68,6 @@ url.workspace = true
walkdir.workspace = true
metrics.workspace = true
pageserver_api.workspace = true
pageserver_compaction.workspace = true
postgres_connection.workspace = true
postgres_ffi.workspace = true
pq_proto.workspace = true
@@ -1,53 +0,0 @@
[package]
name = "pageserver_compaction"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[features]
default = []

[dependencies]
anyhow.workspace = true
async-compression.workspace = true
async-stream.workspace = true
async-trait.workspace = true
byteorder.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = ["serde"] }
clap = { workspace = true, features = ["string"] }
const_format.workspace = true
consumption_metrics.workspace = true
crossbeam-utils.workspace = true
either.workspace = true
flate2.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
itertools.workspace = true
once_cell.workspace = true
pin-project-lite.workspace = true
rand.workspace = true
smallvec = { workspace = true, features = ["write"] }
svg_fmt.workspace = true
sync_wrapper.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
tokio-io-timeout.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
tracing-subscriber.workspace = true
url.workspace = true
walkdir.workspace = true
metrics.workspace = true
utils.workspace = true
workspace_hack.workspace = true

[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
@@ -1,49 +0,0 @@
# TODO

- If the key space can be perfectly partitioned at some key, perform planning on each
  partition separately. For example, if we are compacting a level with layers like this:

                      :
      +--+ +----+     :  +------+
      |  | |    |     :  |      |
      +--+ +----+     :  +------+
                      :
      +-----+ +-+     :  +--------+
      |     | | |     :  |        |
      +-----+ +-+     :  +--------+
                      :

  At the dotted line, there is a natural split in the key space, such that all
  layers are either on the left or the right of it. We can compact the
  partitions separately. We could choose to create image layers for one
  partition but not the other one, for example.

- All the layers don't have to be exactly the same size, we can choose to cut a
  layer short or stretch it a little larger than the target size, if it helps
  the overall system. We can help perfect partitions (see previous bullet point)
  to happen more frequently, by choosing the cut points wisely. For example, try
  to cut layers at boundaries of underlying image layers. And "snap to grid",
  i.e. don't cut layers at any key, but e.g. only when key % 10000 = 0.

- Avoid rewriting layers when we'd just create an identical layer to an input
  layer.

- Parallelism. The code is already split up into planning and execution, so that
  we first split up the compaction work into "Jobs", and then execute them.
  It would be straightforward to execute multiple jobs in parallel.

- Materialize extra pages in delta layers during compaction. This would reduce
  read amplification. There has been the idea of partial image layers. Materializing
  extra pages in the delta layers achieves the same goal, without introducing a new
  concept.

## Simulator

- Expand the simulator for more workloads
- Automate a test suite that runs the simulator with different workloads and
  spits out a table of results
- Model read amplification
- More sanity checking. One idea is to keep a reference count of each
  MockRecord, i.e. use Arc<MockRecord> instead of plain MockRecord, and panic if
  a MockRecord that is newer than PITR horizon is completely dropped. That would
  indicate that the record was lost.
@@ -1,214 +0,0 @@
use clap::{Parser, Subcommand};
use pageserver_compaction::simulator::MockTimeline;
use rand::Rng;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;

use utils::project_git_version;

project_git_version!(GIT_VERSION);

#[derive(Parser)]
#[command(
    version = GIT_VERSION,
    about = "Neon Pageserver compaction simulator",
    long_about = "A developer tool to visualize and test compaction"
)]
#[command(propagate_version = true)]
struct CliOpts {
    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    RunSuite,
    Simulate(SimulateCmd),
}

#[derive(Clone, clap::ValueEnum)]
enum Distribution {
    Uniform,
    HotCold,
}

/// Read and update pageserver metadata file
#[derive(Parser)]
struct SimulateCmd {
    distribution: Distribution,

    /// Number of records to digest
    num_records: u64,
    /// Record length
    record_len: u64,

    // Logical database size in MB
    logical_size: u64,
}

async fn simulate(cmd: &SimulateCmd, results_path: &Path) -> anyhow::Result<()> {
    let mut executor = MockTimeline::new();

    // Convert the logical size in MB into a key range.
    let key_range = 0..((cmd.logical_size * 1024 * 1024) / 8192);
    //let key_range = u64::MIN..u64::MAX;
    println!(
        "starting simulation with key range {:016X}-{:016X}",
        key_range.start, key_range.end
    );

    // helper function to print progress indicator
    let print_progress = |i| -> anyhow::Result<()> {
        if i == 0 || (i + 1) % 10000 == 0 || i == cmd.num_records - 1 {
            print!(
                "\ringested {} / {} records, {} MiB / {} MiB...",
                i + 1,
                cmd.num_records,
                (i + 1) * cmd.record_len / (1_000_000),
                cmd.num_records * cmd.record_len / (1_000_000),
            );
            std::io::stdout().flush()?;
        }
        Ok(())
    };

    match cmd.distribution {
        Distribution::Uniform => {
            for i in 0..cmd.num_records {
                executor.ingest_uniform(1, cmd.record_len, &key_range)?;
                executor.compact_if_needed().await?;

                print_progress(i)?;
            }
        }
        Distribution::HotCold => {
            let splitpoint = key_range.end / 10;
            let hot_key_range = 0..splitpoint;
            let cold_key_range = splitpoint..key_range.end;

            for i in 0..cmd.num_records {
                let chosen_range = if rand::thread_rng().gen_bool(0.9) {
                    &hot_key_range
                } else {
                    &cold_key_range
                };
                executor.ingest_uniform(1, cmd.record_len, chosen_range)?;
                executor.compact_if_needed().await?;

                print_progress(i)?;
            }
        }
    }
    println!("done!");
    executor.flush_l0();
    executor.compact_if_needed().await?;
    let stats = executor.print_stats()?;

    // Print the stats to stdout, and also to a file
    print!("{}", stats);
    std::fs::write(results_path.join("stats.txt"), stats)?;

    let animation_path = results_path.join("compaction-animation.html");
    executor.draw_history(std::fs::File::create(&animation_path)?)?;
    println!(
        "animation: file://{}",
        animation_path.canonicalize()?.display()
    );

    Ok(())
}

async fn run_suite_cmd(results_path: &Path, workload: &SimulateCmd) -> anyhow::Result<()> {
    std::fs::create_dir(results_path)?;

    set_log_file(File::create(results_path.join("log"))?);
    let result = simulate(workload, results_path).await;
    set_log_stdout();
    result
}

async fn run_suite() -> anyhow::Result<()> {
    let top_results_path = PathBuf::from(format!(
        "compaction-suite-results.{}",
        std::time::SystemTime::UNIX_EPOCH.elapsed()?.as_secs()
    ));
    std::fs::create_dir(&top_results_path)?;

    let workload = SimulateCmd {
        distribution: Distribution::Uniform,
        // Generate 20 GB of WAL
        record_len: 1_000,
        num_records: 20_000_000,
        // Logical size 5 GB
        logical_size: 5_000,
    };

    run_suite_cmd(&top_results_path.join("uniform-20GB-5GB"), &workload).await?;

    println!(
        "All tests finished. Results in {}",
        top_results_path.display()
    );
    Ok(())
}

use std::fs::File;
use std::io::Stdout;
use std::sync::Mutex;
use tracing_subscriber::fmt::writer::EitherWriter;
use tracing_subscriber::fmt::MakeWriter;

static LOG_FILE: OnceLock<Mutex<EitherWriter<File, Stdout>>> = OnceLock::new();
fn get_log_output() -> &'static Mutex<EitherWriter<File, Stdout>> {
    LOG_FILE.get_or_init(|| std::sync::Mutex::new(EitherWriter::B(std::io::stdout())))
}

fn set_log_file(f: File) {
    *get_log_output().lock().unwrap() = EitherWriter::A(f);
}

fn set_log_stdout() {
    *get_log_output().lock().unwrap() = EitherWriter::B(std::io::stdout());
}

fn init_logging() -> anyhow::Result<()> {
    // We fall back to printing all spans at info-level or above if
    // the RUST_LOG environment variable is not set.
    let rust_log_env_filter = || {
        tracing_subscriber::EnvFilter::try_from_default_env()
            .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"))
    };

    // NB: the order of the with() calls does not matter.
    // See https://docs.rs/tracing-subscriber/0.3.16/tracing_subscriber/layer/index.html#per-layer-filtering
    use tracing_subscriber::prelude::*;
    tracing_subscriber::registry()
        .with({
            let log_layer = tracing_subscriber::fmt::layer()
                .with_target(false)
                .with_ansi(false)
                .with_writer(|| get_log_output().make_writer());
            log_layer.with_filter(rust_log_env_filter())
        })
        .init();

    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let cli = CliOpts::parse();

    init_logging()?;

    match cli.command {
        Commands::Simulate(cmd) => {
            simulate(&cmd, &PathBuf::from("/tmp/compactions.html")).await?;
        }
        Commands::RunSuite => {
            run_suite().await?;
        }
    };
    Ok(())
}
@@ -1,870 +0,0 @@
//! # Tiered compaction algorithm.
//!
//! Read all the input delta files, and write a new set of delta files that
//! include all the input WAL records. See retile_deltas().
//!
//! In a "normal" LSM tree, you get to remove any values that are overwritten by
//! later values, but in our system, we keep all the history. So the reshuffling
//! doesn't remove any garbage, it just reshuffles the records to reduce read
//! amplification, i.e. the number of files that you need to access to find the
//! WAL records for a given key.
//!
//! If the new delta files would be very "narrow", i.e. each file would cover
//! only a narrow key range, then we create a new set of image files
//! instead. The current threshold is that if the estimated total size of the
//! image layers is smaller than the size of the deltas, then we create image
//! layers. That amounts to 2x storage amplification, and it means that the
//! distance of image layers in LSN dimension is roughly equal to the logical
//! database size. For example, if the logical database size is 10 GB, we would
//! generate new image layers every 10 GB of WAL.
//!
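The image-versus-delta threshold described in the module doc, as a sketch (the function and parameter names are illustrative, not the crate's API):

```rust
// Create image layers instead of re-tiled deltas when the estimated images
// would be smaller than the deltas; this caps storage amplification at ~2x,
// so with a 10 GB logical size, new images appear roughly every 10 GB of WAL.
fn should_create_images(estimated_image_size: u64, total_delta_size: u64) -> bool {
    estimated_image_size < total_delta_size
}
```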
use futures::StreamExt;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::helpers::{accum_key_values, keyspace_total_size, merge_delta_keys, overlaps_with};
|
||||
use crate::interface::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::identify_levels::identify_level;
|
||||
|
||||
/// Main entry point to compaction.
///
/// The starting point is a cutoff LSN (`end_lsn`). The compaction is run on
/// everything below that point that needs compaction. The cutoff LSN must
/// partition the layers so that there are no layers that span across that
/// LSN. To start compaction at the top of the tree, pass the end LSN of the
/// last written L0 layer.
pub async fn compact_tiered<E: CompactionJobExecutor>(
    executor: &mut E,
    end_lsn: Lsn,
    target_file_size: u64,
    fanout: u64,
    ctx: &E::RequestContext,
) -> anyhow::Result<()> {
    assert!(fanout >= 2);
    // Start at L0
    let mut current_level_no = 0;
    let mut current_level_target_height = target_file_size;
    loop {
        // end LSN +1 to include possible image layers exactly at 'end_lsn'.
        let all_layers = executor
            .get_layers(
                &(E::Key::MIN..E::Key::MAX),
                &(Lsn(u64::MIN)..end_lsn + 1),
                ctx,
            )
            .await?;
        info!(
            "Compacting L{}, total # of layers: {}",
            current_level_no,
            all_layers.len()
        );

        // Identify the range of LSNs that belong to this level. We assume that
        // each file in this level spans an LSN range up to 1.75x the target file
        // size. That should give us enough slop that if we created a slightly
        // oversized L0 layer, e.g. because flushing the in-memory layer was
        // delayed for some reason, we don't consider the oversized layer to
        // belong to L1. But not so much slop that we accidentally "skip"
        // levels.
        let max_height = (current_level_target_height as f64 * 1.75) as u64;
        let Some(level) = identify_level(all_layers, end_lsn, max_height).await? else {
            break;
        };

        // Calculate the height of this level. If the # of tiers exceeds the
        // fanout parameter, it's time to compact it.
        let depth = level.depth();
        info!(
            "Level {} identified as LSN range {}-{}: depth {}",
            current_level_no, level.lsn_range.start, level.lsn_range.end, depth
        );
        for l in &level.layers {
            debug!("LEVEL {} layer: {}", current_level_no, l.short_id());
        }
        if depth < fanout {
            debug!(
                level = current_level_no,
                depth = depth,
                fanout,
                "too few deltas to compact"
            );
            break;
        }

        compact_level(
            &level.lsn_range,
            &level.layers,
            executor,
            target_file_size,
            ctx,
        )
        .await?;
        if target_file_size == u64::MAX {
            break;
        }
        current_level_no += 1;
        current_level_target_height = current_level_target_height.saturating_mul(fanout);
    }
    Ok(())
}
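
// Illustrative only, not part of the original file: a caller typically drives
// this entry point the same way the simulator's `MockTimeline::compact()`
// does (see simulator.rs further down), passing the end LSN of the last
// flushed L0 layer as the cutoff:
//
//     compact_tiered(&mut executor, last_flush_lsn, target_file_size, fanout, &ctx).await?;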

async fn compact_level<E: CompactionJobExecutor>(
    lsn_range: &Range<Lsn>,
    layers: &[E::Layer],
    executor: &mut E,
    target_file_size: u64,
    ctx: &E::RequestContext,
) -> anyhow::Result<bool> {
    let mut layer_fragments = Vec::new();
    for l in layers {
        layer_fragments.push(LayerFragment::new(l.clone()));
    }

    let mut state = LevelCompactionState {
        target_file_size,
        _lsn_range: lsn_range.clone(),
        layers: layer_fragments,
        jobs: Vec::new(),
        job_queue: Vec::new(),
        next_level: false,
        executor,
    };

    let first_job = CompactionJob {
        key_range: E::Key::MIN..E::Key::MAX,
        lsn_range: lsn_range.clone(),
        strategy: CompactionStrategy::Divide,
        input_layers: state
            .layers
            .iter()
            .enumerate()
            .map(|i| LayerId(i.0))
            .collect(),
        completed: false,
    };

    state.jobs.push(first_job);
    state.job_queue.push(JobId(0));
    state.execute(ctx).await?;

    info!(
        "compaction completed! Need to process next level: {}",
        state.next_level
    );

    Ok(state.next_level)
}

/// Blackboard that keeps track of the state of all the jobs and work remaining
struct LevelCompactionState<'a, E>
where
    E: CompactionJobExecutor,
{
    // parameters
    target_file_size: u64,

    _lsn_range: Range<Lsn>,
    layers: Vec<LayerFragment<E>>,

    // job queue
    jobs: Vec<CompactionJob<E>>,
    job_queue: Vec<JobId>,

    /// If false, no need to compact levels below this
    next_level: bool,

    /// Interface to the outside world
    executor: &'a mut E,
}

#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct LayerId(usize);
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
struct JobId(usize);

struct PendingJobSet {
    pending: HashSet<JobId>,
    completed: HashSet<JobId>,
}

impl PendingJobSet {
    fn new() -> Self {
        PendingJobSet {
            pending: HashSet::new(),
            completed: HashSet::new(),
        }
    }

    fn complete_job(&mut self, job_id: JobId) {
        self.pending.remove(&job_id);
        self.completed.insert(job_id);
    }

    fn all_completed(&self) -> bool {
        self.pending.is_empty()
    }
}
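
// Illustrative only, not part of the original file: a minimal check of the
// bookkeeping above. A layer guarded by a PendingJobSet becomes deletable
// once every stakeholder job has moved from 'pending' to 'completed'.
#[cfg(test)]
mod pending_job_set_example {
    use super::*;

    #[test]
    fn deletable_only_after_all_jobs_complete() {
        let mut set = PendingJobSet::new();
        set.pending.insert(JobId(0));
        set.pending.insert(JobId(1));
        set.complete_job(JobId(0));
        assert!(!set.all_completed());
        set.complete_job(JobId(1));
        assert!(set.all_completed());
    }
}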

// When we decide to rewrite a set of layers, LayerFragment is used to keep
// track of which new layers supersede an old layer. When all the stakeholder
// jobs have completed, this layer can be deleted.
struct LayerFragment<E>
where
    E: CompactionJobExecutor,
{
    layer: E::Layer,

    // If we will write new layers to replace this one, this keeps track of the
    // jobs that need to complete before this layer can be deleted. As the jobs
    // complete, they are moved from the 'pending' to the 'completed' set. Once
    // the 'pending' set becomes empty, the layer can be deleted.
    //
    // If None, this layer is not rewritten and must not be deleted.
    deletable_after: Option<PendingJobSet>,

    deleted: bool,
}

impl<E> LayerFragment<E>
where
    E: CompactionJobExecutor,
{
    fn new(layer: E::Layer) -> Self {
        LayerFragment {
            layer,
            deletable_after: None,
            deleted: false,
        }
    }
}

#[derive(PartialEq)]
enum CompactionStrategy {
    Divide,
    CreateDelta,
    CreateImage,
}

#[allow(dead_code)] // Todo
struct CompactionJob<E: CompactionJobExecutor> {
    key_range: Range<E::Key>,
    lsn_range: Range<Lsn>,

    strategy: CompactionStrategy,

    input_layers: Vec<LayerId>,

    completed: bool,
}

impl<'a, E> LevelCompactionState<'a, E>
where
    E: CompactionJobExecutor,
{
    /// Main loop of the executor.
    ///
    /// In each iteration, we take the next job from the queue, and execute it.
    /// The execution might add new jobs to the queue. Keep going until the
    /// queue is empty.
    ///
    /// Initially, the job queue consists of one Divide job over the whole
    /// level. On first call, it is divided into smaller jobs.
    ///
    async fn execute(&mut self, ctx: &E::RequestContext) -> anyhow::Result<()> {
        // TODO: this would be pretty straightforward to parallelize with FuturesUnordered
        while let Some(next_job_id) = self.job_queue.pop() {
            info!("executing job {}", next_job_id.0);
            self.execute_job(next_job_id, ctx).await?;
        }

        // all done!
        Ok(())
    }

    async fn execute_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
        let job = &self.jobs[job_id.0];
        match job.strategy {
            CompactionStrategy::Divide => {
                self.divide_job(job_id, ctx).await?;
                Ok(())
            }
            CompactionStrategy::CreateDelta => {
                let mut deltas: Vec<E::DeltaLayer> = Vec::new();
                let mut layer_ids: Vec<LayerId> = Vec::new();
                for layer_id in &job.input_layers {
                    let layer = &self.layers[layer_id.0].layer;
                    if let Some(dl) = self.executor.downcast_delta_layer(layer).await? {
                        deltas.push(dl.clone());
                        layer_ids.push(*layer_id);
                    }
                }

                self.executor
                    .create_delta(&job.lsn_range, &job.key_range, &deltas, ctx)
                    .await?;
                self.jobs[job_id.0].completed = true;

                // did we complete any fragments?
                for layer_id in layer_ids {
                    let l = &mut self.layers[layer_id.0];
                    if let Some(deletable_after) = l.deletable_after.as_mut() {
                        deletable_after.complete_job(job_id);
                        if deletable_after.all_completed() {
                            self.executor.delete_layer(&l.layer, ctx).await?;
                            l.deleted = true;
                        }
                    }
                }

                self.next_level = true;

                Ok(())
            }
            CompactionStrategy::CreateImage => {
                self.executor
                    .create_image(job.lsn_range.end, &job.key_range, ctx)
                    .await?;
                self.jobs[job_id.0].completed = true;

                // TODO: we could check if any layers < PITR horizon became deletable
                Ok(())
            }
        }
    }

    fn push_job(&mut self, job: CompactionJob<E>) -> JobId {
        let job_id = JobId(self.jobs.len());
        self.jobs.push(job);
        self.job_queue.push(job_id);
        job_id
    }

    ///
    /// Take a partition of the key space, and decide how to compact it.
    ///
    /// TODO: Currently, this is called exactly once for the level, and we
    /// decide whether to create new image layers to cover the whole level, or
    /// write a new set of deltas. In the future, this should try to partition
    /// the key space, and make the decision separately for each partition.
    ///
    async fn divide_job(&mut self, job_id: JobId, ctx: &E::RequestContext) -> anyhow::Result<()> {
        let job = &self.jobs[job_id.0];
        assert!(job.strategy == CompactionStrategy::Divide);

        // Check for dummy cases
        if job.input_layers.is_empty() {
            return Ok(());
        }

        let job = &self.jobs[job_id.0];
        assert!(job.strategy == CompactionStrategy::Divide);

        // Would it be better to create images for this partition?
        // Decide based on the average density of the level
        let keyspace_size = keyspace_total_size(
            &self
                .executor
                .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
                .await?,
        ) * 8192;

        let wal_size = job
            .input_layers
            .iter()
            .filter(|layer_id| self.layers[layer_id.0].layer.is_delta())
            .map(|layer_id| self.layers[layer_id.0].layer.file_size())
            .sum::<u64>();
        if keyspace_size < wal_size {
            // seems worth it
            info!(
                "covering with images, because keyspace_size is {}, size of deltas between {}-{} is {}",
                keyspace_size, job.lsn_range.start, job.lsn_range.end, wal_size
            );
            self.cover_with_images(job_id, ctx).await
        } else {
            // do deltas
            info!(
                "coverage not worth it, keyspace_size {}, wal_size {}",
                keyspace_size, wal_size
            );
            self.retile_deltas(job_id, ctx).await
        }
    }
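
    // Illustrative arithmetic for the decision above (assuming the 8 KiB page
    // size implied by the `* 8192` factor): a partition with 1000 distinct
    // keys is estimated to cost ~8 MB to materialize as images, so images are
    // chosen only once the input delta layers hold more than ~8 MB for that
    // partition, which caps storage amplification at roughly 2x.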

    // LSN
    //  ^
    //  |
    //  |                     ###|###|#####
    //  | +--+-----+--+       +--+-----+--+
    //  | |  |     |  |       |  |     |  |
    //  | +--+--+--+--+       +--+--+--+--+
    //  | |     |     |       |     |     |
    //  | +---+-+-+---+  ==>  +---+-+-+---+
    //  | |   | | |   |       |   | | |   |
    //  | +---+-+-++--+       +---+-+-++--+
    //  | |     |  |  |       |     |  |  |
    //  | +-----+--+--+       +-----+--+--+
    //  |
    //  +--------------> key
    //
    async fn cover_with_images(
        &mut self,
        job_id: JobId,
        ctx: &E::RequestContext,
    ) -> anyhow::Result<()> {
        let job = &self.jobs[job_id.0];
        assert!(job.strategy == CompactionStrategy::Divide);

        // XXX: do we still need the "holes" stuff?

        let mut new_jobs = Vec::new();

        // Slide a window through the keyspace
        let keyspace = self
            .executor
            .get_keyspace(&job.key_range, job.lsn_range.end, ctx)
            .await?;

        let mut window = KeyspaceWindow::new(
            E::Key::MIN..E::Key::MAX,
            keyspace,
            self.target_file_size / 8192,
        );
        while let Some(key_range) = window.choose_next_image() {
            new_jobs.push(CompactionJob::<E> {
                key_range,
                lsn_range: job.lsn_range.clone(),
                strategy: CompactionStrategy::CreateImage,
                input_layers: Vec::new(), // XXX: Is it OK for this to be empty for image layer?
                completed: false,
            });
        }

        for j in new_jobs.into_iter().rev() {
            let _job_id = self.push_job(j);

            // TODO: image layers don't let us delete anything. unless < PITR horizon
            //let j = &self.jobs[job_id.0];
            // for layer_id in j.input_layers.iter() {
            //     self.layers[layer_id.0].pending_stakeholders.insert(job_id);
            //}
        }

        Ok(())
    }

    // Merge the contents of all the input delta layers into a new set
    // of delta layers, based on the current partitioning.
    //
    // We split the new delta layers on the key dimension. We iterate through
    // the key space, and for each key, check if including the next key to the
    // current output layer we're building would cause the layer to become too
    // large. If so, dump the current output layer and start a new one. It's
    // possible that there is a single key with so many page versions that
    // storing all of them in a single layer file would be too large. In that
    // case, we also split on the LSN dimension.
    //
    // LSN
    //  ^
    //  |
    //  | +-----------+          +--+--+--+--+
    //  | |           |          |  |  |  |  |
    //  | +-----------+          |  |  |  |  |
    //  | |           |          |  |  |  |  |
    //  | +-----------+   ==>    |  |  |  |  |
    //  | |           |          |  |  |  |  |
    //  | +-----------+          |  |  |  |  |
    //  | |           |          |  |  |  |  |
    //  | +-----------+          +--+--+--+--+
    //  |
    //  +--------------> key
    //
    //
    // If one key (X) has a lot of page versions:
    //
    // LSN
    //  ^
    //  |                              (X)
    //  | +-----------+          +--+--+--+--+
    //  | |           |          |  |  |  |  |
    //  | +-----------+          |  |  +--+  |
    //  | |           |          |  |  |  |  |
    //  | +-----------+   ==>    |  |  |  |  |
    //  | |           |          |  |  +--+  |
    //  | +-----------+          |  |  |  |  |
    //  | |           |          |  |  |  |  |
    //  | +-----------+          +--+--+--+--+
    //  |
    //  +--------------> key
    //
    // TODO: this actually divides the layers into fixed-size chunks, not
    // based on the partitioning.
    //
    // TODO: we should also opportunistically materialize and
    // garbage collect what we can.
    async fn retile_deltas(
        &mut self,
        job_id: JobId,
        ctx: &E::RequestContext,
    ) -> anyhow::Result<()> {
        let job = &self.jobs[job_id.0];
        assert!(job.strategy == CompactionStrategy::Divide);

        // Sweep the key space left to right, running an estimate of how much
        // disk size and keyspace we have accumulated
        //
        // Once the disk size reaches the target threshold, stop and think.
        // If we have accumulated only a narrow band of keyspace, create an
        // image layer. Otherwise write a delta layer.

        // FIXME: deal with the case of lots of values for the same key

        // FIXME: we are ignoring images here. Did we already divide the work
        // so that we won't encounter them here?

        let mut deltas: Vec<E::DeltaLayer> = Vec::new();
        for layer_id in &job.input_layers {
            let l = &self.layers[layer_id.0];
            if let Some(dl) = self.executor.downcast_delta_layer(&l.layer).await? {
                deltas.push(dl.clone());
            }
        }
        // Open stream
        let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));
        let mut new_jobs = Vec::new();

        // Slide a window through the keyspace
        let mut key_accum = std::pin::pin!(accum_key_values(key_value_stream));
        let mut all_in_window: bool = false;
        let mut window = Window::new();
        loop {
            if all_in_window && window.elems.is_empty() {
                // All done!
                break;
            }
            if let Some(key_range) = window.choose_next_delta(self.target_file_size, !all_in_window)
            {
                let batch_layers: Vec<LayerId> = job
                    .input_layers
                    .iter()
                    .filter(|layer_id| {
                        overlaps_with(self.layers[layer_id.0].layer.key_range(), &key_range)
                    })
                    .cloned()
                    .collect();
                assert!(!batch_layers.is_empty());
                new_jobs.push(CompactionJob {
                    key_range,
                    lsn_range: job.lsn_range.clone(),
                    strategy: CompactionStrategy::CreateDelta,
                    input_layers: batch_layers,
                    completed: false,
                });
            } else {
                assert!(!all_in_window);
                if let Some(next_key) = key_accum.next().await.transpose()? {
                    window.feed(next_key.key, next_key.size);
                } else {
                    all_in_window = true;
                }
            }
        }

        // All the input files are rewritten. Set up the tracking for when they can
        // be deleted.
        for layer_id in job.input_layers.iter() {
            let l = &mut self.layers[layer_id.0];
            assert!(l.deletable_after.is_none());
            l.deletable_after = Some(PendingJobSet::new());
        }
        for j in new_jobs.into_iter().rev() {
            let job_id = self.push_job(j);
            let j = &self.jobs[job_id.0];
            for layer_id in j.input_layers.iter() {
                self.layers[layer_id.0]
                    .deletable_after
                    .as_mut()
                    .unwrap()
                    .pending
                    .insert(job_id);
            }
        }

        Ok(())
    }
}

// Sliding window through keyspace and values
// This is used by cover_with_images to decide on good split points
struct KeyspaceWindow<K> {
    head: KeyspaceWindowHead<K>,

    start_pos: KeyspaceWindowPos<K>,
}
struct KeyspaceWindowHead<K> {
    // overall key range to cover
    key_range: Range<K>,

    keyspace: Vec<Range<K>>,
    target_keysize: u64,
}

#[derive(Clone)]
struct KeyspaceWindowPos<K> {
    end_key: K,

    keyspace_idx: usize,

    accum_keysize: u64,
}
impl<K: CompactionKey> KeyspaceWindowPos<K> {
    fn reached_end(&self, w: &KeyspaceWindowHead<K>) -> bool {
        self.keyspace_idx == w.keyspace.len()
    }

    // Advance the cursor until it reaches 'target_keysize'.
    fn advance_until_size(&mut self, w: &KeyspaceWindowHead<K>, max_size: u64) {
        while self.accum_keysize < max_size && !self.reached_end(w) {
            let curr_range = &w.keyspace[self.keyspace_idx];
            if self.end_key < curr_range.start {
                // skip over any unused space
                self.end_key = curr_range.start;
            }

            // We're now within 'curr_range'. Can we advance past it completely?
            let distance = K::key_range_size(&(self.end_key..curr_range.end));
            if (self.accum_keysize + distance as u64) < max_size {
                // oh yeah, it fits
                self.end_key = curr_range.end;
                self.keyspace_idx += 1;
                self.accum_keysize += distance as u64;
            } else {
                // advance within the range
                let skip_key = self.end_key.skip_some();
                let distance = K::key_range_size(&(self.end_key..skip_key));
                if (self.accum_keysize + distance as u64) < max_size {
                    self.end_key = skip_key;
                    self.accum_keysize += distance as u64;
                } else {
                    self.end_key = self.end_key.next();
                    self.accum_keysize += 1;
                }
            }
        }
    }
}

impl<K> KeyspaceWindow<K>
where
    K: CompactionKey,
{
    fn new(key_range: Range<K>, keyspace: CompactionKeySpace<K>, target_keysize: u64) -> Self {
        assert!(keyspace.first().unwrap().start >= key_range.start);

        let start_key = key_range.start;
        let start_pos = KeyspaceWindowPos::<K> {
            end_key: start_key,
            keyspace_idx: 0,
            accum_keysize: 0,
        };
        Self {
            head: KeyspaceWindowHead::<K> {
                key_range,
                keyspace,
                target_keysize,
            },
            start_pos,
        }
    }

    fn choose_next_image(&mut self) -> Option<Range<K>> {
        if self.start_pos.keyspace_idx == self.head.keyspace.len() {
            // we've reached the end
            return None;
        }

        let mut next_pos = self.start_pos.clone();
        next_pos.advance_until_size(
            &self.head,
            self.start_pos.accum_keysize + self.head.target_keysize,
        );

        // See if we can gobble up the rest of the keyspace if we stretch out the layer, up to
        // 1.25x target size
        let mut end_pos = next_pos.clone();
        end_pos.advance_until_size(
            &self.head,
            self.start_pos.accum_keysize + (self.head.target_keysize * 5 / 4),
        );
        if end_pos.reached_end(&self.head) {
            // gobble up any unused keyspace between the last used key and end of the range
            assert!(end_pos.end_key <= self.head.key_range.end);
            end_pos.end_key = self.head.key_range.end;
            next_pos = end_pos;
        }

        let start_key = self.start_pos.end_key;
        self.start_pos = next_pos;
        Some(start_key..self.start_pos.end_key)
    }
}
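
// Illustrative only, not part of the original file: a sketch of how
// `choose_next_image` slices a keyspace into target-sized image jobs, using
// the simulator's `Key` (u64) as the key type. The final image is stretched
// (up to 1.25x) to swallow the tail of the keyspace.
#[cfg(test)]
mod keyspace_window_example {
    use super::*;
    use crate::simulator::Key;

    #[test]
    fn covers_keyspace_with_target_sized_images() {
        let mut window: KeyspaceWindow<Key> = KeyspaceWindow::new(0..100, vec![0..100], 40);
        assert_eq!(window.choose_next_image(), Some(0..40));
        assert_eq!(window.choose_next_image(), Some(40..80));
        // The remaining 20 keys fit within the 1.25x stretch allowance.
        assert_eq!(window.choose_next_image(), Some(80..100));
        assert_eq!(window.choose_next_image(), None);
    }
}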

// Sliding window through keyspace and values
//
// This is used to decide what layer to write next, from the beginning of the window.
//
// Candidates:
//
// 1. Create an image layer, snapping to previous images
// 2. Create a delta layer, snapping to previous images
// 3. Create an image layer, snapping to
//
//

// Take previous partitioning, based on the image layers below.
//
// Candidate is at the front:
//
// Consider stretching an image layer to the next divider? If it's close enough,
// that's the image candidate
//
// If it's too far, consider splitting at a reasonable point
//
// Is the image candidate smaller than the equivalent delta? If so,
// split off the image. Otherwise, split off one delta.
// Try to snap off the delta at a reasonable point

struct WindowElement<K> {
    start_key: K, // inclusive
    last_key: K,  // inclusive
    accum_size: u64,
}
struct Window<K> {
    elems: VecDeque<WindowElement<K>>,

    // last key that was split off, inclusive
    splitoff_key: Option<K>,
    splitoff_size: u64,
}

impl<K> Window<K>
where
    K: CompactionKey,
{
    fn new() -> Self {
        Self {
            elems: VecDeque::new(),
            splitoff_key: None,
            splitoff_size: 0,
        }
    }

    fn feed(&mut self, key: K, size: u64) {
        let last_size;
        if let Some(last) = self.elems.back_mut() {
            assert!(last.last_key <= key);
            if key == last.last_key {
                last.accum_size += size;
                return;
            }
            last_size = last.accum_size;
        } else {
            last_size = 0;
        }
        // This is a new key.
        let elem = WindowElement {
            start_key: key,
            last_key: key,
            accum_size: last_size + size,
        };
        self.elems.push_back(elem);
    }

    fn remain_size(&self) -> u64 {
        self.elems.back().unwrap().accum_size - self.splitoff_size
    }

    fn peek_size(&self) -> u64 {
        self.elems.front().unwrap().accum_size - self.splitoff_size
    }

    fn commit_upto(&mut self, mut upto: usize) {
        while upto > 1 {
            let popped = self.elems.pop_front().unwrap();
            self.elems.front_mut().unwrap().start_key = popped.start_key;
            upto -= 1;
        }
    }

    fn find_size_split(&self, target_size: u64) -> usize {
        self.elems
            .partition_point(|elem| elem.accum_size - self.splitoff_size < target_size)
    }

    fn pop(&mut self) {
        let first = self.elems.pop_front().unwrap();
        self.splitoff_size = first.accum_size;

        self.splitoff_key = Some(first.last_key);
    }

    // the difference between delta and image is that an image covers
    // any unused keyspace before and after, while a delta tries to
    // minimize that. TODO: difference not implemented
    fn pop_delta(&mut self) -> Range<K> {
        let first = self.elems.front().unwrap();
        let key_range = first.start_key..first.last_key.next();

        self.pop();
        key_range
    }

    // Prerequisite: we have enough input in the window
    //
    // On returning None, the caller should feed more data and call again
    fn choose_next_delta(&mut self, target_size: u64, has_more: bool) -> Option<Range<K>> {
        if has_more && self.elems.is_empty() {
            // Starting up
            return None;
        }

        // If we still have an undersized candidate, just keep going
        while self.peek_size() < target_size {
            if self.elems.len() > 1 {
                self.commit_upto(2);
            } else if has_more {
                return None;
            } else {
                break;
            }
        }

        // Ensure we have enough input in the window to make a good decision
        if has_more && self.remain_size() < target_size * 5 / 4 {
            return None;
        }

        // The candidate on the front is now large enough for a delta,
        // and we have enough data in the window to decide.

        // If we're willing to stretch it up to ~1.67x (5/3) the target size,
        // could we gobble up the rest of the work? This avoids creating very
        // small "tail" layers at the end of the keyspace
        if !has_more && self.remain_size() < target_size * 5 / 3 {
            self.commit_upto(self.elems.len());
        } else {
            let delta_split_at = self.find_size_split(target_size);
            self.commit_upto(delta_split_at);

            // If it's still not large enough, request the caller to fill the window
            if self.elems.len() == 1 && has_more {
                return None;
            }
        }
        Some(self.pop_delta())
    }
}
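
// Illustrative only, not part of the original file: a sketch of the delta
// window above, again borrowing the simulator's `Key` (u64). With all input
// fed and `has_more == false`, the window emits roughly target-sized key
// ranges and then drains the undersized tail.
#[cfg(test)]
mod window_example {
    use super::*;
    use crate::simulator::Key;

    #[test]
    fn splits_deltas_at_target_size() {
        let mut window: Window<Key> = Window::new();
        for key in 1..=3u64 {
            window.feed(key, 100);
        }
        // Keys 1 and 2 accumulate 200 bytes, crossing the 150-byte target.
        assert_eq!(window.choose_next_delta(150, false), Some(1..3));
        // The 100 bytes left for key 3 are flushed as a short tail.
        assert_eq!(window.choose_next_delta(150, false), Some(3..4));
        assert!(window.elems.is_empty());
    }
}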

@@ -1,251 +0,0 @@
//! This file contains generic utility functions over the interface types,
//! which could be handy for any compaction implementation.
use crate::interface::*;

use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::cmp::Ord;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;
use std::ops::{DerefMut, Range};
use std::pin::Pin;
use std::task::Poll;

pub fn keyspace_total_size<K>(keyspace: &CompactionKeySpace<K>) -> u64
where
    K: CompactionKey,
{
    let mut total = 0;
    for r in keyspace.iter() {
        total += K::key_range_size(r) as u64;
    }
    total
}

pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}
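
// Illustrative only, not part of the original file: quick checks of the two
// helpers above, using the simulator's `Key` (u64). Ranges are half-open, so
// ranges that merely touch at an endpoint do not overlap.
#[cfg(test)]
mod size_and_overlap_examples {
    use super::*;
    use crate::simulator::Key;

    #[test]
    fn total_size_sums_range_lengths() {
        let keyspace: CompactionKeySpace<Key> = vec![0..10, 20..25];
        assert_eq!(keyspace_total_size(&keyspace), 15);
    }

    #[test]
    fn touching_ranges_do_not_overlap() {
        assert!(overlaps_with(&(10..20), &(15..25)));
        assert!(!overlaps_with(&(10..20), &(20..30)));
    }
}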

pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
    let x = std::mem::take(a);
    let mut all_ranges_iter = [x.into_iter(), b.into_iter()]
        .into_iter()
        .kmerge_by(|a, b| a.start < b.start);
    let mut ranges = Vec::new();
    if let Some(first) = all_ranges_iter.next() {
        let (mut start, mut end) = (first.start, first.end);

        for r in all_ranges_iter {
            assert!(r.start >= start);
            if r.start > end {
                ranges.push(start..end);
                start = r.start;
                end = r.end;
            } else if r.end > end {
                end = r.end;
            }
        }
        ranges.push(start..end);
    }
    *a = ranges
}
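
// Illustrative only, not part of the original file: unioning keyspaces
// coalesces overlapping ranges into a sorted, non-overlapping result,
// preserving the CompactionKeySpace invariant.
#[cfg(test)]
mod union_example {
    use super::*;

    #[test]
    fn union_coalesces_overlapping_ranges() {
        let mut a: CompactionKeySpace<u64> = vec![0..10, 20..30];
        union_to_keyspace(&mut a, vec![5..25]);
        assert_eq!(a, vec![0..30]);
    }
}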

pub fn intersect_keyspace<K: Ord + Clone + Copy>(
    a: &CompactionKeySpace<K>,
    r: &Range<K>,
) -> CompactionKeySpace<K> {
    let mut ranges: Vec<Range<K>> = Vec::new();

    for x in a.iter() {
        if x.end <= r.start {
            continue;
        }
        if x.start >= r.end {
            break;
        }
        ranges.push(x.clone())
    }

    // trim the ends
    if let Some(first) = ranges.first_mut() {
        first.start = std::cmp::max(first.start, r.start);
    }
    if let Some(last) = ranges.last_mut() {
        last.end = std::cmp::min(last.end, r.end);
    }
    ranges
}
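
// Illustrative only, not part of the original file: intersecting with a
// window keeps only the overlapping ranges and trims the first and last
// ones to the window bounds.
#[cfg(test)]
mod intersect_example {
    use super::*;

    #[test]
    fn intersect_trims_the_ends() {
        let keyspace: CompactionKeySpace<u64> = vec![0..10, 20..30, 40..50];
        let trimmed = intersect_keyspace(&keyspace, &(5..45));
        assert_eq!(trimmed, vec![5..10, 20..30, 40..45]);
    }
}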

/// Create a stream that iterates through all DeltaEntrys among all input
/// layers, in key-lsn order.
///
/// This is public because the create_delta() implementation likely wants to use this too
/// TODO: move to a more shared place
pub fn merge_delta_keys<'a, E: CompactionJobExecutor>(
    layers: &'a [E::DeltaLayer],
    ctx: &'a E::RequestContext,
) -> MergeDeltaKeys<'a, E> {
    // Use a binary heap to merge the layers. Each input layer is initially
    // represented by a LazyLoadLayer::Unloaded element, which uses the start of
    // the layer's key range as the key. The first time a layer reaches the top
    // of the heap, all the keys of the layer are loaded into a sorted vector.
    //
    // This helps to keep the memory usage reasonable: we only need to hold in
    // memory the DeltaEntrys of the layers that overlap with the "current" key.
    let mut heap: BinaryHeap<LazyLoadLayer<'a, E>> = BinaryHeap::new();
    for l in layers {
        heap.push(LazyLoadLayer::Unloaded(l));
    }
    MergeDeltaKeys {
        heap,
        ctx,
        load_future: None,
    }
}
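
// Illustrative only, not part of the original file: `retile_deltas` in
// compact_tiered.rs shows the intended call pattern; the returned stream
// must be pinned before it is polled:
//
//     let key_value_stream = std::pin::pin!(merge_delta_keys::<E>(deltas.as_slice(), ctx));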

enum LazyLoadLayer<'a, E: CompactionJobExecutor> {
    Loaded(VecDeque<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>),
    Unloaded(&'a E::DeltaLayer),
}
impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> {
    fn key(&self) -> E::Key {
        match self {
            Self::Loaded(entries) => entries.front().unwrap().key(),
            Self::Unloaded(dl) => dl.key_range().start,
        }
    }
}
impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        // reverse order so that we get a min-heap
        other.key().partial_cmp(&self.key())
    }
}
impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        // reverse order so that we get a min-heap
        other.key().cmp(&self.key())
    }
}
impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> {
    fn eq(&self, other: &Self) -> bool {
        self.key().eq(&other.key())
    }
}
impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {}

type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result<Vec<E>>>;

// Stream returned by `merge_delta_keys`
pin_project! {
    #[allow(clippy::type_complexity)]
    pub struct MergeDeltaKeys<'a, E: CompactionJobExecutor> {
        heap: BinaryHeap<LazyLoadLayer<'a, E>>,

        #[pin]
        load_future: Option<LoadFuture<'a, <E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>>,

        ctx: &'a E::RequestContext,
    }
}

impl<'a, E> Stream for MergeDeltaKeys<'a, E>
where
    E: CompactionJobExecutor + 'a,
{
    type Item = anyhow::Result<<E::DeltaLayer as CompactionDeltaLayer<E>>::DeltaEntry<'a>>;

    fn poll_next(
        self: Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
    ) -> Poll<std::option::Option<<Self as futures::Stream>::Item>> {
        let mut this = self.project();
        loop {
            if let Some(mut load_future) = this.load_future.as_mut().as_pin_mut() {
                // We are waiting for loading the keys to finish
                match load_future.as_mut().poll(cx) {
                    Poll::Ready(Ok(entries)) => {
                        this.load_future.set(None);
                        *this.heap.peek_mut().unwrap() =
                            LazyLoadLayer::Loaded(VecDeque::from(entries));
                    }
                    Poll::Ready(Err(e)) => {
                        return Poll::Ready(Some(Err(e)));
                    }
                    Poll::Pending => {
                        return Poll::Pending;
                    }
                }
            }

            // If the topmost layer in the heap hasn't been loaded yet, start
            // loading it. Otherwise return the next entry from it and update
            // the layer's position in the heap (this decreaseKey operation is
            // performed implicitly when `top` is dropped).
            if let Some(mut top) = this.heap.peek_mut() {
                match top.deref_mut() {
                    LazyLoadLayer::Unloaded(ref mut l) => {
                        let fut = l.load_keys(this.ctx);
                        this.load_future.set(Some(Box::pin(fut)));
                        continue;
                    }
                    LazyLoadLayer::Loaded(ref mut entries) => {
                        let result = entries.pop_front().unwrap();
                        if entries.is_empty() {
                            std::collections::binary_heap::PeekMut::pop(top);
                        }
                        return Poll::Ready(Some(Ok(result)));
                    }
                }
            } else {
                return Poll::Ready(None);
            }
        }
    }
}

// Accumulate values at key boundaries
pub struct KeySize<K> {
    pub key: K,
    pub num_values: u64,
    pub size: u64,
}

pub fn accum_key_values<'a, I, K, D, E>(input: I) -> impl Stream<Item = Result<KeySize<K>, E>>
where
    K: Eq,
    I: Stream<Item = Result<D, E>>,
    D: CompactionDeltaEntry<'a, K>,
{
    async_stream::try_stream! {
        // Initialize the state from the first value
        let mut input = std::pin::pin!(input);

        if let Some(first) = input.next().await {
            let first = first?;
            let mut accum: KeySize<K> = KeySize {
                key: first.key(),
                num_values: 1,
                size: first.size(),
            };
            while let Some(this) = input.next().await {
                let this = this?;
                if this.key() == accum.key {
                    accum.size += this.size();
                    accum.num_values += 1;
                } else {
                    yield accum;
                    accum = KeySize {
                        key: this.key(),
                        num_values: 1,
                        size: this.size(),
                    };
                }
            }
            yield accum;
        }
    }
}
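
// Illustrative only, not part of the original file: a sketch of the
// accumulator above with a hypothetical `Entry(key, size)` type standing in
// for real delta entries. Consecutive entries with the same key collapse into
// one KeySize with the value count and sizes summed.
#[cfg(test)]
mod accum_example {
    use super::*;
    use utils::lsn::Lsn;

    struct Entry(u64, u64); // (key, size)
    impl CompactionDeltaEntry<'_, u64> for Entry {
        fn key(&self) -> u64 {
            self.0
        }
        fn lsn(&self) -> Lsn {
            Lsn(0)
        }
        fn size(&self) -> u64 {
            self.1
        }
    }

    #[tokio::test]
    async fn collapses_runs_of_the_same_key() {
        let input = futures::stream::iter(
            [Entry(1, 10), Entry(1, 5), Entry(2, 7)].map(Ok::<_, anyhow::Error>),
        );
        let sizes: Vec<KeySize<u64>> = accum_key_values(input)
            .map(|r| r.unwrap())
            .collect()
            .await;
        assert_eq!(sizes.len(), 2);
        assert_eq!((sizes[0].key, sizes[0].num_values, sizes[0].size), (1, 2, 15));
        assert_eq!((sizes[1].key, sizes[1].num_values, sizes[1].size), (2, 1, 7));
    }
}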

@@ -1,376 +0,0 @@
//! An LSM tree consists of multiple levels, each exponentially larger than the
//! previous level. And each level consists of multiple "tiers". With tiered
//! compaction, a level is compacted when it has accumulated more than N tiers,
//! forming one tier on the next level.
//!
//! In the pageserver, we don't explicitly track the levels and tiers. Instead,
//! we identify them by looking at the shapes of the layers. It's an easy task
//! for a human, but it's not straightforward to come up with the exact
//! rules. Especially if there are cases like interrupted, half-finished
//! compactions, or highly skewed data distributions that have let us "skip"
//! some levels. It's not critical to classify all cases correctly; at worst we
//! delay some compaction work, and suffer from more read amplification, or we
//! perform some unnecessary compaction work.
//!
//! `identify_level` performs that shape-matching.
//!
//! It returns a Level struct, which has a `depth()` function to count the number
//! of "tiers" in the level. The tier count is the max depth of stacked layers
//! within the level. That's a good measure, because the point of compacting is
//! to reduce read amplification, and the depth is what determines that.
//!
//! One interesting effect of this is that if we generate very small delta
//! layers at L0, e.g. because the L0 layers are flushed by timeout rather than
//! because they reach the target size, the L0 compaction will combine them to
//! one larger file. But if the combined file is still smaller than the target
//! file size, the file will still be considered to be part of L0 at the next
//! iteration.

use anyhow::bail;
use std::collections::BTreeSet;
use std::ops::Range;
use utils::lsn::Lsn;

use crate::interface::*;

use tracing::{info, trace};

pub struct Level<L> {
    pub lsn_range: Range<Lsn>,
    pub layers: Vec<L>,
}

/// Identify an LSN < `end_lsn` that partitions the LSN space, so that there are
/// no layers that cross the boundary LSN.
///
/// A further restriction is that all layers in the returned partition cover at
/// most 'lsn_max_size' LSN bytes.
pub async fn identify_level<K, L>(
    all_layers: Vec<L>,
    end_lsn: Lsn,
    lsn_max_size: u64,
) -> anyhow::Result<Option<Level<L>>>
where
    K: CompactionKey,
    L: CompactionLayer<K> + Clone,
{
    // filter out layers that are above the `end_lsn`, they are completely irrelevant.
    let mut layers = Vec::new();
    for l in all_layers {
        if l.lsn_range().start < end_lsn && l.lsn_range().end > end_lsn {
            // shouldn't happen. Indicates that the caller passed a bogus
            // end_lsn.
            bail!("identify_level() called with end_lsn that does not partition the LSN space: end_lsn {} intersects with layer {}", end_lsn, l.short_id());
        }
        // include image layers sitting exactly at `end_lsn`.
        let is_image = !l.is_delta();
        if (is_image && l.lsn_range().start > end_lsn)
            || (!is_image && l.lsn_range().start >= end_lsn)
        {
            continue;
        }
        layers.push(l);
    }
    // All the remaining layers either belong to this level, or are below it.
    info!(
        "identify level at {}, size {}, num layers below: {}",
        end_lsn,
        lsn_max_size,
        layers.len()
    );
    if layers.is_empty() {
        return Ok(None);
    }

    // Walk the ranges in LSN order.
    //
    //  ----- end_lsn
    //   |
    //   |
    //   v
    //
    layers.sort_by_key(|l| l.lsn_range().end);
    let mut candidate_start_lsn = end_lsn;
    let mut candidate_layers: Vec<L> = Vec::new();
    let mut current_best_start_lsn = end_lsn;
    let mut current_best_layers: Vec<L> = Vec::new();
    let mut iter = layers.into_iter();
    loop {
        let Some(l) = iter.next_back() else {
            // Reached end. Accept the last candidate
            current_best_start_lsn = candidate_start_lsn;
            current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
            break;
        };
        trace!(
            "inspecting {} for candidate {}, current best {}",
            l.short_id(),
            candidate_start_lsn,
            current_best_start_lsn
        );

        let r = l.lsn_range();

        // Image layers don't restrict our choice of cutoff LSN
        if l.is_delta() {
            // Is this candidate workable? In other words, are there any
            // delta layers that span across this LSN
            //
            // Valid:                 Not valid:
            //  +                      +
            //  |                      |  +
            //  +  <- candidate        +  |  <- candidate
            //                            +
            //  +                      |
            //  |                      +
            //  +
            if r.end <= candidate_start_lsn {
                // Hooray, there are no crossing LSNs. And we have visited
                // through all the layers within candidate..end_lsn. The
                // current candidate can be accepted.
                current_best_start_lsn = r.end;
                current_best_layers.extend_from_slice(&std::mem::take(&mut candidate_layers));
                candidate_start_lsn = r.start;
            }

            // Is it small enough to be considered part of this level?
            if r.end.0 - r.start.0 > lsn_max_size {
                // Too large, this layer belongs to next level. Stop.
                trace!(
                    "too large {}, size {} vs {}",
                    l.short_id(),
                    r.end.0 - r.start.0,
                    lsn_max_size
                );
                break;
            }

            // If this crosses the candidate lsn, push it down.
            if r.start < candidate_start_lsn {
                trace!(
                    "layer {} prevents from stopping at {}",
                    l.short_id(),
                    candidate_start_lsn
                );
                candidate_start_lsn = r.start;
            }
        }

        // Include this layer in our candidate
        candidate_layers.push(l);
    }

    Ok(if current_best_start_lsn == end_lsn {
        // empty level
        None
    } else {
        Some(Level {
            lsn_range: current_best_start_lsn..end_lsn,
            layers: current_best_layers,
        })
    })
}

// helper struct used in depth()
struct Event<K> {
    key: K,
    layer_idx: usize,
    start: bool,
}

impl<L> Level<L> {
    /// Count the number of deltas stacked on each other.
    pub fn depth<K>(&self) -> u64
    where
        K: CompactionKey,
        L: CompactionLayer<K>,
    {
        let mut events: Vec<Event<K>> = Vec::new();
        for (idx, l) in self.layers.iter().enumerate() {
            events.push(Event {
                key: l.key_range().start,
                layer_idx: idx,
                start: true,
            });
            events.push(Event {
                key: l.key_range().end,
                layer_idx: idx,
                start: false,
            });
        }
        events.sort_by_key(|e| (e.key, e.start));

        // Sweep the key space left to right. Stop at each distinct key, and
        // count the number of deltas on top of the highest image at that key.
        //
        // This is a little inefficient, as we walk through the active_set on
        // every key. We could increment/decrement a counter on each step
        // instead, but that'd require a bit more complex bookkeeping.
        let mut active_set: BTreeSet<(Lsn, bool, usize)> = BTreeSet::new();
        let mut max_depth = 0;
        let mut events_iter = events.iter().peekable();
        while let Some(e) = events_iter.next() {
            let l = &self.layers[e.layer_idx];
            let is_image = !l.is_delta();

            // update the active set
            if e.start {
                active_set.insert((l.lsn_range().end, is_image, e.layer_idx));
            } else {
                active_set.remove(&(l.lsn_range().end, is_image, e.layer_idx));
            }

            // recalculate depth if this was the last event at this point
            let more_events_at_this_key = events_iter
                .peek()
                .map_or(false, |next_e| next_e.key == e.key);
            if !more_events_at_this_key {
                let mut active_depth = 0;
                for (_end_lsn, is_image, _idx) in active_set.iter().rev() {
                    if *is_image {
                        break;
                    }
                    active_depth += 1;
                }
                if active_depth > max_depth {
                    max_depth = active_depth;
                }
            }
        }
        max_depth
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::simulator::{Key, MockDeltaLayer, MockImageLayer, MockLayer};
    use std::sync::{Arc, Mutex};

    fn delta(key_range: Range<Key>, lsn_range: Range<Lsn>) -> MockLayer {
        MockLayer::Delta(Arc::new(MockDeltaLayer {
            key_range,
            lsn_range,
            // identify_level() doesn't pay attention to the rest of the fields
            file_size: 0,
            deleted: Mutex::new(false),
            records: vec![],
        }))
    }

    fn image(key_range: Range<Key>, lsn: Lsn) -> MockLayer {
        MockLayer::Image(Arc::new(MockImageLayer {
            key_range,
            lsn_range: lsn..(lsn + 1),
            // identify_level() doesn't pay attention to the rest of the fields
            file_size: 0,
            deleted: Mutex::new(false),
        }))
    }

    #[tokio::test]
    async fn test_identify_level() -> anyhow::Result<()> {
        let layers = vec![
            delta(Key::MIN..Key::MAX, Lsn(0x8000)..Lsn(0x9000)),
            delta(Key::MIN..Key::MAX, Lsn(0x5000)..Lsn(0x7000)),
            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)),
            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)),
            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2000)),
        ];

        // All layers fit in the max file size
        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
            .await?
            .unwrap();
        assert_eq!(level.depth(), 6);

        // Same LSN with smaller max file size. The second layer from the top is larger
        // and belongs to the next level.
        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
            .await?
            .unwrap();
        assert_eq!(level.depth(), 1);

        // Call with a smaller LSN
        let level = identify_level(layers.clone(), Lsn(0x3000), 0x1000)
            .await?
            .unwrap();
        assert_eq!(level.depth(), 2);

        // Call with an LSN that doesn't partition the space
        let result = identify_level(layers, Lsn(0x6000), 0x1000).await;
        assert!(result.is_err());
        Ok(())
    }

    #[tokio::test]
    async fn test_overlapping_lsn_ranges() -> anyhow::Result<()> {
        // The files' LSN ranges overlap, so even though there are more files that
        // fit under the file size, they are not included in the level because they
        // overlap so that we'd need to include the oldest file, too, which is
        // larger
        let layers = vec![
            delta(Key::MIN..Key::MAX, Lsn(0x4000)..Lsn(0x5000)),
            delta(Key::MIN..Key::MAX, Lsn(0x3000)..Lsn(0x4000)), // overlap
            delta(Key::MIN..Key::MAX, Lsn(0x2500)..Lsn(0x3500)), // overlap
            delta(Key::MIN..Key::MAX, Lsn(0x2000)..Lsn(0x3000)), // overlap
            delta(Key::MIN..Key::MAX, Lsn(0x1000)..Lsn(0x2500)), // larger
        ];

        let level = identify_level(layers.clone(), Lsn(0x10000), 0x1000)
            .await?
            .unwrap();
        assert_eq!(level.depth(), 1);

        Ok(())
    }

    #[tokio::test]
    async fn test_depth_nonoverlapping() -> anyhow::Result<()> {
        // The key ranges don't overlap, so depth is only 1.
        let layers = vec![
            delta(4000..5000, Lsn(0x6000)..Lsn(0x7000)),
            delta(3000..4000, Lsn(0x7000)..Lsn(0x8000)),
            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
        ];

        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
            .await?
            .unwrap();
        assert_eq!(level.layers.len(), 3);
        assert_eq!(level.depth(), 1);

        // Staggered. The 1st and 3rd layer don't overlap with each other.
        let layers = vec![
            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
        ];

        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
            .await?
            .unwrap();
        assert_eq!(level.layers.len(), 3);
        assert_eq!(level.depth(), 2);
        Ok(())
    }

    #[tokio::test]
    async fn test_depth_images() -> anyhow::Result<()> {
        let layers: Vec<MockLayer> = vec![
            delta(1000..2000, Lsn(0x8000)..Lsn(0x9000)),
            delta(1500..2500, Lsn(0x7000)..Lsn(0x8000)),
            delta(2000..3000, Lsn(0x6000)..Lsn(0x7000)),
            // This covers the same key range as the 2nd delta layer. The depth
            // in that key range is therefore 0.
            image(1500..2500, Lsn(0x9000)),
        ];

        let level = identify_level(layers.clone(), Lsn(0x10000), 0x2000)
            .await?
            .unwrap();
        assert_eq!(level.layers.len(), 4);
        assert_eq!(level.depth(), 1);
        Ok(())
    }
}

@@ -1,152 +0,0 @@
//! This is what the compaction implementation needs to know about
//! layers, keyspace etc.
//!
//! All the heavy lifting is done by the create_image and create_delta
//! functions that the implementor provides.
//!
use async_trait::async_trait;
use std::ops::Range;
use utils::lsn::Lsn;

/// Public interface. This is the main thing that the implementor needs to provide
#[async_trait]
pub trait CompactionJobExecutor {
    // Type system.
    //
    // We assume that there are two kinds of layers, deltas and images. The
    // compaction doesn't distinguish whether they are stored locally or
    // remotely.
    //
    // The keyspace is defined by the CompactionKey trait.
    //
    type Key: CompactionKey;

    type Layer: CompactionLayer<Self::Key> + Clone;
    type DeltaLayer: CompactionDeltaLayer<Self> + Clone;
    type ImageLayer: CompactionImageLayer<Self> + Clone;

    // This is passed through to all the interface functions. The compaction
    // implementation doesn't do anything with it, but it might be useful for
    // the interface implementation.
    type RequestContext: CompactionRequestContext;

    // ----
    // Functions that the planner uses to support its decisions
    // ----

    /// Return all layers that overlap the given bounding box.
    async fn get_layers(
        &mut self,
        key_range: &Range<Self::Key>,
        lsn_range: &Range<Lsn>,
        ctx: &Self::RequestContext,
    ) -> anyhow::Result<Vec<Self::Layer>>;

    async fn get_keyspace(
        &mut self,
        key_range: &Range<Self::Key>,
        lsn: Lsn,
        ctx: &Self::RequestContext,
    ) -> anyhow::Result<CompactionKeySpace<Self::Key>>;

    /// NB: This is a pretty expensive operation. In the real pageserver
    /// implementation, it downloads the layer, and keeps it resident
    /// until the DeltaLayer is dropped.
    async fn downcast_delta_layer(
        &self,
        layer: &Self::Layer,
    ) -> anyhow::Result<Option<Self::DeltaLayer>>;

    // ----
    // Functions to execute the plan
    // ----

    /// Create a new image layer, materializing all the values in the key range,
    /// at the given 'lsn'.
    async fn create_image(
        &mut self,
        lsn: Lsn,
        key_range: &Range<Self::Key>,
        ctx: &Self::RequestContext,
    ) -> anyhow::Result<()>;

    /// Create a new delta layer, containing all the values from 'input_layers'
    /// in the given key and LSN range.
    async fn create_delta(
        &mut self,
        lsn_range: &Range<Lsn>,
        key_range: &Range<Self::Key>,
        input_layers: &[Self::DeltaLayer],
        ctx: &Self::RequestContext,
    ) -> anyhow::Result<()>;

    /// Delete a layer. The compaction implementation will call this only after
    /// all the create_image() or create_delta() calls that deletion of this
    /// layer depends on have finished. But if the implementor has extra lazy
    /// background tasks, like uploading the index json file to remote storage,
    /// it is the implementation's responsibility to track those.
    async fn delete_layer(
        &mut self,
        layer: &Self::Layer,
        ctx: &Self::RequestContext,
    ) -> anyhow::Result<()>;
}

pub trait CompactionKey: std::cmp::Ord + Clone + Copy + std::fmt::Display {
    const MIN: Self;
    const MAX: Self;

    /// Calculate the distance between key_range.start and key_range.end.
    ///
    /// This returns u32, for compatibility with Repository::key. If the
    /// distance is larger, return u32::MAX.
    fn key_range_size(key_range: &Range<Self>) -> u32;

    // return "self + 1"
    fn next(&self) -> Self;

    // return "self + <some decent amount to skip>". The amount to skip
    // is left to the implementation.
    // FIXME: why not just "add(u32)"? This is hard to use
    fn skip_some(&self) -> Self;
}
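
// Illustrative note, not part of the original file: simulator.rs in this
// crate implements CompactionKey for a plain u64 key, with MIN/MAX mapped to
// u64::MIN/u64::MAX and skip_some() jumping 100 keys at a time; it is the
// simplest reference implementation of this trait.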

/// Contiguous ranges of keys that belong to the key space. In key order, and
/// with no overlap.
pub type CompactionKeySpace<K> = Vec<Range<K>>;

/// Functions needed from all layers.
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
    fn key_range(&self) -> &Range<K>;
    fn lsn_range(&self) -> &Range<Lsn>;

    fn file_size(&self) -> u64;

    /// For debugging, short human-readable representation of the layer. E.g. filename.
    fn short_id(&self) -> String;

    fn is_delta(&self) -> bool;
}

#[async_trait]
pub trait CompactionDeltaLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {
    type DeltaEntry<'a>: CompactionDeltaEntry<'a, E::Key>
    where
        Self: 'a;

    /// Return all keys in this delta layer.
    async fn load_keys<'a>(
        &self,
        ctx: &E::RequestContext,
    ) -> anyhow::Result<Vec<Self::DeltaEntry<'_>>>;
}

pub trait CompactionImageLayer<E: CompactionJobExecutor + ?Sized>: CompactionLayer<E::Key> {}

pub trait CompactionDeltaEntry<'a, K> {
    fn key(&self) -> K;
    fn lsn(&self) -> Lsn;
    fn size(&self) -> u64;
}

pub trait CompactionRequestContext {}

@@ -1,12 +0,0 @@
// The main module implementing the compaction algorithm
pub mod compact_tiered;
pub(crate) mod identify_levels;

// Traits that the caller of the compaction needs to implement
pub mod interface;

// Utility functions, useful for the implementation
pub mod helpers;

// A simulator with mock implementations of 'interface'
pub mod simulator;
@@ -1,613 +0,0 @@
|
||||
mod draw;
|
||||
|
||||
use draw::{LayerTraceEvent, LayerTraceFile, LayerTraceOp};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use futures::StreamExt;
|
||||
use rand::Rng;
|
||||
use tracing::info;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::helpers::{merge_delta_keys, overlaps_with};
|
||||
|
||||
use crate::interface;
|
||||
use crate::interface::CompactionLayer;
|
||||
|
||||
//
|
||||
// Implementation for the CompactionExecutor interface
|
||||
//
|
||||
pub struct MockTimeline {
|
||||
// Parameters for the compaction algorithm
|
||||
pub target_file_size: u64,
|
||||
tiers_per_level: u64,
|
||||
|
||||
num_l0_flushes: u64,
|
||||
last_compact_at_flush: u64,
|
||||
last_flush_lsn: Lsn,
|
||||
|
||||
// In-memory layer
|
||||
records: Vec<MockRecord>,
|
||||
total_len: u64,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
|
||||
// Current keyspace at `end_lsn`. This is updated on every ingested record.
|
||||
keyspace: KeySpace,
|
||||
|
||||
// historic keyspaces
|
||||
old_keyspaces: Vec<(Lsn, KeySpace)>,
|
||||
|
||||
// "on-disk" layers
|
||||
pub live_layers: Vec<MockLayer>,
|
||||
|
||||
num_deleted_layers: u64,
|
||||
|
||||
// Statistics
|
||||
wal_ingested: u64,
|
||||
bytes_written: u64,
|
||||
bytes_deleted: u64,
|
||||
layers_created: u64,
|
||||
layers_deleted: u64,
|
||||
|
||||
// All the events - creation and deletion of files - are collected
|
||||
// in 'history'. It is used to draw the SVG animation at the end.
|
||||
time: u64,
|
||||
history: Vec<draw::LayerTraceEvent>,
|
||||
}
|
||||
|
||||
type KeySpace = interface::CompactionKeySpace<Key>;
|
||||
|
||||
pub struct MockRequestContext {}
|
||||
impl interface::CompactionRequestContext for MockRequestContext {}
|
||||
|
||||
pub type Key = u64;
|
||||
|
||||
impl interface::CompactionKey for Key {
|
||||
const MIN: Self = u64::MIN;
|
||||
const MAX: Self = u64::MAX;
|
||||
|
||||
fn key_range_size(key_range: &Range<Self>) -> u32 {
|
||||
std::cmp::min(key_range.end - key_range.start, u32::MAX as u64) as u32
|
||||
}
|
||||
|
||||
fn next(&self) -> Self {
|
||||
self + 1
|
||||
}
|
||||
fn skip_some(&self) -> Self {
|
||||
// round up to next xx
|
||||
self + 100
|
||||
}
|
||||
}

#[derive(Clone)]
pub struct MockRecord {
    lsn: Lsn,
    key: Key,
    len: u64,
}

impl interface::CompactionDeltaEntry<'_, Key> for MockRecord {
    fn key(&self) -> Key {
        self.key
    }
    fn lsn(&self) -> Lsn {
        self.lsn
    }
    fn size(&self) -> u64 {
        self.len
    }
}

pub struct MockDeltaLayer {
    pub key_range: Range<Key>,
    pub lsn_range: Range<Lsn>,

    pub file_size: u64,

    pub deleted: Mutex<bool>,

    pub records: Vec<MockRecord>,
}

impl interface::CompactionLayer<Key> for Arc<MockDeltaLayer> {
    fn key_range(&self) -> &Range<Key> {
        &self.key_range
    }
    fn lsn_range(&self) -> &Range<Lsn> {
        &self.lsn_range
    }

    fn file_size(&self) -> u64 {
        self.file_size
    }

    fn short_id(&self) -> String {
        format!(
            "{:016X}-{:016X}__{:08X}-{:08X}",
            self.key_range.start, self.key_range.end, self.lsn_range.start.0, self.lsn_range.end.0
        )
    }

    fn is_delta(&self) -> bool {
        true
    }
}

#[async_trait]
impl interface::CompactionDeltaLayer<MockTimeline> for Arc<MockDeltaLayer> {
    type DeltaEntry<'a> = MockRecord;

    async fn load_keys<'a>(&self, _ctx: &MockRequestContext) -> anyhow::Result<Vec<MockRecord>> {
        Ok(self.records.clone())
    }
}

pub struct MockImageLayer {
    pub key_range: Range<Key>,
    pub lsn_range: Range<Lsn>,

    pub file_size: u64,

    pub deleted: Mutex<bool>,
}

impl interface::CompactionImageLayer<MockTimeline> for Arc<MockImageLayer> {}

impl interface::CompactionLayer<Key> for Arc<MockImageLayer> {
    fn key_range(&self) -> &Range<Key> {
        &self.key_range
    }
    fn lsn_range(&self) -> &Range<Lsn> {
        &self.lsn_range
    }

    fn file_size(&self) -> u64 {
        self.file_size
    }

    fn short_id(&self) -> String {
        format!(
            "{:016X}-{:016X}__{:08X}",
            self.key_range.start, self.key_range.end, self.lsn_range.start.0,
        )
    }

    fn is_delta(&self) -> bool {
        false
    }
}

impl MockTimeline {
    pub fn new() -> Self {
        MockTimeline {
            target_file_size: 256 * 1024 * 1024,
            tiers_per_level: 4,

            num_l0_flushes: 0,
            last_compact_at_flush: 0,
            last_flush_lsn: Lsn(0),

            records: Vec::new(),
            total_len: 0,
            start_lsn: Lsn(1000),
            end_lsn: Lsn(1000),
            keyspace: KeySpace::new(),

            old_keyspaces: vec![],

            live_layers: vec![],

            num_deleted_layers: 0,

            wal_ingested: 0,
            bytes_written: 0,
            bytes_deleted: 0,
            layers_created: 0,
            layers_deleted: 0,

            time: 0,
            history: Vec::new(),
        }
    }

    pub async fn compact(&mut self) -> anyhow::Result<()> {
        let ctx = MockRequestContext {};

        crate::compact_tiered::compact_tiered(
            self,
            self.last_flush_lsn,
            self.target_file_size,
            self.tiers_per_level,
            &ctx,
        )
        .await?;

        Ok(())
    }

    // Ingest one record to the timeline
    pub fn ingest_record(&mut self, key: Key, len: u64) {
        self.records.push(MockRecord {
            lsn: self.end_lsn,
            key,
            len,
        });
        self.total_len += len;
        self.end_lsn += len;

        if self.total_len > self.target_file_size {
            self.flush_l0();
        }
    }

    pub async fn compact_if_needed(&mut self) -> anyhow::Result<()> {
        if self.num_l0_flushes - self.last_compact_at_flush >= self.tiers_per_level {
            self.compact().await?;
            self.last_compact_at_flush = self.num_l0_flushes;
        }
        Ok(())
    }

    pub fn flush_l0(&mut self) {
        if self.records.is_empty() {
            return;
        }

        let mut records = std::mem::take(&mut self.records);
        records.sort_by_key(|rec| rec.key);

        let lsn_range = self.start_lsn..self.end_lsn;
        let new_layer = Arc::new(MockDeltaLayer {
            key_range: Key::MIN..Key::MAX,
            lsn_range: lsn_range.clone(),
            file_size: self.total_len,
            records,
            deleted: Mutex::new(false),
        });
        info!("flushed L0 layer {}", new_layer.short_id());
        self.live_layers.push(MockLayer::from(&new_layer));

        // reset L0
        self.start_lsn = self.end_lsn;
        self.total_len = 0;
        self.records = Vec::new();

        self.layers_created += 1;
        self.bytes_written += new_layer.file_size;

        self.time += 1;
        self.history.push(LayerTraceEvent {
            time_rel: self.time,
            op: LayerTraceOp::Flush,
            file: LayerTraceFile {
                filename: new_layer.short_id(),
                key_range: new_layer.key_range.clone(),
                lsn_range: new_layer.lsn_range.clone(),
            },
        });

        self.num_l0_flushes += 1;
        self.last_flush_lsn = self.end_lsn;
    }

    // Ingest `num_records' records to the timeline, with random keys
    // uniformly distributed in `key_range`
    pub fn ingest_uniform(
        &mut self,
        num_records: u64,
        len: u64,
        key_range: &Range<Key>,
    ) -> anyhow::Result<()> {
        crate::helpers::union_to_keyspace(&mut self.keyspace, vec![key_range.clone()]);
        let mut rng = rand::thread_rng();
        for _ in 0..num_records {
            self.ingest_record(rng.gen_range(key_range.clone()), len);
            self.wal_ingested += len;
        }
        Ok(())
    }

    pub fn print_stats(&self) -> anyhow::Result<String> {
        let mut s = String::new();

        writeln!(s, "STATISTICS:")?;
        writeln!(
            s,
            "WAL ingested:  {:>10} MB",
            self.wal_ingested / (1024 * 1024)
        )?;
        writeln!(
            s,
            "size created:  {:>10} MB",
            self.bytes_written / (1024 * 1024)
        )?;
        writeln!(
            s,
            "size deleted:  {:>10} MB",
            self.bytes_deleted / (1024 * 1024)
        )?;
        writeln!(s, "files created: {:>10}", self.layers_created)?;
        writeln!(s, "files deleted: {:>10}", self.layers_deleted)?;
        writeln!(
            s,
            "write amp:     {:>10.2}",
            self.bytes_written as f64 / self.wal_ingested as f64
        )?;
        writeln!(
            s,
            "storage amp:   {:>10.2}",
            (self.bytes_written - self.bytes_deleted) as f64 / self.wal_ingested as f64
        )?;

        Ok(s)
    }

    pub fn draw_history<W: std::io::Write>(&self, output: W) -> anyhow::Result<()> {
        draw::draw_history(&self.history, output)
    }
}

impl Default for MockTimeline {
    fn default() -> Self {
        Self::new()
    }
}
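
// Editor's sketch, not part of the diff: how the pieces above fit together.
// A driver loop ingests random records, compacts once enough L0 layers have
// accumulated, then dumps the stats and the HTML layer-history animation.
// The output file name here is purely illustrative.
#[allow(dead_code)]
async fn drive_simulator_sketch() -> anyhow::Result<()> {
    let mut tl = MockTimeline::new();
    for _ in 0..100 {
        // 1000 records of 1 KB each, keys uniform in 0..100_000
        tl.ingest_uniform(1000, 1024, &(0..100_000))?;
        tl.compact_if_needed().await?;
    }
    tl.flush_l0();
    print!("{}", tl.print_stats()?);
    tl.draw_history(std::fs::File::create("history.html")?)?;
    Ok(())
}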

#[derive(Clone)]
pub enum MockLayer {
    Delta(Arc<MockDeltaLayer>),
    Image(Arc<MockImageLayer>),
}

impl interface::CompactionLayer<Key> for MockLayer {
    fn key_range(&self) -> &Range<Key> {
        match self {
            MockLayer::Delta(this) => this.key_range(),
            MockLayer::Image(this) => this.key_range(),
        }
    }
    fn lsn_range(&self) -> &Range<Lsn> {
        match self {
            MockLayer::Delta(this) => this.lsn_range(),
            MockLayer::Image(this) => this.lsn_range(),
        }
    }
    fn file_size(&self) -> u64 {
        match self {
            MockLayer::Delta(this) => this.file_size(),
            MockLayer::Image(this) => this.file_size(),
        }
    }
    fn short_id(&self) -> String {
        match self {
            MockLayer::Delta(this) => this.short_id(),
            MockLayer::Image(this) => this.short_id(),
        }
    }

    fn is_delta(&self) -> bool {
        match self {
            MockLayer::Delta(_) => true,
            MockLayer::Image(_) => false,
        }
    }
}

impl MockLayer {
    fn is_deleted(&self) -> bool {
        let guard = match self {
            MockLayer::Delta(this) => this.deleted.lock().unwrap(),
            MockLayer::Image(this) => this.deleted.lock().unwrap(),
        };
        *guard
    }
    fn mark_deleted(&self) {
        let mut deleted_guard = match self {
            MockLayer::Delta(this) => this.deleted.lock().unwrap(),
            MockLayer::Image(this) => this.deleted.lock().unwrap(),
        };
        assert!(!*deleted_guard, "layer already deleted");
        *deleted_guard = true;
    }
}

impl From<&Arc<MockDeltaLayer>> for MockLayer {
    fn from(l: &Arc<MockDeltaLayer>) -> Self {
        MockLayer::Delta(l.clone())
    }
}

impl From<&Arc<MockImageLayer>> for MockLayer {
    fn from(l: &Arc<MockImageLayer>) -> Self {
        MockLayer::Image(l.clone())
    }
}

#[async_trait]
impl interface::CompactionJobExecutor for MockTimeline {
    type Key = Key;
    type Layer = MockLayer;
    type DeltaLayer = Arc<MockDeltaLayer>;
    type ImageLayer = Arc<MockImageLayer>;
    type RequestContext = MockRequestContext;

    async fn get_layers(
        &mut self,
        key_range: &Range<Self::Key>,
        lsn_range: &Range<Lsn>,
        _ctx: &Self::RequestContext,
    ) -> anyhow::Result<Vec<Self::Layer>> {
        // Clear any deleted layers from our vec
        self.live_layers.retain(|l| !l.is_deleted());

        let layers: Vec<MockLayer> = self
            .live_layers
            .iter()
            .filter(|l| {
                overlaps_with(l.lsn_range(), lsn_range) && overlaps_with(l.key_range(), key_range)
            })
            .cloned()
            .collect();

        Ok(layers)
    }

    async fn get_keyspace(
        &mut self,
        key_range: &Range<Self::Key>,
        _lsn: Lsn,
        _ctx: &Self::RequestContext,
    ) -> anyhow::Result<interface::CompactionKeySpace<Key>> {
        // find it in the levels
        if self.old_keyspaces.is_empty() {
            Ok(crate::helpers::intersect_keyspace(
                &self.keyspace,
                key_range,
            ))
        } else {
            // not implemented

            // The mock implementation only allows requesting the
            // keyspace at the level's end LSN. That's all that the
            // current implementation needs.
            panic!("keyspace not available for requested lsn");
        }
    }

    async fn downcast_delta_layer(
        &self,
        layer: &MockLayer,
    ) -> anyhow::Result<Option<Arc<MockDeltaLayer>>> {
        Ok(match layer {
            MockLayer::Delta(l) => Some(l.clone()),
            MockLayer::Image(_) => None,
        })
    }

    async fn create_image(
        &mut self,
        lsn: Lsn,
        key_range: &Range<Key>,
        ctx: &MockRequestContext,
    ) -> anyhow::Result<()> {
        let keyspace = self.get_keyspace(key_range, lsn, ctx).await?;

        let mut accum_size: u64 = 0;
        for r in keyspace {
            accum_size += r.end - r.start;
        }

        let new_layer = Arc::new(MockImageLayer {
            key_range: key_range.clone(),
            lsn_range: lsn..lsn,
            file_size: accum_size * 8192,
            deleted: Mutex::new(false),
        });
        info!(
            "created image layer, size {}: {}",
            new_layer.file_size,
            new_layer.short_id()
        );
        self.live_layers.push(MockLayer::Image(new_layer.clone()));

        // update stats
        self.bytes_written += new_layer.file_size;
        self.layers_created += 1;

        self.time += 1;
        self.history.push(LayerTraceEvent {
            time_rel: self.time,
            op: LayerTraceOp::CreateImage,
            file: LayerTraceFile {
                filename: new_layer.short_id(),
                key_range: new_layer.key_range.clone(),
                lsn_range: new_layer.lsn_range.clone(),
            },
        });

        Ok(())
    }

    async fn create_delta(
        &mut self,
        lsn_range: &Range<Lsn>,
        key_range: &Range<Key>,
        input_layers: &[Arc<MockDeltaLayer>],
        ctx: &MockRequestContext,
    ) -> anyhow::Result<()> {
        let mut key_value_stream =
            std::pin::pin!(merge_delta_keys::<MockTimeline>(input_layers, ctx));
        let mut records: Vec<MockRecord> = Vec::new();
        let mut total_len = 2;
        while let Some(delta_entry) = key_value_stream.next().await {
            let delta_entry: MockRecord = delta_entry?;
            if key_range.contains(&delta_entry.key) && lsn_range.contains(&delta_entry.lsn) {
                total_len += delta_entry.len;
                records.push(delta_entry);
            }
        }
        let total_records = records.len();
        let new_layer = Arc::new(MockDeltaLayer {
            key_range: key_range.clone(),
            lsn_range: lsn_range.clone(),
            file_size: total_len,
            records,
            deleted: Mutex::new(false),
        });
        info!(
            "created delta layer, recs {}, size {}: {}",
            total_records,
            total_len,
            new_layer.short_id()
        );
        self.live_layers.push(MockLayer::Delta(new_layer.clone()));

        // update stats
        self.bytes_written += total_len;
        self.layers_created += 1;

        self.time += 1;
        self.history.push(LayerTraceEvent {
            time_rel: self.time,
            op: LayerTraceOp::CreateDelta,
            file: LayerTraceFile {
                filename: new_layer.short_id(),
                key_range: new_layer.key_range.clone(),
                lsn_range: new_layer.lsn_range.clone(),
            },
        });

        Ok(())
    }

    async fn delete_layer(
        &mut self,
        layer: &Self::Layer,
        _ctx: &MockRequestContext,
    ) -> anyhow::Result<()> {
        let layer = std::pin::pin!(layer);
        info!("deleting layer: {}", layer.short_id());
        self.num_deleted_layers += 1;
        self.bytes_deleted += layer.file_size();
        layer.mark_deleted();

        self.time += 1;
        self.history.push(LayerTraceEvent {
            time_rel: self.time,
            op: LayerTraceOp::Delete,
            file: LayerTraceFile {
                filename: layer.short_id(),
                key_range: layer.key_range().clone(),
                lsn_range: layer.lsn_range().clone(),
            },
        });

        Ok(())
    }
}
@@ -1,411 +0,0 @@
use super::Key;
use anyhow::Result;
use std::cmp::Ordering;
use std::{
    collections::{BTreeMap, BTreeSet, HashSet},
    fmt::Write,
    ops::Range,
};
use svg_fmt::{rgb, BeginSvg, EndSvg, Fill, Stroke, Style};
use utils::lsn::Lsn;

// Map values to their compressed coordinate - the index the value
// would have in a sorted and deduplicated list of all values.
struct CoordinateMap<T: Ord + Copy> {
    map: BTreeMap<T, usize>,
    stretch: f32,
}

impl<T: Ord + Copy> CoordinateMap<T> {
    fn new(coords: Vec<T>, stretch: f32) -> Self {
        let set: BTreeSet<T> = coords.into_iter().collect();

        let mut map: BTreeMap<T, usize> = BTreeMap::new();
        for (i, e) in set.iter().enumerate() {
            map.insert(*e, i);
        }

        Self { map, stretch }
    }

    // This assumes that the map contains an exact point for this.
    // Use `_map_inexact` for values in between.
    fn map(&self, val: T) -> f32 {
        *self.map.get(&val).unwrap() as f32 * self.stretch
    }

    // the value is still assumed to be within the min/max bounds
    // (this is currently unused)
    fn _map_inexact(&self, val: T) -> f32 {
        let prev = *self.map.range(..=val).next().unwrap().1;
        let next = *self.map.range(val..).next().unwrap().1;

        // interpolate
        (prev as f32 + (next - prev) as f32) * self.stretch
    }

    fn max(&self) -> f32 {
        self.map.len() as f32 * self.stretch
    }
}
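
// Editor's sketch, not part of the diff: what the compression above does.
// Only the rank order of the inputs survives; duplicates collapse, gaps
// vanish, and everything is scaled by `stretch`.
#[allow(dead_code)]
fn coordinate_map_sketch() {
    let m = CoordinateMap::new(vec![10u64, 500, 10, 7], 2.0);
    // sorted + deduplicated: [7, 10, 500] -> indices 0, 1, 2
    assert_eq!(m.map(7), 0.0);
    assert_eq!(m.map(10), 2.0);
    assert_eq!(m.map(500), 4.0);
    assert_eq!(m.max(), 6.0);
}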

#[derive(PartialEq, Hash, Eq)]
pub enum LayerTraceOp {
    Flush,
    CreateDelta,
    CreateImage,
    Delete,
}

impl std::fmt::Display for LayerTraceOp {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        let op_str = match self {
            LayerTraceOp::Flush => "flush",
            LayerTraceOp::CreateDelta => "create_delta",
            LayerTraceOp::CreateImage => "create_image",
            LayerTraceOp::Delete => "delete",
        };
        f.write_str(op_str)
    }
}

#[derive(PartialEq, Hash, Eq, Clone)]
pub struct LayerTraceFile {
    pub filename: String,
    pub key_range: Range<Key>,
    pub lsn_range: Range<Lsn>,
}

impl LayerTraceFile {
    fn is_image(&self) -> bool {
        self.lsn_range.end == self.lsn_range.start
    }
}

pub struct LayerTraceEvent {
    pub time_rel: u64,
    pub op: LayerTraceOp,
    pub file: LayerTraceFile,
}

pub fn draw_history<W: std::io::Write>(history: &[LayerTraceEvent], mut output: W) -> Result<()> {
    let mut files: Vec<LayerTraceFile> = Vec::new();

    for event in history {
        files.push(event.file.clone());
    }
    let last_time_rel = history.last().unwrap().time_rel;

    // Collect all coordinates
    let mut keys: Vec<Key> = vec![];
    let mut lsns: Vec<Lsn> = vec![];
    for f in files.iter() {
        keys.push(f.key_range.start);
        keys.push(f.key_range.end);
        lsns.push(f.lsn_range.start);
        lsns.push(f.lsn_range.end);
    }

    // Analyze
    let key_map = CoordinateMap::new(keys, 2.0);
    // Stretch out vertically for better visibility
    let lsn_map = CoordinateMap::new(lsns, 3.0);

    let mut svg = String::new();

    // Draw
    writeln!(
        svg,
        "{}",
        BeginSvg {
            w: key_map.max(),
            h: lsn_map.max(),
        }
    )?;
    let lsn_max = lsn_map.max();

    // Sort the files by LSN, but so that image layers go after all delta layers.
    // The SVG is painted in the order the elements appear, and we want to draw
    // image layers on top of the delta layers if they overlap.
    //
    // (This could also be implemented via z coordinates: image layers get one z
    // coord, delta layers get another z coord.)
    let mut files_sorted: Vec<LayerTraceFile> = files.into_iter().collect();
    files_sorted.sort_by(|a, b| {
        if a.is_image() && !b.is_image() {
            Ordering::Greater
        } else if !a.is_image() && b.is_image() {
            Ordering::Less
        } else {
            a.lsn_range.end.cmp(&b.lsn_range.end)
        }
    });

    writeln!(svg, "<!-- layers -->")?;
    let mut files_seen = HashSet::new();
    for f in files_sorted {
        if files_seen.contains(&f) {
            continue;
        }
        let key_start = key_map.map(f.key_range.start);
        let key_end = key_map.map(f.key_range.end);
        let key_diff = key_end - key_start;

        if key_start >= key_end {
            panic!("Invalid key range {}-{}", key_start, key_end);
        }

        let lsn_start = lsn_map.map(f.lsn_range.start);
        let lsn_end = lsn_map.map(f.lsn_range.end);

        // Fill in and thicken rectangle if it's an
        // image layer so that we can see it.
        let mut style = Style::default();
        style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
        style.stroke = Stroke::Color(rgb(0, 0, 0), 0.5);

        let y_start = lsn_max - lsn_start;
        let y_end = lsn_max - lsn_end;

        let x_margin = 0.25;
        let y_margin = 0.5;

        match f.lsn_range.start.cmp(&f.lsn_range.end) {
            Ordering::Less => {
                write!(
                    svg,
                    r#"    <rect id="layer_{}" x="{}" y="{}" width="{}" height="{}" ry="{}" style="{}">"#,
                    f.filename,
                    key_start + x_margin,
                    y_end + y_margin,
                    key_diff - x_margin * 2.0,
                    y_start - y_end - y_margin * 2.0,
                    1.0, // border_radius,
                    style,
                )?;
                write!(svg, "<title>{}</title>", f.filename)?;
                writeln!(svg, "</rect>")?;
            }
            Ordering::Equal => {
                //lsn_diff = 0.3;
                //lsn_offset = -lsn_diff / 2.0;
                //margin = 0.05;
                style.fill = Fill::Color(rgb(0x80, 0, 0x80));
                style.stroke = Stroke::Color(rgb(0x80, 0, 0x80), 3.0);
                write!(
                    svg,
                    r#"    <line id="layer_{}" x1="{}" y1="{}" x2="{}" y2="{}" style="{}">"#,
                    f.filename,
                    key_start + x_margin,
                    y_end,
                    key_end - x_margin,
                    y_end,
                    style,
                )?;
                write!(
                    svg,
                    "<title>{}<br>{} - {}</title>",
                    f.filename, lsn_end, y_end
                )?;
                writeln!(svg, "</line>")?;
            }
            Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end),
        }
        files_seen.insert(f);
    }

    let mut record_style = Style::default();
    record_style.fill = Fill::Color(rgb(0x80, 0x80, 0x80));
    record_style.stroke = Stroke::None;

    writeln!(svg, "{}", EndSvg)?;

    let mut layer_events_str = String::new();
    let mut first = true;
    for e in history {
        if !first {
            writeln!(layer_events_str, ",")?;
        }
        write!(
            layer_events_str,
            r#"    {{"time_rel": {}, "filename": "{}", "op": "{}"}}"#,
            e.time_rel, e.file.filename, e.op
        )?;
        first = false;
    }
    writeln!(layer_events_str)?;

    writeln!(
        output,
        r#"<!DOCTYPE html>
<html>
<head>
<style>
/* Keep the slider pinned at top */
.topbar {{
  display: block;
  overflow: hidden;
  background-color: lightgrey;
  position: fixed;
  top: 0;
  width: 100%;
  /* width: 500px; */
}}
.slidercontainer {{
  float: left;
  width: 50%;
  margin-right: 200px;
}}
.slider {{
  float: left;
  width: 100%;
}}
.legend {{
  width: 200px;
  float: right;
}}

/* Main content */
.main {{
  margin-top: 50px; /* Add a top margin to avoid content overlay */
}}
</style>
</head>

<body onload="init()">
<script type="text/javascript">

var layer_events = [{layer_events_str}]

let ticker;

function init() {{
    for (let i = 0; i < layer_events.length; i++) {{
        var layer = document.getElementById("layer_" + layer_events[i].filename);
        layer.style.visibility = "hidden";
    }}
    last_layer_event = -1;
    moveSlider(last_slider_pos)
}}

function startAnimation() {{
    ticker = setInterval(animateStep, 100);
}}
function stopAnimation() {{
    clearInterval(ticker);
}}

function animateStep() {{
    if (last_layer_event < layer_events.length - 1) {{
        var slider = document.getElementById("time-slider");
        let prevPos = slider.value
        let nextEvent = last_layer_event + 1
        while (nextEvent <= layer_events.length - 1) {{
            if (layer_events[nextEvent].time_rel > prevPos) {{
                break;
            }}
            nextEvent += 1;
        }}
        let nextPos = layer_events[nextEvent].time_rel
        slider.value = nextPos
        moveSlider(nextPos)
    }}
}}

function redoLayerEvent(n, dir) {{
    var layer = document.getElementById("layer_" + layer_events[n].filename);
    switch (layer_events[n].op) {{
        case "flush":
            layer.style.visibility = "visible";
            break;
        case "create_delta":
            layer.style.visibility = "visible";
            break;
        case "create_image":
            layer.style.visibility = "visible";
            break;
        case "delete":
            layer.style.visibility = "hidden";
            break;
    }}
}}
function undoLayerEvent(n) {{
    var layer = document.getElementById("layer_" + layer_events[n].filename);
    switch (layer_events[n].op) {{
        case "flush":
            layer.style.visibility = "hidden";
            break;
        case "create_delta":
            layer.style.visibility = "hidden";
            break;
        case "create_image":
            layer.style.visibility = "hidden";
            break;
        case "delete":
            layer.style.visibility = "visible";
            break;
    }}
}}

var last_slider_pos = 0
var last_layer_event = 0

var moveSlider = function(new_pos) {{
    if (new_pos > last_slider_pos) {{
        while (last_layer_event < layer_events.length - 1) {{
            if (layer_events[last_layer_event + 1].time_rel > new_pos) {{
                break;
            }}
            last_layer_event += 1;
            redoLayerEvent(last_layer_event)
        }}
    }}
    if (new_pos < last_slider_pos) {{
        while (last_layer_event >= 0) {{
            if (layer_events[last_layer_event].time_rel <= new_pos) {{
                break;
            }}
            undoLayerEvent(last_layer_event)
            last_layer_event -= 1;
        }}
    }}
    last_slider_pos = new_pos;
    document.getElementById("debug_pos").textContent=new_pos;
    if (last_layer_event >= 0) {{
        document.getElementById("debug_layer_event").textContent=last_layer_event + " " + layer_events[last_layer_event].time_rel + " " + layer_events[last_layer_event].op;
    }} else {{
        document.getElementById("debug_layer_event").textContent="begin";
    }}
}}
</script>

<div class="topbar">
  <div class="slidercontainer">
    <label for="time-slider">TIME</label>:
    <input id="time-slider" class="slider" type="range" min="0" max="{last_time_rel}" value="0" oninput="moveSlider(this.value)"><br>

    pos: <span id="debug_pos"></span><br>
    event: <span id="debug_layer_event"></span><br>
    gc: <span id="debug_gc_event"></span><br>
  </div>

  <button onclick="startAnimation()">Play</button>
  <button onclick="stopAnimation()">Stop</button>

  <svg class="legend">
    <rect x=5 y=0 width=20 height=20 style="fill:rgb(128,128,128);stroke:rgb(0,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
    <line x1=5 y1=30 x2=25 y2=30 style="fill:rgb(128,0,128);stroke:rgb(128,0,128);stroke-width:3;fill-opacity:1;stroke-opacity:1;"/>
    <line x1=0 y1=40 x2=30 y2=40 style="fill:none;stroke:rgb(255,0,0);stroke-width:0.5;fill-opacity:1;stroke-opacity:1;"/>
  </svg>
</div>

<div class="main">
{svg}
</div>
</body>
</html>
"#
    )?;

    Ok(())
}
@@ -1,37 +0,0 @@
use pageserver_compaction::interface::CompactionLayer;
use pageserver_compaction::simulator::MockTimeline;

/// Test the extreme case that there are so many updates for a single key that
/// even if we produce an extremely narrow delta layer, spanning just that one
/// key, we would still have too many records to fit in the target file size.
/// We need to split in the LSN dimension too in that case.
///
/// TODO: The code to avoid this problem has not been implemented yet! So the
/// assertion currently fails, but we need to make it not fail.
#[ignore]
#[tokio::test]
async fn test_many_updates_for_single_key() -> anyhow::Result<()> {
    let mut executor = MockTimeline::new();
    executor.target_file_size = 10_000_000; // 10 MB

    // Ingest 100 MB of updates to a single key.
    for _ in 1..1000 {
        executor.ingest_uniform(100, 10, &(0..100_000))?;
        executor.ingest_uniform(10_000, 10, &(0..1))?;
        executor.compact().await?;
    }

    // Check that all the layers are smaller than the target size (with some slop)
    for l in executor.live_layers.iter() {
        println!("layer {}: {}", l.short_id(), l.file_size());
    }
    for l in executor.live_layers.iter() {
        assert!(l.file_size() < executor.target_file_size * 2);
        // sanity check that none of the delta layers are stupidly small either
        if l.is_delta() {
            assert!(l.file_size() > executor.target_file_size / 2);
        }
    }

    Ok(())
}
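
/// Editor's sketch, not part of the diff: a companion smoke test that stays on
/// the implemented path (uniform keys only, no single-key hotspot), mirroring
/// the shape of the ignored test above.
#[tokio::test]
async fn uniform_ingest_smoke_sketch() -> anyhow::Result<()> {
    let mut executor = MockTimeline::new();
    executor.target_file_size = 10_000_000; // 10 MB

    for _ in 0..100 {
        executor.ingest_uniform(1_000, 10, &(0..100_000))?;
        executor.compact_if_needed().await?;
    }

    for l in executor.live_layers.iter() {
        println!("layer {}: {}", l.short_id(), l.file_size());
    }
    Ok(())
}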

@@ -880,13 +880,6 @@ impl PageServerConf {
        );
    }

    if let Some(compaction_algorithm) = item.get("compaction_algorithm") {
        t_conf.compaction_algorithm = Some(
            deserialize_from_item("compaction_algorithm", compaction_algorithm)
                .context("parse compaction_algorithm")?,
        );
    }

    if let Some(gc_horizon) = item.get("gc_horizon") {
        t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
    }

@@ -16,7 +16,7 @@ use tracing::*;
use utils::id::NodeId;

mod metrics;
use crate::consumption_metrics::metrics::MetricsKey;
use metrics::MetricsKey;
mod disk_cache;
mod upload;

@@ -57,10 +57,7 @@ impl ControlPlaneClient {

        if let Some(jwt) = &conf.control_plane_api_token {
            let mut headers = hyper::HeaderMap::new();
            headers.insert(
                "Authorization",
                format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
            );
            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
            client = client.default_headers(headers);
        }

@@ -10,7 +10,6 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
use camino::Utf8PathBuf;
@@ -272,9 +271,7 @@ impl DeletionHeader {
        let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
        VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
            .await
            .maybe_fatal_err("save deletion header")?;

        Ok(())
            .map_err(Into::into)
    }
}

@@ -363,7 +360,6 @@ impl DeletionList {
        let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
        VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
            .await
            .maybe_fatal_err("save deletion list")
            .map_err(Into::into)
    }
}

@@ -34,8 +34,6 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;

// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
@@ -197,7 +195,7 @@ impl ListWriter {
            debug!("Deletion header {header_path} not found, first start?");
            Ok(None)
        } else {
            on_fatal_io_error(&e, "reading deletion header");
            Err(anyhow::anyhow!(e))
        }
    }
}
@@ -218,9 +216,16 @@ impl ListWriter {
        self.pending.sequence = validated_sequence + 1;

        let deletion_directory = self.conf.deletion_prefix();
        let mut dir = tokio::fs::read_dir(&deletion_directory)
            .await
            .fatal_err("read deletion directory");
        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
            Ok(d) => d,
            Err(e) => {
                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");

                // Give up: if we can't read the deletion list directory, we probably can't
                // write lists into it later, so the queue won't work.
                return Err(e.into());
            }
        };

        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -228,7 +233,7 @@ impl ListWriter {
        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
        while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
        while let Some(dentry) = dir.next_entry().await? {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

@@ -241,9 +246,11 @@ impl ListWriter {
                info!("Cleaning up temporary file {dentry_str}");
                let absolute_path =
                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
                tokio::fs::remove_file(&absolute_path)
                    .await
                    .fatal_err("delete temp file");
                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
                    // Non-fatal error: we will just leave the file behind but not
                    // try and load it.
                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
                }

                continue;
            }
@@ -283,9 +290,7 @@ impl ListWriter {
        for s in seqs {
            let list_path = self.conf.deletion_list_path(s);

            let list_bytes = tokio::fs::read(&list_path)
                .await
                .fatal_err("read deletion list");
            let list_bytes = tokio::fs::read(&list_path).await?;

            let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
                Ok(l) => l,

@@ -28,7 +28,6 @@ use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::metrics;
use crate::virtual_file::MaybeFatalIo;

use super::deleter::DeleterMessage;
use super::DeletionHeader;
@@ -288,9 +287,16 @@ where
    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
            debug!("Removing deletion list {list_path}");
            tokio::fs::remove_file(&list_path)
                .await
                .fatal_err("remove deletion list");

            if let Err(e) = tokio::fs::remove_file(&list_path).await {
                // Unexpected: we should have permissions and nothing else should
                // be touching these files. We will leave the file behind. Subsequent
                // pageservers will try and load it again: hopefully whatever storage
                // issue (probably permissions) has been fixed by then.
                tracing::error!("Failed to delete {list_path}: {e:#}");
                metrics::DELETION_QUEUE.unexpected_errors.inc();
                break;
            }
        }
    }

@@ -569,17 +569,7 @@ paths:
            schema:
              $ref: "#/components/schemas/NotFoundError"
        "409":
          description: |
            The tenant is already known to Pageserver in some way,
            and hence this `/attach` call has been rejected.

            Some examples of how this can happen:
            - tenant was created on this pageserver
            - tenant attachment was started by an earlier call to `/attach`.

            Callers should poll the tenant status's `attachment_status` field,
            like for status 202. See the longer description for `POST /attach`
            for details.
          description: Tenant download is already in progress
          content:
            application/json:
              schema:

@@ -172,21 +172,6 @@ impl Key {
    }
}

impl pageserver_compaction::interface::CompactionKey for Key {
    const MIN: Self = Self::MIN;
    const MAX: Self = Self::MAX;

    fn key_range_size(r: &std::ops::Range<Self>) -> u32 {
        key_range_size(r)
    }
    fn next(&self) -> Key {
        (self as &Key).next()
    }
    fn skip_some(&self) -> Key {
        self.add(128)
    }
}

/// A 'value' stored for one Key.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum Value {

@@ -3443,7 +3443,6 @@ pub(crate) mod harness {
            compaction_target_size: Some(tenant_conf.compaction_target_size),
            compaction_period: Some(tenant_conf.compaction_period),
            compaction_threshold: Some(tenant_conf.compaction_threshold),
            compaction_algorithm: Some(tenant_conf.compaction_algorithm),
            gc_horizon: Some(tenant_conf.gc_horizon),
            gc_period: Some(tenant_conf.gc_period),
            image_creation_threshold: Some(tenant_conf.image_creation_threshold),

@@ -23,17 +23,12 @@ pub mod defaults {
    pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
    pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m";

    // FIXME the below configs are only used by legacy algorithm. The new algorithm
    // has different parameters.

    // Target file size, when creating image and delta layers.
    // This parameter determines L1 layer file size.
    pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;

    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
    pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm =
        super::CompactionAlgorithm::Legacy;

    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;

@@ -275,7 +270,6 @@ pub struct TenantConf {
    pub compaction_period: Duration,
    // Level0 delta layer threshold for compaction.
    pub compaction_threshold: usize,
    pub compaction_algorithm: CompactionAlgorithm,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is #of bytes of WAL.
@@ -339,10 +333,6 @@ pub struct TenantConfOpt {
    #[serde(default)]
    pub compaction_threshold: Option<usize>,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub compaction_algorithm: Option<CompactionAlgorithm>,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub gc_horizon: Option<u64>,
@@ -397,13 +387,6 @@ pub struct TenantConfOpt {
    pub gc_feedback: Option<bool>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum CompactionAlgorithm {
    Legacy,
    Tiered,
}
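
// Editor's sketch, not part of the diff: with `#[serde(tag = "kind")]`, the
// enum round-trips as an object whose only member is the tag. serde_json is
// assumed available here purely to illustrate the wire format that the
// `compaction_algorithm` setting parses.
#[allow(dead_code)]
fn compaction_algorithm_serde_sketch() {
    let json = serde_json::to_string(&CompactionAlgorithm::Tiered).unwrap();
    assert_eq!(json, r#"{"kind":"Tiered"}"#);
    let back: CompactionAlgorithm = serde_json::from_str(&json).unwrap();
    assert_eq!(back, CompactionAlgorithm::Tiered);
}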

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "kind")]
pub enum EvictionPolicy {
@@ -446,9 +429,6 @@ impl TenantConfOpt {
            compaction_threshold: self
                .compaction_threshold
                .unwrap_or(global_conf.compaction_threshold),
            compaction_algorithm: self
                .compaction_algorithm
                .unwrap_or(global_conf.compaction_algorithm),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
            image_creation_threshold: self
@@ -488,7 +468,6 @@ impl Default for TenantConf {
            compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
                .expect("cannot parse default compaction period"),
            compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
            compaction_algorithm: DEFAULT_COMPACTION_ALGORITHM,
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
@@ -577,12 +556,6 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {

        tenant_conf.compaction_target_size = request_data.compaction_target_size;
        tenant_conf.compaction_threshold = request_data.compaction_threshold;
        if let Some(compaction_algorithm) = &request_data.compaction_algorithm {
            tenant_conf.compaction_algorithm = Some(
                serde::Deserialize::deserialize(compaction_algorithm)
                    .context("parse field `compaction_algorithm`")?,
            );
        }

        if let Some(compaction_period) = &request_data.compaction_period {
            tenant_conf.compaction_period = Some(

@@ -1542,7 +1542,7 @@ pub fn remote_index_path(
}

/// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
    let file_name = match path.get_path().file_name() {
        Some(f) => f,
        None => {

@@ -155,7 +155,7 @@ pub struct IndexLayerMetadata {

    #[serde(default = "Generation::none")]
    #[serde(skip_serializing_if = "Generation::is_none")]
    pub generation: Generation,
    pub(super) generation: Generation,
}

impl From<LayerFileMetadata> for IndexLayerMetadata {

@@ -882,15 +882,3 @@ impl AsRef<DeltaLayerInner> for DeltaLayerInner {
        self
    }
}

impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for DeltaEntry<'a> {
    fn key(&self) -> Key {
        self.key
    }
    fn lsn(&self) -> Lsn {
        self.lsn
    }
    fn size(&self) -> u64 {
        self.size
    }
}

@@ -125,7 +125,6 @@ impl Layer {
        let inner = Arc::new(DownloadedLayer {
            owner: owner.clone(),
            kind: tokio::sync::OnceCell::default(),
            version: 0,
        });
        resident = Some(inner.clone());

@@ -164,7 +163,6 @@ impl Layer {
        let inner = Arc::new(DownloadedLayer {
            owner: owner.clone(),
            kind: tokio::sync::OnceCell::default(),
            version: 0,
        });
        resident = Some(inner.clone());
        let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
@@ -330,46 +328,42 @@ impl Layer {
/// read with [`Layer::get_value_reconstruct_data`].
///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)]
enum ResidentOrWantedEvicted {
    Resident(Arc<DownloadedLayer>),
    WantedEvicted(Weak<DownloadedLayer>, usize),
    WantedEvicted(Weak<DownloadedLayer>),
}

impl ResidentOrWantedEvicted {
    fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
    fn get(&self) -> Option<Arc<DownloadedLayer>> {
        match self {
            ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
            ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
            ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
            ResidentOrWantedEvicted::WantedEvicted(weak) => match weak.upgrade() {
                Some(strong) => {
                    LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();

                    *self = ResidentOrWantedEvicted::Resident(strong.clone());

                    Some((strong, true))
                    Some(strong)
                }
                None => None,
            },
        }
    }

    /// When eviction is first requested, drop down to holding a [`Weak`].
    ///
    /// Returns `Some` if this was the first time eviction was requested. Care should be taken to
    /// drop the possibly last strong reference outside of the mutex of
    /// heavier_once_cell::OnceCell.
    fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
        match self {
    /// Returns `true` if this was the first time eviction was requested.
    fn downgrade(&mut self) -> &Weak<DownloadedLayer> {
        let _was_first = match self {
            ResidentOrWantedEvicted::Resident(strong) => {
                let weak = Arc::downgrade(strong);
                let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
                std::mem::swap(self, &mut temp);
                match temp {
                    ResidentOrWantedEvicted::Resident(strong) => Some(strong),
                    ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
                }
                *self = ResidentOrWantedEvicted::WantedEvicted(weak);
                // returning the weak is not useful, because the drop could have already run with
                // the replacement above, and that will take care of cleaning the Option we are in
                true
            }
            ResidentOrWantedEvicted::WantedEvicted(..) => None,
            ResidentOrWantedEvicted::WantedEvicted(_) => false,
        };

        match self {
            ResidentOrWantedEvicted::WantedEvicted(ref weak) => weak,
            _ => unreachable!("just wrote wanted evicted"),
        }
    }
}
@@ -404,17 +398,11 @@ struct LayerInner {
    /// [`LayerInner::on_downloaded_layer_drop`].
    wanted_evicted: AtomicBool,

    /// Version is to make sure we will only evict a specific download of a file.
    ///
    /// Incremented for each download, stored in `DownloadedLayer::version` or
    /// `ResidentOrWantedEvicted::WantedEvicted`.
    /// Version is to make sure we will in fact only evict a file if no new download has been
    /// started.
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted.
    ///
    /// If in future we need to implement "wait until layer instances are gone and done", carrying
    /// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
    /// method for "wait_gc" which will wait to this being closed.
    status: tokio::sync::broadcast::Sender<Status>,

    /// Counter for exponential backoff with the download
@@ -527,14 +515,6 @@ impl LayerInner {
            .timeline_path(&timeline.tenant_id, &timeline.timeline_id)
            .join(desc.filename().to_string());

        let (inner, version) = if let Some(inner) = downloaded {
            let version = inner.version;
            let resident = ResidentOrWantedEvicted::Resident(inner);
            (heavier_once_cell::OnceCell::new(resident), version)
        } else {
            (heavier_once_cell::OnceCell::default(), 0)
        };

        LayerInner {
            conf,
            path,
@@ -544,8 +524,12 @@ impl LayerInner {
            access_stats,
            wanted_garbage_collected: AtomicBool::new(false),
            wanted_evicted: AtomicBool::new(false),
            inner,
            version: AtomicUsize::new(version),
            inner: if let Some(inner) = downloaded {
                heavier_once_cell::OnceCell::new(ResidentOrWantedEvicted::Resident(inner))
            } else {
                heavier_once_cell::OnceCell::default()
            },
            version: AtomicUsize::new(0),
            status: tokio::sync::broadcast::channel(1).0,
            consecutive_failures: AtomicUsize::new(0),
            generation,
@@ -565,8 +549,6 @@ impl LayerInner {
        }
    }

    /// Cancellation safe, however dropping the future and calling this method again might result
    /// in a new attempt to evict OR join the previously started attempt.
    pub(crate) async fn evict_and_wait(
        &self,
        _: &RemoteTimelineClient,
@@ -577,22 +559,20 @@ impl LayerInner {

        let mut rx = self.status.subscribe();

        let strong = {
            match self.inner.get() {
                Some(mut either) => {
                    self.wanted_evicted.store(true, Ordering::Relaxed);
                    either.downgrade()
                }
                None => return Err(EvictionError::NotFound),
            }
        };
        let res =
            self.wanted_evicted
                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

        if strong.is_some() {
            // drop the DownloadedLayer outside of the holding the guard
            drop(strong);
        if res.is_ok() {
            LAYER_IMPL_METRICS.inc_started_evictions();
        }

        if self.get().is_none() {
            // it was not evictable in the first place
            // our store to the wanted_evicted does not matter; it will be reset by next download
            return Err(EvictionError::NotFound);
        }

        match rx.recv().await {
            Ok(Status::Evicted) => Ok(()),
            Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
@@ -606,8 +586,7 @@ impl LayerInner {
                //
                // use however late (compared to the initial expressing of wanted) as the
                // "outcome" now
                LAYER_IMPL_METRICS.inc_broadcast_lagged();
                match self.inner.get() {
                match self.get() {
                    Some(_) => Err(EvictionError::Downloaded),
                    None => Ok(()),
                }
@@ -615,19 +594,17 @@ impl LayerInner {
        }
    }

    /// Cancellation safe.
    #[tracing::instrument(skip_all, fields(layer=%self))]
    /// Should be cancellation safe, but cancellation is troublesome together with the spawned
    /// download.
    async fn get_or_maybe_download(
        self: &Arc<Self>,
        allow_download: bool,
        ctx: Option<&RequestContext>,
    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
        let mut init_permit = None;

        loop {
            let download = move |permit| async move {
            let download = move || async move {
                // disable any scheduled but not yet running eviction deletions for this
                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
                self.version.fetch_add(1, Ordering::Relaxed);

                // no need to make the evict_and_wait wait for the actual download to complete
                drop(self.status.send(Status::Downloaded));
@@ -646,11 +623,7 @@ impl LayerInner {
                    .await
                    .map_err(DownloadError::PreStatFailed)?;

                let permit = if let Some(reason) = needs_download {
                    if let NeedsDownload::NotFile(ft) = reason {
                        return Err(DownloadError::NotFile(ft));
                    }

                if let Some(reason) = needs_download {
                    // only reset this after we've decided we really need to download. otherwise it'd
                    // be impossible to mark cancelled downloads for eviction, like one could imagine
                    // we would like to do for prefetching which was not needed.
@@ -660,6 +633,8 @@ impl LayerInner {
                        return Err(DownloadError::NoRemoteStorage);
                    }

                    tracing::debug!(%reason, "downloading layer");

                    if let Some(ctx) = ctx {
                        self.check_expected_download(ctx)?;
                    }
@@ -670,21 +645,16 @@ impl LayerInner {
                        return Err(DownloadError::DownloadRequired);
                    }

                    tracing::info!(%reason, "downloading on-demand");

                    self.spawn_download_and_wait(timeline, permit).await?
                    self.spawn_download_and_wait(timeline).await?;
                } else {
                    // the file is present locally, probably by a previous but cancelled call to
                    // get_or_maybe_download. alternatively we might be running without remote storage.
                    LAYER_IMPL_METRICS.inc_init_needed_no_download();

                    permit
                };
                }

                let res = Arc::new(DownloadedLayer {
                    owner: Arc::downgrade(self),
                    kind: tokio::sync::OnceCell::default(),
                    version: next_version,
                });

                self.access_stats.record_residence_event(
@@ -692,60 +662,19 @@ impl LayerInner {
                    LayerResidenceEventReason::ResidenceChange,
                );

                let waiters = self.inner.initializer_count();
                if waiters > 0 {
                    tracing::info!(waiters, "completing the on-demand download for other tasks");
                }

                Ok((ResidentOrWantedEvicted::Resident(res), permit))
                Ok(ResidentOrWantedEvicted::Resident(res))
            };

            if let Some(init_permit) = init_permit.take() {
                // use the already held initialization permit because it is impossible to hit the
                // below paths anymore essentially limiting the max loop iterations to 2.
                let (value, init_permit) = download(init_permit).await?;
                let mut guard = self.inner.set(value, init_permit);
                let (strong, _upgraded) = guard
                    .get_and_upgrade()
                    .expect("init creates strong reference, we held the init permit");
            let locked = self.inner.get_or_init(download).await?;

            if let Some(strong) = Self::get_or_apply_evictedness(Some(locked), &self.wanted_evicted)
            {
                return Ok(strong);
            }

            let (weak, permit) = {
                let mut locked = self.inner.get_or_init(download).await?;

                if let Some((strong, upgraded)) = locked.get_and_upgrade() {
                    if upgraded {
                        // when upgraded back, the Arc<DownloadedLayer> is still available, but
                        // previously a `evict_and_wait` was received.
                        self.wanted_evicted.store(false, Ordering::Relaxed);

                        // error out any `evict_and_wait`
                        drop(self.status.send(Status::Downloaded));
                        LAYER_IMPL_METRICS
                            .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
                    }

                    return Ok(strong);
                } else {
                    // path to here: the evict_blocking is stuck on spawn_blocking queue.
                    //
                    // reset the contents, deactivating the eviction and causing a
                    // EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
                    locked.take_and_deinit()
                }
            };

            // unlock first, then drop the weak, but because upgrade failed, we
            // know it cannot be a problem.

            assert!(
                matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
                "unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
            );

            init_permit = Some(permit);

            // the situation in which we might need to retry is that our init was ready
            // immediately, but the DownloadedLayer had been dropped BUT failed to complete
            // Self::evict_blocking
            LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
        }
    }
@@ -757,8 +686,8 @@ impl LayerInner {
        match b {
            Download => Ok(()),
            Warn | Error => {
                tracing::info!(
                    "unexpectedly on-demand downloading for task kind {:?}",
                tracing::warn!(
                    "unexpectedly on-demand downloading remote layer {self} for task kind {:?}",
                    ctx.task_kind()
                );
                crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
@@ -780,17 +709,14 @@ impl LayerInner {
    async fn spawn_download_and_wait(
        self: &Arc<Self>,
        timeline: Arc<Timeline>,
        permit: heavier_once_cell::InitPermit,
    ) -> Result<heavier_once_cell::InitPermit, DownloadError> {
    ) -> Result<(), DownloadError> {
        let task_name = format!("download layer {}", self);

        let (tx, rx) = tokio::sync::oneshot::channel();

        // this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
        // block tenant::mgr::remove_tenant_from_memory.

        let this: Arc<Self> = self.clone();

        crate::task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            crate::task_mgr::TaskKind::RemoteDownloadTask,
@@ -799,7 +725,6 @@ impl LayerInner {
            &task_name,
            false,
            async move {

                let client = timeline
                    .remote_client
                    .as_ref()
@@ -821,9 +746,9 @@ impl LayerInner {
                    }
                };

                if let Err(res) = tx.send((result, permit)) {
                if let Err(res) = tx.send(result) {
                    match res {
                        (Ok(()), _) => {
                        Ok(()) => {
                            // our caller is cancellation safe so this is fine; if someone
                            // else requests the layer, they'll find it already downloaded
                            // or redownload.
@@ -834,7 +759,7 @@ impl LayerInner {
                            tracing::info!("layer file download completed after requester had cancelled");
                            LAYER_IMPL_METRICS.inc_download_completed_without_requester();
                        },
                        (Err(e), _) => {
                        Err(e) => {
                            // our caller is cancellation safe, but we might be racing with
                            // another attempt to initialize. before we have cancellation
                            // token support: these attempts should converge regardless of
@@ -850,7 +775,7 @@ impl LayerInner {
            .in_current_span(),
        );
        match rx.await {
            Ok((Ok(()), permit)) => {
            Ok(Ok(())) => {
                if let Some(reason) = self
                    .needs_download()
                    .await
@@ -861,12 +786,10 @@ impl LayerInner {
                }

                self.consecutive_failures.store(0, Ordering::Relaxed);
                tracing::info!("on-demand download successful");

                Ok(permit)
                Ok(())
            }
            Ok((Err(e), _permit)) => {
                // FIXME: this should be with the spawned task and be cancellation sensitive
            Ok(Err(e)) => {
                let consecutive_failures =
                    self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -884,6 +807,33 @@ impl LayerInner {
        }
    }

    /// Access the current state without waiting for the file to be downloaded.
    ///
    /// Requires that we've initialized to state which is respective to the
    /// actual residency state.
    fn get(&self) -> Option<Arc<DownloadedLayer>> {
        let locked = self.inner.get();
        Self::get_or_apply_evictedness(locked, &self.wanted_evicted)
    }

    fn get_or_apply_evictedness(
        guard: Option<heavier_once_cell::Guard<'_, ResidentOrWantedEvicted>>,
        wanted_evicted: &AtomicBool,
    ) -> Option<Arc<DownloadedLayer>> {
        if let Some(mut x) = guard {
            if let Some(won) = x.get() {
                // there are no guarantees that we will always get to observe a concurrent call
                // to evict
                if wanted_evicted.load(Ordering::Acquire) {
                    x.downgrade();
                }
                return Some(won);
            }
        }

        None
    }

    async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
        match tokio::fs::metadata(&self.path).await {
            Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -903,7 +853,7 @@ impl LayerInner {
    fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
        // in future, this should include sha2-256 validation of the file.
        if !m.is_file() {
            Err(NeedsDownload::NotFile(m.file_type()))
            Err(NeedsDownload::NotFile)
        } else if m.len() != self.desc.file_size {
            Err(NeedsDownload::WrongSize {
                actual: m.len(),
@@ -917,9 +867,7 @@ impl LayerInner {
    fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
        let layer_file_name = self.desc.filename().file_name();

        // this is not accurate: we could have the file locally but there was a cancellation
        // and now we are not in sync, or we are currently downloading it.
        let remote = self.inner.get().is_none();
        let remote = self.get().is_none();

        let access_stats = self.access_stats.as_api_model(reset);

@@ -948,7 +896,7 @@ impl LayerInner {
    }

    /// `DownloadedLayer` is being dropped, so it calls this method.
    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
    fn on_downloaded_layer_drop(self: Arc<LayerInner>) {
        let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
        let evict = self.wanted_evicted.load(Ordering::Acquire);
        let can_evict = self.have_remote_client;
@@ -956,16 +904,15 @@ impl LayerInner {
        if gc {
            // do nothing now, only in LayerInner::drop
        } else if can_evict && evict {
            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
            let version = self.version.load(Ordering::Relaxed);

            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self);

            // downgrade for queueing, in case there's a tear down already ongoing we should not
            // hold it alive.
            let this = Arc::downgrade(&self);
            drop(self);

            // NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
            // drop while the `self.inner` is being locked, leading to a deadlock.

            crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
                let _g = span.entered();

@@ -975,15 +922,19 @@ impl LayerInner {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
                    return;
                };
                match this.evict_blocking(version) {
                    Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
                    Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
                }
                this.evict_blocking(version);
            });
        }
    }

    fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
    fn evict_blocking(&self, version: usize) {
        match self.evict_blocking0(version) {
            Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
            Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
        }
    }

    fn evict_blocking0(&self, version: usize) -> Result<(), EvictionCancelled> {
        // deleted or detached timeline, don't do anything.
        let Some(timeline) = self.timeline.upgrade() else {
            return Err(EvictionCancelled::TimelineGone);
@@ -994,34 +945,32 @@ impl LayerInner {
        let _permit = {
            let maybe_downloaded = self.inner.get();

            let (_weak, permit) = match maybe_downloaded {
                Some(mut guard) => {
                    if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
                        if *version == only_version {
                            guard.take_and_deinit()
                        } else {
                            // this was not for us; maybe there's another eviction job
                            // TODO: does it make any sense to stall here? unique versions do not
                            // matter, we only want to make sure not to evict a resident, which we
                            // are not doing.
                            return Err(EvictionCancelled::VersionCheckFailed);
                        }
                    } else {
                        return Err(EvictionCancelled::AlreadyReinitialized);
                    }
            if version != self.version.load(Ordering::Relaxed) {
                // downloadness-state has advanced, we might no longer be the latest eviction
                // work; don't do anything.
                //
                // this is possible to get to by having:
                //
                // 1. wanted_evicted.store(true)
                // 2. ResidentOrWantedEvicted::downgrade
                // 3. DownloadedLayer::drop
                // 4. LayerInner::get_or_maybe_download
                // 5. LayerInner::evict_blocking
                return Err(EvictionCancelled::VersionCheckFailed);
            }

            // free the DownloadedLayer allocation
            match maybe_downloaded.map(|mut g| g.take_and_deinit()) {
                Some((taken, permit)) => {
                    assert!(matches!(taken, ResidentOrWantedEvicted::WantedEvicted(_)));
                    permit
                }
                None => {
                    // already deinitialized, perhaps get_or_maybe_download did this and is
// currently waiting to reinitialize it
|
||||
return Err(EvictionCancelled::LostToDownload);
|
||||
unreachable!("we do the version checking for this exact reason")
|
||||
}
|
||||
};
|
||||
|
||||
permit
|
||||
}
|
||||
};
|
||||
|
||||
// now accesses to inner.get_or_init wait on the semaphore or the `_permit`
|
||||
|
||||
self.access_stats.record_residence_event(
|
||||
LayerResidenceStatus::Evicted,
|
||||
LayerResidenceEventReason::ResidenceChange,
|
||||
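The numbered comment above describes the race the version counter closes: an eviction is queued, a download reinitializes the layer, and only then does the stale eviction job run. A minimal sketch of the guard, with the cell reduced to a bare counter (assumption: the real check also inspects the `ResidentOrWantedEvicted` state):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

struct EvictableCell {
    version: AtomicUsize,
}

impl EvictableCell {
    /// Every (re)initialization bumps the version.
    fn reinitialize(&self) {
        self.version.fetch_add(1, Ordering::Relaxed);
    }

    /// An eviction job carries the version it was queued for; if the state
    /// advanced in the meantime, the job is stale and must not evict.
    fn try_evict(&self, queued_for: usize) -> Result<(), &'static str> {
        if self.version.load(Ordering::Relaxed) != queued_for {
            return Err("version check failed: layer was reinitialized after queueing");
        }
        // ... deinitialize and remove the local file here ...
        Ok(())
    }
}
```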
@@ -1054,14 +1003,11 @@ impl LayerInner {
|
||||
Ok(())
|
||||
}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
|
||||
tracing::error!(
|
||||
layer_size = %self.desc.file_size,
|
||||
"failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
|
||||
);
|
||||
tracing::info!("failed to evict file from disk, it was already gone");
|
||||
Err(EvictionCancelled::FileNotFound)
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("failed to evict file from disk: {e:#}");
|
||||
tracing::warn!("failed to evict file from disk: {e:#}");
|
||||
Err(EvictionCancelled::RemoveFailed)
|
||||
}
|
||||
};
|
||||
@@ -1105,8 +1051,6 @@ enum DownloadError {
|
||||
ContextAndConfigReallyDeniesDownloads,
|
||||
#[error("downloading is really required but not allowed by this method")]
|
||||
DownloadRequired,
|
||||
#[error("layer path exists, but it is not a file: {0:?}")]
|
||||
NotFile(std::fs::FileType),
|
||||
/// Why no error here? Because it will be reported by page_service. We should have done
|
||||
/// retries already.
|
||||
#[error("downloading evicted layer file failed")]
|
||||
@@ -1122,7 +1066,7 @@ enum DownloadError {
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) enum NeedsDownload {
|
||||
NotFound,
|
||||
NotFile(std::fs::FileType),
|
||||
NotFile,
|
||||
WrongSize { actual: u64, expected: u64 },
|
||||
}
|
||||
|
||||
@@ -1130,7 +1074,7 @@ impl std::fmt::Display for NeedsDownload {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
NeedsDownload::NotFound => write!(f, "file was not found"),
|
||||
NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
|
||||
NeedsDownload::NotFile => write!(f, "path is not a file"),
|
||||
NeedsDownload::WrongSize { actual, expected } => {
|
||||
write!(f, "file size mismatch {actual} vs. {expected}")
|
||||
}
|
||||
@@ -1141,10 +1085,7 @@ impl std::fmt::Display for NeedsDownload {
|
||||
/// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
|
||||
pub(crate) struct DownloadedLayer {
|
||||
owner: Weak<LayerInner>,
|
||||
// Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
|
||||
// DownloadedLayer
|
||||
kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
|
||||
version: usize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DownloadedLayer {
|
||||
@@ -1152,7 +1093,6 @@ impl std::fmt::Debug for DownloadedLayer {
|
||||
f.debug_struct("DownloadedLayer")
|
||||
// owner omitted because it is always "Weak"
|
||||
.field("kind", &self.kind)
|
||||
.field("version", &self.version)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -1160,7 +1100,7 @@ impl std::fmt::Debug for DownloadedLayer {
|
||||
impl Drop for DownloadedLayer {
|
||||
fn drop(&mut self) {
|
||||
if let Some(owner) = self.owner.upgrade() {
|
||||
owner.on_downloaded_layer_drop(self.version);
|
||||
owner.on_downloaded_layer_drop();
|
||||
} else {
|
||||
// no need to do anything, we are shutting down
|
||||
}
|
||||
@@ -1186,6 +1126,7 @@ impl DownloadedLayer {
|
||||
"these are the same, just avoiding the upgrade"
|
||||
);
|
||||
|
||||
// there is nothing async here, but it should be async
|
||||
let res = if owner.desc.is_delta {
|
||||
let summary = Some(delta_layer::Summary::expected(
|
||||
owner.desc.tenant_id,
|
||||
@@ -1284,8 +1225,6 @@ impl std::fmt::Debug for ResidentLayer {
|
||||
|
||||
impl ResidentLayer {
|
||||
/// Release the eviction guard, converting back into a plain [`Layer`].
|
||||
///
|
||||
/// You can access the [`Layer`] also by using `as_ref`.
|
||||
pub(crate) fn drop_eviction_guard(self) -> Layer {
|
||||
self.into()
|
||||
}
|
||||
@@ -1341,7 +1280,7 @@ impl AsRef<Layer> for ResidentLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop the eviction guard.
|
||||
/// Allow slimming down if we don't want the `2*usize` with eviction candidates?
|
||||
impl From<ResidentLayer> for Layer {
|
||||
fn from(value: ResidentLayer) -> Self {
|
||||
value.owner
|
||||
@@ -1511,13 +1450,6 @@ impl LayerImplMetrics {
|
||||
.unwrap()
|
||||
.inc();
|
||||
}
|
||||
|
||||
fn inc_broadcast_lagged(&self) {
|
||||
self.rare_counters
|
||||
.get_metric_with_label_values(&["broadcast_lagged"])
|
||||
.unwrap()
|
||||
.inc();
|
||||
}
|
||||
}
|
||||
|
||||
enum EvictionCancelled {
|
||||
@@ -1526,11 +1458,6 @@ enum EvictionCancelled {
|
||||
VersionCheckFailed,
|
||||
FileNotFound,
|
||||
RemoveFailed,
|
||||
AlreadyReinitialized,
|
||||
/// Not evicted because of a pending reinitialization
|
||||
LostToDownload,
|
||||
/// After eviction, there was a new layer access which cancelled the eviction.
|
||||
UpgradedBackOnAccess,
|
||||
}
|
||||
|
||||
impl EvictionCancelled {
|
||||
@@ -1541,9 +1468,6 @@ impl EvictionCancelled {
|
||||
EvictionCancelled::VersionCheckFailed => "version_check_fail",
|
||||
EvictionCancelled::FileNotFound => "file_not_found",
|
||||
EvictionCancelled::RemoveFailed => "remove_failed",
|
||||
EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
|
||||
EvictionCancelled::LostToDownload => "lost_to_download",
|
||||
EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -162,11 +162,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
|
||||
// TODO: we shouldn't need to await to find tenant and this could be moved outside of
|
||||
// loop, #3501. There are also additional "allowed_errors" in tests.
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
if first && random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
@@ -196,7 +193,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);
|
||||
if !first {
|
||||
// The first iteration is typically much slower, because all tenants compete for the
|
||||
// compaction semaphore to run, and because of concurrent startup work like initializing
|
||||
// logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
|
||||
warn_when_period_overrun(
|
||||
started_at.elapsed(),
|
||||
period,
|
||||
BackgroundLoopKind::Compaction,
|
||||
);
|
||||
}
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
@@ -205,6 +211,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
.await;
|
||||
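After this change, `compaction_loop` above and `gc_loop` below share the same shape: jitter only before the first iteration, suppress the overrun warning on that iteration, and clear `first` only after the post-iteration sleep. A condensed sketch of the control flow (task body elided; `random_init_delay` is a simplified stand-in for the real helper and assumes the `rand` crate):

```rust
use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn background_loop(period: Duration, cancel: CancellationToken) {
    let mut first = true;
    loop {
        // Jitter only once, so tenants don't all fire at the same instant.
        if first && random_init_delay(period, &cancel).await.is_err() {
            break;
        }

        // ... run one compaction/GC iteration here; warn on overrun
        //     unless `first` is still set ...

        // Sleep, waking early on cancellation.
        if tokio::time::timeout(period, cancel.cancelled()).await.is_ok() {
            break;
        }
        first = false;
    }
}

/// Sleep a random fraction of `period`; Err means we were cancelled first.
async fn random_init_delay(period: Duration, cancel: &CancellationToken) -> Result<(), ()> {
    let delay = period.mul_f64(rand::random::<f64>());
    match tokio::time::timeout(delay, cancel.cancelled()).await {
        Ok(()) => Err(()),       // cancelled during the delay
        Err(_elapsed) => Ok(()), // delay ran to completion
    }
}
```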
@@ -239,11 +247,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
|
||||
let period = tenant.get_gc_period();
|
||||
|
||||
if first {
|
||||
first = false;
|
||||
if random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
if first && random_init_delay(period, &cancel).await.is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let started_at = Instant::now();
|
||||
@@ -277,7 +282,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
}
|
||||
};
|
||||
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
|
||||
if !first {
|
||||
// The first iteration is typically much slower, because all tenants compete for the
|
||||
// compaction semaphore to run, and because of concurrent startup work like initializing
|
||||
// logical sizes. To avoid routinely spamming warnings, we suppress this log on first iteration.
|
||||
warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc);
|
||||
}
|
||||
|
||||
// Sleep
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
@@ -286,6 +296,8 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
first = false;
|
||||
}
|
||||
}
|
||||
.await;
|
||||
@@ -361,7 +373,7 @@ pub(crate) fn warn_when_period_overrun(
|
||||
// humantime does no significant digits clamping whereas Duration's debug is a bit more
|
||||
// intelligent. however it makes sense to keep the "configuration format" for period, even
|
||||
// though there's no way to output the actual config value.
|
||||
info!(
|
||||
warn!(
|
||||
?elapsed,
|
||||
period = %humantime::format_duration(period),
|
||||
?task,
|
||||
|
||||
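The formatting note above is the whole subtlety of `warn_when_period_overrun`; a hedged sketch of the call it describes (field names follow the hunk, the surrounding condition and message are assumptions):

```rust
use std::time::Duration;
use tracing::warn;

fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
    if elapsed >= period {
        warn!(
            // Duration's Debug clamps significant digits sensibly.
            ?elapsed,
            // Keep the humantime "configuration format" for the period.
            period = %humantime::format_duration(period),
            task,
            "task iteration took longer than the configured period",
        );
    }
}
```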
@@ -1,4 +1,3 @@
|
||||
mod compaction;
|
||||
pub mod delete;
|
||||
mod eviction_task;
|
||||
mod init;
|
||||
@@ -60,7 +59,7 @@ use crate::metrics::{
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
|
||||
use crate::tenant::config::{CompactionAlgorithm, EvictionPolicy, TenantConfOpt};
|
||||
use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
@@ -698,18 +697,6 @@ impl Timeline {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
match self.get_compaction_algorithm() {
|
||||
CompactionAlgorithm::Tiered => self.compact_tiered(cancel, ctx).await,
|
||||
CompactionAlgorithm::Legacy => self.compact_legacy(cancel, ctx).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// TODO: cancellation
|
||||
async fn compact_legacy(
|
||||
self: &Arc<Self>,
|
||||
_cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
// High level strategy for compaction / image creation:
|
||||
//
|
||||
// 1. First, calculate the desired "partitioning" of the
|
||||
@@ -1219,13 +1206,6 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
|
||||
}
|
||||
|
||||
fn get_compaction_algorithm(&self) -> CompactionAlgorithm {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
|
||||
tenant_conf
|
||||
.compaction_algorithm
|
||||
.unwrap_or(self.conf.default_tenant_conf.compaction_algorithm)
|
||||
}
|
||||
|
||||
fn get_eviction_policy(&self) -> EvictionPolicy {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
|
||||
tenant_conf
|
||||
@@ -3030,7 +3010,7 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Level0 files first phase of compaction, explained in the [`compact_legacy`] comment.
|
||||
/// Level0 files first phase of compaction, explained in the [`Self::compact`] comment.
|
||||
///
|
||||
/// This method takes the `_layer_removal_cs` guard to highlight that required downloads are
|
||||
/// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
|
||||
@@ -3513,23 +3493,6 @@ impl Timeline {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.finish_compact_batch(
|
||||
layer_removal_cs,
|
||||
&new_layers,
|
||||
&Vec::new(),
|
||||
&deltas_to_compact,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn finish_compact_batch(
|
||||
self: &Arc<Self>,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
new_deltas: &[ResidentLayer],
|
||||
new_images: &[ResidentLayer],
|
||||
layers_to_remove: &[Layer],
|
||||
) -> anyhow::Result<()> {
|
||||
// Before deleting any layers, we need to wait for their upload ops to finish.
|
||||
// See remote_timeline_client module level comment on consistency.
|
||||
// Do it here because we don't want to hold self.layers.write() while waiting.
|
||||
@@ -3545,9 +3508,9 @@ impl Timeline {
|
||||
|
||||
let mut duplicated_layers = HashSet::new();
|
||||
|
||||
let mut insert_layers = Vec::with_capacity(new_deltas.len());
|
||||
let mut insert_layers = Vec::with_capacity(new_layers.len());
|
||||
|
||||
for l in new_deltas {
|
||||
for l in &new_layers {
|
||||
if guard.contains(l.as_ref()) {
|
||||
// expected in tests
|
||||
tracing::error!(layer=%l, "duplicated L1 layer");
|
||||
@@ -3558,22 +3521,18 @@ impl Timeline {
|
||||
// because we have not implemented L0 => L0 compaction.
|
||||
duplicated_layers.insert(l.layer_desc().key());
|
||||
} else if LayerMap::is_l0(l.layer_desc()) {
|
||||
bail!("compaction generates a L0 layer file as output, which will cause infinite compaction.");
|
||||
return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
|
||||
} else {
|
||||
insert_layers.push(l.clone());
|
||||
}
|
||||
}
|
||||
|
||||
// only remove those inputs which were not outputs
|
||||
let remove_layers: Vec<Layer> = layers_to_remove
|
||||
.iter()
|
||||
.filter(|l| !duplicated_layers.contains(&l.layer_desc().key()))
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
if !new_images.is_empty() {
|
||||
guard.track_new_image_layers(new_images, &self.metrics);
|
||||
}
|
||||
let remove_layers = {
|
||||
let mut deltas_to_compact = deltas_to_compact;
|
||||
// only remove those inputs which were not outputs
|
||||
deltas_to_compact.retain(|l| !duplicated_layers.contains(&l.layer_desc().key()));
|
||||
deltas_to_compact
|
||||
};
|
||||
|
||||
// deletion will happen later, the layer file manager calls garbage_collect_on_drop
|
||||
guard.finish_compact_l0(
|
||||
@@ -3584,7 +3543,7 @@ impl Timeline {
|
||||
);
|
||||
|
||||
if let Some(remote_client) = self.remote_client.as_ref() {
|
||||
remote_client.schedule_compaction_update(&remove_layers, &new_deltas)?;
|
||||
remote_client.schedule_compaction_update(&remove_layers, &new_layers)?;
|
||||
}
|
||||
|
||||
drop_wlock(guard);
|
||||
|
||||
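To summarize the bookkeeping in `finish_compact_batch` above: outputs that collide with inputs are recorded in `duplicated_layers`, and only the non-duplicated inputs are scheduled for removal. A toy version over plain string keys (the real code works on layer descriptors and also rejects L0 outputs):

```rust
use std::collections::HashSet;

/// Returns (layers to insert, layers to remove), given compaction inputs
/// and outputs keyed by some layer identity.
fn partition_compaction_results(
    outputs: Vec<String>,
    inputs: Vec<String>,
) -> (Vec<String>, Vec<String>) {
    let input_set: HashSet<String> = inputs.iter().cloned().collect();
    let mut duplicated: HashSet<String> = HashSet::new();
    let mut insert = Vec::with_capacity(outputs.len());
    for l in outputs {
        if input_set.contains(&l) {
            // An output identical to an input: neither inserted nor removed.
            duplicated.insert(l);
        } else {
            insert.push(l);
        }
    }
    // Only remove those inputs which were not also outputs.
    let remove = inputs
        .into_iter()
        .filter(|l| !duplicated.contains(l))
        .collect();
    (insert, remove)
}
```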
@@ -1,473 +0,0 @@
|
||||
//! New compaction implementation. The algorithm itself is implemented in the
|
||||
//! compaction crate. This file implements the callbacks and structs that allow
|
||||
//! the algorithm to drive the process.
|
||||
//!
|
||||
//! The old legacy algorithm is implemented directly in `timeline.rs`.
|
||||
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::Timeline;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use fail::fail_point;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
|
||||
use crate::tenant::timeline::{is_rel_fsm_block_key, is_rel_vm_block_key};
|
||||
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::DeltaLayer;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::ZERO_PAGE;
|
||||
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::Key;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
use pageserver_compaction::interface::*;
|
||||
|
||||
use super::CompactionError;
|
||||
|
||||
impl Timeline {
|
||||
/// Entry point for new tiered compaction algorithm.
|
||||
///
|
||||
/// All the real work is in the implementation in the pageserver_compaction
|
||||
/// crate. The code here would apply to any algorithm implemented by the
|
||||
/// same interface, but tiered is the only one at the moment.
|
||||
///
|
||||
/// TODO: cancellation
|
||||
pub(crate) async fn compact_tiered(
|
||||
self: &Arc<Self>,
|
||||
_cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), CompactionError> {
|
||||
let fanout = self.get_compaction_threshold() as u64;
|
||||
let target_file_size = self.get_checkpoint_distance();
|
||||
|
||||
// Find the top of the historical layers
|
||||
let end_lsn = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
|
||||
let l0_deltas = layers.get_level0_deltas()?;
|
||||
drop(guard);
|
||||
|
||||
// As an optimization, if we find that there are too few L0 layers,
|
||||
// bail out early. We know that the compaction algorithm would do
|
||||
// nothing in that case.
|
||||
if l0_deltas.len() < fanout as usize {
|
||||
// doesn't need compacting
|
||||
return Ok(());
|
||||
}
|
||||
l0_deltas.iter().map(|l| l.lsn_range.end).max().unwrap()
|
||||
};
|
||||
|
||||
// now lock out layer removal (compaction, gc, timeline deletion)
|
||||
let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
|
||||
// Is the timeline being deleted?
|
||||
if self.is_stopping() {
|
||||
trace!("Dropping out of compaction on timeline shutdown");
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
|
||||
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
|
||||
let mut adaptor = TimelineAdaptor::new(self, layer_removal_cs, (end_lsn, keyspace));
|
||||
let ctx_adaptor = RequestContextAdaptor(ctx.clone());
|
||||
|
||||
pageserver_compaction::compact_tiered::compact_tiered(
|
||||
&mut adaptor,
|
||||
end_lsn,
|
||||
target_file_size,
|
||||
fanout,
|
||||
&ctx_adaptor,
|
||||
)
|
||||
.await?;
|
||||
|
||||
adaptor.flush_updates().await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
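The pre-flight section of `compact_tiered` above reduces to two questions: is there enough L0 work to bother, and how far up does the historical layer stack go. A sketch with stand-in types (an LSN reduced to a `u64`):

```rust
use std::ops::Range;

type Lsn = u64;

struct L0Delta {
    lsn_range: Range<Lsn>,
}

/// None: too few L0 deltas, the algorithm would be a no-op, so skip the
/// locking and keyspace collection entirely. Some(end_lsn): compact up
/// to the top of the historical layers.
fn plan_tiered_compaction(l0_deltas: &[L0Delta], fanout: u64) -> Option<Lsn> {
    if l0_deltas.len() < fanout as usize {
        return None; // doesn't need compacting
    }
    l0_deltas.iter().map(|l| l.lsn_range.end).max()
}
```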
|
||||
struct TimelineAdaptor {
|
||||
timeline: Arc<Timeline>,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
|
||||
keyspace: (Lsn, KeySpace),
|
||||
|
||||
new_deltas: Vec<ResidentLayer>,
|
||||
new_images: Vec<ResidentLayer>,
|
||||
layers_to_delete: Vec<Arc<PersistentLayerDesc>>,
|
||||
}
|
||||
|
||||
impl TimelineAdaptor {
|
||||
pub fn new(
|
||||
timeline: &Arc<Timeline>,
|
||||
layer_removal_cs: Arc<tokio::sync::OwnedMutexGuard<()>>,
|
||||
keyspace: (Lsn, KeySpace),
|
||||
) -> Self {
|
||||
Self {
|
||||
timeline: timeline.clone(),
|
||||
layer_removal_cs,
|
||||
keyspace,
|
||||
new_images: Vec::new(),
|
||||
new_deltas: Vec::new(),
|
||||
layers_to_delete: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn flush_updates(&mut self) -> anyhow::Result<()> {
|
||||
let layers_to_delete = {
|
||||
let guard = self.timeline.layers.read().await;
|
||||
self.layers_to_delete
|
||||
.iter()
|
||||
.map(|x| guard.get_from_desc(x))
|
||||
.collect::<Vec<Layer>>()
|
||||
};
|
||||
self.timeline
|
||||
.finish_compact_batch(
|
||||
self.layer_removal_cs.clone(),
|
||||
&self.new_deltas,
|
||||
&self.new_images,
|
||||
&layers_to_delete,
|
||||
)
|
||||
.await?;
|
||||
self.new_images.clear();
|
||||
self.new_deltas.clear();
|
||||
self.layers_to_delete.clear();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct ResidentDeltaLayer(ResidentLayer);
|
||||
#[derive(Clone)]
|
||||
struct ResidentImageLayer(ResidentLayer);
|
||||
|
||||
#[async_trait]
|
||||
impl CompactionJobExecutor for TimelineAdaptor {
|
||||
type Key = crate::repository::Key;
|
||||
|
||||
type Layer = Arc<PersistentLayerDesc>;
|
||||
type DeltaLayer = ResidentDeltaLayer;
|
||||
type ImageLayer = ResidentImageLayer;
|
||||
|
||||
type RequestContext = RequestContextAdaptor;
|
||||
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
_ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<Vec<Arc<PersistentLayerDesc>>> {
|
||||
self.flush_updates().await?;
|
||||
|
||||
let guard = self.timeline.layers.read().await;
|
||||
let layer_map = guard.layer_map();
|
||||
|
||||
let result = layer_map
|
||||
.iter_historic_layers()
|
||||
.filter(|l| {
|
||||
overlaps_with(&l.lsn_range, lsn_range) && overlaps_with(&l.key_range, key_range)
|
||||
})
|
||||
.collect();
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn get_keyspace(
|
||||
&mut self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
_ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<Vec<Range<Key>>> {
|
||||
if lsn == self.keyspace.0 {
|
||||
Ok(pageserver_compaction::helpers::intersect_keyspace(
|
||||
&self.keyspace.1.ranges,
|
||||
key_range,
|
||||
))
|
||||
} else {
|
||||
// The current compaction implementation only ever requests the key space
|
||||
// at the compaction end LSN.
|
||||
anyhow::bail!("keyspace not available for requested lsn");
|
||||
}
|
||||
}
|
||||
|
||||
async fn downcast_delta_layer(
|
||||
&self,
|
||||
layer: &Arc<PersistentLayerDesc>,
|
||||
) -> anyhow::Result<Option<ResidentDeltaLayer>> {
|
||||
// this is a lot more complex than a simple downcast...
|
||||
if layer.is_delta() {
|
||||
let l = {
|
||||
let guard = self.timeline.layers.read().await;
|
||||
guard.get_from_desc(layer)
|
||||
};
|
||||
let result = l.download_and_keep_resident().await?;
|
||||
|
||||
Ok(Some(ResidentDeltaLayer(result)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_image(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(self.create_image_impl(lsn, key_range, ctx).await?)
|
||||
}
|
||||
|
||||
async fn create_delta(
|
||||
&mut self,
|
||||
lsn_range: &Range<Lsn>,
|
||||
key_range: &Range<Key>,
|
||||
input_layers: &[ResidentDeltaLayer],
|
||||
ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
|
||||
let mut all_entries = Vec::new();
|
||||
for dl in input_layers.iter() {
|
||||
all_entries.extend(dl.load_keys(ctx).await?);
|
||||
}
|
||||
|
||||
// The current stdlib sorting implementation is designed so that it is
|
||||
// particularly fast when the slice is made up of sorted sub-ranges.
|
||||
all_entries.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
|
||||
|
||||
let mut writer = DeltaLayerWriter::new(
|
||||
self.timeline.conf,
|
||||
self.timeline.timeline_id,
|
||||
self.timeline.tenant_id,
|
||||
key_range.start,
|
||||
lsn_range.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut dup_values = 0;
|
||||
|
||||
// This iterator walks through all key-value pairs from all the layers
|
||||
// we're compacting, in key, LSN order.
|
||||
let mut prev: Option<(Key, Lsn)> = None;
|
||||
for &DeltaEntry {
|
||||
key, lsn, ref val, ..
|
||||
} in all_entries.iter()
|
||||
{
|
||||
if prev == Some((key, lsn)) {
|
||||
// This is a duplicate. Skip it.
|
||||
//
|
||||
// It can happen if compaction is interrupted after writing some
|
||||
// layers but not all, and we are compacting the range again.
|
||||
// The calculations in the algorithm assume that there are no
|
||||
// duplicates, so the math on targeted file size is likely off,
|
||||
// and we will create smaller files than expected.
|
||||
dup_values += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = val.load(ctx).await?;
|
||||
|
||||
writer.put_value(key, lsn, value).await?;
|
||||
|
||||
prev = Some((key, lsn));
|
||||
}
|
||||
|
||||
if dup_values > 0 {
|
||||
warn!("delta layer created with {} duplicate values", dup_values);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint delta-layer-writer-fail-before-finish"
|
||||
))
|
||||
});
|
||||
|
||||
let new_delta_layer = writer
|
||||
.finish(prev.unwrap().0.next(), &self.timeline)
|
||||
.await?;
|
||||
|
||||
self.new_deltas.push(new_delta_layer);
|
||||
Ok(())
|
||||
}
|
||||
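The duplicate-skipping walk in `create_delta` relies on the preceding sort: duplicates of a `(key, lsn)` pair are always adjacent, so a single `prev` marker suffices. Isolated as a pure function over numeric pairs:

```rust
/// Returns the deduplicated entries and how many duplicates were skipped.
/// Duplicates appear when an interrupted compaction re-feeds a range.
fn dedup_sorted(entries: &[(u64, u64)]) -> (Vec<(u64, u64)>, usize) {
    let mut kept = Vec::with_capacity(entries.len());
    let mut dup_values = 0;
    let mut prev: Option<(u64, u64)> = None;
    for &entry in entries {
        if prev == Some(entry) {
            dup_values += 1;
            continue;
        }
        kept.push(entry);
        prev = Some(entry);
    }
    (kept, dup_values)
}
```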
|
||||
async fn delete_layer(
|
||||
&mut self,
|
||||
layer: &Arc<PersistentLayerDesc>,
|
||||
_ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<()> {
|
||||
self.layers_to_delete.push(layer.clone());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl TimelineAdaptor {
|
||||
async fn create_image_impl(
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &RequestContextAdaptor,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
|
||||
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.timeline.conf,
|
||||
self.timeline.timeline_id,
|
||||
self.timeline.tenant_id,
|
||||
key_range,
|
||||
lsn,
|
||||
)
|
||||
.await?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"failpoint image-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
let keyspace_ranges = self.get_keyspace(key_range, lsn, ctx).await?;
|
||||
for range in &keyspace_ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
let img = match self.timeline.get(key, lsn, ctx).await {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
// page without losing any actual user data. That seems better
|
||||
// than failing repeatedly and getting stuck.
|
||||
//
|
||||
// We had a bug at one point, where we truncated the FSM and VM
|
||||
// in the pageserver, but Postgres didn't know about that
|
||||
// and continued to generate incremental WAL records for pages
|
||||
// that didn't exist in the pageserver. Trying to replay those
|
||||
// WAL records failed to find the previous image of the page.
|
||||
// This special case allows us to recover from that situation.
|
||||
// See https://github.com/neondatabase/neon/issues/2601.
|
||||
//
|
||||
// Unfortunately we cannot do this for the main fork, or for
|
||||
// any metadata keys, as that would lead to actual data
|
||||
// loss.
|
||||
if is_rel_fsm_block_key(key) || is_rel_vm_block_key(key) {
|
||||
warn!("could not reconstruct FSM or VM key {key}, filling with zeros: {err:?}");
|
||||
ZERO_PAGE.clone()
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
image_layer_writer.put_image(key, &img).await?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish(&self.timeline).await?;
|
||||
|
||||
self.new_images.push(image_layer);
|
||||
|
||||
timer.stop_and_record();
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RequestContextAdaptor(pub RequestContext);
|
||||
|
||||
impl std::ops::Deref for RequestContextAdaptor {
|
||||
type Target = RequestContext;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl CompactionRequestContext for RequestContextAdaptor {}
|
||||
|
||||
impl CompactionLayer<Key> for Arc<PersistentLayerDesc> {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.key_range
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.lsn_range
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
self.file_size
|
||||
}
|
||||
fn short_id(&self) -> std::string::String {
|
||||
self.as_ref().short_id().to_string()
|
||||
}
|
||||
fn is_delta(&self) -> bool {
|
||||
self.as_ref().is_delta()
|
||||
}
|
||||
}
|
||||
|
||||
impl CompactionLayer<Key> for Arc<DeltaLayer> {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.layer_desc().key_range
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.layer_desc().lsn_range
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
self.layer_desc().file_size
|
||||
}
|
||||
fn short_id(&self) -> std::string::String {
|
||||
self.layer_desc().short_id().to_string()
|
||||
}
|
||||
fn is_delta(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
use crate::tenant::timeline::DeltaEntry;
|
||||
|
||||
impl CompactionLayer<Key> for ResidentDeltaLayer {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.0.layer_desc().key_range
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.0.layer_desc().lsn_range
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
self.0.layer_desc().file_size
|
||||
}
|
||||
fn short_id(&self) -> std::string::String {
|
||||
self.0.layer_desc().short_id().to_string()
|
||||
}
|
||||
fn is_delta(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
|
||||
type DeltaEntry<'a> = DeltaEntry<'a>;
|
||||
|
||||
async fn load_keys<'a>(
|
||||
&self,
|
||||
ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
|
||||
self.0.load_keys(ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
impl CompactionLayer<Key> for ResidentImageLayer {
|
||||
fn key_range(&self) -> &Range<Key> {
|
||||
&self.0.layer_desc().key_range
|
||||
}
|
||||
fn lsn_range(&self) -> &Range<Lsn> {
|
||||
&self.0.layer_desc().lsn_range
|
||||
}
|
||||
fn file_size(&self) -> u64 {
|
||||
self.0.layer_desc().file_size
|
||||
}
|
||||
fn short_id(&self) -> std::string::String {
|
||||
self.0.layer_desc().short_id().to_string()
|
||||
}
|
||||
fn is_delta(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
impl CompactionImageLayer<TimelineAdaptor> for ResidentImageLayer {}
|
||||
@@ -459,7 +459,7 @@ async fn identify_system(client: &Client) -> anyhow::Result<IdentifySystem> {
|
||||
|
||||
// extract the row contents into an IdentifySystem struct.
|
||||
// written as a closure so I can use ? for Option here.
|
||||
if let Some(SimpleQueryMessage::Row(first_row)) = response.first() {
|
||||
if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
|
||||
Ok(IdentifySystem {
|
||||
systemid: get_parse(first_row, 0)?,
|
||||
timeline: get_parse(first_row, 1)?,
|
||||
|
||||
@@ -19,7 +19,6 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
use utils::fs_ext;
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
@@ -174,78 +173,37 @@ impl OpenFiles {
|
||||
}
|
||||
}
|
||||
|
||||
/// Identify error types that should always terminate the process. Other
|
||||
/// error types may be eligible for retry.
|
||||
pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
|
||||
use nix::errno::Errno::*;
|
||||
match e.raw_os_error().map(nix::errno::from_i32) {
|
||||
Some(EIO) => {
|
||||
// Terminate on EIO because we no longer trust the device to store
|
||||
// data safely, or to uphold persistence guarantees on fsync.
|
||||
true
|
||||
}
|
||||
Some(EROFS) => {
|
||||
// Terminate on EROFS because a filesystem is usually remounted
|
||||
// readonly when it has experienced some critical issue, so the same
|
||||
// logic as EIO applies.
|
||||
true
|
||||
}
|
||||
Some(EACCES) => {
|
||||
// Terminate on EACCES because we should always have permissions
|
||||
// for our own data dir: if we don't, then we can't do our job and
|
||||
// need administrative intervention to fix permissions. Terminating
|
||||
// is the best way to make sure we stop cleanly rather than going
|
||||
// into infinite retry loops, and will make it clear to the outside
|
||||
// world that we need help.
|
||||
true
|
||||
}
|
||||
_ => {
|
||||
// Treat all other local file I/O errors as retryable. This includes:
|
||||
// - ENOSPC: we stay up and wait for eviction to free some space
|
||||
// - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
|
||||
// - WriteZero, Interrupted: these are used internally by VirtualFile
|
||||
false
|
||||
}
|
||||
}
|
||||
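The classification above compresses to a single `matches!` once the commentary is set aside; note it deliberately leaves ENOSPC (and code bugs like EINVAL/EBADF) retryable. A compact equivalent, using the same nix helper the hunk uses:

```rust
/// True for errors where the process should terminate rather than retry:
/// EIO (device no longer trusted), EROFS (filesystem remounted read-only),
/// EACCES (our own data dir is inaccessible).
fn is_fatal(e: &std::io::Error) -> bool {
    use nix::errno::Errno::*;
    matches!(
        e.raw_os_error().map(nix::errno::from_i32),
        Some(EIO) | Some(EROFS) | Some(EACCES)
    )
}
```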
#[derive(Debug, thiserror::Error)]
|
||||
pub enum CrashsafeOverwriteError {
|
||||
#[error("final path has no parent dir")]
|
||||
FinalPathHasNoParentDir,
|
||||
#[error("remove tempfile")]
|
||||
RemovePreviousTempfile(#[source] std::io::Error),
|
||||
#[error("create tempfile")]
|
||||
CreateTempfile(#[source] std::io::Error),
|
||||
#[error("write tempfile")]
|
||||
WriteContents(#[source] std::io::Error),
|
||||
#[error("sync tempfile")]
|
||||
SyncTempfile(#[source] std::io::Error),
|
||||
#[error("rename tempfile to final path")]
|
||||
RenameTempfileToFinalPath(#[source] std::io::Error),
|
||||
#[error("open final path parent dir")]
|
||||
OpenFinalPathParentDir(#[source] std::io::Error),
|
||||
#[error("sync final path parent dir")]
|
||||
SyncFinalPathParentDir(#[source] std::io::Error),
|
||||
}
|
||||
|
||||
/// Call this when the local filesystem gives us an error with an external
|
||||
/// cause: this includes EIO, EROFS, and EACCES: all these indicate either
|
||||
/// bad storage or bad configuration, and we can't fix that from inside
|
||||
/// a running process.
|
||||
pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
|
||||
tracing::error!("Fatal I/O error: {e}: {context})");
|
||||
std::process::abort();
|
||||
}
|
||||
|
||||
pub(crate) trait MaybeFatalIo<T> {
|
||||
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
|
||||
fn fatal_err(self, context: &str) -> T;
|
||||
}
|
||||
|
||||
impl<T> MaybeFatalIo<T> for std::io::Result<T> {
|
||||
/// Terminate the process if the result is an error of a fatal type, else pass it through
|
||||
///
|
||||
/// This is appropriate for writes, where we typically want to die on EIO/EACCES etc, but
|
||||
/// not on ENOSPC.
|
||||
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
|
||||
if let Err(e) = &self {
|
||||
if is_fatal_io_error(e) {
|
||||
on_fatal_io_error(e, context);
|
||||
}
|
||||
}
|
||||
self
|
||||
}
|
||||
|
||||
/// Terminate the process on any I/O error.
|
||||
///
|
||||
/// This is appropriate for reads on files that we know exist: they should always work.
|
||||
fn fatal_err(self, context: &str) -> T {
match self {
Ok(v) => v,
Err(e) => {
on_fatal_io_error(&e, context);
}
}
}
}
impl CrashsafeOverwriteError {
/// Returns true iff the new contents are durably stored.
pub fn are_new_contents_durable(&self) -> bool {
match self {
Self::FinalPathHasNoParentDir => false,
Self::RemovePreviousTempfile(_) => false,
Self::CreateTempfile(_) => false,
Self::WriteContents(_) => false,
Self::SyncTempfile(_) => false,
Self::RenameTempfileToFinalPath(_) => false,
Self::OpenFinalPathParentDir(_) => false,
Self::SyncFinalPathParentDir(_) => true,
}
}
}
|
||||
@@ -326,13 +284,15 @@ impl VirtualFile {
|
||||
final_path: &Utf8Path,
|
||||
tmp_path: &Utf8Path,
|
||||
content: &[u8],
|
||||
) -> std::io::Result<()> {
|
||||
) -> Result<(), CrashsafeOverwriteError> {
|
||||
let Some(final_path_parent) = final_path.parent() else {
|
||||
return Err(std::io::Error::from_raw_os_error(
|
||||
nix::errno::Errno::EINVAL as i32,
|
||||
));
|
||||
return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
|
||||
};
|
||||
std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
|
||||
match std::fs::remove_file(tmp_path) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
|
||||
}
|
||||
let mut file = Self::open_with_options(
|
||||
tmp_path,
|
||||
OpenOptions::new()
|
||||
@@ -341,20 +301,31 @@ impl VirtualFile {
|
||||
// we bail out instead of causing damage.
|
||||
.create_new(true),
|
||||
)
|
||||
.await?;
|
||||
file.write_all(content).await?;
|
||||
file.sync_all().await?;
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::CreateTempfile)?;
|
||||
file.write_all(content)
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::WriteContents)?;
|
||||
file.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncTempfile)?;
|
||||
drop(file); // before the rename, that's important!
|
||||
// renames are atomic
|
||||
std::fs::rename(tmp_path, final_path)?;
|
||||
std::fs::rename(tmp_path, final_path)
|
||||
.map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
|
||||
// Only open final path parent dirfd now, so that this operation only
|
||||
// ever holds one VirtualFile fd at a time. That's important because
|
||||
// the current `find_victim_slot` impl might pick the same slot for both
|
||||
// VirtualFiles, and it eventually does a blocking write lock instead of
|
||||
// try_lock.
|
||||
let final_parent_dirfd =
|
||||
Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
|
||||
final_parent_dirfd.sync_all().await?;
|
||||
Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
|
||||
final_parent_dirfd
|
||||
.sync_all()
|
||||
.await
|
||||
.map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
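For reference, the whole `crashsafe_overwrite` sequence above is the classic tempfile-rename-fsync protocol. A std-only sketch (Unix-specific, since it fsyncs the parent directory through an ordinary `File` handle; the real code goes through `VirtualFile` to respect the fd cache):

```rust
use std::fs::{self, File, OpenOptions};
use std::io::Write;
use std::path::Path;

fn crashsafe_overwrite(final_path: &Path, tmp_path: &Path, content: &[u8]) -> std::io::Result<()> {
    let parent = final_path
        .parent()
        .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, "no parent dir"))?;
    // Remove a stale tempfile from a previous crash, if any.
    match fs::remove_file(tmp_path) {
        Ok(()) => {}
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
        Err(e) => return Err(e),
    }
    // create_new: if the tempfile reappears concurrently, bail out
    // instead of causing damage.
    let mut file = OpenOptions::new().write(true).create_new(true).open(tmp_path)?;
    file.write_all(content)?;
    file.sync_all()?;
    drop(file); // close before the rename, that's important!
    fs::rename(tmp_path, final_path)?; // renames are atomic
    // fsync the parent directory so the rename itself is durable.
    File::open(parent)?.sync_all()?;
    Ok(())
}
```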
@@ -857,8 +857,7 @@ impl WalRedoProcess {
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
} else if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
@@ -908,8 +907,7 @@ impl WalRedoProcess {
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
} else if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
|
||||
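Both poll fixes above have the same shape: act on POLLOUT/POLLIN (or POLLERR, so the read/write surfaces the error), and only treat POLLHUP as fatal otherwise. Extracted for the stdin side, with the actual write abstracted behind a closure:

```rust
use nix::poll::PollFlags;

fn handle_stdin_revents(
    revents: PollFlags,
    try_write: &mut dyn FnMut() -> std::io::Result<usize>,
    nwrite: &mut usize,
) -> anyhow::Result<()> {
    if revents.intersects(PollFlags::POLLERR | PollFlags::POLLOUT) {
        // Writable (or errored, in which case the write reports the error).
        *nwrite += try_write()?;
    } else if revents.contains(PollFlags::POLLHUP) {
        // We still have data to write, but the process closed the pipe.
        anyhow::bail!("WAL redo process closed its stdin unexpectedly");
    }
    Ok(())
}
```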
@@ -88,7 +88,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
|
||||
static void WalSndLoop(WalProposer *wp);
|
||||
static void XLogBroadcastWalProposer(WalProposer *wp);
|
||||
|
||||
static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
|
||||
static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
|
||||
static void XLogWalPropClose(XLogRecPtr recptr);
|
||||
|
||||
static void
|
||||
@@ -1241,7 +1241,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
|
||||
rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;
|
||||
|
||||
/* write WAL to disk */
|
||||
XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
|
||||
XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
|
||||
|
||||
ereport(DEBUG1,
|
||||
(errmsg("Recover message %X/%X length %d",
|
||||
@@ -1283,24 +1283,11 @@ static XLogSegNo walpropSegNo = 0;
|
||||
* Write XLOG data to disk.
|
||||
*/
|
||||
static void
|
||||
XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
|
||||
XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
|
||||
{
|
||||
int startoff;
|
||||
int byteswritten;
|
||||
|
||||
/*
|
||||
* Apart from walproposer, basebackup LSN page is also written out by
|
||||
* postgres itself which writes WAL only in pages, and in basebackup it is
|
||||
* inherently dummy (only safekeepers have historic WAL). Update WAL buffers
|
||||
* here to avoid the dummy page overwriting the correct one we download here. Ugly,
|
||||
* but the alternatives are about as ugly. We won't need this if we switch
|
||||
* to on-demand WAL download from safekeepers, without writing to disk.
|
||||
*
|
||||
* https://github.com/neondatabase/neon/issues/5749
|
||||
*/
|
||||
if (!wp->config->syncSafekeepers)
|
||||
XLogUpdateWalBuffers(buf, recptr, nbytes);
|
||||
|
||||
while (nbytes > 0)
|
||||
{
|
||||
int segbytes;
|
||||
|
||||
@@ -13,7 +13,6 @@ pub struct ConsoleError {
|
||||
#[derive(Deserialize)]
|
||||
pub struct GetRoleSecret {
|
||||
pub role_secret: Box<str>,
|
||||
pub allowed_ips: Option<Vec<Box<str>>>,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
@@ -188,31 +187,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_wake_compute() -> anyhow::Result<()> {
|
||||
let json = json!({
|
||||
"address": "0.0.0.0",
|
||||
"aux": dummy_aux(),
|
||||
});
|
||||
let _: WakeCompute = serde_json::from_str(&json.to_string())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_get_role_secret() -> anyhow::Result<()> {
|
||||
// Empty `allowed_ips` field.
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
});
|
||||
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
||||
// Non-empty `allowed_ips` field.
|
||||
let json = json!({
|
||||
"role_secret": "secret",
|
||||
"allowed_ips": ["8.8.8.8"],
|
||||
});
|
||||
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,7 +59,7 @@ impl Api {
|
||||
let rows = client.query(query, &[&creds.user]).await?;
|
||||
|
||||
// We can get at most one row, because `rolname` is unique.
|
||||
let row = match rows.first() {
|
||||
let row = match rows.get(0) {
|
||||
Some(row) => row,
|
||||
// This means that the user doesn't exist, so there can be no secret.
|
||||
// However, this is still a *valid* outcome which is very similar
|
||||
|
||||
@@ -49,7 +49,7 @@ impl Api {
|
||||
.endpoint
|
||||
.get("proxy_get_role_secret")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.header("Authorization", &self.jwt)
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
@@ -94,7 +94,7 @@ impl Api {
|
||||
.endpoint
|
||||
.get("proxy_wake_compute")
|
||||
.header("X-Request-ID", &request_id)
|
||||
.header("Authorization", format!("Bearer {}", &self.jwt))
|
||||
.header("Authorization", &self.jwt)
|
||||
.query(&[("session_id", extra.session_id)])
|
||||
.query(&[
|
||||
("application_name", extra.application_name),
|
||||
|
||||
@@ -18,6 +18,7 @@ mod password;
|
||||
pub use exchange::Exchange;
|
||||
pub use key::ScramKey;
|
||||
pub use secret::ServerSecret;
|
||||
pub use secret::*;
|
||||
|
||||
use hmac::{Hmac, Mac};
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
@@ -470,26 +470,30 @@ async fn query_to_json<T: GenericClient>(
|
||||
}
|
||||
.and_then(|s| s.parse::<i64>().ok());
|
||||
|
||||
let mut fields = vec![];
|
||||
let mut columns = vec![];
|
||||
|
||||
for c in row_stream.columns() {
|
||||
fields.push(json!({
|
||||
"name": Value::String(c.name().to_owned()),
|
||||
"dataTypeID": Value::Number(c.type_().oid().into()),
|
||||
"tableID": c.table_oid(),
|
||||
"columnID": c.column_id(),
|
||||
"dataTypeSize": c.type_size(),
|
||||
"dataTypeModifier": c.type_modifier(),
|
||||
"format": "text",
|
||||
}));
|
||||
columns.push(client.get_type(c.type_oid()).await?);
|
||||
}
|
||||
let fields = if !rows.is_empty() {
|
||||
rows[0]
|
||||
.columns()
|
||||
.iter()
|
||||
.map(|c| {
|
||||
json!({
|
||||
"name": Value::String(c.name().to_owned()),
|
||||
"dataTypeID": Value::Number(c.type_().oid().into()),
|
||||
"tableID": c.table_oid(),
|
||||
"columnID": c.column_id(),
|
||||
"dataTypeSize": c.type_size(),
|
||||
"dataTypeModifier": c.type_modifier(),
|
||||
"format": "text",
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
// convert rows to JSON
|
||||
let rows = rows
|
||||
.iter()
|
||||
.map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
|
||||
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
// resulting JSON format is based on the format of node-postgres result
|
||||
@@ -510,28 +514,22 @@ async fn query_to_json<T: GenericClient>(
|
||||
//
|
||||
pub fn pg_text_row_to_json(
|
||||
row: &Row,
|
||||
columns: &[Type],
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> Result<Value, anyhow::Error> {
|
||||
let iter = row
|
||||
.columns()
|
||||
.iter()
|
||||
.zip(columns)
|
||||
.enumerate()
|
||||
.map(|(i, (column, typ))| {
|
||||
let name = column.name();
|
||||
let pg_value = row.as_text(i)?;
|
||||
let json_value = if raw_output {
|
||||
match pg_value {
|
||||
Some(v) => Value::String(v.to_string()),
|
||||
None => Value::Null,
|
||||
}
|
||||
} else {
|
||||
pg_text_to_json(pg_value, typ)?
|
||||
};
|
||||
Ok((name.to_string(), json_value))
|
||||
});
|
||||
let iter = row.columns().iter().enumerate().map(|(i, column)| {
|
||||
let name = column.name();
|
||||
let pg_value = row.as_text(i)?;
|
||||
let json_value = if raw_output {
|
||||
match pg_value {
|
||||
Some(v) => Value::String(v.to_string()),
|
||||
None => Value::Null,
|
||||
}
|
||||
} else {
|
||||
pg_text_to_json(pg_value, column.type_())?
|
||||
};
|
||||
Ok((name.to_string(), json_value))
|
||||
});
|
||||
|
||||
if array_mode {
|
||||
// drop keys and aggregate into array
|
||||
|
||||
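Stripped of the postgres client types, the per-row conversion above is just column names zipped with text values into a JSON object (or a bare array in array mode). A sketch over pre-extracted values, ignoring the type-aware conversion that `pg_text_to_json` performs:

```rust
use serde_json::{Map, Value};

fn row_to_json(names: &[&str], values: &[Option<String>], array_mode: bool) -> Value {
    let to_value = |v: &Option<String>| match v {
        Some(text) => Value::String(text.clone()),
        None => Value::Null,
    };
    if array_mode {
        // drop keys and aggregate into an array
        Value::Array(values.iter().map(to_value).collect())
    } else {
        let mut obj = Map::new();
        for (name, v) in names.iter().zip(values) {
            obj.insert((*name).to_string(), to_value(v));
        }
        Value::Object(obj)
    }
}
```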
@@ -33,7 +33,6 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls"
|
||||
aws-config = { workspace = true, default-features = false, features = ["rustls", "credentials-sso"] }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
|
||||
@@ -1,18 +1,13 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_sdk_s3::{types::ObjectIdentifier, Client};
|
||||
use aws_sdk_s3::Client;
|
||||
use tracing::{error, info, warn};
|
||||
use utils::generation::Generation;
|
||||
|
||||
use crate::cloud_admin_api::BranchData;
|
||||
use crate::metadata_stream::stream_listing;
|
||||
use crate::{download_object_with_retries, RootTarget};
|
||||
use futures_util::{pin_mut, StreamExt};
|
||||
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
|
||||
use crate::{download_object_with_retries, list_objects_with_retries, RootTarget};
|
||||
use pageserver::tenant::storage_layer::LayerFileName;
|
||||
use pageserver::tenant::IndexPart;
|
||||
use remote_storage::RemotePath;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
pub(crate) struct TimelineAnalysis {
|
||||
@@ -73,7 +68,6 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
match s3_data.blob_data {
|
||||
BlobDataParseResult::Parsed {
|
||||
index_part,
|
||||
index_part_generation,
|
||||
mut s3_layers,
|
||||
} => {
|
||||
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
|
||||
@@ -113,62 +107,33 @@ pub(crate) async fn branch_cleanup_and_check_errors(
|
||||
))
|
||||
}
|
||||
|
||||
let layer_map_key = (layer, metadata.generation);
|
||||
if !s3_layers.remove(&layer_map_key) {
|
||||
// FIXME: this will emit false positives if an index was
|
||||
// uploaded concurrently with our scan. To make this check
|
||||
// correct, we need to try sending a HEAD request for the
|
||||
// layer we think is missing.
|
||||
if !s3_layers.remove(&layer) {
|
||||
result.errors.push(format!(
|
||||
"index_part.json contains a layer {}{} that is not present in remote storage",
|
||||
layer_map_key.0.file_name(),
|
||||
layer_map_key.1.get_suffix()
|
||||
"index_part.json contains a layer {} that is not present in S3",
|
||||
layer.file_name(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers
|
||||
.into_iter()
|
||||
.filter(|(_layer_name, gen)|
|
||||
// A layer is only considered orphaned if it has a generation below
|
||||
// the index. If the generation is >= the index, then the layer may
|
||||
// be an upload from a running pageserver, or even an upload from
|
||||
// a new generation that didn't upload an index yet.
|
||||
//
|
||||
// Even so, a layer that is not referenced by the index could just
|
||||
// be something enqueued for deletion, so while this check is valid
|
||||
// for indicating that a layer is garbage, it is not an indicator
|
||||
// of a problem.
|
||||
gen < &index_part_generation)
|
||||
.collect();
|
||||
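As the comment spells out, only layers whose generation is strictly below the index generation are orphan candidates; the filter itself is tiny once the types are simplified to a name plus a numeric generation:

```rust
/// Keep only layers that cannot belong to an in-flight upload: anything
/// at or above the index generation may simply not be indexed yet.
fn orphan_candidates(
    s3_layers: impl IntoIterator<Item = (String, u32)>,
    index_generation: u32,
) -> Vec<(String, u32)> {
    s3_layers
        .into_iter()
        .filter(|(_name, gen)| *gen < index_generation)
        .collect()
}
```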
|
||||
if !orphan_layers.is_empty() {
|
||||
if !s3_layers.is_empty() {
|
||||
result.errors.push(format!(
|
||||
"index_part.json does not contain layers from S3: {:?}",
|
||||
orphan_layers
|
||||
s3_layers
|
||||
.iter()
|
||||
.map(|(layer_name, gen)| format!(
|
||||
"{}{}",
|
||||
layer_name.file_name(),
|
||||
gen.get_suffix()
|
||||
))
|
||||
.map(|layer_name| layer_name.file_name())
|
||||
.collect::<Vec<_>>(),
|
||||
));
|
||||
result.garbage_keys.extend(orphan_layers.iter().map(
|
||||
|(layer_name, layer_gen)| {
|
||||
result
|
||||
.garbage_keys
|
||||
.extend(s3_layers.iter().map(|layer_name| {
|
||||
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
|
||||
let delimiter = s3_root.delimiter();
|
||||
if !key.ends_with(delimiter) {
|
||||
key.push_str(delimiter);
|
||||
}
|
||||
key.push_str(&format!(
|
||||
"{}{}",
|
||||
&layer_name.file_name(),
|
||||
layer_gen.get_suffix()
|
||||
));
|
||||
key.push_str(&layer_name.file_name());
|
||||
key
|
||||
},
|
||||
));
|
||||
}));
|
||||
}
|
||||
}
|
||||
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
|
||||
@@ -213,96 +178,69 @@ pub(crate) struct S3TimelineBlobData {
|
||||
pub(crate) enum BlobDataParseResult {
|
||||
Parsed {
|
||||
index_part: IndexPart,
|
||||
index_part_generation: Generation,
|
||||
s3_layers: HashSet<(LayerFileName, Generation)>,
|
||||
s3_layers: HashSet<LayerFileName>,
|
||||
},
|
||||
Incorrect(Vec<String>),
|
||||
}
|
||||
|
||||
fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> {
|
||||
match name.rsplit_once('-') {
|
||||
// FIXME: this is gross, just use a regex?
|
||||
Some((layer_filename, gen)) if gen.len() == 8 => {
|
||||
let layer = layer_filename.parse::<LayerFileName>()?;
|
||||
let gen =
|
||||
Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?;
|
||||
Ok((layer, gen))
|
||||
}
|
||||
_ => Ok((name.parse::<LayerFileName>()?, Generation::none())),
|
||||
}
|
||||
}
|
||||
|
||||
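`parse_layer_object_name` above splits on the last '-' and accepts an 8-character generation suffix, falling back to a generation-less name for pre-generation objects. The string-level part, isolated:

```rust
/// Split "name-XXXXXXXX" into (name, Some(suffix)); anything else is a
/// plain layer name from before generation suffixes existed.
fn split_generation_suffix(object_name: &str) -> (&str, Option<&str>) {
    match object_name.rsplit_once('-') {
        Some((layer, suffix)) if suffix.len() == 8 => (layer, Some(suffix)),
        _ => (object_name, None),
    }
}
```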
pub(crate) async fn list_timeline_blobs(
|
||||
s3_client: &Client,
|
||||
id: TenantTimelineId,
|
||||
s3_root: &RootTarget,
|
||||
) -> anyhow::Result<S3TimelineBlobData> {
|
||||
let mut s3_layers = HashSet::new();
|
||||
let mut index_part_object = None;
|
||||
|
||||
let timeline_dir_target = s3_root.timeline_root(&id);
|
||||
let mut continuation_token = None;
|
||||
|
||||
let mut errors = Vec::new();
|
||||
let mut keys_to_remove = Vec::new();
|
||||
|
||||
let mut timeline_dir_target = s3_root.timeline_root(&id);
|
||||
timeline_dir_target.delimiter = String::new();
|
||||
loop {
|
||||
let fetch_response =
|
||||
list_objects_with_retries(s3_client, &timeline_dir_target, continuation_token.clone())
|
||||
.await?;
|
||||
|
||||
let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
|
||||
let subdirectories = fetch_response.common_prefixes().unwrap_or_default();
|
||||
if !subdirectories.is_empty() {
|
||||
errors.push(format!(
|
||||
"S3 list response should not contain any subdirectories, but got {subdirectories:?}"
|
||||
));
|
||||
}
|
||||
|
||||
let stream = stream_listing(s3_client, &timeline_dir_target);
|
||||
pin_mut!(stream);
|
||||
while let Some(obj) = stream.next().await {
|
||||
let obj = obj?;
|
||||
let key = match obj.key() {
|
||||
Some(k) => k,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
|
||||
match blob_name {
|
||||
Some(name) if name.starts_with("index_part.json") => {
|
||||
tracing::info!("Index key {key}");
|
||||
index_parts.push(obj)
|
||||
}
|
||||
Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
|
||||
Ok((new_layer, gen)) => {
|
||||
tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
|
||||
s3_layers.insert((new_layer, gen));
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::info!("Error parsing key {maybe_layer_name}");
|
||||
errors.push(
|
||||
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
|
||||
);
|
||||
for (object, key) in fetch_response
|
||||
.contents()
|
||||
.unwrap_or_default()
|
||||
.iter()
|
||||
.filter_map(|object| Some((object, object.key()?)))
|
||||
{
|
||||
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
|
||||
match blob_name {
|
||||
Some("index_part.json") => index_part_object = Some(object.clone()),
|
||||
Some(maybe_layer_name) => match maybe_layer_name.parse::<LayerFileName>() {
|
||||
Ok(new_layer) => {
|
||||
s3_layers.insert(new_layer);
|
||||
}
|
||||
Err(e) => {
|
||||
errors.push(
|
||||
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
|
||||
);
|
||||
keys_to_remove.push(key.to_string());
|
||||
}
|
||||
},
|
||||
None => {
|
||||
errors.push(format!("S3 list response got an object with odd key {key}"));
|
||||
keys_to_remove.push(key.to_string());
|
||||
}
|
||||
},
|
||||
None => {
|
||||
tracing::info!("Peculiar key {}", key);
|
||||
errors.push(format!("S3 list response got an object with odd key {key}"));
|
||||
keys_to_remove.push(key.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Choose the index_part with the highest generation
|
||||
let (index_part_object, index_part_generation) = match index_parts
|
||||
.iter()
|
||||
.filter_map(|k| {
|
||||
let key = k.key().unwrap();
|
||||
// Stripping the index key to the last part, because RemotePath doesn't
|
||||
// like absolute paths, and depending on prefix_in_bucket it's possible
|
||||
// for the keys we read back to start with a slash.
|
||||
let basename = key.rsplit_once('/').unwrap().1;
|
||||
parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g))
|
||||
})
|
||||
.max_by_key(|i| i.1)
|
||||
.map(|(k, g)| (k.clone(), g))
|
||||
{
|
||||
Some((key, gen)) => (Some(key), gen),
|
||||
None => {
|
||||
// Legacy/missing case: one or zero index parts, which did not have a generation
|
||||
(index_parts.pop(), Generation::none())
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
if index_part_object.is_none() {
|
||||
errors.push("S3 list response got no index_part.json file".to_string());
|
||||
@@ -323,7 +261,6 @@ pub(crate) async fn list_timeline_blobs(
|
||||
return Ok(S3TimelineBlobData {
|
||||
blob_data: BlobDataParseResult::Parsed {
|
||||
index_part,
|
||||
index_part_generation,
|
||||
s3_layers,
|
||||
},
|
||||
keys_to_remove,
|
||||
|
||||
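The deleted half of this hunk is the interesting part: when several suffixed index_part.json objects coexist, the scrubber keeps the one with the highest generation and falls back to a suffix-less legacy object. A standalone sketch of that selection rule, again assuming hex suffixes; pick_latest_index is a hypothetical helper, not the real scrubber code:

// Sketch: from a list of index_part object keys, choose the key with the
// highest generation suffix; keys without a suffix count as generation 0
// (the legacy case).
fn pick_latest_index(keys: &[&str]) -> Option<String> {
    keys.iter()
        .map(|k| {
            let gen = k
                .rsplit_once('-')
                .and_then(|(_, s)| u32::from_str_radix(s, 16).ok())
                .unwrap_or(0);
            (gen, *k)
        })
        .max_by_key(|(gen, _)| *gen)
        .map(|(_, k)| k.to_string())
}

fn main() {
    let keys = ["index_part.json", "index_part.json-00000002", "index_part.json-0000000a"];
    // 0x0a = 10 is the highest generation, so that key wins.
    assert_eq!(pick_latest_index(&keys).as_deref(), Some("index_part.json-0000000a"));
}
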
@@ -34,9 +34,6 @@ const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
 #[derive(Debug, Clone)]
 pub struct S3Target {
     pub bucket_name: String,
-    /// This `prefix_in_bucket` is only equal to the PS/SK config of the same
-    /// name for the RootTarget: other instances of S3Target will have prefix_in_bucket
-    /// with extra parts.
     pub prefix_in_bucket: String,
     pub delimiter: String,
 }

@@ -80,13 +77,9 @@ impl Display for NodeKind {
 impl S3Target {
     pub fn with_sub_segment(&self, new_segment: &str) -> Self {
         let mut new_self = self.clone();
-        if new_self.prefix_in_bucket.is_empty() {
-            new_self.prefix_in_bucket = format!("/{}/", new_segment);
-        } else {
-            let _ = new_self.prefix_in_bucket.pop();
-            new_self.prefix_in_bucket =
-                [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
-        }
+        let _ = new_self.prefix_in_bucket.pop();
+        new_self.prefix_in_bucket =
+            [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
         new_self
     }
 }

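The join trick above is worth unpacking: appending an empty string to the segment list makes join() emit one trailing delimiter, so prefixes always end in '/'. A tiny standalone sketch of the same technique, using plain functions rather than the scrubber's types:

// Sketch: extend an S3 prefix that ends in a delimiter with one more
// path segment, keeping the trailing delimiter.
fn with_sub_segment(prefix: &str, segment: &str, delimiter: &str) -> String {
    let mut prefix = prefix.to_string();
    // Drop the trailing delimiter before re-joining.
    prefix.pop();
    // The trailing "" makes join() append one more delimiter at the end.
    [prefix.as_str(), segment, ""].join(delimiter)
}

fn main() {
    assert_eq!(with_sub_segment("pageserver/v1/", "tenants", "/"), "pageserver/v1/tenants/");
}
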
@@ -98,10 +91,10 @@ pub enum RootTarget {
 }
 
 impl RootTarget {
-    pub fn tenants_root(&self) -> S3Target {
+    pub fn tenants_root(&self) -> &S3Target {
         match self {
-            Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
-            Self::Safekeeper(root) => root.with_sub_segment("wal"),
+            Self::Pageserver(root) => root,
+            Self::Safekeeper(root) => root,
         }
     }

@@ -140,7 +133,6 @@ impl RootTarget {
 pub struct BucketConfig {
     pub region: String,
     pub bucket: String,
-    pub prefix_in_bucket: Option<String>,
 
     /// Use SSO if this is set, else rely on AWS_* environment vars
     pub sso_account_id: Option<String>,

@@ -163,12 +155,10 @@ impl BucketConfig {
         let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
         let region = env::var("REGION").context("'REGION' param retrieval")?;
         let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
-        let prefix_in_bucket = env::var("BUCKET_PREFIX").ok();
 
         Ok(Self {
             region,
             bucket,
-            prefix_in_bucket,
             sso_account_id,
         })
     }

@@ -201,14 +191,14 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
         .with_target(false)
         .with_ansi(false)
         .with_writer(file_writer);
-    let stderr_logs = fmt::Layer::new()
-        .with_ansi(std::io::stderr().is_terminal())
+    let stdout_logs = fmt::Layer::new()
+        .with_ansi(std::io::stdout().is_terminal())
         .with_target(false)
-        .with_writer(std::io::stderr);
+        .with_writer(std::io::stdout);
     tracing_subscriber::registry()
         .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
         .with(file_logs)
-        .with(stderr_logs)
+        .with(stdout_logs)
         .init();
 
     guard

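Both sides of this hunk use the same layered-subscriber pattern: one fmt layer per sink, composed on a registry behind an env filter. A minimal runnable sketch of that pattern; the crate versions in the comment are an assumption, not pinned by the diff:

// Sketch: tracing_subscriber registry with an env-filtered stdout layer.
// Cargo deps assumed: tracing = "0.1", tracing-subscriber = { version = "0.3",
// features = ["env-filter"] }.
use tracing_subscriber::{fmt, layer::SubscriberExt, util::SubscriberInitExt, EnvFilter};

fn main() {
    let stdout_logs = fmt::layer()
        .with_target(false)
        .with_writer(std::io::stdout);

    tracing_subscriber::registry()
        // RUST_LOG controls verbosity; default to "info" when unset.
        .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
        .with(stdout_logs)
        .init();

    tracing::info!("logging initialized");
}
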
@@ -260,20 +250,15 @@ fn init_remote(
     let bucket_region = Region::new(bucket_config.region);
     let delimiter = "/".to_string();
     let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
 
     let s3_root = match node_kind {
         NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
             bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config
-                .prefix_in_bucket
-                .unwrap_or("pageserver/v1".to_string()),
+            prefix_in_bucket: ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""].join(&delimiter),
             delimiter,
         }),
         NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
             bucket_name: bucket_config.bucket,
-            prefix_in_bucket: bucket_config
-                .prefix_in_bucket
-                .unwrap_or("safekeeper/v1".to_string()),
+            prefix_in_bucket: ["safekeeper", "v1", "wal", ""].join(&delimiter),
             delimiter,
         }),
     };

@@ -31,10 +31,7 @@ enum Command {
         #[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
         mode: PurgeMode,
     },
-    ScanMetadata {
-        #[arg(short, long, default_value_t = false)]
-        json: bool,
-    },
+    ScanMetadata {},
 }

 #[tokio::main]
@@ -57,17 +54,13 @@ async fn main() -> anyhow::Result<()> {
     ));
 
     match cli.command {
-        Command::ScanMetadata { json } => match scan_metadata(bucket_config).await {
+        Command::ScanMetadata {} => match scan_metadata(bucket_config).await {
             Err(e) => {
                 tracing::error!("Failed: {e}");
                 Err(e)
             }
             Ok(summary) => {
-                if json {
-                    println!("{}", serde_json::to_string(&summary).unwrap())
-                } else {
-                    println!("{}", summary.summary_string());
-                }
+                println!("{}", summary.summary_string());
                 if summary.is_fatal() {
                     Err(anyhow::anyhow!("Fatal scrub errors detected"))
                 } else {

@@ -13,10 +13,10 @@ pub fn stream_tenants<'a>(
 ) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
     try_stream! {
         let mut continuation_token = None;
-        let tenants_target = target.tenants_root();
         loop {
+            let tenants_target = target.tenants_root();
             let fetch_response =
-                list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?;
+                list_objects_with_retries(s3_client, tenants_target, continuation_token.clone()).await?;
 
             let new_entry_ids = fetch_response
                 .common_prefixes()

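Both sides of the hunk wrap continuation-token pagination in try_stream!. A runnable sketch of that pattern, with Page and list_page as stand-ins for the aws-sdk list call; crate versions are assumed, not pinned by this diff:

// Sketch: wrap a paginated list API into a Stream using async-stream.
// Cargo deps assumed: anyhow = "1", async-stream = "0.3", futures = "0.3",
// tokio = { version = "1", features = ["full"] }.
use async_stream::try_stream;
use futures::{pin_mut, Stream, StreamExt};

// Stand-in for one page of an S3 ListObjectsV2 response.
struct Page {
    entries: Vec<String>,
    next_token: Option<u32>,
}

// Stand-in for the paginated API call.
async fn list_page(token: Option<u32>) -> anyhow::Result<Page> {
    Ok(match token {
        None => Page { entries: vec!["a".into()], next_token: Some(1) },
        Some(_) => Page { entries: vec!["b".into()], next_token: None },
    })
}

fn stream_entries() -> impl Stream<Item = anyhow::Result<String>> {
    try_stream! {
        let mut token = None;
        loop {
            let page = list_page(token).await?;
            for e in page.entries {
                yield e;
            }
            match page.next_token {
                Some(t) => token = Some(t),
                None => break,
            }
        }
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let s = stream_entries();
    pin_mut!(s);
    while let Some(entry) = s.next().await {
        println!("{}", entry?);
    }
    Ok(())
}
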
@@ -10,10 +10,8 @@ use aws_sdk_s3::Client;
 use futures_util::{pin_mut, StreamExt, TryStreamExt};
 use histogram::Histogram;
 use pageserver::tenant::IndexPart;
-use serde::Serialize;
 use utils::id::TenantTimelineId;
 
-#[derive(Serialize)]
 pub struct MetadataSummary {
     count: usize,
     with_errors: HashSet<TenantTimelineId>,

@@ -27,9 +25,7 @@ pub struct MetadataSummary {
 }
 
 /// A histogram plus minimum and maximum tracking
-#[derive(Serialize)]
 struct MinMaxHisto {
-    #[serde(skip)]
     histo: Histogram,
     min: u64,
    max: u64,

@@ -113,7 +109,6 @@ impl MetadataSummary {
         self.count += 1;
         if let BlobDataParseResult::Parsed {
             index_part,
-            index_part_generation: _,
             s3_layers: _,
         } = &data.blob_data
         {

@@ -47,7 +47,6 @@ pq_proto.workspace = true
 remote_storage.workspace = true
 safekeeper_api.workspace = true
 storage_broker.workspace = true
-tokio-stream.workspace = true
 utils.workspace = true
 
 workspace_hack.workspace = true

@@ -5,7 +5,6 @@ use std::fs::DirEntry;
 use std::io::BufReader;
 use std::io::Read;
 use std::path::PathBuf;
-use std::sync::Arc;
 
 use anyhow::Result;
 use camino::Utf8Path;

@@ -29,7 +28,7 @@ use crate::send_wal::WalSenderState;
 use crate::GlobalTimelines;
 
 /// Various filters that influence the resulting JSON output.
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct Args {
     /// Dump all available safekeeper state. False by default.
     pub dump_all: bool,

@@ -54,76 +53,15 @@ pub struct Args {
 }
 
 /// Response for debug dump request.
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Deserialize)]
 pub struct Response {
     pub start_time: DateTime<Utc>,
     pub finish_time: DateTime<Utc>,
-    pub timelines: Vec<TimelineDumpSer>,
+    pub timelines: Vec<Timeline>,
     pub timelines_count: usize,
     pub config: Config,
 }
 
-pub struct TimelineDumpSer {
-    pub tli: Arc<crate::timeline::Timeline>,
-    pub args: Args,
-    pub runtime: Arc<tokio::runtime::Runtime>,
-}
-
-impl std::fmt::Debug for TimelineDumpSer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("TimelineDumpSer")
-            .field("tli", &self.tli.ttid)
-            .field("args", &self.args)
-            .finish()
-    }
-}
-
-impl Serialize for TimelineDumpSer {
-    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        let dump = self
-            .runtime
-            .block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
-        dump.serialize(serializer)
-    }
-}
-
-async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
-    let control_file = if args.dump_control_file {
-        let mut state = timeline.get_state().await.1;
-        if !args.dump_term_history {
-            state.acceptor_state.term_history = TermHistory(vec![]);
-        }
-        Some(state)
-    } else {
-        None
-    };
-
-    let memory = if args.dump_memory {
-        Some(timeline.memory_dump().await)
-    } else {
-        None
-    };
-
-    let disk_content = if args.dump_disk_content {
-        // build_disk_content can fail, but we don't want to fail the whole
-        // request because of that.
-        build_disk_content(&timeline.timeline_dir).ok()
-    } else {
-        None
-    };
-
-    Timeline {
-        tenant_id: timeline.ttid.tenant_id,
-        timeline_id: timeline.ttid.timeline_id,
-        control_file,
-        memory,
-        disk_content,
-    }
-}
-
 /// Safekeeper configuration.
 #[derive(Debug, Serialize, Deserialize)]
 pub struct Config {

@@ -202,12 +140,8 @@ pub async fn build(args: Args) -> Result<Response> {
         GlobalTimelines::get_all()
     };
 
+    // TODO: return Stream instead of Vec
     let mut timelines = Vec::new();
-    let runtime = Arc::new(
-        tokio::runtime::Builder::new_current_thread()
-            .build()
-            .unwrap(),
-    );
     for tli in ptrs_snapshot {
         let ttid = tli.ttid;
         if let Some(tenant_id) = args.tenant_id {

@@ -221,11 +155,38 @@ pub async fn build(args: Args) -> Result<Response> {
             }
         }
 
-        timelines.push(TimelineDumpSer {
-            tli,
-            args: args.clone(),
-            runtime: runtime.clone(),
-        });
+        let control_file = if args.dump_control_file {
+            let mut state = tli.get_state().await.1;
+            if !args.dump_term_history {
+                state.acceptor_state.term_history = TermHistory(vec![]);
+            }
+            Some(state)
+        } else {
+            None
+        };
+
+        let memory = if args.dump_memory {
+            Some(tli.memory_dump().await)
+        } else {
+            None
+        };
+
+        let disk_content = if args.dump_disk_content {
+            // build_disk_content can fail, but we don't want to fail the whole
+            // request because of that.
+            build_disk_content(&tli.timeline_dir).ok()
+        } else {
+            None
+        };
+
+        let timeline = Timeline {
+            tenant_id: ttid.tenant_id,
+            timeline_id: ttid.timeline_id,
+            control_file,
+            memory,
+            disk_content,
+        };
+        timelines.push(timeline);
     }
 
     let config = GlobalTimelines::get_global_config();

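The deleted TimelineDumpSer is the noteworthy trick here: Serialize is implemented by blocking on an async builder at serialization time, so each dump is produced only when the response body is actually written. A self-contained sketch of that pattern; LazyDump and build_dump are illustrative names, and the crate versions are assumptions:

// Sketch: serde::Serialize impl that produces its payload lazily by running
// an async builder on a dedicated current-thread runtime at serialization time.
// Cargo deps assumed: serde = { version = "1", features = ["derive"] },
// serde_json = "1", tokio = { version = "1", features = ["rt"] }.
use std::sync::Arc;

use serde::{Serialize, Serializer};

#[derive(Serialize)]
struct Dump {
    answer: u32,
}

async fn build_dump(seed: u32) -> Dump {
    // Imagine awaits on timeline state here.
    Dump { answer: seed + 1 }
}

struct LazyDump {
    seed: u32,
    runtime: Arc<tokio::runtime::Runtime>,
}

impl Serialize for LazyDump {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        // Block on the async build only when the value is actually serialized.
        // (Safe here because serialization happens off the async runtime.)
        let dump = self.runtime.block_on(build_dump(self.seed));
        dump.serialize(serializer)
    }
}

fn main() {
    let runtime = Arc::new(
        tokio::runtime::Builder::new_current_thread()
            .build()
            .unwrap(),
    );
    let lazy = LazyDump { seed: 41, runtime };
    println!("{}", serde_json::to_string(&lazy).unwrap());
}
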
@@ -13,12 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
 use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
 use tokio::fs::File;
 use tokio::io::AsyncReadExt;
 
-use std::io::Write as _;
-use tokio::sync::mpsc;
-use tokio_stream::wrappers::ReceiverStream;
-use tracing::info_span;
-use utils::http::endpoint::{request_span, ChannelWriter};
+use utils::http::endpoint::request_span;
 
 use crate::receive_wal::WalReceiverState;
 use crate::safekeeper::Term;

@@ -378,52 +373,8 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
         .await
         .map_err(ApiError::InternalServerError)?;
 
-    let started_at = std::time::Instant::now();
-
-    let (tx, rx) = mpsc::channel(1);
-
-    let body = Body::wrap_stream(ReceiverStream::new(rx));
-
-    let mut writer = ChannelWriter::new(128 * 1024, tx);
-
-    let response = Response::builder()
-        .status(200)
-        .header(hyper::header::CONTENT_TYPE, "application/octet-stream")
-        .body(body)
-        .unwrap();
-
-    let span = info_span!("blocking");
-    tokio::task::spawn_blocking(move || {
-        let _span = span.entered();
-
-        let res = serde_json::to_writer(&mut writer, &resp)
-            .map_err(std::io::Error::from)
-            .and_then(|_| writer.flush());
-
-        match res {
-            Ok(()) => {
-                tracing::info!(
-                    bytes = writer.flushed_bytes(),
-                    elapsed_ms = started_at.elapsed().as_millis(),
-                    "responded /v1/debug_dump"
-                );
-            }
-            Err(e) => {
-                tracing::warn!("failed to write out /v1/debug_dump response: {e:#}");
-                // semantics of this error are quite... unclear. we want to error the stream out to
-                // abort the response to somehow notify the client that we failed.
-                //
-                // though, most likely the reason for failure is that the receiver is already gone.
-                drop(
-                    writer
-                        .tx
-                        .blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
-                );
-            }
-        }
-    });
-
-    Ok(response)
+    // TODO: use streaming response
+    json_response(StatusCode::OK, resp)
 }
 
 /// Safekeeper http router.

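The deleted handler streams a large JSON body by serializing on a blocking thread into a channel-backed std::io::Write, so the dump is never buffered whole. A runnable sketch of the same idea without hyper; this ChannelWriter is a simplified stand-in for the utils::http::endpoint::ChannelWriter the handler used, and the consumer loop stands in for the HTTP response body:

// Sketch: stream a large serde_json payload through a bounded channel
// instead of buffering it. Cargo deps assumed: serde_json = "1",
// tokio = { version = "1", features = ["full"] }.
use std::io::Write;

use tokio::sync::mpsc;

// A std::io::Write that ships buffered chunks over a tokio channel.
struct ChannelWriter {
    buf: Vec<u8>,
    cap: usize,
    tx: mpsc::Sender<Vec<u8>>,
}

impl Write for ChannelWriter {
    fn write(&mut self, data: &[u8]) -> std::io::Result<usize> {
        self.buf.extend_from_slice(data);
        if self.buf.len() >= self.cap {
            self.flush()?;
        }
        Ok(data.len())
    }
    fn flush(&mut self) -> std::io::Result<()> {
        if !self.buf.is_empty() {
            let chunk = std::mem::take(&mut self.buf);
            // blocking_send is fine: serialization runs on a blocking thread.
            self.tx
                .blocking_send(chunk)
                .map_err(|_| std::io::ErrorKind::BrokenPipe)?;
        }
        Ok(())
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel(1);
    // Serialize on a blocking thread so the writer may block on a full channel.
    let producer = tokio::task::spawn_blocking(move || {
        let mut w = ChannelWriter { buf: Vec::new(), cap: 8 * 1024, tx };
        serde_json::to_writer(&mut w, &vec![1u32; 100_000]).unwrap();
        w.flush().unwrap();
    });
    // The consumer would normally be an HTTP response body.
    let mut total = 0;
    while let Some(chunk) = rx.recv().await {
        total += chunk.len();
    }
    producer.await.unwrap();
    println!("streamed {total} bytes");
}
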
@@ -1,4 +1,3 @@
-use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
 
 use anyhow::{bail, Context, Result};

@@ -33,16 +32,6 @@ pub struct Response {
     // TODO: add more fields?
 }
 
-/// Response for debug dump request.
-#[derive(Debug, Serialize, Deserialize)]
-pub struct DebugDumpResponse {
-    pub start_time: DateTime<Utc>,
-    pub finish_time: DateTime<Utc>,
-    pub timelines: Vec<debug_dump::Timeline>,
-    pub timelines_count: usize,
-    pub config: debug_dump::Config,
-}
-
 /// Find the most advanced safekeeper and pull timeline from it.
 pub async fn handle_request(request: Request) -> Result<Response> {
     let existing_tli = GlobalTimelines::get(TenantTimelineId::new(

@@ -114,7 +103,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
 
     // Implementing our own scp over HTTP.
     // At first, we need to fetch list of files from safekeeper.
-    let dump: DebugDumpResponse = client
+    let dump: debug_dump::Response = client
         .get(format!(
             "{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}",
             host, status.tenant_id, status.timeline_id

@@ -81,6 +81,7 @@ FALLBACK_DURATION = {
     "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
     "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
     "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
+    "test_runner/performance/test_startup.py::test_startup": 890.114,
     "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
     "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
     "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,

@@ -2868,7 +2868,7 @@ class SafekeeperHttpClient(requests.Session):
        params = params or {}
        res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
        res.raise_for_status()
-        res_json = json.loads(res.text)
+        res_json = res.json()
        assert isinstance(res_json, dict)
        return res_json

@@ -2968,33 +2968,24 @@ class S3Scrubber:
         self.env = env
         self.log_dir = log_dir
 
-    def scrubber_cli(self, args: list[str], timeout) -> str:
+    def scrubber_cli(self, args, timeout):
         assert isinstance(self.env.pageserver_remote_storage, S3Storage)
         s3_storage = self.env.pageserver_remote_storage
 
         env = {
             "REGION": s3_storage.bucket_region,
             "BUCKET": s3_storage.bucket_name,
             "BUCKET_PREFIX": s3_storage.prefix_in_bucket,
             "RUST_LOG": "DEBUG",
         }
         env.update(s3_storage.access_env_vars())
 
         if s3_storage.endpoint is not None:
             env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
 
-        base_args = [str(self.env.neon_binpath / "s3_scrubber")]
+        base_args = [self.env.neon_binpath / "s3_scrubber"]
         args = base_args + args
 
-        (output_path, stdout, status_code) = subprocess_capture(
-            self.log_dir,
-            args,
-            echo_stderr=True,
-            echo_stdout=True,
-            env=env,
-            check=False,
-            capture_stdout=True,
-            timeout=timeout,
+        (output_path, _, status_code) = subprocess_capture(
+            self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
         )
         if status_code:
             log.warning(f"Scrub command {args} failed")

@@ -3003,18 +2994,8 @@ class S3Scrubber:
 
             raise RuntimeError("Remote storage scrub failed")
 
-        assert stdout is not None
-        return stdout
-
-    def scan_metadata(self) -> Any:
-        stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
-
-        try:
-            return json.loads(stdout)
-        except:
-            log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
-            log.error(stdout)
-            raise
+    def scan_metadata(self):
+        self.scrubber_cli(["scan-metadata"], timeout=30)
 
 
 def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:

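On the fixture side, the deleted scan_metadata amounts to: run the scrubber CLI, capture stdout, parse it as JSON, and surface the raw output when parsing fails. A Rust sketch of that flow, with "echo" standing in for the real s3_scrubber invocation (serde_json version assumed):

// Sketch: run a CLI, capture stdout, and parse it as JSON, mirroring the
// deleted scan_metadata helper. Cargo deps assumed: serde_json = "1".
use std::process::Command;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let output = Command::new("echo")
        .arg(r#"{"count": 1, "with_errors": []}"#)
        .output()?;
    if !output.status.success() {
        return Err("scrub command failed".into());
    }
    let stdout = String::from_utf8(output.stdout)?;
    // Parse eagerly so a malformed payload fails loudly, with the raw
    // output still in hand for logging.
    let summary: serde_json::Value = serde_json::from_str(&stdout)?;
    println!("count = {}", summary["count"]);
    Ok(())
}
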
@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
         # this has been seen in the wild by tests with the below contradicting logging
         # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
         # this seems like a mock_s3 issue
-        log.warning(
+        log.warn(
             f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
         )
         keys = 0

@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
         # this has been seen in one case with mock_s3:
         # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
         # looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
-        log.warning(
+        log.warn(
             f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
         )

@@ -35,7 +35,6 @@ def subprocess_capture(
     echo_stderr=False,
     echo_stdout=False,
     capture_stdout=False,
-    timeout=None,
     **kwargs: Any,
 ) -> Tuple[str, Optional[str], int]:
     """Run a process and bifurcate its output to files and the `log` logger

@@ -105,7 +104,7 @@ def subprocess_capture(
     stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
     stderr_handler.start()
 
-    r = p.wait(timeout=timeout)
+    r = p.wait()
 
     stdout_handler.join()
     stderr_handler.join()

@@ -1,10 +1,8 @@
 from contextlib import closing
 
-from fixtures.benchmark_fixture import MetricReport
 from fixtures.compare_fixtures import NeonCompare, PgCompare
 from fixtures.pageserver.utils import wait_tenant_status_404
 from fixtures.pg_version import PgVersion
-from fixtures.types import Lsn
 
 
 #

@@ -20,8 +18,6 @@ from fixtures.types import Lsn
 def test_bulk_insert(neon_with_baseline: PgCompare):
     env = neon_with_baseline
 
-    start_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
-
     with closing(env.pg.connect()) as conn:
         with conn.cursor() as cur:
             cur.execute("create table huge (i int, j int);")

@@ -35,13 +31,6 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
     env.report_peak_memory_use()
     env.report_size()
 
-    # Report amount of wal written. Useful for comparing vanilla wal format vs
-    # neon wal format, measuring neon write amplification, etc.
-    end_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
-    wal_written_bytes = end_lsn - start_lsn
-    wal_written_mb = round(wal_written_bytes / (1024 * 1024))
-    env.zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
-
     # When testing neon, also check how long it takes the pageserver to reingest the
     # wal from safekeepers. If this number is close to total runtime, then the pageserver
     # is the bottleneck.

@@ -1,3 +1,6 @@
+from contextlib import closing
+
+import pytest
 import requests
 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder

@@ -78,3 +81,49 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
 
     # Imitate optimizations that console would do for the second start
     endpoint.respec(skip_pg_catalog_updates=True)
+
+
+# This test sometimes runs for longer than the global 5 minute timeout.
+@pytest.mark.timeout(900)
+def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    # Start
+    env.neon_cli.create_branch("test_startup")
+    with zenbenchmark.record_duration("startup_time"):
+        endpoint = env.endpoints.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
+
+    # Restart
+    endpoint.stop_and_destroy()
+    with zenbenchmark.record_duration("restart_time"):
+        endpoint.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
+
+    # Fill up
+    num_rows = 1000000  # 30 MB
+    num_tables = 100
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            for i in range(num_tables):
+                cur.execute(f"create table t_{i} (i integer);")
+                cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));")
+
+    # Read
+    with zenbenchmark.record_duration("read_time"):
+        endpoint.safe_psql("select * from t_0;")
+
+    # Read again
+    with zenbenchmark.record_duration("second_read_time"):
+        endpoint.safe_psql("select * from t_0;")
+
+    # Restart
+    endpoint.stop_and_destroy()
+    with zenbenchmark.record_duration("restart_with_data"):
+        endpoint.create_start("test_startup")
+        endpoint.safe_psql("select 1;")
+
+    # Read
+    with zenbenchmark.record_duration("read_after_restart"):
+        endpoint.safe_psql("select * from t_0;")

@@ -150,9 +150,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
     "compaction_target_size": 1048576,
     "checkpoint_distance": 10000,
     "checkpoint_timeout": "13m",
-    "compaction_algorithm": {
-        "kind": "Tiered",
-    },
     "eviction_policy": {
         "kind": "LayerAccessThreshold",
         "period": "20s",

@@ -72,7 +72,7 @@ class DdlForwardingContext:
         self.dbs: Dict[str, str] = {}
         self.roles: Dict[str, str] = {}
         self.fail = False
-        endpoint = "/test/roles_and_databases"
+        endpoint = "/management/api/v2/roles_and_databases"
         ddl_url = f"http://{host}:{port}{endpoint}"
         self.pg.configure(
             [

@@ -1,14 +1,11 @@
 import time
 
-import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnv,
     logical_replication_sync,
-    wait_for_last_flush_lsn,
 )
-from fixtures.types import Lsn
 from fixtures.utils import query_scalar
 
 
 def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):

@@ -150,89 +147,3 @@ COMMIT;
     endpoint.start()
     # it must be gone (but walproposer slot still exists, hence 1)
     assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1
-
-
-# Test compute start at LSN page of which starts with contrecord
-# https://github.com/neondatabase/neon/issues/5749
-def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
-    env = neon_simple_env
-
-    env.neon_cli.create_branch("init")
-    endpoint = env.endpoints.create_start("init")
-    tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
-    timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
-
-    cur = endpoint.connect().cursor()
-    cur.execute("create table t(key int, value text)")
-    cur.execute("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
-    cur.execute("insert into replication_example values (1, 2)")
-    cur.execute("create publication pub1 for table replication_example")
-
-    # now start subscriber
-    vanilla_pg.start()
-    vanilla_pg.safe_psql("create table t(pk integer primary key, value text)")
-    vanilla_pg.safe_psql("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
-
-    log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
-    connstr = endpoint.connstr().replace("'", "''")
-    vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
-    logical_replication_sync(vanilla_pg, endpoint)
-    vanilla_pg.stop()
-
-    with endpoint.cursor() as cur:
-        # measure how much space logical message takes. Sometimes first attempt
-        # creates huge message and then it stabilizes, have no idea why.
-        for _ in range(3):
-            lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-            log.info(f"current_lsn={lsn_before}")
-            # Non-transactional logical message doesn't write WAL, only XLogInsert's
-            # it, so use transactional. Which is a bit problematic as transactional
-            # necessitates commit record. Alternatively we can do smth like
-            # select neon_xlogflush(pg_current_wal_insert_lsn());
-            # but isn't much better + that particular call complains on 'xlog flush
-            # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips
-            # page headers.
-            payload = "blahblah"
-            cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')")
-            lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-            lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before
-            logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload)
-            log.info(
-                f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}"
-            )
-
-        # and write logical message spanning exactly as we want
-        lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-        log.info(f"current_lsn={lsn_before}")
-        curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-        offs = int(curr_lsn) % 8192
-        till_page = 8192 - offs
-        payload_len = (
-            till_page - logical_message_base - 8
-        )  # not sure why 8 is here, it is deduced from experiments
-        log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}")
-
-        # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer
-        payload_len += 8
-
-        cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')")
-        supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
-        log.info(f"supposedly_page_boundary={supposedly_contrecord_end}")
-        # The calculations to hit the page boundary are very fuzzy, so just
-        # ignore test if we fail to reach it.
-        if not (int(supposedly_contrecord_end) % 8192 == 32):
-            pytest.skip("missed page boundary, bad luck")
-
-        cur.execute("insert into replication_example values (2, 3)")
-
-    wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-    endpoint.stop().start()
-
-    cur = endpoint.connect().cursor()
-    # this should flush current wal page
-    cur.execute("insert into replication_example values (3, 4)")
-    vanilla_pg.start()
-    logical_replication_sync(vanilla_pg, endpoint)
-    assert vanilla_pg.safe_psql(
-        "select sum(somedata) from replication_example"
-    ) == endpoint.safe_psql("select sum(somedata) from replication_example")

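The deleted test's fiddly part is the arithmetic: pad a logical message so the record spills just past the next 8 KiB WAL page boundary (a "contrecord"). A standalone sketch of that calculation; the fixed per-message overhead is an assumption standing in for the value the test derived experimentally, and the real test additionally accounts for the page header when checking where it landed:

// Sketch: given the current WAL position and the fixed overhead a logical
// message adds beyond its payload, compute a payload length that makes the
// record cross the next 8 KiB page boundary by 8 bytes.
const WAL_PAGE: u64 = 8192;

fn payload_len_for_contrecord(curr_lsn: u64, message_overhead: u64) -> u64 {
    let offs = curr_lsn % WAL_PAGE;
    let till_page = WAL_PAGE - offs;
    // Fill the rest of the page, then add a few bytes so the record is
    // forced to continue onto the next page. (Assumes the remaining space
    // exceeds the overhead; the real test retries/skips when it misses.)
    till_page - message_overhead + 8
}

fn main() {
    let curr_lsn = 0x0282_C018u64; // example LSN from the test's log output
    let len = payload_len_for_contrecord(curr_lsn, 72); // 72 = assumed overhead
    assert_eq!((curr_lsn % WAL_PAGE + 72 + len) % WAL_PAGE, 8);
    println!("payload of {len} bytes crosses the page boundary by 8");
}
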
@@ -21,7 +21,6 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PgBin,
-    S3Scrubber,
     last_flush_lsn_upload,
     wait_for_last_flush_lsn,
 )

@@ -235,22 +234,8 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
     assert len(suffixed_objects) > 0
     assert len(legacy_objects) > 0
 
-    # Flush through deletions to get a clean state for scrub: we are implicitly validating
-    # that our generations-enabled pageserver was able to do deletions of layers
-    # from earlier which don't have a generation.
-    env.pageserver.http_client().deletion_queue_flush(execute=True)
-
     assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
 
-    # Having written a mixture of generation-aware and legacy index_part.json,
-    # ensure the scrubber handles the situation as expected.
-    metadata_summary = S3Scrubber(
-        neon_env_builder.test_output_dir, neon_env_builder
-    ).scan_metadata()
-    assert metadata_summary["count"] == 1  # Scrubber should have seen our timeline
-    assert not metadata_summary["with_errors"]
-    assert not metadata_summary["with_warnings"]
-
 
 def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_generations = True

@@ -432,47 +432,3 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
         query(200, "BEGIN")
         pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
     assert pid1 != pid2
-
-
-@pytest.mark.timeout(60)
-def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
-    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
-
-    static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo')")
-
-    def query(status: int, query: str) -> Any:
-        return static_proxy.http_query(
-            query,
-            [],
-            user="http_auth",
-            password="http",
-            expected_code=status,
-        )
-
-    # query generates millions of rows - should hit the 10MB response limit quickly
-    response = query(
-        400,
-        "select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
-    )
-    assert "response is too large (max is 10485760 bytes)" in response["message"]
-
-
-def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy):
-    static_proxy.safe_psql("create user http_auth with password 'http' superuser")
-
-    static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo','bar','baz')")
-
-    def query(status: int, query: str) -> Any:
-        return static_proxy.http_query(
-            query,
-            [],
-            user="http_auth",
-            password="http",
-            expected_code=status,
-        )
-
-    response = query(
-        200,
-        "select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data",
-    )
-    assert response["rows"][0]["data"] == ["foo", "bar", "baz"]

vendor/postgres-v14 (vendored, 2 lines changed)
Submodule vendor/postgres-v14 updated: dd067cf656...6669a672ee

vendor/postgres-v15 (vendored, 2 lines changed)
Submodule vendor/postgres-v15 updated: bc88f53931...ab67ab9635

vendor/postgres-v16 (vendored, 2 lines changed)
Submodule vendor/postgres-v16 updated: 763000f1d0...550ffa6495

vendor/revisions.json (vendored, 6 lines changed)
@@ -1,5 +1,5 @@
 {
-    "postgres-v16": "763000f1d0873b827829c41f2f6f799ffc0de55c",
-    "postgres-v15": "bc88f539312fcc4bb292ce94ae9db09ab6656e8a",
-    "postgres-v14": "dd067cf656f6810a25aca6025633d32d02c5085a"
+    "postgres-v16": "550ffa6495a5dc62fccc3a8b449386633758680b",
+    "postgres-v15": "ab67ab96355d61e9d0218630be4aa7db53bf83e7",
+    "postgres-v14": "6669a672ee14ab2c09d44c4552f9a13fad3afc10"
 }