mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 18:40:38 +00:00
Compare commits
23 Commits
yuchen/tes
...
rc/proxy/2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cfa45ff5ee | ||
|
|
acc075071d | ||
|
|
9627747d35 | ||
|
|
63a0d0d039 | ||
|
|
793b5061ec | ||
|
|
a889a49e06 | ||
|
|
5eb7322d08 | ||
|
|
c0ba18a112 | ||
|
|
992a951b5e | ||
|
|
c5ef779801 | ||
|
|
2d10306f7a | ||
|
|
9b9f90c562 | ||
|
|
52cb33770b | ||
|
|
12850dd5e9 | ||
|
|
5d527133a3 | ||
|
|
09362b6363 | ||
|
|
7820c572e7 | ||
|
|
bf03713fa1 | ||
|
|
0f65684263 | ||
|
|
97241776aa | ||
|
|
2dd53e7ae0 | ||
|
|
d6eede515a | ||
|
|
d48229f50f |
6
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
6
.github/ISSUE_TEMPLATE/config.yml
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
|
||||
blank_issues_enabled: true
|
||||
contact_links:
|
||||
- name: Feature request
|
||||
url: https://console.neon.tech/app/projects?modal=feedback
|
||||
about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech`
|
||||
@@ -217,7 +217,9 @@ jobs:
|
||||
${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
for io_engine in std-fs tokio-epoll-uring ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
for io_buffer_alignment in 0 1 512 ; do
|
||||
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
|
||||
done
|
||||
done
|
||||
|
||||
# Run separate tests for real S3
|
||||
|
||||
14
Cargo.lock
generated
14
Cargo.lock
generated
@@ -936,6 +936,12 @@ dependencies = [
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit_field"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
@@ -3683,6 +3689,7 @@ dependencies = [
|
||||
"async-compression",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
"bit_field",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"camino",
|
||||
@@ -3732,6 +3739,7 @@ dependencies = [
|
||||
"reqwest 0.12.4",
|
||||
"rpds",
|
||||
"scopeguard",
|
||||
"send-future",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_path_to_error",
|
||||
@@ -5455,6 +5463,12 @@ version = "1.0.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed"
|
||||
|
||||
[[package]]
|
||||
name = "send-future"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"
|
||||
|
||||
[[package]]
|
||||
name = "sentry"
|
||||
version = "0.32.3"
|
||||
|
||||
@@ -65,6 +65,7 @@ axum = { version = "0.6.20", features = ["ws"] }
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.65"
|
||||
bit_field = "0.10.2"
|
||||
bstr = "1.0"
|
||||
byteorder = "1.4"
|
||||
bytes = "1.0"
|
||||
@@ -113,7 +114,7 @@ md5 = "0.7.0"
|
||||
measured = { version = "0.0.22", features=["lasso"] }
|
||||
measured-process = { version = "0.0.22" }
|
||||
memoffset = "0.8"
|
||||
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
|
||||
nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
|
||||
notify = "6.0.0"
|
||||
num_cpus = "1.15"
|
||||
num-traits = "0.2.15"
|
||||
@@ -145,6 +146,7 @@ rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sd-notify = "0.4.1"
|
||||
send-future = "0.1.0"
|
||||
sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -942,7 +942,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src
|
||||
COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
|
||||
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
|
||||
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
|
||||
COPY patches/pg_hintplan.patch /ext-src
|
||||
COPY patches/pg_hint_plan.patch /ext-src
|
||||
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
|
||||
COPY patches/pg_cron.patch /ext-src
|
||||
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src
|
||||
@@ -964,7 +964,7 @@ RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch
|
||||
RUN cd /ext-src/rum-src && patch -p1 <../rum.patch
|
||||
# cmake is required for the h3 test
|
||||
RUN apt-get update && apt-get install -y cmake
|
||||
RUN patch -p1 < /ext-src/pg_hintplan.patch
|
||||
RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch
|
||||
COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
|
||||
RUN patch -p1 </ext-src/pg_anon.patch
|
||||
RUN patch -p1 </ext-src/pg_cron.patch
|
||||
|
||||
@@ -44,6 +44,7 @@ use std::{thread, time::Duration};
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use signal_hook::consts::{SIGQUIT, SIGTERM};
|
||||
use signal_hook::{consts::SIGINT, iterator::Signals};
|
||||
use tracing::{error, info, warn};
|
||||
@@ -366,6 +367,8 @@ fn wait_spec(
|
||||
state.start_time = now;
|
||||
}
|
||||
|
||||
launch_lsn_lease_bg_task_for_static(&compute);
|
||||
|
||||
Ok(WaitSpecResult {
|
||||
compute,
|
||||
http_port,
|
||||
|
||||
@@ -11,6 +11,7 @@ pub mod logger;
|
||||
pub mod catalog;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
pub mod lsn_lease;
|
||||
mod migration;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
|
||||
186
compute_tools/src/lsn_lease.rs
Normal file
186
compute_tools/src/lsn_lease.rs
Normal file
@@ -0,0 +1,186 @@
|
||||
use anyhow::bail;
|
||||
use anyhow::Result;
|
||||
use postgres::{NoTls, SimpleQueryMessage};
|
||||
use std::time::SystemTime;
|
||||
use std::{str::FromStr, sync::Arc, thread, time::Duration};
|
||||
use utils::id::TenantId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use compute_api::spec::ComputeMode;
|
||||
use tracing::{info, warn};
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
shard::{ShardCount, ShardNumber, TenantShardId},
|
||||
};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
/// Spawns a background thread to periodically renew LSN leases for static compute.
|
||||
/// Do nothing if the compute is not in static mode.
|
||||
pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
|
||||
let (tenant_id, timeline_id, lsn) = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
let spec = state.pspec.as_ref().expect("Spec must be set");
|
||||
match spec.spec.mode {
|
||||
ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn),
|
||||
_ => return,
|
||||
}
|
||||
};
|
||||
let compute = compute.clone();
|
||||
|
||||
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
|
||||
thread::spawn(move || {
|
||||
let _entered = span.entered();
|
||||
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
|
||||
// TODO: might need stronger error feedback than logging an warning.
|
||||
warn!("Exited with error: {e}");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Renews lsn lease periodically so static compute are not affected by GC.
|
||||
fn lsn_lease_bg_task(
|
||||
compute: Arc<ComputeNode>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
loop {
|
||||
let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?;
|
||||
let valid_duration = valid_until
|
||||
.duration_since(SystemTime::now())
|
||||
.unwrap_or(Duration::ZERO);
|
||||
|
||||
// Sleep for 60 seconds less than the valid duration but no more than half of the valid duration.
|
||||
let sleep_duration = valid_duration
|
||||
.saturating_sub(Duration::from_secs(60))
|
||||
.max(valid_duration / 2);
|
||||
|
||||
info!(
|
||||
"Succeeded, sleeping for {} seconds",
|
||||
sleep_duration.as_secs()
|
||||
);
|
||||
thread::sleep(sleep_duration);
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted.
|
||||
/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests.
|
||||
fn acquire_lsn_lease_with_retry(
|
||||
compute: &Arc<ComputeNode>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<SystemTime> {
|
||||
let mut attempts = 0usize;
|
||||
let mut retry_period_ms: f64 = 500.0;
|
||||
const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0;
|
||||
|
||||
loop {
|
||||
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
|
||||
let configs = {
|
||||
let state = compute.state.lock().unwrap();
|
||||
|
||||
let spec = state.pspec.as_ref().expect("spec must be set");
|
||||
|
||||
let conn_strings = spec.pageserver_connstr.split(',');
|
||||
|
||||
conn_strings
|
||||
.map(|connstr| {
|
||||
let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr");
|
||||
if let Some(storage_auth_token) = &spec.storage_auth_token {
|
||||
info!("Got storage auth token from spec file");
|
||||
config.password(storage_auth_token.clone());
|
||||
} else {
|
||||
info!("Storage auth token not set");
|
||||
}
|
||||
config
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs);
|
||||
match result {
|
||||
Ok(Some(res)) => {
|
||||
return Ok(res);
|
||||
}
|
||||
Ok(None) => {
|
||||
bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("Failed to acquire lsn lease: {e} (attempt {attempts}");
|
||||
|
||||
thread::sleep(Duration::from_millis(retry_period_ms as u64));
|
||||
retry_period_ms *= 1.5;
|
||||
retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS);
|
||||
}
|
||||
}
|
||||
attempts += 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Tries to acquire an LSN lease through PS page_service API.
|
||||
fn try_acquire_lsn_lease(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
configs: &[postgres::Config],
|
||||
) -> Result<Option<SystemTime>> {
|
||||
fn get_valid_until(
|
||||
config: &postgres::Config,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Option<SystemTime>> {
|
||||
let mut client = config.connect(NoTls)?;
|
||||
let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn);
|
||||
let res = client.simple_query(&cmd)?;
|
||||
let msg = match res.first() {
|
||||
Some(msg) => msg,
|
||||
None => bail!("empty response"),
|
||||
};
|
||||
let row = match msg {
|
||||
SimpleQueryMessage::Row(row) => row,
|
||||
_ => bail!("error parsing lsn lease response"),
|
||||
};
|
||||
|
||||
// Note: this will be None if a lease is explicitly not granted.
|
||||
let valid_until_str = row.get("valid_until");
|
||||
|
||||
let valid_until = valid_until_str.map(|s| {
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64))
|
||||
.expect("Time larger than max SystemTime could handle")
|
||||
});
|
||||
Ok(valid_until)
|
||||
}
|
||||
|
||||
let shard_count = configs.len();
|
||||
|
||||
let valid_until = if shard_count > 1 {
|
||||
configs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(shard_number, config)| {
|
||||
let tenant_shard_id = TenantShardId {
|
||||
tenant_id,
|
||||
shard_count: ShardCount::new(shard_count as u8),
|
||||
shard_number: ShardNumber(shard_number as u8),
|
||||
};
|
||||
get_valid_until(config, tenant_shard_id, timeline_id, lsn)
|
||||
})
|
||||
.collect::<Result<Vec<Option<SystemTime>>>>()?
|
||||
.into_iter()
|
||||
.min()
|
||||
.unwrap()
|
||||
} else {
|
||||
get_valid_until(
|
||||
&configs[0],
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
timeline_id,
|
||||
lsn,
|
||||
)?
|
||||
};
|
||||
|
||||
Ok(valid_until)
|
||||
}
|
||||
@@ -41,6 +41,8 @@ enum Command {
|
||||
listen_http_addr: String,
|
||||
#[arg(long)]
|
||||
listen_http_port: u16,
|
||||
#[arg(long)]
|
||||
availability_zone_id: String,
|
||||
},
|
||||
|
||||
/// Modify a node's configuration in the storage controller
|
||||
@@ -322,6 +324,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id,
|
||||
} => {
|
||||
storcon_client
|
||||
.dispatch::<_, ()>(
|
||||
@@ -333,6 +336,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
listen_pg_port,
|
||||
listen_http_addr,
|
||||
listen_http_port,
|
||||
availability_zone_id: Some(availability_zone_id),
|
||||
}),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -3,7 +3,7 @@ set -x
|
||||
|
||||
cd /ext-src || exit 2
|
||||
FAILED=
|
||||
LIST=$( (echo "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
LIST=$( (echo -e "${SKIP//","/"\n"}"; ls -d -- *-src) | sort | uniq -u)
|
||||
for d in ${LIST}
|
||||
do
|
||||
[ -d "${d}" ] || continue
|
||||
|
||||
259
docs/rfcs/037-storage-controller-restarts.md
Normal file
259
docs/rfcs/037-storage-controller-restarts.md
Normal file
@@ -0,0 +1,259 @@
|
||||
# Rolling Storage Controller Restarts
|
||||
|
||||
## Summary
|
||||
|
||||
This RFC describes the issues around the current storage controller restart procedure
|
||||
and describes an implementation which reduces downtime to a few milliseconds on the happy path.
|
||||
|
||||
## Motivation
|
||||
|
||||
Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps.
|
||||
While the storage controller does not sit on the main data path, it's generally not acceptable
|
||||
to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034).
|
||||
|
||||
### Current Implementation
|
||||
|
||||
The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment).
|
||||
In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after,
|
||||
a new instance is created.
|
||||
|
||||
At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the
|
||||
latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds
|
||||
under unfavourable circumstances: pageservers are heavily loaded or unavailable.
|
||||
|
||||
## Prior Art
|
||||
|
||||
There's probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include:
|
||||
* Active/Standby architectures: Two or more instance of the same service run, but traffic is only routed to one of them.
|
||||
For fail-over, traffic is routed to one of the standbys (which becomes active).
|
||||
* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other
|
||||
and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs).
|
||||
|
||||
## Requirements
|
||||
|
||||
* Reduce storage controller unavailability during upgrades to milliseconds
|
||||
* Minimize the interval in which it's possible for more than one storage controller
|
||||
to issue reconciles.
|
||||
* Have one uniform implementation for restarts and upgrades
|
||||
* Fit in with the current Kubernetes deployment scheme
|
||||
|
||||
## Non Goals
|
||||
|
||||
* Implement our own consensus algorithm from scratch
|
||||
* Completely eliminate downtime storage controller downtime. Instead we aim to reduce it to the point where it looks
|
||||
like a transient error to the control plane
|
||||
|
||||
## Impacted Components
|
||||
|
||||
* storage controller
|
||||
* deployment orchestration (i.e. Ansible)
|
||||
* helm charts
|
||||
|
||||
## Terminology
|
||||
|
||||
* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up
|
||||
at start-up by quering pageservers
|
||||
* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models
|
||||
a set of replicas
|
||||
|
||||
## Implementation
|
||||
|
||||
### High Level Flow
|
||||
|
||||
At a very high level the proposed idea is to start a new storage controller instance while
|
||||
the previous one is still running and cut-over to it when it becomes ready. The new instance,
|
||||
should coordinate with the existing one and transition responsibility gracefully. While the controller
|
||||
has built in safety against split-brain situations (via generation numbers), we'd like to avoid such
|
||||
scenarios since they can lead to availability issues for tenants that underwent changes while two controllers
|
||||
were operating at the same time and require operator intervention to remedy.
|
||||
|
||||
### Kubernetes Deployment Configuration
|
||||
|
||||
On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment`
|
||||
to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.maxUnavailable=0`.
|
||||
Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not
|
||||
scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`).
|
||||
|
||||
The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section.
|
||||
|
||||
### Storage Controller Start-Up
|
||||
|
||||
This section describes the primitives required on the storage controller side and the flow of the happy path.
|
||||
|
||||
#### Database Table For Leader Synchronization
|
||||
|
||||
A new table should be added to the storage controller database for leader synchronization during startup.
|
||||
This table will always contain at most one row. The proposed name for the table is `leader` and the schema
|
||||
contains two elements:
|
||||
* `hostname`: represents the hostname for the current storage controller leader - should be addressible
|
||||
from other pods in the deployment
|
||||
* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required
|
||||
for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness)
|
||||
|
||||
Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader
|
||||
at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the
|
||||
situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation
|
||||
level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE
|
||||
READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently,
|
||||
the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits
|
||||
our needs here.
|
||||
|
||||
```
|
||||
START TRANSACTION ISOLATION LEVEL REPEATABLE READ
|
||||
UPDATE leader SET hostname=<new_hostname>, start_timestamp=<new_start_ts>
|
||||
WHERE hostname=<old_hostname>, start_timestampt=<old_start_ts>;
|
||||
```
|
||||
|
||||
If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure.
|
||||
|
||||
#### Step Down API
|
||||
|
||||
A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this
|
||||
request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs
|
||||
and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized
|
||||
snapshot of the observed state.
|
||||
|
||||
If other step down requests come in after the initial one, the request is handled and the observed state is returned (required
|
||||
for failure scenario handling - see [Handling Failures](#handling-failures)).
|
||||
|
||||
#### Graceful Restart Happy Path
|
||||
|
||||
At start-up, the first thing the storage controller does is retrieve the sole row from the new
|
||||
`leader` table. If such an entry exists, send a `/step_down` PUT API call to the current leader.
|
||||
This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the
|
||||
observed state into memory and the start-up sequence proceeds as usual, but *without* querying the
|
||||
pageservers in order to build up the observed state.
|
||||
|
||||
Before doing any reconciliations or persistence change, update the `leader` database table as described in the [Database Table For Leader Synchronization](database-table-for-leader-synchronization)
|
||||
section. If this step fails, the storage controller process exits.
|
||||
|
||||
Note that no row will exist in the `leaders` table for the first graceful restart. In that case, force update the `leader` table
|
||||
(without the WHERE clause) and perform with the pre-existing start-up procedure (i.e. build observed state by querying pageservers).
|
||||
|
||||
Summary of proposed new start-up sequence:
|
||||
1. Call `/step_down`
|
||||
2. Perform any pending database migrations
|
||||
3. Load state from database
|
||||
4. Load observed state returned in step (1) into memory
|
||||
5. Do initial heartbeat round (may be moved after 5)
|
||||
7. Mark self as leader by updating the database
|
||||
8. Reschedule and reconcile everything
|
||||
|
||||
Some things to note from the steps above:
|
||||
* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config
|
||||
calls to the pageserver and no compute notifications)
|
||||
* Ask the current leader to step down before loading state from database so we don't get a lost update
|
||||
if the transactions overlap.
|
||||
* Before loading the observed state at step (3), cross-validate against the database. If validation fails,
|
||||
fall back to asking the pageservers about their current locations.
|
||||
* Database migrations should only run **after** the previous instance steps down (or the step down times out).
|
||||
|
||||
|
||||
[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)),
|
||||
so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case.
|
||||
|
||||
### Handling Failures
|
||||
|
||||
#### Storage Controller Crash Or Restart
|
||||
|
||||
The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to
|
||||
`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing
|
||||
start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller
|
||||
exists and consistency is maintained.
|
||||
|
||||
#### Previous Leader Crashes Before New Leader Readiness
|
||||
|
||||
When the previous leader (P1) crashes before the new leader (P2) passses the readiness check, Kubernetes will
|
||||
reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1'
|
||||
(see [2]).
|
||||
|
||||
Now we have two cases to consider:
|
||||
* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated
|
||||
by Kubernetes depending on timings.
|
||||
* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes.
|
||||
The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will
|
||||
create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the incumbent.
|
||||
|
||||
[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation
|
||||
should avoid this self reference and fail the API call at the client if the persisted hostname matches
|
||||
the current one.
|
||||
|
||||
#### Previous Leader Crashes After New Leader Readiness
|
||||
|
||||
The deployment's replica sets already satisfy the deployment's replica count requirements and the
|
||||
Kubernetes deployment rollout will just clean up the dead pod.
|
||||
|
||||
#### New Leader Crashes Before Pasing Readiness Check
|
||||
|
||||
The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated
|
||||
with the new pod.
|
||||
|
||||
#### Network Partition Between New Pod and Previous Leader
|
||||
|
||||
This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down`
|
||||
API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table.
|
||||
Kubernetes will terminate P1, but there may be a brief period where both storage controller can drive reconciles.
|
||||
|
||||
### Dealing With Split Brain Scenarios
|
||||
|
||||
As we've seen in the previous section, we can end up with two storage controller running at the same time. The split brain
|
||||
duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these
|
||||
scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening.
|
||||
The rest of this section sketches some safety measure. It's likely overkill to implement all of them however.
|
||||
|
||||
### Ensure Leadership Before Producing Side Effects
|
||||
|
||||
The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane.
|
||||
Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be
|
||||
applied if they race with the database updatem, but the situation will eventually be detected. The storage controller process should terminate in these cases.
|
||||
|
||||
### Leadership Lease
|
||||
|
||||
Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership
|
||||
to be renewed periodically. Two new columns would be added to the leaders table:
|
||||
1. `last_renewed` - timestamp indicating when the lease was last renewed
|
||||
2. `lease_duration` - duration indicating the amount of time after which the lease expires
|
||||
|
||||
The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the
|
||||
same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease
|
||||
to expire before acquiring leadership if they have not succesfully received a response to the `/step_down` request.
|
||||
|
||||
### Notify Pageserver Of Storage Controller Term
|
||||
|
||||
Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader.
|
||||
Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse
|
||||
anything which contains a stale term (i.e. smaller than the current one).
|
||||
|
||||
### Observability
|
||||
|
||||
* The storage controller should expose a metric which describes it's state (`Active | WarmingUp | SteppedDown`).
|
||||
Per region alerts should be added on this metric which triggers when:
|
||||
+ no storage controller has been in the `Active` state for an extended period of time
|
||||
+ more than one storage controllers are in the `Active` state
|
||||
|
||||
* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful.
|
||||
We'd have to expose the storage controller read only database to Grafana (perhaps it is already done).
|
||||
|
||||
## Alternatives
|
||||
|
||||
### Kubernetes Leases
|
||||
|
||||
Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election.
|
||||
Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period.
|
||||
|
||||
In our case, it would work something like this:
|
||||
* `/step_down` deletes the lease or stops it from renewing
|
||||
* lease acquisition becomes part of the start-up procedure
|
||||
|
||||
The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still
|
||||
not exactly trivial to implement.
|
||||
|
||||
This approach has the benefit of baked in observability (`kubectl describe lease`), but:
|
||||
* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong.
|
||||
* More code surface than the simple "row in database" approach. Also, most of this code would be in
|
||||
a dependency not subject to code review, etc.
|
||||
* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it do
|
||||
so is not simple and complictes and the test set-up.
|
||||
|
||||
To my mind, the "row in database" approach is straightforward enough that we don't have to offload this
|
||||
to something external.
|
||||
@@ -56,6 +56,8 @@ pub struct NodeRegisterRequest {
|
||||
|
||||
pub listen_http_addr: String,
|
||||
pub listen_http_port: u16,
|
||||
|
||||
pub availability_zone_id: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
|
||||
@@ -236,6 +236,15 @@ impl Key {
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX,
|
||||
};
|
||||
/// A key slightly smaller than [`Key::MAX`] for use in layer key ranges to avoid them to be confused with L0 layers
|
||||
pub const NON_L0_MAX: Key = Key {
|
||||
field1: u8::MAX,
|
||||
field2: u32::MAX,
|
||||
field3: u32::MAX,
|
||||
field4: u32::MAX,
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX - 1,
|
||||
};
|
||||
|
||||
pub fn from_hex(s: &str) -> Result<Self> {
|
||||
if s.len() != 36 {
|
||||
|
||||
@@ -718,6 +718,7 @@ pub struct TimelineInfo {
|
||||
pub pg_version: u32,
|
||||
|
||||
pub state: TimelineState,
|
||||
pub is_archived: bool,
|
||||
|
||||
pub walreceiver_status: String,
|
||||
|
||||
@@ -1062,7 +1063,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
}
|
||||
}
|
||||
|
||||
// In the V2 protocol version, a GetPage request contains two LSN values:
|
||||
// A GetPage request contains two LSN values:
|
||||
//
|
||||
// request_lsn: Get the page version at this point in time. Lsn::Max is a special value that means
|
||||
// "get the latest version present". It's used by the primary server, which knows that no one else
|
||||
@@ -1075,7 +1076,7 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
|
||||
// request without waiting for 'request_lsn' to arrive.
|
||||
//
|
||||
// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||
// The now-defunct V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
|
||||
// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
|
||||
// 'latest' was set to true. The V2 interface was added because there was no correct way for a
|
||||
// standby to request a page at a particular non-latest LSN, and also include the
|
||||
@@ -1083,15 +1084,11 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
|
||||
// request, if the standby knows that the page hasn't been modified since, and risk getting an error
|
||||
// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
|
||||
// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
|
||||
// interface allows sending both LSNs, and let the pageserver do the right thing. There was no
|
||||
// difference in the responses between V1 and V2.
|
||||
//
|
||||
// The Request structs below reflect the V2 interface. If V1 is used, the parse function
|
||||
// maps the old format requests to the new format.
|
||||
//
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum PagestreamProtocolVersion {
|
||||
V1,
|
||||
V2,
|
||||
}
|
||||
|
||||
@@ -1230,36 +1227,17 @@ impl PagestreamFeMessage {
|
||||
bytes.into()
|
||||
}
|
||||
|
||||
pub fn parse<R: std::io::Read>(
|
||||
body: &mut R,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
) -> anyhow::Result<PagestreamFeMessage> {
|
||||
pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// these correspond to the NeonMessageTag enum in pagestore_client.h
|
||||
//
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
let msg_tag = body.read_u8()?;
|
||||
|
||||
let (request_lsn, not_modified_since) = match protocol_version {
|
||||
PagestreamProtocolVersion::V2 => (
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
Lsn::from(body.read_u64::<BigEndian>()?),
|
||||
),
|
||||
PagestreamProtocolVersion::V1 => {
|
||||
// In the old protocol, each message starts with a boolean 'latest' flag,
|
||||
// followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
|
||||
// 'not_modified_since', used in the new protocol version.
|
||||
let latest = body.read_u8()? != 0;
|
||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
if latest {
|
||||
(Lsn::MAX, request_lsn) // get latest version
|
||||
} else {
|
||||
(request_lsn, request_lsn) // get version at specified LSN
|
||||
}
|
||||
}
|
||||
};
|
||||
// these two fields are the same for every request type
|
||||
let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
let not_modified_since = Lsn::from(body.read_u64::<BigEndian>()?);
|
||||
|
||||
// The rest of the messages are the same between V1 and V2
|
||||
match msg_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
|
||||
request_lsn,
|
||||
@@ -1467,9 +1445,7 @@ mod tests {
|
||||
];
|
||||
for msg in messages {
|
||||
let bytes = msg.serialize();
|
||||
let reconstructed =
|
||||
PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
|
||||
.unwrap();
|
||||
let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
|
||||
assert!(msg == reconstructed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ arc-swap.workspace = true
|
||||
async-compression.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
bit_field.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
@@ -52,6 +53,7 @@ rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
scopeguard.workspace = true
|
||||
send-future.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
serde_path_to_error.workspace = true
|
||||
|
||||
@@ -4,7 +4,7 @@ use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
config::{defaults::DEFAULT_IO_BUFFER_ALIGNMENT, PageServerConf},
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
l0_flush::{L0FlushConfig, L0FlushGlobalState},
|
||||
page_cache,
|
||||
@@ -103,13 +103,13 @@ async fn ingest(
|
||||
batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
|
||||
if batch.len() >= BATCH_SIZE {
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch).unwrap();
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
}
|
||||
}
|
||||
if !batch.is_empty() {
|
||||
let this_batch = std::mem::take(&mut batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch);
|
||||
let serialized = SerializedBatch::from_values(this_batch).unwrap();
|
||||
layer.put_batch(serialized, &ctx).await?;
|
||||
}
|
||||
layer.freeze(lsn + 1).await;
|
||||
@@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(
|
||||
pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()),
|
||||
));
|
||||
virtual_file::init(16384, virtual_file::io_engine_for_bench());
|
||||
virtual_file::init(
|
||||
16384,
|
||||
virtual_file::io_engine_for_bench(),
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
{
|
||||
|
||||
@@ -506,6 +506,16 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// Configs io buffer alignment at runtime.
|
||||
pub async fn put_io_alignment(&self, align: usize) -> Result<()> {
|
||||
let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, uri, align)
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn get_utilization(&self) -> Result<PageserverUtilization> {
|
||||
let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
|
||||
self.get(uri)
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
use anyhow::Result;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -144,7 +145,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
// Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree.
|
||||
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
pageserver::virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let mut total_delta_layers = 0usize;
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::path::{Path, PathBuf};
|
||||
use anyhow::Result;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use clap::Subcommand;
|
||||
use pageserver::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use pageserver::context::{DownloadBehavior, RequestContext};
|
||||
use pageserver::task_mgr::TaskKind;
|
||||
use pageserver::tenant::block_io::BlockCursor;
|
||||
@@ -59,7 +60,7 @@ pub(crate) enum LayerCmd {
|
||||
|
||||
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
|
||||
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1);
|
||||
page_cache::init(100);
|
||||
let file = VirtualFile::open(path, ctx).await?;
|
||||
let file_id = page_cache::next_file_id();
|
||||
@@ -189,7 +190,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
} => {
|
||||
pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
pageserver::virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
pageserver::page_cache::init(100);
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
|
||||
@@ -20,6 +20,7 @@ use clap::{Parser, Subcommand};
|
||||
use index_part::IndexPartCmd;
|
||||
use layers::LayerCmd;
|
||||
use pageserver::{
|
||||
config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
context::{DownloadBehavior, RequestContext},
|
||||
page_cache,
|
||||
task_mgr::TaskKind,
|
||||
@@ -205,7 +206,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
|
||||
async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs);
|
||||
virtual_file::init(
|
||||
10,
|
||||
virtual_file::api::IoEngineKind::StdFs,
|
||||
DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
|
||||
@@ -58,6 +58,11 @@ pub(crate) struct Args {
|
||||
/// [`pageserver_api::models::virtual_file::IoEngineKind`].
|
||||
#[clap(long)]
|
||||
set_io_engine: Option<pageserver_api::models::virtual_file::IoEngineKind>,
|
||||
|
||||
/// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers.
|
||||
#[clap(long)]
|
||||
set_io_alignment: Option<usize>,
|
||||
|
||||
targets: Option<Vec<TenantTimelineId>>,
|
||||
}
|
||||
|
||||
@@ -124,6 +129,10 @@ async fn main_impl(
|
||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||
}
|
||||
|
||||
if let Some(align) = args.set_io_alignment {
|
||||
mgmt_api_client.put_io_alignment(align).await?;
|
||||
}
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
&mgmt_api_client,
|
||||
|
||||
39
pageserver/src/assert_u64_eq_usize.rs
Normal file
39
pageserver/src/assert_u64_eq_usize.rs
Normal file
@@ -0,0 +1,39 @@
|
||||
//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case.
|
||||
|
||||
pub(crate) const _ASSERT_U64_EQ_USIZE: () = {
|
||||
if std::mem::size_of::<usize>() != std::mem::size_of::<u64>() {
|
||||
panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information");
|
||||
}
|
||||
};
|
||||
|
||||
pub(crate) trait U64IsUsize {
|
||||
fn into_usize(self) -> usize;
|
||||
}
|
||||
|
||||
impl U64IsUsize for u64 {
|
||||
#[inline(always)]
|
||||
fn into_usize(self) -> usize {
|
||||
#[allow(clippy::let_unit_value)]
|
||||
let _ = _ASSERT_U64_EQ_USIZE;
|
||||
self as usize
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait UsizeIsU64 {
|
||||
fn into_u64(self) -> u64;
|
||||
}
|
||||
|
||||
impl UsizeIsU64 for usize {
|
||||
#[inline(always)]
|
||||
fn into_u64(self) -> u64 {
|
||||
#[allow(clippy::let_unit_value)]
|
||||
let _ = _ASSERT_U64_EQ_USIZE;
|
||||
self as u64
|
||||
}
|
||||
}
|
||||
|
||||
pub const fn u64_to_usize(x: u64) -> usize {
|
||||
#[allow(clippy::let_unit_value)]
|
||||
let _ = _ASSERT_U64_EQ_USIZE;
|
||||
x as usize
|
||||
}
|
||||
@@ -125,18 +125,69 @@ fn main() -> anyhow::Result<()> {
|
||||
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
|
||||
info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings");
|
||||
info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access");
|
||||
info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment");
|
||||
|
||||
// The tenants directory contains all the pageserver local disk state.
|
||||
// Create if not exists and make sure all the contents are durable before proceeding.
|
||||
// Ensuring durability eliminates a whole bug class where we come up after an unclean shutdown.
|
||||
// After unclea shutdown, we don't know if all the filesystem content we can read via syscalls is actually durable or not.
|
||||
// Examples for that: OOM kill, systemd killing us during shutdown, self abort due to unrecoverable IO error.
|
||||
let tenants_path = conf.tenants_path();
|
||||
if !tenants_path.exists() {
|
||||
utils::crashsafe::create_dir_all(conf.tenants_path())
|
||||
.with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
|
||||
{
|
||||
let open = || {
|
||||
nix::dir::Dir::open(
|
||||
tenants_path.as_std_path(),
|
||||
nix::fcntl::OFlag::O_DIRECTORY | nix::fcntl::OFlag::O_RDONLY,
|
||||
nix::sys::stat::Mode::empty(),
|
||||
)
|
||||
};
|
||||
let dirfd = match open() {
|
||||
Ok(dirfd) => dirfd,
|
||||
Err(e) => match e {
|
||||
nix::errno::Errno::ENOENT => {
|
||||
utils::crashsafe::create_dir_all(&tenants_path).with_context(|| {
|
||||
format!("Failed to create tenants root dir at '{tenants_path}'")
|
||||
})?;
|
||||
open().context("open tenants dir after creating it")?
|
||||
}
|
||||
e => anyhow::bail!(e),
|
||||
},
|
||||
};
|
||||
|
||||
let started = Instant::now();
|
||||
// Linux guarantees durability for syncfs.
|
||||
// POSIX doesn't have syncfs, and further does not actually guarantee durability of sync().
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use std::os::fd::AsRawFd;
|
||||
nix::unistd::syncfs(dirfd.as_raw_fd()).context("syncfs")?;
|
||||
}
|
||||
#[cfg(target_os = "macos")]
|
||||
{
|
||||
// macOS is not a production platform for Neon, don't even bother.
|
||||
drop(dirfd);
|
||||
}
|
||||
#[cfg(not(any(target_os = "linux", target_os = "macos")))]
|
||||
{
|
||||
compile_error!("Unsupported OS");
|
||||
}
|
||||
|
||||
let elapsed = started.elapsed();
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"made tenant directory contents durable"
|
||||
);
|
||||
}
|
||||
|
||||
// Initialize up failpoints support
|
||||
let scenario = failpoint_support::init();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine);
|
||||
virtual_file::init(
|
||||
conf.max_file_descriptors,
|
||||
conf.virtual_file_io_engine,
|
||||
conf.io_buffer_alignment,
|
||||
);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
||||
|
||||
@@ -31,6 +31,7 @@ use utils::{
|
||||
|
||||
use crate::l0_flush::L0FlushConfig;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
|
||||
use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess;
|
||||
use crate::tenant::vectored_blob_io::MaxVectoredReadBytes;
|
||||
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
@@ -95,6 +96,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 0;
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -289,6 +292,8 @@ pub struct PageServerConf {
|
||||
|
||||
/// Direct IO settings
|
||||
pub virtual_file_direct_io: virtual_file::DirectIoMode,
|
||||
|
||||
pub io_buffer_alignment: usize,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -393,6 +398,8 @@ struct PageServerConfigBuilder {
|
||||
compact_level0_phase1_value_access: BuilderValue<CompactL0Phase1ValueAccess>,
|
||||
|
||||
virtual_file_direct_io: BuilderValue<virtual_file::DirectIoMode>,
|
||||
|
||||
io_buffer_alignment: BuilderValue<usize>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -481,6 +488,7 @@ impl PageServerConfigBuilder {
|
||||
l0_flush: Set(L0FlushConfig::default()),
|
||||
compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()),
|
||||
virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()),
|
||||
io_buffer_alignment: Set(DEFAULT_IO_BUFFER_ALIGNMENT),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -660,6 +668,10 @@ impl PageServerConfigBuilder {
|
||||
self.virtual_file_direct_io = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn io_buffer_alignment(&mut self, value: usize) {
|
||||
self.io_buffer_alignment = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -716,6 +728,7 @@ impl PageServerConfigBuilder {
|
||||
l0_flush,
|
||||
compact_level0_phase1_value_access,
|
||||
virtual_file_direct_io,
|
||||
io_buffer_alignment,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -985,6 +998,9 @@ impl PageServerConf {
|
||||
"virtual_file_direct_io" => {
|
||||
builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?)
|
||||
}
|
||||
"io_buffer_alignment" => {
|
||||
builder.io_buffer_alignment(parse_toml_u64("io_buffer_alignment", item)? as usize)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1005,6 +1021,15 @@ impl PageServerConf {
|
||||
|
||||
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
||||
|
||||
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
|
||||
.map_err(|msg| anyhow::anyhow!("{msg}"))
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"effective checkpoint distance is unsupported: {}",
|
||||
conf.default_tenant_conf.checkpoint_distance
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(conf)
|
||||
}
|
||||
|
||||
@@ -1068,6 +1093,7 @@ impl PageServerConf {
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1308,6 +1334,7 @@ background_task_maximum_delay = '334 s'
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1381,6 +1408,7 @@ background_task_maximum_delay = '334 s'
|
||||
l0_flush: L0FlushConfig::default(),
|
||||
compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(),
|
||||
virtual_file_direct_io: virtual_file::DirectIoMode::default(),
|
||||
io_buffer_alignment: defaults::DEFAULT_IO_BUFFER_ALIGNMENT,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -141,12 +141,18 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
m.other
|
||||
);
|
||||
|
||||
let az_id = m
|
||||
.other
|
||||
.get("availability_zone_id")
|
||||
.and_then(|jv| jv.as_str().map(|str| str.to_owned()));
|
||||
|
||||
Some(NodeRegisterRequest {
|
||||
node_id: conf.id,
|
||||
listen_pg_addr: m.postgres_host,
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
availability_zone_id: az_id,
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
|
||||
@@ -318,6 +318,24 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::TimelineArchivalError> for ApiError {
|
||||
fn from(value: crate::tenant::TimelineArchivalError) -> Self {
|
||||
use crate::tenant::TimelineArchivalError::*;
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
|
||||
Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
|
||||
HasUnarchivedChildren(children) => ApiError::PreconditionFailed(
|
||||
format!(
|
||||
"Cannot archive timeline which has non-archived child timelines: {children:?}"
|
||||
)
|
||||
.into_boxed_str(),
|
||||
),
|
||||
a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
|
||||
Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::mgr::DeleteTimelineError::*;
|
||||
@@ -405,6 +423,8 @@ async fn build_timeline_info_common(
|
||||
let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
|
||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||
let state = timeline.current_state();
|
||||
// Report is_archived = false if the timeline is still loading
|
||||
let is_archived = timeline.is_archived().unwrap_or(false);
|
||||
let remote_consistent_lsn_projected = timeline
|
||||
.get_remote_consistent_lsn_projected()
|
||||
.unwrap_or(Lsn(0));
|
||||
@@ -445,6 +465,7 @@ async fn build_timeline_info_common(
|
||||
pg_version: timeline.pg_version,
|
||||
|
||||
state,
|
||||
is_archived,
|
||||
|
||||
walreceiver_status,
|
||||
|
||||
@@ -686,9 +707,7 @@ async fn timeline_archival_config_handler(
|
||||
|
||||
tenant
|
||||
.apply_timeline_archival_config(timeline_id, request_data.state)
|
||||
.await
|
||||
.context("applying archival config")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.await?;
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_archival_config",
|
||||
@@ -2325,6 +2344,20 @@ async fn put_io_engine_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn put_io_alignment_handler(
|
||||
mut r: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&r, None)?;
|
||||
let align: usize = json_request(&mut r).await?;
|
||||
crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| {
|
||||
ApiError::PreconditionFailed(
|
||||
format!("Requested io alignment ({align}) is not a power of two").into(),
|
||||
)
|
||||
})?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// Polled by control plane.
|
||||
///
|
||||
/// See [`crate::utilization`].
|
||||
@@ -3012,6 +3045,9 @@ pub fn make_router(
|
||||
|r| api_handler(r, timeline_collect_keyspace),
|
||||
)
|
||||
.put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
|
||||
.put("/v1/io_alignment", |r| {
|
||||
api_handler(r, put_io_alignment_handler)
|
||||
})
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch",
|
||||
|r| api_handler(r, force_aux_policy_switch_handler),
|
||||
|
||||
@@ -16,6 +16,7 @@ pub mod l0_flush;
|
||||
use futures::{stream::FuturesUnordered, StreamExt};
|
||||
pub use pageserver_api::keyspace;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
mod assert_u64_eq_usize;
|
||||
pub mod aux_file;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
|
||||
@@ -1552,7 +1552,6 @@ pub(crate) static LIVE_CONNECTIONS: Lazy<IntCounterPairVec> = Lazy::new(|| {
|
||||
#[derive(Clone, Copy, enum_map::Enum, IntoStaticStr)]
|
||||
pub(crate) enum ComputeCommandKind {
|
||||
PageStreamV2,
|
||||
PageStream,
|
||||
Basebackup,
|
||||
Fullbackup,
|
||||
LeaseLsn,
|
||||
|
||||
@@ -557,7 +557,7 @@ impl PageServerHandler {
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
_protocol_version: PagestreamProtocolVersion,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
@@ -601,8 +601,7 @@ impl PageServerHandler {
|
||||
fail::fail_point!("ps::handle-pagerequest-message");
|
||||
|
||||
// parse request
|
||||
let neon_fe_msg =
|
||||
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
|
||||
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
|
||||
|
||||
// invoke handler function
|
||||
let (handler_result, span) = match neon_fe_msg {
|
||||
@@ -754,16 +753,21 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
if request_lsn < **latest_gc_cutoff_lsn {
|
||||
// Check explicitly for INVALID just to get a less scary error message if the
|
||||
// request is obviously bogus
|
||||
return Err(if request_lsn == Lsn::INVALID {
|
||||
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
||||
} else {
|
||||
PageStreamError::BadRequest(format!(
|
||||
let gc_info = &timeline.gc_info.read().unwrap();
|
||||
if !gc_info.leases.contains_key(&request_lsn) {
|
||||
// The requested LSN is below gc cutoff and is not guarded by a lease.
|
||||
|
||||
// Check explicitly for INVALID just to get a less scary error message if the
|
||||
// request is obviously bogus
|
||||
return Err(if request_lsn == Lsn::INVALID {
|
||||
PageStreamError::BadRequest("invalid LSN(0) in request".into())
|
||||
} else {
|
||||
PageStreamError::BadRequest(format!(
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
request_lsn, **latest_gc_cutoff_lsn
|
||||
).into())
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for WAL up to 'not_modified_since' to arrive, if necessary
|
||||
@@ -790,6 +794,8 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Handles the lsn lease request.
|
||||
/// If a lease cannot be obtained, the client will receive NULL.
|
||||
#[instrument(skip_all, fields(shard_id, %lsn))]
|
||||
async fn handle_make_lsn_lease<IO>(
|
||||
&mut self,
|
||||
@@ -812,19 +818,25 @@ impl PageServerHandler {
|
||||
.await?;
|
||||
set_tracing_field_shard_id(&timeline);
|
||||
|
||||
let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?;
|
||||
let valid_until = lease
|
||||
.valid_until
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
let lease = timeline
|
||||
.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)
|
||||
.inspect_err(|e| {
|
||||
warn!("{e}");
|
||||
})
|
||||
.ok();
|
||||
let valid_until_str = lease.map(|l| {
|
||||
l.valid_until
|
||||
.duration_since(SystemTime::UNIX_EPOCH)
|
||||
.expect("valid_until is earlier than UNIX_EPOCH")
|
||||
.as_millis()
|
||||
.to_string()
|
||||
});
|
||||
let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"valid_until",
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(
|
||||
&valid_until.as_millis().to_be_bytes(),
|
||||
)]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
.write_message_noflush(&BeMessage::DataRow(&[bytes]))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1275,35 +1287,6 @@ where
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["pagestream"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
)));
|
||||
}
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStream)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
PagestreamProtocolVersion::V1,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
|
||||
@@ -501,6 +501,38 @@ impl Debug for DeleteTimelineError {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error)]
|
||||
pub enum TimelineArchivalError {
|
||||
#[error("NotFound")]
|
||||
NotFound,
|
||||
|
||||
#[error("Timeout")]
|
||||
Timeout,
|
||||
|
||||
#[error("HasUnarchivedChildren")]
|
||||
HasUnarchivedChildren(Vec<TimelineId>),
|
||||
|
||||
#[error("Timeline archival is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl Debug for TimelineArchivalError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::NotFound => write!(f, "NotFound"),
|
||||
Self::Timeout => write!(f, "Timeout"),
|
||||
Self::HasUnarchivedChildren(c) => {
|
||||
f.debug_tuple("HasUnarchivedChildren").field(c).finish()
|
||||
}
|
||||
Self::AlreadyInProgress => f.debug_tuple("AlreadyInProgress").finish(),
|
||||
Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum SetStoppingError {
|
||||
AlreadyStopping(completion::Barrier),
|
||||
Broken,
|
||||
@@ -845,6 +877,12 @@ impl Tenant {
|
||||
});
|
||||
};
|
||||
|
||||
// TODO: should also be rejecting tenant conf changes that violate this check.
|
||||
if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut init_order = init_order;
|
||||
// take the completion because initial tenant loading will complete when all of
|
||||
// these tasks complete.
|
||||
@@ -1326,24 +1364,50 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
state: TimelineArchivalState,
|
||||
) -> anyhow::Result<()> {
|
||||
let timeline = self
|
||||
.get_timeline(timeline_id, false)
|
||||
.context("Cannot apply timeline archival config to inexistent timeline")?;
|
||||
) -> Result<(), TimelineArchivalError> {
|
||||
info!("setting timeline archival config");
|
||||
let timeline = {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
|
||||
let timeline = match timelines.get(&timeline_id) {
|
||||
Some(t) => t,
|
||||
None => return Err(TimelineArchivalError::NotFound),
|
||||
};
|
||||
|
||||
// Ensure that there are no non-archived child timelines
|
||||
let children: Vec<TimelineId> = timelines
|
||||
.iter()
|
||||
.filter_map(|(id, entry)| {
|
||||
if entry.get_ancestor_timeline_id() != Some(timeline_id) {
|
||||
return None;
|
||||
}
|
||||
if entry.is_archived() == Some(true) {
|
||||
return None;
|
||||
}
|
||||
Some(*id)
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !children.is_empty() && state == TimelineArchivalState::Archived {
|
||||
return Err(TimelineArchivalError::HasUnarchivedChildren(children));
|
||||
}
|
||||
Arc::clone(timeline)
|
||||
};
|
||||
|
||||
let upload_needed = timeline
|
||||
.remote_client
|
||||
.schedule_index_upload_for_timeline_archival_state(state)?;
|
||||
|
||||
if upload_needed {
|
||||
info!("Uploading new state");
|
||||
const MAX_WAIT: Duration = Duration::from_secs(10);
|
||||
let Ok(v) =
|
||||
tokio::time::timeout(MAX_WAIT, timeline.remote_client.wait_completion()).await
|
||||
else {
|
||||
tracing::warn!("reached timeout for waiting on upload queue");
|
||||
bail!("reached timeout for upload queue flush");
|
||||
return Err(TimelineArchivalError::Timeout);
|
||||
};
|
||||
v?;
|
||||
v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -7013,18 +7077,14 @@ mod tests {
|
||||
vec![
|
||||
// Image layer at GC horizon
|
||||
PersistentLayerKey {
|
||||
key_range: {
|
||||
let mut key = Key::MAX;
|
||||
key.field6 -= 1;
|
||||
Key::MIN..key
|
||||
},
|
||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x31),
|
||||
is_delta: false
|
||||
},
|
||||
// The delta layer that is cut in the middle
|
||||
// The delta layer covers the full range (with the layer key hack to avoid being recognized as L0)
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(3)..get_key(4),
|
||||
lsn_range: Lsn(0x30)..Lsn(0x41),
|
||||
key_range: Key::MIN..Key::NON_L0_MAX,
|
||||
lsn_range: Lsn(0x30)..Lsn(0x48),
|
||||
is_delta: true
|
||||
},
|
||||
// The delta3 layer that should not be picked for the compaction
|
||||
@@ -8004,6 +8064,214 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
|
||||
{
|
||||
let harness =
|
||||
TenantHarness::create("test_simple_bottom_most_compaction_with_retain_lsns_single_key")
|
||||
.await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x28),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x28")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x30),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x38),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x38")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::WalRecord(NeonWalRecord::wal_append("@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
// delta1 and delta 2 only contain a single key but multiple updates
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x30)..Lsn(0x50), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![
|
||||
(Lsn(0x10), tline.timeline_id),
|
||||
(Lsn(0x20), tline.timeline_id),
|
||||
],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let expected_result = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30@0x38"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10@0x48"),
|
||||
Bytes::from_static(b"value 9@0x10@0x48"),
|
||||
];
|
||||
|
||||
let expected_result_at_gc_horizon = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20@0x28@0x30"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_20 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10@0x20"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let expected_result_at_lsn_10 = [
|
||||
Bytes::from_static(b"value 0@0x10"),
|
||||
Bytes::from_static(b"value 1@0x10"),
|
||||
Bytes::from_static(b"value 2@0x10"),
|
||||
Bytes::from_static(b"value 3@0x10"),
|
||||
Bytes::from_static(b"value 4@0x10"),
|
||||
Bytes::from_static(b"value 5@0x10"),
|
||||
Bytes::from_static(b"value 6@0x10"),
|
||||
Bytes::from_static(b"value 7@0x10"),
|
||||
Bytes::from_static(b"value 8@0x10"),
|
||||
Bytes::from_static(b"value 9@0x10"),
|
||||
];
|
||||
|
||||
let verify_result = || async {
|
||||
let gc_horizon = {
|
||||
let gc_info = tline.gc_info.read().unwrap();
|
||||
gc_info.cutoffs.time
|
||||
};
|
||||
for idx in 0..10 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x50), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), gc_horizon, &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_gc_horizon[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x20), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_20[idx]
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get(get_key(idx as u32), Lsn(0x10), &ctx)
|
||||
.await
|
||||
.unwrap(),
|
||||
&expected_result_at_lsn_10[idx]
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
verify_result().await;
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
let mut dryrun_flags = EnumSet::new();
|
||||
dryrun_flags.insert(CompactFlags::DryRun);
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, dryrun_flags, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
// We expect layer map to be the same b/c the dry run flag, but we don't know whether there will be other background jobs
|
||||
// cleaning things up, and therefore, we don't do sanity checks on the layer map during unit tests.
|
||||
verify_result().await;
|
||||
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
// compact again
|
||||
tline
|
||||
.compact_with_gc(&cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
verify_result().await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
|
||||
@@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
|
||||
|
||||
/// The maximum size of blobs we support. The highest few bits
|
||||
/// are reserved for compression and other further uses.
|
||||
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
|
||||
pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff;
|
||||
|
||||
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
|
||||
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
|
||||
@@ -326,7 +326,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
(self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if len > MAX_SUPPORTED_LEN {
|
||||
if len > MAX_SUPPORTED_BLOB_LEN {
|
||||
return (
|
||||
(
|
||||
io_buf.slice_len(),
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
//! Low-level Block-oriented I/O functions
|
||||
//!
|
||||
|
||||
use super::ephemeral_file::EphemeralFile;
|
||||
use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ};
|
||||
@@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> {
|
||||
/// Unlike traits, we also support the read function to be async though.
|
||||
pub(crate) enum BlockReaderRef<'a> {
|
||||
FileBlockReader(&'a FileBlockReader<'a>),
|
||||
EphemeralFile(&'a EphemeralFile),
|
||||
Adapter(Adapter<&'a DeltaLayerInner>),
|
||||
Slice(&'a [u8]),
|
||||
#[cfg(test)]
|
||||
TestDisk(&'a super::disk_btree::tests::TestDisk),
|
||||
#[cfg(test)]
|
||||
@@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> {
|
||||
use BlockReaderRef::*;
|
||||
match self {
|
||||
FileBlockReader(r) => r.read_blk(blknum, ctx).await,
|
||||
EphemeralFile(r) => r.read_blk(blknum, ctx).await,
|
||||
Adapter(r) => r.read_blk(blknum, ctx).await,
|
||||
Slice(s) => Self::read_blk_slice(s, blknum),
|
||||
#[cfg(test)]
|
||||
TestDisk(r) => r.read_blk(blknum),
|
||||
#[cfg(test)]
|
||||
@@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BlockReaderRef<'a> {
|
||||
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<BlockLease> {
|
||||
let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
|
||||
let end = start.checked_add(PAGE_SZ).unwrap();
|
||||
if end > slice.len() {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
format!("slice too short, len={} end={}", slice.len(), end),
|
||||
));
|
||||
}
|
||||
let slice = &slice[start..end];
|
||||
let page_sized: &[u8; PAGE_SZ] = slice
|
||||
.try_into()
|
||||
.expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ");
|
||||
Ok(BlockLease::Slice(page_sized))
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
||||
///
|
||||
|
||||
@@ -1,13 +1,21 @@
|
||||
//! Implementation of append-only file data structure
|
||||
//! used to keep in-memory layers spilled on disk.
|
||||
|
||||
use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
|
||||
use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
|
||||
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
||||
use crate::virtual_file::owned_buffers_io::write::Buffer;
|
||||
use crate::virtual_file::{self, owned_buffers_io, VirtualFile};
|
||||
use bytes::BytesMut;
|
||||
use camino::Utf8PathBuf;
|
||||
use num_traits::Num;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio_epoll_uring::{BoundedBuf, Slice};
|
||||
use tracing::error;
|
||||
|
||||
use std::io;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
@@ -16,12 +24,17 @@ use utils::id::TimelineId;
|
||||
pub struct EphemeralFile {
|
||||
_tenant_shard_id: TenantShardId,
|
||||
_timeline_id: TimelineId,
|
||||
|
||||
rw: page_caching::RW,
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
bytes_written: u64,
|
||||
buffered_writer: owned_buffers_io::write::BufferedWriter<
|
||||
BytesMut,
|
||||
size_tracking_writer::Writer<VirtualFile>,
|
||||
>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop)
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
mod page_caching;
|
||||
mod zero_padded_read_write;
|
||||
const TAIL_SZ: usize = 64 * 1024;
|
||||
|
||||
impl EphemeralFile {
|
||||
pub async fn create(
|
||||
@@ -51,75 +64,178 @@ impl EphemeralFile {
|
||||
)
|
||||
.await?;
|
||||
|
||||
let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
|
||||
|
||||
Ok(EphemeralFile {
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
rw: page_caching::RW::new(file, gate_guard),
|
||||
page_cache_file_id,
|
||||
bytes_written: 0,
|
||||
buffered_writer: owned_buffers_io::write::BufferedWriter::new(
|
||||
size_tracking_writer::Writer::new(file),
|
||||
BytesMut::with_capacity(TAIL_SZ),
|
||||
),
|
||||
_gate_guard: gate_guard,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let path = &self.buffered_writer.as_inner().as_inner().path;
|
||||
let res = std::fs::remove_file(path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!("could not remove ephemeral file '{path}': {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
pub(crate) fn len(&self) -> u64 {
|
||||
self.rw.bytes_written()
|
||||
self.bytes_written
|
||||
}
|
||||
|
||||
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
|
||||
self.rw.page_cache_file_id()
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
/// See [`self::page_caching::RW::load_to_vec`].
|
||||
pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
|
||||
self.rw.load_to_vec(ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
self.rw.read_blk(blknum, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
// This is a test helper: outside of tests, we are always written to via a pre-serialized batch.
|
||||
pub(crate) async fn write_blob(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
|
||||
let mut len_bytes = std::io::Cursor::new(Vec::new());
|
||||
crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length(
|
||||
srcbuf.len(),
|
||||
&mut len_bytes,
|
||||
);
|
||||
let len_bytes = len_bytes.into_inner();
|
||||
|
||||
// Write the length field
|
||||
self.rw.write_all_borrowed(&len_bytes, ctx).await?;
|
||||
|
||||
// Write the payload
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
||||
|
||||
Ok(pos)
|
||||
let size = self.len().into_usize();
|
||||
let vec = Vec::with_capacity(size);
|
||||
let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?;
|
||||
assert_eq!(nread, size);
|
||||
let vec = slice.into_inner();
|
||||
assert_eq!(vec.len(), nread);
|
||||
assert_eq!(vec.capacity(), size, "we shouldn't be reallocating");
|
||||
Ok(vec)
|
||||
}
|
||||
|
||||
/// Returns the offset at which the first byte of the input was written, for use
|
||||
/// in constructing indices over the written value.
|
||||
///
|
||||
/// Panics if the write is short because there's no way we can recover from that.
|
||||
/// TODO: make upstack handle this as an error.
|
||||
pub(crate) async fn write_raw(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u64, io::Error> {
|
||||
let pos = self.rw.bytes_written();
|
||||
) -> std::io::Result<u64> {
|
||||
let pos = self.bytes_written;
|
||||
|
||||
let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!(
|
||||
"write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}",
|
||||
srcbuf_len = srcbuf.len(),
|
||||
),
|
||||
)
|
||||
})?;
|
||||
|
||||
// Write the payload
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await?;
|
||||
let nwritten = self
|
||||
.buffered_writer
|
||||
.write_buffered_borrowed(srcbuf, ctx)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
nwritten,
|
||||
srcbuf.len(),
|
||||
"buffered writer has no short writes"
|
||||
);
|
||||
|
||||
self.bytes_written = new_bytes_written;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile {
|
||||
async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>(
|
||||
&'b self,
|
||||
start: u64,
|
||||
dst: tokio_epoll_uring::Slice<B>,
|
||||
ctx: &'a RequestContext,
|
||||
) -> std::io::Result<(tokio_epoll_uring::Slice<B>, usize)> {
|
||||
let file_size_tracking_writer = self.buffered_writer.as_inner();
|
||||
let flushed_offset = file_size_tracking_writer.bytes_written();
|
||||
|
||||
let buffer = self.buffered_writer.inspect_buffer();
|
||||
let buffered = &buffer[0..buffer.pending()];
|
||||
|
||||
let dst_cap = dst.bytes_total().into_u64();
|
||||
let end = {
|
||||
// saturating_add is correct here because the max file size is u64::MAX, so,
|
||||
// if start + dst.len() > u64::MAX, then we know it will be a short read
|
||||
let mut end: u64 = start.saturating_add(dst_cap);
|
||||
if end > self.bytes_written {
|
||||
end = self.bytes_written;
|
||||
}
|
||||
end
|
||||
};
|
||||
|
||||
// inclusive, exclusive
|
||||
#[derive(Debug)]
|
||||
struct Range<N>(N, N);
|
||||
impl<N: Num + Clone + Copy + PartialOrd + Ord> Range<N> {
|
||||
fn len(&self) -> N {
|
||||
if self.0 > self.1 {
|
||||
N::zero()
|
||||
} else {
|
||||
self.1 - self.0
|
||||
}
|
||||
}
|
||||
}
|
||||
let written_range = Range(start, std::cmp::min(end, flushed_offset));
|
||||
let buffered_range = Range(std::cmp::max(start, flushed_offset), end);
|
||||
|
||||
let dst = if written_range.len() > 0 {
|
||||
let file: &VirtualFile = file_size_tracking_writer.as_inner();
|
||||
let bounds = dst.bounds();
|
||||
let slice = file
|
||||
.read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
|
||||
.await?;
|
||||
Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
|
||||
} else {
|
||||
dst
|
||||
};
|
||||
|
||||
let dst = if buffered_range.len() > 0 {
|
||||
let offset_in_buffer = buffered_range
|
||||
.0
|
||||
.checked_sub(flushed_offset)
|
||||
.unwrap()
|
||||
.into_usize();
|
||||
let to_copy =
|
||||
&buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())];
|
||||
let bounds = dst.bounds();
|
||||
let mut view = dst.slice({
|
||||
let start = written_range.len().into_usize();
|
||||
let end = start
|
||||
.checked_add(buffered_range.len().into_usize())
|
||||
.unwrap();
|
||||
start..end
|
||||
});
|
||||
view.as_mut_rust_slice_full_zeroed()
|
||||
.copy_from_slice(to_copy);
|
||||
Slice::from_buf_bounds(Slice::into_inner(view), bounds)
|
||||
} else {
|
||||
dst
|
||||
};
|
||||
|
||||
// TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs
|
||||
|
||||
Ok((dst, (end - start).into_usize()))
|
||||
}
|
||||
}
|
||||
|
||||
/// Does the given filename look like an ephemeral file?
|
||||
pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
if let Some(rest) = filename.strip_prefix("ephemeral-") {
|
||||
@@ -129,19 +245,13 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
|
||||
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::Rng;
|
||||
|
||||
use super::*;
|
||||
use crate::context::DownloadBehavior;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::block_io::BlockReaderRef;
|
||||
use rand::{thread_rng, RngCore};
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -172,69 +282,6 @@ mod tests {
|
||||
Ok((conf, tenant_shard_id, timeline_id, ctx))
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?;
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let entered = gate.enter().unwrap();
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo", &ctx).await?;
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor()
|
||||
.read_blob(pos_foo, &ctx)
|
||||
.await?
|
||||
.as_slice()
|
||||
);
|
||||
let pos_bar = file.write_blob(b"bar", &ctx).await?;
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor()
|
||||
.read_blob(pos_foo, &ctx)
|
||||
.await?
|
||||
.as_slice()
|
||||
);
|
||||
assert_eq!(
|
||||
b"bar",
|
||||
file.block_cursor()
|
||||
.read_blob(pos_bar, &ctx)
|
||||
.await?
|
||||
.as_slice()
|
||||
);
|
||||
|
||||
let mut blobs = Vec::new();
|
||||
for i in 0..10000 {
|
||||
let data = Vec::from(format!("blob{}", i).as_bytes());
|
||||
let pos = file.write_blob(&data, &ctx).await?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
// also test with a large blobs
|
||||
for i in 0..100 {
|
||||
let data = format!("blob{}", i).as_bytes().repeat(100);
|
||||
let pos = file.write_blob(&data, &ctx).await?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file));
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos, &ctx).await?;
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
// Test a large blob that spans multiple pages
|
||||
let mut large_data = vec![0; 20000];
|
||||
thread_rng().fill_bytes(&mut large_data);
|
||||
let pos_large = file.write_blob(&large_data, &ctx).await?;
|
||||
let result = file.block_cursor().read_blob(pos_large, &ctx).await?;
|
||||
assert_eq!(result, large_data);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn ephemeral_file_holds_gate_open() {
|
||||
const FOREVER: std::time::Duration = std::time::Duration::from_secs(5);
|
||||
@@ -268,4 +315,151 @@ mod tests {
|
||||
.expect("closing completes right away")
|
||||
.expect("closing does not panic");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ephemeral_file_basics() {
|
||||
let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap();
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
|
||||
let write_nbytes = cap + cap / 2;
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(write_nbytes)
|
||||
.collect();
|
||||
|
||||
let mut value_offsets = Vec::new();
|
||||
for i in 0..write_nbytes {
|
||||
let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap();
|
||||
value_offsets.push(off);
|
||||
}
|
||||
|
||||
assert!(file.len() as usize == write_nbytes);
|
||||
for i in 0..write_nbytes {
|
||||
assert_eq!(value_offsets[i], i.into_u64());
|
||||
let buf = Vec::with_capacity(1);
|
||||
let (buf_slice, nread) = file
|
||||
.read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let buf = buf_slice.into_inner();
|
||||
assert_eq!(nread, 1);
|
||||
assert_eq!(&buf, &content[i..i + 1]);
|
||||
}
|
||||
|
||||
let file_contents =
|
||||
std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap();
|
||||
assert_eq!(file_contents, &content[0..cap]);
|
||||
|
||||
let buffer_contents = file.buffered_writer.inspect_buffer();
|
||||
assert_eq!(buffer_contents, &content[cap..write_nbytes]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_flushes_do_happen() {
|
||||
let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap();
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(cap + cap / 2)
|
||||
.collect();
|
||||
|
||||
file.write_raw(&content, &ctx).await.unwrap();
|
||||
|
||||
// assert the state is as this test expects it to be
|
||||
assert_eq!(
|
||||
&file.load_to_vec(&ctx).await.unwrap(),
|
||||
&content[0..cap + cap / 2]
|
||||
);
|
||||
let md = file
|
||||
.buffered_writer
|
||||
.as_inner()
|
||||
.as_inner()
|
||||
.path
|
||||
.metadata()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
md.len(),
|
||||
cap.into_u64(),
|
||||
"buffered writer does one write if we write 1.5x buffer capacity"
|
||||
);
|
||||
assert_eq!(
|
||||
&file.buffered_writer.inspect_buffer()[0..cap / 2],
|
||||
&content[cap..cap + cap / 2]
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_read_split_across_file_and_buffer() {
|
||||
// This test exercises the logic on the read path that splits the logical read
|
||||
// into a read from the flushed part (= the file) and a copy from the buffered writer's buffer.
|
||||
//
|
||||
// This test build on the assertions in test_flushes_do_happen
|
||||
|
||||
let (conf, tenant_id, timeline_id, ctx) =
|
||||
harness("test_read_split_across_file_and_buffer").unwrap();
|
||||
|
||||
let gate = utils::sync::gate::Gate::default();
|
||||
|
||||
let mut file =
|
||||
EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let cap = file.buffered_writer.inspect_buffer().capacity();
|
||||
|
||||
let content: Vec<u8> = rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(cap + cap / 2)
|
||||
.collect();
|
||||
|
||||
file.write_raw(&content, &ctx).await.unwrap();
|
||||
|
||||
let test_read = |start: usize, len: usize| {
|
||||
let file = &file;
|
||||
let ctx = &ctx;
|
||||
let content = &content;
|
||||
async move {
|
||||
let (buf, nread) = file
|
||||
.read_exact_at_eof_ok(
|
||||
start.into_u64(),
|
||||
Vec::with_capacity(len).slice_full(),
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(nread, len);
|
||||
assert_eq!(&buf.into_inner(), &content[start..(start + len)]);
|
||||
}
|
||||
};
|
||||
|
||||
// completely within the file range
|
||||
assert!(20 < cap, "test assumption");
|
||||
test_read(10, 10).await;
|
||||
// border onto edge of file
|
||||
test_read(cap - 10, 10).await;
|
||||
// read across file and buffer
|
||||
test_read(cap - 10, 20).await;
|
||||
// stay from start of buffer
|
||||
test_read(cap, 10).await;
|
||||
// completely within buffer
|
||||
test_read(cap + 10, 10).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,153 +0,0 @@
|
||||
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
|
||||
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
|
||||
//!
|
||||
//! Subject to removal in <https://github.com/neondatabase/neon/pull/8537>
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use crate::virtual_file::owned_buffers_io::util::size_tracking_writer;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
use std::io::{self};
|
||||
use tokio_epoll_uring::BoundedBuf;
|
||||
use tracing::*;
|
||||
|
||||
use super::zero_padded_read_write;
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct RW {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
rw: super::zero_padded_read_write::RW<size_tracking_writer::Writer<VirtualFile>>,
|
||||
/// Gate guard is held on as long as we need to do operations in the path (delete on drop).
|
||||
_gate_guard: utils::sync::gate::GateGuard,
|
||||
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self {
|
||||
let page_cache_file_id = page_cache::next_file_id();
|
||||
Self {
|
||||
page_cache_file_id,
|
||||
rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)),
|
||||
_gate_guard,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn page_cache_file_id(&self) -> page_cache::FileId {
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
pub(crate) async fn write_all_borrowed(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<usize, io::Error> {
|
||||
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
|
||||
// because Compute is unlikely to access recently written data.
|
||||
self.rw.write_all_borrowed(srcbuf, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) fn bytes_written(&self) -> u64 {
|
||||
self.rw.bytes_written()
|
||||
}
|
||||
|
||||
/// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer.
|
||||
///
|
||||
/// This includes the blocks that aren't yet flushed to disk by the internal buffered writer.
|
||||
/// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`].
|
||||
pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result<Vec<u8>, io::Error> {
|
||||
// round up to the next PAGE_SZ multiple, required by blob_io
|
||||
let size = {
|
||||
let s = usize::try_from(self.bytes_written()).unwrap();
|
||||
if s % PAGE_SZ == 0 {
|
||||
s
|
||||
} else {
|
||||
s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap()
|
||||
}
|
||||
};
|
||||
let vec = Vec::with_capacity(size);
|
||||
|
||||
// read from disk what we've already flushed
|
||||
let file_size_tracking_writer = self.rw.as_writer();
|
||||
let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap();
|
||||
let mut vec = file_size_tracking_writer
|
||||
.as_inner()
|
||||
.read_exact_at(
|
||||
vec.slice(0..(flushed_range.end - flushed_range.start)),
|
||||
u64::try_from(flushed_range.start).unwrap(),
|
||||
ctx,
|
||||
)
|
||||
.await?
|
||||
.into_inner();
|
||||
|
||||
// copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk
|
||||
let buffered = self.rw.get_tail_zero_padded();
|
||||
vec.extend_from_slice(buffered);
|
||||
assert_eq!(vec.len(), size);
|
||||
assert_eq!(vec.len() % PAGE_SZ, 0);
|
||||
Ok(vec)
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
match self.rw.read_blk(blknum).await? {
|
||||
zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => {
|
||||
let cache = page_cache::get();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.rw.as_writer().as_inner().path,
|
||||
e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = writer
|
||||
.as_inner()
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
return Ok(BlockLease::PageReadGuard(read_guard));
|
||||
}
|
||||
}
|
||||
}
|
||||
zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
|
||||
Ok(BlockLease::EphemeralFileMutableTail(buffer))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RW {
|
||||
fn drop(&mut self) {
|
||||
// There might still be pages in the [`crate::page_cache`] for this file.
|
||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
||||
|
||||
// unlink the file
|
||||
// we are clear to do this, because we have entered a gate
|
||||
let path = &self.rw.as_writer().as_inner().path;
|
||||
let res = std::fs::remove_file(path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!("could not remove ephemeral file '{path}': {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,145 +0,0 @@
|
||||
//! The heart of how [`super::EphemeralFile`] does its reads and writes.
|
||||
//!
|
||||
//! # Writes
|
||||
//!
|
||||
//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
|
||||
//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
|
||||
//!
|
||||
//! # Reads
|
||||
//!
|
||||
//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
|
||||
//!
|
||||
//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
|
||||
//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`]
|
||||
//! if the read is for the prefix that has already been flushed.
|
||||
//!
|
||||
//! # Current Usage
|
||||
//!
|
||||
//! The current user of this module is [`super::page_caching::RW`].
|
||||
|
||||
mod zero_padded;
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
page_cache::PAGE_SZ,
|
||||
virtual_file::owned_buffers_io::{
|
||||
self,
|
||||
write::{Buffer, OwnedAsyncWriter},
|
||||
},
|
||||
};
|
||||
|
||||
const TAIL_SZ: usize = 64 * 1024;
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct RW<W: OwnedAsyncWriter> {
|
||||
buffered_writer: owned_buffers_io::write::BufferedWriter<
|
||||
zero_padded::Buffer<TAIL_SZ>,
|
||||
owned_buffers_io::util::size_tracking_writer::Writer<W>,
|
||||
>,
|
||||
}
|
||||
|
||||
pub enum ReadResult<'a, W> {
|
||||
NeedsReadFromWriter { writer: &'a W },
|
||||
ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
|
||||
}
|
||||
|
||||
impl<W> RW<W>
|
||||
where
|
||||
W: OwnedAsyncWriter,
|
||||
{
|
||||
pub fn new(writer: W) -> Self {
|
||||
let bytes_flushed_tracker =
|
||||
owned_buffers_io::util::size_tracking_writer::Writer::new(writer);
|
||||
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
|
||||
bytes_flushed_tracker,
|
||||
zero_padded::Buffer::default(),
|
||||
);
|
||||
Self { buffered_writer }
|
||||
}
|
||||
|
||||
pub(crate) fn as_writer(&self) -> &W {
|
||||
self.buffered_writer.as_inner().as_inner()
|
||||
}
|
||||
|
||||
pub async fn write_all_borrowed(
|
||||
&mut self,
|
||||
buf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> std::io::Result<usize> {
|
||||
self.buffered_writer.write_buffered_borrowed(buf, ctx).await
|
||||
}
|
||||
|
||||
pub fn bytes_written(&self) -> u64 {
|
||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
||||
flushed_offset + u64::try_from(buffer.pending()).unwrap()
|
||||
}
|
||||
|
||||
/// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`].
|
||||
pub fn get_tail_zero_padded(&self) -> &[u8] {
|
||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
||||
let buffer_written_up_to = buffer.pending();
|
||||
// pad to next page boundary
|
||||
let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 {
|
||||
buffer_written_up_to
|
||||
} else {
|
||||
buffer_written_up_to
|
||||
.checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ))
|
||||
.unwrap()
|
||||
};
|
||||
&buffer.as_zero_padded_slice()[0..read_up_to]
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult<'_, W>, std::io::Error> {
|
||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
||||
let buffer: &zero_padded::Buffer<TAIL_SZ> = self.buffered_writer.inspect_buffer();
|
||||
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
|
||||
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
|
||||
|
||||
// The trailing page ("block") might only be partially filled,
|
||||
// yet the blob_io code relies on us to return a full PAGE_SZed slice anyway.
|
||||
// Moreover, it has to be zero-padded, because when we still had
|
||||
// a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it.
|
||||
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
|
||||
// => check here that the read doesn't go beyond this potentially trailing
|
||||
// => the zero-padding is done in the `else` branch below
|
||||
let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 {
|
||||
buffered_offset / (PAGE_SZ as u64)
|
||||
} else {
|
||||
(buffered_offset / (PAGE_SZ as u64)) + 1
|
||||
};
|
||||
if (blknum as u64) >= blocks_written {
|
||||
return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}")));
|
||||
}
|
||||
|
||||
// assertions for the `if-else` below
|
||||
assert_eq!(
|
||||
flushed_offset % (TAIL_SZ as u64), 0,
|
||||
"we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
|
||||
);
|
||||
assert_eq!(
|
||||
flushed_offset % (PAGE_SZ as u64),
|
||||
0,
|
||||
"the logic below can't handle if the page is spread across the flushed part and the buffer"
|
||||
);
|
||||
|
||||
if read_offset < flushed_offset {
|
||||
assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset);
|
||||
Ok(ReadResult::NeedsReadFromWriter {
|
||||
writer: self.as_writer(),
|
||||
})
|
||||
} else {
|
||||
let read_offset_in_buffer = read_offset
|
||||
.checked_sub(flushed_offset)
|
||||
.expect("would have taken `if` branch instead of this one");
|
||||
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
|
||||
let zero_padded_slice = buffer.as_zero_padded_slice();
|
||||
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
|
||||
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
|
||||
buffer: page
|
||||
.try_into()
|
||||
.expect("the slice above got it as page-size slice"),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,110 +0,0 @@
|
||||
//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose
|
||||
//! unwritten range is guaranteed to be zero-initialized.
|
||||
//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`]
|
||||
//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled.
|
||||
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice;
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct Buffer<const N: usize> {
|
||||
allocation: Box<[u8; N]>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl<const N: usize> Default for Buffer<N> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
allocation: Box::new(
|
||||
// SAFETY: zeroed memory is a valid [u8; N]
|
||||
unsafe { MaybeUninit::zeroed().assume_init() },
|
||||
),
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> Buffer<N> {
|
||||
#[inline(always)]
|
||||
fn invariants(&self) {
|
||||
// don't check by default, unoptimized is too expensive even for debug mode
|
||||
if false {
|
||||
debug_assert!(self.written <= N, "{}", self.written);
|
||||
debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_zero_padded_slice(&self) -> &[u8; N] {
|
||||
&self.allocation
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buffer<N> {
|
||||
type IoBuf = Self;
|
||||
|
||||
fn cap(&self) -> usize {
|
||||
self.allocation.len()
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, other: &[u8]) {
|
||||
self.invariants();
|
||||
let remaining = self.allocation.len() - self.written;
|
||||
if other.len() > remaining {
|
||||
panic!("calling extend_from_slice() with insufficient remaining capacity");
|
||||
}
|
||||
self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
|
||||
self.written += other.len();
|
||||
self.invariants();
|
||||
}
|
||||
|
||||
fn pending(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
|
||||
fn flush(self) -> FullSlice<Self> {
|
||||
self.invariants();
|
||||
let written = self.written;
|
||||
FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written))
|
||||
}
|
||||
|
||||
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
|
||||
let Self {
|
||||
mut allocation,
|
||||
written,
|
||||
} = iobuf;
|
||||
allocation[0..written].fill(0);
|
||||
let new = Self {
|
||||
allocation,
|
||||
written: 0,
|
||||
};
|
||||
new.invariants();
|
||||
new
|
||||
}
|
||||
}
|
||||
|
||||
/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a
|
||||
/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data.
|
||||
///
|
||||
/// Remember that bytes_init is generally _not_ a tracker of the amount
|
||||
/// of valid data in the io buffer; we use `Slice` for that.
|
||||
/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit.
|
||||
///
|
||||
/// SAFETY:
|
||||
///
|
||||
/// The [`Self::allocation`] is stable becauses boxes are stable.
|
||||
/// The memory is zero-initialized, so, bytes_init is always N.
|
||||
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buffer<N> {
|
||||
fn stable_ptr(&self) -> *const u8 {
|
||||
self.allocation.as_ptr()
|
||||
}
|
||||
|
||||
fn bytes_init(&self) -> usize {
|
||||
// Yes, N, not self.written; Read the full comment of this impl block!
|
||||
N
|
||||
}
|
||||
|
||||
fn bytes_total(&self) -> usize {
|
||||
N
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,6 @@ mod layer_desc;
|
||||
mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod split_writer;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
|
||||
@@ -36,10 +36,11 @@ use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, Fi
|
||||
use crate::tenant::disk_btree::{
|
||||
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
|
||||
};
|
||||
use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
|
||||
VectoredReadPlanner,
|
||||
VectoredReadCoalesceMode, VectoredReadPlanner,
|
||||
};
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
@@ -64,7 +65,7 @@ use std::os::unix::fs::FileExt;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::OnceCell;
|
||||
use tokio_epoll_uring::IoBufMut;
|
||||
use tokio_epoll_uring::IoBuf;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -470,7 +471,7 @@ impl DeltaLayerWriterInner {
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<Buf>, anyhow::Result<()>)
|
||||
where
|
||||
Buf: IoBufMut + Send,
|
||||
Buf: IoBuf + Send,
|
||||
{
|
||||
assert!(
|
||||
self.lsn_range.start <= lsn,
|
||||
@@ -568,7 +569,6 @@ impl DeltaLayerWriterInner {
|
||||
// 5GB limit for objects without multipart upload (which we don't want to use)
|
||||
// Make it a little bit below to account for differing GB units
|
||||
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
|
||||
const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
|
||||
ensure!(
|
||||
metadata.len() <= S3_UPLOAD_LIMIT,
|
||||
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
|
||||
@@ -678,7 +678,7 @@ impl DeltaLayerWriter {
|
||||
ctx: &RequestContext,
|
||||
) -> (FullSlice<Buf>, anyhow::Result<()>)
|
||||
where
|
||||
Buf: IoBufMut + Send,
|
||||
Buf: IoBuf + Send,
|
||||
{
|
||||
self.inner
|
||||
.as_mut()
|
||||
@@ -702,12 +702,10 @@ impl DeltaLayerWriter {
|
||||
self.inner.take().unwrap().finish(key_end, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
@@ -1207,6 +1205,7 @@ impl DeltaLayerInner {
|
||||
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
|
||||
|
||||
let mut read_builder: Option<VectoredReadBuilder> = None;
|
||||
let read_mode = VectoredReadCoalesceMode::get();
|
||||
|
||||
let max_read_size = self
|
||||
.max_vectored_read_bytes
|
||||
@@ -1255,6 +1254,7 @@ impl DeltaLayerInner {
|
||||
offsets.end.pos(),
|
||||
meta,
|
||||
max_read_size,
|
||||
read_mode,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
@@ -2283,7 +2283,7 @@ pub(crate) mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap();
|
||||
for max_read_size in [1, 1024] {
|
||||
for max_read_size in [1, 2048] {
|
||||
for batch_size in [1, 2, 4, 8, 3, 7, 13] {
|
||||
println!("running with batch_size={batch_size} max_read_size={max_read_size}");
|
||||
// Test if the batch size is correctly determined
|
||||
|
||||
@@ -716,10 +716,6 @@ struct ImageLayerWriterInner {
|
||||
}
|
||||
|
||||
impl ImageLayerWriterInner {
|
||||
fn size(&self) -> u64 {
|
||||
self.tree.borrow_writer().size() + self.blob_writer.size()
|
||||
}
|
||||
|
||||
///
|
||||
/// Start building a new image layer.
|
||||
///
|
||||
@@ -854,13 +850,19 @@ impl ImageLayerWriterInner {
|
||||
res?;
|
||||
}
|
||||
|
||||
let final_key_range = if let Some(end_key) = end_key {
|
||||
self.key_range.start..end_key
|
||||
} else {
|
||||
self.key_range.clone()
|
||||
};
|
||||
|
||||
// Fill in the summary on blk 0
|
||||
let summary = Summary {
|
||||
magic: IMAGE_FILE_MAGIC,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
tenant_id: self.tenant_shard_id.tenant_id,
|
||||
timeline_id: self.timeline_id,
|
||||
key_range: self.key_range.clone(),
|
||||
key_range: final_key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
@@ -881,11 +883,7 @@ impl ImageLayerWriterInner {
|
||||
let desc = PersistentLayerDesc::new_img(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
if let Some(end_key) = end_key {
|
||||
self.key_range.start..end_key
|
||||
} else {
|
||||
self.key_range.clone()
|
||||
},
|
||||
final_key_range,
|
||||
self.lsn,
|
||||
metadata.len(),
|
||||
);
|
||||
@@ -974,14 +972,12 @@ impl ImageLayerWriter {
|
||||
self.inner.as_mut().unwrap().put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Estimated size of the image layer.
|
||||
pub(crate) fn estimated_size(&self) -> u64 {
|
||||
let inner = self.inner.as_ref().unwrap();
|
||||
inner.blob_writer.size() + inner.tree.borrow_writer().size() + PAGE_SZ as u64
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) fn num_keys(&self) -> usize {
|
||||
self.inner.as_ref().unwrap().num_keys
|
||||
}
|
||||
@@ -997,7 +993,6 @@ impl ImageLayerWriter {
|
||||
self.inner.take().unwrap().finish(timeline, ctx, None).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
/// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive.
|
||||
pub(super) async fn finish_with_end_key(
|
||||
mut self,
|
||||
@@ -1011,10 +1006,6 @@ impl ImageLayerWriter {
|
||||
.finish(timeline, ctx, Some(end_key))
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> u64 {
|
||||
self.inner.as_ref().unwrap().size()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
@@ -1376,7 +1367,7 @@ mod test {
|
||||
.await
|
||||
.unwrap();
|
||||
let img_layer = resident_layer.get_as_image(&ctx).await.unwrap();
|
||||
for max_read_size in [1, 1024] {
|
||||
for max_read_size in [1, 2048] {
|
||||
for batch_size in [1, 2, 4, 8, 3, 7, 13] {
|
||||
println!("running with batch_size={batch_size} max_read_size={max_read_size}");
|
||||
// Test if the batch size is correctly determined
|
||||
|
||||
@@ -4,23 +4,23 @@
|
||||
//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
|
||||
//! its position in the file, is kept in memory, though.
|
||||
//!
|
||||
use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -39,6 +39,8 @@ use super::{
|
||||
DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState,
|
||||
};
|
||||
|
||||
pub(crate) mod vectored_dio_read;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)]
|
||||
pub(crate) struct InMemoryLayerFileId(page_cache::FileId);
|
||||
|
||||
@@ -78,9 +80,9 @@ impl std::fmt::Debug for InMemoryLayer {
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The value is an offset into the
|
||||
/// by block number and LSN. The [`IndexEntry`] is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
index: BTreeMap<CompactKey, VecMap<Lsn, u64>>,
|
||||
index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
|
||||
|
||||
/// The values are stored in a serialized format in this file.
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
@@ -90,6 +92,154 @@ pub struct InMemoryLayerInner {
|
||||
resource_units: GlobalResourceUnits,
|
||||
}
|
||||
|
||||
/// Support the same max blob length as blob_io, because ultimately
|
||||
/// all the InMemoryLayer contents end up being written into a delta layer,
|
||||
/// using the [`crate::tenant::blob_io`].
|
||||
const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN;
|
||||
const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
|
||||
let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize;
|
||||
let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize;
|
||||
assert!(trailing_ones + leading_zeroes == std::mem::size_of::<usize>() * 8);
|
||||
trailing_ones
|
||||
};
|
||||
|
||||
/// See [`InMemoryLayerInner::index`].
|
||||
///
|
||||
/// For memory efficiency, the data is packed into a u64.
|
||||
///
|
||||
/// Layout:
|
||||
/// - 1 bit: `will_init`
|
||||
/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len`
|
||||
/// - [`MAX_SUPPORTED_POS_BITS`]: `pos`
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct IndexEntry(u64);
|
||||
|
||||
impl IndexEntry {
|
||||
/// See [`Self::MAX_SUPPORTED_POS`].
|
||||
const MAX_SUPPORTED_POS_BITS: usize = {
|
||||
let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS;
|
||||
if remainder < 32 {
|
||||
panic!("pos can be u32 as per type system, support that");
|
||||
}
|
||||
remainder
|
||||
};
|
||||
/// The maximum supported blob offset that can be represented by [`Self`].
|
||||
/// See also [`Self::validate_checkpoint_distance`].
|
||||
const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1;
|
||||
|
||||
// Layout
|
||||
const WILL_INIT_RANGE: Range<usize> = 0..1;
|
||||
const LEN_RANGE: Range<usize> =
|
||||
Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS;
|
||||
const POS_RANGE: Range<usize> =
|
||||
Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS;
|
||||
const _ASSERT: () = {
|
||||
if Self::POS_RANGE.end != 64 {
|
||||
panic!("we don't want undefined bits for our own sanity")
|
||||
}
|
||||
};
|
||||
|
||||
/// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`].
|
||||
///
|
||||
/// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long.
|
||||
/// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`].
|
||||
///
|
||||
/// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance,
|
||||
/// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value.
|
||||
///
|
||||
/// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested)
|
||||
/// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer.
|
||||
#[inline(always)]
|
||||
fn new(arg: IndexEntryNewArgs) -> anyhow::Result<Self> {
|
||||
let IndexEntryNewArgs {
|
||||
base_offset,
|
||||
batch_offset,
|
||||
len,
|
||||
will_init,
|
||||
} = arg;
|
||||
|
||||
let pos = base_offset
|
||||
.checked_add(batch_offset)
|
||||
.ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?;
|
||||
|
||||
if pos.into_usize() > Self::MAX_SUPPORTED_POS {
|
||||
anyhow::bail!(
|
||||
"base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}",
|
||||
max = Self::MAX_SUPPORTED_POS
|
||||
);
|
||||
}
|
||||
|
||||
if len > MAX_SUPPORTED_BLOB_LEN {
|
||||
anyhow::bail!(
|
||||
"len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}",
|
||||
);
|
||||
}
|
||||
|
||||
let mut data: u64 = 0;
|
||||
use bit_field::BitField;
|
||||
data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 });
|
||||
data.set_bits(Self::LEN_RANGE, len.into_u64());
|
||||
data.set_bits(Self::POS_RANGE, pos);
|
||||
|
||||
Ok(Self(data))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn unpack(&self) -> IndexEntryUnpacked {
|
||||
use bit_field::BitField;
|
||||
IndexEntryUnpacked {
|
||||
will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0,
|
||||
len: self.0.get_bits(Self::LEN_RANGE),
|
||||
pos: self.0.get_bits(Self::POS_RANGE),
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`Self::new`].
|
||||
pub(crate) const fn validate_checkpoint_distance(
|
||||
checkpoint_distance: u64,
|
||||
) -> Result<(), &'static str> {
|
||||
if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 {
|
||||
return Err("exceeds the maximum supported value");
|
||||
}
|
||||
let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN);
|
||||
if res.is_none() {
|
||||
return Err(
|
||||
"checkpoint distance + max supported blob len overflows in-memory addition",
|
||||
);
|
||||
}
|
||||
|
||||
// NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = {
|
||||
let res = Self::validate_checkpoint_distance(
|
||||
crate::tenant::config::defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
);
|
||||
if res.is_err() {
|
||||
panic!("default checkpoint distance is valid")
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// Args to [`IndexEntry::new`].
|
||||
#[derive(Clone, Copy)]
|
||||
struct IndexEntryNewArgs {
|
||||
base_offset: u64,
|
||||
batch_offset: u64,
|
||||
len: usize,
|
||||
will_init: bool,
|
||||
}
|
||||
|
||||
/// Unpacked representation of the bitfielded [`IndexEntry`].
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
struct IndexEntryUnpacked {
|
||||
will_init: bool,
|
||||
len: u64,
|
||||
pos: u64,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("InMemoryLayerInner").finish()
|
||||
@@ -276,7 +426,12 @@ impl InMemoryLayer {
|
||||
.build();
|
||||
|
||||
let inner = self.inner.read().await;
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
struct ValueRead {
|
||||
entry_lsn: Lsn,
|
||||
read: vectored_dio_read::LogicalRead<Vec<u8>>,
|
||||
}
|
||||
let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
for (key, vec_map) in inner
|
||||
@@ -291,24 +446,62 @@ impl InMemoryLayer {
|
||||
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
|
||||
let buf = reader.read_blob(*pos, &ctx).await;
|
||||
if let Err(e) = buf {
|
||||
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
for (entry_lsn, index_entry) in slice.iter().rev() {
|
||||
let IndexEntryUnpacked {
|
||||
pos,
|
||||
len,
|
||||
will_init,
|
||||
} = index_entry.unpack();
|
||||
reads.entry(key).or_default().push(ValueRead {
|
||||
entry_lsn: *entry_lsn,
|
||||
read: vectored_dio_read::LogicalRead::new(
|
||||
pos,
|
||||
Vec::with_capacity(len as usize),
|
||||
),
|
||||
});
|
||||
if will_init {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let value = Value::des(&buf.unwrap());
|
||||
if let Err(e) = value {
|
||||
// Execute the reads.
|
||||
|
||||
let f = vectored_dio_read::execute(
|
||||
&inner.file,
|
||||
reads
|
||||
.iter()
|
||||
.flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)),
|
||||
&ctx,
|
||||
);
|
||||
send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865
|
||||
.await;
|
||||
|
||||
// Process results into the reconstruct state
|
||||
'next_key: for (key, value_reads) in reads {
|
||||
for ValueRead { entry_lsn, read } in value_reads {
|
||||
match read.into_result().expect("we run execute() above") {
|
||||
Err(e) => {
|
||||
reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
break;
|
||||
continue 'next_key;
|
||||
}
|
||||
Ok(value_buf) => {
|
||||
let value = Value::des(&value_buf);
|
||||
if let Err(e) = value {
|
||||
reconstruct_state
|
||||
.on_key_error(key, PageReconstructError::from(anyhow!(e)));
|
||||
continue 'next_key;
|
||||
}
|
||||
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(&key, *entry_lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
break;
|
||||
let key_situation =
|
||||
reconstruct_state.update_key(&key, entry_lsn, value.unwrap());
|
||||
if key_situation == ValueReconstructSituation::Complete {
|
||||
// TODO: metric to see if we fetched more values than necessary
|
||||
continue 'next_key;
|
||||
}
|
||||
|
||||
// process the next value in the next iteration of the loop
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -324,8 +517,9 @@ impl InMemoryLayer {
|
||||
struct SerializedBatchOffset {
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
/// offset in bytes from the start of the batch's buffer to the Value's serialized size header.
|
||||
offset: u64,
|
||||
// TODO: separate type when we start serde-serializing this value, to avoid coupling
|
||||
// in-memory representation to serialization format.
|
||||
index_entry: IndexEntry,
|
||||
}
|
||||
|
||||
pub struct SerializedBatch {
|
||||
@@ -340,30 +534,10 @@ pub struct SerializedBatch {
|
||||
}
|
||||
|
||||
impl SerializedBatch {
|
||||
/// Write a blob length in the internal format of the EphemeralFile
|
||||
pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor<Vec<u8>>) {
|
||||
use std::io::Write;
|
||||
|
||||
if len < 0x80 {
|
||||
// short one-byte length header
|
||||
let len_buf = [len as u8];
|
||||
|
||||
cursor
|
||||
.write_all(&len_buf)
|
||||
.expect("Writing to Vec is infallible");
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(len as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
cursor
|
||||
.write_all(&len_buf)
|
||||
.expect("Writing to Vec is infallible");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
|
||||
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
|
||||
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
|
||||
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
|
||||
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>() + 4 * batch.len();
|
||||
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
|
||||
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
|
||||
|
||||
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
|
||||
@@ -371,14 +545,19 @@ impl SerializedBatch {
|
||||
for (key, lsn, val_ser_size, val) in batch {
|
||||
let relative_off = cursor.position();
|
||||
|
||||
Self::write_blob_length(val_ser_size, &mut cursor);
|
||||
val.ser_into(&mut cursor)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
offsets.push(SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
offset: relative_off,
|
||||
index_entry: IndexEntry::new(IndexEntryNewArgs {
|
||||
base_offset: 0,
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
})
|
||||
.context("higher-level code ensures that values are within supported ranges")?,
|
||||
});
|
||||
max_lsn = std::cmp::max(max_lsn, lsn);
|
||||
}
|
||||
@@ -388,11 +567,11 @@ impl SerializedBatch {
|
||||
// Assert that we didn't do any extra allocations while building buffer.
|
||||
debug_assert!(buffer.len() <= buffer_size);
|
||||
|
||||
Self {
|
||||
Ok(Self {
|
||||
raw: buffer,
|
||||
offsets,
|
||||
max_lsn,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -456,44 +635,69 @@ impl InMemoryLayer {
|
||||
})
|
||||
}
|
||||
|
||||
// Write path.
|
||||
/// Write path.
|
||||
///
|
||||
/// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from.
|
||||
/// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable.
|
||||
/// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
|
||||
pub async fn put_batch(
|
||||
&self,
|
||||
serialized_batch: SerializedBatch,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<()> {
|
||||
) -> anyhow::Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
|
||||
let base_off = {
|
||||
inner
|
||||
.file
|
||||
.write_raw(
|
||||
&serialized_batch.raw,
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build(),
|
||||
)
|
||||
.await?
|
||||
};
|
||||
let base_offset = inner.file.len();
|
||||
|
||||
let SerializedBatch {
|
||||
raw,
|
||||
mut offsets,
|
||||
max_lsn: _,
|
||||
} = serialized_batch;
|
||||
|
||||
// Add the base_offset to the batch's index entries which are relative to the batch start.
|
||||
for offset in &mut offsets {
|
||||
let IndexEntryUnpacked {
|
||||
will_init,
|
||||
len,
|
||||
pos,
|
||||
} = offset.index_entry.unpack();
|
||||
offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
|
||||
base_offset,
|
||||
batch_offset: pos,
|
||||
len: len.into_usize(),
|
||||
will_init,
|
||||
})?;
|
||||
}
|
||||
|
||||
// Write the batch to the file
|
||||
inner.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = inner.file.len();
|
||||
let expected_new_len = base_offset
|
||||
.checked_add(raw.len().into_u64())
|
||||
// write_raw would error if we were to overflow u64.
|
||||
// also IndexEntry and higher levels in
|
||||
//the code don't allow the file to grow that large
|
||||
.unwrap();
|
||||
assert_eq!(new_size, expected_new_len);
|
||||
|
||||
// Update the index with the new entries
|
||||
for SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
offset: relative_off,
|
||||
} in serialized_batch.offsets
|
||||
index_entry,
|
||||
} in offsets
|
||||
{
|
||||
let off = base_off + relative_off;
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
let size = inner.file.len();
|
||||
inner.resource_units.maybe_publish_size(size);
|
||||
inner.resource_units.maybe_publish_size(new_size);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -537,7 +741,7 @@ impl InMemoryLayer {
|
||||
{
|
||||
let inner = self.inner.write().await;
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
for (lsn, _) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
}
|
||||
@@ -601,36 +805,23 @@ impl InMemoryLayer {
|
||||
match l0_flush_global_state {
|
||||
l0_flush::Inner::Direct { .. } => {
|
||||
let file_contents: Vec<u8> = inner.file.load_to_vec(ctx).await?;
|
||||
assert_eq!(
|
||||
file_contents.len() % PAGE_SZ,
|
||||
0,
|
||||
"needed by BlockReaderRef::Slice"
|
||||
);
|
||||
assert_eq!(file_contents.len(), {
|
||||
let written = usize::try_from(inner.file.len()).unwrap();
|
||||
if written % PAGE_SZ == 0 {
|
||||
written
|
||||
} else {
|
||||
written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap()
|
||||
}
|
||||
});
|
||||
|
||||
let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents));
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let file_contents = Bytes::from(file_contents);
|
||||
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
// TODO: once we have blob lengths in the in-memory index, we can
|
||||
// 1. get rid of the blob_io / BlockReaderRef::Slice business and
|
||||
// 2. load the file contents into a Bytes and
|
||||
// 3. the use `Bytes::slice` to get the `buf` that is our blob
|
||||
// 4. pass that `buf` into `put_value_bytes`
|
||||
// => https://github.com/neondatabase/neon/issues/8183
|
||||
cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
let (tmp, res) = delta_layer_writer
|
||||
for (lsn, entry) in vec_map
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|(lsn, entry)| (lsn, entry.unpack()))
|
||||
{
|
||||
let IndexEntryUnpacked {
|
||||
pos,
|
||||
len,
|
||||
will_init,
|
||||
} = entry;
|
||||
let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize);
|
||||
let (_buf, res) = delta_layer_writer
|
||||
.put_value_bytes(
|
||||
Key::from_compact(*key),
|
||||
*lsn,
|
||||
@@ -640,7 +831,6 @@ impl InMemoryLayer {
|
||||
)
|
||||
.await;
|
||||
res?;
|
||||
buf = tmp.into_raw_slice().into_inner();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -662,3 +852,134 @@ impl InMemoryLayer {
|
||||
Ok(Some((desc, path)))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_index_entry() {
|
||||
const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS;
|
||||
use IndexEntryNewArgs as Args;
|
||||
use IndexEntryUnpacked as Unpacked;
|
||||
|
||||
let roundtrip = |args, expect: Unpacked| {
|
||||
let res = IndexEntry::new(args).expect("this tests expects no errors");
|
||||
let IndexEntryUnpacked {
|
||||
will_init,
|
||||
len,
|
||||
pos,
|
||||
} = res.unpack();
|
||||
assert_eq!(will_init, expect.will_init);
|
||||
assert_eq!(len, expect.len);
|
||||
assert_eq!(pos, expect.pos);
|
||||
};
|
||||
|
||||
// basic roundtrip
|
||||
for pos in [0, MAX_SUPPORTED_POS] {
|
||||
for len in [0, MAX_SUPPORTED_BLOB_LEN] {
|
||||
for will_init in [true, false] {
|
||||
let expect = Unpacked {
|
||||
will_init,
|
||||
len: len.into_u64(),
|
||||
pos: pos.into_u64(),
|
||||
};
|
||||
roundtrip(
|
||||
Args {
|
||||
will_init,
|
||||
base_offset: pos.into_u64(),
|
||||
batch_offset: 0,
|
||||
len,
|
||||
},
|
||||
expect,
|
||||
);
|
||||
roundtrip(
|
||||
Args {
|
||||
will_init,
|
||||
base_offset: 0,
|
||||
batch_offset: pos.into_u64(),
|
||||
len,
|
||||
},
|
||||
expect,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// too-large len
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: MAX_SUPPORTED_BLOB_LEN + 1,
|
||||
base_offset: 0,
|
||||
batch_offset: 0,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
|
||||
// too-large pos
|
||||
{
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: 0,
|
||||
base_offset: MAX_SUPPORTED_POS.into_u64() + 1,
|
||||
batch_offset: 0,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: 0,
|
||||
base_offset: 0,
|
||||
batch_offset: MAX_SUPPORTED_POS.into_u64() + 1,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
}
|
||||
|
||||
// too large (base_offset + batch_offset)
|
||||
{
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: 0,
|
||||
base_offset: MAX_SUPPORTED_POS.into_u64(),
|
||||
batch_offset: 1,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
let too_large = Args {
|
||||
will_init: false,
|
||||
len: 0,
|
||||
base_offset: MAX_SUPPORTED_POS.into_u64() - 1,
|
||||
batch_offset: MAX_SUPPORTED_POS.into_u64() - 1,
|
||||
};
|
||||
assert!(IndexEntry::new(too_large).is_err());
|
||||
}
|
||||
|
||||
// valid special cases
|
||||
// - area past the max supported pos that is accessible by len
|
||||
for len in [1, MAX_SUPPORTED_BLOB_LEN] {
|
||||
roundtrip(
|
||||
Args {
|
||||
will_init: false,
|
||||
len,
|
||||
base_offset: MAX_SUPPORTED_POS.into_u64(),
|
||||
batch_offset: 0,
|
||||
},
|
||||
Unpacked {
|
||||
will_init: false,
|
||||
len: len as u64,
|
||||
pos: MAX_SUPPORTED_POS.into_u64(),
|
||||
},
|
||||
);
|
||||
roundtrip(
|
||||
Args {
|
||||
will_init: false,
|
||||
len,
|
||||
base_offset: 0,
|
||||
batch_offset: MAX_SUPPORTED_POS.into_u64(),
|
||||
},
|
||||
Unpacked {
|
||||
will_init: false,
|
||||
len: len as u64,
|
||||
pos: MAX_SUPPORTED_POS.into_u64(),
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,937 @@
|
||||
use std::{
|
||||
collections::BTreeMap,
|
||||
sync::{Arc, RwLock},
|
||||
};
|
||||
|
||||
use itertools::Itertools;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice};
|
||||
|
||||
use crate::{
|
||||
assert_u64_eq_usize::{U64IsUsize, UsizeIsU64},
|
||||
context::RequestContext,
|
||||
};
|
||||
|
||||
/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`].
|
||||
pub trait File: Send {
|
||||
/// Attempt to read the bytes in `self` in range `[start,start+dst.bytes_total())`
|
||||
/// and return the number of bytes read (let's call it `nread`).
|
||||
/// The bytes read are placed in `dst`, i.e., `&dst[..nread]` will contain the read bytes.
|
||||
///
|
||||
/// The only reason why the read may be short (i.e., `nread != dst.bytes_total()`)
|
||||
/// is if the file is shorter than `start+dst.len()`.
|
||||
///
|
||||
/// This is unlike [`std::os::unix::fs::FileExt::read_exact_at`] which returns an
|
||||
/// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`.
|
||||
///
|
||||
/// No guarantees are made about the remaining bytes in `dst` in case of a short read.
|
||||
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
|
||||
&'b self,
|
||||
start: u64,
|
||||
dst: Slice<B>,
|
||||
ctx: &'a RequestContext,
|
||||
) -> std::io::Result<(Slice<B>, usize)>;
|
||||
}
|
||||
|
||||
/// A logical read from [`File`]. See [`Self::new`].
|
||||
pub struct LogicalRead<B: Buffer> {
|
||||
pos: u64,
|
||||
state: RwLockRefCell<LogicalReadState<B>>,
|
||||
}
|
||||
|
||||
enum LogicalReadState<B: Buffer> {
|
||||
NotStarted(B),
|
||||
Ongoing(B),
|
||||
Ok(B),
|
||||
Error(Arc<std::io::Error>),
|
||||
Undefined,
|
||||
}
|
||||
|
||||
impl<B: Buffer> LogicalRead<B> {
|
||||
/// Create a new [`LogicalRead`] from [`File`] of the data in the file in range `[ pos, pos + buf.cap() )`.
|
||||
pub fn new(pos: u64, buf: B) -> Self {
|
||||
Self {
|
||||
pos,
|
||||
state: RwLockRefCell::new(LogicalReadState::NotStarted(buf)),
|
||||
}
|
||||
}
|
||||
pub fn into_result(self) -> Option<Result<B, Arc<std::io::Error>>> {
|
||||
match self.state.into_inner() {
|
||||
LogicalReadState::Ok(buf) => Some(Ok(buf)),
|
||||
LogicalReadState::Error(e) => Some(Err(e)),
|
||||
LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => None,
|
||||
LogicalReadState::Undefined => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The buffer into which a [`LogicalRead`] result is placed.
|
||||
pub trait Buffer: std::ops::Deref<Target = [u8]> {
|
||||
/// Immutable.
|
||||
fn cap(&self) -> usize;
|
||||
/// Changes only through [`Self::extend_from_slice`].
|
||||
fn len(&self) -> usize;
|
||||
/// Panics if the total length would exceed the initialized capacity.
|
||||
fn extend_from_slice(&mut self, src: &[u8]);
|
||||
}
|
||||
|
||||
/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO.
|
||||
const DIO_CHUNK_SIZE: usize = 512;
|
||||
|
||||
/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`.
|
||||
/// (The unit is the number of chunks.)
|
||||
const MAX_CHUNK_BATCH_SIZE: usize = {
|
||||
let desired = 128 * 1024; // 128k
|
||||
if desired % DIO_CHUNK_SIZE != 0 {
|
||||
panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE")
|
||||
// compile-time error
|
||||
}
|
||||
desired / DIO_CHUNK_SIZE
|
||||
};
|
||||
|
||||
/// Execute the given logical `reads` against `file`.
|
||||
/// The results are placed in the buffers of the [`LogicalRead`]s.
|
||||
/// Retrieve the results by calling [`LogicalRead::into_result`] on each [`LogicalRead`].
|
||||
///
|
||||
/// The [`LogicalRead`]s must be freshly created using [`LogicalRead::new`] when calling this function.
|
||||
/// Otherwise, this function panics.
|
||||
pub async fn execute<'a, I, F, B>(file: &F, reads: I, ctx: &RequestContext)
|
||||
where
|
||||
I: IntoIterator<Item = &'a LogicalRead<B>>,
|
||||
F: File,
|
||||
B: Buffer + IoBufMut + Send,
|
||||
{
|
||||
// Terminology:
|
||||
// logical read = a request to read an arbitrary range of bytes from `file`; byte-level granularity
|
||||
// chunk = we conceptually divide up the byte range of `file` into DIO_CHUNK_SIZEs ranges
|
||||
// interest = a range within a chunk that a logical read is interested in; one logical read gets turned into many interests
|
||||
// physical read = the read request we're going to issue to the OS; covers a range of chunks; chunk-level granularity
|
||||
|
||||
// Preserve a copy of the logical reads for debug assertions at the end
|
||||
#[cfg(debug_assertions)]
|
||||
let (reads, assert_logical_reads) = {
|
||||
let (reads, assert) = reads.into_iter().tee();
|
||||
(reads, Some(Vec::from_iter(assert)))
|
||||
};
|
||||
#[cfg(not(debug_assertions))]
|
||||
let (reads, assert_logical_reads): (_, Option<Vec<&'a LogicalRead<B>>>) = (reads, None);
|
||||
|
||||
// Plan which parts of which chunks need to be appended to which buffer
|
||||
let mut by_chunk: BTreeMap<u64, Vec<Interest<B>>> = BTreeMap::new();
|
||||
struct Interest<'a, B: Buffer> {
|
||||
logical_read: &'a LogicalRead<B>,
|
||||
offset_in_chunk: u64,
|
||||
len: u64,
|
||||
}
|
||||
for logical_read in reads {
|
||||
let LogicalRead { pos, state } = logical_read;
|
||||
let mut state = state.borrow_mut();
|
||||
|
||||
// transition from NotStarted to Ongoing
|
||||
let cur = std::mem::replace(&mut *state, LogicalReadState::Undefined);
|
||||
let req_len = match cur {
|
||||
LogicalReadState::NotStarted(buf) => {
|
||||
if buf.len() != 0 {
|
||||
panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`");
|
||||
}
|
||||
// buf.cap() == 0 is ok
|
||||
|
||||
// transition into Ongoing state
|
||||
let req_len = buf.cap();
|
||||
*state = LogicalReadState::Ongoing(buf);
|
||||
req_len
|
||||
}
|
||||
x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"),
|
||||
};
|
||||
|
||||
// plan which chunks we need to read from
|
||||
let mut remaining = req_len;
|
||||
let mut chunk_no = *pos / (DIO_CHUNK_SIZE.into_u64());
|
||||
let mut offset_in_chunk = pos.into_usize() % DIO_CHUNK_SIZE;
|
||||
while remaining > 0 {
|
||||
let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk);
|
||||
by_chunk.entry(chunk_no).or_default().push(Interest {
|
||||
logical_read,
|
||||
offset_in_chunk: offset_in_chunk.into_u64(),
|
||||
len: remaining_in_chunk.into_u64(),
|
||||
});
|
||||
offset_in_chunk = 0;
|
||||
chunk_no += 1;
|
||||
remaining -= remaining_in_chunk;
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, we could iterate over by_chunk, in chunk order,
|
||||
// read each chunk from disk, and fill the buffers.
|
||||
// However, we can merge adjacent chunks into batches of MAX_CHUNK_BATCH_SIZE
|
||||
// so we issue fewer IOs = fewer roundtrips = lower overall latency.
|
||||
struct PhysicalRead<'a, B: Buffer> {
|
||||
start_chunk_no: u64,
|
||||
nchunks: usize,
|
||||
dsts: Vec<PhysicalInterest<'a, B>>,
|
||||
}
|
||||
struct PhysicalInterest<'a, B: Buffer> {
|
||||
logical_read: &'a LogicalRead<B>,
|
||||
offset_in_physical_read: u64,
|
||||
len: u64,
|
||||
}
|
||||
let mut physical_reads: Vec<PhysicalRead<B>> = Vec::new();
|
||||
let mut by_chunk = by_chunk.into_iter().peekable();
|
||||
loop {
|
||||
let mut last_chunk_no = None;
|
||||
let to_merge: Vec<(u64, Vec<Interest<B>>)> = by_chunk
|
||||
.peeking_take_while(|(chunk_no, _)| {
|
||||
if let Some(last_chunk_no) = last_chunk_no {
|
||||
if *chunk_no != last_chunk_no + 1 {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
last_chunk_no = Some(*chunk_no);
|
||||
true
|
||||
})
|
||||
.take(MAX_CHUNK_BATCH_SIZE)
|
||||
.collect(); // TODO: avoid this .collect()
|
||||
let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else {
|
||||
break;
|
||||
};
|
||||
let nchunks = to_merge.len();
|
||||
let dsts = to_merge
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.flat_map(|(i, (_, dsts))| {
|
||||
dsts.into_iter().map(
|
||||
move |Interest {
|
||||
logical_read,
|
||||
offset_in_chunk,
|
||||
len,
|
||||
}| {
|
||||
PhysicalInterest {
|
||||
logical_read,
|
||||
offset_in_physical_read: i
|
||||
.checked_mul(DIO_CHUNK_SIZE)
|
||||
.unwrap()
|
||||
.into_u64()
|
||||
+ offset_in_chunk,
|
||||
len,
|
||||
}
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
physical_reads.push(PhysicalRead {
|
||||
start_chunk_no,
|
||||
nchunks,
|
||||
dsts,
|
||||
});
|
||||
}
|
||||
drop(by_chunk);
|
||||
|
||||
// Execute physical reads and fill the logical read buffers
|
||||
// TODO: pipelined reads; prefetch;
|
||||
let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE);
|
||||
for PhysicalRead {
|
||||
start_chunk_no,
|
||||
nchunks,
|
||||
dsts,
|
||||
} in physical_reads
|
||||
{
|
||||
let all_done = dsts
|
||||
.iter()
|
||||
.all(|PhysicalInterest { logical_read, .. }| logical_read.state.borrow().is_terminal());
|
||||
if all_done {
|
||||
continue;
|
||||
}
|
||||
let read_offset = start_chunk_no
|
||||
.checked_mul(DIO_CHUNK_SIZE.into_u64())
|
||||
.expect("we produce chunk_nos by dividing by DIO_CHUNK_SIZE earlier");
|
||||
let io_buf = get_io_buffer(nchunks).slice_full();
|
||||
let req_len = io_buf.len();
|
||||
let (io_buf_slice, nread) = match file.read_exact_at_eof_ok(read_offset, io_buf, ctx).await
|
||||
{
|
||||
Ok(t) => t,
|
||||
Err(e) => {
|
||||
let e = Arc::new(e);
|
||||
for PhysicalInterest { logical_read, .. } in dsts {
|
||||
*logical_read.state.borrow_mut() = LogicalReadState::Error(Arc::clone(&e));
|
||||
// this will make later reads for the given LogicalRead short-circuit, see top of loop body
|
||||
}
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let io_buf = io_buf_slice.into_inner();
|
||||
assert!(
|
||||
nread <= io_buf.len(),
|
||||
"the last chunk in the file can be a short read, so, no =="
|
||||
);
|
||||
let io_buf = &io_buf[..nread];
|
||||
for PhysicalInterest {
|
||||
logical_read,
|
||||
offset_in_physical_read,
|
||||
len,
|
||||
} in dsts
|
||||
{
|
||||
let mut logical_read_state_borrow = logical_read.state.borrow_mut();
|
||||
let logical_read_buf = match &mut *logical_read_state_borrow {
|
||||
LogicalReadState::NotStarted(_) => {
|
||||
unreachable!("we transition it into Ongoing at function entry")
|
||||
}
|
||||
LogicalReadState::Ongoing(buf) => buf,
|
||||
LogicalReadState::Ok(_) | LogicalReadState::Error(_) => {
|
||||
continue;
|
||||
}
|
||||
LogicalReadState::Undefined => unreachable!(),
|
||||
};
|
||||
let range_in_io_buf = std::ops::Range {
|
||||
start: offset_in_physical_read as usize,
|
||||
end: offset_in_physical_read as usize + len as usize,
|
||||
};
|
||||
assert!(range_in_io_buf.end >= range_in_io_buf.start);
|
||||
if range_in_io_buf.end > nread {
|
||||
let msg = format!(
|
||||
"physical read returned EOF where this logical read expected more data in the file: offset=0x{read_offset:x} req_len=0x{req_len:x} nread=0x{nread:x} {:?}",
|
||||
&*logical_read_state_borrow
|
||||
);
|
||||
logical_read_state_borrow.transition_to_terminal(Err(std::io::Error::new(
|
||||
std::io::ErrorKind::UnexpectedEof,
|
||||
msg,
|
||||
)));
|
||||
continue;
|
||||
}
|
||||
let data = &io_buf[range_in_io_buf];
|
||||
|
||||
// Copy data from io buffer into the logical read buffer.
|
||||
// (And in debug mode, validate that the buffer impl adheres to the Buffer trait spec.)
|
||||
let pre = if cfg!(debug_assertions) {
|
||||
Some((logical_read_buf.len(), logical_read_buf.cap()))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
logical_read_buf.extend_from_slice(data);
|
||||
let post = if cfg!(debug_assertions) {
|
||||
Some((logical_read_buf.len(), logical_read_buf.cap()))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
match (pre, post) {
|
||||
(None, None) => {}
|
||||
(Some(_), None) | (None, Some(_)) => unreachable!(),
|
||||
(Some((pre_len, pre_cap)), Some((post_len, post_cap))) => {
|
||||
assert_eq!(pre_len + len as usize, post_len);
|
||||
assert_eq!(pre_cap, post_cap);
|
||||
}
|
||||
}
|
||||
|
||||
if logical_read_buf.len() == logical_read_buf.cap() {
|
||||
logical_read_state_borrow.transition_to_terminal(Ok(()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(assert_logical_reads) = assert_logical_reads {
|
||||
for logical_read in assert_logical_reads {
|
||||
assert!(logical_read.state.borrow().is_terminal());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: Buffer> LogicalReadState<B> {
|
||||
fn is_terminal(&self) -> bool {
|
||||
match self {
|
||||
LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => false,
|
||||
LogicalReadState::Ok(_) | LogicalReadState::Error(_) => true,
|
||||
LogicalReadState::Undefined => unreachable!(),
|
||||
}
|
||||
}
|
||||
fn transition_to_terminal(&mut self, err: std::io::Result<()>) {
|
||||
let cur = std::mem::replace(self, LogicalReadState::Undefined);
|
||||
let buf = match cur {
|
||||
LogicalReadState::Ongoing(buf) => buf,
|
||||
x => panic!("must only call in state Ongoing, got {x:?}"),
|
||||
};
|
||||
*self = match err {
|
||||
Ok(()) => LogicalReadState::Ok(buf),
|
||||
Err(e) => LogicalReadState::Error(Arc::new(e)),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl<B: Buffer> std::fmt::Debug for LogicalReadState<B> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
#[derive(Debug)]
|
||||
#[allow(unused)]
|
||||
struct BufferDebug {
|
||||
len: usize,
|
||||
cap: usize,
|
||||
}
|
||||
impl<'a> From<&'a dyn Buffer> for BufferDebug {
|
||||
fn from(buf: &'a dyn Buffer) -> Self {
|
||||
Self {
|
||||
len: buf.len(),
|
||||
cap: buf.cap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
match self {
|
||||
LogicalReadState::NotStarted(b) => {
|
||||
write!(f, "NotStarted({:?})", BufferDebug::from(b as &dyn Buffer))
|
||||
}
|
||||
LogicalReadState::Ongoing(b) => {
|
||||
write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer))
|
||||
}
|
||||
LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)),
|
||||
LogicalReadState::Error(e) => write!(f, "Error({:?})", e),
|
||||
LogicalReadState::Undefined => write!(f, "Undefined"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct RwLockRefCell<T>(RwLock<T>);
|
||||
impl<T> RwLockRefCell<T> {
|
||||
fn new(value: T) -> Self {
|
||||
Self(RwLock::new(value))
|
||||
}
|
||||
fn borrow(&self) -> impl std::ops::Deref<Target = T> + '_ {
|
||||
self.0.try_read().unwrap()
|
||||
}
|
||||
fn borrow_mut(&self) -> impl std::ops::DerefMut<Target = T> + '_ {
|
||||
self.0.try_write().unwrap()
|
||||
}
|
||||
fn into_inner(self) -> T {
|
||||
self.0.into_inner().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl Buffer for Vec<u8> {
|
||||
fn cap(&self) -> usize {
|
||||
self.capacity()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, src: &[u8]) {
|
||||
if self.len() + src.len() > self.cap() {
|
||||
panic!("Buffer capacity exceeded");
|
||||
}
|
||||
Vec::extend_from_slice(self, src);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[allow(clippy::assertions_on_constants)]
|
||||
mod tests {
|
||||
use rand::Rng;
|
||||
|
||||
use crate::{
|
||||
context::DownloadBehavior, task_mgr::TaskKind,
|
||||
virtual_file::owned_buffers_io::slice::SliceMutExt,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use std::{cell::RefCell, collections::VecDeque};
|
||||
|
||||
struct InMemoryFile {
|
||||
content: Vec<u8>,
|
||||
}
|
||||
|
||||
impl InMemoryFile {
|
||||
fn new_random(len: usize) -> Self {
|
||||
Self {
|
||||
content: rand::thread_rng()
|
||||
.sample_iter(rand::distributions::Standard)
|
||||
.take(len)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
fn test_logical_read(&self, pos: u64, len: usize) -> TestLogicalRead {
|
||||
let expected_result = if pos as usize + len > self.content.len() {
|
||||
Err("InMemoryFile short read".to_string())
|
||||
} else {
|
||||
Ok(self.content[pos as usize..pos as usize + len].to_vec())
|
||||
};
|
||||
TestLogicalRead::new(pos, len, expected_result)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_in_memory_file() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let file = InMemoryFile::new_random(10);
|
||||
let test_read = |pos, len| {
|
||||
let buf = vec![0; len];
|
||||
let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx);
|
||||
use futures::FutureExt;
|
||||
let (slice, nread) = fut
|
||||
.now_or_never()
|
||||
.expect("impl never awaits")
|
||||
.expect("impl never errors");
|
||||
let mut buf = slice.into_inner();
|
||||
buf.truncate(nread);
|
||||
buf
|
||||
};
|
||||
assert_eq!(test_read(0, 1), &file.content[0..1]);
|
||||
assert_eq!(test_read(1, 2), &file.content[1..3]);
|
||||
assert_eq!(test_read(9, 2), &file.content[9..]);
|
||||
assert!(test_read(10, 2).is_empty());
|
||||
assert!(test_read(11, 2).is_empty());
|
||||
}
|
||||
|
||||
impl File for InMemoryFile {
|
||||
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
|
||||
&'b self,
|
||||
start: u64,
|
||||
mut dst: Slice<B>,
|
||||
_ctx: &'a RequestContext,
|
||||
) -> std::io::Result<(Slice<B>, usize)> {
|
||||
let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed();
|
||||
let nread = {
|
||||
let req_len = dst_slice.len();
|
||||
let len = std::cmp::min(req_len, self.content.len().saturating_sub(start as usize));
|
||||
if start as usize >= self.content.len() {
|
||||
0
|
||||
} else {
|
||||
dst_slice[..len]
|
||||
.copy_from_slice(&self.content[start as usize..start as usize + len]);
|
||||
len
|
||||
}
|
||||
};
|
||||
rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs
|
||||
Ok((dst, nread))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct TestLogicalRead {
|
||||
pos: u64,
|
||||
len: usize,
|
||||
expected_result: Result<Vec<u8>, String>,
|
||||
}
|
||||
|
||||
impl TestLogicalRead {
|
||||
fn new(pos: u64, len: usize, expected_result: Result<Vec<u8>, String>) -> Self {
|
||||
Self {
|
||||
pos,
|
||||
len,
|
||||
expected_result,
|
||||
}
|
||||
}
|
||||
fn make_logical_read(&self) -> LogicalRead<Vec<u8>> {
|
||||
LogicalRead::new(self.pos, Vec::with_capacity(self.len))
|
||||
}
|
||||
}
|
||||
|
||||
async fn execute_and_validate_test_logical_reads<I, F>(
|
||||
file: &F,
|
||||
test_logical_reads: I,
|
||||
ctx: &RequestContext,
|
||||
) where
|
||||
I: IntoIterator<Item = TestLogicalRead>,
|
||||
F: File,
|
||||
{
|
||||
let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee();
|
||||
let logical_reads = tmp.map(|tr| tr.make_logical_read()).collect::<Vec<_>>();
|
||||
execute(file, logical_reads.iter(), ctx).await;
|
||||
for (logical_read, test_logical_read) in logical_reads.into_iter().zip(test_logical_reads) {
|
||||
let actual = logical_read.into_result().expect("we call execute()");
|
||||
match (actual, test_logical_read.expected_result) {
|
||||
(Ok(actual), Ok(expected)) if actual == expected => {}
|
||||
(Err(actual), Err(expected)) => {
|
||||
assert_eq!(actual.to_string(), expected);
|
||||
}
|
||||
(actual, expected) => panic!("expected {expected:?}\nactual {actual:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_blackbox() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let cs = DIO_CHUNK_SIZE;
|
||||
let cs_u64 = cs.into_u64();
|
||||
|
||||
let file = InMemoryFile::new_random(10 * cs);
|
||||
|
||||
let test_logical_reads = vec![
|
||||
file.test_logical_read(0, 1),
|
||||
// adjacent to logical_read0
|
||||
file.test_logical_read(1, 2),
|
||||
// gap
|
||||
// spans adjacent chunks
|
||||
file.test_logical_read(cs_u64 - 1, 2),
|
||||
// gap
|
||||
// tail of chunk 3, all of chunk 4, and 2 bytes of chunk 5
|
||||
file.test_logical_read(3 * cs_u64 - 1, cs + 2),
|
||||
// gap
|
||||
file.test_logical_read(5 * cs_u64, 1),
|
||||
];
|
||||
let num_test_logical_reads = test_logical_reads.len();
|
||||
let test_logical_reads_perms = test_logical_reads
|
||||
.into_iter()
|
||||
.permutations(num_test_logical_reads);
|
||||
|
||||
// test all orderings of LogicalReads, the order shouldn't matter for the results
|
||||
for test_logical_reads in test_logical_reads_perms {
|
||||
execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
#[should_panic]
|
||||
async fn test_reusing_logical_reads_panics() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
let file = InMemoryFile::new_random(DIO_CHUNK_SIZE);
|
||||
let a = file.test_logical_read(23, 10);
|
||||
let logical_reads = vec![a.make_logical_read()];
|
||||
execute(&file, &logical_reads, &ctx).await;
|
||||
// reuse pancis
|
||||
execute(&file, &logical_reads, &ctx).await;
|
||||
}
|
||||
|
||||
struct RecorderFile<'a> {
|
||||
recorded: RefCell<Vec<RecordedRead>>,
|
||||
file: &'a InMemoryFile,
|
||||
}
|
||||
|
||||
struct RecordedRead {
|
||||
pos: u64,
|
||||
req_len: usize,
|
||||
res: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<'a> RecorderFile<'a> {
|
||||
fn new(file: &'a InMemoryFile) -> RecorderFile<'a> {
|
||||
Self {
|
||||
recorded: Default::default(),
|
||||
file,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'x> File for RecorderFile<'x> {
|
||||
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
|
||||
&'b self,
|
||||
start: u64,
|
||||
dst: Slice<B>,
|
||||
ctx: &'a RequestContext,
|
||||
) -> std::io::Result<(Slice<B>, usize)> {
|
||||
let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?;
|
||||
self.recorded.borrow_mut().push(RecordedRead {
|
||||
pos: start,
|
||||
req_len: dst.bytes_total(),
|
||||
res: Vec::from(&dst[..nread]),
|
||||
});
|
||||
Ok((dst, nread))
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_logical_reads_to_same_chunk_are_merged_into_one_chunk_read() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
|
||||
let file = InMemoryFile::new_random(2 * DIO_CHUNK_SIZE);
|
||||
|
||||
let a = file.test_logical_read(DIO_CHUNK_SIZE.into_u64(), 10);
|
||||
let b = file.test_logical_read(DIO_CHUNK_SIZE.into_u64() + 30, 20);
|
||||
|
||||
let recorder = RecorderFile::new(&file);
|
||||
|
||||
execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await;
|
||||
|
||||
let recorded = recorder.recorded.borrow();
|
||||
assert_eq!(recorded.len(), 1);
|
||||
let RecordedRead { pos, req_len, .. } = &recorded[0];
|
||||
assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
|
||||
assert_eq!(*req_len, DIO_CHUNK_SIZE);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_max_chunk_batch_size_is_respected() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
|
||||
let file = InMemoryFile::new_random(4 * MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE);
|
||||
|
||||
// read the 10th byte of each chunk 3 .. 3+2*MAX_CHUNK_BATCH_SIZE
|
||||
assert!(3 < MAX_CHUNK_BATCH_SIZE, "test assumption");
|
||||
assert!(10 < DIO_CHUNK_SIZE, "test assumption");
|
||||
let mut test_logical_reads = Vec::new();
|
||||
for i in 3..3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2 {
|
||||
test_logical_reads
|
||||
.push(file.test_logical_read(i.into_u64() * DIO_CHUNK_SIZE.into_u64() + 10, 1));
|
||||
}
|
||||
|
||||
let recorder = RecorderFile::new(&file);
|
||||
|
||||
execute_and_validate_test_logical_reads(&recorder, test_logical_reads, &ctx).await;
|
||||
|
||||
let recorded = recorder.recorded.borrow();
|
||||
assert_eq!(recorded.len(), 2);
|
||||
{
|
||||
let RecordedRead { pos, req_len, .. } = &recorded[0];
|
||||
assert_eq!(*pos as usize, 3 * DIO_CHUNK_SIZE);
|
||||
assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE);
|
||||
}
|
||||
{
|
||||
let RecordedRead { pos, req_len, .. } = &recorded[1];
|
||||
assert_eq!(*pos as usize, (3 + MAX_CHUNK_BATCH_SIZE) * DIO_CHUNK_SIZE);
|
||||
assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE / 2 * DIO_CHUNK_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_batch_breaks_if_chunk_is_not_interesting() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
|
||||
assert!(MAX_CHUNK_BATCH_SIZE > 10, "test assumption");
|
||||
let file = InMemoryFile::new_random(3 * DIO_CHUNK_SIZE);
|
||||
|
||||
let a = file.test_logical_read(0, 1); // chunk 0
|
||||
let b = file.test_logical_read(2 * DIO_CHUNK_SIZE.into_u64(), 1); // chunk 2
|
||||
|
||||
let recorder = RecorderFile::new(&file);
|
||||
|
||||
execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await;
|
||||
|
||||
let recorded = recorder.recorded.borrow();
|
||||
|
||||
assert_eq!(recorded.len(), 2);
|
||||
{
|
||||
let RecordedRead { pos, req_len, .. } = &recorded[0];
|
||||
assert_eq!(*pos, 0);
|
||||
assert_eq!(*req_len, DIO_CHUNK_SIZE);
|
||||
}
|
||||
{
|
||||
let RecordedRead { pos, req_len, .. } = &recorded[1];
|
||||
assert_eq!(*pos, 2 * DIO_CHUNK_SIZE.into_u64());
|
||||
assert_eq!(*req_len, DIO_CHUNK_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
struct ExpectedRead {
|
||||
expect_pos: u64,
|
||||
expect_len: usize,
|
||||
respond: Result<Vec<u8>, String>,
|
||||
}
|
||||
|
||||
struct MockFile {
|
||||
expected: RefCell<VecDeque<ExpectedRead>>,
|
||||
}
|
||||
|
||||
impl Drop for MockFile {
|
||||
fn drop(&mut self) {
|
||||
assert!(
|
||||
self.expected.borrow().is_empty(),
|
||||
"expected reads not satisfied"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! mock_file {
|
||||
($($pos:expr , $len:expr => $respond:expr),* $(,)?) => {{
|
||||
MockFile {
|
||||
expected: RefCell::new(VecDeque::from(vec![$(ExpectedRead {
|
||||
expect_pos: $pos,
|
||||
expect_len: $len,
|
||||
respond: $respond,
|
||||
}),*])),
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
impl File for MockFile {
|
||||
async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>(
|
||||
&'b self,
|
||||
start: u64,
|
||||
mut dst: Slice<B>,
|
||||
_ctx: &'a RequestContext,
|
||||
) -> std::io::Result<(Slice<B>, usize)> {
|
||||
let ExpectedRead {
|
||||
expect_pos,
|
||||
expect_len,
|
||||
respond,
|
||||
} = self
|
||||
.expected
|
||||
.borrow_mut()
|
||||
.pop_front()
|
||||
.expect("unexpected read");
|
||||
assert_eq!(start, expect_pos);
|
||||
assert_eq!(dst.bytes_total(), expect_len);
|
||||
match respond {
|
||||
Ok(mocked_bytes) => {
|
||||
let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len());
|
||||
let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed();
|
||||
dst_slice[..len].copy_from_slice(&mocked_bytes[..len]);
|
||||
rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
|
||||
Ok((dst, len))
|
||||
}
|
||||
Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_mock_file() {
|
||||
// Self-test to ensure the relevant features of mock file work as expected.
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
|
||||
let mock_file = mock_file! {
|
||||
0 , 512 => Ok(vec![0; 512]),
|
||||
512 , 512 => Ok(vec![1; 512]),
|
||||
1024 , 512 => Ok(vec![2; 10]),
|
||||
2048, 1024 => Err("foo".to_owned()),
|
||||
};
|
||||
|
||||
let buf = Vec::with_capacity(512);
|
||||
let (buf, nread) = mock_file
|
||||
.read_exact_at_eof_ok(0, buf.slice_full(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(nread, 512);
|
||||
assert_eq!(&buf.into_inner()[..nread], &[0; 512]);
|
||||
|
||||
let buf = Vec::with_capacity(512);
|
||||
let (buf, nread) = mock_file
|
||||
.read_exact_at_eof_ok(512, buf.slice_full(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(nread, 512);
|
||||
assert_eq!(&buf.into_inner()[..nread], &[1; 512]);
|
||||
|
||||
let buf = Vec::with_capacity(512);
|
||||
let (buf, nread) = mock_file
|
||||
.read_exact_at_eof_ok(1024, buf.slice_full(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(nread, 10);
|
||||
assert_eq!(&buf.into_inner()[..nread], &[2; 10]);
|
||||
|
||||
let buf = Vec::with_capacity(1024);
|
||||
let err = mock_file
|
||||
.read_exact_at_eof_ok(2048, buf.slice_full(), &ctx)
|
||||
.await
|
||||
.err()
|
||||
.unwrap();
|
||||
assert_eq!(err.to_string(), "foo");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_error_on_one_chunk_read_fails_only_dependent_logical_reads() {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
|
||||
let test_logical_reads = vec![
|
||||
// read spanning two batches
|
||||
TestLogicalRead::new(
|
||||
DIO_CHUNK_SIZE.into_u64() / 2,
|
||||
MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE,
|
||||
Err("foo".to_owned()),
|
||||
),
|
||||
// second read in failing chunk
|
||||
TestLogicalRead::new(
|
||||
(MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + DIO_CHUNK_SIZE.into_u64() - 10,
|
||||
5,
|
||||
Err("foo".to_owned()),
|
||||
),
|
||||
// read unaffected
|
||||
TestLogicalRead::new(
|
||||
(MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64()
|
||||
+ 2 * DIO_CHUNK_SIZE.into_u64()
|
||||
+ 10,
|
||||
5,
|
||||
Ok(vec![1; 5]),
|
||||
),
|
||||
];
|
||||
let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee();
|
||||
let test_logical_read_perms = tmp.permutations(test_logical_reads.len());
|
||||
|
||||
for test_logical_reads in test_logical_read_perms {
|
||||
let file = mock_file!(
|
||||
0, MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE => Ok(vec![0; MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE]),
|
||||
(MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Err("foo".to_owned()),
|
||||
(MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE + 2*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Ok(vec![1; DIO_CHUNK_SIZE]),
|
||||
);
|
||||
execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await;
|
||||
}
|
||||
}
|
||||
|
||||
struct TestShortReadsSetup {
|
||||
ctx: RequestContext,
|
||||
file: InMemoryFile,
|
||||
written: u64,
|
||||
}
|
||||
fn setup_short_chunk_read_tests() -> TestShortReadsSetup {
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
|
||||
assert!(DIO_CHUNK_SIZE > 20, "test assumption");
|
||||
let written = (2 * DIO_CHUNK_SIZE - 10).into_u64();
|
||||
let file = InMemoryFile::new_random(written as usize);
|
||||
TestShortReadsSetup { ctx, file, written }
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_short_chunk_read_from_written_range() {
|
||||
// Test what happens if there are logical reads
|
||||
// that start within the last chunk, and
|
||||
// the last chunk is not the full chunk length.
|
||||
//
|
||||
// The read should succeed despite the short chunk length.
|
||||
let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();
|
||||
|
||||
let a = file.test_logical_read(written - 10, 5);
|
||||
let recorder = RecorderFile::new(&file);
|
||||
|
||||
execute_and_validate_test_logical_reads(&recorder, vec![a], &ctx).await;
|
||||
|
||||
let recorded = recorder.recorded.borrow();
|
||||
assert_eq!(recorded.len(), 1);
|
||||
let RecordedRead { pos, req_len, res } = &recorded[0];
|
||||
assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
|
||||
assert_eq!(*req_len, DIO_CHUNK_SIZE);
|
||||
assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_short_chunk_read_and_logical_read_from_unwritten_range() {
|
||||
// Test what happens if there are logical reads
|
||||
// that start within the last chunk, and
|
||||
// the last chunk is not the full chunk length, and
|
||||
// the logical reads end in the unwritten range.
|
||||
//
|
||||
// All should fail with UnexpectedEof and have the same IO pattern.
|
||||
async fn the_impl(offset_delta: i64) {
|
||||
let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests();
|
||||
|
||||
let offset = u64::try_from(
|
||||
i64::try_from(written)
|
||||
.unwrap()
|
||||
.checked_add(offset_delta)
|
||||
.unwrap(),
|
||||
)
|
||||
.unwrap();
|
||||
let a = file.test_logical_read(offset, 5);
|
||||
let recorder = RecorderFile::new(&file);
|
||||
let a_vr = a.make_logical_read();
|
||||
execute(&recorder, vec![&a_vr], &ctx).await;
|
||||
|
||||
// validate the LogicalRead result
|
||||
let a_res = a_vr.into_result().unwrap();
|
||||
let a_err = a_res.unwrap_err();
|
||||
assert_eq!(a_err.kind(), std::io::ErrorKind::UnexpectedEof);
|
||||
|
||||
// validate the IO pattern
|
||||
let recorded = recorder.recorded.borrow();
|
||||
assert_eq!(recorded.len(), 1);
|
||||
let RecordedRead { pos, req_len, res } = &recorded[0];
|
||||
assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64());
|
||||
assert_eq!(*req_len, DIO_CHUNK_SIZE);
|
||||
assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]);
|
||||
}
|
||||
|
||||
the_impl(-1).await; // start == length - 1
|
||||
the_impl(0).await; // start == length
|
||||
the_impl(1).await; // start == length + 1
|
||||
}
|
||||
|
||||
// TODO: mixed: some valid, some UnexpectedEof
|
||||
|
||||
// TODO: same tests but with merges
|
||||
}
|
||||
@@ -35,6 +35,8 @@ mod tests;
|
||||
#[cfg(test)]
|
||||
mod failpoints;
|
||||
|
||||
pub const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{ops::Range, sync::Arc};
|
||||
use std::{future::Future, ops::Range, sync::Arc};
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
@@ -7,7 +7,32 @@ use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
|
||||
|
||||
use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
|
||||
use super::layer::S3_UPLOAD_LIMIT;
|
||||
use super::{
|
||||
DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer,
|
||||
};
|
||||
|
||||
pub(crate) enum SplitWriterResult {
|
||||
Produced(ResidentLayer),
|
||||
Discarded(PersistentLayerKey),
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl SplitWriterResult {
|
||||
fn into_resident_layer(self) -> ResidentLayer {
|
||||
match self {
|
||||
SplitWriterResult::Produced(layer) => layer,
|
||||
SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"),
|
||||
}
|
||||
}
|
||||
|
||||
fn into_discarded_layer(self) -> PersistentLayerKey {
|
||||
match self {
|
||||
SplitWriterResult::Produced(_) => panic!("unexpected produced layer"),
|
||||
SplitWriterResult::Discarded(layer) => layer,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An image writer that takes images and produces multiple image layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the image layer generation fails, there might be leftover files
|
||||
@@ -16,11 +41,12 @@ use super::{DeltaLayerWriter, ImageLayerWriter, ResidentLayer};
|
||||
pub struct SplitImageLayerWriter {
|
||||
inner: ImageLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<ResidentLayer>,
|
||||
generated_layers: Vec<SplitWriterResult>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn: Lsn,
|
||||
start_key: Key,
|
||||
}
|
||||
|
||||
impl SplitImageLayerWriter {
|
||||
@@ -49,16 +75,22 @@ impl SplitImageLayerWriter {
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn,
|
||||
start_key,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn put_image(
|
||||
pub async fn put_image_with_discard_fn<D, F>(
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
discard: D,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
// The current estimation is an upper bound of the space that the key/image could take
|
||||
// because we did not consider compression in this estimation. The resulting image layer
|
||||
// could be smaller than the target size.
|
||||
@@ -76,33 +108,87 @@ impl SplitImageLayerWriter {
|
||||
)
|
||||
.await?;
|
||||
let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer);
|
||||
self.generated_layers.push(
|
||||
prev_image_writer
|
||||
.finish_with_end_key(tline, key, ctx)
|
||||
.await?,
|
||||
);
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: self.start_key..key,
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
|
||||
is_delta: false,
|
||||
};
|
||||
self.start_key = key;
|
||||
|
||||
if discard(&layer_key).await {
|
||||
drop(prev_image_writer);
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
self.generated_layers.push(SplitWriterResult::Produced(
|
||||
prev_image_writer
|
||||
.finish_with_end_key(tline, key, ctx)
|
||||
.await?,
|
||||
));
|
||||
}
|
||||
}
|
||||
self.inner.put_image(key, img, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
#[cfg(test)]
|
||||
pub async fn put_image(
|
||||
&mut self,
|
||||
key: Key,
|
||||
img: Bytes,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish_with_discard_fn<D, F>(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
discard: D,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
generated_layers.push(inner.finish_with_end_key(tline, end_key, ctx).await?);
|
||||
if inner.num_keys() == 0 {
|
||||
return Ok(generated_layers);
|
||||
}
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: self.start_key..end_key,
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn),
|
||||
is_delta: false,
|
||||
};
|
||||
if discard(&layer_key).await {
|
||||
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
generated_layers.push(SplitWriterResult::Produced(
|
||||
inner.finish_with_end_key(tline, end_key, ctx).await?,
|
||||
));
|
||||
}
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>> {
|
||||
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, ImageLayerWriter)> {
|
||||
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, ImageLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
@@ -110,15 +196,21 @@ impl SplitImageLayerWriter {
|
||||
/// A delta writer that takes key-lsn-values and produces multiple delta layers. The interface does not
|
||||
/// guarantee atomicity (i.e., if the delta layer generation fails, there might be leftover files
|
||||
/// to be cleaned up).
|
||||
///
|
||||
/// Note that if updates of a single key exceed the target size limit, all of the updates will be batched
|
||||
/// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
|
||||
/// will split them into multiple files based on size.
|
||||
#[must_use]
|
||||
pub struct SplitDeltaLayerWriter {
|
||||
inner: DeltaLayerWriter,
|
||||
target_layer_size: u64,
|
||||
generated_layers: Vec<ResidentLayer>,
|
||||
generated_layers: Vec<SplitWriterResult>,
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
lsn_range: Range<Lsn>,
|
||||
last_key_written: Key,
|
||||
start_key: Key,
|
||||
}
|
||||
|
||||
impl SplitDeltaLayerWriter {
|
||||
@@ -147,9 +239,74 @@ impl SplitDeltaLayerWriter {
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
lsn_range,
|
||||
last_key_written: Key::MIN,
|
||||
start_key,
|
||||
})
|
||||
}
|
||||
|
||||
/// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end.
|
||||
pub async fn put_value_with_discard_fn<D, F>(
|
||||
&mut self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
val: Value,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
discard: D,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
|
||||
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
|
||||
//
|
||||
// Also, keep all updates of a single key in a single file. TODO: split them using the legacy compaction
|
||||
// strategy. https://github.com/neondatabase/neon/issues/8837
|
||||
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
if key != self.last_key_written {
|
||||
let next_delta_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: self.start_key..key,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
self.start_key = key;
|
||||
if discard(&layer_key).await {
|
||||
drop(prev_delta_writer);
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
self.generated_layers
|
||||
.push(SplitWriterResult::Produced(delta_layer));
|
||||
}
|
||||
} else if self.inner.estimated_size() >= S3_UPLOAD_LIMIT {
|
||||
// We have to produce a very large file b/c a key is updated too often.
|
||||
anyhow::bail!(
|
||||
"a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced",
|
||||
key,
|
||||
self.inner.estimated_size()
|
||||
);
|
||||
}
|
||||
}
|
||||
self.last_key_written = key;
|
||||
self.inner.put_value(key, lsn, val, ctx).await
|
||||
}
|
||||
|
||||
pub async fn put_value(
|
||||
&mut self,
|
||||
key: Key,
|
||||
@@ -158,56 +315,64 @@ impl SplitDeltaLayerWriter {
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
// The current estimation is key size plus LSN size plus value size estimation. This is not an accurate
|
||||
// number, and therefore the final layer size could be a little bit larger or smaller than the target.
|
||||
let addition_size_estimation = KEY_SIZE as u64 + 8 /* LSN u64 size */ + 80 /* value size estimation */;
|
||||
if self.inner.num_keys() >= 1
|
||||
&& self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size
|
||||
{
|
||||
let next_delta_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
key,
|
||||
self.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
let prev_delta_writer = std::mem::replace(&mut self.inner, next_delta_writer);
|
||||
let (desc, path) = prev_delta_writer.finish(key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
self.generated_layers.push(delta_layer);
|
||||
}
|
||||
self.inner.put_value(key, lsn, val, ctx).await
|
||||
self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn finish(
|
||||
pub(crate) async fn finish_with_discard_fn<D, F>(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<ResidentLayer>> {
|
||||
discard: D,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>>
|
||||
where
|
||||
D: FnOnce(&PersistentLayerKey) -> F,
|
||||
F: Future<Output = bool>,
|
||||
{
|
||||
let Self {
|
||||
mut generated_layers,
|
||||
inner,
|
||||
..
|
||||
} = self;
|
||||
|
||||
let (desc, path) = inner.finish(end_key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
generated_layers.push(delta_layer);
|
||||
if inner.num_keys() == 0 {
|
||||
return Ok(generated_layers);
|
||||
}
|
||||
let layer_key = PersistentLayerKey {
|
||||
key_range: self.start_key..end_key,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
is_delta: true,
|
||||
};
|
||||
if discard(&layer_key).await {
|
||||
generated_layers.push(SplitWriterResult::Discarded(layer_key));
|
||||
} else {
|
||||
let (desc, path) = inner.finish(end_key, ctx).await?;
|
||||
let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?;
|
||||
generated_layers.push(SplitWriterResult::Produced(delta_layer));
|
||||
}
|
||||
Ok(generated_layers)
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
#[allow(dead_code)]
|
||||
pub(crate) async fn take(self) -> anyhow::Result<(Vec<ResidentLayer>, DeltaLayerWriter)> {
|
||||
pub(crate) async fn finish(
|
||||
self,
|
||||
tline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
end_key: Key,
|
||||
) -> anyhow::Result<Vec<SplitWriterResult>> {
|
||||
self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false })
|
||||
.await
|
||||
}
|
||||
|
||||
/// When split writer fails, the caller should call this function and handle partially generated layers.
|
||||
pub(crate) fn take(self) -> anyhow::Result<(Vec<SplitWriterResult>, DeltaLayerWriter)> {
|
||||
Ok((self.generated_layers, self.inner))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use itertools::Itertools;
|
||||
use rand::{RngCore, SeedableRng};
|
||||
|
||||
use crate::{
|
||||
@@ -302,9 +467,16 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_split() {
|
||||
let harness = TenantHarness::create("split_writer_write_split")
|
||||
.await
|
||||
.unwrap();
|
||||
write_split_helper("split_writer_write_split", false).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_split_discard() {
|
||||
write_split_helper("split_writer_write_split_discard", false).await;
|
||||
}
|
||||
|
||||
async fn write_split_helper(harness_name: &'static str, discard: bool) {
|
||||
let harness = TenantHarness::create(harness_name).await.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
@@ -338,16 +510,19 @@ mod tests {
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
image_writer
|
||||
.put_image(get_key(i), get_large_img(), &tline, &ctx)
|
||||
.put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async {
|
||||
discard
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
delta_writer
|
||||
.put_value(
|
||||
.put_value_with_discard_fn(
|
||||
get_key(i),
|
||||
Lsn(0x20),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
|_| async { discard },
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
@@ -360,22 +535,39 @@ mod tests {
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(image_layers.len(), N / 512 + 1);
|
||||
assert_eq!(delta_layers.len(), N / 512 + 1);
|
||||
for idx in 0..image_layers.len() {
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
if idx > 0 {
|
||||
assert_eq!(
|
||||
image_layers[idx - 1].layer_desc().key_range.end,
|
||||
image_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
assert_eq!(
|
||||
delta_layers[idx - 1].layer_desc().key_range.end,
|
||||
delta_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
if discard {
|
||||
for layer in image_layers {
|
||||
layer.into_discarded_layer();
|
||||
}
|
||||
for layer in delta_layers {
|
||||
layer.into_discarded_layer();
|
||||
}
|
||||
} else {
|
||||
let image_layers = image_layers
|
||||
.into_iter()
|
||||
.map(|x| x.into_resident_layer())
|
||||
.collect_vec();
|
||||
let delta_layers = delta_layers
|
||||
.into_iter()
|
||||
.map(|x| x.into_resident_layer())
|
||||
.collect_vec();
|
||||
assert_eq!(image_layers.len(), N / 512 + 1);
|
||||
assert_eq!(delta_layers.len(), N / 512 + 1);
|
||||
for idx in 0..image_layers.len() {
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN);
|
||||
assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX);
|
||||
if idx > 0 {
|
||||
assert_eq!(
|
||||
image_layers[idx - 1].layer_desc().key_range.end,
|
||||
image_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
assert_eq!(
|
||||
delta_layers[idx - 1].layer_desc().key_range.end,
|
||||
delta_layers[idx].layer_desc().key_range.start
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -456,4 +648,49 @@ mod tests {
|
||||
.unwrap();
|
||||
assert_eq!(layers.len(), 2);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn write_split_single_key() {
|
||||
let harness = TenantHarness::create("split_writer_write_split_single_key")
|
||||
.await
|
||||
.unwrap();
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
const N: usize = 2000;
|
||||
let mut delta_writer = SplitDeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
tline.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
get_key(0),
|
||||
Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
|
||||
4 * 1024 * 1024,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
for i in 0..N {
|
||||
let i = i as u32;
|
||||
delta_writer
|
||||
.put_value(
|
||||
get_key(0),
|
||||
Lsn(i as u64 * 16 + 0x10),
|
||||
Value::Image(get_large_img()),
|
||||
&tline,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
let delta_layers = delta_writer
|
||||
.finish(&tline, &ctx, get_key(N as u32))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(delta_layers.len(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,7 +69,7 @@ use crate::{
|
||||
config::defaults::DEFAULT_PITR_INTERVAL,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::TimelineMetadata,
|
||||
storage_layer::PersistentLayerDesc,
|
||||
storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc},
|
||||
},
|
||||
walredo,
|
||||
};
|
||||
@@ -1907,6 +1907,8 @@ impl Timeline {
|
||||
|
||||
true
|
||||
} else if projected_layer_size >= checkpoint_distance {
|
||||
// NB: this check is relied upon by:
|
||||
let _ = IndexEntry::validate_checkpoint_distance;
|
||||
info!(
|
||||
"Will roll layer at {} with layer size {} due to layer size ({})",
|
||||
projected_lsn, layer_size, projected_layer_size
|
||||
@@ -5444,12 +5446,17 @@ impl Timeline {
|
||||
!(a.end <= b.start || b.end <= a.start)
|
||||
}
|
||||
|
||||
let guard = self.layers.read().await;
|
||||
for layer in guard.layer_map()?.iter_historic_layers() {
|
||||
if layer.is_delta()
|
||||
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
|
||||
&& layer.lsn_range != deltas.lsn_range
|
||||
{
|
||||
if deltas.key_range.start.next() != deltas.key_range.end {
|
||||
let guard = self.layers.read().await;
|
||||
let mut invalid_layers =
|
||||
guard.layer_map()?.iter_historic_layers().filter(|layer| {
|
||||
layer.is_delta()
|
||||
&& overlaps_with(&layer.lsn_range, &deltas.lsn_range)
|
||||
&& layer.lsn_range != deltas.lsn_range
|
||||
// skip single-key layer files
|
||||
&& layer.key_range.start.next() != layer.key_range.end
|
||||
});
|
||||
if let Some(layer) = invalid_layers.next() {
|
||||
// If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic
|
||||
panic!(
|
||||
"inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}",
|
||||
@@ -5697,7 +5704,7 @@ impl<'a> TimelineWriter<'a> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch);
|
||||
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?;
|
||||
let batch_max_lsn = serialized_batch.max_lsn;
|
||||
let buf_size: u64 = serialized_batch.raw.len() as u64;
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ use super::{
|
||||
RecordedDuration, Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
@@ -32,6 +32,9 @@ use crate::page_cache;
|
||||
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
|
||||
use crate::tenant::storage_layer::split_writer::{
|
||||
SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult,
|
||||
};
|
||||
use crate::tenant::storage_layer::{
|
||||
AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState,
|
||||
};
|
||||
@@ -71,15 +74,60 @@ pub(crate) struct KeyHistoryRetention {
|
||||
}
|
||||
|
||||
impl KeyHistoryRetention {
|
||||
/// Hack: skip delta layer if we need to produce a layer of a same key-lsn.
|
||||
///
|
||||
/// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
|
||||
/// For example, consider the case where a single delta with range [0x10,0x50) exists.
|
||||
/// And we have branches at LSN 0x10, 0x20, 0x30.
|
||||
/// Then we delete branch @ 0x20.
|
||||
/// Bottom-most compaction may now delete the delta [0x20,0x30).
|
||||
/// And that wouldnt' change the shape of the layer.
|
||||
///
|
||||
/// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
|
||||
///
|
||||
/// `discard_key` will only be called when the writer reaches its target (instead of for every key), so it's fine to grab a lock inside.
|
||||
async fn discard_key(key: &PersistentLayerKey, tline: &Arc<Timeline>, dry_run: bool) -> bool {
|
||||
if dry_run {
|
||||
return true;
|
||||
}
|
||||
let guard = tline.layers.read().await;
|
||||
if !guard.contains_key(key) {
|
||||
return false;
|
||||
}
|
||||
let layer_generation = guard.get_from_key(key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == tline.generation {
|
||||
info!(
|
||||
key=%key,
|
||||
?layer_generation,
|
||||
"discard layer due to duplicated layer key in the same generation",
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Pipe a history of a single key to the writers.
|
||||
///
|
||||
/// If `image_writer` is none, the images will be placed into the delta layers.
|
||||
/// The delta writer will contain all images and deltas (below and above the horizon) except the bottom-most images.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn pipe_to(
|
||||
self,
|
||||
key: Key,
|
||||
delta_writer: &mut Vec<(Key, Lsn, Value)>,
|
||||
mut image_writer: Option<&mut ImageLayerWriter>,
|
||||
tline: &Arc<Timeline>,
|
||||
delta_writer: &mut SplitDeltaLayerWriter,
|
||||
mut image_writer: Option<&mut SplitImageLayerWriter>,
|
||||
stat: &mut CompactionStatistics,
|
||||
dry_run: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut first_batch = true;
|
||||
let discard = |key: &PersistentLayerKey| {
|
||||
let key = key.clone();
|
||||
async move { Self::discard_key(&key, tline, dry_run).await }
|
||||
};
|
||||
for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon {
|
||||
if first_batch {
|
||||
if logs.len() == 1 && logs[0].1.is_image() {
|
||||
@@ -88,28 +136,45 @@ impl KeyHistoryRetention {
|
||||
};
|
||||
stat.produce_image_key(img);
|
||||
if let Some(image_writer) = image_writer.as_mut() {
|
||||
image_writer.put_image(key, img.clone(), ctx).await?;
|
||||
image_writer
|
||||
.put_image_with_discard_fn(key, img.clone(), tline, ctx, discard)
|
||||
.await?;
|
||||
} else {
|
||||
delta_writer.push((key, cutoff_lsn, Value::Image(img.clone())));
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(
|
||||
key,
|
||||
cutoff_lsn,
|
||||
Value::Image(img.clone()),
|
||||
tline,
|
||||
ctx,
|
||||
discard,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
first_batch = false;
|
||||
} else {
|
||||
for (lsn, val) in logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
|
||||
for (lsn, val) in above_horizon_logs {
|
||||
stat.produce_key(&val);
|
||||
delta_writer.push((key, lsn, val));
|
||||
delta_writer
|
||||
.put_value_with_discard_fn(key, lsn, val, tline, ctx, discard)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1814,11 +1879,27 @@ impl Timeline {
|
||||
}
|
||||
let mut selected_layers = Vec::new();
|
||||
drop(gc_info);
|
||||
// Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
|
||||
let Some(max_layer_lsn) = layers
|
||||
.iter_historic_layers()
|
||||
.filter(|desc| desc.get_lsn_range().start <= gc_cutoff)
|
||||
.map(|desc| desc.get_lsn_range().end)
|
||||
.max()
|
||||
else {
|
||||
info!("no layers to compact with gc");
|
||||
return Ok(());
|
||||
};
|
||||
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
|
||||
// layers to compact.
|
||||
for desc in layers.iter_historic_layers() {
|
||||
if desc.get_lsn_range().start <= gc_cutoff {
|
||||
if desc.get_lsn_range().end <= max_layer_lsn {
|
||||
selected_layers.push(guard.get_from_desc(&desc));
|
||||
}
|
||||
}
|
||||
if selected_layers.is_empty() {
|
||||
info!("no layers to compact with gc");
|
||||
return Ok(());
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
|
||||
};
|
||||
@@ -1848,27 +1929,53 @@ impl Timeline {
|
||||
lowest_retain_lsn
|
||||
);
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, collect the layer information to decide when to split the new delta layers.
|
||||
let mut downloaded_layers = Vec::new();
|
||||
let mut delta_split_points = BTreeSet::new();
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
for layer in &layer_selection {
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
|
||||
let desc = layer.layer_desc();
|
||||
if desc.is_delta() {
|
||||
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
|
||||
// so that we can avoid having too many small delta layers.
|
||||
let key_range = desc.get_key_range();
|
||||
delta_split_points.insert(key_range.start);
|
||||
delta_split_points.insert(key_range.end);
|
||||
// ignore single-key layer files
|
||||
if desc.key_range.start.next() != desc.key_range.end {
|
||||
let lsn_range = &desc.lsn_range;
|
||||
lsn_split_point.insert(lsn_range.start);
|
||||
lsn_split_point.insert(lsn_range.end);
|
||||
}
|
||||
stat.visit_delta_layer(desc.file_size());
|
||||
} else {
|
||||
stat.visit_image_layer(desc.file_size());
|
||||
}
|
||||
}
|
||||
for layer in &layer_selection {
|
||||
let desc = layer.layer_desc();
|
||||
let key_range = &desc.key_range;
|
||||
if desc.is_delta() && key_range.start.next() != key_range.end {
|
||||
let lsn_range = desc.lsn_range.clone();
|
||||
let intersects = lsn_split_point.range(lsn_range).collect_vec();
|
||||
if intersects.len() > 1 {
|
||||
bail!(
|
||||
"cannot run gc-compaction because it violates the layer map LSN split assumption: layer {} intersects with LSN [{}]",
|
||||
desc.key(),
|
||||
intersects.into_iter().map(|lsn| lsn.to_string()).join(", ")
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = layer_selection
|
||||
.iter()
|
||||
.map(|l| l.layer_desc().lsn_range.end)
|
||||
.max()
|
||||
.unwrap();
|
||||
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
|
||||
// as an L0 layer.
|
||||
let hack_end_key = Key::NON_L0_MAX;
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
let mut downloaded_layers = Vec::new();
|
||||
for layer in &layer_selection {
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
}
|
||||
for resident_layer in &downloaded_layers {
|
||||
if resident_layer.layer_desc().is_delta() {
|
||||
let layer = resident_layer.get_as_delta(ctx).await?;
|
||||
@@ -1884,138 +1991,17 @@ impl Timeline {
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key: Option<Key> = None;
|
||||
|
||||
enum FlushDeltaResult {
|
||||
/// Create a new resident layer
|
||||
CreateResidentLayer(ResidentLayer),
|
||||
/// Keep an original delta layer
|
||||
KeepLayer(PersistentLayerKey),
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn flush_deltas(
|
||||
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
|
||||
last_key: Key,
|
||||
delta_split_points: &[Key],
|
||||
current_delta_split_point: &mut usize,
|
||||
tline: &Arc<Timeline>,
|
||||
lowest_retain_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
stats: &mut CompactionStatistics,
|
||||
dry_run: bool,
|
||||
last_batch: bool,
|
||||
) -> anyhow::Result<Option<FlushDeltaResult>> {
|
||||
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
|
||||
// overlapping layers.
|
||||
//
|
||||
// If we have a structure like this:
|
||||
//
|
||||
// | Delta 1 | | Delta 4 |
|
||||
// |---------| Delta 2 |---------|
|
||||
// | Delta 3 | | Delta 5 |
|
||||
//
|
||||
// And we choose to compact delta 2+3+5. We will get an overlapping delta layer with delta 1+4.
|
||||
// A simple solution here is to split the delta layers using the original boundary, while this
|
||||
// might produce a lot of small layers. This should be improved and fixed in the future.
|
||||
let mut need_split = false;
|
||||
while *current_delta_split_point < delta_split_points.len()
|
||||
&& last_key >= delta_split_points[*current_delta_split_point]
|
||||
{
|
||||
*current_delta_split_point += 1;
|
||||
need_split = true;
|
||||
}
|
||||
if !need_split && !last_batch {
|
||||
return Ok(None);
|
||||
}
|
||||
let deltas: Vec<(Key, Lsn, Value)> = std::mem::take(deltas);
|
||||
if deltas.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
let end_lsn = deltas.iter().map(|(_, lsn, _)| lsn).max().copied().unwrap() + 1;
|
||||
let delta_key = PersistentLayerKey {
|
||||
key_range: {
|
||||
let key_start = deltas.first().unwrap().0;
|
||||
let key_end = deltas.last().unwrap().0.next();
|
||||
key_start..key_end
|
||||
},
|
||||
lsn_range: lowest_retain_lsn..end_lsn,
|
||||
is_delta: true,
|
||||
};
|
||||
{
|
||||
// Hack: skip delta layer if we need to produce a layer of a same key-lsn.
|
||||
//
|
||||
// This can happen if we have removed some deltas in "the middle" of some existing layer's key-lsn-range.
|
||||
// For example, consider the case where a single delta with range [0x10,0x50) exists.
|
||||
// And we have branches at LSN 0x10, 0x20, 0x30.
|
||||
// Then we delete branch @ 0x20.
|
||||
// Bottom-most compaction may now delete the delta [0x20,0x30).
|
||||
// And that wouldnt' change the shape of the layer.
|
||||
//
|
||||
// Note that bottom-most-gc-compaction never _adds_ new data in that case, only removes.
|
||||
// That's why it's safe to skip.
|
||||
let guard = tline.layers.read().await;
|
||||
|
||||
if guard.contains_key(&delta_key) {
|
||||
let layer_generation = guard.get_from_key(&delta_key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == tline.generation {
|
||||
stats.discard_delta_layer();
|
||||
// TODO: depending on whether we design this compaction process to run along with
|
||||
// other compactions, there could be layer map modifications after we drop the
|
||||
// layer guard, and in case it creates duplicated layer key, we will still error
|
||||
// in the end.
|
||||
info!(
|
||||
key=%delta_key,
|
||||
?layer_generation,
|
||||
"discard delta layer due to duplicated layer in the same generation"
|
||||
);
|
||||
return Ok(Some(FlushDeltaResult::KeepLayer(delta_key)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
tline.conf,
|
||||
tline.timeline_id,
|
||||
tline.tenant_shard_id,
|
||||
delta_key.key_range.start,
|
||||
lowest_retain_lsn..end_lsn,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
for (key, lsn, val) in deltas {
|
||||
delta_layer_writer.put_value(key, lsn, val, ctx).await?;
|
||||
}
|
||||
|
||||
stats.produce_delta_layer(delta_layer_writer.size());
|
||||
if dry_run {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let (desc, path) = delta_layer_writer
|
||||
.finish(delta_key.key_range.end, ctx)
|
||||
.await?;
|
||||
let delta_layer = Layer::finish_creating(tline.conf, tline, desc, &path)?;
|
||||
Ok(Some(FlushDeltaResult::CreateResidentLayer(delta_layer)))
|
||||
}
|
||||
|
||||
// Hack the key range to be min..(max-1). Otherwise, the image layer will be
|
||||
// interpreted as an L0 delta layer.
|
||||
let hack_image_layer_range = {
|
||||
let mut end_key = Key::MAX;
|
||||
end_key.field6 -= 1;
|
||||
Key::MIN..end_key
|
||||
};
|
||||
|
||||
// Only create image layers when there is no ancestor branches. TODO: create covering image layer
|
||||
// when some condition meet.
|
||||
let mut image_layer_writer = if self.ancestor_timeline.is_none() {
|
||||
Some(
|
||||
ImageLayerWriter::new(
|
||||
SplitImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
&hack_image_layer_range, // covers the full key range
|
||||
Key::MIN,
|
||||
lowest_retain_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
@@ -2024,6 +2010,17 @@ impl Timeline {
|
||||
None
|
||||
};
|
||||
|
||||
let mut delta_layer_writer = SplitDeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
Key::MIN,
|
||||
lowest_retain_lsn..end_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
/// Returns None if there is no ancestor branch. Throw an error when the key is not found.
|
||||
///
|
||||
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
|
||||
@@ -2044,47 +2041,11 @@ impl Timeline {
|
||||
let img = tline.get(key, tline.ancestor_lsn, ctx).await?;
|
||||
Ok(Some((key, tline.ancestor_lsn, img)))
|
||||
}
|
||||
let image_layer_key = PersistentLayerKey {
|
||||
key_range: hack_image_layer_range,
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(lowest_retain_lsn),
|
||||
is_delta: false,
|
||||
};
|
||||
|
||||
// Like with delta layers, it can happen that we re-produce an already existing image layer.
|
||||
// This could happen when a user triggers force compaction and image generation. In this case,
|
||||
// it's always safe to rewrite the layer.
|
||||
let discard_image_layer = {
|
||||
let guard = self.layers.read().await;
|
||||
if guard.contains_key(&image_layer_key) {
|
||||
let layer_generation = guard.get_from_key(&image_layer_key).metadata().generation;
|
||||
drop(guard);
|
||||
if layer_generation == self.generation {
|
||||
// TODO: depending on whether we design this compaction process to run along with
|
||||
// other compactions, there could be layer map modifications after we drop the
|
||||
// layer guard, and in case it creates duplicated layer key, we will still error
|
||||
// in the end.
|
||||
info!(
|
||||
key=%image_layer_key,
|
||||
?layer_generation,
|
||||
"discard image layer due to duplicated layer key in the same generation",
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
} else {
|
||||
false
|
||||
}
|
||||
};
|
||||
|
||||
// Actually, we can decide not to write to the image layer at all at this point because
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
let mut delta_values = Vec::new();
|
||||
let delta_split_points = delta_split_points.into_iter().collect_vec();
|
||||
let mut current_delta_split_point = 0;
|
||||
let mut delta_layers = Vec::new();
|
||||
while let Some((key, lsn, val)) = merge_iter.next().await? {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
|
||||
@@ -2115,27 +2076,14 @@ impl Timeline {
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
&mut delta_values,
|
||||
self,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
dry_run,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
*last_key,
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
&mut stat,
|
||||
dry_run,
|
||||
false,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
accumulated_values.clear();
|
||||
*last_key = key;
|
||||
accumulated_values.push((key, lsn, val));
|
||||
@@ -2159,43 +2107,75 @@ impl Timeline {
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
&mut delta_values,
|
||||
self,
|
||||
&mut delta_layer_writer,
|
||||
image_layer_writer.as_mut(),
|
||||
&mut stat,
|
||||
dry_run,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
delta_layers.extend(
|
||||
flush_deltas(
|
||||
&mut delta_values,
|
||||
last_key,
|
||||
&delta_split_points,
|
||||
&mut current_delta_split_point,
|
||||
self,
|
||||
lowest_retain_lsn,
|
||||
ctx,
|
||||
&mut stat,
|
||||
dry_run,
|
||||
true,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
assert!(delta_values.is_empty(), "unprocessed keys");
|
||||
|
||||
let image_layer = if discard_image_layer {
|
||||
stat.discard_image_layer();
|
||||
None
|
||||
} else if let Some(writer) = image_layer_writer {
|
||||
stat.produce_image_layer(writer.size());
|
||||
let discard = |key: &PersistentLayerKey| {
|
||||
let key = key.clone();
|
||||
async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await }
|
||||
};
|
||||
|
||||
let produced_image_layers = if let Some(writer) = image_layer_writer {
|
||||
if !dry_run {
|
||||
Some(writer.finish(self, ctx).await?)
|
||||
writer
|
||||
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
|
||||
.await?
|
||||
} else {
|
||||
None
|
||||
let (layers, _) = writer.take()?;
|
||||
assert!(layers.is_empty(), "image layers produced in dry run mode?");
|
||||
Vec::new()
|
||||
}
|
||||
} else {
|
||||
None
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
let produced_delta_layers = if !dry_run {
|
||||
delta_layer_writer
|
||||
.finish_with_discard_fn(self, ctx, hack_end_key, discard)
|
||||
.await?
|
||||
} else {
|
||||
let (layers, _) = delta_layer_writer.take()?;
|
||||
assert!(layers.is_empty(), "delta layers produced in dry run mode?");
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
let mut compact_to = Vec::new();
|
||||
let mut keep_layers = HashSet::new();
|
||||
let produced_delta_layers_len = produced_delta_layers.len();
|
||||
let produced_image_layers_len = produced_image_layers.len();
|
||||
for action in produced_delta_layers {
|
||||
match action {
|
||||
SplitWriterResult::Produced(layer) => {
|
||||
stat.produce_delta_layer(layer.layer_desc().file_size());
|
||||
compact_to.push(layer);
|
||||
}
|
||||
SplitWriterResult::Discarded(l) => {
|
||||
keep_layers.insert(l);
|
||||
stat.discard_delta_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
for action in produced_image_layers {
|
||||
match action {
|
||||
SplitWriterResult::Produced(layer) => {
|
||||
stat.produce_image_layer(layer.layer_desc().file_size());
|
||||
compact_to.push(layer);
|
||||
}
|
||||
SplitWriterResult::Discarded(l) => {
|
||||
keep_layers.insert(l);
|
||||
stat.discard_image_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut layer_selection = layer_selection;
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
|
||||
info!(
|
||||
"gc-compaction statistics: {}",
|
||||
serde_json::to_string(&stat)?
|
||||
@@ -2206,28 +2186,11 @@ impl Timeline {
|
||||
}
|
||||
|
||||
info!(
|
||||
"produced {} delta layers and {} image layers",
|
||||
delta_layers.len(),
|
||||
if image_layer.is_some() { 1 } else { 0 }
|
||||
"produced {} delta layers and {} image layers, {} layers are kept",
|
||||
produced_delta_layers_len,
|
||||
produced_image_layers_len,
|
||||
layer_selection.len()
|
||||
);
|
||||
let mut compact_to = Vec::new();
|
||||
let mut keep_layers = HashSet::new();
|
||||
for action in delta_layers {
|
||||
match action {
|
||||
FlushDeltaResult::CreateResidentLayer(layer) => {
|
||||
compact_to.push(layer);
|
||||
}
|
||||
FlushDeltaResult::KeepLayer(l) => {
|
||||
keep_layers.insert(l);
|
||||
}
|
||||
}
|
||||
}
|
||||
if discard_image_layer {
|
||||
keep_layers.insert(image_layer_key);
|
||||
}
|
||||
let mut layer_selection = layer_selection;
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
compact_to.extend(image_layer);
|
||||
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
|
||||
@@ -25,9 +25,10 @@ use tokio_epoll_uring::BoundedBuf;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::vec_map::VecMap;
|
||||
|
||||
use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use crate::context::RequestContext;
|
||||
use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
pub struct MaxVectoredReadBytes(pub NonZeroUsize);
|
||||
@@ -60,7 +61,7 @@ pub struct VectoredBlobsBuf {
|
||||
pub struct VectoredRead {
|
||||
pub start: u64,
|
||||
pub end: u64,
|
||||
/// Starting offsets and metadata for each blob in this read
|
||||
/// Start offset and metadata for each blob in this read
|
||||
pub blobs_at: VecMap<u64, BlobMeta>,
|
||||
}
|
||||
|
||||
@@ -76,14 +77,109 @@ pub(crate) enum VectoredReadExtended {
|
||||
No,
|
||||
}
|
||||
|
||||
pub(crate) struct VectoredReadBuilder {
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
pub enum VectoredReadCoalesceMode {
|
||||
/// Only coalesce exactly adjacent reads.
|
||||
AdjacentOnly,
|
||||
/// In addition to adjacent reads, also consider reads whose corresponding
|
||||
/// `end` and `start` offsets reside at the same chunk.
|
||||
Chunked(usize),
|
||||
}
|
||||
|
||||
impl VectoredReadCoalesceMode {
|
||||
/// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0,
|
||||
/// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher.
|
||||
pub(crate) fn get() -> Self {
|
||||
let align = virtual_file::get_io_buffer_alignment_raw();
|
||||
if align == DEFAULT_IO_BUFFER_ALIGNMENT {
|
||||
VectoredReadCoalesceMode::AdjacentOnly
|
||||
} else {
|
||||
VectoredReadCoalesceMode::Chunked(align)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) enum VectoredReadBuilder {
|
||||
Adjacent(AdjacentVectoredReadBuilder),
|
||||
Chunked(ChunkedVectoredReadBuilder),
|
||||
}
|
||||
|
||||
impl VectoredReadBuilder {
|
||||
fn new_impl(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: Option<usize>,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
match mode {
|
||||
VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent(
|
||||
AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size),
|
||||
),
|
||||
VectoredReadCoalesceMode::Chunked(chunk_size) => {
|
||||
Self::Chunked(ChunkedVectoredReadBuilder::new(
|
||||
start_offset,
|
||||
end_offset,
|
||||
meta,
|
||||
max_read_size,
|
||||
chunk_size,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: usize,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode)
|
||||
}
|
||||
|
||||
pub(crate) fn new_streaming(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
mode: VectoredReadCoalesceMode,
|
||||
) -> Self {
|
||||
Self::new_impl(start_offset, end_offset, meta, None, mode)
|
||||
}
|
||||
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn build(self) -> VectoredRead {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.build(),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.build(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> usize {
|
||||
match self {
|
||||
VectoredReadBuilder::Adjacent(builder) => builder.size(),
|
||||
VectoredReadBuilder::Chunked(builder) => builder.size(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct AdjacentVectoredReadBuilder {
|
||||
/// Start offset of the read.
|
||||
start: u64,
|
||||
// End offset of the read.
|
||||
end: u64,
|
||||
/// Start offset and metadata for each blob in this read
|
||||
blobs_at: VecMap<u64, BlobMeta>,
|
||||
max_read_size: Option<usize>,
|
||||
}
|
||||
|
||||
impl VectoredReadBuilder {
|
||||
impl AdjacentVectoredReadBuilder {
|
||||
/// Start building a new vectored read.
|
||||
///
|
||||
/// Note that by design, this does not check against reading more than `max_read_size` to
|
||||
@@ -93,7 +189,7 @@ impl VectoredReadBuilder {
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: usize,
|
||||
max_read_size: Option<usize>,
|
||||
) -> Self {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
@@ -104,7 +200,7 @@ impl VectoredReadBuilder {
|
||||
start: start_offset,
|
||||
end: end_offset,
|
||||
blobs_at,
|
||||
max_read_size: Some(max_read_size),
|
||||
max_read_size,
|
||||
}
|
||||
}
|
||||
/// Attempt to extend the current read with a new blob if the start
|
||||
@@ -113,13 +209,15 @@ impl VectoredReadBuilder {
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
tracing::trace!(start, end, "trying to extend");
|
||||
let size = (end - start) as usize;
|
||||
if self.end == start && {
|
||||
let not_limited_by_max_read_size = {
|
||||
if let Some(max_read_size) = self.max_read_size {
|
||||
self.size() + size <= max_read_size
|
||||
} else {
|
||||
true
|
||||
}
|
||||
} {
|
||||
};
|
||||
|
||||
if self.end == start && not_limited_by_max_read_size {
|
||||
self.end = end;
|
||||
self.blobs_at
|
||||
.append(start, meta)
|
||||
@@ -144,6 +242,107 @@ impl VectoredReadBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct ChunkedVectoredReadBuilder {
|
||||
/// Start block number
|
||||
start_blk_no: usize,
|
||||
/// End block number (exclusive).
|
||||
end_blk_no: usize,
|
||||
/// Start offset and metadata for each blob in this read
|
||||
blobs_at: VecMap<u64, BlobMeta>,
|
||||
max_read_size: Option<usize>,
|
||||
/// Chunk size reads are coalesced into.
|
||||
chunk_size: usize,
|
||||
}
|
||||
|
||||
/// Computes x / d rounded up.
|
||||
fn div_round_up(x: usize, d: usize) -> usize {
|
||||
(x + (d - 1)) / d
|
||||
}
|
||||
|
||||
impl ChunkedVectoredReadBuilder {
|
||||
/// Start building a new vectored read.
|
||||
///
|
||||
/// Note that by design, this does not check against reading more than `max_read_size` to
|
||||
/// support reading larger blobs than the configuration value. The builder will be single use
|
||||
/// however after that.
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: Option<usize>,
|
||||
chunk_size: usize,
|
||||
) -> Self {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
.append(start_offset, meta)
|
||||
.expect("First insertion always succeeds");
|
||||
|
||||
let start_blk_no = start_offset as usize / chunk_size;
|
||||
let end_blk_no = div_round_up(end_offset as usize, chunk_size);
|
||||
Self {
|
||||
start_blk_no,
|
||||
end_blk_no,
|
||||
blobs_at,
|
||||
max_read_size,
|
||||
chunk_size,
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk.
|
||||
///
|
||||
/// The resulting size also must be below the max read size.
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
tracing::trace!(start, end, "trying to extend");
|
||||
let start_blk_no = start as usize / self.chunk_size;
|
||||
let end_blk_no = div_round_up(end as usize, self.chunk_size);
|
||||
|
||||
let not_limited_by_max_read_size = {
|
||||
if let Some(max_read_size) = self.max_read_size {
|
||||
let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size;
|
||||
coalesced_size <= max_read_size
|
||||
} else {
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
// True if the second block starts in the same block or the immediate next block where the first block ended.
|
||||
//
|
||||
// Note: This automatically handles the case where two blocks are adjacent to each other,
|
||||
// whether they starts on chunk size boundary or not.
|
||||
let is_adjacent_chunk_read = {
|
||||
// 1. first.end & second.start are in the same block
|
||||
self.end_blk_no == start_blk_no + 1 ||
|
||||
// 2. first.end ends one block before second.start
|
||||
self.end_blk_no == start_blk_no
|
||||
};
|
||||
|
||||
if is_adjacent_chunk_read && not_limited_by_max_read_size {
|
||||
self.end_blk_no = end_blk_no;
|
||||
self.blobs_at
|
||||
.append(start, meta)
|
||||
.expect("LSNs are ordered within vectored reads");
|
||||
|
||||
return VectoredReadExtended::Yes;
|
||||
}
|
||||
|
||||
VectoredReadExtended::No
|
||||
}
|
||||
|
||||
pub(crate) fn size(&self) -> usize {
|
||||
(self.end_blk_no - self.start_blk_no) * self.chunk_size
|
||||
}
|
||||
|
||||
pub(crate) fn build(self) -> VectoredRead {
|
||||
let start = (self.start_blk_no * self.chunk_size) as u64;
|
||||
let end = (self.end_blk_no * self.chunk_size) as u64;
|
||||
VectoredRead {
|
||||
start,
|
||||
end,
|
||||
blobs_at: self.blobs_at,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum BlobFlag {
|
||||
None,
|
||||
@@ -166,14 +365,18 @@ pub struct VectoredReadPlanner {
|
||||
prev: Option<(Key, Lsn, u64, BlobFlag)>,
|
||||
|
||||
max_read_size: usize,
|
||||
|
||||
mode: VectoredReadCoalesceMode,
|
||||
}
|
||||
|
||||
impl VectoredReadPlanner {
|
||||
pub fn new(max_read_size: usize) -> Self {
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
Self {
|
||||
blobs: BTreeMap::new(),
|
||||
prev: None,
|
||||
max_read_size,
|
||||
mode,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -252,6 +455,7 @@ impl VectoredReadPlanner {
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
self.max_read_size,
|
||||
self.mode,
|
||||
);
|
||||
|
||||
let prev_read_builder = current_read_builder.replace(next_read_builder);
|
||||
@@ -303,6 +507,18 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
read.size(),
|
||||
buf.capacity()
|
||||
);
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
let align = virtual_file::get_io_buffer_alignment() as u64;
|
||||
debug_assert_eq!(
|
||||
read.start % align,
|
||||
0,
|
||||
"Read start at {} does not satisfy the required io buffer alignment ({} bytes)",
|
||||
read.start,
|
||||
align
|
||||
);
|
||||
}
|
||||
|
||||
let mut buf = self
|
||||
.file
|
||||
.read_exact_at(buf.slice(0..read.size()), read.start, ctx)
|
||||
@@ -310,27 +526,20 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
.into_inner();
|
||||
|
||||
let blobs_at = read.blobs_at.as_slice();
|
||||
let start_offset = blobs_at.first().expect("VectoredRead is never empty").0;
|
||||
|
||||
let start_offset = read.start;
|
||||
|
||||
let mut metas = Vec::with_capacity(blobs_at.len());
|
||||
|
||||
// Blobs in `read` only provide their starting offset. The end offset
|
||||
// of a blob is implicit: the start of the next blob if one exists
|
||||
// or the end of the read.
|
||||
let pairs = blobs_at.iter().zip(
|
||||
blobs_at
|
||||
.iter()
|
||||
.map(Some)
|
||||
.skip(1)
|
||||
.chain(std::iter::once(None)),
|
||||
);
|
||||
|
||||
// Some scratch space, put here for reusing the allocation
|
||||
let mut decompressed_vec = Vec::new();
|
||||
|
||||
for ((offset, meta), next) in pairs {
|
||||
let offset_in_buf = offset - start_offset;
|
||||
let first_len_byte = buf[offset_in_buf as usize];
|
||||
for (blob_start, meta) in blobs_at {
|
||||
let blob_start_in_buf = blob_start - start_offset;
|
||||
let first_len_byte = buf[blob_start_in_buf as usize];
|
||||
|
||||
// Each blob is prefixed by a header containing its size and compression information.
|
||||
// Extract the size and skip that header to find the start of the data.
|
||||
@@ -340,7 +549,7 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
(1, first_len_byte as u64, BYTE_UNCOMPRESSED)
|
||||
} else {
|
||||
let mut blob_size_buf = [0u8; 4];
|
||||
let offset_in_buf = offset_in_buf as usize;
|
||||
let offset_in_buf = blob_start_in_buf as usize;
|
||||
|
||||
blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]);
|
||||
blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK;
|
||||
@@ -353,12 +562,8 @@ impl<'a> VectoredBlobReader<'a> {
|
||||
)
|
||||
};
|
||||
|
||||
let start_raw = offset_in_buf + size_length;
|
||||
let end_raw = match next {
|
||||
Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset,
|
||||
None => start_raw + blob_size,
|
||||
};
|
||||
assert_eq!(end_raw - start_raw, blob_size);
|
||||
let start_raw = blob_start_in_buf + size_length;
|
||||
let end_raw = start_raw + blob_size;
|
||||
let (start, end);
|
||||
if compression_bits == BYTE_UNCOMPRESSED {
|
||||
start = start_raw as usize;
|
||||
@@ -407,18 +612,22 @@ pub struct StreamingVectoredReadPlanner {
|
||||
max_cnt: usize,
|
||||
/// Size of the current batch
|
||||
cnt: usize,
|
||||
|
||||
mode: VectoredReadCoalesceMode,
|
||||
}
|
||||
|
||||
impl StreamingVectoredReadPlanner {
|
||||
pub fn new(max_read_size: u64, max_cnt: usize) -> Self {
|
||||
assert!(max_cnt > 0);
|
||||
assert!(max_read_size > 0);
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
Self {
|
||||
read_builder: None,
|
||||
prev: None,
|
||||
max_cnt,
|
||||
max_read_size,
|
||||
cnt: 0,
|
||||
mode,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -467,17 +676,12 @@ impl StreamingVectoredReadPlanner {
|
||||
}
|
||||
None => {
|
||||
self.read_builder = {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
.append(start_offset, BlobMeta { key, lsn })
|
||||
.expect("First insertion always succeeds");
|
||||
|
||||
Some(VectoredReadBuilder {
|
||||
start: start_offset,
|
||||
end: end_offset,
|
||||
blobs_at,
|
||||
max_read_size: None,
|
||||
})
|
||||
Some(VectoredReadBuilder::new_streaming(
|
||||
start_offset,
|
||||
end_offset,
|
||||
BlobMeta { key, lsn },
|
||||
self.mode,
|
||||
))
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -511,7 +715,9 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) {
|
||||
assert_eq!(read.start, offset_range.first().unwrap().2);
|
||||
let align = virtual_file::get_io_buffer_alignment() as u64;
|
||||
assert_eq!(read.start % align, 0);
|
||||
assert_eq!(read.start / align, offset_range.first().unwrap().2 / align);
|
||||
|
||||
let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect();
|
||||
|
||||
@@ -525,6 +731,63 @@ mod tests {
|
||||
assert_eq!(expected_offsets_in_read, offsets_in_read);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn planner_chunked_coalesce_all_test() {
|
||||
use crate::virtual_file;
|
||||
|
||||
const CHUNK_SIZE: u64 = 512;
|
||||
virtual_file::set_io_buffer_alignment(CHUNK_SIZE as usize).unwrap();
|
||||
let max_read_size = CHUNK_SIZE as usize * 8;
|
||||
let key = Key::MIN;
|
||||
let lsn = Lsn(0);
|
||||
|
||||
let blob_descriptions = [
|
||||
(key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN
|
||||
(key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap
|
||||
(key, lsn, CHUNK_SIZE / 2, BlobFlag::None),
|
||||
(key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap
|
||||
(key, lsn, CHUNK_SIZE, BlobFlag::None),
|
||||
(key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None),
|
||||
(key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap
|
||||
(key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None),
|
||||
(key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None),
|
||||
(key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce.
|
||||
(key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None),
|
||||
(key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size)
|
||||
(key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk
|
||||
(key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce)
|
||||
];
|
||||
|
||||
let ranges = [
|
||||
&[
|
||||
blob_descriptions[0],
|
||||
blob_descriptions[2],
|
||||
blob_descriptions[4],
|
||||
blob_descriptions[5],
|
||||
blob_descriptions[7],
|
||||
blob_descriptions[8],
|
||||
blob_descriptions[10],
|
||||
],
|
||||
&blob_descriptions[11..12],
|
||||
&blob_descriptions[13..],
|
||||
];
|
||||
|
||||
let mut planner = VectoredReadPlanner::new(max_read_size);
|
||||
for (key, lsn, offset, flag) in blob_descriptions {
|
||||
planner.handle(key, lsn, offset, flag);
|
||||
}
|
||||
|
||||
planner.handle_range_end(652 * 1024);
|
||||
|
||||
let reads = planner.finish();
|
||||
|
||||
assert_eq!(reads.len(), ranges.len());
|
||||
|
||||
for (idx, read) in reads.iter().enumerate() {
|
||||
validate_read(read, ranges[idx]);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn planner_max_read_size_test() {
|
||||
let max_read_size = 128 * 1024;
|
||||
@@ -737,6 +1000,7 @@ mod tests {
|
||||
let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16;
|
||||
let mut buf = BytesMut::with_capacity(reserved_bytes);
|
||||
|
||||
let mode = VectoredReadCoalesceMode::get();
|
||||
let vectored_blob_reader = VectoredBlobReader::new(&file);
|
||||
let meta = BlobMeta {
|
||||
key: Key::MIN,
|
||||
@@ -748,7 +1012,7 @@ mod tests {
|
||||
if idx + 1 == offsets.len() {
|
||||
continue;
|
||||
}
|
||||
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096);
|
||||
let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode);
|
||||
let read = read_builder.build();
|
||||
let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?;
|
||||
assert_eq!(result.blobs.len(), 1);
|
||||
@@ -784,4 +1048,12 @@ mod tests {
|
||||
round_trip_test_compressed(&blobs, true).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_div_round_up() {
|
||||
const CHUNK_SIZE: usize = 512;
|
||||
assert_eq!(1, div_round_up(200, CHUNK_SIZE));
|
||||
assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE));
|
||||
assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
//! This is similar to PostgreSQL's virtual file descriptor facility in
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
|
||||
use crate::context::RequestContext;
|
||||
use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
|
||||
|
||||
@@ -1140,10 +1141,13 @@ impl OpenFiles {
|
||||
/// server startup.
|
||||
///
|
||||
#[cfg(not(test))]
|
||||
pub fn init(num_slots: usize, engine: IoEngineKind) {
|
||||
pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) {
|
||||
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
|
||||
panic!("virtual_file::init called twice");
|
||||
}
|
||||
if set_io_buffer_alignment(io_buffer_alignment).is_err() {
|
||||
panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two");
|
||||
}
|
||||
io_engine::init(engine);
|
||||
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
|
||||
}
|
||||
@@ -1167,6 +1171,61 @@ fn get_open_files() -> &'static OpenFiles {
|
||||
}
|
||||
}
|
||||
|
||||
static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT);
|
||||
|
||||
/// Returns true if `x` is zero or a power of two.
|
||||
fn is_zero_or_power_of_two(x: usize) -> bool {
|
||||
(x == 0) || ((x & (x - 1)) == 0)
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> {
|
||||
if is_zero_or_power_of_two(align) {
|
||||
IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed);
|
||||
Ok(())
|
||||
} else {
|
||||
Err(align)
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified.
|
||||
///
|
||||
/// This function should be used to check the raw config value.
|
||||
pub(crate) fn get_io_buffer_alignment_raw() -> usize {
|
||||
let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
if cfg!(test) {
|
||||
let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT";
|
||||
if align == DEFAULT_IO_BUFFER_ALIGNMENT {
|
||||
if let Some(test_align) = utils::env::var(env_var_name) {
|
||||
if is_zero_or_power_of_two(test_align) {
|
||||
test_align
|
||||
} else {
|
||||
panic!("IO buffer alignment ({test_align}) is not a power of two");
|
||||
}
|
||||
} else {
|
||||
crate::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT
|
||||
}
|
||||
} else {
|
||||
align
|
||||
}
|
||||
} else {
|
||||
align
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero.
|
||||
///
|
||||
/// This function should be used for getting the actual alignment value to use.
|
||||
pub(crate) fn get_io_buffer_alignment() -> usize {
|
||||
let align = get_io_buffer_alignment_raw();
|
||||
if align == DEFAULT_IO_BUFFER_ALIGNMENT {
|
||||
1
|
||||
} else {
|
||||
align
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::context::DownloadBehavior;
|
||||
|
||||
@@ -78,6 +78,7 @@ where
|
||||
.expect("must not use after we returned an error")
|
||||
}
|
||||
|
||||
/// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted.
|
||||
#[cfg_attr(target_os = "macos", allow(dead_code))]
|
||||
pub async fn write_buffered<S: IoBuf + Send>(
|
||||
&mut self,
|
||||
|
||||
@@ -1,13 +1,7 @@
|
||||
commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master)
|
||||
Author: Alexey Masterov <alexeymasterov@neon.tech>
|
||||
Date: Thu Jun 6 08:02:42 2024 +0000
|
||||
|
||||
Patch expected files to consider Neon's log messages
|
||||
|
||||
diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
index da723b8..f8d0102 100644
|
||||
--- a/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
+++ b/ext-src/pg_hint_plan-src/expected/ut-A.out
|
||||
diff --git a/expected/ut-A.out b/expected/ut-A.out
|
||||
index da723b8..5328114 100644
|
||||
--- a/expected/ut-A.out
|
||||
+++ b/expected/ut-A.out
|
||||
@@ -9,13 +9,16 @@ SET search_path TO public;
|
||||
----
|
||||
-- No.A-1-1-3
|
||||
@@ -25,10 +19,18 @@ index da723b8..f8d0102 100644
|
||||
DROP SCHEMA other_schema;
|
||||
----
|
||||
---- No. A-5-1 comment pattern
|
||||
diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
@@ -3175,6 +3178,7 @@ SELECT s.query, s.calls
|
||||
FROM public.pg_stat_statements s
|
||||
JOIN pg_catalog.pg_database d
|
||||
ON (s.dbid = d.oid)
|
||||
+ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%'
|
||||
ORDER BY 1;
|
||||
query | calls
|
||||
--------------------------------------+-------
|
||||
diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
|
||||
index d372459..6282afe 100644
|
||||
--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
+++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out
|
||||
--- a/expected/ut-fdw.out
|
||||
+++ b/expected/ut-fdw.out
|
||||
@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
|
||||
SET client_min_messages TO LOG;
|
||||
SET pg_hint_plan.enable_hint TO on;
|
||||
@@ -37,3 +39,15 @@ index d372459..6282afe 100644
|
||||
CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
|
||||
CREATE USER MAPPING FOR PUBLIC SERVER file_server;
|
||||
CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
|
||||
diff --git a/sql/ut-A.sql b/sql/ut-A.sql
|
||||
index 7c7d58a..4fd1a07 100644
|
||||
--- a/sql/ut-A.sql
|
||||
+++ b/sql/ut-A.sql
|
||||
@@ -963,6 +963,7 @@ SELECT s.query, s.calls
|
||||
FROM public.pg_stat_statements s
|
||||
JOIN pg_catalog.pg_database d
|
||||
ON (s.dbid = d.oid)
|
||||
+ WHERE s.query LIKE 'SELECT * FROM s1.t1%' OR s.query LIKE '%pg_stat_statements_reset%'
|
||||
ORDER BY 1;
|
||||
|
||||
----
|
||||
@@ -550,9 +550,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
case 2:
|
||||
pagestream_query = psprintf("pagestream_v2 %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
case 1:
|
||||
pagestream_query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
|
||||
break;
|
||||
default:
|
||||
elog(ERROR, "unexpected neon_protocol_version %d", neon_protocol_version);
|
||||
}
|
||||
@@ -1063,7 +1060,7 @@ pg_init_libpagestore(void)
|
||||
NULL,
|
||||
&neon_protocol_version,
|
||||
2, /* use protocol version 2 */
|
||||
1, /* min */
|
||||
2, /* min */
|
||||
2, /* max */
|
||||
PGC_SU_BACKEND,
|
||||
0, /* no flags required */
|
||||
|
||||
@@ -87,9 +87,8 @@ typedef enum {
|
||||
* can skip traversing through recent layers which we know to not contain any
|
||||
* versions for the requested page.
|
||||
*
|
||||
* These structs describe the V2 of these requests. The old V1 protocol contained
|
||||
* just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is
|
||||
* set to 1, we will convert these to the V1 requests before sending.
|
||||
* These structs describe the V2 of these requests. (The old now-defunct V1
|
||||
* protocol contained just one LSN and a boolean 'latest' flag.)
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
|
||||
@@ -1001,51 +1001,10 @@ nm_pack_request(NeonRequest *msg)
|
||||
|
||||
initStringInfo(&s);
|
||||
|
||||
if (neon_protocol_version >= 2)
|
||||
{
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
pq_sendint64(&s, msg->lsn);
|
||||
pq_sendint64(&s, msg->not_modified_since);
|
||||
}
|
||||
else
|
||||
{
|
||||
bool latest;
|
||||
XLogRecPtr lsn;
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
pq_sendint64(&s, msg->lsn);
|
||||
pq_sendint64(&s, msg->not_modified_since);
|
||||
|
||||
/*
|
||||
* In primary, we always request the latest page version.
|
||||
*/
|
||||
if (!RecoveryInProgress())
|
||||
{
|
||||
latest = true;
|
||||
lsn = msg->not_modified_since;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* In the protocol V1, we cannot represent that we want to read
|
||||
* page at LSN X, and we know that it hasn't been modified since
|
||||
* Y. We can either use 'not_modified_lsn' as the request LSN, and
|
||||
* risk getting an error if that LSN is too old and has already
|
||||
* fallen out of the pageserver's GC horizon, or we can send
|
||||
* 'request_lsn', causing the pageserver to possibly wait for the
|
||||
* recent WAL to arrive unnecessarily. Or something in between. We
|
||||
* choose to use the old LSN and risk GC errors, because that's
|
||||
* what we've done historically.
|
||||
*/
|
||||
latest = false;
|
||||
lsn = msg->not_modified_since;
|
||||
}
|
||||
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
pq_sendbyte(&s, latest);
|
||||
pq_sendint64(&s, lsn);
|
||||
}
|
||||
|
||||
/*
|
||||
* The rest of the request messages are the same between protocol V1 and
|
||||
* V2
|
||||
*/
|
||||
switch (messageTag(msg))
|
||||
{
|
||||
/* pagestore_client -> pagestore */
|
||||
|
||||
@@ -220,6 +220,64 @@ nwp_register_gucs(void)
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
|
||||
{
|
||||
int n_safekeepers = 0;
|
||||
char *curr_sk = safekeepers_list;
|
||||
|
||||
for (char *coma = safekeepers_list; coma != NULL && *coma != '\0'; curr_sk = coma)
|
||||
{
|
||||
if (++n_safekeepers >= MAX_SAFEKEEPERS) {
|
||||
wpg_log(FATAL, "too many safekeepers");
|
||||
}
|
||||
|
||||
coma = strchr(coma, ',');
|
||||
safekeepers[n_safekeepers-1] = curr_sk;
|
||||
|
||||
if (coma != NULL) {
|
||||
*coma++ = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
return n_safekeepers;
|
||||
}
|
||||
|
||||
/*
|
||||
* Accept two coma-separated strings with list of safekeeper host:port addresses.
|
||||
* Split them into arrays and return false if two sets do not match, ignoring the order.
|
||||
*/
|
||||
static bool
|
||||
safekeepers_cmp(char *old, char *new)
|
||||
{
|
||||
char *safekeepers_old[MAX_SAFEKEEPERS];
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
|
||||
len_old = split_safekeepers_list(old, safekeepers_old);
|
||||
len_new = split_safekeepers_list(new, safekeepers_new);
|
||||
|
||||
if (len_old != len_new)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
qsort(&safekeepers_old, len_old, sizeof(char *), pg_qsort_strcmp);
|
||||
qsort(&safekeepers_new, len_new, sizeof(char *), pg_qsort_strcmp);
|
||||
|
||||
for (int i = 0; i < len_new; i++)
|
||||
{
|
||||
if (strcmp(safekeepers_old[i], safekeepers_new[i]) != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if
|
||||
* the list changed.
|
||||
@@ -235,19 +293,26 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
wpg_log(FATAL, "neon.safekeepers is empty");
|
||||
}
|
||||
|
||||
/* Copy values because we will modify them in split_safekeepers_list() */
|
||||
char *newval_copy = pstrdup(newval);
|
||||
char *oldval = pstrdup(wal_acceptors_list);
|
||||
|
||||
/*
|
||||
* TODO: restarting through FATAL is stupid and introduces 1s delay before
|
||||
* next bgw start. We should refactor walproposer to allow graceful exit and
|
||||
* thus remove this delay.
|
||||
* XXX: If you change anything here, sync with test_safekeepers_reconfigure_reorder.
|
||||
*/
|
||||
if (strcmp(wal_acceptors_list, newval) != 0)
|
||||
if (!safekeepers_cmp(oldval, newval_copy))
|
||||
{
|
||||
wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s",
|
||||
wal_acceptors_list, newval);
|
||||
}
|
||||
pfree(newval_copy);
|
||||
pfree(oldval);
|
||||
}
|
||||
|
||||
/* Check if we need to suspend inserts because of lagging replication. */
|
||||
/* Check if we need to suspend inserts because of lagging replication. */
|
||||
static uint64
|
||||
backpressure_lag_impl(void)
|
||||
{
|
||||
|
||||
@@ -6,7 +6,7 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a
|
||||
new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon)
|
||||
* postgres
|
||||
uses postgres to select auth secrets of existing roles. Useful for local testing
|
||||
* link
|
||||
* web (or link)
|
||||
sends login link for all usernames
|
||||
|
||||
Also proxy can expose following services to the external world:
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
//! Client authentication mechanisms.
|
||||
|
||||
pub mod backend;
|
||||
pub use backend::BackendType;
|
||||
pub use backend::Backend;
|
||||
|
||||
mod credentials;
|
||||
pub use credentials::{
|
||||
pub(crate) use credentials::{
|
||||
check_peer_addr_is_in_list, endpoint_sni, ComputeUserInfoMaybeEndpoint,
|
||||
ComputeUserInfoParseError, IpPattern,
|
||||
};
|
||||
|
||||
mod password_hack;
|
||||
pub use password_hack::parse_endpoint_param;
|
||||
pub(crate) use password_hack::parse_endpoint_param;
|
||||
use password_hack::PasswordHackPayload;
|
||||
|
||||
mod flow;
|
||||
pub use flow::*;
|
||||
pub(crate) use flow::*;
|
||||
use tokio::time::error::Elapsed;
|
||||
|
||||
use crate::{
|
||||
@@ -25,13 +25,13 @@ use std::{io, net::IpAddr};
|
||||
use thiserror::Error;
|
||||
|
||||
/// Convenience wrapper for the authentication error.
|
||||
pub type Result<T> = std::result::Result<T, AuthError>;
|
||||
pub(crate) type Result<T> = std::result::Result<T, AuthError>;
|
||||
|
||||
/// Common authentication error.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum AuthErrorImpl {
|
||||
pub(crate) enum AuthErrorImpl {
|
||||
#[error(transparent)]
|
||||
Link(#[from] backend::LinkAuthError),
|
||||
Web(#[from] backend::WebAuthError),
|
||||
|
||||
#[error(transparent)]
|
||||
GetAuthInfo(#[from] console::errors::GetAuthInfoError),
|
||||
@@ -77,30 +77,30 @@ pub enum AuthErrorImpl {
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
#[error(transparent)]
|
||||
pub struct AuthError(Box<AuthErrorImpl>);
|
||||
pub(crate) struct AuthError(Box<AuthErrorImpl>);
|
||||
|
||||
impl AuthError {
|
||||
pub fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
|
||||
pub(crate) fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
|
||||
AuthErrorImpl::BadAuthMethod(name.into()).into()
|
||||
}
|
||||
|
||||
pub fn auth_failed(user: impl Into<Box<str>>) -> Self {
|
||||
pub(crate) fn auth_failed(user: impl Into<Box<str>>) -> Self {
|
||||
AuthErrorImpl::AuthFailed(user.into()).into()
|
||||
}
|
||||
|
||||
pub fn ip_address_not_allowed(ip: IpAddr) -> Self {
|
||||
pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self {
|
||||
AuthErrorImpl::IpAddressNotAllowed(ip).into()
|
||||
}
|
||||
|
||||
pub fn too_many_connections() -> Self {
|
||||
pub(crate) fn too_many_connections() -> Self {
|
||||
AuthErrorImpl::TooManyConnections.into()
|
||||
}
|
||||
|
||||
pub fn is_auth_failed(&self) -> bool {
|
||||
pub(crate) fn is_auth_failed(&self) -> bool {
|
||||
matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_))
|
||||
}
|
||||
|
||||
pub fn user_timeout(elapsed: Elapsed) -> Self {
|
||||
pub(crate) fn user_timeout(elapsed: Elapsed) -> Self {
|
||||
AuthErrorImpl::UserTimeout(elapsed).into()
|
||||
}
|
||||
}
|
||||
@@ -114,7 +114,7 @@ impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
|
||||
impl UserFacingError for AuthError {
|
||||
fn to_string_client(&self) -> String {
|
||||
match self.0.as_ref() {
|
||||
AuthErrorImpl::Link(e) => e.to_string_client(),
|
||||
AuthErrorImpl::Web(e) => e.to_string_client(),
|
||||
AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(),
|
||||
AuthErrorImpl::Sasl(e) => e.to_string_client(),
|
||||
AuthErrorImpl::AuthFailed(_) => self.to_string(),
|
||||
@@ -132,7 +132,7 @@ impl UserFacingError for AuthError {
|
||||
impl ReportableError for AuthError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self.0.as_ref() {
|
||||
AuthErrorImpl::Link(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::Web(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::Sasl(e) => e.get_error_kind(),
|
||||
AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User,
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
mod classic;
|
||||
mod hacks;
|
||||
pub mod jwt;
|
||||
mod link;
|
||||
pub mod local;
|
||||
mod web;
|
||||
|
||||
use std::net::IpAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use ipnet::{Ipv4Net, Ipv6Net};
|
||||
pub use link::LinkAuthError;
|
||||
use local::LocalBackend;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
use tracing::{info, warn};
|
||||
pub(crate) use web::WebAuthError;
|
||||
|
||||
use crate::auth::credentials::check_peer_addr_is_in_list;
|
||||
use crate::auth::{validate_password_and_exchange, AuthError};
|
||||
@@ -65,27 +65,27 @@ impl<T> std::ops::Deref for MaybeOwned<'_, T> {
|
||||
/// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`],
|
||||
/// this helps us provide the credentials only to those auth
|
||||
/// backends which require them for the authentication process.
|
||||
pub enum BackendType<'a, T, D> {
|
||||
pub enum Backend<'a, T, D> {
|
||||
/// Cloud API (V2).
|
||||
Console(MaybeOwned<'a, ConsoleBackend>, T),
|
||||
/// Authentication via a web browser.
|
||||
Link(MaybeOwned<'a, url::ApiUrl>, D),
|
||||
Web(MaybeOwned<'a, url::ApiUrl>, D),
|
||||
/// Local proxy uses configured auth credentials and does not wake compute
|
||||
Local(MaybeOwned<'a, LocalBackend>),
|
||||
}
|
||||
|
||||
pub trait TestBackend: Send + Sync + 'static {
|
||||
#[cfg(test)]
|
||||
pub(crate) trait TestBackend: Send + Sync + 'static {
|
||||
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
|
||||
fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), console::errors::GetAuthInfoError>;
|
||||
fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError>;
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BackendType<'_, (), ()> {
|
||||
impl std::fmt::Display for Backend<'_, (), ()> {
|
||||
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Console(api, _) => match &**api {
|
||||
Self::Console(api, ()) => match &**api {
|
||||
ConsoleBackend::Console(endpoint) => {
|
||||
fmt.debug_tuple("Console").field(&endpoint.url()).finish()
|
||||
}
|
||||
@@ -96,73 +96,73 @@ impl std::fmt::Display for BackendType<'_, (), ()> {
|
||||
#[cfg(test)]
|
||||
ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(),
|
||||
},
|
||||
Self::Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(),
|
||||
Self::Web(url, ()) => fmt.debug_tuple("Web").field(&url.as_str()).finish(),
|
||||
Self::Local(_) => fmt.debug_tuple("Local").finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T, D> BackendType<'_, T, D> {
|
||||
impl<T, D> Backend<'_, T, D> {
|
||||
/// Very similar to [`std::option::Option::as_ref`].
|
||||
/// This helps us pass structured config to async tasks.
|
||||
pub fn as_ref(&self) -> BackendType<'_, &T, &D> {
|
||||
pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> {
|
||||
match self {
|
||||
Self::Console(c, x) => BackendType::Console(MaybeOwned::Borrowed(c), x),
|
||||
Self::Link(c, x) => BackendType::Link(MaybeOwned::Borrowed(c), x),
|
||||
Self::Local(l) => BackendType::Local(MaybeOwned::Borrowed(l)),
|
||||
Self::Console(c, x) => Backend::Console(MaybeOwned::Borrowed(c), x),
|
||||
Self::Web(c, x) => Backend::Web(MaybeOwned::Borrowed(c), x),
|
||||
Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T, D> BackendType<'a, T, D> {
|
||||
impl<'a, T, D> Backend<'a, T, D> {
|
||||
/// Very similar to [`std::option::Option::map`].
|
||||
/// Maps [`BackendType<T>`] to [`BackendType<R>`] by applying
|
||||
/// Maps [`Backend<T>`] to [`Backend<R>`] by applying
|
||||
/// a function to a contained value.
|
||||
pub fn map<R>(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> {
|
||||
pub(crate) fn map<R>(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> {
|
||||
match self {
|
||||
Self::Console(c, x) => BackendType::Console(c, f(x)),
|
||||
Self::Link(c, x) => BackendType::Link(c, x),
|
||||
Self::Local(l) => BackendType::Local(l),
|
||||
Self::Console(c, x) => Backend::Console(c, f(x)),
|
||||
Self::Web(c, x) => Backend::Web(c, x),
|
||||
Self::Local(l) => Backend::Local(l),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<'a, T, D, E> BackendType<'a, Result<T, E>, D> {
|
||||
impl<'a, T, D, E> Backend<'a, Result<T, E>, D> {
|
||||
/// Very similar to [`std::option::Option::transpose`].
|
||||
/// This is most useful for error handling.
|
||||
pub fn transpose(self) -> Result<BackendType<'a, T, D>, E> {
|
||||
pub(crate) fn transpose(self) -> Result<Backend<'a, T, D>, E> {
|
||||
match self {
|
||||
Self::Console(c, x) => x.map(|x| BackendType::Console(c, x)),
|
||||
Self::Link(c, x) => Ok(BackendType::Link(c, x)),
|
||||
Self::Local(l) => Ok(BackendType::Local(l)),
|
||||
Self::Console(c, x) => x.map(|x| Backend::Console(c, x)),
|
||||
Self::Web(c, x) => Ok(Backend::Web(c, x)),
|
||||
Self::Local(l) => Ok(Backend::Local(l)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ComputeCredentials {
|
||||
pub info: ComputeUserInfo,
|
||||
pub keys: ComputeCredentialKeys,
|
||||
pub(crate) struct ComputeCredentials {
|
||||
pub(crate) info: ComputeUserInfo,
|
||||
pub(crate) keys: ComputeCredentialKeys,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ComputeUserInfoNoEndpoint {
|
||||
pub user: RoleName,
|
||||
pub options: NeonOptions,
|
||||
pub(crate) struct ComputeUserInfoNoEndpoint {
|
||||
pub(crate) user: RoleName,
|
||||
pub(crate) options: NeonOptions,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ComputeUserInfo {
|
||||
pub endpoint: EndpointId,
|
||||
pub user: RoleName,
|
||||
pub options: NeonOptions,
|
||||
pub(crate) struct ComputeUserInfo {
|
||||
pub(crate) endpoint: EndpointId,
|
||||
pub(crate) user: RoleName,
|
||||
pub(crate) options: NeonOptions,
|
||||
}
|
||||
|
||||
impl ComputeUserInfo {
|
||||
pub fn endpoint_cache_key(&self) -> EndpointCacheKey {
|
||||
pub(crate) fn endpoint_cache_key(&self) -> EndpointCacheKey {
|
||||
self.options.get_cache_key(&self.endpoint)
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ComputeCredentialKeys {
|
||||
pub(crate) enum ComputeCredentialKeys {
|
||||
Password(Vec<u8>),
|
||||
AuthKeys(AuthKeys),
|
||||
None,
|
||||
@@ -222,7 +222,7 @@ impl RateBucketInfo {
|
||||
}
|
||||
|
||||
impl AuthenticationConfig {
|
||||
pub fn check_rate_limit(
|
||||
pub(crate) fn check_rate_limit(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
config: &AuthenticationConfig,
|
||||
@@ -324,21 +324,20 @@ async fn auth_quirks(
|
||||
};
|
||||
let (cached_entry, secret) = cached_secret.take_value();
|
||||
|
||||
let secret = match secret {
|
||||
Some(secret) => config.check_rate_limit(
|
||||
let secret = if let Some(secret) = secret {
|
||||
config.check_rate_limit(
|
||||
ctx,
|
||||
config,
|
||||
secret,
|
||||
&info.endpoint,
|
||||
unauthenticated_password.is_some() || allow_cleartext,
|
||||
)?,
|
||||
None => {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
|
||||
}
|
||||
)?
|
||||
} else {
|
||||
// If we don't have an authentication secret, we mock one to
|
||||
// prevent malicious probing (possible due to missing protocol steps).
|
||||
// This mocked secret will never lead to successful authentication.
|
||||
info!("authentication info not found, mocking it");
|
||||
AuthSecret::Scram(scram::ServerSecret::mock(rand::random()))
|
||||
};
|
||||
|
||||
match authenticate_with_secret(
|
||||
@@ -404,35 +403,26 @@ async fn authenticate_with_secret(
|
||||
classic::authenticate(ctx, info, client, config, secret).await
|
||||
}
|
||||
|
||||
impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
/// Get compute endpoint name from the credentials.
|
||||
pub fn get_endpoint(&self) -> Option<EndpointId> {
|
||||
match self {
|
||||
Self::Console(_, user_info) => user_info.endpoint_id.clone(),
|
||||
Self::Link(_, _) => Some("link".into()),
|
||||
Self::Local(_) => Some("local".into()),
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
/// Get username from the credentials.
|
||||
pub fn get_user(&self) -> &str {
|
||||
pub(crate) fn get_user(&self) -> &str {
|
||||
match self {
|
||||
Self::Console(_, user_info) => &user_info.user,
|
||||
Self::Link(_, _) => "link",
|
||||
Self::Web(_, ()) => "web",
|
||||
Self::Local(_) => "local",
|
||||
}
|
||||
}
|
||||
|
||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
||||
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
pub(crate) async fn authenticate(
|
||||
self,
|
||||
ctx: &RequestMonitoring,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> auth::Result<BackendType<'a, ComputeCredentials, NodeInfo>> {
|
||||
) -> auth::Result<Backend<'a, ComputeCredentials, NodeInfo>> {
|
||||
let res = match self {
|
||||
Self::Console(api, user_info) => {
|
||||
info!(
|
||||
@@ -451,15 +441,15 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
endpoint_rate_limiter,
|
||||
)
|
||||
.await?;
|
||||
BackendType::Console(api, credentials)
|
||||
Backend::Console(api, credentials)
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Self::Link(url, _) => {
|
||||
info!("performing link authentication");
|
||||
Self::Web(url, ()) => {
|
||||
info!("performing web authentication");
|
||||
|
||||
let info = link::authenticate(ctx, &url, client).await?;
|
||||
let info = web::authenticate(ctx, &url, client).await?;
|
||||
|
||||
BackendType::Link(url, info)
|
||||
Backend::Web(url, info)
|
||||
}
|
||||
Self::Local(_) => {
|
||||
return Err(auth::AuthError::bad_auth_method("invalid for local proxy"))
|
||||
@@ -471,39 +461,39 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> {
|
||||
}
|
||||
}
|
||||
|
||||
impl BackendType<'_, ComputeUserInfo, &()> {
|
||||
pub async fn get_role_secret(
|
||||
impl Backend<'_, ComputeUserInfo, &()> {
|
||||
pub(crate) async fn get_role_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedRoleSecret, GetAuthInfoError> {
|
||||
match self {
|
||||
Self::Console(api, user_info) => api.get_role_secret(ctx, user_info).await,
|
||||
Self::Link(_, _) => Ok(Cached::new_uncached(None)),
|
||||
Self::Web(_, ()) => Ok(Cached::new_uncached(None)),
|
||||
Self::Local(_) => Ok(Cached::new_uncached(None)),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_allowed_ips_and_secret(
|
||||
pub(crate) async fn get_allowed_ips_and_secret(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
|
||||
match self {
|
||||
Self::Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await,
|
||||
Self::Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||
Self::Web(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||
Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||
impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
match self {
|
||||
Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Self::Link(_, info) => Ok(Cached::new_uncached(info.clone())),
|
||||
Self::Web(_, info) => Ok(Cached::new_uncached(info.clone())),
|
||||
Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
|
||||
}
|
||||
}
|
||||
@@ -511,21 +501,23 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> {
|
||||
fn get_keys(&self) -> &ComputeCredentialKeys {
|
||||
match self {
|
||||
Self::Console(_, creds) => &creds.keys,
|
||||
Self::Link(_, _) => &ComputeCredentialKeys::None,
|
||||
Self::Web(_, _) => &ComputeCredentialKeys::None,
|
||||
Self::Local(_) => &ComputeCredentialKeys::None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
) -> Result<CachedNodeInfo, console::errors::WakeComputeError> {
|
||||
match self {
|
||||
Self::Console(api, creds) => api.wake_compute(ctx, &creds.info).await,
|
||||
Self::Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"),
|
||||
Self::Web(_, ()) => {
|
||||
unreachable!("web auth flow doesn't support waking the compute")
|
||||
}
|
||||
Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())),
|
||||
}
|
||||
}
|
||||
@@ -533,7 +525,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
fn get_keys(&self) -> &ComputeCredentialKeys {
|
||||
match self {
|
||||
Self::Console(_, creds) => &creds.keys,
|
||||
Self::Link(_, _) => &ComputeCredentialKeys::None,
|
||||
Self::Web(_, ()) => &ComputeCredentialKeys::None,
|
||||
Self::Local(_) => &ComputeCredentialKeys::None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use tracing::{info, warn};
|
||||
/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
|
||||
/// These properties are benefical for serverless JS workers, so we
|
||||
/// use this mechanism for websocket connections.
|
||||
pub async fn authenticate_cleartext(
|
||||
pub(crate) async fn authenticate_cleartext(
|
||||
ctx: &RequestMonitoring,
|
||||
info: ComputeUserInfo,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
@@ -59,7 +59,7 @@ pub async fn authenticate_cleartext(
|
||||
/// Workaround for clients which don't provide an endpoint (project) name.
|
||||
/// Similar to [`authenticate_cleartext`], but there's a specific password format,
|
||||
/// and passwords are not yet validated (we don't know how to validate them!)
|
||||
pub async fn password_hack_no_authentication(
|
||||
pub(crate) async fn password_hack_no_authentication(
|
||||
ctx: &RequestMonitoring,
|
||||
info: ComputeUserInfoNoEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
|
||||
@@ -22,27 +22,27 @@ const MAX_RENEW: Duration = Duration::from_secs(3600);
|
||||
const MAX_JWK_BODY_SIZE: usize = 64 * 1024;
|
||||
|
||||
/// How to get the JWT auth rules
|
||||
pub trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
|
||||
fn fetch_auth_rules(
|
||||
&self,
|
||||
role_name: RoleName,
|
||||
) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
|
||||
}
|
||||
|
||||
pub struct AuthRule {
|
||||
pub id: String,
|
||||
pub jwks_url: url::Url,
|
||||
pub audience: Option<String>,
|
||||
pub(crate) struct AuthRule {
|
||||
pub(crate) id: String,
|
||||
pub(crate) jwks_url: url::Url,
|
||||
pub(crate) audience: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct JwkCache {
|
||||
pub(crate) struct JwkCache {
|
||||
client: reqwest::Client,
|
||||
|
||||
map: DashMap<(EndpointId, RoleName), Arc<JwkCacheEntryLock>>,
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntry {
|
||||
pub(crate) struct JwkCacheEntry {
|
||||
/// Should refetch at least every hour to verify when old keys have been removed.
|
||||
/// Should refetch when new key IDs are seen only every 5 minutes or so
|
||||
last_retrieved: Instant,
|
||||
@@ -75,7 +75,7 @@ impl KeySet {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct JwkCacheEntryLock {
|
||||
pub(crate) struct JwkCacheEntryLock {
|
||||
cached: ArcSwapOption<JwkCacheEntry>,
|
||||
lookup: tokio::sync::Semaphore,
|
||||
}
|
||||
@@ -224,10 +224,10 @@ impl JwkCacheEntryLock {
|
||||
// where Signature = alg(<B64(Header)> || . || <B64(Payload)>);
|
||||
|
||||
let (header_payload, signature) = jwt
|
||||
.rsplit_once(".")
|
||||
.rsplit_once('.')
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
let (header, payload) = header_payload
|
||||
.split_once(".")
|
||||
.split_once('.')
|
||||
.context("Provided authentication token is not a valid JWT encoding")?;
|
||||
|
||||
let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
|
||||
@@ -309,7 +309,7 @@ impl JwkCacheEntryLock {
|
||||
}
|
||||
|
||||
impl JwkCache {
|
||||
pub async fn check_jwt<F: FetchAuthRules>(
|
||||
pub(crate) async fn check_jwt<F: FetchAuthRules>(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
endpoint: EndpointId,
|
||||
@@ -320,14 +320,11 @@ impl JwkCache {
|
||||
// try with just a read lock first
|
||||
let key = (endpoint, role_name.clone());
|
||||
let entry = self.map.get(&key).as_deref().map(Arc::clone);
|
||||
let entry = match entry {
|
||||
Some(entry) => entry,
|
||||
None => {
|
||||
// acquire a write lock after to insert.
|
||||
let entry = self.map.entry(key).or_default();
|
||||
Arc::clone(&*entry)
|
||||
}
|
||||
};
|
||||
let entry = entry.unwrap_or_else(|| {
|
||||
// acquire a write lock after to insert.
|
||||
let entry = self.map.entry(key).or_default();
|
||||
Arc::clone(&*entry)
|
||||
});
|
||||
|
||||
entry
|
||||
.check_jwt(ctx, jwt, &self.client, role_name, fetch)
|
||||
|
||||
@@ -16,16 +16,14 @@ use crate::{
|
||||
use super::jwt::{AuthRule, FetchAuthRules, JwkCache};
|
||||
|
||||
pub struct LocalBackend {
|
||||
pub jwks_cache: JwkCache,
|
||||
pub postgres_addr: SocketAddr,
|
||||
pub node_info: NodeInfo,
|
||||
pub(crate) jwks_cache: JwkCache,
|
||||
pub(crate) node_info: NodeInfo,
|
||||
}
|
||||
|
||||
impl LocalBackend {
|
||||
pub fn new(postgres_addr: SocketAddr) -> Self {
|
||||
LocalBackend {
|
||||
jwks_cache: JwkCache::default(),
|
||||
postgres_addr,
|
||||
node_info: NodeInfo {
|
||||
config: {
|
||||
let mut cfg = ConnCfg::new();
|
||||
@@ -47,7 +45,7 @@ impl LocalBackend {
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct StaticAuthRules;
|
||||
pub(crate) struct StaticAuthRules;
|
||||
|
||||
pub static JWKS_ROLE_MAP: ArcSwapOption<JwksRoleSettings> = ArcSwapOption::const_empty();
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ use tokio_postgres::config::SslMode;
|
||||
use tracing::{info, info_span};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum LinkAuthError {
|
||||
pub(crate) enum WebAuthError {
|
||||
#[error(transparent)]
|
||||
WaiterRegister(#[from] waiters::RegisterError),
|
||||
|
||||
@@ -24,18 +24,18 @@ pub enum LinkAuthError {
|
||||
Io(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
impl UserFacingError for LinkAuthError {
|
||||
impl UserFacingError for WebAuthError {
|
||||
fn to_string_client(&self) -> String {
|
||||
"Internal error".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
impl ReportableError for LinkAuthError {
|
||||
impl ReportableError for WebAuthError {
|
||||
fn get_error_kind(&self) -> crate::error::ErrorKind {
|
||||
match self {
|
||||
LinkAuthError::WaiterRegister(_) => crate::error::ErrorKind::Service,
|
||||
LinkAuthError::WaiterWait(_) => crate::error::ErrorKind::Service,
|
||||
LinkAuthError::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
Self::WaiterRegister(_) => crate::error::ErrorKind::Service,
|
||||
Self::WaiterWait(_) => crate::error::ErrorKind::Service,
|
||||
Self::Io(_) => crate::error::ErrorKind::ClientDisconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -52,7 +52,7 @@ fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn new_psql_session_id() -> String {
|
||||
pub(crate) fn new_psql_session_id() -> String {
|
||||
hex::encode(rand::random::<[u8; 8]>())
|
||||
}
|
||||
|
||||
@@ -74,7 +74,7 @@ pub(super) async fn authenticate(
|
||||
}
|
||||
};
|
||||
|
||||
let span = info_span!("link", psql_session_id = &psql_session_id);
|
||||
let span = info_span!("web", psql_session_id = &psql_session_id);
|
||||
let greeting = hello_message(link_uri, &psql_session_id);
|
||||
|
||||
// Give user a URL to spawn a new database.
|
||||
@@ -87,7 +87,7 @@ pub(super) async fn authenticate(
|
||||
|
||||
// Wait for web console response (see `mgmt`).
|
||||
info!(parent: &span, "waiting for console's reply...");
|
||||
let db_info = waiter.await.map_err(LinkAuthError::from)?;
|
||||
let db_info = waiter.await.map_err(WebAuthError::from)?;
|
||||
|
||||
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
|
||||
|
||||
@@ -16,7 +16,7 @@ use thiserror::Error;
|
||||
use tracing::{info, warn};
|
||||
|
||||
#[derive(Debug, Error, PartialEq, Eq, Clone)]
|
||||
pub enum ComputeUserInfoParseError {
|
||||
pub(crate) enum ComputeUserInfoParseError {
|
||||
#[error("Parameter '{0}' is missing in startup packet.")]
|
||||
MissingKey(&'static str),
|
||||
|
||||
@@ -51,20 +51,20 @@ impl ReportableError for ComputeUserInfoParseError {
|
||||
/// Various client credentials which we use for authentication.
|
||||
/// Note that we don't store any kind of client key or password here.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ComputeUserInfoMaybeEndpoint {
|
||||
pub user: RoleName,
|
||||
pub endpoint_id: Option<EndpointId>,
|
||||
pub options: NeonOptions,
|
||||
pub(crate) struct ComputeUserInfoMaybeEndpoint {
|
||||
pub(crate) user: RoleName,
|
||||
pub(crate) endpoint_id: Option<EndpointId>,
|
||||
pub(crate) options: NeonOptions,
|
||||
}
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
#[inline]
|
||||
pub fn endpoint(&self) -> Option<&str> {
|
||||
pub(crate) fn endpoint(&self) -> Option<&str> {
|
||||
self.endpoint_id.as_deref()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn endpoint_sni(
|
||||
pub(crate) fn endpoint_sni(
|
||||
sni: &str,
|
||||
common_names: &HashSet<String>,
|
||||
) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
|
||||
@@ -83,7 +83,7 @@ pub fn endpoint_sni(
|
||||
}
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
pub fn parse(
|
||||
pub(crate) fn parse(
|
||||
ctx: &RequestMonitoring,
|
||||
params: &StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
@@ -130,9 +130,12 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
}))
|
||||
}
|
||||
// Invariant: project name may not contain certain characters.
|
||||
(a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
|
||||
false => Err(ComputeUserInfoParseError::MalformedProjectName(name)),
|
||||
true => Ok(name),
|
||||
(a, b) => a.or(b).map(|name| {
|
||||
if project_name_valid(name.as_ref()) {
|
||||
Ok(name)
|
||||
} else {
|
||||
Err(ComputeUserInfoParseError::MalformedProjectName(name))
|
||||
}
|
||||
}),
|
||||
}
|
||||
.transpose()?;
|
||||
@@ -170,12 +173,12 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
|
||||
pub(crate) fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &[IpPattern]) -> bool {
|
||||
ip_list.is_empty() || ip_list.iter().any(|pattern| check_ip(peer_addr, pattern))
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
pub enum IpPattern {
|
||||
pub(crate) enum IpPattern {
|
||||
Subnet(ipnet::IpNet),
|
||||
Range(IpAddr, IpAddr),
|
||||
Single(IpAddr),
|
||||
|
||||
@@ -17,17 +17,20 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
|
||||
/// Every authentication selector is supposed to implement this trait.
|
||||
pub trait AuthMethod {
|
||||
pub(crate) trait AuthMethod {
|
||||
/// Any authentication selector should provide initial backend message
|
||||
/// containing auth method name and parameters, e.g. md5 salt.
|
||||
fn first_message(&self, channel_binding: bool) -> BeMessage<'_>;
|
||||
}
|
||||
|
||||
/// Initial state of [`AuthFlow`].
|
||||
pub struct Begin;
|
||||
pub(crate) struct Begin;
|
||||
|
||||
/// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`].
|
||||
pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a RequestMonitoring);
|
||||
pub(crate) struct Scram<'a>(
|
||||
pub(crate) &'a scram::ServerSecret,
|
||||
pub(crate) &'a RequestMonitoring,
|
||||
);
|
||||
|
||||
impl AuthMethod for Scram<'_> {
|
||||
#[inline(always)]
|
||||
@@ -44,7 +47,7 @@ impl AuthMethod for Scram<'_> {
|
||||
|
||||
/// Use an ad hoc auth flow (for clients which don't support SNI) proposed in
|
||||
/// <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
|
||||
pub struct PasswordHack;
|
||||
pub(crate) struct PasswordHack;
|
||||
|
||||
impl AuthMethod for PasswordHack {
|
||||
#[inline(always)]
|
||||
@@ -55,10 +58,10 @@ impl AuthMethod for PasswordHack {
|
||||
|
||||
/// Use clear-text password auth called `password` in docs
|
||||
/// <https://www.postgresql.org/docs/current/auth-password.html>
|
||||
pub struct CleartextPassword {
|
||||
pub pool: Arc<ThreadPool>,
|
||||
pub endpoint: EndpointIdInt,
|
||||
pub secret: AuthSecret,
|
||||
pub(crate) struct CleartextPassword {
|
||||
pub(crate) pool: Arc<ThreadPool>,
|
||||
pub(crate) endpoint: EndpointIdInt,
|
||||
pub(crate) secret: AuthSecret,
|
||||
}
|
||||
|
||||
impl AuthMethod for CleartextPassword {
|
||||
@@ -70,7 +73,7 @@ impl AuthMethod for CleartextPassword {
|
||||
|
||||
/// This wrapper for [`PqStream`] performs client authentication.
|
||||
#[must_use]
|
||||
pub struct AuthFlow<'a, S, State> {
|
||||
pub(crate) struct AuthFlow<'a, S, State> {
|
||||
/// The underlying stream which implements libpq's protocol.
|
||||
stream: &'a mut PqStream<Stream<S>>,
|
||||
/// State might contain ancillary data (see [`Self::begin`]).
|
||||
@@ -81,7 +84,7 @@ pub struct AuthFlow<'a, S, State> {
|
||||
/// Initial state of the stream wrapper.
|
||||
impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
|
||||
/// Create a new wrapper for client authentication.
|
||||
pub fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
|
||||
pub(crate) fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
|
||||
let tls_server_end_point = stream.get_ref().tls_server_end_point();
|
||||
|
||||
Self {
|
||||
@@ -92,7 +95,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
|
||||
}
|
||||
|
||||
/// Move to the next step by sending auth method's name & params to client.
|
||||
pub async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
|
||||
pub(crate) async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
|
||||
self.stream
|
||||
.write_message(&method.first_message(self.tls_server_end_point.supported()))
|
||||
.await?;
|
||||
@@ -107,7 +110,7 @@ impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn get_password(self) -> super::Result<PasswordHackPayload> {
|
||||
pub(crate) async fn get_password(self) -> super::Result<PasswordHackPayload> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
@@ -126,7 +129,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<ComputeCredentialKeys>> {
|
||||
let msg = self.stream.read_password_message().await?;
|
||||
let password = msg
|
||||
.strip_suffix(&[0])
|
||||
@@ -151,7 +154,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, CleartextPassword> {
|
||||
/// Stream wrapper for handling [SCRAM](crate::scram) auth.
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
|
||||
/// Perform user authentication. Raise an error in case authentication failed.
|
||||
pub async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
|
||||
pub(crate) async fn authenticate(self) -> super::Result<sasl::Outcome<scram::ScramKey>> {
|
||||
let Scram(secret, ctx) = self.state;
|
||||
|
||||
// pause the timer while we communicate with the client
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
//! Payload for ad hoc authentication method for clients that don't support SNI.
|
||||
//! See the `impl` for [`super::backend::BackendType<ClientCredentials>`].
|
||||
//! See the `impl` for [`super::backend::Backend<ClientCredentials>`].
|
||||
//! Read more: <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
|
||||
//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
|
||||
|
||||
@@ -7,13 +7,13 @@ use bstr::ByteSlice;
|
||||
|
||||
use crate::EndpointId;
|
||||
|
||||
pub struct PasswordHackPayload {
|
||||
pub endpoint: EndpointId,
|
||||
pub password: Vec<u8>,
|
||||
pub(crate) struct PasswordHackPayload {
|
||||
pub(crate) endpoint: EndpointId,
|
||||
pub(crate) password: Vec<u8>,
|
||||
}
|
||||
|
||||
impl PasswordHackPayload {
|
||||
pub fn parse(bytes: &[u8]) -> Option<Self> {
|
||||
pub(crate) fn parse(bytes: &[u8]) -> Option<Self> {
|
||||
// The format is `project=<utf-8>;<password-bytes>` or `project=<utf-8>$<password-bytes>`.
|
||||
let separators = [";", "$"];
|
||||
for sep in separators {
|
||||
@@ -30,7 +30,7 @@ impl PasswordHackPayload {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_endpoint_param(bytes: &str) -> Option<&str> {
|
||||
pub(crate) fn parse_endpoint_param(bytes: &str) -> Option<&str> {
|
||||
bytes
|
||||
.strip_prefix("project=")
|
||||
.or_else(|| bytes.strip_prefix("endpoint="))
|
||||
|
||||
@@ -212,7 +212,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig
|
||||
|
||||
Ok(Box::leak(Box::new(ProxyConfig {
|
||||
tls_config: None,
|
||||
auth_backend: proxy::auth::BackendType::Local(proxy::auth::backend::MaybeOwned::Owned(
|
||||
auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned(
|
||||
LocalBackend::new(args.compute),
|
||||
)),
|
||||
metric_collection: None,
|
||||
|
||||
@@ -60,11 +60,14 @@ use clap::{Parser, ValueEnum};
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[derive(Clone, Debug, ValueEnum)]
|
||||
enum AuthBackend {
|
||||
enum AuthBackendType {
|
||||
Console,
|
||||
#[cfg(feature = "testing")]
|
||||
Postgres,
|
||||
Link,
|
||||
// clap only shows the name, not the alias, in usage text.
|
||||
// TODO: swap name/alias and deprecate "link"
|
||||
#[value(name("link"), alias("web"))]
|
||||
Web,
|
||||
}
|
||||
|
||||
/// Neon proxy/router
|
||||
@@ -77,8 +80,8 @@ struct ProxyCliArgs {
|
||||
/// listen for incoming client connections on ip:port
|
||||
#[clap(short, long, default_value = "127.0.0.1:4432")]
|
||||
proxy: String,
|
||||
#[clap(value_enum, long, default_value_t = AuthBackend::Link)]
|
||||
auth_backend: AuthBackend,
|
||||
#[clap(value_enum, long, default_value_t = AuthBackendType::Web)]
|
||||
auth_backend: AuthBackendType,
|
||||
/// listen for management callback connection on ip:port
|
||||
#[clap(short, long, default_value = "127.0.0.1:7000")]
|
||||
mgmt: String,
|
||||
@@ -88,7 +91,7 @@ struct ProxyCliArgs {
|
||||
/// listen for incoming wss connections on ip:port
|
||||
#[clap(long)]
|
||||
wss: Option<String>,
|
||||
/// redirect unauthenticated users to the given uri in case of link auth
|
||||
/// redirect unauthenticated users to the given uri in case of web auth
|
||||
#[clap(short, long, default_value = "http://localhost:3000/psql_session/")]
|
||||
uri: String,
|
||||
/// cloud API endpoint for authenticating users
|
||||
@@ -470,7 +473,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
));
|
||||
}
|
||||
|
||||
if let auth::BackendType::Console(api, _) = &config.auth_backend {
|
||||
if let auth::Backend::Console(api, _) = &config.auth_backend {
|
||||
if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
|
||||
match (redis_notifications_client, regional_redis_client.clone()) {
|
||||
(None, None) => {}
|
||||
@@ -575,7 +578,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
}
|
||||
|
||||
let auth_backend = match &args.auth_backend {
|
||||
AuthBackend::Console => {
|
||||
AuthBackendType::Console => {
|
||||
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
|
||||
let project_info_cache_config: ProjectInfoCacheOptions =
|
||||
args.project_info_cache.parse()?;
|
||||
@@ -624,18 +627,18 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
wake_compute_endpoint_rate_limiter,
|
||||
);
|
||||
let api = console::provider::ConsoleBackend::Console(api);
|
||||
auth::BackendType::Console(MaybeOwned::Owned(api), ())
|
||||
auth::Backend::Console(MaybeOwned::Owned(api), ())
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
AuthBackend::Postgres => {
|
||||
AuthBackendType::Postgres => {
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
let api = console::provider::mock::Api::new(url);
|
||||
let api = console::provider::ConsoleBackend::Postgres(api);
|
||||
auth::BackendType::Console(MaybeOwned::Owned(api), ())
|
||||
auth::Backend::Console(MaybeOwned::Owned(api), ())
|
||||
}
|
||||
AuthBackend::Link => {
|
||||
AuthBackendType::Web => {
|
||||
let url = args.uri.parse()?;
|
||||
auth::BackendType::Link(MaybeOwned::Owned(url), ())
|
||||
auth::Backend::Web(MaybeOwned::Owned(url), ())
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
pub mod common;
|
||||
pub mod endpoints;
|
||||
pub mod project_info;
|
||||
pub(crate) mod common;
|
||||
pub(crate) mod endpoints;
|
||||
pub(crate) mod project_info;
|
||||
mod timed_lru;
|
||||
|
||||
pub use common::{Cache, Cached};
|
||||
pub use timed_lru::TimedLru;
|
||||
pub(crate) use common::{Cache, Cached};
|
||||
pub(crate) use timed_lru::TimedLru;
|
||||
|
||||
18
proxy/src/cache/common.rs
vendored
18
proxy/src/cache/common.rs
vendored
@@ -3,7 +3,7 @@ use std::ops::{Deref, DerefMut};
|
||||
/// A generic trait which exposes types of cache's key and value,
|
||||
/// as well as the notion of cache entry invalidation.
|
||||
/// This is useful for [`Cached`].
|
||||
pub trait Cache {
|
||||
pub(crate) trait Cache {
|
||||
/// Entry's key.
|
||||
type Key;
|
||||
|
||||
@@ -29,21 +29,21 @@ impl<C: Cache> Cache for &C {
|
||||
}
|
||||
|
||||
/// Wrapper for convenient entry invalidation.
|
||||
pub struct Cached<C: Cache, V = <C as Cache>::Value> {
|
||||
pub(crate) struct Cached<C: Cache, V = <C as Cache>::Value> {
|
||||
/// Cache + lookup info.
|
||||
pub token: Option<(C, C::LookupInfo<C::Key>)>,
|
||||
pub(crate) token: Option<(C, C::LookupInfo<C::Key>)>,
|
||||
|
||||
/// The value itself.
|
||||
pub value: V,
|
||||
pub(crate) value: V,
|
||||
}
|
||||
|
||||
impl<C: Cache, V> Cached<C, V> {
|
||||
/// Place any entry into this wrapper; invalidation will be a no-op.
|
||||
pub fn new_uncached(value: V) -> Self {
|
||||
pub(crate) fn new_uncached(value: V) -> Self {
|
||||
Self { token: None, value }
|
||||
}
|
||||
|
||||
pub fn take_value(self) -> (Cached<C, ()>, V) {
|
||||
pub(crate) fn take_value(self) -> (Cached<C, ()>, V) {
|
||||
(
|
||||
Cached {
|
||||
token: self.token,
|
||||
@@ -53,7 +53,7 @@ impl<C: Cache, V> Cached<C, V> {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn map<U>(self, f: impl FnOnce(V) -> U) -> Cached<C, U> {
|
||||
pub(crate) fn map<U>(self, f: impl FnOnce(V) -> U) -> Cached<C, U> {
|
||||
Cached {
|
||||
token: self.token,
|
||||
value: f(self.value),
|
||||
@@ -61,7 +61,7 @@ impl<C: Cache, V> Cached<C, V> {
|
||||
}
|
||||
|
||||
/// Drop this entry from a cache if it's still there.
|
||||
pub fn invalidate(self) -> V {
|
||||
pub(crate) fn invalidate(self) -> V {
|
||||
if let Some((cache, info)) = &self.token {
|
||||
cache.invalidate(info);
|
||||
}
|
||||
@@ -69,7 +69,7 @@ impl<C: Cache, V> Cached<C, V> {
|
||||
}
|
||||
|
||||
/// Tell if this entry is actually cached.
|
||||
pub fn cached(&self) -> bool {
|
||||
pub(crate) fn cached(&self) -> bool {
|
||||
self.token.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
6
proxy/src/cache/endpoints.rs
vendored
6
proxy/src/cache/endpoints.rs
vendored
@@ -28,7 +28,7 @@ use crate::{
|
||||
};
|
||||
|
||||
#[derive(Deserialize, Debug, Clone)]
|
||||
pub struct ControlPlaneEventKey {
|
||||
pub(crate) struct ControlPlaneEventKey {
|
||||
endpoint_created: Option<EndpointCreated>,
|
||||
branch_created: Option<BranchCreated>,
|
||||
project_created: Option<ProjectCreated>,
|
||||
@@ -56,7 +56,7 @@ pub struct EndpointsCache {
|
||||
}
|
||||
|
||||
impl EndpointsCache {
|
||||
pub fn new(config: EndpointCacheConfig) -> Self {
|
||||
pub(crate) fn new(config: EndpointCacheConfig) -> Self {
|
||||
Self {
|
||||
limiter: Arc::new(Mutex::new(GlobalRateLimiter::new(
|
||||
config.limiter_info.clone(),
|
||||
@@ -68,7 +68,7 @@ impl EndpointsCache {
|
||||
ready: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
pub async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
|
||||
pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool {
|
||||
if !self.ready.load(Ordering::Acquire) {
|
||||
return true;
|
||||
}
|
||||
|
||||
32
proxy/src/cache/project_info.rs
vendored
32
proxy/src/cache/project_info.rs
vendored
@@ -24,7 +24,7 @@ use crate::{
|
||||
use super::{Cache, Cached};
|
||||
|
||||
#[async_trait]
|
||||
pub trait ProjectInfoCache {
|
||||
pub(crate) trait ProjectInfoCache {
|
||||
fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
|
||||
fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
|
||||
async fn decrement_active_listeners(&self);
|
||||
@@ -37,7 +37,7 @@ struct Entry<T> {
|
||||
}
|
||||
|
||||
impl<T> Entry<T> {
|
||||
pub fn new(value: T) -> Self {
|
||||
pub(crate) fn new(value: T) -> Self {
|
||||
Self {
|
||||
created_at: Instant::now(),
|
||||
value,
|
||||
@@ -64,7 +64,7 @@ impl EndpointInfo {
|
||||
Some(t) => t < created_at,
|
||||
}
|
||||
}
|
||||
pub fn get_role_secret(
|
||||
pub(crate) fn get_role_secret(
|
||||
&self,
|
||||
role_name: RoleNameInt,
|
||||
valid_since: Instant,
|
||||
@@ -81,7 +81,7 @@ impl EndpointInfo {
|
||||
None
|
||||
}
|
||||
|
||||
pub fn get_allowed_ips(
|
||||
pub(crate) fn get_allowed_ips(
|
||||
&self,
|
||||
valid_since: Instant,
|
||||
ignore_cache_since: Option<Instant>,
|
||||
@@ -96,10 +96,10 @@ impl EndpointInfo {
|
||||
}
|
||||
None
|
||||
}
|
||||
pub fn invalidate_allowed_ips(&mut self) {
|
||||
pub(crate) fn invalidate_allowed_ips(&mut self) {
|
||||
self.allowed_ips = None;
|
||||
}
|
||||
pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
|
||||
pub(crate) fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
|
||||
self.secret.remove(&role_name);
|
||||
}
|
||||
}
|
||||
@@ -178,7 +178,7 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
|
||||
}
|
||||
|
||||
impl ProjectInfoCacheImpl {
|
||||
pub fn new(config: ProjectInfoCacheOptions) -> Self {
|
||||
pub(crate) fn new(config: ProjectInfoCacheOptions) -> Self {
|
||||
Self {
|
||||
cache: DashMap::new(),
|
||||
project2ep: DashMap::new(),
|
||||
@@ -189,7 +189,7 @@ impl ProjectInfoCacheImpl {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_role_secret(
|
||||
pub(crate) fn get_role_secret(
|
||||
&self,
|
||||
endpoint_id: &EndpointId,
|
||||
role_name: &RoleName,
|
||||
@@ -212,7 +212,7 @@ impl ProjectInfoCacheImpl {
|
||||
}
|
||||
Some(Cached::new_uncached(value))
|
||||
}
|
||||
pub fn get_allowed_ips(
|
||||
pub(crate) fn get_allowed_ips(
|
||||
&self,
|
||||
endpoint_id: &EndpointId,
|
||||
) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
|
||||
@@ -230,7 +230,7 @@ impl ProjectInfoCacheImpl {
|
||||
}
|
||||
Some(Cached::new_uncached(value))
|
||||
}
|
||||
pub fn insert_role_secret(
|
||||
pub(crate) fn insert_role_secret(
|
||||
&self,
|
||||
project_id: ProjectIdInt,
|
||||
endpoint_id: EndpointIdInt,
|
||||
@@ -247,7 +247,7 @@ impl ProjectInfoCacheImpl {
|
||||
entry.secret.insert(role_name, secret.into());
|
||||
}
|
||||
}
|
||||
pub fn insert_allowed_ips(
|
||||
pub(crate) fn insert_allowed_ips(
|
||||
&self,
|
||||
project_id: ProjectIdInt,
|
||||
endpoint_id: EndpointIdInt,
|
||||
@@ -274,13 +274,13 @@ impl ProjectInfoCacheImpl {
|
||||
let ttl_disabled_since_us = self
|
||||
.ttl_disabled_since_us
|
||||
.load(std::sync::atomic::Ordering::Relaxed);
|
||||
let ignore_cache_since = if ttl_disabled_since_us != u64::MAX {
|
||||
let ignore_cache_since = if ttl_disabled_since_us == u64::MAX {
|
||||
None
|
||||
} else {
|
||||
let ignore_cache_since = self.start_time + Duration::from_micros(ttl_disabled_since_us);
|
||||
// We are fine if entry is not older than ttl or was added before we are getting notifications.
|
||||
valid_since = valid_since.min(ignore_cache_since);
|
||||
Some(ignore_cache_since)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
(valid_since, ignore_cache_since)
|
||||
}
|
||||
@@ -306,7 +306,7 @@ impl ProjectInfoCacheImpl {
|
||||
let mut removed = 0;
|
||||
let shard = self.project2ep.shards()[shard].write();
|
||||
for (_, endpoints) in shard.iter() {
|
||||
for endpoint in endpoints.get().iter() {
|
||||
for endpoint in endpoints.get() {
|
||||
self.cache.remove(endpoint);
|
||||
removed += 1;
|
||||
}
|
||||
@@ -319,7 +319,7 @@ impl ProjectInfoCacheImpl {
|
||||
|
||||
/// Lookup info for project info cache.
|
||||
/// This is used to invalidate cache entries.
|
||||
pub struct CachedLookupInfo {
|
||||
pub(crate) struct CachedLookupInfo {
|
||||
/// Search by this key.
|
||||
endpoint_id: EndpointIdInt,
|
||||
lookup_type: LookupType,
|
||||
|
||||
45
proxy/src/cache/timed_lru.rs
vendored
45
proxy/src/cache/timed_lru.rs
vendored
@@ -39,7 +39,7 @@ use super::{common::Cached, *};
|
||||
///
|
||||
/// * It's possible for an entry that has not yet expired entry to be evicted
|
||||
/// before expired items. That's a bit wasteful, but probably fine in practice.
|
||||
pub struct TimedLru<K, V> {
|
||||
pub(crate) struct TimedLru<K, V> {
|
||||
/// Cache's name for tracing.
|
||||
name: &'static str,
|
||||
|
||||
@@ -72,7 +72,7 @@ struct Entry<T> {
|
||||
|
||||
impl<K: Hash + Eq, V> TimedLru<K, V> {
|
||||
/// Construct a new LRU cache with timed entries.
|
||||
pub fn new(
|
||||
pub(crate) fn new(
|
||||
name: &'static str,
|
||||
capacity: usize,
|
||||
ttl: Duration,
|
||||
@@ -207,11 +207,11 @@ impl<K: Hash + Eq, V> TimedLru<K, V> {
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
|
||||
pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) {
|
||||
pub(crate) fn insert_ttl(&self, key: K, value: V, ttl: Duration) {
|
||||
self.insert_raw_ttl(key, value, ttl, false);
|
||||
}
|
||||
|
||||
pub fn insert_unit(&self, key: K, value: V) -> (Option<V>, Cached<&Self, ()>) {
|
||||
pub(crate) fn insert_unit(&self, key: K, value: V) -> (Option<V>, Cached<&Self, ()>) {
|
||||
let (created_at, old) = self.insert_raw(key.clone(), value);
|
||||
|
||||
let cached = Cached {
|
||||
@@ -221,22 +221,11 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
|
||||
|
||||
(old, cached)
|
||||
}
|
||||
|
||||
pub fn insert(&self, key: K, value: V) -> (Option<V>, Cached<&Self>) {
|
||||
let (created_at, old) = self.insert_raw(key.clone(), value.clone());
|
||||
|
||||
let cached = Cached {
|
||||
token: Some((self, LookupInfo { created_at, key })),
|
||||
value,
|
||||
};
|
||||
|
||||
(old, cached)
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
|
||||
/// Retrieve a cached entry in convenient wrapper.
|
||||
pub fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
|
||||
pub(crate) fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
|
||||
where
|
||||
K: Borrow<Q> + Clone,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
@@ -253,32 +242,10 @@ impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Retrieve a cached entry in convenient wrapper, ignoring its TTL.
|
||||
pub fn get_ignoring_ttl<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
let mut cache = self.cache.lock();
|
||||
cache
|
||||
.get(key)
|
||||
.map(|entry| Cached::new_uncached(entry.value.clone()))
|
||||
}
|
||||
|
||||
/// Remove an entry from the cache.
|
||||
pub fn remove<Q>(&self, key: &Q) -> Option<V>
|
||||
where
|
||||
K: Borrow<Q> + Clone,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
let mut cache = self.cache.lock();
|
||||
cache.remove(key).map(|entry| entry.value)
|
||||
}
|
||||
}
|
||||
|
||||
/// Lookup information for key invalidation.
|
||||
pub struct LookupInfo<K> {
|
||||
pub(crate) struct LookupInfo<K> {
|
||||
/// Time of creation of a cache [`Entry`].
|
||||
/// We use this during invalidation lookups to prevent eviction of a newer
|
||||
/// entry sharing the same key (it might've been inserted by a different
|
||||
|
||||
@@ -18,7 +18,7 @@ use crate::{
|
||||
|
||||
pub type CancelMap = Arc<DashMap<CancelKeyData, Option<CancelClosure>>>;
|
||||
pub type CancellationHandlerMain = CancellationHandler<Option<Arc<Mutex<RedisPublisherClient>>>>;
|
||||
pub type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;
|
||||
pub(crate) type CancellationHandlerMainInternal = Option<Arc<Mutex<RedisPublisherClient>>>;
|
||||
|
||||
/// Enables serving `CancelRequest`s.
|
||||
///
|
||||
@@ -32,7 +32,7 @@ pub struct CancellationHandler<P> {
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum CancelError {
|
||||
pub(crate) enum CancelError {
|
||||
#[error("{0}")]
|
||||
IO(#[from] std::io::Error),
|
||||
#[error("{0}")]
|
||||
@@ -53,7 +53,7 @@ impl ReportableError for CancelError {
|
||||
|
||||
impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
|
||||
pub fn get_session(self: Arc<Self>) -> Session<P> {
|
||||
pub(crate) fn get_session(self: Arc<Self>) -> Session<P> {
|
||||
// HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
|
||||
// expose it and we don't want to do another roundtrip to query
|
||||
// for it. The client will be able to notice that this is not the
|
||||
@@ -81,7 +81,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
}
|
||||
/// Try to cancel a running query for the corresponding connection.
|
||||
/// If the cancellation key is not found, it will be published to Redis.
|
||||
pub async fn cancel_session(
|
||||
pub(crate) async fn cancel_session(
|
||||
&self,
|
||||
key: CancelKeyData,
|
||||
session_id: Uuid,
|
||||
@@ -155,14 +155,14 @@ pub struct CancelClosure {
|
||||
}
|
||||
|
||||
impl CancelClosure {
|
||||
pub fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
|
||||
pub(crate) fn new(socket_addr: SocketAddr, cancel_token: CancelToken) -> Self {
|
||||
Self {
|
||||
socket_addr,
|
||||
cancel_token,
|
||||
}
|
||||
}
|
||||
/// Cancels the query running on user's compute node.
|
||||
pub async fn try_cancel_query(self) -> Result<(), CancelError> {
|
||||
pub(crate) async fn try_cancel_query(self) -> Result<(), CancelError> {
|
||||
let socket = TcpStream::connect(self.socket_addr).await?;
|
||||
self.cancel_token.cancel_query_raw(socket, NoTls).await?;
|
||||
info!("query was cancelled");
|
||||
@@ -171,7 +171,7 @@ impl CancelClosure {
|
||||
}
|
||||
|
||||
/// Helper for registering query cancellation tokens.
|
||||
pub struct Session<P> {
|
||||
pub(crate) struct Session<P> {
|
||||
/// The user-facing key identifying this session.
|
||||
key: CancelKeyData,
|
||||
/// The [`CancelMap`] this session belongs to.
|
||||
@@ -181,7 +181,7 @@ pub struct Session<P> {
|
||||
impl<P> Session<P> {
|
||||
/// Store the cancel token for the given session.
|
||||
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
|
||||
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
pub(crate) fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
info!("enabling query cancellation for this session");
|
||||
self.cancellation_handler
|
||||
.map
|
||||
@@ -220,7 +220,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn cancel_session_noop_regression() {
|
||||
let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local);
|
||||
let handler =
|
||||
CancellationHandler::<()>::new(CancelMap::default(), CancellationSource::Local);
|
||||
handler
|
||||
.cancel_session(
|
||||
CancelKeyData {
|
||||
|
||||
@@ -23,7 +23,7 @@ use tracing::{error, info, warn};
|
||||
const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node";
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ConnectionError {
|
||||
pub(crate) enum ConnectionError {
|
||||
/// This error doesn't seem to reveal any secrets; for instance,
|
||||
/// `tokio_postgres::error::Kind` doesn't contain ip addresses and such.
|
||||
#[error("{COULD_NOT_CONNECT}: {0}")]
|
||||
@@ -86,22 +86,22 @@ impl ReportableError for ConnectionError {
|
||||
}
|
||||
|
||||
/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
|
||||
pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
|
||||
pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>;
|
||||
|
||||
/// A config for establishing a connection to compute node.
|
||||
/// Eventually, `tokio_postgres` will be replaced with something better.
|
||||
/// Newtype allows us to implement methods on top of it.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct ConnCfg(Box<tokio_postgres::Config>);
|
||||
pub(crate) struct ConnCfg(Box<tokio_postgres::Config>);
|
||||
|
||||
/// Creation and initialization routines.
|
||||
impl ConnCfg {
|
||||
pub fn new() -> Self {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Reuse password or auth keys from the other config.
|
||||
pub fn reuse_password(&mut self, other: Self) {
|
||||
pub(crate) fn reuse_password(&mut self, other: Self) {
|
||||
if let Some(password) = other.get_password() {
|
||||
self.password(password);
|
||||
}
|
||||
@@ -111,7 +111,7 @@ impl ConnCfg {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_host(&self) -> Result<Host, WakeComputeError> {
|
||||
pub(crate) fn get_host(&self) -> Result<Host, WakeComputeError> {
|
||||
match self.0.get_hosts() {
|
||||
[tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()),
|
||||
// we should not have multiple address or unix addresses.
|
||||
@@ -122,15 +122,15 @@ impl ConnCfg {
|
||||
}
|
||||
|
||||
/// Apply startup message params to the connection config.
|
||||
pub fn set_startup_params(&mut self, params: &StartupMessageParams) {
|
||||
pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) {
|
||||
// Only set `user` if it's not present in the config.
|
||||
// Link auth flow takes username from the console's response.
|
||||
// Web auth flow takes username from the console's response.
|
||||
if let (None, Some(user)) = (self.get_user(), params.get("user")) {
|
||||
self.user(user);
|
||||
}
|
||||
|
||||
// Only set `dbname` if it's not present in the config.
|
||||
// Link auth flow takes dbname from the console's response.
|
||||
// Web auth flow takes dbname from the console's response.
|
||||
if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) {
|
||||
self.dbname(dbname);
|
||||
}
|
||||
@@ -255,25 +255,25 @@ impl ConnCfg {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PostgresConnection {
|
||||
pub(crate) struct PostgresConnection {
|
||||
/// Socket connected to a compute node.
|
||||
pub stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
|
||||
pub(crate) stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream<
|
||||
tokio::net::TcpStream,
|
||||
tokio_postgres_rustls::RustlsStream<tokio::net::TcpStream>,
|
||||
>,
|
||||
/// PostgreSQL connection parameters.
|
||||
pub params: std::collections::HashMap<String, String>,
|
||||
pub(crate) params: std::collections::HashMap<String, String>,
|
||||
/// Query cancellation token.
|
||||
pub cancel_closure: CancelClosure,
|
||||
pub(crate) cancel_closure: CancelClosure,
|
||||
/// Labels for proxy's metrics.
|
||||
pub aux: MetricsAuxInfo,
|
||||
pub(crate) aux: MetricsAuxInfo,
|
||||
|
||||
_guage: NumDbConnectionsGuard<'static>,
|
||||
}
|
||||
|
||||
impl ConnCfg {
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(
|
||||
pub(crate) async fn connect(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
allow_self_signed_compute: bool,
|
||||
@@ -286,7 +286,7 @@ impl ConnCfg {
|
||||
|
||||
let client_config = if allow_self_signed_compute {
|
||||
// Allow all certificates for creating the connection
|
||||
let verifier = Arc::new(AcceptEverythingVerifier) as Arc<dyn ServerCertVerifier>;
|
||||
let verifier = Arc::new(AcceptEverythingVerifier);
|
||||
rustls::ClientConfig::builder()
|
||||
.dangerous()
|
||||
.with_custom_certificate_verifier(verifier)
|
||||
|
||||
@@ -25,7 +25,7 @@ use x509_parser::oid_registry;
|
||||
|
||||
pub struct ProxyConfig {
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
pub auth_backend: auth::BackendType<'static, (), ()>,
|
||||
pub auth_backend: auth::Backend<'static, (), ()>,
|
||||
pub metric_collection: Option<MetricCollectionConfig>,
|
||||
pub allow_self_signed_compute: bool,
|
||||
pub http_config: HttpConfig,
|
||||
@@ -247,7 +247,7 @@ impl CertResolver {
|
||||
|
||||
let common_name = pem.subject().to_string();
|
||||
|
||||
// We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as
|
||||
// We only use non-wildcard certificates in web auth proxy so it seems okay to treat them the same as
|
||||
// wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so
|
||||
// verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names
|
||||
// and passed None instead, which blows up number of cases downstream code should handle. Proper coding
|
||||
@@ -318,7 +318,7 @@ impl CertResolver {
|
||||
// a) Instead of multi-cert approach use single cert with extra
|
||||
// domains listed in Subject Alternative Name (SAN).
|
||||
// b) Deploy separate proxy instances for extra domains.
|
||||
self.default.as_ref().cloned()
|
||||
self.default.clone()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo};
|
||||
|
||||
/// Various cache-related types.
|
||||
pub mod caches {
|
||||
pub use super::provider::{ApiCaches, NodeInfoCache};
|
||||
pub use super::provider::ApiCaches;
|
||||
}
|
||||
|
||||
/// Various cache-related types.
|
||||
|
||||
@@ -12,22 +12,22 @@ use crate::RoleName;
|
||||
/// Generic error response with human-readable description.
|
||||
/// Note that we can't always present it to user as is.
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct ConsoleError {
|
||||
pub error: Box<str>,
|
||||
pub(crate) struct ConsoleError {
|
||||
pub(crate) error: Box<str>,
|
||||
#[serde(skip)]
|
||||
pub http_status_code: http::StatusCode,
|
||||
pub status: Option<Status>,
|
||||
pub(crate) http_status_code: http::StatusCode,
|
||||
pub(crate) status: Option<Status>,
|
||||
}
|
||||
|
||||
impl ConsoleError {
|
||||
pub fn get_reason(&self) -> Reason {
|
||||
pub(crate) fn get_reason(&self) -> Reason {
|
||||
self.status
|
||||
.as_ref()
|
||||
.and_then(|s| s.details.error_info.as_ref())
|
||||
.map_or(Reason::Unknown, |e| e.reason)
|
||||
}
|
||||
|
||||
pub fn get_user_facing_message(&self) -> String {
|
||||
pub(crate) fn get_user_facing_message(&self) -> String {
|
||||
use super::provider::errors::REQUEST_FAILED;
|
||||
self.status
|
||||
.as_ref()
|
||||
@@ -88,27 +88,28 @@ impl CouldRetry for ConsoleError {
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct Status {
|
||||
pub code: Box<str>,
|
||||
pub message: Box<str>,
|
||||
pub details: Details,
|
||||
#[allow(dead_code)]
|
||||
pub(crate) struct Status {
|
||||
pub(crate) code: Box<str>,
|
||||
pub(crate) message: Box<str>,
|
||||
pub(crate) details: Details,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct Details {
|
||||
pub error_info: Option<ErrorInfo>,
|
||||
pub retry_info: Option<RetryInfo>,
|
||||
pub user_facing_message: Option<UserFacingMessage>,
|
||||
pub(crate) struct Details {
|
||||
pub(crate) error_info: Option<ErrorInfo>,
|
||||
pub(crate) retry_info: Option<RetryInfo>,
|
||||
pub(crate) user_facing_message: Option<UserFacingMessage>,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, Deserialize)]
|
||||
pub struct ErrorInfo {
|
||||
pub reason: Reason,
|
||||
pub(crate) struct ErrorInfo {
|
||||
pub(crate) reason: Reason,
|
||||
// Schema could also have `metadata` field, but it's not structured. Skip it for now.
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Deserialize, Default)]
|
||||
pub enum Reason {
|
||||
pub(crate) enum Reason {
|
||||
/// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles.
|
||||
#[serde(rename = "ROLE_PROTECTED")]
|
||||
RoleProtected,
|
||||
@@ -168,7 +169,7 @@ pub enum Reason {
|
||||
}
|
||||
|
||||
impl Reason {
|
||||
pub fn is_not_found(&self) -> bool {
|
||||
pub(crate) fn is_not_found(self) -> bool {
|
||||
matches!(
|
||||
self,
|
||||
Reason::ResourceNotFound
|
||||
@@ -178,7 +179,7 @@ impl Reason {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn can_retry(&self) -> bool {
|
||||
pub(crate) fn can_retry(self) -> bool {
|
||||
match self {
|
||||
// do not retry role protected errors
|
||||
// not a transitive error
|
||||
@@ -208,22 +209,23 @@ impl Reason {
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, Deserialize)]
|
||||
pub struct RetryInfo {
|
||||
pub retry_delay_ms: u64,
|
||||
#[allow(dead_code)]
|
||||
pub(crate) struct RetryInfo {
|
||||
pub(crate) retry_delay_ms: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct UserFacingMessage {
|
||||
pub message: Box<str>,
|
||||
pub(crate) struct UserFacingMessage {
|
||||
pub(crate) message: Box<str>,
|
||||
}
|
||||
|
||||
/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`].
|
||||
/// Returned by the `/proxy_get_role_secret` API method.
|
||||
#[derive(Deserialize)]
|
||||
pub struct GetRoleSecret {
|
||||
pub role_secret: Box<str>,
|
||||
pub allowed_ips: Option<Vec<IpPattern>>,
|
||||
pub project_id: Option<ProjectIdInt>,
|
||||
pub(crate) struct GetRoleSecret {
|
||||
pub(crate) role_secret: Box<str>,
|
||||
pub(crate) allowed_ips: Option<Vec<IpPattern>>,
|
||||
pub(crate) project_id: Option<ProjectIdInt>,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
@@ -236,21 +238,21 @@ impl fmt::Debug for GetRoleSecret {
|
||||
/// Response which holds compute node's `host:port` pair.
|
||||
/// Returned by the `/proxy_wake_compute` API method.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct WakeCompute {
|
||||
pub address: Box<str>,
|
||||
pub aux: MetricsAuxInfo,
|
||||
pub(crate) struct WakeCompute {
|
||||
pub(crate) address: Box<str>,
|
||||
pub(crate) aux: MetricsAuxInfo,
|
||||
}
|
||||
|
||||
/// Async response which concludes the link auth flow.
|
||||
/// Async response which concludes the web auth flow.
|
||||
/// Also known as `kickResponse` in the console.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct KickSession<'a> {
|
||||
pub(crate) struct KickSession<'a> {
|
||||
/// Session ID is assigned by the proxy.
|
||||
pub session_id: &'a str,
|
||||
pub(crate) session_id: &'a str,
|
||||
|
||||
/// Compute node connection params.
|
||||
#[serde(deserialize_with = "KickSession::parse_db_info")]
|
||||
pub result: DatabaseInfo,
|
||||
pub(crate) result: DatabaseInfo,
|
||||
}
|
||||
|
||||
impl KickSession<'_> {
|
||||
@@ -273,15 +275,15 @@ impl KickSession<'_> {
|
||||
|
||||
/// Compute node connection params.
|
||||
#[derive(Deserialize)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: Box<str>,
|
||||
pub port: u16,
|
||||
pub dbname: Box<str>,
|
||||
pub user: Box<str>,
|
||||
pub(crate) struct DatabaseInfo {
|
||||
pub(crate) host: Box<str>,
|
||||
pub(crate) port: u16,
|
||||
pub(crate) dbname: Box<str>,
|
||||
pub(crate) user: Box<str>,
|
||||
/// Console always provides a password, but it might
|
||||
/// be inconvenient for debug with local PG instance.
|
||||
pub password: Option<Box<str>>,
|
||||
pub aux: MetricsAuxInfo,
|
||||
pub(crate) password: Option<Box<str>>,
|
||||
pub(crate) aux: MetricsAuxInfo,
|
||||
}
|
||||
|
||||
// Manually implement debug to omit sensitive info.
|
||||
@@ -299,12 +301,12 @@ impl fmt::Debug for DatabaseInfo {
|
||||
/// Various labels for prometheus metrics.
|
||||
/// Also known as `ProxyMetricsAuxInfo` in the console.
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct MetricsAuxInfo {
|
||||
pub endpoint_id: EndpointIdInt,
|
||||
pub project_id: ProjectIdInt,
|
||||
pub branch_id: BranchIdInt,
|
||||
pub(crate) struct MetricsAuxInfo {
|
||||
pub(crate) endpoint_id: EndpointIdInt,
|
||||
pub(crate) project_id: ProjectIdInt,
|
||||
pub(crate) branch_id: BranchIdInt,
|
||||
#[serde(default)]
|
||||
pub cold_start_info: ColdStartInfo,
|
||||
pub(crate) cold_start_info: ColdStartInfo,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)]
|
||||
@@ -331,7 +333,7 @@ pub enum ColdStartInfo {
|
||||
}
|
||||
|
||||
impl ColdStartInfo {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
pub(crate) fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
ColdStartInfo::Unknown => "unknown",
|
||||
ColdStartInfo::Warm => "warm",
|
||||
|
||||
@@ -14,18 +14,18 @@ use tracing::{error, info, info_span, Instrument};
|
||||
static CPLANE_WAITERS: Lazy<Waiters<ComputeReady>> = Lazy::new(Default::default);
|
||||
|
||||
/// Give caller an opportunity to wait for the cloud's reply.
|
||||
pub fn get_waiter(
|
||||
pub(crate) fn get_waiter(
|
||||
psql_session_id: impl Into<String>,
|
||||
) -> Result<Waiter<'static, ComputeReady>, waiters::RegisterError> {
|
||||
CPLANE_WAITERS.register(psql_session_id.into())
|
||||
}
|
||||
|
||||
pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> {
|
||||
pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::NotifyError> {
|
||||
CPLANE_WAITERS.notify(psql_session_id, msg)
|
||||
}
|
||||
|
||||
/// Console management API listener task.
|
||||
/// It spawns console response handlers needed for the link auth.
|
||||
/// It spawns console response handlers needed for the web auth.
|
||||
pub async fn task_main(listener: TcpListener) -> anyhow::Result<Infallible> {
|
||||
scopeguard::defer! {
|
||||
info!("mgmt has shut down");
|
||||
@@ -74,7 +74,7 @@ async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> {
|
||||
}
|
||||
|
||||
/// A message received by `mgmt` when a compute node is ready.
|
||||
pub type ComputeReady = DatabaseInfo;
|
||||
pub(crate) type ComputeReady = DatabaseInfo;
|
||||
|
||||
// TODO: replace with an http-based protocol.
|
||||
struct MgmtHandler;
|
||||
|
||||
@@ -23,7 +23,7 @@ use std::{hash::Hash, sync::Arc, time::Duration};
|
||||
use tokio::time::Instant;
|
||||
use tracing::info;
|
||||
|
||||
pub mod errors {
|
||||
pub(crate) mod errors {
|
||||
use crate::{
|
||||
console::messages::{self, ConsoleError, Reason},
|
||||
error::{io_error, ErrorKind, ReportableError, UserFacingError},
|
||||
@@ -34,11 +34,11 @@ pub mod errors {
|
||||
use super::ApiLockError;
|
||||
|
||||
/// A go-to error message which doesn't leak any detail.
|
||||
pub const REQUEST_FAILED: &str = "Console request failed";
|
||||
pub(crate) const REQUEST_FAILED: &str = "Console request failed";
|
||||
|
||||
/// Common console API error.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ApiError {
|
||||
pub(crate) enum ApiError {
|
||||
/// Error returned by the console itself.
|
||||
#[error("{REQUEST_FAILED} with {0}")]
|
||||
Console(ConsoleError),
|
||||
@@ -50,7 +50,7 @@ pub mod errors {
|
||||
|
||||
impl ApiError {
|
||||
/// Returns HTTP status code if it's the reason for failure.
|
||||
pub fn get_reason(&self) -> messages::Reason {
|
||||
pub(crate) fn get_reason(&self) -> messages::Reason {
|
||||
match self {
|
||||
ApiError::Console(e) => e.get_reason(),
|
||||
ApiError::Transport(_) => messages::Reason::Unknown,
|
||||
@@ -146,7 +146,7 @@ pub mod errors {
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum GetAuthInfoError {
|
||||
pub(crate) enum GetAuthInfoError {
|
||||
// We shouldn't include the actual secret here.
|
||||
#[error("Console responded with a malformed auth secret")]
|
||||
BadSecret,
|
||||
@@ -183,7 +183,7 @@ pub mod errors {
|
||||
}
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum WakeComputeError {
|
||||
pub(crate) enum WakeComputeError {
|
||||
#[error("Console responded with a malformed compute address: {0}")]
|
||||
BadComputeAddress(Box<str>),
|
||||
|
||||
@@ -247,7 +247,7 @@ pub mod errors {
|
||||
|
||||
/// Auth secret which is managed by the cloud.
|
||||
#[derive(Clone, Eq, PartialEq, Debug)]
|
||||
pub enum AuthSecret {
|
||||
pub(crate) enum AuthSecret {
|
||||
#[cfg(any(test, feature = "testing"))]
|
||||
/// Md5 hash of user's password.
|
||||
Md5([u8; 16]),
|
||||
@@ -257,32 +257,32 @@ pub enum AuthSecret {
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct AuthInfo {
|
||||
pub secret: Option<AuthSecret>,
|
||||
pub(crate) struct AuthInfo {
|
||||
pub(crate) secret: Option<AuthSecret>,
|
||||
/// List of IP addresses allowed for the autorization.
|
||||
pub allowed_ips: Vec<IpPattern>,
|
||||
pub(crate) allowed_ips: Vec<IpPattern>,
|
||||
/// Project ID. This is used for cache invalidation.
|
||||
pub project_id: Option<ProjectIdInt>,
|
||||
pub(crate) project_id: Option<ProjectIdInt>,
|
||||
}
|
||||
|
||||
/// Info for establishing a connection to a compute node.
|
||||
/// This is what we get after auth succeeded, but not before!
|
||||
#[derive(Clone)]
|
||||
pub struct NodeInfo {
|
||||
pub(crate) struct NodeInfo {
|
||||
/// Compute node connection params.
|
||||
/// It's sad that we have to clone this, but this will improve
|
||||
/// once we migrate to a bespoke connection logic.
|
||||
pub config: compute::ConnCfg,
|
||||
pub(crate) config: compute::ConnCfg,
|
||||
|
||||
/// Labels for proxy's metrics.
|
||||
pub aux: MetricsAuxInfo,
|
||||
pub(crate) aux: MetricsAuxInfo,
|
||||
|
||||
/// Whether we should accept self-signed certificates (for testing)
|
||||
pub allow_self_signed_compute: bool,
|
||||
pub(crate) allow_self_signed_compute: bool,
|
||||
}
|
||||
|
||||
impl NodeInfo {
|
||||
pub async fn connect(
|
||||
pub(crate) async fn connect(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
timeout: Duration,
|
||||
@@ -296,12 +296,12 @@ impl NodeInfo {
|
||||
)
|
||||
.await
|
||||
}
|
||||
pub fn reuse_settings(&mut self, other: Self) {
|
||||
pub(crate) fn reuse_settings(&mut self, other: Self) {
|
||||
self.allow_self_signed_compute = other.allow_self_signed_compute;
|
||||
self.config.reuse_password(other.config);
|
||||
}
|
||||
|
||||
pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
|
||||
pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) {
|
||||
match keys {
|
||||
ComputeCredentialKeys::Password(password) => self.config.password(password),
|
||||
ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys),
|
||||
@@ -310,10 +310,10 @@ impl NodeInfo {
|
||||
}
|
||||
}
|
||||
|
||||
pub type NodeInfoCache = TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ConsoleError>>>;
|
||||
pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
|
||||
pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
|
||||
pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
|
||||
pub(crate) type NodeInfoCache = TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ConsoleError>>>;
|
||||
pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
|
||||
pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option<AuthSecret>>;
|
||||
pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc<Vec<IpPattern>>>;
|
||||
|
||||
/// This will allocate per each call, but the http requests alone
|
||||
/// already require a few allocations, so it should be fine.
|
||||
@@ -350,6 +350,7 @@ pub enum ConsoleBackend {
|
||||
Postgres(mock::Api),
|
||||
/// Internal testing
|
||||
#[cfg(test)]
|
||||
#[allow(private_interfaces)]
|
||||
Test(Box<dyn crate::auth::backend::TestBackend>),
|
||||
}
|
||||
|
||||
@@ -402,7 +403,7 @@ impl Api for ConsoleBackend {
|
||||
/// Various caches for [`console`](super).
|
||||
pub struct ApiCaches {
|
||||
/// Cache for the `wake_compute` API method.
|
||||
pub node_info: NodeInfoCache,
|
||||
pub(crate) node_info: NodeInfoCache,
|
||||
/// Cache which stores project_id -> endpoint_ids mapping.
|
||||
pub project_info: Arc<ProjectInfoCacheImpl>,
|
||||
/// List of all valid endpoints.
|
||||
@@ -439,7 +440,7 @@ pub struct ApiLocks<K> {
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ApiLockError {
|
||||
pub(crate) enum ApiLockError {
|
||||
#[error("timeout acquiring resource permit")]
|
||||
TimeoutError(#[from] tokio::time::error::Elapsed),
|
||||
}
|
||||
@@ -471,7 +472,7 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
|
||||
pub(crate) async fn get_permit(&self, key: &K) -> Result<WakeComputePermit, ApiLockError> {
|
||||
if self.config.initial_limit == 0 {
|
||||
return Ok(WakeComputePermit {
|
||||
permit: Token::disabled(),
|
||||
@@ -531,18 +532,18 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WakeComputePermit {
|
||||
pub(crate) struct WakeComputePermit {
|
||||
permit: Token,
|
||||
}
|
||||
|
||||
impl WakeComputePermit {
|
||||
pub fn should_check_cache(&self) -> bool {
|
||||
pub(crate) fn should_check_cache(&self) -> bool {
|
||||
!self.permit.is_disabled()
|
||||
}
|
||||
pub fn release(self, outcome: Outcome) {
|
||||
pub(crate) fn release(self, outcome: Outcome) {
|
||||
self.permit.release(outcome);
|
||||
}
|
||||
pub fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
|
||||
pub(crate) fn release_result<T, E>(self, res: Result<T, E>) -> Result<T, E> {
|
||||
match res {
|
||||
Ok(_) => self.release(Outcome::Success),
|
||||
Err(_) => self.release(Outcome::Overload),
|
||||
|
||||
@@ -48,7 +48,7 @@ impl Api {
|
||||
Self { endpoint }
|
||||
}
|
||||
|
||||
pub fn url(&self) -> &str {
|
||||
pub(crate) fn url(&self) -> &str {
|
||||
self.endpoint.as_str()
|
||||
}
|
||||
|
||||
@@ -64,7 +64,7 @@ impl Api {
|
||||
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
|
||||
|
||||
tokio::spawn(connection);
|
||||
let secret = match get_execute_postgres_query(
|
||||
let secret = if let Some(entry) = get_execute_postgres_query(
|
||||
&client,
|
||||
"select rolpassword from pg_catalog.pg_authid where rolname = $1",
|
||||
&[&&*user_info.user],
|
||||
@@ -72,15 +72,12 @@ impl Api {
|
||||
)
|
||||
.await?
|
||||
{
|
||||
Some(entry) => {
|
||||
info!("got a secret: {entry}"); // safe since it's not a prod scenario
|
||||
let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram);
|
||||
secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
|
||||
}
|
||||
None => {
|
||||
warn!("user '{}' does not exist", user_info.user);
|
||||
None
|
||||
}
|
||||
info!("got a secret: {entry}"); // safe since it's not a prod scenario
|
||||
let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram);
|
||||
secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
|
||||
} else {
|
||||
warn!("user '{}' does not exist", user_info.user);
|
||||
None
|
||||
};
|
||||
let allowed_ips = match get_execute_postgres_query(
|
||||
&client,
|
||||
@@ -142,12 +139,11 @@ async fn get_execute_postgres_query(
|
||||
let rows = client.query(query, params).await?;
|
||||
|
||||
// We can get at most one row, because `rolname` is unique.
|
||||
let row = match rows.first() {
|
||||
Some(row) => row,
|
||||
let Some(row) = rows.first() else {
|
||||
// This means that the user doesn't exist, so there can be no secret.
|
||||
// However, this is still a *valid* outcome which is very similar
|
||||
// to getting `404 Not found` from the Neon console.
|
||||
None => return Ok(None),
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?;
|
||||
|
||||
@@ -25,8 +25,8 @@ use tracing::{debug, error, info, info_span, warn, Instrument};
|
||||
pub struct Api {
|
||||
endpoint: http::Endpoint,
|
||||
pub caches: &'static ApiCaches,
|
||||
pub locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
pub wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
pub(crate) locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
pub(crate) wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
jwt: String,
|
||||
}
|
||||
|
||||
@@ -38,9 +38,9 @@ impl Api {
|
||||
locks: &'static ApiLocks<EndpointCacheKey>,
|
||||
wake_compute_endpoint_rate_limiter: Arc<WakeComputeRateLimiter>,
|
||||
) -> Self {
|
||||
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
||||
let jwt = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
Err(_) => String::new(),
|
||||
};
|
||||
Self {
|
||||
endpoint,
|
||||
@@ -51,7 +51,7 @@ impl Api {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn url(&self) -> &str {
|
||||
pub(crate) fn url(&self) -> &str {
|
||||
self.endpoint.url().as_str()
|
||||
}
|
||||
|
||||
@@ -96,10 +96,10 @@ impl Api {
|
||||
// Error 404 is special: it's ok not to have a secret.
|
||||
// TODO(anna): retry
|
||||
Err(e) => {
|
||||
if e.get_reason().is_not_found() {
|
||||
return Ok(AuthInfo::default());
|
||||
return if e.get_reason().is_not_found() {
|
||||
Ok(AuthInfo::default())
|
||||
} else {
|
||||
return Err(e.into());
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -22,8 +22,9 @@ use self::parquet::RequestData;
|
||||
|
||||
pub mod parquet;
|
||||
|
||||
pub static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
|
||||
pub static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
|
||||
pub(crate) static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::new();
|
||||
pub(crate) static LOG_CHAN_DISCONNECT: OnceCell<mpsc::WeakUnboundedSender<RequestData>> =
|
||||
OnceCell::new();
|
||||
|
||||
/// Context data for a single request to connect to a database.
|
||||
///
|
||||
@@ -38,12 +39,12 @@ pub struct RequestMonitoring(
|
||||
);
|
||||
|
||||
struct RequestMonitoringInner {
|
||||
pub peer_addr: IpAddr,
|
||||
pub session_id: Uuid,
|
||||
pub protocol: Protocol,
|
||||
pub(crate) peer_addr: IpAddr,
|
||||
pub(crate) session_id: Uuid,
|
||||
pub(crate) protocol: Protocol,
|
||||
first_packet: chrono::DateTime<Utc>,
|
||||
region: &'static str,
|
||||
pub span: Span,
|
||||
pub(crate) span: Span,
|
||||
|
||||
// filled in as they are discovered
|
||||
project: Option<ProjectIdInt>,
|
||||
@@ -63,15 +64,15 @@ struct RequestMonitoringInner {
|
||||
sender: Option<mpsc::UnboundedSender<RequestData>>,
|
||||
// This sender is only used to log the length of session in case of success.
|
||||
disconnect_sender: Option<mpsc::UnboundedSender<RequestData>>,
|
||||
pub latency_timer: LatencyTimer,
|
||||
pub(crate) latency_timer: LatencyTimer,
|
||||
// Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
|
||||
rejected: Option<bool>,
|
||||
disconnect_timestamp: Option<chrono::DateTime<Utc>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum AuthMethod {
|
||||
// aka link aka passwordless
|
||||
pub(crate) enum AuthMethod {
|
||||
// aka passwordless, fka link
|
||||
Web,
|
||||
ScramSha256,
|
||||
ScramSha256Plus,
|
||||
@@ -125,11 +126,11 @@ impl RequestMonitoring {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn test() -> Self {
|
||||
pub(crate) fn test() -> Self {
|
||||
RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test")
|
||||
}
|
||||
|
||||
pub fn console_application_name(&self) -> String {
|
||||
pub(crate) fn console_application_name(&self) -> String {
|
||||
let this = self.0.try_lock().expect("should not deadlock");
|
||||
format!(
|
||||
"{}/{}",
|
||||
@@ -138,19 +139,19 @@ impl RequestMonitoring {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn set_rejected(&self, rejected: bool) {
|
||||
pub(crate) fn set_rejected(&self, rejected: bool) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.rejected = Some(rejected);
|
||||
}
|
||||
|
||||
pub fn set_cold_start_info(&self, info: ColdStartInfo) {
|
||||
pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.set_cold_start_info(info);
|
||||
}
|
||||
|
||||
pub fn set_db_options(&self, options: StartupMessageParams) {
|
||||
pub(crate) fn set_db_options(&self, options: StartupMessageParams) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.set_application(options.get("application_name").map(SmolStr::from));
|
||||
if let Some(user) = options.get("user") {
|
||||
@@ -163,7 +164,7 @@ impl RequestMonitoring {
|
||||
this.pg_options = Some(options);
|
||||
}
|
||||
|
||||
pub fn set_project(&self, x: MetricsAuxInfo) {
|
||||
pub(crate) fn set_project(&self, x: MetricsAuxInfo) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
if this.endpoint_id.is_none() {
|
||||
this.set_endpoint_id(x.endpoint_id.as_str().into());
|
||||
@@ -173,33 +174,33 @@ impl RequestMonitoring {
|
||||
this.set_cold_start_info(x.cold_start_info);
|
||||
}
|
||||
|
||||
pub fn set_project_id(&self, project_id: ProjectIdInt) {
|
||||
pub(crate) fn set_project_id(&self, project_id: ProjectIdInt) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.project = Some(project_id);
|
||||
}
|
||||
|
||||
pub fn set_endpoint_id(&self, endpoint_id: EndpointId) {
|
||||
pub(crate) fn set_endpoint_id(&self, endpoint_id: EndpointId) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.set_endpoint_id(endpoint_id);
|
||||
}
|
||||
|
||||
pub fn set_dbname(&self, dbname: DbName) {
|
||||
pub(crate) fn set_dbname(&self, dbname: DbName) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.set_dbname(dbname);
|
||||
}
|
||||
|
||||
pub fn set_user(&self, user: RoleName) {
|
||||
pub(crate) fn set_user(&self, user: RoleName) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.set_user(user);
|
||||
}
|
||||
|
||||
pub fn set_auth_method(&self, auth_method: AuthMethod) {
|
||||
pub(crate) fn set_auth_method(&self, auth_method: AuthMethod) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
this.auth_method = Some(auth_method);
|
||||
}
|
||||
@@ -211,7 +212,7 @@ impl RequestMonitoring {
|
||||
.has_private_peer_addr()
|
||||
}
|
||||
|
||||
pub fn set_error_kind(&self, kind: ErrorKind) {
|
||||
pub(crate) fn set_error_kind(&self, kind: ErrorKind) {
|
||||
let mut this = self.0.try_lock().expect("should not deadlock");
|
||||
// Do not record errors from the private address to metrics.
|
||||
if !this.has_private_peer_addr() {
|
||||
@@ -237,30 +238,30 @@ impl RequestMonitoring {
|
||||
.log_connect();
|
||||
}
|
||||
|
||||
pub fn protocol(&self) -> Protocol {
|
||||
pub(crate) fn protocol(&self) -> Protocol {
|
||||
self.0.try_lock().expect("should not deadlock").protocol
|
||||
}
|
||||
|
||||
pub fn span(&self) -> Span {
|
||||
pub(crate) fn span(&self) -> Span {
|
||||
self.0.try_lock().expect("should not deadlock").span.clone()
|
||||
}
|
||||
|
||||
pub fn session_id(&self) -> Uuid {
|
||||
pub(crate) fn session_id(&self) -> Uuid {
|
||||
self.0.try_lock().expect("should not deadlock").session_id
|
||||
}
|
||||
|
||||
pub fn peer_addr(&self) -> IpAddr {
|
||||
pub(crate) fn peer_addr(&self) -> IpAddr {
|
||||
self.0.try_lock().expect("should not deadlock").peer_addr
|
||||
}
|
||||
|
||||
pub fn cold_start_info(&self) -> ColdStartInfo {
|
||||
pub(crate) fn cold_start_info(&self) -> ColdStartInfo {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.cold_start_info
|
||||
}
|
||||
|
||||
pub fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
|
||||
pub(crate) fn latency_timer_pause(&self, waiting_for: Waiting) -> LatencyTimerPause<'_> {
|
||||
LatencyTimerPause {
|
||||
ctx: self,
|
||||
start: tokio::time::Instant::now(),
|
||||
@@ -268,7 +269,7 @@ impl RequestMonitoring {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn success(&self) {
|
||||
pub(crate) fn success(&self) {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
@@ -277,7 +278,7 @@ impl RequestMonitoring {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LatencyTimerPause<'a> {
|
||||
pub(crate) struct LatencyTimerPause<'a> {
|
||||
ctx: &'a RequestMonitoring,
|
||||
start: tokio::time::Instant,
|
||||
waiting_for: Waiting,
|
||||
|
||||
@@ -62,8 +62,8 @@ pub struct ParquetUploadArgs {
|
||||
// But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
||||
// level instead, as repeated failures can mean a more serious problem. If it
|
||||
// fails more than FAILED_UPLOAD_RETRIES times, we give up
|
||||
pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
||||
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
||||
|
||||
// the parquet crate leaves a lot to be desired...
|
||||
// what follows is an attempt to write parquet files with minimal allocs.
|
||||
@@ -73,7 +73,7 @@ pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10;
|
||||
// * after each rowgroup write, we check the length of the file and upload to s3 if large enough
|
||||
|
||||
#[derive(parquet_derive::ParquetRecordWriter)]
|
||||
pub struct RequestData {
|
||||
pub(crate) struct RequestData {
|
||||
region: &'static str,
|
||||
protocol: &'static str,
|
||||
/// Must be UTC. The derive macro doesn't like the timezones
|
||||
|
||||
@@ -3,12 +3,12 @@ use std::{error::Error as StdError, fmt, io};
|
||||
use measured::FixedCardinalityLabel;
|
||||
|
||||
/// Upcast (almost) any error into an opaque [`io::Error`].
|
||||
pub fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
|
||||
pub(crate) fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
/// A small combinator for pluggable error logging.
|
||||
pub fn log_error<E: fmt::Display>(e: E) -> E {
|
||||
pub(crate) fn log_error<E: fmt::Display>(e: E) -> E {
|
||||
tracing::error!("{e}");
|
||||
e
|
||||
}
|
||||
@@ -19,7 +19,7 @@ pub fn log_error<E: fmt::Display>(e: E) -> E {
|
||||
/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it
|
||||
/// is way too convenient and tends to proliferate all across the codebase,
|
||||
/// ultimately leading to accidental leaks of sensitive data.
|
||||
pub trait UserFacingError: ReportableError {
|
||||
pub(crate) trait UserFacingError: ReportableError {
|
||||
/// Format the error for client, stripping all sensitive info.
|
||||
///
|
||||
/// Although this might be a no-op for many types, it's highly
|
||||
@@ -64,7 +64,7 @@ pub enum ErrorKind {
|
||||
}
|
||||
|
||||
impl ErrorKind {
|
||||
pub fn to_metric_label(&self) -> &'static str {
|
||||
pub(crate) fn to_metric_label(self) -> &'static str {
|
||||
match self {
|
||||
ErrorKind::User => "user",
|
||||
ErrorKind::ClientDisconnect => "clientdisconnect",
|
||||
@@ -78,7 +78,7 @@ impl ErrorKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub trait ReportableError: fmt::Display + Send + 'static {
|
||||
pub(crate) trait ReportableError: fmt::Display + Send + 'static {
|
||||
fn get_error_kind(&self) -> ErrorKind;
|
||||
}
|
||||
|
||||
|
||||
@@ -12,9 +12,9 @@ use http_body_util::BodyExt;
|
||||
use hyper1::body::Body;
|
||||
use serde::de::DeserializeOwned;
|
||||
|
||||
pub use reqwest::{Request, Response, StatusCode};
|
||||
pub use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
pub(crate) use reqwest::{Request, Response};
|
||||
pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error};
|
||||
pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
|
||||
|
||||
use crate::{
|
||||
metrics::{ConsoleRequest, Metrics},
|
||||
@@ -35,7 +35,7 @@ pub fn new_client() -> ClientWithMiddleware {
|
||||
.build()
|
||||
}
|
||||
|
||||
pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
|
||||
pub(crate) fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
|
||||
let timeout_client = reqwest::ClientBuilder::new()
|
||||
.timeout(default_timout)
|
||||
.build()
|
||||
@@ -77,20 +77,20 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn url(&self) -> &ApiUrl {
|
||||
pub(crate) fn url(&self) -> &ApiUrl {
|
||||
&self.endpoint
|
||||
}
|
||||
|
||||
/// Return a [builder](RequestBuilder) for a `GET` request,
|
||||
/// appending a single `path` segment to the base endpoint URL.
|
||||
pub fn get(&self, path: &str) -> RequestBuilder {
|
||||
pub(crate) fn get(&self, path: &str) -> RequestBuilder {
|
||||
let mut url = self.endpoint.clone();
|
||||
url.path_segments_mut().push(path);
|
||||
self.client.get(url.into_inner())
|
||||
}
|
||||
|
||||
/// Execute a [request](reqwest::Request).
|
||||
pub async fn execute(&self, request: Request) -> Result<Response, Error> {
|
||||
pub(crate) async fn execute(&self, request: Request) -> Result<Response, Error> {
|
||||
let _timer = Metrics::get()
|
||||
.proxy
|
||||
.console_request_latency
|
||||
@@ -102,7 +102,7 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn parse_json_body_with_limit<D: DeserializeOwned>(
|
||||
pub(crate) async fn parse_json_body_with_limit<D: DeserializeOwned>(
|
||||
mut b: impl Body<Data = Bytes, Error = reqwest::Error> + Unpin,
|
||||
limit: usize,
|
||||
) -> anyhow::Result<D> {
|
||||
|
||||
@@ -29,10 +29,10 @@ impl<Id: InternId> std::fmt::Display for InternedString<Id> {
|
||||
}
|
||||
|
||||
impl<Id: InternId> InternedString<Id> {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
pub(crate) fn as_str(&self) -> &'static str {
|
||||
Id::get_interner().inner.resolve(&self.inner)
|
||||
}
|
||||
pub fn get(s: &str) -> Option<Self> {
|
||||
pub(crate) fn get(s: &str) -> Option<Self> {
|
||||
Id::get_interner().get(s)
|
||||
}
|
||||
}
|
||||
@@ -78,7 +78,7 @@ impl<Id: InternId> serde::Serialize for InternedString<Id> {
|
||||
}
|
||||
|
||||
impl<Id: InternId> StringInterner<Id> {
|
||||
pub fn new() -> Self {
|
||||
pub(crate) fn new() -> Self {
|
||||
StringInterner {
|
||||
inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher(
|
||||
Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()),
|
||||
@@ -90,26 +90,24 @@ impl<Id: InternId> StringInterner<Id> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.inner.is_empty()
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
#[cfg(test)]
|
||||
fn len(&self) -> usize {
|
||||
self.inner.len()
|
||||
}
|
||||
|
||||
pub fn current_memory_usage(&self) -> usize {
|
||||
#[cfg(test)]
|
||||
fn current_memory_usage(&self) -> usize {
|
||||
self.inner.current_memory_usage()
|
||||
}
|
||||
|
||||
pub fn get_or_intern(&self, s: &str) -> InternedString<Id> {
|
||||
pub(crate) fn get_or_intern(&self, s: &str) -> InternedString<Id> {
|
||||
InternedString {
|
||||
inner: self.inner.get_or_intern(s),
|
||||
_id: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, s: &str) -> Option<InternedString<Id>> {
|
||||
pub(crate) fn get(&self, s: &str) -> Option<InternedString<Id>> {
|
||||
Some(InternedString {
|
||||
inner: self.inner.get(s)?,
|
||||
_id: PhantomData,
|
||||
@@ -132,14 +130,14 @@ impl<Id: InternId> Default for StringInterner<Id> {
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
|
||||
pub struct RoleNameTag;
|
||||
pub(crate) struct RoleNameTag;
|
||||
impl InternId for RoleNameTag {
|
||||
fn get_interner() -> &'static StringInterner<Self> {
|
||||
pub static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
|
||||
static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
|
||||
ROLE_NAMES.get_or_init(Default::default)
|
||||
}
|
||||
}
|
||||
pub type RoleNameInt = InternedString<RoleNameTag>;
|
||||
pub(crate) type RoleNameInt = InternedString<RoleNameTag>;
|
||||
impl From<&RoleName> for RoleNameInt {
|
||||
fn from(value: &RoleName) -> Self {
|
||||
RoleNameTag::get_interner().get_or_intern(value)
|
||||
@@ -150,7 +148,7 @@ impl From<&RoleName> for RoleNameInt {
|
||||
pub struct EndpointIdTag;
|
||||
impl InternId for EndpointIdTag {
|
||||
fn get_interner() -> &'static StringInterner<Self> {
|
||||
pub static ROLE_NAMES: OnceLock<StringInterner<EndpointIdTag>> = OnceLock::new();
|
||||
static ROLE_NAMES: OnceLock<StringInterner<EndpointIdTag>> = OnceLock::new();
|
||||
ROLE_NAMES.get_or_init(Default::default)
|
||||
}
|
||||
}
|
||||
@@ -170,7 +168,7 @@ impl From<EndpointId> for EndpointIdInt {
|
||||
pub struct BranchIdTag;
|
||||
impl InternId for BranchIdTag {
|
||||
fn get_interner() -> &'static StringInterner<Self> {
|
||||
pub static ROLE_NAMES: OnceLock<StringInterner<BranchIdTag>> = OnceLock::new();
|
||||
static ROLE_NAMES: OnceLock<StringInterner<BranchIdTag>> = OnceLock::new();
|
||||
ROLE_NAMES.get_or_init(Default::default)
|
||||
}
|
||||
}
|
||||
@@ -190,7 +188,7 @@ impl From<BranchId> for BranchIdInt {
|
||||
pub struct ProjectIdTag;
|
||||
impl InternId for ProjectIdTag {
|
||||
fn get_interner() -> &'static StringInterner<Self> {
|
||||
pub static ROLE_NAMES: OnceLock<StringInterner<ProjectIdTag>> = OnceLock::new();
|
||||
static ROLE_NAMES: OnceLock<StringInterner<ProjectIdTag>> = OnceLock::new();
|
||||
ROLE_NAMES.get_or_init(Default::default)
|
||||
}
|
||||
}
|
||||
@@ -217,7 +215,7 @@ mod tests {
|
||||
struct MyId;
|
||||
impl InternId for MyId {
|
||||
fn get_interner() -> &'static StringInterner<Self> {
|
||||
pub static ROLE_NAMES: OnceLock<StringInterner<MyId>> = OnceLock::new();
|
||||
pub(crate) static ROLE_NAMES: OnceLock<StringInterner<MyId>> = OnceLock::new();
|
||||
ROLE_NAMES.get_or_init(Default::default)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,8 @@
|
||||
// https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction
|
||||
#![warn(
|
||||
clippy::undocumented_unsafe_blocks,
|
||||
// TODO: Enable once all individual checks are enabled.
|
||||
//clippy::as_conversions,
|
||||
clippy::dbg_macro,
|
||||
clippy::empty_enum_variants_with_brackets,
|
||||
clippy::exit,
|
||||
@@ -31,8 +33,15 @@
|
||||
)]
|
||||
// List of permanently allowed lints.
|
||||
#![allow(
|
||||
// It's ok to cast u8 to bool, etc.
|
||||
// It's ok to cast bool to u8, etc.
|
||||
clippy::cast_lossless,
|
||||
// Seems unavoidable.
|
||||
clippy::multiple_crate_versions,
|
||||
// While #[must_use] is a great feature this check is too noisy.
|
||||
clippy::must_use_candidate,
|
||||
// Inline consts, structs, fns, imports, etc. are ok if they're used by
|
||||
// the following statement(s).
|
||||
clippy::items_after_statements,
|
||||
)]
|
||||
// List of temporarily allowed lints.
|
||||
// TODO: Switch to except() once stable with 1.81.
|
||||
@@ -43,46 +52,26 @@
|
||||
clippy::cast_possible_wrap,
|
||||
clippy::cast_precision_loss,
|
||||
clippy::cast_sign_loss,
|
||||
clippy::default_trait_access,
|
||||
clippy::doc_markdown,
|
||||
clippy::explicit_iter_loop,
|
||||
clippy::float_cmp,
|
||||
clippy::if_not_else,
|
||||
clippy::ignored_unit_patterns,
|
||||
clippy::implicit_hasher,
|
||||
clippy::inconsistent_struct_constructor,
|
||||
clippy::inline_always,
|
||||
clippy::items_after_statements,
|
||||
clippy::manual_assert,
|
||||
clippy::manual_let_else,
|
||||
clippy::manual_string_new,
|
||||
clippy::match_bool,
|
||||
clippy::match_same_arms,
|
||||
clippy::match_wild_err_arm,
|
||||
clippy::missing_errors_doc,
|
||||
clippy::missing_panics_doc,
|
||||
clippy::module_name_repetitions,
|
||||
clippy::multiple_crate_versions,
|
||||
clippy::must_use_candidate,
|
||||
clippy::needless_for_each,
|
||||
clippy::needless_pass_by_value,
|
||||
clippy::needless_raw_string_hashes,
|
||||
clippy::option_as_ref_cloned,
|
||||
clippy::redundant_closure_for_method_calls,
|
||||
clippy::redundant_else,
|
||||
clippy::return_self_not_must_use,
|
||||
clippy::similar_names,
|
||||
clippy::single_char_pattern,
|
||||
clippy::single_match_else,
|
||||
clippy::struct_excessive_bools,
|
||||
clippy::struct_field_names,
|
||||
clippy::too_many_lines,
|
||||
clippy::uninlined_format_args,
|
||||
clippy::unnested_or_patterns,
|
||||
clippy::unreadable_literal,
|
||||
clippy::unused_async,
|
||||
clippy::unused_self,
|
||||
clippy::used_underscore_binding,
|
||||
clippy::wildcard_imports
|
||||
)]
|
||||
// List of temporarily allowed lints to unblock beta/nightly.
|
||||
@@ -168,7 +157,8 @@ macro_rules! smol_str_wrapper {
|
||||
pub struct $name(smol_str::SmolStr);
|
||||
|
||||
impl $name {
|
||||
pub fn as_str(&self) -> &str {
|
||||
#[allow(unused)]
|
||||
pub(crate) fn as_str(&self) -> &str {
|
||||
self.0.as_str()
|
||||
}
|
||||
}
|
||||
@@ -263,19 +253,19 @@ smol_str_wrapper!(Host);
|
||||
|
||||
// Endpoints are a bit tricky. Rare they might be branches or projects.
|
||||
impl EndpointId {
|
||||
pub fn is_endpoint(&self) -> bool {
|
||||
pub(crate) fn is_endpoint(&self) -> bool {
|
||||
self.0.starts_with("ep-")
|
||||
}
|
||||
pub fn is_branch(&self) -> bool {
|
||||
pub(crate) fn is_branch(&self) -> bool {
|
||||
self.0.starts_with("br-")
|
||||
}
|
||||
pub fn is_project(&self) -> bool {
|
||||
!self.is_endpoint() && !self.is_branch()
|
||||
}
|
||||
pub fn as_branch(&self) -> BranchId {
|
||||
// pub(crate) fn is_project(&self) -> bool {
|
||||
// !self.is_endpoint() && !self.is_branch()
|
||||
// }
|
||||
pub(crate) fn as_branch(&self) -> BranchId {
|
||||
BranchId(self.0.clone())
|
||||
}
|
||||
pub fn as_project(&self) -> ProjectId {
|
||||
pub(crate) fn as_project(&self) -> ProjectId {
|
||||
ProjectId(self.0.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,14 +2,14 @@
|
||||
|
||||
use std::ffi::CStr;
|
||||
|
||||
pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
|
||||
pub(crate) fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
|
||||
let cstr = CStr::from_bytes_until_nul(bytes).ok()?;
|
||||
let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len());
|
||||
Some((cstr, other))
|
||||
}
|
||||
|
||||
/// See <https://doc.rust-lang.org/std/primitive.slice.html#method.split_array_ref>.
|
||||
pub fn split_at_const<const N: usize>(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> {
|
||||
pub(crate) fn split_at_const<const N: usize>(bytes: &[u8]) -> Option<(&[u8; N], &[u8])> {
|
||||
(bytes.len() >= N).then(|| {
|
||||
let (head, tail) = bytes.split_at(N);
|
||||
(head.try_into().unwrap(), tail)
|
||||
|
||||
@@ -13,9 +13,9 @@ use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
|
||||
|
||||
pin_project! {
|
||||
/// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough
|
||||
pub struct ChainRW<T> {
|
||||
pub(crate) struct ChainRW<T> {
|
||||
#[pin]
|
||||
pub inner: T,
|
||||
pub(crate) inner: T,
|
||||
buf: BytesMut,
|
||||
}
|
||||
}
|
||||
@@ -60,7 +60,7 @@ const HEADER: [u8; 12] = [
|
||||
0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,
|
||||
];
|
||||
|
||||
pub async fn read_proxy_protocol<T: AsyncRead + Unpin>(
|
||||
pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
|
||||
mut read: T,
|
||||
) -> std::io::Result<(ChainRW<T>, Option<SocketAddr>)> {
|
||||
let mut buf = BytesMut::with_capacity(128);
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub mod connect_compute;
|
||||
pub(crate) mod connect_compute;
|
||||
mod copy_bidirectional;
|
||||
pub mod handshake;
|
||||
pub mod passthrough;
|
||||
pub mod retry;
|
||||
pub mod wake_compute;
|
||||
pub(crate) mod handshake;
|
||||
pub(crate) mod passthrough;
|
||||
pub(crate) mod retry;
|
||||
pub(crate) mod wake_compute;
|
||||
pub use copy_bidirectional::copy_bidirectional_client_compute;
|
||||
pub use copy_bidirectional::ErrorSource;
|
||||
|
||||
@@ -170,21 +170,21 @@ pub async fn task_main(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub enum ClientMode {
|
||||
pub(crate) enum ClientMode {
|
||||
Tcp,
|
||||
Websockets { hostname: Option<String> },
|
||||
}
|
||||
|
||||
/// Abstracts the logic of handling TCP vs WS clients
|
||||
impl ClientMode {
|
||||
pub fn allow_cleartext(&self) -> bool {
|
||||
pub(crate) fn allow_cleartext(&self) -> bool {
|
||||
match self {
|
||||
ClientMode::Tcp => false,
|
||||
ClientMode::Websockets { .. } => true,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
|
||||
pub(crate) fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
|
||||
match self {
|
||||
ClientMode::Tcp => config.allow_self_signed_compute,
|
||||
ClientMode::Websockets { .. } => false,
|
||||
@@ -213,7 +213,7 @@ impl ClientMode {
|
||||
// 2. Handshake: handshake reports errors if it can, otherwise if the handshake fails due to protocol violation,
|
||||
// we cannot be sure the client even understands our error message
|
||||
// 3. PrepareClient: The client disconnected, so we can't tell them anyway...
|
||||
pub enum ClientRequestError {
|
||||
pub(crate) enum ClientRequestError {
|
||||
#[error("{0}")]
|
||||
Cancellation(#[from] cancellation::CancelError),
|
||||
#[error("{0}")]
|
||||
@@ -238,7 +238,7 @@ impl ReportableError for ClientRequestError {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
config: &'static ProxyConfig,
|
||||
ctx: &RequestMonitoring,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
@@ -254,7 +254,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
let metrics = &Metrics::get().proxy;
|
||||
let proto = ctx.protocol();
|
||||
let _request_gauge = metrics.connection_requests.guard(proto);
|
||||
let request_gauge = metrics.connection_requests.guard(proto);
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
|
||||
@@ -283,7 +283,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names))
|
||||
.map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names))
|
||||
.transpose();
|
||||
|
||||
let user_info = match result {
|
||||
@@ -340,9 +340,9 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
client: stream,
|
||||
aux: node.aux.clone(),
|
||||
compute: node,
|
||||
req: _request_gauge,
|
||||
conn: conn_gauge,
|
||||
cancel: session,
|
||||
_req: request_gauge,
|
||||
_conn: conn_gauge,
|
||||
_cancel: session,
|
||||
}))
|
||||
}
|
||||
|
||||
@@ -377,20 +377,20 @@ async fn prepare_client_connection<P>(
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Default)]
|
||||
pub struct NeonOptions(Vec<(SmolStr, SmolStr)>);
|
||||
pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>);
|
||||
|
||||
impl NeonOptions {
|
||||
pub fn parse_params(params: &StartupMessageParams) -> Self {
|
||||
pub(crate) fn parse_params(params: &StartupMessageParams) -> Self {
|
||||
params
|
||||
.options_raw()
|
||||
.map(Self::parse_from_iter)
|
||||
.unwrap_or_default()
|
||||
}
|
||||
pub fn parse_options_raw(options: &str) -> Self {
|
||||
pub(crate) fn parse_options_raw(options: &str) -> Self {
|
||||
Self::parse_from_iter(StartupMessageParams::parse_options_raw(options))
|
||||
}
|
||||
|
||||
pub fn is_ephemeral(&self) -> bool {
|
||||
pub(crate) fn is_ephemeral(&self) -> bool {
|
||||
// Currently, neon endpoint options are all reserved for ephemeral endpoints.
|
||||
!self.0.is_empty()
|
||||
}
|
||||
@@ -404,7 +404,7 @@ impl NeonOptions {
|
||||
Self(options)
|
||||
}
|
||||
|
||||
pub fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
|
||||
pub(crate) fn get_cache_key(&self, prefix: &str) -> EndpointCacheKey {
|
||||
// prefix + format!(" {k}:{v}")
|
||||
// kinda jank because SmolStr is immutable
|
||||
std::iter::once(prefix)
|
||||
@@ -415,7 +415,7 @@ impl NeonOptions {
|
||||
|
||||
/// <https://swagger.io/docs/specification/serialization/> DeepObject format
|
||||
/// `paramName[prop1]=value1¶mName[prop2]=value2&...`
|
||||
pub fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
|
||||
pub(crate) fn to_deep_object(&self) -> Vec<(SmolStr, SmolStr)> {
|
||||
self.0
|
||||
.iter()
|
||||
.map(|(k, v)| (format_smolstr!("options[{}]", k), v.clone()))
|
||||
@@ -423,7 +423,7 @@ impl NeonOptions {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn neon_option(bytes: &str) -> Option<(&str, &str)> {
|
||||
pub(crate) fn neon_option(bytes: &str) -> Option<(&str, &str)> {
|
||||
static RE: OnceCell<Regex> = OnceCell::new();
|
||||
let re = RE.get_or_init(|| Regex::new(r"^neon_(\w+):(.+)").unwrap());
|
||||
|
||||
|
||||
@@ -25,14 +25,15 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
|
||||
/// (e.g. the compute node's address might've changed at the wrong time).
|
||||
/// Invalidate the cache entry (if any) to prevent subsequent errors.
|
||||
#[tracing::instrument(name = "invalidate_cache", skip_all)]
|
||||
pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
|
||||
pub(crate) fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
|
||||
let is_cached = node_info.cached();
|
||||
if is_cached {
|
||||
warn!("invalidating stalled compute node info cache entry");
|
||||
}
|
||||
let label = match is_cached {
|
||||
true => ConnectionFailureKind::ComputeCached,
|
||||
false => ConnectionFailureKind::ComputeUncached,
|
||||
let label = if is_cached {
|
||||
ConnectionFailureKind::ComputeCached
|
||||
} else {
|
||||
ConnectionFailureKind::ComputeUncached
|
||||
};
|
||||
Metrics::get().proxy.connection_failures_total.inc(label);
|
||||
|
||||
@@ -40,7 +41,7 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo {
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait ConnectMechanism {
|
||||
pub(crate) trait ConnectMechanism {
|
||||
type Connection;
|
||||
type ConnectError: ReportableError;
|
||||
type Error: From<Self::ConnectError>;
|
||||
@@ -55,7 +56,7 @@ pub trait ConnectMechanism {
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait ComputeConnectBackend {
|
||||
pub(crate) trait ComputeConnectBackend {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
ctx: &RequestMonitoring,
|
||||
@@ -64,12 +65,12 @@ pub trait ComputeConnectBackend {
|
||||
fn get_keys(&self) -> &ComputeCredentialKeys;
|
||||
}
|
||||
|
||||
pub struct TcpMechanism<'a> {
|
||||
pub(crate) struct TcpMechanism<'a> {
|
||||
/// KV-dictionary with PostgreSQL connection params.
|
||||
pub params: &'a StartupMessageParams,
|
||||
pub(crate) params: &'a StartupMessageParams,
|
||||
|
||||
/// connect_to_compute concurrency lock
|
||||
pub locks: &'static ApiLocks<Host>,
|
||||
pub(crate) locks: &'static ApiLocks<Host>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
@@ -97,7 +98,7 @@ impl ConnectMechanism for TcpMechanism<'_> {
|
||||
|
||||
/// Try to connect to the compute node, retrying if necessary.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
|
||||
pub(crate) async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
|
||||
ctx: &RequestMonitoring,
|
||||
mechanism: &M,
|
||||
user_info: &B,
|
||||
|
||||
@@ -14,7 +14,7 @@ enum TransferState {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum ErrorDirection {
|
||||
pub(crate) enum ErrorDirection {
|
||||
Read(io::Error),
|
||||
Write(io::Error),
|
||||
}
|
||||
@@ -230,11 +230,10 @@ impl CopyBuffer {
|
||||
io::ErrorKind::WriteZero,
|
||||
"write zero byte into writer",
|
||||
))));
|
||||
} else {
|
||||
self.pos += i;
|
||||
self.amt += i as u64;
|
||||
self.need_flush = true;
|
||||
}
|
||||
self.pos += i;
|
||||
self.amt += i as u64;
|
||||
self.need_flush = true;
|
||||
}
|
||||
|
||||
// If pos larger than cap, this loop will never stop.
|
||||
|
||||
@@ -18,7 +18,7 @@ use crate::{
|
||||
};
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum HandshakeError {
|
||||
pub(crate) enum HandshakeError {
|
||||
#[error("data is sent before server replied with EncryptionResponse")]
|
||||
EarlyData,
|
||||
|
||||
@@ -57,7 +57,7 @@ impl ReportableError for HandshakeError {
|
||||
}
|
||||
}
|
||||
|
||||
pub enum HandshakeData<S> {
|
||||
pub(crate) enum HandshakeData<S> {
|
||||
Startup(PqStream<Stream<S>>, StartupMessageParams),
|
||||
Cancel(CancelKeyData),
|
||||
}
|
||||
@@ -67,7 +67,7 @@ pub enum HandshakeData<S> {
|
||||
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
|
||||
/// we also take an extra care of propagating only the select handshake errors to client.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
ctx: &RequestMonitoring,
|
||||
stream: S,
|
||||
mut tls: Option<&TlsConfig>,
|
||||
|
||||
@@ -14,7 +14,7 @@ use super::copy_bidirectional::ErrorSource;
|
||||
|
||||
/// Forward bytes in both directions (client <-> compute).
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn proxy_pass(
|
||||
pub(crate) async fn proxy_pass(
|
||||
client: impl AsyncRead + AsyncWrite + Unpin,
|
||||
compute: impl AsyncRead + AsyncWrite + Unpin,
|
||||
aux: MetricsAuxInfo,
|
||||
@@ -57,18 +57,18 @@ pub async fn proxy_pass(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct ProxyPassthrough<P, S> {
|
||||
pub client: Stream<S>,
|
||||
pub compute: PostgresConnection,
|
||||
pub aux: MetricsAuxInfo,
|
||||
pub(crate) struct ProxyPassthrough<P, S> {
|
||||
pub(crate) client: Stream<S>,
|
||||
pub(crate) compute: PostgresConnection,
|
||||
pub(crate) aux: MetricsAuxInfo,
|
||||
|
||||
pub req: NumConnectionRequestsGuard<'static>,
|
||||
pub conn: NumClientConnectionsGuard<'static>,
|
||||
pub cancel: cancellation::Session<P>,
|
||||
pub(crate) _req: NumConnectionRequestsGuard<'static>,
|
||||
pub(crate) _conn: NumClientConnectionsGuard<'static>,
|
||||
pub(crate) _cancel: cancellation::Session<P>,
|
||||
}
|
||||
|
||||
impl<P, S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<P, S> {
|
||||
pub async fn proxy_pass(self) -> Result<(), ErrorSource> {
|
||||
pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> {
|
||||
let res = proxy_pass(self.client, self.compute.stream, self.aux).await;
|
||||
if let Err(err) = self.compute.cancel_closure.try_cancel_query().await {
|
||||
tracing::error!(?err, "could not cancel the query in the database");
|
||||
|
||||
@@ -2,18 +2,18 @@ use crate::{compute, config::RetryConfig};
|
||||
use std::{error::Error, io};
|
||||
use tokio::time;
|
||||
|
||||
pub trait CouldRetry {
|
||||
pub(crate) trait CouldRetry {
|
||||
/// Returns true if the error could be retried
|
||||
fn could_retry(&self) -> bool;
|
||||
}
|
||||
|
||||
pub trait ShouldRetryWakeCompute {
|
||||
pub(crate) trait ShouldRetryWakeCompute {
|
||||
/// Returns true if we need to invalidate the cache for this node.
|
||||
/// If false, we can continue retrying with the current node cache.
|
||||
fn should_retry_wake_compute(&self) -> bool;
|
||||
}
|
||||
|
||||
pub fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool {
|
||||
pub(crate) fn should_retry(err: &impl CouldRetry, num_retries: u32, config: RetryConfig) -> bool {
|
||||
num_retries < config.max_retries && err.could_retry()
|
||||
}
|
||||
|
||||
@@ -101,7 +101,7 @@ impl ShouldRetryWakeCompute for compute::ConnectionError {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
|
||||
pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
|
||||
config
|
||||
.base_delay
|
||||
.mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
|
||||
|
||||
@@ -11,14 +11,14 @@ use crate::auth::backend::{
|
||||
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
|
||||
};
|
||||
use crate::config::{CertResolver, RetryConfig};
|
||||
use crate::console::caches::NodeInfoCache;
|
||||
use crate::console::messages::{ConsoleError, Details, MetricsAuxInfo, Status};
|
||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend, NodeInfoCache};
|
||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||
use crate::error::ErrorKind;
|
||||
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
|
||||
use crate::{sasl, scram, BranchId, EndpointId, ProjectId};
|
||||
use anyhow::{bail, Context};
|
||||
use async_trait::async_trait;
|
||||
use http::StatusCode;
|
||||
use retry::{retry_after, ShouldRetryWakeCompute};
|
||||
use rstest::rstest;
|
||||
use rustls::pki_types;
|
||||
@@ -433,7 +433,7 @@ impl ReportableError for TestConnectError {
|
||||
|
||||
impl std::fmt::Display for TestConnectError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
write!(f, "{self:?}")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -475,7 +475,7 @@ impl ConnectMechanism for TestConnectMechanism {
|
||||
retryable: false,
|
||||
kind: ErrorKind::Compute,
|
||||
}),
|
||||
x => panic!("expecting action {:?}, connect is called instead", x),
|
||||
x => panic!("expecting action {x:?}, connect is called instead"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -491,7 +491,7 @@ impl TestBackend for TestConnectMechanism {
|
||||
ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
|
||||
ConnectAction::WakeFail => {
|
||||
let err = console::errors::ApiError::Console(ConsoleError {
|
||||
http_status_code: http::StatusCode::BAD_REQUEST,
|
||||
http_status_code: StatusCode::BAD_REQUEST,
|
||||
error: "TEST".into(),
|
||||
status: None,
|
||||
});
|
||||
@@ -500,7 +500,7 @@ impl TestBackend for TestConnectMechanism {
|
||||
}
|
||||
ConnectAction::WakeRetry => {
|
||||
let err = console::errors::ApiError::Console(ConsoleError {
|
||||
http_status_code: http::StatusCode::BAD_REQUEST,
|
||||
http_status_code: StatusCode::BAD_REQUEST,
|
||||
error: "TEST".into(),
|
||||
status: Some(Status {
|
||||
code: "error".into(),
|
||||
@@ -515,7 +515,7 @@ impl TestBackend for TestConnectMechanism {
|
||||
assert!(err.could_retry());
|
||||
Err(console::errors::WakeComputeError::ApiError(err))
|
||||
}
|
||||
x => panic!("expecting action {:?}, wake_compute is called instead", x),
|
||||
x => panic!("expecting action {x:?}, wake_compute is called instead"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -525,9 +525,6 @@ impl TestBackend for TestConnectMechanism {
|
||||
{
|
||||
unimplemented!("not used in tests")
|
||||
}
|
||||
fn get_role_secret(&self) -> Result<CachedRoleSecret, console::errors::GetAuthInfoError> {
|
||||
unimplemented!("not used in tests")
|
||||
}
|
||||
}
|
||||
|
||||
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
|
||||
@@ -547,8 +544,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
|
||||
|
||||
fn helper_create_connect_info(
|
||||
mechanism: &TestConnectMechanism,
|
||||
) -> auth::BackendType<'static, ComputeCredentials, &()> {
|
||||
let user_info = auth::BackendType::Console(
|
||||
) -> auth::Backend<'static, ComputeCredentials, &()> {
|
||||
let user_info = auth::Backend::Console(
|
||||
MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))),
|
||||
ComputeCredentials {
|
||||
info: ComputeUserInfo {
|
||||
|
||||
@@ -102,7 +102,7 @@ async fn proxy_mitm(
|
||||
}
|
||||
|
||||
/// taken from tokio-postgres
|
||||
pub async fn connect_tls<S, T>(mut stream: S, tls: T) -> T::Stream
|
||||
pub(crate) async fn connect_tls<S, T>(mut stream: S, tls: T) -> T::Stream
|
||||
where
|
||||
S: AsyncRead + AsyncWrite + Unpin,
|
||||
T: TlsConnect<S>,
|
||||
@@ -115,9 +115,7 @@ where
|
||||
let mut buf = [0];
|
||||
stream.read_exact(&mut buf).await.unwrap();
|
||||
|
||||
if buf[0] != b'S' {
|
||||
panic!("ssl not supported by server");
|
||||
}
|
||||
assert!(buf[0] == b'S', "ssl not supported by server");
|
||||
|
||||
tls.connect(stream).await.unwrap()
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ use tracing::{error, info, warn};
|
||||
|
||||
use super::connect_compute::ComputeConnectBackend;
|
||||
|
||||
pub async fn wake_compute<B: ComputeConnectBackend>(
|
||||
pub(crate) async fn wake_compute<B: ComputeConnectBackend>(
|
||||
num_retries: &mut u32,
|
||||
ctx: &RequestMonitoring,
|
||||
api: &B,
|
||||
|
||||
@@ -1,10 +1,16 @@
|
||||
mod leaky_bucket;
|
||||
mod limit_algorithm;
|
||||
mod limiter;
|
||||
pub use limit_algorithm::{
|
||||
aimd::Aimd, DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) use limit_algorithm::aimd::Aimd;
|
||||
|
||||
pub(crate) use limit_algorithm::{
|
||||
DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
|
||||
};
|
||||
pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
|
||||
mod leaky_bucket;
|
||||
pub(crate) use limiter::GlobalRateLimiter;
|
||||
|
||||
pub use leaky_bucket::{
|
||||
EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
|
||||
};
|
||||
pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
|
||||
|
||||
@@ -35,7 +35,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
|
||||
}
|
||||
|
||||
/// Check that number of connections to the endpoint is below `max_rps` rps.
|
||||
pub fn check(&self, key: K, n: u32) -> bool {
|
||||
pub(crate) fn check(&self, key: K, n: u32) -> bool {
|
||||
let now = Instant::now();
|
||||
|
||||
if self.access_count.fetch_add(1, Ordering::AcqRel) % 2048 == 0 {
|
||||
@@ -73,8 +73,9 @@ pub struct LeakyBucketState {
|
||||
time: Instant,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl LeakyBucketConfig {
|
||||
pub fn new(rps: f64, max: f64) -> Self {
|
||||
pub(crate) fn new(rps: f64, max: f64) -> Self {
|
||||
assert!(rps > 0.0, "rps must be positive");
|
||||
assert!(max > 0.0, "max must be positive");
|
||||
Self { rps, max }
|
||||
@@ -82,7 +83,7 @@ impl LeakyBucketConfig {
|
||||
}
|
||||
|
||||
impl LeakyBucketState {
|
||||
pub fn new() -> Self {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
filled: 0.0,
|
||||
time: Instant::now(),
|
||||
@@ -100,7 +101,7 @@ impl LeakyBucketState {
|
||||
self.filled == 0.0
|
||||
}
|
||||
|
||||
pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
|
||||
pub(crate) fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
|
||||
self.update(info, now);
|
||||
|
||||
if self.filled + n > info.max {
|
||||
@@ -119,6 +120,7 @@ impl Default for LeakyBucketState {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[allow(clippy::float_cmp)]
|
||||
mod tests {
|
||||
use std::time::Duration;
|
||||
|
||||
|
||||
@@ -8,13 +8,13 @@ use tokio::{
|
||||
|
||||
use self::aimd::Aimd;
|
||||
|
||||
pub mod aimd;
|
||||
pub(crate) mod aimd;
|
||||
|
||||
/// Whether a job succeeded or failed as a result of congestion/overload.
|
||||
///
|
||||
/// Errors not considered to be caused by overload should be ignored.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum Outcome {
|
||||
pub(crate) enum Outcome {
|
||||
/// The job succeeded, or failed in a way unrelated to overload.
|
||||
Success,
|
||||
/// The job failed because of overload, e.g. it timed out or an explicit backpressure signal
|
||||
@@ -23,14 +23,14 @@ pub enum Outcome {
|
||||
}
|
||||
|
||||
/// An algorithm for controlling a concurrency limit.
|
||||
pub trait LimitAlgorithm: Send + Sync + 'static {
|
||||
pub(crate) trait LimitAlgorithm: Send + Sync + 'static {
|
||||
/// Update the concurrency limit in response to a new job completion.
|
||||
fn update(&self, old_limit: usize, sample: Sample) -> usize;
|
||||
}
|
||||
|
||||
/// The result of a job (or jobs), including the [`Outcome`] (loss) and latency (delay).
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
|
||||
pub struct Sample {
|
||||
pub(crate) struct Sample {
|
||||
pub(crate) latency: Duration,
|
||||
/// Jobs in flight when the sample was taken.
|
||||
pub(crate) in_flight: usize,
|
||||
@@ -39,7 +39,7 @@ pub struct Sample {
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, serde::Deserialize, PartialEq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum RateLimitAlgorithm {
|
||||
pub(crate) enum RateLimitAlgorithm {
|
||||
#[default]
|
||||
Fixed,
|
||||
Aimd {
|
||||
@@ -48,7 +48,7 @@ pub enum RateLimitAlgorithm {
|
||||
},
|
||||
}
|
||||
|
||||
pub struct Fixed;
|
||||
pub(crate) struct Fixed;
|
||||
|
||||
impl LimitAlgorithm for Fixed {
|
||||
fn update(&self, old_limit: usize, _sample: Sample) -> usize {
|
||||
@@ -59,12 +59,12 @@ impl LimitAlgorithm for Fixed {
|
||||
#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
|
||||
pub struct RateLimiterConfig {
|
||||
#[serde(flatten)]
|
||||
pub algorithm: RateLimitAlgorithm,
|
||||
pub initial_limit: usize,
|
||||
pub(crate) algorithm: RateLimitAlgorithm,
|
||||
pub(crate) initial_limit: usize,
|
||||
}
|
||||
|
||||
impl RateLimiterConfig {
|
||||
pub fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
|
||||
pub(crate) fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
|
||||
match self.algorithm {
|
||||
RateLimitAlgorithm::Fixed => Box::new(Fixed),
|
||||
RateLimitAlgorithm::Aimd { conf } => Box::new(conf),
|
||||
@@ -72,7 +72,7 @@ impl RateLimiterConfig {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LimiterInner {
|
||||
pub(crate) struct LimiterInner {
|
||||
alg: Box<dyn LimitAlgorithm>,
|
||||
available: usize,
|
||||
limit: usize,
|
||||
@@ -114,7 +114,7 @@ impl LimiterInner {
|
||||
///
|
||||
/// The limit will be automatically adjusted based on observed latency (delay) and/or failures
|
||||
/// caused by overload (loss).
|
||||
pub struct DynamicLimiter {
|
||||
pub(crate) struct DynamicLimiter {
|
||||
config: RateLimiterConfig,
|
||||
inner: Mutex<LimiterInner>,
|
||||
// to notify when a token is available
|
||||
@@ -124,7 +124,7 @@ pub struct DynamicLimiter {
|
||||
/// A concurrency token, required to run a job.
|
||||
///
|
||||
/// Release the token back to the [`DynamicLimiter`] after the job is complete.
|
||||
pub struct Token {
|
||||
pub(crate) struct Token {
|
||||
start: Instant,
|
||||
limiter: Option<Arc<DynamicLimiter>>,
|
||||
}
|
||||
@@ -133,14 +133,14 @@ pub struct Token {
|
||||
///
|
||||
/// Not guaranteed to be consistent under high concurrency.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct LimiterState {
|
||||
#[cfg(test)]
|
||||
struct LimiterState {
|
||||
limit: usize,
|
||||
in_flight: usize,
|
||||
}
|
||||
|
||||
impl DynamicLimiter {
|
||||
/// Create a limiter with a given limit control algorithm.
|
||||
pub fn new(config: RateLimiterConfig) -> Arc<Self> {
|
||||
pub(crate) fn new(config: RateLimiterConfig) -> Arc<Self> {
|
||||
let ready = Notify::new();
|
||||
ready.notify_one();
|
||||
|
||||
@@ -157,7 +157,10 @@ impl DynamicLimiter {
|
||||
}
|
||||
|
||||
/// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
|
||||
pub async fn acquire_timeout(self: &Arc<Self>, duration: Duration) -> Result<Token, Elapsed> {
|
||||
pub(crate) async fn acquire_timeout(
|
||||
self: &Arc<Self>,
|
||||
duration: Duration,
|
||||
) -> Result<Token, Elapsed> {
|
||||
tokio::time::timeout(duration, self.acquire()).await?
|
||||
}
|
||||
|
||||
@@ -174,9 +177,8 @@ impl DynamicLimiter {
|
||||
let mut inner = self.inner.lock();
|
||||
if inner.take(&self.ready).is_some() {
|
||||
break Ok(Token::new(self.clone()));
|
||||
} else {
|
||||
notified.set(self.ready.notified());
|
||||
}
|
||||
notified.set(self.ready.notified());
|
||||
}
|
||||
notified.as_mut().await;
|
||||
ready = true;
|
||||
@@ -209,12 +211,10 @@ impl DynamicLimiter {
|
||||
}
|
||||
|
||||
/// The current state of the limiter.
|
||||
pub fn state(&self) -> LimiterState {
|
||||
#[cfg(test)]
|
||||
fn state(&self) -> LimiterState {
|
||||
let inner = self.inner.lock();
|
||||
LimiterState {
|
||||
limit: inner.limit,
|
||||
in_flight: inner.in_flight,
|
||||
}
|
||||
LimiterState { limit: inner.limit }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -225,22 +225,22 @@ impl Token {
|
||||
limiter: Some(limiter),
|
||||
}
|
||||
}
|
||||
pub fn disabled() -> Self {
|
||||
pub(crate) fn disabled() -> Self {
|
||||
Self {
|
||||
start: Instant::now(),
|
||||
limiter: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_disabled(&self) -> bool {
|
||||
pub(crate) fn is_disabled(&self) -> bool {
|
||||
self.limiter.is_none()
|
||||
}
|
||||
|
||||
pub fn release(mut self, outcome: Outcome) {
|
||||
pub(crate) fn release(mut self, outcome: Outcome) {
|
||||
self.release_mut(Some(outcome));
|
||||
}
|
||||
|
||||
pub fn release_mut(&mut self, outcome: Option<Outcome>) {
|
||||
pub(crate) fn release_mut(&mut self, outcome: Option<Outcome>) {
|
||||
if let Some(limiter) = self.limiter.take() {
|
||||
limiter.release_inner(self.start, outcome);
|
||||
}
|
||||
@@ -253,13 +253,10 @@ impl Drop for Token {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl LimiterState {
|
||||
/// The current concurrency limit.
|
||||
pub fn limit(&self) -> usize {
|
||||
fn limit(self) -> usize {
|
||||
self.limit
|
||||
}
|
||||
/// The number of jobs in flight.
|
||||
pub fn in_flight(&self) -> usize {
|
||||
self.in_flight
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,17 +10,17 @@ use super::{LimitAlgorithm, Outcome, Sample};
|
||||
///
|
||||
/// Reduces available concurrency by a factor when load-based errors are detected.
|
||||
#[derive(Clone, Copy, Debug, serde::Deserialize, PartialEq)]
|
||||
pub struct Aimd {
|
||||
pub(crate) struct Aimd {
|
||||
/// Minimum limit for AIMD algorithm.
|
||||
pub min: usize,
|
||||
pub(crate) min: usize,
|
||||
/// Maximum limit for AIMD algorithm.
|
||||
pub max: usize,
|
||||
pub(crate) max: usize,
|
||||
/// Decrease AIMD decrease by value in case of error.
|
||||
pub dec: f32,
|
||||
pub(crate) dec: f32,
|
||||
/// Increase AIMD increase by value in case of success.
|
||||
pub inc: usize,
|
||||
pub(crate) inc: usize,
|
||||
/// A threshold below which the limit won't be increased.
|
||||
pub utilisation: f32,
|
||||
pub(crate) utilisation: f32,
|
||||
}
|
||||
|
||||
impl LimitAlgorithm for Aimd {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user