Mirror of https://github.com/neondatabase/neon.git (synced 2026-05-14 03:30:36 +00:00)

Compare commits: `jcsp/issue...` → `vlad/refac...` (19 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | d82df3bba1 |  |
|  | 10f4b99a60 |  |
|  | 991a4c00b8 |  |
|  | 84099e4977 |  |
|  | 345ae45faa |  |
|  | d37d6f5fd0 |  |
|  | e2d008bd21 |  |
|  | bdd8a5be6a |  |
|  | 2dc64aecae |  |
|  | 79bd6f8d44 |  |
|  | 813bd0213d |  |
|  | 6fdba1a427 |  |
|  | 6355aac0bc |  |
|  | ceadcc4f35 |  |
|  | ee2a962028 |  |
|  | 9ea2f32b80 |  |
|  | fb74c21e8c |  |
|  | d92d36a315 |  |
|  | ec4cc30de9 |  |
Cargo.lock (generated): 22 lines changed
```diff
@@ -2695,6 +2695,7 @@ checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
 dependencies = [
  "equivalent",
  "hashbrown 0.14.5",
+ "serde",
 ]

 [[package]]
@@ -2794,9 +2795,9 @@ dependencies = [

 [[package]]
 name = "itoa"
-version = "1.0.6"
+version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"

 [[package]]
 name = "jobserver"
@@ -3655,6 +3656,7 @@ dependencies = [
  "tracing",
  "url",
  "utils",
+ "wal_decoder",
  "walkdir",
  "workspace_hack",
 ]
@@ -4092,6 +4094,7 @@ dependencies = [
  "regex",
  "serde",
  "thiserror",
+ "tracing",
  "utils",
 ]

@@ -4296,6 +4299,7 @@ dependencies = [
  "indexmap 2.0.1",
  "ipnet",
  "itertools 0.10.5",
+ "itoa",
  "jose-jwa",
  "jose-jwk",
  "lasso",
@@ -6858,6 +6862,19 @@ dependencies = [
  "utils",
 ]

+[[package]]
+name = "wal_decoder"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bytes",
+ "pageserver_api",
+ "postgres_ffi",
+ "serde",
+ "tracing",
+ "utils",
+]
+
 [[package]]
 name = "walkdir"
 version = "2.3.3"
@@ -7307,6 +7324,7 @@ dependencies = [
 "hyper 1.4.1",
 "hyper-util",
 "indexmap 1.9.3",
 "indexmap 2.0.1",
 "itertools 0.12.1",
 "lazy_static",
 "libc",
```
Cargo.toml (workspace root):

```diff
@@ -33,6 +33,7 @@ members = [
     "libs/postgres_ffi/wal_craft",
     "libs/vm_monitor",
     "libs/walproposer",
+    "libs/wal_decoder",
 ]

 [workspace.package]
@@ -107,6 +108,7 @@ indexmap = "2"
 indoc = "2"
 ipnet = "2.9.0"
 itertools = "0.10"
+itoa = "1.0.11"
 jsonwebtoken = "9"
 lasso = "0.7"
 libc = "0.2"
@@ -237,6 +239,7 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
 walproposer = { version = "0.1", path = "./libs/walproposer/" }
+wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
```
Dockerfile:

```diff
@@ -929,8 +929,8 @@ ARG PG_VERSION
 RUN case "${PG_VERSION}" in "v17") \
     echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
     esac && \
-    wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \
+    wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \
     mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgrx install --release
```
libs/pageserver_api/src/lib.rs:

```diff
@@ -5,9 +5,11 @@ pub mod controller_api;
 pub mod key;
 pub mod keyspace;
 pub mod models;
+pub mod record;
 pub mod reltag;
 pub mod shard;
 /// Public API types
 pub mod upcall_api;
+pub mod value;

 pub mod config;
```
libs/pageserver_api/src/record.rs (new file, 113 lines):

```rust
//! This module defines the WAL record format used within the pageserver.

use bytes::Bytes;
use postgres_ffi::record::{describe_postgres_wal_record, MultiXactMember};
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
use serde::{Deserialize, Serialize};
use utils::bin_ser::DeserializeError;

/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum NeonWalRecord {
    /// Native PostgreSQL WAL record
    Postgres { will_init: bool, rec: Bytes },

    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
    ClearVisibilityMapFlags {
        new_heap_blkno: Option<u32>,
        old_heap_blkno: Option<u32>,
        flags: u8,
    },
    /// Mark transaction IDs as committed on a CLOG page
    ClogSetCommitted {
        xids: Vec<TransactionId>,
        timestamp: TimestampTz,
    },
    /// Mark transaction IDs as aborted on a CLOG page
    ClogSetAborted { xids: Vec<TransactionId> },
    /// Extend multixact offsets SLRU
    MultixactOffsetCreate {
        mid: MultiXactId,
        moff: MultiXactOffset,
    },
    /// Extend multixact members SLRU.
    MultixactMembersCreate {
        moff: MultiXactOffset,
        members: Vec<MultiXactMember>,
    },
    /// Update the map of AUX files, either writing or dropping an entry
    AuxFile {
        file_path: String,
        content: Option<Bytes>,
    },

    /// A testing record for unit testing purposes. It supports appending data to an existing image, or clearing it.
    #[cfg(feature = "testing")]
    Test {
        /// Append a string to the image.
        append: String,
        /// Clear the image before appending.
        clear: bool,
        /// Treat this record as an init record. `clear` should be set to true if this field is set
        /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
        /// its references in `timeline.rs`.
        will_init: bool,
    },
}

impl NeonWalRecord {
    /// Does replaying this WAL record initialize the page from scratch, or does
    /// it need to be applied over the previous image of the page?
    pub fn will_init(&self) -> bool {
        // If you change this function, you'll also need to change ValueBytes::will_init
        match self {
            NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
            #[cfg(feature = "testing")]
            NeonWalRecord::Test { will_init, .. } => *will_init,
            // None of the special neon record types currently initialize the page
            _ => false,
        }
    }

    #[cfg(feature = "testing")]
    pub fn wal_append(s: impl AsRef<str>) -> Self {
        Self::Test {
            append: s.as_ref().to_string(),
            clear: false,
            will_init: false,
        }
    }

    #[cfg(feature = "testing")]
    pub fn wal_clear() -> Self {
        Self::Test {
            append: "".to_string(),
            clear: true,
            will_init: false,
        }
    }

    #[cfg(feature = "testing")]
    pub fn wal_init() -> Self {
        Self::Test {
            append: "".to_string(),
            clear: true,
            will_init: true,
        }
    }
}

/// Build a human-readable string to describe a WAL record
///
/// For debugging purposes
pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
    match rec {
        NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
            "will_init: {}, {}",
            will_init,
            describe_postgres_wal_record(rec)?
        )),
        _ => Ok(format!("{:?}", rec)),
    }
}
```
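As a quick orientation to the relocated type, a minimal sketch (not part of the PR; it assumes the crate's `testing` feature, which gates the `Test` variant and its helpers):

```rust
// Sketch: the will_init distinction in practice.
use pageserver_api::record::NeonWalRecord;

fn main() {
    // An append-only test record must be replayed over a previous image...
    let append = NeonWalRecord::wal_append("delta");
    assert!(!append.will_init());

    // ...while an init record reconstructs the page from scratch.
    let init = NeonWalRecord::wal_init();
    assert!(init.will_init());
}
```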
libs/pageserver_api/src/value.rs (the GcResult block removed at the end reappears as the new pageserver/src/gc_result.rs below):

```diff
@@ -1,13 +1,9 @@
-use crate::walrecord::NeonWalRecord;
-use anyhow::Result;
+//! This module defines the value type used by the storage engine.
+
+use crate::record::NeonWalRecord;
 use bytes::Bytes;
 use serde::{Deserialize, Serialize};
-use std::ops::AddAssign;
-use std::time::Duration;
-
-pub use pageserver_api::key::{Key, KEY_SIZE};

 /// A 'value' stored for one Key.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub enum Value {
     /// An Image value contains a full copy of the value
@@ -33,17 +29,17 @@ impl Value {
 }

 #[derive(Debug, PartialEq)]
-pub(crate) enum InvalidInput {
+pub enum InvalidInput {
     TooShortValue,
     TooShortPostgresRecord,
 }

 /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets
 /// use this type for querying if a slice looks some particular way.
-pub(crate) struct ValueBytes;
+pub struct ValueBytes;

 impl ValueBytes {
-    pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
+    pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
         if raw.len() < 12 {
             return Err(InvalidInput::TooShortValue);
         }
@@ -79,6 +75,7 @@ impl ValueBytes {
 mod test {
     use super::*;

+    use bytes::Bytes;
     use utils::bin_ser::BeSer;

     macro_rules! roundtrip {
@@ -229,56 +226,3 @@ mod test {
         assert!(!ValueBytes::will_init(&expected).unwrap());
     }
 }
-
-///
-/// Result of performing GC
-///
-#[derive(Default, Serialize, Debug)]
-pub struct GcResult {
-    pub layers_total: u64,
-    pub layers_needed_by_cutoff: u64,
-    pub layers_needed_by_pitr: u64,
-    pub layers_needed_by_branches: u64,
-    pub layers_needed_by_leases: u64,
-    pub layers_not_updated: u64,
-    pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
-
-    #[serde(serialize_with = "serialize_duration_as_millis")]
-    pub elapsed: Duration,
-
-    /// The layers which were garbage collected.
-    ///
-    /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
-    /// dropped in tests.
-    #[cfg(feature = "testing")]
-    #[serde(skip)]
-    pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
-}
-
-// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
-fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
-where
-    S: serde::Serializer,
-{
-    d.as_millis().serialize(serializer)
-}
-
-impl AddAssign for GcResult {
-    fn add_assign(&mut self, other: Self) {
-        self.layers_total += other.layers_total;
-        self.layers_needed_by_pitr += other.layers_needed_by_pitr;
-        self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
-        self.layers_needed_by_branches += other.layers_needed_by_branches;
-        self.layers_needed_by_leases += other.layers_needed_by_leases;
-        self.layers_not_updated += other.layers_not_updated;
-        self.layers_removed += other.layers_removed;
-
-        self.elapsed += other.elapsed;
-
-        #[cfg(feature = "testing")]
-        {
-            let mut other = other;
-            self.doomed_layers.append(&mut other.doomed_layers);
-        }
-    }
-}
```
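The now-public `ValueBytes::will_init` parses the serialized form without deserializing the whole value; a sketch mirroring the module's own tests (it assumes `utils::bin_ser::BeSer` provides `ser`, as used in the test macro above):

```rust
// Sketch: an Image value always initializes the page.
use bytes::Bytes;
use pageserver_api::value::{Value, ValueBytes};
use utils::bin_ser::BeSer;

fn main() {
    let image = Value::Image(Bytes::from_static(b"page contents"));
    let raw = Value::ser(&image).unwrap();
    // No earlier page version is needed to materialize this value.
    assert!(ValueBytes::will_init(&raw).unwrap());
}
```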
libs/postgres_ffi/Cargo.toml:

```diff
@@ -15,6 +15,7 @@ memoffset.workspace = true
 thiserror.workspace = true
 serde.workspace = true
 utils.workspace = true
+tracing.workspace = true

 [dev-dependencies]
 env_logger.workspace = true
```
libs/postgres_ffi/src/lib.rs:

```diff
@@ -216,6 +216,7 @@ macro_rules! enum_pgversion {
 }

 pub mod pg_constants;
+pub mod record;
 pub mod relfile_utils;

 // Export some widely used datatypes that are unlikely to change across Postgres versions
```

File diff suppressed because it is too large.
libs/wal_decoder/Cargo.toml (new file, 14 lines):

```toml
[package]
name = "wal_decoder"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[dependencies]
anyhow.workspace = true
bytes.workspace = true
pageserver_api.workspace = true
postgres_ffi.workspace = true
serde.workspace = true
tracing.workspace = true
utils.workspace = true
```
libs/wal_decoder/src/decoder.rs (new file, 1000 lines): diff suppressed because it is too large.
libs/wal_decoder/src/lib.rs (new file, 2 lines):

```rust
pub mod decoder;
pub mod models;
```
libs/wal_decoder/src/models.rs (new file, 177 lines):

```rust
//! This module houses types which represent decoded PG WAL records
//! ready for the pageserver to interpret.

use bytes::Bytes;
use pageserver_api::key::CompactKey;
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
use pageserver_api::value::Value;
use postgres_ffi::record::{
    XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
    XlXactParsedRecord,
};
use postgres_ffi::{Oid, TransactionId};
use utils::lsn::Lsn;

pub enum FlushUncommittedRecords {
    Yes,
    No,
}

pub struct InterpretedWalRecord {
    pub metadata_record: Option<MetadataRecord>,
    pub blocks: Vec<(CompactKey, Option<Value>)>,
    pub lsn: Lsn,
    pub flush_uncommitted: FlushUncommittedRecords,
    pub xid: TransactionId,
}

pub enum MetadataRecord {
    Heapam(HeapamRecord),
    Neonrmgr(NeonrmgrRecord),
    Smgr(SmgrRecord),
    Dbase(DbaseRecord),
    Clog(ClogRecord),
    Xact(XactRecord),
    MultiXact(MultiXactRecord),
    Relmap(RelmapRecord),
    Xlog(XlogRecord),
    LogicalMessage(LogicalMessageRecord),
    Standby(StandbyRecord),
    Replorigin(ReploriginRecord),
}

pub enum HeapamRecord {
    ClearVmBits(ClearVmBits),
}

pub struct ClearVmBits {
    pub new_heap_blkno: Option<u32>,
    pub old_heap_blkno: Option<u32>,
    pub vm_rel: RelTag,
    pub flags: u8,
}

pub enum NeonrmgrRecord {
    ClearVmBits(ClearVmBits),
}

pub enum SmgrRecord {
    Create(SmgrCreate),
    Truncate(SmgrTruncate),
}

pub struct SmgrCreate {
    pub rel: RelTag,
}

pub struct SmgrTruncate {
    pub rel: RelTag,
    pub to: BlockNumber,
}

pub enum DbaseRecord {
    Create(DbaseCreate),
    Drop(DbaseDrop),
}

pub struct DbaseCreate {
    pub db_id: Oid,
    pub tablespace_id: Oid,
    pub src_db_id: Oid,
    pub src_tablespace_id: Oid,
}

pub struct DbaseDrop {
    pub db_id: Oid,
    pub tablespace_ids: Vec<Oid>,
}

pub enum ClogRecord {
    ZeroPage(ClogZeroPage),
    Truncate(ClogTruncate),
}

pub struct ClogZeroPage {
    pub segno: u32,
    pub rpageno: u32,
}

pub struct ClogTruncate {
    pub pageno: u32,
    pub oldest_xid: TransactionId,
    pub oldest_xid_db: Oid,
}

pub enum XactRecord {
    Commit(XactCommon),
    Abort(XactCommon),
    CommitPrepared(XactCommon),
    AbortPrepared(XactCommon),
    Prepare(XactPrepare),
}

pub struct XactCommon {
    pub parsed: XlXactParsedRecord,
    pub origin_id: u16,
    // Fields below are only used for logging
    pub xl_xid: TransactionId,
    pub lsn: Lsn,
}

pub struct XactPrepare {
    pub xl_xid: TransactionId,
    pub data: Bytes,
}

pub enum MultiXactRecord {
    ZeroPage(MultiXactZeroPage),
    Create(XlMultiXactCreate),
    Truncate(XlMultiXactTruncate),
}

pub struct MultiXactZeroPage {
    pub slru_kind: SlruKind,
    pub segno: u32,
    pub rpageno: u32,
}

pub enum RelmapRecord {
    Update(RelmapUpdate),
}

pub struct RelmapUpdate {
    pub update: XlRelmapUpdate,
    pub buf: Bytes,
}

pub enum XlogRecord {
    Raw(RawXlogRecord),
}

pub struct RawXlogRecord {
    pub info: u8,
    pub lsn: Lsn,
    pub buf: Bytes,
}

pub enum LogicalMessageRecord {
    Put(PutLogicalMessage),
}

pub struct PutLogicalMessage {
    pub buf: Bytes,
    pub prefix_size: usize,
}

pub enum StandbyRecord {
    RunningXacts(StandbyRunningXacts),
}

pub struct StandbyRunningXacts {
    pub oldest_running_xid: TransactionId,
}

pub enum ReploriginRecord {
    Set(XlReploriginSet),
    Drop(XlReploriginDrop),
}
```
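An illustrative consumer of the new model types (a sketch, not code from the PR): the pageserver can route a record by its metadata payload without touching the raw WAL bytes.

```rust
// Sketch: dispatch on an interpreted record's metadata.
use wal_decoder::models::{InterpretedWalRecord, MetadataRecord, XactRecord};

fn describe(rec: &InterpretedWalRecord) -> String {
    match &rec.metadata_record {
        Some(MetadataRecord::Xact(XactRecord::Commit(c))) => {
            format!("commit xid={} lsn={}", c.xl_xid, c.lsn)
        }
        Some(_) => "other metadata record".to_string(),
        None => format!("{} block writes at {}", rec.blocks.len(), rec.lsn),
    }
}
```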
pageserver/Cargo.toml:

```diff
@@ -83,6 +83,7 @@ enum-map.workspace = true
 enumset = { workspace = true, features = ["serde"]}
 strum.workspace = true
 strum_macros.workspace = true
+wal_decoder.workspace = true

 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
@@ -92,6 +93,7 @@ criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
 indoc.workspace = true
+# pageserver_api = { workspace = true, features = ["testing"] }

 [[bench]]
 name = "bench_layer_map"
```
pageserver/benches/bench_ingest.rs:

```diff
@@ -6,15 +6,15 @@ use criterion::{criterion_group, criterion_main, Criterion};
 use pageserver::{
     config::PageServerConf,
     context::{DownloadBehavior, RequestContext},
-    gc_result::Value,
     l0_flush::{L0FlushConfig, L0FlushGlobalState},
     page_cache,
-    repository::Value,
     task_mgr::TaskKind,
     tenant::storage_layer::inmemory_layer::SerializedBatch,
     tenant::storage_layer::InMemoryLayer,
     virtual_file,
 };
-use pageserver_api::{key::Key, shard::TenantShardId};
+use pageserver_api::{key::Key, shard::TenantShardId, value::Value};
 use utils::{
     bin_ser::BeSer,
     id::{TenantId, TimelineId},
```
pageserver/benches/bench_layer_map.rs:

```diff
@@ -1,9 +1,9 @@
 use criterion::measurement::WallTime;
 use pageserver::keyspace::{KeyPartitioning, KeySpace};
-use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::LayerName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
+use pageserver_api::key::Key;
 use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
```
pageserver/benches/bench_walredo.rs:

```diff
@@ -60,7 +60,8 @@ use anyhow::Context;
 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
 use once_cell::sync::Lazy;
-use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver::{config::PageServerConf, walredo::PostgresRedoManager};
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
     future::Future,
```
```diff
@@ -51,7 +51,7 @@
 //!

 use anyhow::{Context, Result};
-use pageserver::repository::Key;
+use pageserver_api::key::Key;
 use std::cmp::Ordering;
 use std::io::{self, BufRead};
 use std::path::PathBuf;
```
```diff
@@ -13,12 +13,12 @@ use std::ops::Range;
 use std::{fs, str};

 use pageserver::page_cache::{self, PAGE_SZ};
-use pageserver::repository::{Key, KEY_SIZE};
 use pageserver::tenant::block_io::FileBlockReader;
 use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
 use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
 use pageserver::tenant::storage_layer::range_overlaps;
 use pageserver::virtual_file::{self, VirtualFile};
+use pageserver_api::key::{Key, KEY_SIZE};

 use utils::{bin_ser::BeSer, lsn::Lsn};
```
```diff
@@ -13,13 +13,13 @@ use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
-    repository::{Key, KEY_SIZE},
     tenant::{
         block_io::FileBlockReader, disk_btree::VisitDirection,
         storage_layer::delta_layer::DELTA_KEY_SIZE,
     },
     virtual_file::VirtualFile,
 };
+use pageserver_api::key::{Key, KEY_SIZE};
 use std::fs;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
```
pageserver/src/deletion_queue.rs:

```diff
@@ -696,7 +696,7 @@ impl DeletionQueue {
 mod test {
     use camino::Utf8Path;
     use hex_literal::hex;
-    use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
+    use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant};
     use std::{io::ErrorKind, time::Duration};
     use tracing::info;

@@ -705,7 +705,6 @@ mod test {

     use crate::{
         controller_upcall_client::RetryForeverError,
-        repository::Key,
         tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
     };
```
pageserver/src/gc_result.rs (new file, 57 lines):

```rust
use anyhow::Result;
use serde::Serialize;
use std::ops::AddAssign;
use std::time::Duration;

///
/// Result of performing GC
///
#[derive(Default, Serialize, Debug)]
pub struct GcResult {
    pub layers_total: u64,
    pub layers_needed_by_cutoff: u64,
    pub layers_needed_by_pitr: u64,
    pub layers_needed_by_branches: u64,
    pub layers_needed_by_leases: u64,
    pub layers_not_updated: u64,
    pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.

    #[serde(serialize_with = "serialize_duration_as_millis")]
    pub elapsed: Duration,

    /// The layers which were garbage collected.
    ///
    /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
    /// dropped in tests.
    #[cfg(feature = "testing")]
    #[serde(skip)]
    pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
}

// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    d.as_millis().serialize(serializer)
}

impl AddAssign for GcResult {
    fn add_assign(&mut self, other: Self) {
        self.layers_total += other.layers_total;
        self.layers_needed_by_pitr += other.layers_needed_by_pitr;
        self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
        self.layers_needed_by_branches += other.layers_needed_by_branches;
        self.layers_needed_by_leases += other.layers_needed_by_leases;
        self.layers_not_updated += other.layers_not_updated;
        self.layers_removed += other.layers_removed;

        self.elapsed += other.elapsed;

        #[cfg(feature = "testing")]
        {
            let mut other = other;
            self.doomed_layers.append(&mut other.doomed_layers);
        }
    }
}
```
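How the `AddAssign` impl is meant to be used: fold per-timeline results into a tenant-wide total (a sketch, not code from the PR):

```rust
// Sketch: accumulate GC results across runs.
use pageserver::gc_result::GcResult;

fn accumulate(results: impl IntoIterator<Item = GcResult>) -> GcResult {
    let mut total = GcResult::default();
    for partial in results {
        total += partial; // sums all layer counters and the elapsed time
    }
    total
}
```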
pageserver/src/http/routes.rs:

```diff
@@ -77,6 +77,7 @@ use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::storage_layer::LayerName;
+use crate::tenant::timeline::offload::offload_timeline;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::Timeline;
@@ -325,6 +326,7 @@ impl From<crate::tenant::TimelineArchivalError> for ApiError {
         match value {
             NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
             Timeout => ApiError::Timeout("hit pageserver internal timeout".into()),
+            Cancelled => ApiError::ShuttingDown,
             e @ HasArchivedParent(_) => {
                 ApiError::PreconditionFailed(e.to_string().into_boxed_str())
             }
@@ -1785,6 +1787,49 @@ async fn timeline_compact_handler(
     .await
 }

+// Run offload immediately on given timeline.
+async fn timeline_offload_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+
+    async {
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id)?;
+
+        if tenant.get_offloaded_timeline(timeline_id).is_ok() {
+            return json_response(StatusCode::OK, ());
+        }
+        let timeline =
+            active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+                .await?;
+
+        if !tenant.timeline_has_no_attached_children(timeline_id) {
+            return Err(ApiError::PreconditionFailed(
+                "timeline has attached children".into(),
+            ));
+        }
+        if !timeline.can_offload() {
+            return Err(ApiError::PreconditionFailed(
+                "Timeline::can_offload() returned false".into(),
+            ));
+        }
+        offload_timeline(&tenant, &timeline)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        json_response(StatusCode::OK, ())
+    }
+    .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .await
+}
+
 // Run checkpoint immediately on given timeline.
 async fn timeline_checkpoint_handler(
     request: Request<Body>,
```
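The new handler is registered further below under `testing_api_handler`, so it is only reachable on testing-enabled pageservers. A hypothetical client-side call (only the URL shape is taken from the route registration; the base URL, auth, and error handling are assumptions):

```rust
// Sketch: drive the new offload endpoint over HTTP with reqwest.
async fn offload_timeline_http(
    client: &reqwest::Client,
    base_url: &str,
    tenant_shard_id: &str,
    timeline_id: &str,
) -> reqwest::Result<()> {
    let url = format!("{base_url}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/offload");
    // PUT with no body; a non-2xx status is turned into an error.
    client.put(url).send().await?.error_for_status()?;
    Ok(())
}
```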
```diff
@@ -2005,13 +2050,13 @@ async fn getpage_at_lsn_handler(
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let state = get_state(&request);

-    struct Key(crate::repository::Key);
+    struct Key(pageserver_api::key::Key);

     impl std::str::FromStr for Key {
         type Err = anyhow::Error;

         fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
-            crate::repository::Key::from_hex(s).map(Key)
+            pageserver_api::key::Key::from_hex(s).map(Key)
         }
     }
```
```diff
@@ -3008,6 +3053,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
             |r| api_handler(r, timeline_compact_handler),
         )
+        .put(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload",
+            |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler),
+        )
         .put(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
             |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
```
pageserver/src/import_datadir.rs:

```diff
@@ -12,6 +12,7 @@ use pageserver_api::key::rel_block_to_key;
 use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
 use tracing::*;
+use wal_decoder::models::InterpretedWalRecord;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
@@ -19,8 +20,6 @@ use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::WalIngest;
-use crate::walrecord::decode_wal_record;
-use crate::walrecord::DecodedWALRecord;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::*;
@@ -313,11 +312,15 @@ async fn import_wal(
     let mut modification = tline.begin_modification(last_lsn);
     while last_lsn <= endpoint {
         if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-            let mut decoded = DecodedWALRecord::default();
-            decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
+            let interpreted = InterpretedWalRecord::from_bytes(
+                recdata,
+                tline.get_shard_identity(),
+                lsn,
+                tline.pg_version,
+            )?;

             walingest
-                .ingest_record(decoded, lsn, &mut modification, ctx)
+                .ingest_record(interpreted, &mut modification, ctx)
                 .await?;
             WAL_INGEST.records_committed.inc();

@@ -454,10 +457,15 @@ pub async fn import_wal_from_tar(
     let mut modification = tline.begin_modification(last_lsn);
     while last_lsn <= end_lsn {
         if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
-            let mut decoded = DecodedWALRecord::default();
-            decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
+            let interpreted = InterpretedWalRecord::from_bytes(
+                recdata,
+                tline.get_shard_identity(),
+                lsn,
+                tline.pg_version,
+            )?;

             walingest
-                .ingest_record(decoded, lsn, &mut modification, ctx)
+                .ingest_record(interpreted, &mut modification, ctx)
                 .await?;
             modification.commit(ctx).await?;
             last_lsn = lsn;
```
pageserver/src/lib.rs:

```diff
@@ -20,11 +20,11 @@ pub use pageserver_api::keyspace;
 use tokio_util::sync::CancellationToken;
 mod assert_u64_eq_usize;
 pub mod aux_file;
+pub mod gc_result;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
 pub mod pgdatadir_mapping;
-pub mod repository;
 pub mod span;
 pub(crate) mod statvfs;
 pub mod task_mgr;
@@ -32,7 +32,6 @@ pub mod tenant;
 pub mod utilization;
 pub mod virtual_file;
 pub mod walingest;
-pub mod walrecord;
 pub mod walredo;

 use camino::Utf8Path;
```
pageserver/src/pgdatadir_mapping.rs:

```diff
@@ -7,14 +7,14 @@
 //! Clarify that)
 //!
 use super::tenant::{PageReconstructError, Timeline};
+use crate::aux_file;
 use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
-use crate::walrecord::NeonWalRecord;
-use crate::{aux_file, repository::*};
 use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
+use pageserver_api::key::Key;
 use pageserver_api::key::{
     dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
     relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
@@ -23,7 +23,9 @@ use pageserver_api::key::{
 };
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
+use pageserver_api::value::Value;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
```
pageserver/src/tenant.rs:

```diff
@@ -79,6 +79,7 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::deletion_queue::DeletionQueueError;
+use crate::gc_result::GcResult;
 use crate::import_datadir;
 use crate::is_uninit_mark;
 use crate::l0_flush::L0FlushGlobalState;
@@ -87,7 +88,6 @@ use crate::metrics::{
     remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
     TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
 };
-use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
@@ -461,10 +461,10 @@ impl WalRedoManager {
     /// This method is cancellation-safe.
     pub async fn request_redo(
         &self,
-        key: crate::repository::Key,
+        key: pageserver_api::key::Key,
         lsn: Lsn,
         base_img: Option<(Lsn, bytes::Bytes)>,
-        records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
+        records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
         pg_version: u32,
     ) -> Result<bytes::Bytes, walredo::Error> {
         match self {
@@ -619,6 +619,9 @@ pub enum TimelineArchivalError {
     #[error("Timeout")]
     Timeout,

+    #[error("Cancelled")]
+    Cancelled,
+
     #[error("ancestor is archived: {}", .0)]
     HasArchivedParent(TimelineId),

@@ -637,6 +640,7 @@ impl Debug for TimelineArchivalError {
         match self {
             Self::NotFound => write!(f, "NotFound"),
             Self::Timeout => write!(f, "Timeout"),
+            Self::Cancelled => write!(f, "Cancelled"),
             Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(),
             Self::HasUnarchivedChildren(c) => {
                 f.debug_tuple("HasUnarchivedChildren").field(c).finish()
@@ -1552,6 +1556,7 @@ impl Tenant {
         timeline_id: TimelineId,
         ctx: RequestContext,
     ) -> Result<Arc<Timeline>, TimelineArchivalError> {
+        info!("unoffloading timeline");
         let cancel = self.cancel.clone();
         let timeline_preload = self
             .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel)
@@ -1566,6 +1571,7 @@ impl Tenant {
                 error!(%timeline_id, "index_part not found on remote");
                 return Err(TimelineArchivalError::NotFound);
             }
+            Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled),
             Err(e) => {
                 // Some (possibly ephemeral) error happened during index_part download.
                 warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
@@ -1603,6 +1609,7 @@ impl Tenant {
             if offloaded_timelines.remove(&timeline_id).is_none() {
                 warn!("timeline already removed from offloaded timelines");
             }
+            info!("timeline unoffloading complete");
             Ok(Arc::clone(timeline))
         } else {
             warn!("timeline not available directly after attach");
@@ -1683,6 +1690,21 @@ impl Tenant {
         Ok(())
     }

+    pub fn get_offloaded_timeline(
+        &self,
+        timeline_id: TimelineId,
+    ) -> Result<Arc<OffloadedTimeline>, GetTimelineError> {
+        self.timelines_offloaded
+            .lock()
+            .unwrap()
+            .get(&timeline_id)
+            .map(Arc::clone)
+            .ok_or(GetTimelineError::NotFound {
+                tenant_id: self.tenant_shard_id,
+                timeline_id,
+            })
+    }
+
     pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
         self.tenant_shard_id
     }
@@ -2218,6 +2240,13 @@ impl Tenant {
         }
     }

+    pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
+        let timelines = self.timelines.lock().unwrap();
+        !timelines
+            .iter()
+            .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id))
+    }
+
     pub fn current_state(&self) -> TenantState {
         self.state.borrow().clone()
     }
@@ -4268,7 +4297,8 @@ pub(crate) mod harness {
     use crate::deletion_queue::mock::MockDeletionQueue;
     use crate::l0_flush::L0FlushConfig;
     use crate::walredo::apply_neon;
-    use crate::{repository::Key, walrecord::NeonWalRecord};
+    use pageserver_api::key::Key;
+    use pageserver_api::record::NeonWalRecord;

     use super::*;
     use hex_literal::hex;
@@ -4539,17 +4569,17 @@ mod tests {
     use super::*;
     use crate::keyspace::KeySpaceAccum;
     use crate::pgdatadir_mapping::AuxFilesDirectory;
-    use crate::repository::{Key, Value};
     use crate::tenant::harness::*;
     use crate::tenant::timeline::CompactFlags;
-    use crate::walrecord::NeonWalRecord;
     use crate::DEFAULT_PG_VERSION;
     use bytes::{Bytes, BytesMut};
     use hex_literal::hex;
     use itertools::Itertools;
-    use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
+    use pageserver_api::key::{Key, AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
     use pageserver_api::keyspace::KeySpace;
     use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
+    use pageserver_api::record::NeonWalRecord;
+    use pageserver_api::value::Value;
     use rand::{thread_rng, Rng};
     use storage_layer::PersistentLayerKey;
     use tests::storage_layer::ValuesReconstructState;
```
pageserver/src/tenant/layer_map.rs:

```diff
@@ -48,9 +48,9 @@ mod layer_coverage;

 use crate::context::RequestContext;
 use crate::keyspace::KeyPartitioning;
-use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
 use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
 use std::collections::{HashMap, VecDeque};
```
```diff
@@ -2836,7 +2836,7 @@ where
 }

 use {
-    crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
+    crate::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
     utils::http::error::ApiError,
 };
```
pageserver/src/tenant/storage_layer.rs:

```diff
@@ -11,11 +11,11 @@ pub mod merge_iterator;
 pub mod split_writer;

 use crate::context::{AccessStatsBehavior, RequestContext};
-use crate::repository::Value;
-use crate::walrecord::NeonWalRecord;
 use bytes::Bytes;
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
+use pageserver_api::record::NeonWalRecord;
+use pageserver_api::value::Value;
 use std::cmp::{Ordering, Reverse};
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
```
pageserver/src/tenant/storage_layer/delta_layer.rs:

```diff
@@ -30,7 +30,6 @@
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::{self, FileId, PAGE_SZ};
-use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
 use crate::tenant::disk_btree::{
@@ -45,7 +44,7 @@ use crate::tenant::vectored_blob_io::{
 use crate::tenant::PageReconstructError;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
-use crate::{walrecord, TEMP_FILE_SUFFIX};
+use crate::TEMP_FILE_SUFFIX;
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::BytesMut;
@@ -54,9 +53,11 @@ use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::key::DBDIR_KEY;
+use pageserver_api::key::{Key, KEY_SIZE};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
+use pageserver_api::value::Value;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::collections::VecDeque;
@@ -1294,7 +1295,7 @@ impl DeltaLayerInner {
             // is it an image or will_init walrecord?
             // FIXME: this could be handled by threading the BlobRef to the
             // VectoredReadBuilder
-            let will_init = crate::repository::ValueBytes::will_init(&data)
+            let will_init = pageserver_api::value::ValueBytes::will_init(&data)
                 .inspect_err(|_e| {
                     #[cfg(feature = "testing")]
                     tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
@@ -1357,7 +1358,7 @@ impl DeltaLayerInner {
                     format!(" img {} bytes", img.len())
                 }
                 Value::WalRecord(rec) => {
-                    let wal_desc = walrecord::describe_wal_record(&rec)?;
+                    let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
                     format!(
                         " rec {} bytes will_init: {} {}",
                         buf.len(),
@@ -1602,7 +1603,6 @@ pub(crate) mod test {
     use rand::RngCore;

     use super::*;
-    use crate::repository::Value;
     use crate::tenant::harness::TIMELINE_ID;
     use crate::tenant::storage_layer::{Layer, ResidentLayer};
     use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
@@ -1614,6 +1614,7 @@ pub(crate) mod test {
         DEFAULT_PG_VERSION,
     };
     use bytes::Bytes;
+    use pageserver_api::value::Value;

     /// Construct an index for a fictional delta layer and then
     /// traverse in order to plan vectored reads for a query. Finally,
@@ -1966,8 +1967,8 @@ pub(crate) mod test {

     #[tokio::test]
     async fn copy_delta_prefix_smoke() {
-        use crate::walrecord::NeonWalRecord;
         use bytes::Bytes;
+        use pageserver_api::record::NeonWalRecord;

         let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
             .await
```
pageserver/src/tenant/storage_layer/filter_iterator.rs:

```diff
@@ -7,7 +7,7 @@ use pageserver_api::{
 };
 use utils::lsn::Lsn;

-use crate::repository::Value;
+use pageserver_api::value::Value;

 use super::merge_iterator::MergeIterator;

@@ -121,8 +121,8 @@

     #[tokio::test]
     async fn filter_keyspace_iterator() {
-        use crate::repository::Value;
         use bytes::Bytes;
+        use pageserver_api::value::Value;

         let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
             .await
```
pageserver/src/tenant/storage_layer/image_layer.rs:

```diff
@@ -28,7 +28,6 @@
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 use crate::page_cache::{self, FileId, PAGE_SZ};
-use crate::repository::{Key, Value, KEY_SIZE};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::{BlockBuf, FileBlockReader};
 use crate::tenant::disk_btree::{
@@ -50,8 +49,10 @@ use hex;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
 use pageserver_api::key::DBDIR_KEY;
+use pageserver_api::key::{Key, KEY_SIZE};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
+use pageserver_api::value::Value;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::collections::VecDeque;
```
```diff
@@ -1093,6 +1094,7 @@ mod test {
     use pageserver_api::{
         key::Key,
         shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
+        value::Value,
     };
     use utils::{
         generation::Generation,
@@ -1102,7 +1104,6 @@ mod test {

     use crate::{
         context::RequestContext,
-        repository::Value,
         tenant::{
             config::TenantConf,
            harness::{TenantHarness, TIMELINE_ID},
```
pageserver/src/tenant/storage_layer/inmemory_layer.rs:

```diff
@@ -7,7 +7,6 @@
 use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
-use crate::repository::{Key, Value};
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::timeline::GetVectoredError;
 use crate::tenant::PageReconstructError;
@@ -17,9 +16,11 @@ use anyhow::{anyhow, Context, Result};
 use bytes::Bytes;
 use camino::Utf8PathBuf;
 use pageserver_api::key::CompactKey;
+use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
+use pageserver_api::value::Value;
 use std::collections::{BTreeMap, HashMap};
 use std::sync::{Arc, OnceLock};
 use std::time::Instant;
@@ -392,10 +393,6 @@ impl InMemoryLayer {
         self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
     }

-    pub(crate) fn start_lsn(&self) -> Lsn {
-        self.start_lsn
-    }
-
     pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
         self.start_lsn..self.end_lsn_or_max()
     }
```
```diff
@@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() {
 /// Also checks that the same does not happen on a non-evicted layer (regression test).
 #[tokio::test(start_paused = true)]
 async fn eviction_cancellation_on_drop() {
-    use crate::repository::Value;
     use bytes::Bytes;
+    use pageserver_api::value::Value;

     // this is the runtime on which Layer spawns the blocking tasks on
     let handle = tokio::runtime::Handle::current();
@@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() {
         let mut writer = timeline.writer().await;
         writer
             .put(
-                crate::repository::Key::from_i128(5),
+                pageserver_api::key::Key::from_i128(5),
                 Lsn(0x20),
                 &Value::Image(Bytes::from_static(b"this does not matter either")),
                 &ctx,
```
```diff
@@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId;
 use std::ops::Range;
 use utils::{id::TimelineId, lsn::Lsn};

-use crate::repository::Key;
+use pageserver_api::key::Key;

 use super::{DeltaLayerName, ImageLayerName, LayerName};
```
```diff
@@ -1,7 +1,7 @@
 //!
 //! Helper functions for dealing with filenames of the image and delta layer files.
 //!
-use crate::repository::Key;
+use pageserver_api::key::Key;
 use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::fmt;
```
pageserver/src/tenant/storage_layer/merge_iterator.rs:

```diff
@@ -7,7 +7,8 @@ use anyhow::bail;
 use pageserver_api::key::Key;
 use utils::lsn::Lsn;

-use crate::{context::RequestContext, repository::Value};
+use crate::context::RequestContext;
+use pageserver_api::value::Value;

 use super::{
     delta_layer::{DeltaLayerInner, DeltaLayerIterator},
@@ -293,9 +294,9 @@ mod tests {
         harness::{TenantHarness, TIMELINE_ID},
         storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
     },
-    walrecord::NeonWalRecord,
     DEFAULT_PG_VERSION,
 };
+use pageserver_api::record::NeonWalRecord;

 async fn assert_merge_iter_equal(
     merge_iter: &mut MergeIterator<'_>,
@@ -319,8 +320,8 @@ mod tests {

     #[tokio::test]
     async fn merge_in_between() {
-        use crate::repository::Value;
         use bytes::Bytes;
+        use pageserver_api::value::Value;

         let harness = TenantHarness::create("merge_iterator_merge_in_between")
             .await
@@ -384,8 +385,8 @@ mod tests {

     #[tokio::test]
     async fn delta_merge() {
-        use crate::repository::Value;
         use bytes::Bytes;
+        use pageserver_api::value::Value;

         let harness = TenantHarness::create("merge_iterator_delta_merge")
             .await
@@ -460,8 +461,8 @@ mod tests {

     #[tokio::test]
     async fn delta_image_mixed_merge() {
-        use crate::repository::Value;
         use bytes::Bytes;
+        use pageserver_api::value::Value;

         let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
             .await
```
pageserver/src/tenant/storage_layer/split_writer.rs:

```diff
@@ -5,7 +5,8 @@ use pageserver_api::key::{Key, KEY_SIZE};
 use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};

 use crate::tenant::storage_layer::Layer;
-use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
+use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
+use pageserver_api::value::Value;

 use super::layer::S3_UPLOAD_LIMIT;
 use super::{
```
pageserver/src/tenant/timeline.rs:

```diff
@@ -125,11 +125,12 @@ use utils::{
     simple_rcu::{Rcu, RcuReadGuard},
 };

-use crate::repository::GcResult;
-use crate::repository::{Key, Value};
+use crate::gc_result::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::ZERO_PAGE;
+use pageserver_api::key::Key;
+use pageserver_api::value::Value;

 use self::delete::DeleteTimelineFlow;
 pub(super) use self::eviction_task::EvictionTaskTenantState;
```
```diff
@@ -5854,17 +5855,15 @@ fn is_send() {
 #[cfg(test)]
 mod tests {
     use pageserver_api::key::Key;
+    use pageserver_api::value::Value;
     use utils::{id::TimelineId, lsn::Lsn};

-    use crate::{
-        repository::Value,
-        tenant::{
-            harness::{test_img, TenantHarness},
-            layer_map::LayerMap,
-            storage_layer::{Layer, LayerName},
-            timeline::{DeltaLayerTestDesc, EvictionError},
-            Timeline,
-        },
+    use crate::tenant::{
+        harness::{test_img, TenantHarness},
+        layer_map::LayerMap,
+        storage_layer::{Layer, LayerName},
+        timeline::{DeltaLayerTestDesc, EvictionError},
+        Timeline,
     };

     #[tokio::test]
```
pageserver/src/tenant/timeline/compaction.rs:

```diff
@@ -48,9 +48,10 @@ use pageserver_api::config::tenant_conf_defaults::{
     DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
 };

-use crate::keyspace::KeySpace;
-use crate::repository::{Key, Value};
-use crate::walrecord::NeonWalRecord;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::KeySpace;
+use pageserver_api::record::NeonWalRecord;
+use pageserver_api::value::Value;

 use utils::lsn::Lsn;

@@ -646,21 +647,6 @@ impl Timeline {
             readable_points.push(*child_lsn);
         }
         readable_points.push(head_lsn);

-        // The Timeline get page process will walk all InMemoryLayers before it starts walking historic
-        // layers. That means it might fail to see image layers that overlap with the LSN range of
-        // InMemoryLayers, so there is a de-facto read point at the start_lsn of the oldest InMemoryLayer.
-        //
-        // This behavior in the getpage path is considered a bug, and including InMemoryLayer's start_lsn here
-        // is a workaround. See https://github.com/neondatabase/neon/issues/9185
-        if let Some(oldest_inmemory_layer) = layer_map.frozen_layers.front() {
-            readable_points.push(oldest_inmemory_layer.start_lsn())
-        } else if let Some(open_layer) = layer_map.open_layer.as_ref() {
-            readable_points.push(open_layer.start_lsn());
-        }
-
         readable_points.sort();

         readable_points
     };

@@ -2143,7 +2129,7 @@ struct ResidentDeltaLayer(ResidentLayer);
 struct ResidentImageLayer(ResidentLayer);

 impl CompactionJobExecutor for TimelineAdaptor {
-    type Key = crate::repository::Key;
+    type Key = pageserver_api::key::Key;

     type Layer = OwnArc<PersistentLayerDesc>;
     type DeltaLayer = ResidentDeltaLayer;
```
pageserver/src/tenant/timeline/offload.rs:

```diff
@@ -19,6 +19,9 @@ pub(crate) async fn offload_timeline(
         return Ok(());
     };

+    // Now that the Timeline is in Stopping state, request all the related tasks to shut down.
+    timeline.shutdown(super::ShutdownMode::Hard).await;
+
     // TODO extend guard mechanism above with method
     // to make deletions possible while offloading is in progress
```
pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs:

```diff
@@ -22,6 +22,7 @@ use tokio::{select, sync::watch, time};
 use tokio_postgres::{replication::ReplicationStream, Client};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, trace, warn, Instrument};
+use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord};

 use super::TaskStateUpdate;
 use crate::{
@@ -31,7 +32,6 @@ use crate::{
     task_mgr::{TaskKind, WALRECEIVER_RUNTIME},
     tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo},
     walingest::WalIngest,
-    walrecord::{decode_wal_record, DecodedWALRecord},
 };
 use postgres_backend::is_expected_io_error;
 use postgres_connection::PgConnectionConfig;
@@ -339,11 +339,15 @@ pub(super) async fn handle_walreceiver_connection(
                         return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
                     }

-                    // Deserialize WAL record
-                    let mut decoded = DecodedWALRecord::default();
-                    decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?;
+                    // Deserialize and interpret WAL record
+                    let interpreted = InterpretedWalRecord::from_bytes(
+                        recdata,
+                        modification.tline.get_shard_identity(),
+                        lsn,
+                        modification.tline.pg_version,
+                    )?;

-                    if decoded.is_dbase_create_copy(timeline.pg_version)
+                    if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
                         && uncommitted_records > 0
                     {
                         // Special case: legacy PG database creations operate by reading pages from a 'template' database:
@@ -360,7 +364,7 @@ pub(super) async fn handle_walreceiver_connection(

                     // Ingest the records without immediately committing them.
                     let ingested = walingest
-                        .ingest_record(decoded, lsn, &mut modification, &ctx)
+                        .ingest_record(interpreted, &mut modification, &ctx)
                         .await
                         .with_context(|| format!("could not ingest record at {lsn}"))?;
                     if !ingested {
```
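The `flush_uncommitted` flag replaces the old `is_dbase_create_copy` probe on the decoded record: the decoder now marks, at interpretation time, records whose ingestion requires the pending uncommitted records to be flushed first. A sketch of the check in isolation (not code from the PR):

```rust
// Sketch: the flush decision the walreceiver now makes.
use wal_decoder::models::FlushUncommittedRecords;

fn needs_flush(flag: &FlushUncommittedRecords, uncommitted_records: u64) -> bool {
    matches!(flag, FlushUncommittedRecords::Yes) && uncommitted_records > 0
}
```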
File diff suppressed because it is too large.
pageserver/src/walredo.rs:

```diff
@@ -29,11 +29,11 @@ use crate::metrics::{
     WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
     WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
 };
-use crate::repository::Key;
-use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Bytes, BytesMut};
+use pageserver_api::key::Key;
 use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::sync::Arc;
@@ -548,9 +548,10 @@ impl PostgresRedoManager {
 #[cfg(test)]
 mod tests {
     use super::PostgresRedoManager;
-    use crate::repository::Key;
-    use crate::{config::PageServerConf, walrecord::NeonWalRecord};
+    use crate::config::PageServerConf;
     use bytes::Bytes;
+    use pageserver_api::key::Key;
+    use pageserver_api::record::NeonWalRecord;
     use pageserver_api::shard::TenantShardId;
     use std::str::FromStr;
     use tracing::Instrument;
```
pageserver/src/walredo/apply_neon.rs:

```diff
@@ -1,9 +1,9 @@
 use crate::pgdatadir_mapping::AuxFilesDirectory;
-use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::{BufMut, BytesMut};
 use pageserver_api::key::Key;
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
@@ -244,7 +244,7 @@ pub(crate) fn apply_in_neon(
             let mut writer = page.writer();
             dir.ser_into(&mut writer)?;
         }
-        #[cfg(test)]
+        #[cfg(feature = "testing")]
         NeonWalRecord::Test {
             append,
             clear,
```
@@ -8,10 +8,10 @@ use crate::{
     metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
     page_cache::PAGE_SZ,
     span::debug_assert_current_span_has_tenant_id,
-    walrecord::NeonWalRecord,
 };
 use anyhow::Context;
 use bytes::Bytes;
+use pageserver_api::record::NeonWalRecord;
 use pageserver_api::{reltag::RelTag, shard::TenantShardId};
 use postgres_ffi::BLCKSZ;
 #[cfg(feature = "testing")]
@@ -42,9 +42,10 @@ hyper0.workspace = true
 hyper = { workspace = true, features = ["server", "http1", "http2"] }
 hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] }
 http-body-util = { version = "0.1" }
-indexmap.workspace = true
+indexmap = { workspace = true, features = ["serde"] }
 ipnet.workspace = true
 itertools.workspace = true
+itoa.workspace = true
 lasso = { workspace = true, features = ["multi-threaded"] }
 measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
@@ -4,21 +4,20 @@ use std::{
     time::{Duration, SystemTime},
 };

-use anyhow::{bail, ensure, Context};
 use arc_swap::ArcSwapOption;
 use dashmap::DashMap;
 use jose_jwk::crypto::KeyInfo;
 use serde::{de::Visitor, Deserialize, Deserializer};
 use signature::Verifier;
+use thiserror::Error;
 use tokio::time::Instant;

 use crate::{
-    context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId,
-    RoleName,
+    auth::backend::ComputeCredentialKeys, context::RequestMonitoring,
+    control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit,
+    intern::RoleNameInt, EndpointId, RoleName,
 };

-use super::ComputeCredentialKeys;
-
 // TODO(conrad): make these configurable.
 const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);
 const MIN_RENEW: Duration = Duration::from_secs(30);
@@ -32,7 +31,16 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static {
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
-    ) -> impl Future<Output = anyhow::Result<Vec<AuthRule>>> + Send;
+    ) -> impl Future<Output = Result<Vec<AuthRule>, FetchAuthRulesError>> + Send;
 }

+#[derive(Error, Debug)]
+pub(crate) enum FetchAuthRulesError {
+    #[error(transparent)]
+    GetEndpointJwks(#[from] GetEndpointJwksError),
+
+    #[error("JWKs settings for this role were not configured")]
+    RoleJwksNotConfigured,
+}
+
 pub(crate) struct AuthRule {
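The FetchAuthRulesError enum introduced above follows the standard thiserror pattern: #[from] derives the From impl that the ? operator relies on, while plain variants carry fixed messages. A minimal self-contained sketch of the same pattern; the names and the I/O example are illustrative, not from the diff:

use thiserror::Error;

#[derive(Error, Debug)]
enum FetchError {
    // `#[from]` derives From<std::io::Error>, so `?` converts automatically.
    #[error(transparent)]
    Io(#[from] std::io::Error),

    // A domain-level failure with a fixed message, constructed explicitly.
    #[error("settings for this role were not configured")]
    NotConfigured,
}

fn read_settings(path: &str) -> Result<String, FetchError> {
    let text = std::fs::read_to_string(path)?; // io::Error -> FetchError::Io
    if text.is_empty() {
        return Err(FetchError::NotConfigured);
    }
    Ok(text)
}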
@@ -122,7 +130,7 @@ impl JwkCacheEntryLock {
         client: &reqwest::Client,
         endpoint: EndpointId,
         auth_rules: &F,
-    ) -> anyhow::Result<Arc<JwkCacheEntry>> {
+    ) -> Result<Arc<JwkCacheEntry>, JwtError> {
         // double check that no one beat us to updating the cache.
         let now = Instant::now();
         let guard = self.cached.load_full();
@@ -188,7 +196,7 @@ impl JwkCacheEntryLock {
         client: &reqwest::Client,
         endpoint: EndpointId,
         fetch: &F,
-    ) -> Result<Arc<JwkCacheEntry>, anyhow::Error> {
+    ) -> Result<Arc<JwkCacheEntry>, JwtError> {
         let now = Instant::now();
         let guard = self.cached.load_full();
@@ -243,27 +251,24 @@ impl JwkCacheEntryLock {
         endpoint: EndpointId,
         role_name: &RoleName,
         fetch: &F,
-    ) -> Result<ComputeCredentialKeys, anyhow::Error> {
+    ) -> Result<ComputeCredentialKeys, JwtError> {
         // JWT compact form is defined to be
         // <B64(Header)> || . || <B64(Payload)> || . || <B64(Signature)>
         // where Signature = alg(<B64(Header)> || . || <B64(Payload)>);

         let (header_payload, signature) = jwt
             .rsplit_once('.')
-            .context("Provided authentication token is not a valid JWT encoding")?;
+            .ok_or(JwtEncodingError::InvalidCompactForm)?;
         let (header, payload) = header_payload
             .split_once('.')
-            .context("Provided authentication token is not a valid JWT encoding")?;
+            .ok_or(JwtEncodingError::InvalidCompactForm)?;

-        let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)
-            .context("Provided authentication token is not a valid JWT encoding")?;
-        let header = serde_json::from_slice::<JwtHeader<'_>>(&header)
-            .context("Provided authentication token is not a valid JWT encoding")?;
+        let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?;
+        let header = serde_json::from_slice::<JwtHeader<'_>>(&header)?;

-        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)
-            .context("Provided authentication token is not a valid JWT encoding")?;
+        let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?;

-        let kid = header.key_id.context("missing key id")?;
+        let kid = header.key_id.ok_or(JwtError::MissingKeyId)?;

         let mut guard = self
             .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch)
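The compact-form comment in the hunk above maps directly onto the two split calls: rsplit_once takes the last '.', isolating the signature, and the remaining header.payload string is exactly the byte sequence the signature was computed over. A standalone sketch; the token literal is illustrative:

fn split_jwt(jwt: &str) -> Option<(&str, &str, &str)> {
    // Split on the last '.' first so the signature is isolated cleanly.
    let (header_payload, signature) = jwt.rsplit_once('.')?;
    let (header, payload) = header_payload.split_once('.')?;
    Some((header, payload, signature))
}

fn main() {
    let (h, p, s) = split_jwt("eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIifQ.c2ln").unwrap();
    assert_eq!(h, "eyJhbGciOiJFUzI1NiJ9"); // base64url({"alg":"ES256"})
    assert_eq!(p, "eyJmb28iOiJiYXIifQ");   // base64url({"foo":"bar"})
    assert_eq!(s, "c2ln");                 // base64url("sig")
}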
@@ -281,16 +286,13 @@ impl JwkCacheEntryLock {
                     .renew_jwks(permit, ctx, client, endpoint.clone(), fetch)
                     .await?;
             }
-            _ => {
-                bail!("jwk not found");
-            }
+            _ => return Err(JwtError::JwkNotFound),
         }
     };

-        ensure!(
-            jwk.is_supported(&header.algorithm),
-            "signature algorithm not supported"
-        );
+        if !jwk.is_supported(&header.algorithm) {
+            return Err(JwtError::SignatureAlgorithmNotSupported);
+        }

         match &jwk.key {
             jose_jwk::Key::Ec(key) => {
@@ -299,34 +301,32 @@ impl JwkCacheEntryLock {
             jose_jwk::Key::Rsa(key) => {
                 verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?;
             }
-            key => bail!("unsupported key type {key:?}"),
+            key => return Err(JwtError::UnsupportedKeyType(key.into())),
         };

-        let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)
-            .context("Provided authentication token is not a valid JWT encoding")?;
-        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payloadb)
-            .context("Provided authentication token is not a valid JWT encoding")?;
+        let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?;
+        let payload = serde_json::from_slice::<JwtPayload<'_>>(&payloadb)?;

         tracing::debug!(?payload, "JWT signature valid with claims");

         if let Some(aud) = expected_audience {
-            ensure!(
-                payload.audience.0.iter().any(|s| s == aud),
-                "invalid JWT token audience"
-            );
+            if payload.audience.0.iter().all(|s| s != aud) {
+                return Err(JwtError::InvalidJwtTokenAudience);
+            }
         }

         let now = SystemTime::now();

         if let Some(exp) = payload.expiration {
-            ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired");
+            if now >= exp + CLOCK_SKEW_LEEWAY {
+                return Err(JwtError::JwtTokenHasExpired);
+            }
         }

         if let Some(nbf) = payload.not_before {
-            ensure!(
-                nbf < now + CLOCK_SKEW_LEEWAY,
-                "JWT token is not yet ready to use"
-            );
+            if nbf >= now + CLOCK_SKEW_LEEWAY {
+                return Err(JwtError::JwtTokenNotYetReadyToUse);
+            }
         }

         Ok(ComputeCredentialKeys::JwtPayload(payloadb))
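Both time checks above lean on a single leeway constant to tolerate clock skew between token issuer and verifier: expiry is accepted up to the leeway late, not-before up to the leeway early. A hedged standalone sketch of the same validation logic; the constant value and claim plumbing are illustrative:

use std::time::{Duration, SystemTime};

const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30);

// Mirrors the exp/nbf checks in the diff above.
fn time_claims_ok(now: SystemTime, exp: Option<SystemTime>, nbf: Option<SystemTime>) -> bool {
    if let Some(exp) = exp {
        if now >= exp + CLOCK_SKEW_LEEWAY {
            return false; // expired, even allowing for skew
        }
    }
    if let Some(nbf) = nbf {
        if nbf >= now + CLOCK_SKEW_LEEWAY {
            return false; // not yet valid, even allowing for skew
        }
    }
    true
}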
@@ -341,7 +341,7 @@ impl JwkCache {
         role_name: &RoleName,
         fetch: &F,
         jwt: &str,
-    ) -> Result<ComputeCredentialKeys, anyhow::Error> {
+    ) -> Result<ComputeCredentialKeys, JwtError> {
         // try with just a read lock first
         let key = (endpoint.clone(), role_name.clone());
         let entry = self.map.get(&key).as_deref().map(Arc::clone);
@@ -357,19 +357,18 @@ impl JwkCache {
     }
 }

-fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> {
+fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> {
     use ecdsa::Signature;
     use signature::Verifier;

     match key.crv {
         jose_jwk::EcCurves::P256 => {
-            let pk =
-                p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?;
+            let pk = p256::PublicKey::try_from(key).map_err(JwtError::InvalidP256Key)?;
             let key = p256::ecdsa::VerifyingKey::from(&pk);
             let sig = Signature::from_slice(sig)?;
             key.verify(data, &sig)?;
         }
-        key => bail!("unsupported ec key type {key:?}"),
+        key => return Err(JwtError::UnsupportedEcKeyType(key)),
     }

     Ok(())
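For reference, the P-256 branch above reduces to JWK to PublicKey to VerifyingKey, plus a fixed-size signature parse. A hedged sketch with the p256 and signature crates; the key pair here is freshly generated instead of parsed from a JWK:

use p256::ecdsa::{Signature, SigningKey, VerifyingKey};
use rand::rngs::OsRng;
use signature::{Signer, Verifier};

fn main() {
    // Illustrative key material, not a JWK fetched from a cache.
    let sk = SigningKey::random(&mut OsRng);
    let vk = VerifyingKey::from(&sk);

    let data = b"header.payload";
    let sig: Signature = sk.sign(data);

    // On the verify side, the raw signature bytes arrive off the wire:
    let bytes = sig.to_bytes();
    let parsed = Signature::from_slice(bytes.as_slice()).unwrap();
    assert!(vk.verify(data, &parsed).is_ok());
}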
@@ -380,14 +379,14 @@ fn verify_rsa_signature(
     sig: &[u8],
     key: &jose_jwk::Rsa,
     alg: &jose_jwa::Algorithm,
-) -> anyhow::Result<()> {
+) -> Result<(), JwtError> {
     use jose_jwa::{Algorithm, Signing};
     use rsa::{
         pkcs1v15::{Signature, VerifyingKey},
         RsaPublicKey,
     };

-    let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?;
+    let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?;

     match alg {
         Algorithm::Signing(Signing::Rs256) => {
@@ -395,7 +394,7 @@ fn verify_rsa_signature(
             let sig = Signature::try_from(sig)?;
             key.verify(data, &sig)?;
         }
-        _ => bail!("invalid RSA signing algorithm"),
+        _ => return Err(JwtError::InvalidRsaSigningAlgorithm),
     };

     Ok(())
@@ -561,6 +560,99 @@ impl Drop for JwkRenewalPermit<'_> {
     }
 }

+#[derive(Error, Debug)]
+#[non_exhaustive]
+pub(crate) enum JwtError {
+    #[error("jwk not found")]
+    JwkNotFound,
+
+    #[error("missing key id")]
+    MissingKeyId,
+
+    #[error("Provided authentication token is not a valid JWT encoding")]
+    JwtEncoding(#[from] JwtEncodingError),
+
+    #[error("invalid JWT token audience")]
+    InvalidJwtTokenAudience,
+
+    #[error("JWT token has expired")]
+    JwtTokenHasExpired,
+
+    #[error("JWT token is not yet ready to use")]
+    JwtTokenNotYetReadyToUse,
+
+    #[error("invalid P256 key")]
+    InvalidP256Key(jose_jwk::crypto::Error),
+
+    #[error("invalid RSA key")]
+    InvalidRsaKey(jose_jwk::crypto::Error),
+
+    #[error("invalid RSA signing algorithm")]
+    InvalidRsaSigningAlgorithm,
+
+    #[error("unsupported EC key type {0:?}")]
+    UnsupportedEcKeyType(jose_jwk::EcCurves),
+
+    #[error("unsupported key type {0:?}")]
+    UnsupportedKeyType(KeyType),
+
+    #[error("signature algorithm not supported")]
+    SignatureAlgorithmNotSupported,
+
+    #[error("signature error: {0}")]
+    Signature(#[from] signature::Error),
+
+    #[error("failed to fetch auth rules: {0}")]
+    FetchAuthRules(#[from] FetchAuthRulesError),
+}
+
+impl From<base64::DecodeError> for JwtError {
+    fn from(err: base64::DecodeError) -> Self {
+        JwtEncodingError::Base64Decode(err).into()
+    }
+}
+
+impl From<serde_json::Error> for JwtError {
+    fn from(err: serde_json::Error) -> Self {
+        JwtEncodingError::SerdeJson(err).into()
+    }
+}
+
+#[derive(Error, Debug)]
+#[non_exhaustive]
+pub enum JwtEncodingError {
+    #[error(transparent)]
+    Base64Decode(#[from] base64::DecodeError),
+
+    #[error(transparent)]
+    SerdeJson(#[from] serde_json::Error),
+
+    #[error("invalid compact form")]
+    InvalidCompactForm,
+}
+
+#[allow(dead_code, reason = "Debug use only")]
+#[derive(Debug)]
+pub(crate) enum KeyType {
+    Ec(jose_jwk::EcCurves),
+    Rsa,
+    Oct,
+    Okp(jose_jwk::OkpCurves),
+    Unknown,
+}
+
+impl From<&jose_jwk::Key> for KeyType {
+    fn from(key: &jose_jwk::Key) -> Self {
+        match key {
+            jose_jwk::Key::Ec(ec) => Self::Ec(ec.crv),
+            jose_jwk::Key::Rsa(_rsa) => Self::Rsa,
+            jose_jwk::Key::Oct(_oct) => Self::Oct,
+            jose_jwk::Key::Okp(okp) => Self::Okp(okp.crv),
+            _ => Self::Unknown,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::RoleName;
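The two manual From impls above exist so that ? on a base64 or serde_json error lands on the nested JwtEncoding variant, instead of needing a separate top-level variant per source error. A compact sketch of that two-layer routing; the types are illustrative:

use thiserror::Error;

#[derive(Error, Debug)]
enum Outer {
    #[error("bad encoding")]
    Encoding(#[from] Inner),
}

#[derive(Error, Debug)]
enum Inner {
    #[error(transparent)]
    Parse(#[from] std::num::ParseIntError),
}

// Route the leaf error through the nested enum so `?` still works
// in functions returning Result<_, Outer>.
impl From<std::num::ParseIntError> for Outer {
    fn from(err: std::num::ParseIntError) -> Self {
        Inner::Parse(err).into()
    }
}

fn parse(s: &str) -> Result<i64, Outer> {
    Ok(s.parse::<i64>()?) // ParseIntError -> Inner -> Outer
}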
@@ -758,7 +850,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL
         &self,
         _ctx: &RequestMonitoring,
         _endpoint: EndpointId,
-    ) -> anyhow::Result<Vec<AuthRule>> {
+    ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
         Ok(vec![
             AuthRule {
                 id: "foo".to_owned(),
@@ -1,9 +1,9 @@
 use std::net::SocketAddr;

-use anyhow::Context;
 use arc_swap::ArcSwapOption;

 use crate::{
+    auth::backend::jwt::FetchAuthRulesError,
     compute::ConnCfg,
     context::RequestMonitoring,
     control_plane::{
@@ -53,11 +53,11 @@ impl FetchAuthRules for StaticAuthRules {
         &self,
         _ctx: &RequestMonitoring,
         _endpoint: EndpointId,
-    ) -> anyhow::Result<Vec<AuthRule>> {
+    ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
         let mappings = JWKS_ROLE_MAP.load();
         let role_mappings = mappings
             .as_deref()
-            .context("JWKs settings for this role were not configured")?;
+            .ok_or(FetchAuthRulesError::RoleJwksNotConfigured)?;
         let mut rules = vec![];
         for setting in &role_mappings.jwks {
             rules.push(AuthRule {
@@ -561,7 +561,8 @@ mod tests {
         &self,
         _ctx: &RequestMonitoring,
         _endpoint: crate::EndpointId,
-    ) -> anyhow::Result<Vec<super::jwt::AuthRule>> {
+    ) -> Result<Vec<super::jwt::AuthRule>, control_plane::errors::GetEndpointJwksError>
+    {
         unimplemented!()
     }
@@ -5,7 +5,8 @@ use super::{
     AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
 };
 use crate::{
-    auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName,
+    auth::backend::jwt::AuthRule, context::RequestMonitoring,
+    control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName,
 };
 use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
 use crate::{auth::IpPattern, cache::Cached};
@@ -120,7 +121,10 @@ impl Api {
         })
     }

-    async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result<Vec<AuthRule>> {
+    async fn do_get_endpoint_jwks(
+        &self,
+        endpoint: EndpointId,
+    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         let (client, connection) =
             tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
@@ -224,7 +228,7 @@ impl super::Api for Api {
         &self,
         _ctx: &RequestMonitoring,
         endpoint: EndpointId,
-    ) -> anyhow::Result<Vec<AuthRule>> {
+    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         self.do_get_endpoint_jwks(endpoint).await
     }
@@ -6,7 +6,7 @@ use super::messages::{ControlPlaneError, MetricsAuxInfo};
 use crate::{
     auth::{
         backend::{
-            jwt::{AuthRule, FetchAuthRules},
+            jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError},
             ComputeCredentialKeys, ComputeUserInfo,
         },
         IpPattern,
@@ -44,7 +44,7 @@ pub(crate) mod errors {
     pub(crate) enum ApiError {
         /// Error returned by the console itself.
         #[error("{REQUEST_FAILED} with {0}")]
-        ControlPlane(ControlPlaneError),
+        ControlPlane(Box<ControlPlaneError>),

         /// Various IO errors like broken pipe or malformed payload.
         #[error("{REQUEST_FAILED}: {0}")]
@@ -90,7 +90,7 @@ pub(crate) mod errors {
                 Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
                 Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
                 Reason::RunningOperations => ErrorKind::ControlPlane,
-                Reason::Unknown => match &e {
+                Reason::Unknown => match &**e {
                     ControlPlaneError {
                         http_status_code:
                             http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE,
@@ -246,6 +246,33 @@ pub(crate) mod errors {
             }
         }
     }
+
+    #[derive(Debug, Error)]
+    pub enum GetEndpointJwksError {
+        #[error("endpoint not found")]
+        EndpointNotFound,
+
+        #[error("failed to build control plane request: {0}")]
+        RequestBuild(#[source] reqwest::Error),
+
+        #[error("failed to send control plane request: {0}")]
+        RequestExecute(#[source] reqwest_middleware::Error),
+
+        #[error(transparent)]
+        ControlPlane(#[from] ApiError),
+
+        #[cfg(any(test, feature = "testing"))]
+        #[error(transparent)]
+        TokioPostgres(#[from] tokio_postgres::Error),
+
+        #[cfg(any(test, feature = "testing"))]
+        #[error(transparent)]
+        ParseUrl(#[from] url::ParseError),
+
+        #[cfg(any(test, feature = "testing"))]
+        #[error(transparent)]
+        TaskJoin(#[from] tokio::task::JoinError),
+    }
 }

 /// Auth secret which is managed by the cloud.
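Boxing the ControlPlane payload keeps ApiError itself small: a Rust enum is sized by its largest variant, so one fat variant inflates every Result that carries the error. A sketch of the effect; the struct and the size bound are illustrative:

// A large payload inflates every Result<T, E> if stored inline.
struct BigPayload {
    _data: [u8; 256],
}

enum SlimError {
    Big(Box<BigPayload>), // one pointer instead of 256 inline bytes
    Io(std::io::ErrorKind),
}

fn main() {
    // The boxed variant keeps the enum around pointer size plus discriminant.
    assert!(std::mem::size_of::<SlimError>() <= 16);
}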
@@ -342,7 +369,7 @@ pub(crate) trait Api {
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
-    ) -> anyhow::Result<Vec<AuthRule>>;
+    ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError>;

     /// Wake up the compute node and return the corresponding connection info.
     async fn wake_compute(
@@ -401,7 +428,7 @@ impl Api for ControlPlaneBackend {
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
-    ) -> anyhow::Result<Vec<AuthRule>> {
+    ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> {
         match self {
             Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await,
             #[cfg(any(test, feature = "testing"))]
@@ -583,7 +610,9 @@ impl FetchAuthRules for ControlPlaneBackend {
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
-    ) -> anyhow::Result<Vec<AuthRule>> {
-        self.get_endpoint_jwks(ctx, endpoint).await
+    ) -> Result<Vec<AuthRule>, FetchAuthRulesError> {
+        self.get_endpoint_jwks(ctx, endpoint)
+            .await
+            .map_err(FetchAuthRulesError::GetEndpointJwks)
     }
 }
@@ -9,7 +9,10 @@ use super::{
 use crate::{
     auth::backend::{jwt::AuthRule, ComputeUserInfo},
     compute,
-    control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason},
+    control_plane::{
+        errors::GetEndpointJwksError,
+        messages::{ColdStartInfo, EndpointJwksResponse, Reason},
+    },
     http,
     metrics::{CacheOutcome, Metrics},
     rate_limiter::WakeComputeRateLimiter,
@@ -17,7 +20,6 @@ use crate::{
 };
 use crate::{cache::Cached, context::RequestMonitoring};
 use ::http::{header::AUTHORIZATION, HeaderName};
-use anyhow::bail;
 use futures::TryFutureExt;
 use std::{sync::Arc, time::Duration};
 use tokio::time::Instant;
@@ -137,14 +139,14 @@ impl Api {
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
-    ) -> anyhow::Result<Vec<AuthRule>> {
+    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         if !self
             .caches
             .endpoints_cache
             .is_valid(ctx, &endpoint.normalize())
             .await
         {
-            bail!("endpoint not found");
+            return Err(GetEndpointJwksError::EndpointNotFound);
         }
         let request_id = ctx.session_id().to_string();
         async {
@@ -159,12 +161,17 @@ impl Api {
             .header(X_REQUEST_ID, &request_id)
             .header(AUTHORIZATION, format!("Bearer {}", &self.jwt))
             .query(&[("session_id", ctx.session_id())])
-            .build()?;
+            .build()
+            .map_err(GetEndpointJwksError::RequestBuild)?;

         info!(url = request.url().as_str(), "sending http request");
         let start = Instant::now();
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane);
-        let response = self.endpoint.execute(request).await?;
+        let response = self
+            .endpoint
+            .execute(request)
+            .await
+            .map_err(GetEndpointJwksError::RequestExecute)?;
         drop(pause);
         info!(duration = ?start.elapsed(), "received http response");
@@ -330,7 +337,7 @@ impl super::Api for Api {
         &self,
         ctx: &RequestMonitoring,
         endpoint: EndpointId,
-    ) -> anyhow::Result<Vec<AuthRule>> {
+    ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
         self.do_get_endpoint_jwks(ctx, endpoint).await
     }
@@ -348,7 +355,7 @@ impl super::Api for Api {
         let (cached, info) = cached.take_value();
         let info = info.map_err(|c| {
             info!(key = &*key, "found cached wake_compute error");
-            WakeComputeError::ApiError(ApiError::ControlPlane(*c))
+            WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c)))
         })?;

         debug!(key = &*key, "found cached compute node info");
@@ -418,7 +425,7 @@ impl super::Api for Api {

         self.caches.node_info.insert_ttl(
             key,
-            Err(Box::new(err.clone())),
+            Err(err.clone()),
             Duration::from_secs(30),
         );
@@ -457,7 +464,7 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
     body.http_status_code = status;

     warn!("console responded with an error ({status}): {body:?}");
-    Err(ApiError::ControlPlane(body))
+    Err(ApiError::ControlPlane(Box::new(body)))
 }

 fn parse_host_port(input: &str) -> Option<(&str, u16)> {
@@ -492,30 +492,32 @@ impl TestBackend for TestConnectMechanism {
         match action {
             ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)),
             ConnectAction::WakeFail => {
-                let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError {
-                    http_status_code: StatusCode::BAD_REQUEST,
-                    error: "TEST".into(),
-                    status: None,
-                });
+                let err =
+                    control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError {
+                        http_status_code: StatusCode::BAD_REQUEST,
+                        error: "TEST".into(),
+                        status: None,
+                    }));
                 assert!(!err.could_retry());
                 Err(control_plane::errors::WakeComputeError::ApiError(err))
             }
             ConnectAction::WakeRetry => {
-                let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError {
-                    http_status_code: StatusCode::BAD_REQUEST,
-                    error: "TEST".into(),
-                    status: Some(Status {
-                        code: "error".into(),
-                        message: "error".into(),
-                        details: Details {
-                            error_info: None,
-                            retry_info: Some(control_plane::messages::RetryInfo {
-                                retry_delay_ms: 1,
-                            }),
-                            user_facing_message: None,
-                        },
-                    }),
-                });
+                let err =
+                    control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError {
+                        http_status_code: StatusCode::BAD_REQUEST,
+                        error: "TEST".into(),
+                        status: Some(Status {
+                            code: "error".into(),
+                            message: "error".into(),
+                            details: Details {
+                                error_info: None,
+                                retry_info: Some(control_plane::messages::RetryInfo {
+                                    retry_delay_ms: 1,
+                                }),
+                                user_facing_message: None,
+                            },
+                        }),
+                    }));
                 assert!(err.could_retry());
                 Err(control_plane::errors::WakeComputeError::ApiError(err))
             }
@@ -79,7 +79,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
             Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked,
             Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked,
             Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked,
-            Reason::Unknown => match e {
+            Reason::Unknown => match **e {
                 ControlPlaneError {
                     http_status_code: StatusCode::LOCKED,
                     ref error,
@@ -2,8 +2,9 @@ use std::{io, sync::Arc, time::Duration};

 use async_trait::async_trait;
 use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
+use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey};
+use rand::rngs::OsRng;
 use tokio::net::{lookup_host, TcpStream};
-use tokio_postgres::types::ToSql;
 use tracing::{debug, field::display, info};

 use crate::{
@@ -267,50 +268,58 @@ impl PoolingBackend {
             auth::Backend::Local(local) => local.node_info.clone(),
         };

+        let (key, jwk) = create_random_jwk();
+
         let config = node_info
             .config
             .user(&conn_info.user_info.user)
-            .dbname(&conn_info.dbname);
+            .dbname(&conn_info.dbname)
+            .options(&format!(
+                "-c pg_session_jwt.jwk={}",
+                serde_json::to_string(&jwk).expect("serializing jwk to json should not fail")
+            ));

         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
         let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
         drop(pause);

-        tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
+        let pid = client.get_process_id();
+        tracing::Span::current().record("pid", pid);

-        let handle = local_conn_pool::poll_client(
+        let mut handle = local_conn_pool::poll_client(
             self.local_pool.clone(),
             ctx,
             conn_info,
             client,
             connection,
+            key,
             conn_id,
             node_info.aux.clone(),
         );

-        let kid = handle.get_client().get_process_id() as i64;
-        let jwk = p256::PublicKey::from(handle.key().verifying_key()).to_jwk();
-
-        debug!(kid, ?jwk, "setting up backend session state");
-
-        // initiates the auth session
-        handle
-            .get_client()
-            .query(
-                "select auth.init($1, $2);",
-                &[
-                    &kid as &(dyn ToSql + Sync),
-                    &tokio_postgres::types::Json(jwk),
-                ],
-            )
-            .await?;
-
-        info!(?kid, "backend session state init");
+        {
+            let (client, mut discard) = handle.inner();
+            debug!("setting up backend session state");
+
+            // initiates the auth session
+            if let Err(e) = client.query("select auth.init()", &[]).await {
+                discard.discard();
+                return Err(e.into());
+            }
+
+            info!("backend session state initialized");
+        }

         Ok(handle)
     }
 }

+fn create_random_jwk() -> (SigningKey, JwkEcKey) {
+    let key = SigningKey::random(&mut OsRng);
+    let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk();
+    (key, jwk)
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum HttpConnError {
     #[error("pooled connection closed at inconsistent state")]
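Passing the JWK through startup options piggybacks on the ordinary libpq mechanism for per-session settings: '-c name=value' becomes a GUC visible to the extension for that connection only. A hedged sketch with tokio_postgres against an assumed reachable local server; the GUC name and connection details are illustrative:

use tokio_postgres::{Config, NoTls};

#[tokio::main]
async fn main() -> Result<(), tokio_postgres::Error> {
    let mut config = Config::new();
    config
        .host("localhost")
        .user("postgres")
        .dbname("postgres")
        // '-c name=value' sets the GUC for this session only.
        .options("-c my_extension.setting=hello");

    let (client, connection) = config.connect(NoTls).await?;
    tokio::spawn(connection);

    let row = client.query_one("show my_extension.setting", &[]).await?;
    assert_eq!(row.get::<_, String>(0), "hello");
    Ok(())
}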
@@ -1,9 +1,9 @@
 use futures::{future::poll_fn, Future};
+use indexmap::IndexMap;
 use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
 use p256::ecdsa::{Signature, SigningKey};
 use parking_lot::RwLock;
-use rand::rngs::OsRng;
-use serde_json::Value;
+use serde_json::value::RawValue;
 use signature::Signer;
 use std::task::{ready, Poll};
 use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration};
@@ -12,14 +12,13 @@ use tokio_postgres::tls::NoTlsStream;
 use tokio_postgres::types::ToSql;
 use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
 use tokio_util::sync::CancellationToken;
-use typed_json::json;

 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;
 use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
 use crate::{context::RequestMonitoring, DbName, RoleName};

-use tracing::{debug, error, warn, Span};
+use tracing::{error, warn, Span};
 use tracing::{info, info_span, Instrument};

 use super::backend::HttpConnError;
@@ -245,12 +244,14 @@ impl<C: ClientInnerExt> LocalConnPool<C> {
     }
 }

+#[allow(clippy::too_many_arguments)]
 pub(crate) fn poll_client(
     global_pool: Arc<LocalConnPool<tokio_postgres::Client>>,
     ctx: &RequestMonitoring,
     conn_info: ConnInfo,
     client: tokio_postgres::Client,
     mut connection: tokio_postgres::Connection<Socket, NoTlsStream>,
+    key: SigningKey,
     conn_id: uuid::Uuid,
     aux: MetricsAuxInfo,
 ) -> LocalClient<tokio_postgres::Client> {
@@ -346,8 +347,6 @@ pub(crate) fn poll_client(
     }
     .instrument(span));

-    let key = SigningKey::random(&mut OsRng);
-
     let inner = ClientInner {
         inner: client,
         session: tx,
@@ -430,13 +429,6 @@ impl<C: ClientInnerExt> LocalClient<C> {
         let inner = inner.as_mut().expect("client inner should not be removed");
         (&mut inner.inner, Discard { conn_info, pool })
     }
-
-    pub(crate) fn key(&self) -> &SigningKey {
-        let inner = &self
-            .inner
-            .as_ref()
-            .expect("client inner should not be removed");
-        &inner.key
-    }
 }
 impl LocalClient<tokio_postgres::Client> {
@@ -445,25 +437,9 @@ impl LocalClient<tokio_postgres::Client> {
             .inner
             .as_mut()
             .expect("client inner should not be removed");

         inner.jti += 1;

-        let kid = inner.inner.get_process_id();
-        let header = json!({"kid":kid}).to_string();
-
-        let mut payload = serde_json::from_slice::<serde_json::Map<String, Value>>(payload)
-            .map_err(HttpConnError::JwtPayloadError)?;
-        payload.insert("jti".to_string(), Value::Number(inner.jti.into()));
-        let payload = Value::Object(payload).to_string();
-
-        debug!(
-            kid,
-            jti = inner.jti,
-            ?header,
-            ?payload,
-            "signing new ephemeral JWT"
-        );
-
-        let token = sign_jwt(&inner.key, header, payload);
+        let token = resign_jwt(&inner.key, payload, inner.jti)?;

         // initiates the auth session
         inner.inner.simple_query("discard all").await?;
@@ -475,20 +451,74 @@ impl LocalClient<tokio_postgres::Client> {
         )
         .await?;

-        info!(kid, jti = inner.jti, "user session state init");
+        let pid = inner.inner.get_process_id();
+        info!(pid, jti = inner.jti, "user session state init");

         Ok(())
     }
 }

-fn sign_jwt(sk: &SigningKey, header: String, payload: String) -> String {
-    let header = Base64UrlUnpadded::encode_string(header.as_bytes());
-    let payload = Base64UrlUnpadded::encode_string(payload.as_bytes());
-
-    let message = format!("{header}.{payload}");
-    let sig: Signature = sk.sign(message.as_bytes());
-    let base64_sig = Base64UrlUnpadded::encode_string(&sig.to_bytes());
-    format!("{message}.{base64_sig}")
-}
+/// implements relatively efficient in-place json object key upserting
+///
+/// only supports top-level keys
+fn upsert_json_object(
+    payload: &[u8],
+    key: &str,
+    value: &RawValue,
+) -> Result<String, serde_json::Error> {
+    let mut payload = serde_json::from_slice::<IndexMap<&str, &RawValue>>(payload)?;
+    payload.insert(key, value);
+    serde_json::to_string(&payload)
+}
+
+fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result<String, HttpConnError> {
+    let mut buffer = itoa::Buffer::new();
+
+    // encode the jti integer to a json rawvalue
+    let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)).unwrap();
+
+    // update the jti in-place
+    let payload =
+        upsert_json_object(payload, "jti", jti).map_err(HttpConnError::JwtPayloadError)?;
+
+    // sign the jwt
+    let token = sign_jwt(sk, payload.as_bytes());
+
+    Ok(token)
+}
+
+fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
+    let header_len = 20;
+    let payload_len = Base64UrlUnpadded::encoded_len(payload);
+    let signature_len = Base64UrlUnpadded::encoded_len(&[0; 64]);
+    let total_len = header_len + payload_len + signature_len + 2;
+
+    let mut jwt = String::with_capacity(total_len);
+    let cap = jwt.capacity();
+
+    // we only need an empty header with the alg specified.
+    // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9"
+    jwt.push_str("eyJhbGciOiJFUzI1NiJ9.");
+
+    // encode the jwt payload in-place
+    base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt);
+
+    // create the signature from the encoded header || payload
+    let sig: Signature = sk.sign(jwt.as_bytes());
+
+    jwt.push('.');
+
+    // encode the jwt signature in-place
+    base64::encode_config_buf(sig.to_bytes(), base64::URL_SAFE_NO_PAD, &mut jwt);
+
+    debug_assert_eq!(
+        jwt.len(),
+        total_len,
+        "the jwt len should match our expected len"
+    );
+    debug_assert_eq!(jwt.capacity(), cap, "the jwt capacity should not change");
+
+    jwt
+}

 impl<C: ClientInnerExt> Discard<'_, C> {
@@ -509,14 +539,6 @@ impl<C: ClientInnerExt> Discard<'_, C> {
     }
 }

 impl<C: ClientInnerExt> LocalClient<C> {
-    pub fn get_client(&self) -> &C {
-        &self
-            .inner
-            .as_ref()
-            .expect("client inner should not be removed")
-            .inner
-    }
-
     fn do_drop(&mut self) -> Option<impl FnOnce()> {
         let conn_info = self.conn_info.clone();
         let client = self
@@ -542,3 +564,30 @@ impl<C: ClientInnerExt> Drop for LocalClient<C> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use p256::ecdsa::SigningKey;
+    use typed_json::json;
+
+    use super::resign_jwt;
+
+    #[test]
+    fn jwt_token_snapshot() {
+        let key = SigningKey::from_bytes(&[1; 32].into()).unwrap();
+        let data =
+            json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string();
+
+        let jwt = resign_jwt(&key, data.as_bytes(), 2).unwrap();
+
+        // To validate the JWT, copy the JWT string and paste it into https://jwt.io/.
+        // In the public-key box, paste the following jwk public key
+        // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}`
+
+        // let pub_key = p256::ecdsa::VerifyingKey::from(&key);
+        // let pub_key = p256::PublicKey::from(pub_key);
+        // println!("{}", pub_key.to_jwk_string());
+
+        assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA");
+    }
+}
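The borrowed IndexMap<&str, &RawValue> round trip is what makes resign_jwt cheap: only the top-level key layout is parsed, every value stays an opaque JSON slice, and insertion order is preserved, which is also why the snapshot test's nested "jti" survives untouched. A standalone sketch of the same technique; it assumes serde_json's raw_value feature and indexmap's serde feature, which the Cargo changes in this diff enable:

use indexmap::IndexMap;
use serde_json::value::RawValue;

fn upsert(payload: &[u8], key: &str, value: &RawValue) -> Result<String, serde_json::Error> {
    let mut map = serde_json::from_slice::<IndexMap<&str, &RawValue>>(payload)?;
    map.insert(key, value); // replaces an existing top-level key in place
    serde_json::to_string(&map)
}

fn main() {
    let value = serde_json::from_str::<&RawValue>("2").unwrap();
    // The nested "jti" sits inside an opaque RawValue, so it is untouched.
    let out = upsert(br#"{"jti":"old","nested":{"jti":"keep"}}"#, "jti", value).unwrap();
    assert_eq!(out, r#"{"jti":2,"nested":{"jti":"keep"}}"#);
}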
@@ -583,6 +583,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         log.info(f"Got GC request response code: {res.status_code}")
         self.verbose_error(res)

+    def timeline_offload(
+        self,
+        tenant_id: Union[TenantId, TenantShardId],
+        timeline_id: TimelineId,
+    ):
+        self.is_testing_enabled_or_skip()
+
+        log.info(f"Requesting offload: tenant {tenant_id}, timeline {timeline_id}")
+        res = self.put(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/offload",
+        )
+        log.info(f"Got offload request response code: {res.status_code}")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert res_json is None
+
     def timeline_compact(
         self,
         tenant_id: Union[TenantId, TenantShardId],
@@ -6,6 +6,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.utils import wait_until


 @pytest.mark.parametrize("shard_count", [0, 4])
@@ -114,3 +115,86 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int):
         leaf_timeline_id,
         state=TimelineArchivalState.UNARCHIVED,
     )
+
+
+@pytest.mark.parametrize("manual_offload", [False, True])
+def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool):
+    env = neon_env_builder.init_start()
+    ps_http = env.pageserver.http_client()
+
+    # Turn off gc and compaction loops: we want to issue them manually for better reliability
+    tenant_id, initial_timeline_id = env.create_tenant(
+        conf={
+            "gc_period": "0s",
+            "compaction_period": "0s" if manual_offload else "1s",
+        }
+    )
+
+    # Create two branches and archive them
+    parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id)
+    leaf_timeline_id = env.create_branch(
+        "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
+    )
+
+    ps_http.timeline_archival_config(
+        tenant_id,
+        leaf_timeline_id,
+        state=TimelineArchivalState.ARCHIVED,
+    )
+    leaf_detail = ps_http.timeline_detail(
+        tenant_id,
+        leaf_timeline_id,
+    )
+    assert leaf_detail["is_archived"] is True
+
+    ps_http.timeline_archival_config(
+        tenant_id,
+        parent_timeline_id,
+        state=TimelineArchivalState.ARCHIVED,
+    )
+
+    def timeline_offloaded(timeline_id: TimelineId) -> bool:
+        return (
+            env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*")
+            is not None
+        )
+
+    if manual_offload:
+        with pytest.raises(
+            PageserverApiException,
+            match="timeline has attached children",
+        ):
+            # This only tests the (made for testing only) http handler,
+            # but still demonstrates the constraints we have.
+            ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id)
+
+    def parent_offloaded():
+        if manual_offload:
+            ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id)
+        assert timeline_offloaded(parent_timeline_id)
+
+    def leaf_offloaded():
+        if manual_offload:
+            ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id)
+        assert timeline_offloaded(leaf_timeline_id)
+
+    wait_until(30, 1, leaf_offloaded)
+    wait_until(30, 1, parent_offloaded)
+
+    ps_http.timeline_archival_config(
+        tenant_id,
+        parent_timeline_id,
+        state=TimelineArchivalState.UNARCHIVED,
+    )
+    ps_http.timeline_archival_config(
+        tenant_id,
+        leaf_timeline_id,
+        state=TimelineArchivalState.UNARCHIVED,
+    )
+    leaf_detail = ps_http.timeline_detail(
+        tenant_id,
+        leaf_timeline_id,
+    )
+    assert leaf_detail["is_archived"] is False
+
+    assert not timeline_offloaded(initial_timeline_id)
@@ -46,7 +46,8 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] }
 hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] }
 hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] }
 hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] }
-indexmap = { version = "1", default-features = false, features = ["std"] }
+indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] }
+indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] }
 itertools = { version = "0.12" }
 lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
@@ -101,7 +102,8 @@ either = { version = "1" }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }
 half = { version = "2", default-features = false, features = ["num-traits"] }
 hashbrown = { version = "0.14", features = ["raw"] }
-indexmap = { version = "1", default-features = false, features = ["std"] }
+indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] }
+indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] }
 itertools = { version = "0.12" }
 libc = { version = "0.2", features = ["extra_traits", "use_std"] }
 log = { version = "0.4", default-features = false, features = ["std"] }