mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 09:00:37 +00:00
Compare commits
1 Commits
sasha_dont
...
sk-create-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5cad6090e3 |
10
Cargo.lock
generated
10
Cargo.lock
generated
@@ -3643,7 +3643,6 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"smol_str",
|
||||
"socket2 0.5.3",
|
||||
"sync_wrapper",
|
||||
"task-local-extensions",
|
||||
@@ -4710,15 +4709,6 @@ version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"
|
||||
|
||||
[[package]]
|
||||
name = "smol_str"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.4.9"
|
||||
|
||||
@@ -132,7 +132,6 @@ serde_assert = "0.5.0"
|
||||
sha2 = "0.10.2"
|
||||
signal-hook = "0.3"
|
||||
smallvec = "1.11"
|
||||
smol_str = { version = "0.2.0", features = ["serde"] }
|
||||
socket2 = "0.5"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
|
||||
@@ -387,10 +387,18 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
ARG PG_VERSION
|
||||
ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
|
||||
RUN apt-get update && \
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15") \
|
||||
export TIMESCALEDB_VERSION=2.10.1 \
|
||||
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
|
||||
;; \
|
||||
*) \
|
||||
echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
|
||||
esac && \
|
||||
apt-get update && \
|
||||
apt-get install -y cmake && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \
|
||||
echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
|
||||
echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
|
||||
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
||||
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
|
||||
cd build && \
|
||||
@@ -721,7 +729,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
|
||||
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
|
||||
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/wal2json.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -274,13 +274,7 @@ fn main() -> Result<()> {
|
||||
let mut state = compute.state.lock().unwrap();
|
||||
state.error = Some(format!("{:?}", err));
|
||||
state.status = ComputeStatus::Failed;
|
||||
// Notify others that Postgres failed to start. In case of configuring the
|
||||
// empty compute, it's likely that API handler is still waiting for compute
|
||||
// state change. With this we will notify it that compute is in Failed state,
|
||||
// so control plane will know about it earlier and record proper error instead
|
||||
// of timeout.
|
||||
compute.state_changed.notify_all();
|
||||
drop(state); // unlock
|
||||
drop(state);
|
||||
delay_exit = true;
|
||||
None
|
||||
}
|
||||
|
||||
@@ -22,7 +22,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
use utils::measured_stream::MeasuredReader;
|
||||
|
||||
use remote_storage::{DownloadError, RemotePath};
|
||||
@@ -277,17 +277,6 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
/// Check that compute node has corresponding feature enabled.
|
||||
pub fn has_feature(&self, feature: ComputeFeature) -> bool {
|
||||
let state = self.state.lock().unwrap();
|
||||
|
||||
if let Some(s) = state.pspec.as_ref() {
|
||||
s.spec.features.contains(&feature)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.status = status;
|
||||
@@ -739,12 +728,7 @@ impl ComputeNode {
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
|
||||
config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
|
||||
// temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are reconfiguring:
|
||||
// creating new extensions, roles, etc...
|
||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
@@ -765,10 +749,6 @@ impl ComputeNode {
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
// reset max_cluster_size in config back to original value and reload config
|
||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
let unknown_op = "unknown".to_string();
|
||||
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
|
||||
info!(
|
||||
@@ -829,17 +809,7 @@ impl ComputeNode {
|
||||
|
||||
let config_time = Utc::now();
|
||||
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
// temporarily reset max_cluster_size in config
|
||||
// to avoid the possibility of hitting the limit, while we are applying config:
|
||||
// creating new extensions, roles, etc...
|
||||
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
|
||||
self.pg_reload_conf()?;
|
||||
|
||||
self.apply_config(&compute_state)?;
|
||||
|
||||
config::compute_ctl_temp_override_remove(pgdata_path)?;
|
||||
self.pg_reload_conf()?;
|
||||
}
|
||||
|
||||
let startup_end_time = Utc::now();
|
||||
|
||||
@@ -93,25 +93,5 @@ pub fn write_postgres_conf(
|
||||
writeln!(file, "neon.extension_server_port={}", port)?;
|
||||
}
|
||||
|
||||
// This is essential to keep this line at the end of the file,
|
||||
// because it is intended to override any settings above.
|
||||
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// create file compute_ctl_temp_override.conf in pgdata_dir
|
||||
/// add provided options to this file
|
||||
pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
|
||||
let path = pgdata_path.join("compute_ctl_temp_override.conf");
|
||||
let mut file = File::create(path)?;
|
||||
write!(file, "{}", options)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// remove file compute_ctl_temp_override.conf in pgdata_dir
|
||||
pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
|
||||
let path = pgdata_path.join("compute_ctl_temp_override.conf");
|
||||
std::fs::remove_file(path)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -227,7 +227,7 @@ async fn handle_configure_request(
|
||||
|
||||
let parsed_spec = match ParsedSpec::try_from(spec) {
|
||||
Ok(ps) => ps,
|
||||
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
|
||||
Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
|
||||
};
|
||||
|
||||
// XXX: wrap state update under lock in code blocks. Otherwise,
|
||||
|
||||
@@ -156,17 +156,17 @@ paths:
|
||||
description: Error text or 'OK' if download succeeded.
|
||||
example: "OK"
|
||||
400:
|
||||
description: Request is invalid.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
description: Request is invalid.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: Extension download request failed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
description: Extension download request failed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
|
||||
@@ -118,6 +118,19 @@ pub fn get_spec_from_control_plane(
|
||||
spec
|
||||
}
|
||||
|
||||
/// It takes cluster specification and does the following:
|
||||
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
|
||||
/// - Update `pg_hba.conf` to allow external connections.
|
||||
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||
// always write all config into it creating new file.
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
|
||||
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check `pg_hba.conf` and update if needed to allow external connections.
|
||||
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
// XXX: consider making it a part of spec.json
|
||||
|
||||
@@ -415,7 +415,6 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
None,
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
let last_record_lsn = timeline_info.last_record_lsn;
|
||||
@@ -496,7 +495,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
None,
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
@@ -584,7 +582,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
start_lsn,
|
||||
Some(ancestor_timeline_id),
|
||||
None,
|
||||
None,
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
|
||||
@@ -519,7 +519,6 @@ impl Endpoint {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
features: vec![],
|
||||
cluster: Cluster {
|
||||
cluster_id: None, // project ID: not used
|
||||
name: None, // project name: not used
|
||||
|
||||
@@ -565,7 +565,6 @@ impl PageServerNode {
|
||||
ancestor_start_lsn: Option<Lsn>,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
pg_version: Option<u32>,
|
||||
existing_initdb_timeline_id: Option<TimelineId>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
// If timeline ID was not specified, generate one
|
||||
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
|
||||
@@ -579,7 +578,6 @@ impl PageServerNode {
|
||||
ancestor_start_lsn,
|
||||
ancestor_timeline_id,
|
||||
pg_version,
|
||||
existing_initdb_timeline_id,
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
|
||||
@@ -26,13 +26,6 @@ pub struct ComputeSpec {
|
||||
// but we don't use it for anything. Serde will ignore missing fields when
|
||||
// deserializing it.
|
||||
pub operation_uuid: Option<String>,
|
||||
|
||||
/// Compute features to enable. These feature flags are provided, when we
|
||||
/// know all the details about client's compute, so they cannot be used
|
||||
/// to change `Empty` compute behavior.
|
||||
#[serde(default)]
|
||||
pub features: Vec<ComputeFeature>,
|
||||
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
@@ -75,19 +68,6 @@ pub struct ComputeSpec {
|
||||
pub remote_extensions: Option<RemoteExtSpec>,
|
||||
}
|
||||
|
||||
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
|
||||
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeFeature {
|
||||
// XXX: Add more feature flags here.
|
||||
|
||||
// This is a special feature flag that is used to represent unknown feature flags.
|
||||
// Basically all unknown to enum flags are represented as this one. See unit test
|
||||
// `parse_unknown_features()` for more details.
|
||||
#[serde(other)]
|
||||
UnknownFeature,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct RemoteExtSpec {
|
||||
pub public_extensions: Option<Vec<String>>,
|
||||
@@ -249,10 +229,7 @@ mod tests {
|
||||
#[test]
|
||||
fn parse_spec_file() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
// Features list defaults to empty vector.
|
||||
assert!(spec.features.is_empty());
|
||||
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -264,22 +241,4 @@ mod tests {
|
||||
ob.insert("unknown_field_123123123".into(), "hello".into());
|
||||
let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_unknown_features() {
|
||||
// Test that unknown feature flags do not cause any errors.
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
|
||||
let ob = json.as_object_mut().unwrap();
|
||||
|
||||
// Add unknown feature flags.
|
||||
let features = vec!["foo_bar_feature", "baz_feature"];
|
||||
ob.insert("features".into(), features.into());
|
||||
|
||||
let spec: ComputeSpec = serde_json::from_value(json).unwrap();
|
||||
|
||||
assert!(spec.features.len() == 2);
|
||||
assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
|
||||
assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -140,7 +140,3 @@ impl Key {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_rel_block_key(key: &Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0
|
||||
}
|
||||
|
||||
@@ -179,8 +179,6 @@ pub struct TimelineCreateRequest {
|
||||
#[serde(default)]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde(default)]
|
||||
pub existing_initdb_timeline_id: Option<TimelineId>,
|
||||
#[serde(default)]
|
||||
pub ancestor_start_lsn: Option<Lsn>,
|
||||
pub pg_version: Option<u32>,
|
||||
}
|
||||
@@ -384,9 +382,7 @@ pub struct TimelineInfo {
|
||||
/// The LSN that we are advertizing to safekeepers
|
||||
pub remote_consistent_lsn_visible: Lsn,
|
||||
|
||||
pub current_logical_size: u64,
|
||||
pub current_logical_size_is_accurate: bool,
|
||||
|
||||
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
/// Sum of the size of all layer files.
|
||||
/// If a layer is present in both local FS and S3, it counts only once.
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use std::{ops::RangeInclusive, str::FromStr};
|
||||
|
||||
use crate::key::{is_rel_block_key, Key};
|
||||
use hex::FromHex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror;
|
||||
@@ -303,8 +302,6 @@ pub struct ShardStripeSize(pub u32);
|
||||
pub struct ShardLayout(u8);
|
||||
|
||||
const LAYOUT_V1: ShardLayout = ShardLayout(1);
|
||||
/// ShardIdentity uses a magic layout value to indicate if it is unusable
|
||||
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
|
||||
|
||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
@@ -313,10 +310,10 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
/// to resolve a key to a shard, and then check whether that shard is ==self.
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardIdentity {
|
||||
pub layout: ShardLayout,
|
||||
pub number: ShardNumber,
|
||||
pub count: ShardCount,
|
||||
stripe_size: ShardStripeSize,
|
||||
layout: ShardLayout,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||
@@ -342,22 +339,6 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// A broken instance of this type is only used for `TenantState::Broken` tenants,
|
||||
/// which are constructed in code paths that don't have access to proper configuration.
|
||||
///
|
||||
/// A ShardIdentity in this state may not be used for anything, and should not be persisted.
|
||||
/// Enforcement is via assertions, to avoid making our interface fallible for this
|
||||
/// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
|
||||
/// state, and by extension to avoid trying to do any page->shard resolution.
|
||||
pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
number,
|
||||
count,
|
||||
layout: LAYOUT_BROKEN,
|
||||
stripe_size: DEFAULT_STRIPE_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.number == ShardNumber(0) && self.count == ShardCount(0)
|
||||
}
|
||||
@@ -384,33 +365,6 @@ impl ShardIdentity {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn is_broken(&self) -> bool {
|
||||
self.layout == LAYOUT_BROKEN
|
||||
}
|
||||
|
||||
pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
|
||||
assert!(!self.is_broken());
|
||||
key_to_shard_number(self.count, self.stripe_size, key)
|
||||
}
|
||||
|
||||
/// Return true if the key should be ingested by this shard
|
||||
pub fn is_key_local(&self, key: &Key) -> bool {
|
||||
assert!(!self.is_broken());
|
||||
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
|
||||
true
|
||||
} else {
|
||||
key_to_shard_number(self.count, self.stripe_size, key) == self.number
|
||||
}
|
||||
}
|
||||
|
||||
pub fn shard_slug(&self) -> String {
|
||||
if self.count > ShardCount(0) {
|
||||
format!("-{:02x}{:02x}", self.number.0, self.count.0)
|
||||
} else {
|
||||
String::new()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for ShardIndex {
|
||||
@@ -484,65 +438,6 @@ impl<'de> Deserialize<'de> for ShardIndex {
|
||||
}
|
||||
}
|
||||
|
||||
/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
|
||||
/// in order to be able to serve basebackup requests without peer communication).
|
||||
fn key_is_shard0(key: &Key) -> bool {
|
||||
// To decide what to shard out to shards >0, we apply a simple rule that only
|
||||
// relation pages are distributed to shards other than shard zero. Everything else gets
|
||||
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
|
||||
// requests, and any request other than those for particular blocks in relations.
|
||||
//
|
||||
// In this condition:
|
||||
// - is_rel_block_key includes only relations, i.e. excludes SLRU data and
|
||||
// all metadata.
|
||||
// - field6 is set to -1 for relation size pages.
|
||||
!(is_rel_block_key(key) && key.field6 != 0xffffffff)
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
fn murmurhash32(mut h: u32) -> u32 {
|
||||
h ^= h >> 16;
|
||||
h = h.wrapping_mul(0x85ebca6b);
|
||||
h ^= h >> 13;
|
||||
h = h.wrapping_mul(0xc2b2ae35);
|
||||
h ^= h >> 16;
|
||||
h
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
fn hash_combine(mut a: u32, mut b: u32) -> u32 {
|
||||
b = b.wrapping_add(0x9e3779b9);
|
||||
b = b.wrapping_add(a << 6);
|
||||
b = b.wrapping_add(a >> 2);
|
||||
|
||||
a ^= b;
|
||||
a
|
||||
}
|
||||
|
||||
/// Where a Key is to be distributed across shards, select the shard. This function
|
||||
/// does not account for keys that should be broadcast across shards.
|
||||
///
|
||||
/// The hashing in this function must exactly match what we do in postgres smgr
|
||||
/// code. The resulting distribution of pages is intended to preserve locality within
|
||||
/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
|
||||
/// distributing data pseudo-randomly.
|
||||
///
|
||||
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
|
||||
/// and will be handled at higher levels when shards are split.
|
||||
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
|
||||
// Fast path for un-sharded tenants or broadcast keys
|
||||
if count < ShardCount(2) || key_is_shard0(key) {
|
||||
return ShardNumber(0);
|
||||
}
|
||||
|
||||
// relNode
|
||||
let mut hash = murmurhash32(key.field4);
|
||||
// blockNum/stripe size
|
||||
hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
|
||||
|
||||
ShardNumber((hash % count.0 as u32) as u8)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
@@ -714,29 +609,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// These are only smoke tests to spot check that our implementation doesn't
|
||||
// deviate from a few examples values: not aiming to validate the overall
|
||||
// hashing algorithm.
|
||||
#[test]
|
||||
fn murmur_hash() {
|
||||
assert_eq!(murmurhash32(0), 0);
|
||||
|
||||
assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn shard_mapping() {
|
||||
let key = Key {
|
||||
field1: 0x00,
|
||||
field2: 0x67f,
|
||||
field3: 0x5,
|
||||
field4: 0x400c,
|
||||
field5: 0x00,
|
||||
field6: 0x7d06,
|
||||
};
|
||||
|
||||
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
|
||||
assert_eq!(shard, ShardNumber(8));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -289,10 +289,10 @@ impl FeStartupPacket {
|
||||
// We shouldn't advance `buf` as probably full message is not there yet,
|
||||
// so can't directly use Bytes::get_u32 etc.
|
||||
let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
|
||||
// The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
|
||||
// The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
|
||||
// which is less readable
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(ProtocolError::Protocol(format!(
|
||||
"invalid startup packet message length {}",
|
||||
len
|
||||
@@ -975,10 +975,4 @@ mod tests {
|
||||
let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
|
||||
assert_eq!(split_options(¶ms), ["foo bar", " \\", "baz ", "lol"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_fe_startup_packet_regression() {
|
||||
let data = [0, 0, 0, 7, 0, 0, 0, 0];
|
||||
FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -378,7 +378,7 @@ impl RemoteStorage for S3Bucket {
|
||||
let empty = Vec::new();
|
||||
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
|
||||
|
||||
tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
|
||||
for object in keys {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
|
||||
@@ -152,16 +152,3 @@ impl Debug for Generation {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn generation_gt() {
|
||||
// Important that a None generation compares less than a valid one, during upgrades from
|
||||
// pre-generation systems.
|
||||
assert!(Generation::none() < Generation::new(0));
|
||||
assert!(Generation::none() < Generation::new(1));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -402,11 +402,15 @@ fn start_pageserver(
|
||||
let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
|
||||
let (init_done_tx, init_done_rx) = utils::completion::channel();
|
||||
|
||||
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
|
||||
|
||||
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
|
||||
|
||||
let order = pageserver::InitializationOrder {
|
||||
initial_tenant_load_remote: Some(init_done_tx),
|
||||
initial_tenant_load: Some(init_remote_done_tx),
|
||||
initial_logical_size_can_start: init_done_rx.clone(),
|
||||
initial_logical_size_attempt: Some(init_logical_size_done_tx),
|
||||
background_jobs_can_start: background_jobs_barrier.clone(),
|
||||
};
|
||||
|
||||
@@ -460,7 +464,7 @@ fn start_pageserver(
|
||||
});
|
||||
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: _timeout,
|
||||
timeout_remaining: timeout,
|
||||
skipped: init_load_skipped,
|
||||
} = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
|
||||
|
||||
@@ -468,6 +472,26 @@ fn start_pageserver(
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before initial logical sizes completed")
|
||||
});
|
||||
|
||||
let logical_sizes_done = std::pin::pin!(async {
|
||||
init_logical_size_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_logical_sizes",
|
||||
"Initial logical sizes completed",
|
||||
);
|
||||
});
|
||||
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: _,
|
||||
skipped: logical_sizes_skipped,
|
||||
} = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
// allow background jobs to start: we either completed prior stages, or they reached timeout
|
||||
// and were skipped. It is important that we do not let them block background jobs indefinitely,
|
||||
// because things like consumption metrics for billing are blocked by this barrier.
|
||||
@@ -490,6 +514,9 @@ fn start_pageserver(
|
||||
if let Some(f) = init_load_skipped {
|
||||
f.await;
|
||||
}
|
||||
if let Some(f) = logical_sizes_skipped {
|
||||
f.await;
|
||||
}
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
startup_checkpoint(started_startup_at, "complete", "Startup complete");
|
||||
|
||||
@@ -855,8 +855,7 @@ impl PageServerConf {
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
|
||||
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
|
||||
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
|
||||
Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||
}
|
||||
|
||||
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
|
||||
use crate::context::RequestContext;
|
||||
use anyhow::Context;
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::EventType;
|
||||
use futures::stream::StreamExt;
|
||||
use pageserver_api::shard::ShardNumber;
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -229,11 +229,6 @@ where
|
||||
while let Some((tenant_id, tenant)) = tenants.next().await {
|
||||
let mut tenant_resident_size = 0;
|
||||
|
||||
// Sharded tenants report all consumption metrics from shard zero
|
||||
if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for timeline in tenant.list_timelines() {
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
@@ -357,16 +352,13 @@ impl TimelineSnapshot {
|
||||
|
||||
let current_exact_logical_size = {
|
||||
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
|
||||
let size = span.in_scope(|| {
|
||||
t.get_current_logical_size(
|
||||
crate::tenant::timeline::GetLogicalSizePriority::Background,
|
||||
ctx,
|
||||
)
|
||||
});
|
||||
match size {
|
||||
let res = span
|
||||
.in_scope(|| t.get_current_logical_size(ctx))
|
||||
.context("get_current_logical_size");
|
||||
match res? {
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
CurrentLogicalSize::Exact(ref size) => Some(size.into()),
|
||||
CurrentLogicalSize::Approximate(_) => None,
|
||||
(size, is_exact) if is_exact => Some(size),
|
||||
(_, _) => None,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1028,9 +1028,6 @@ paths:
|
||||
format: hex
|
||||
pg_version:
|
||||
type: integer
|
||||
existing_initdb_timeline_id:
|
||||
type: string
|
||||
format: hex
|
||||
responses:
|
||||
"201":
|
||||
description: TimelineInfo
|
||||
|
||||
@@ -338,8 +338,13 @@ async fn build_timeline_info_common(
|
||||
Lsn(0) => None,
|
||||
lsn @ Lsn(_) => Some(lsn),
|
||||
};
|
||||
let current_logical_size =
|
||||
timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
|
||||
let current_logical_size = match timeline.get_current_logical_size(ctx) {
|
||||
Ok((size, _)) => Some(size),
|
||||
Err(err) => {
|
||||
error!("Timeline info creation failed to get current logical size: {err:?}");
|
||||
None
|
||||
}
|
||||
};
|
||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn_projected = timeline
|
||||
@@ -363,11 +368,7 @@ async fn build_timeline_info_common(
|
||||
last_record_lsn,
|
||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
||||
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
|
||||
current_logical_size_is_accurate: match current_logical_size.accuracy() {
|
||||
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
||||
tenant::timeline::logical_size::Accuracy::Exact => true,
|
||||
},
|
||||
current_logical_size,
|
||||
current_physical_size,
|
||||
current_logical_size_non_incremental: None,
|
||||
timeline_dir_layer_file_size_sum: None,
|
||||
@@ -440,7 +441,6 @@ async fn timeline_create_handler(
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_start_lsn,
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||
request_data.existing_initdb_timeline_id,
|
||||
state.broker_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
@@ -709,26 +709,6 @@ async fn tenant_detach_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_reset_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
let state = get_state(&request);
|
||||
state
|
||||
.tenant_manager
|
||||
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1848,9 +1828,6 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_id/detach", |r| {
|
||||
api_handler(r, tenant_detach_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
|
||||
api_handler(r, tenant_reset_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/load", |r| {
|
||||
api_handler(r, tenant_load_handler)
|
||||
})
|
||||
|
||||
@@ -7,13 +7,12 @@ use std::pin::Pin;
|
||||
use std::task::{self, Poll};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use async_compression::tokio::bufread::ZstdDecoder;
|
||||
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use futures::StreamExt;
|
||||
use nix::NixPath;
|
||||
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
|
||||
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
|
||||
use tokio_tar::Archive;
|
||||
use tokio_tar::Builder;
|
||||
use tokio_tar::HeaderMode;
|
||||
@@ -733,13 +732,3 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
|
||||
}
|
||||
Ok(compressed.buf)
|
||||
}
|
||||
|
||||
pub async fn extract_tar_zst(
|
||||
pgdata_path: &Utf8Path,
|
||||
tar_zst: impl AsyncBufRead + Unpin,
|
||||
) -> Result<()> {
|
||||
let tar = Box::pin(ZstdDecoder::new(tar_zst));
|
||||
let mut archive = Archive::new(tar);
|
||||
archive.unpack(pgdata_path).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -186,6 +186,13 @@ pub struct InitializationOrder {
|
||||
/// Each initial tenant load task carries this until completion.
|
||||
pub initial_tenant_load: Option<utils::completion::Completion>,
|
||||
|
||||
/// Barrier for when we can start initial logical size calculations.
|
||||
pub initial_logical_size_can_start: utils::completion::Barrier,
|
||||
|
||||
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
|
||||
/// attempt. It is important to drop this once the attempt has completed.
|
||||
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
|
||||
|
||||
/// Barrier for when we can start any background jobs.
|
||||
///
|
||||
/// This can be broken up later on, but right now there is just one class of a background job.
|
||||
@@ -205,7 +212,7 @@ async fn timed<Fut: std::future::Future>(
|
||||
match tokio::time::timeout(warn_at, &mut fut).await {
|
||||
Ok(ret) => {
|
||||
tracing::info!(
|
||||
stage = name,
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"completed"
|
||||
);
|
||||
@@ -213,7 +220,7 @@ async fn timed<Fut: std::future::Future>(
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
stage = name,
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"still waiting, taking longer than expected..."
|
||||
);
|
||||
@@ -222,7 +229,7 @@ async fn timed<Fut: std::future::Future>(
|
||||
|
||||
// this has a global allowed_errors
|
||||
tracing::warn!(
|
||||
stage = name,
|
||||
task = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"completed, took longer than expected"
|
||||
);
|
||||
|
||||
@@ -403,129 +403,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
|
||||
pub(crate) mod initial_logical_size {
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(crate) struct StartCalculation(IntCounterVec);
|
||||
pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
|
||||
StartCalculation(
|
||||
register_int_counter_vec!(
|
||||
"pageserver_initial_logical_size_start_calculation",
|
||||
"Incremented each time we start an initial logical size calculation attempt. \
|
||||
The `circumstances` label provides some additional details.",
|
||||
&["attempt", "circumstances"]
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
});
|
||||
|
||||
struct DropCalculation {
|
||||
first: IntCounter,
|
||||
retry: IntCounter,
|
||||
}
|
||||
|
||||
static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
|
||||
let vec = register_int_counter_vec!(
|
||||
"pageserver_initial_logical_size_drop_calculation",
|
||||
"Incremented each time we abort a started size calculation attmpt.",
|
||||
&["attempt"]
|
||||
)
|
||||
.unwrap();
|
||||
DropCalculation {
|
||||
first: vec.with_label_values(&["first"]),
|
||||
retry: vec.with_label_values(&["retry"]),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct Calculated {
|
||||
pub(crate) births: IntCounter,
|
||||
pub(crate) deaths: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
|
||||
births: register_int_counter!(
|
||||
"pageserver_initial_logical_size_finish_calculation",
|
||||
"Incremented every time we finish calculation of initial logical size.\
|
||||
If everything is working well, this should happen at most once per Timeline object."
|
||||
)
|
||||
.unwrap(),
|
||||
deaths: register_int_counter!(
|
||||
"pageserver_initial_logical_size_drop_finished_calculation",
|
||||
"Incremented when we drop a finished initial logical size calculation result.\
|
||||
Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
|
||||
)
|
||||
.unwrap(),
|
||||
});
|
||||
|
||||
pub(crate) struct OngoingCalculationGuard {
|
||||
inc_drop_calculation: Option<IntCounter>,
|
||||
}
|
||||
|
||||
#[derive(strum_macros::IntoStaticStr)]
|
||||
pub(crate) enum StartCircumstances {
|
||||
EmptyInitial,
|
||||
SkippedConcurrencyLimiter,
|
||||
AfterBackgroundTasksRateLimit,
|
||||
}
|
||||
|
||||
impl StartCalculation {
|
||||
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||
let circumstances_label: &'static str = circumstances.into();
|
||||
self.0.with_label_values(&["first", circumstances_label]);
|
||||
OngoingCalculationGuard {
|
||||
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
|
||||
}
|
||||
}
|
||||
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||
let circumstances_label: &'static str = circumstances.into();
|
||||
self.0.with_label_values(&["retry", circumstances_label]);
|
||||
OngoingCalculationGuard {
|
||||
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for OngoingCalculationGuard {
|
||||
fn drop(&mut self) {
|
||||
if let Some(counter) = self.inc_drop_calculation.take() {
|
||||
counter.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OngoingCalculationGuard {
|
||||
pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
|
||||
drop(self.inc_drop_calculation.take());
|
||||
CALCULATED.births.inc();
|
||||
FinishedCalculationGuard {
|
||||
inc_on_drop: CALCULATED.deaths.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct FinishedCalculationGuard {
|
||||
inc_on_drop: IntCounter,
|
||||
}
|
||||
|
||||
impl Drop for FinishedCalculationGuard {
|
||||
fn drop(&mut self) {
|
||||
self.inc_on_drop.inc();
|
||||
}
|
||||
}
|
||||
|
||||
// context: https://github.com/neondatabase/neon/issues/5963
|
||||
pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
|
||||
Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
|
||||
"Counter for the following event: walreceiver calls\
|
||||
Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
@@ -1388,8 +1265,6 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =
|
||||
pub(crate) struct WalRedoProcessCounters {
|
||||
pub(crate) started: IntCounter,
|
||||
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
|
||||
pub(crate) active_stderr_logger_tasks_started: IntCounter,
|
||||
pub(crate) active_stderr_logger_tasks_finished: IntCounter,
|
||||
}
|
||||
|
||||
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
||||
@@ -1413,19 +1288,6 @@ impl Default for WalRedoProcessCounters {
|
||||
&["cause"],
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let active_stderr_logger_tasks_started = register_int_counter!(
|
||||
"pageserver_walredo_stderr_logger_tasks_started_total",
|
||||
"Number of active walredo stderr logger tasks that have started",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let active_stderr_logger_tasks_finished = register_int_counter!(
|
||||
"pageserver_walredo_stderr_logger_tasks_finished_total",
|
||||
"Number of active walredo stderr logger tasks that have finished",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
Self {
|
||||
started,
|
||||
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
|
||||
@@ -1433,8 +1295,6 @@ impl Default for WalRedoProcessCounters {
|
||||
let cause_str: &'static str = cause.into();
|
||||
killed.with_label_values(&[cause_str])
|
||||
})),
|
||||
active_stderr_logger_tasks_started,
|
||||
active_stderr_logger_tasks_finished,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,14 +53,12 @@ use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir::import_wal_from_tar;
|
||||
use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::pgdatadir_mapping::rel_block_to_key;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::get_active_tenant_with_timeout;
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::mgr::ShardSelector;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
|
||||
@@ -401,19 +399,16 @@ impl PageServerHandler {
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Note that since one connection may contain getpage requests that target different
|
||||
// shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
|
||||
// that we look up here may not be the one that serves all the actual requests: we will double
|
||||
// check the mapping of key->shard later before calling into Timeline for getpage requests.
|
||||
// TODO(sharding): enumerate local tenant shards for this tenant, and select the one
|
||||
// that should serve this request.
|
||||
|
||||
// Make request tracer if needed
|
||||
let tenant = mgr::get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::First,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Make request tracer if needed
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
let connection_id = ConnectionId::generate();
|
||||
let path =
|
||||
@@ -571,7 +566,6 @@ impl PageServerHandler {
|
||||
info!("creating new timeline");
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::Zero,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
@@ -634,7 +628,7 @@ impl PageServerHandler {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if last_record_lsn != start_lsn {
|
||||
@@ -813,49 +807,9 @@ impl PageServerHandler {
|
||||
}
|
||||
*/
|
||||
|
||||
let key = rel_block_to_key(req.rel, req.blkno);
|
||||
let page = if timeline.get_shard_identity().is_key_local(&key) {
|
||||
timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?
|
||||
} else {
|
||||
// The Tenant shard we looked up at connection start does not hold this particular
|
||||
// key: look for other shards in this tenant. This scenario occurs if a pageserver
|
||||
// has multiple shards for the same tenant.
|
||||
//
|
||||
// TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
|
||||
let timeline = match self
|
||||
.get_active_tenant_timeline(
|
||||
timeline.tenant_shard_id.tenant_id,
|
||||
timeline.timeline_id,
|
||||
ShardSelector::Page(key),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(t) => t,
|
||||
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
|
||||
// We already know this tenant exists in general, because we resolved it at
|
||||
// start of connection. Getting a NotFound here indicates that the shard containing
|
||||
// the requested page is not present on this node.
|
||||
|
||||
// TODO: this should be some kind of structured error that the client will understand,
|
||||
// so that it can block until its config is updated: this error is expected in the case
|
||||
// that the Tenant's shards' placements are being updated and the client hasn't been
|
||||
// informed yet.
|
||||
//
|
||||
// https://github.com/neondatabase/neon/issues/6038
|
||||
return Err(anyhow::anyhow!("Request routed to wrong shard"));
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
|
||||
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
|
||||
// the GateGuard was already held over the whole connection.
|
||||
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
|
||||
timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?
|
||||
};
|
||||
let page = timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
@@ -884,7 +838,7 @@ impl PageServerHandler {
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
@@ -990,11 +944,9 @@ impl PageServerHandler {
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
selector: ShardSelector,
|
||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
selector,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
@@ -1168,7 +1120,7 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
@@ -1355,7 +1307,6 @@ where
|
||||
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ShardSelector::Zero,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
|
||||
@@ -13,7 +13,6 @@ use crate::repository::*;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::key::is_rel_block_key;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
@@ -283,10 +282,6 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn list_rels(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
@@ -635,10 +630,6 @@ impl Timeline {
|
||||
///
|
||||
/// Only relation blocks are counted currently. That excludes metadata,
|
||||
/// SLRUs, twophase files etc.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn get_current_logical_size_non_incremental(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
@@ -1323,7 +1314,7 @@ impl<'a> DatadirModification<'a> {
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut retained_pending_updates = HashMap::new();
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
if is_rel_block_key(&key) || is_slru_block_key(key) {
|
||||
if is_rel_block_key(key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, self.lsn, &value, ctx).await?;
|
||||
@@ -1579,7 +1570,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
|
||||
Key {
|
||||
field1: 0x00,
|
||||
field2: rel.spcnode,
|
||||
@@ -1778,6 +1769,10 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
|
||||
})
|
||||
}
|
||||
|
||||
fn is_rel_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0
|
||||
}
|
||||
|
||||
pub fn is_rel_fsm_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
|
||||
}
|
||||
|
||||
@@ -19,13 +19,11 @@ use futures::stream::FuturesUnordered;
|
||||
use futures::FutureExt;
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use remote_storage::DownloadError;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use std::fmt;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::sync::watch;
|
||||
use tokio::task::JoinSet;
|
||||
@@ -237,9 +235,6 @@ pub struct Tenant {
|
||||
|
||||
tenant_shard_id: TenantShardId,
|
||||
|
||||
// The detailed sharding information, beyond the number/count in tenant_shard_id
|
||||
shard_identity: ShardIdentity,
|
||||
|
||||
/// The remote storage generation, used to protect S3 objects from split-brain.
|
||||
/// Does not change over the lifetime of the [`Tenant`] object.
|
||||
///
|
||||
@@ -316,9 +311,6 @@ impl WalRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: crate::repository::Key,
|
||||
@@ -476,6 +468,7 @@ impl Tenant {
|
||||
index_part: Option<IndexPart>,
|
||||
metadata: TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenant_id = self.tenant_shard_id;
|
||||
@@ -485,6 +478,7 @@ impl Tenant {
|
||||
&metadata,
|
||||
ancestor.clone(),
|
||||
resources,
|
||||
init_order,
|
||||
CreateTimelineCause::Load,
|
||||
)?;
|
||||
let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
|
||||
@@ -572,7 +566,6 @@ impl Tenant {
|
||||
tenant_shard_id: TenantShardId,
|
||||
resources: TenantSharedResources,
|
||||
attached_conf: AttachedTenantConf,
|
||||
shard_identity: ShardIdentity,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
mode: SpawnMode,
|
||||
@@ -594,7 +587,6 @@ impl Tenant {
|
||||
TenantState::Attaching,
|
||||
conf,
|
||||
attached_conf,
|
||||
shard_identity,
|
||||
wal_redo_manager,
|
||||
tenant_shard_id,
|
||||
remote_storage.clone(),
|
||||
@@ -687,6 +679,10 @@ impl Tenant {
|
||||
// as we are no longer loading, signal completion by dropping
|
||||
// the completion while we resume deletion
|
||||
drop(_completion);
|
||||
// do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
|
||||
let _ = init_order
|
||||
.as_mut()
|
||||
.and_then(|x| x.initial_logical_size_attempt.take());
|
||||
let background_jobs_can_start =
|
||||
init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
@@ -700,6 +696,7 @@ impl Tenant {
|
||||
&tenant_clone,
|
||||
preload,
|
||||
tenants,
|
||||
init_order,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
@@ -712,7 +709,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
match tenant_clone.attach(preload, &ctx).await {
|
||||
match tenant_clone.attach(init_order, preload, &ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
@@ -775,6 +772,7 @@ impl Tenant {
|
||||
///
|
||||
async fn attach(
|
||||
self: &Arc<Tenant>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
preload: Option<TenantPreload>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -787,7 +785,7 @@ impl Tenant {
|
||||
None => {
|
||||
// Deprecated dev mode: load from local disk state instead of remote storage
|
||||
// https://github.com/neondatabase/neon/issues/5624
|
||||
return self.load_local(ctx).await;
|
||||
return self.load_local(init_order, ctx).await;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -797,31 +795,20 @@ impl Tenant {
|
||||
let mut timeline_ancestors = HashMap::new();
|
||||
let mut existent_timelines = HashSet::new();
|
||||
for (timeline_id, preload) in preload.timelines {
|
||||
// In this context a timeline "exists" if it has any content in remote storage: this will
|
||||
// be our cue to not delete any corresponding local directory
|
||||
existent_timelines.insert(timeline_id);
|
||||
|
||||
let index_part = match preload.index_part {
|
||||
Ok(i) => {
|
||||
debug!("remote index part exists for timeline {timeline_id}");
|
||||
// We found index_part on the remote, this is the standard case.
|
||||
existent_timelines.insert(timeline_id);
|
||||
i
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
// There is no index_part on the remote. We only get here
|
||||
// if there is some prefix for the timeline in the remote storage.
|
||||
// This can e.g. be the initdb.tar.zst archive, maybe a
|
||||
// remnant from a prior incomplete creation or deletion attempt.
|
||||
// Delete the local directory as the deciding criterion for a
|
||||
// timeline's existence is presence of index_part.
|
||||
info!(%timeline_id, "index_part not found on remote");
|
||||
continue;
|
||||
}
|
||||
Err(e) => {
|
||||
// Some (possibly ephemeral) error happened during index_part download.
|
||||
// Pretend the timeline exists to not delete the timeline directory,
|
||||
// as it might be a temporary issue and we don't want to re-download
|
||||
// everything after it resolves.
|
||||
// Timeline creation is not atomic: we might upload a layer but no index_part. We expect
|
||||
// that the creation will be retried by the control plane and eventually result in
|
||||
// a valid loadable state.
|
||||
warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
|
||||
|
||||
existent_timelines.insert(timeline_id);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
@@ -882,6 +869,7 @@ impl Tenant {
|
||||
&index_part.metadata,
|
||||
Some(remote_timeline_client),
|
||||
self.deletion_queue_client.clone(),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.context("resume_deletion")
|
||||
@@ -1006,6 +994,10 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
// we can load remote timelines during init, but they are assumed to be so rare that
|
||||
// initialization order is not passed to here.
|
||||
let init_order = None;
|
||||
|
||||
// timeline loading after attach expects to find metadata file for each metadata
|
||||
save_metadata(
|
||||
self.conf,
|
||||
@@ -1023,6 +1015,7 @@ impl Tenant {
|
||||
Some(index_part),
|
||||
remote_metadata,
|
||||
ancestor,
|
||||
init_order,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1046,9 +1039,6 @@ impl Tenant {
|
||||
},
|
||||
conf,
|
||||
AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
|
||||
// Shard identity isn't meaningful for a broken tenant: it's just a placeholder
|
||||
// to occupy the slot for this TenantShardId.
|
||||
ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
|
||||
wal_redo_manager,
|
||||
tenant_shard_id,
|
||||
None,
|
||||
@@ -1267,7 +1257,11 @@ impl Tenant {
|
||||
/// files on disk. Used at pageserver startup.
|
||||
///
|
||||
/// No background tasks are started as part of this routine.
|
||||
async fn load_local(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
async fn load_local(
|
||||
self: &Arc<Tenant>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
debug!("loading tenant task");
|
||||
@@ -1293,7 +1287,7 @@ impl Tenant {
|
||||
// Process loadable timelines first
|
||||
for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
|
||||
if let Err(e) = self
|
||||
.load_local_timeline(timeline_id, local_metadata, ctx, false)
|
||||
.load_local_timeline(timeline_id, local_metadata, init_order.as_ref(), ctx, false)
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
@@ -1327,7 +1321,13 @@ impl Tenant {
|
||||
}
|
||||
Some(local_metadata) => {
|
||||
if let Err(e) = self
|
||||
.load_local_timeline(timeline_id, local_metadata, ctx, true)
|
||||
.load_local_timeline(
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
init_order.as_ref(),
|
||||
ctx,
|
||||
true,
|
||||
)
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
@@ -1355,11 +1355,12 @@ impl Tenant {
|
||||
/// Subroutine of `load_tenant`, to load an individual timeline
|
||||
///
|
||||
/// NB: The parent is assumed to be already loaded!
|
||||
#[instrument(skip(self, local_metadata, ctx))]
|
||||
#[instrument(skip(self, local_metadata, init_order, ctx))]
|
||||
async fn load_local_timeline(
|
||||
self: &Arc<Self>,
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: TimelineMetadata,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
found_delete_mark: bool,
|
||||
) -> Result<(), LoadLocalTimelineError> {
|
||||
@@ -1376,6 +1377,7 @@ impl Tenant {
|
||||
&local_metadata,
|
||||
None,
|
||||
self.deletion_queue_client.clone(),
|
||||
init_order,
|
||||
)
|
||||
.await
|
||||
.context("resume deletion")
|
||||
@@ -1392,9 +1394,17 @@ impl Tenant {
|
||||
None
|
||||
};
|
||||
|
||||
self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx)
|
||||
.await
|
||||
.map_err(LoadLocalTimelineError::Load)
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
resources,
|
||||
None,
|
||||
local_metadata,
|
||||
ancestor,
|
||||
init_order,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
.map_err(LoadLocalTimelineError::Load)
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_id(&self) -> TenantId {
|
||||
@@ -1548,14 +1558,12 @@ impl Tenant {
|
||||
///
|
||||
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
|
||||
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn create_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
pg_version: u32,
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, CreateTimelineError> {
|
||||
@@ -1630,7 +1638,7 @@ impl Tenant {
|
||||
.await?
|
||||
}
|
||||
None => {
|
||||
self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx)
|
||||
self.bootstrap_timeline(new_timeline_id, pg_version, ctx)
|
||||
.await?
|
||||
}
|
||||
};
|
||||
@@ -2289,6 +2297,7 @@ impl Tenant {
|
||||
new_metadata: &TimelineMetadata,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
resources: TimelineResources,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
cause: CreateTimelineCause,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let state = match cause {
|
||||
@@ -2303,6 +2312,9 @@ impl Tenant {
|
||||
CreateTimelineCause::Delete => TimelineState::Stopping,
|
||||
};
|
||||
|
||||
let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
|
||||
let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);
|
||||
|
||||
let pg_version = new_metadata.pg_version();
|
||||
|
||||
let timeline = Timeline::new(
|
||||
@@ -2313,10 +2325,11 @@ impl Tenant {
|
||||
new_timeline_id,
|
||||
self.tenant_shard_id,
|
||||
self.generation,
|
||||
self.shard_identity,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
resources,
|
||||
pg_version,
|
||||
initial_logical_size_can_start.cloned(),
|
||||
initial_logical_size_attempt.cloned().flatten(),
|
||||
state,
|
||||
self.cancel.child_token(),
|
||||
);
|
||||
@@ -2331,7 +2344,6 @@ impl Tenant {
|
||||
state: TenantState,
|
||||
conf: &'static PageServerConf,
|
||||
attached_conf: AttachedTenantConf,
|
||||
shard_identity: ShardIdentity,
|
||||
walredo_mgr: Arc<WalRedoManager>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
@@ -2393,7 +2405,6 @@ impl Tenant {
|
||||
|
||||
Tenant {
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
generation: attached_conf.location.generation,
|
||||
conf,
|
||||
// using now here is good enough approximation to catch tenants with really long
|
||||
@@ -2515,7 +2526,7 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
debug!("persisting tenantconf to {config_path}");
|
||||
info!("persisting tenantconf to {config_path}");
|
||||
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
# It is read in case of pageserver restart.
|
||||
@@ -2550,7 +2561,7 @@ impl Tenant {
|
||||
target_config_path: &Utf8Path,
|
||||
tenant_conf: &TenantConfOpt,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("persisting tenantconf to {target_config_path}");
|
||||
info!("persisting tenantconf to {target_config_path}");
|
||||
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
# It is read in case of pageserver restart.
|
||||
@@ -2940,7 +2951,6 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
load_existing_initdb: Option<TimelineId>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let timeline_uninit_mark = {
|
||||
@@ -2963,6 +2973,8 @@ impl Tenant {
|
||||
format!("Failed to remove already existing initdb directory: {pgdata_path}")
|
||||
})?;
|
||||
}
|
||||
// Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
|
||||
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
|
||||
// this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
|
||||
scopeguard::defer! {
|
||||
if let Err(e) = fs::remove_dir_all(&pgdata_path) {
|
||||
@@ -2970,59 +2982,32 @@ impl Tenant {
|
||||
error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
|
||||
}
|
||||
}
|
||||
if let Some(existing_initdb_timeline_id) = load_existing_initdb {
|
||||
let Some(storage) = &self.remote_storage else {
|
||||
bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
|
||||
};
|
||||
let (initdb_tar_zst_path, initdb_tar_zst) =
|
||||
self::remote_timeline_client::download_initdb_tar_zst(
|
||||
self.conf,
|
||||
storage,
|
||||
&self.tenant_shard_id,
|
||||
&existing_initdb_timeline_id,
|
||||
)
|
||||
.await
|
||||
.context("download initdb tar")?;
|
||||
let buf_read = Box::pin(BufReader::new(initdb_tar_zst));
|
||||
import_datadir::extract_tar_zst(&pgdata_path, buf_read)
|
||||
.await
|
||||
.context("extract initdb tar")?;
|
||||
|
||||
if initdb_tar_zst_path.exists() {
|
||||
tokio::fs::remove_file(&initdb_tar_zst_path)
|
||||
.await
|
||||
.context("tempfile removal")?;
|
||||
}
|
||||
} else {
|
||||
// Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
|
||||
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
|
||||
|
||||
// Upload the created data dir to S3
|
||||
if let Some(storage) = &self.remote_storage {
|
||||
let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
|
||||
let pgdata_zstd = Bytes::from(pgdata_zstd);
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self::remote_timeline_client::upload_initdb_dir(
|
||||
storage,
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&timeline_id,
|
||||
pgdata_zstd.clone(),
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"persist_initdb_tar_zst",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
|
||||
// Upload the created data dir to S3
|
||||
if let Some(storage) = &self.remote_storage {
|
||||
let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
|
||||
let pgdata_zstd = Bytes::from(pgdata_zstd);
|
||||
backoff::retry(
|
||||
|| async {
|
||||
self::remote_timeline_client::upload_initdb_dir(
|
||||
storage,
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&timeline_id,
|
||||
pgdata_zstd.clone(),
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_| false,
|
||||
3,
|
||||
u32::MAX,
|
||||
"persist_initdb_tar_zst",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
// Initdb lsn will be equal to last_record_lsn which will be set after import.
|
||||
@@ -3140,6 +3125,7 @@ impl Tenant {
|
||||
new_metadata,
|
||||
ancestor,
|
||||
resources,
|
||||
None,
|
||||
CreateTimelineCause::Load,
|
||||
)
|
||||
.context("Failed to create timeline data structure")?;
|
||||
@@ -3213,10 +3199,7 @@ impl Tenant {
|
||||
let uninit_mark_path = self
|
||||
.conf
|
||||
.timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(&uninit_mark_path)
|
||||
fs::File::create(&uninit_mark_path)
|
||||
.context("Failed to create uninit mark file")
|
||||
.and_then(|_| {
|
||||
crashsafe::fsync_file_and_parent(&uninit_mark_path)
|
||||
@@ -3805,8 +3788,6 @@ pub(crate) mod harness {
|
||||
self.generation,
|
||||
))
|
||||
.unwrap(),
|
||||
// This is a legacy/test code path: sharding isn't supported here.
|
||||
ShardIdentity::unsharded(),
|
||||
walredo_mgr,
|
||||
self.tenant_shard_id,
|
||||
Some(self.remote_storage.clone()),
|
||||
@@ -3816,7 +3797,7 @@ pub(crate) mod harness {
|
||||
match mode {
|
||||
LoadMode::Local => {
|
||||
tenant
|
||||
.load_local(ctx)
|
||||
.load_local(None, ctx)
|
||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
}
|
||||
@@ -3826,7 +3807,7 @@ pub(crate) mod harness {
|
||||
.instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
tenant
|
||||
.attach(Some(preload), ctx)
|
||||
.attach(None, Some(preload), ctx)
|
||||
.instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
}
|
||||
@@ -3869,9 +3850,6 @@ pub(crate) mod harness {
|
||||
pub(crate) struct TestRedoManager;
|
||||
|
||||
impl TestRedoManager {
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: Key,
|
||||
|
||||
@@ -15,6 +15,7 @@ use crate::{
|
||||
context::RequestContext,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
InitializationOrder,
|
||||
};
|
||||
|
||||
use super::{
|
||||
@@ -389,6 +390,7 @@ impl DeleteTenantFlow {
|
||||
tenant: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
@@ -398,7 +400,10 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant.attach(preload, ctx).await.context("attach")?;
|
||||
tenant
|
||||
.attach(init_order, preload, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
//! page server.
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
@@ -131,18 +130,6 @@ pub(crate) enum TenantsMapRemoveResult {
|
||||
InProgress(utils::completion::Barrier),
|
||||
}
|
||||
|
||||
/// When resolving a TenantId to a shard, we may be looking for the 0th
|
||||
/// shard, or we might be looking for whichever shard holds a particular page.
|
||||
pub(crate) enum ShardSelector {
|
||||
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
|
||||
/// ignore it.
|
||||
Zero,
|
||||
/// Pick the first shard we find for the TenantId
|
||||
First,
|
||||
/// Pick the shard that holds this key
|
||||
Page(Key),
|
||||
}
|
||||
|
||||
impl TenantsMap {
|
||||
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
|
||||
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
|
||||
@@ -157,49 +144,6 @@ impl TenantsMap {
|
||||
}
|
||||
}
|
||||
|
||||
/// A page service client sends a TenantId, and to look up the correct Tenant we must
|
||||
/// resolve this to a fully qualified TenantShardId.
|
||||
fn resolve_shard(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
selector: ShardSelector,
|
||||
) -> Option<TenantShardId> {
|
||||
let mut want_shard = None;
|
||||
match self {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
|
||||
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
|
||||
match selector {
|
||||
ShardSelector::First => return Some(*slot.0),
|
||||
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
|
||||
return Some(*slot.0)
|
||||
}
|
||||
ShardSelector::Page(key) => {
|
||||
if let Some(tenant) = slot.1.get_attached() {
|
||||
// First slot we see for this tenant, calculate the expected shard number
|
||||
// for the key: we will use this for checking if this and subsequent
|
||||
// slots contain the key, rather than recalculating the hash each time.
|
||||
if want_shard.is_none() {
|
||||
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
|
||||
}
|
||||
|
||||
if Some(tenant.shard_identity.number) == want_shard {
|
||||
return Some(*slot.0);
|
||||
}
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
|
||||
// Fall through: we didn't find an acceptable shard
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
|
||||
///
|
||||
/// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
|
||||
@@ -270,6 +214,49 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
||||
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
||||
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
|
||||
|
||||
/// Create a directory, including parents. This does no fsyncs and makes
|
||||
/// no guarantees about the persistence of the resulting metadata: for
|
||||
/// use when creating dirs for use as cache.
|
||||
async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
|
||||
let mut dirs_to_create = Vec::new();
|
||||
let mut path: &Utf8Path = path.as_ref();
|
||||
|
||||
// Figure out which directories we need to create.
|
||||
loop {
|
||||
let meta = tokio::fs::metadata(path).await;
|
||||
match meta {
|
||||
Ok(metadata) if metadata.is_dir() => break,
|
||||
Ok(_) => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::AlreadyExists,
|
||||
format!("non-directory found in path: {path}"),
|
||||
));
|
||||
}
|
||||
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
|
||||
dirs_to_create.push(path);
|
||||
|
||||
match path.parent() {
|
||||
Some(parent) => path = parent,
|
||||
None => {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::InvalidInput,
|
||||
format!("can't find parent of path '{path}'"),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create directories from parent to child.
|
||||
for &path in dirs_to_create.iter().rev() {
|
||||
tokio::fs::create_dir(path).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// The TenantManager is responsible for storing and mutating the collection of all tenants
|
||||
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
|
||||
/// lives inside the TenantManager.
|
||||
@@ -528,14 +515,12 @@ pub async fn init_tenant_mgr(
|
||||
location_conf.attach_in_generation(generation);
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
match tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir_path,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
@@ -576,7 +561,6 @@ pub(crate) fn tenant_spawn(
|
||||
tenant_path: &Utf8Path,
|
||||
resources: TenantSharedResources,
|
||||
location_conf: AttachedTenantConf,
|
||||
shard_identity: ShardIdentity,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
mode: SpawnMode,
|
||||
@@ -603,19 +587,12 @@ pub(crate) fn tenant_spawn(
|
||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||
);
|
||||
|
||||
info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
generation = ?location_conf.location.generation,
|
||||
attach_mode = ?location_conf.location.attach_mode,
|
||||
"Attaching tenant"
|
||||
);
|
||||
info!("Attaching tenant {tenant_shard_id}");
|
||||
let tenant = match Tenant::spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
resources,
|
||||
location_conf,
|
||||
shard_identity,
|
||||
init_order,
|
||||
tenants,
|
||||
mode,
|
||||
@@ -785,14 +762,12 @@ pub(crate) async fn create_tenant(
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
|
||||
let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
let created_tenant = tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
resources,
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Create,
|
||||
@@ -885,7 +860,6 @@ impl TenantManager {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
|
||||
pub(crate) async fn upsert_location(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -998,7 +972,7 @@ impl TenantManager {
|
||||
LocationMode::Secondary(_) => {
|
||||
// Directory doesn't need to be fsync'd because if we crash it can
|
||||
// safely be recreated next time this tenant location is configured.
|
||||
tokio::fs::create_dir_all(&tenant_path)
|
||||
unsafe_create_dir_all(&tenant_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {tenant_path}"))?;
|
||||
|
||||
@@ -1014,7 +988,7 @@ impl TenantManager {
|
||||
// Directory doesn't need to be fsync'd because we do not depend on
|
||||
// it to exist after crashes: it may be recreated when tenant is
|
||||
// re-attached, see https://github.com/neondatabase/neon/issues/5550
|
||||
tokio::fs::create_dir_all(&tenant_path)
|
||||
unsafe_create_dir_all(&timelines_path)
|
||||
.await
|
||||
.with_context(|| format!("Creating {timelines_path}"))?;
|
||||
|
||||
@@ -1022,14 +996,12 @@ impl TenantManager {
|
||||
.await
|
||||
.map_err(SetNewTenantConfigError::Persist)?;
|
||||
|
||||
let shard_identity = new_location_config.shard;
|
||||
let tenant = tenant_spawn(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(new_location_config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
self.tenants,
|
||||
SpawnMode::Normal,
|
||||
@@ -1044,81 +1016,6 @@ impl TenantManager {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
|
||||
/// LocationConf that was last used to attach it. Optionally, the local file cache may be
|
||||
/// dropped before re-attaching.
|
||||
///
|
||||
/// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
|
||||
/// where an issue is identified that would go away with a restart of the tenant.
|
||||
///
|
||||
/// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
|
||||
/// to respect the cancellation tokens used in normal shutdown().
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
|
||||
pub(crate) async fn reset_tenant(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
drop_cache: bool,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
let Some(old_slot) = slot_guard.get_old_value() else {
|
||||
anyhow::bail!("Tenant not found when trying to reset");
|
||||
};
|
||||
|
||||
let Some(tenant) = old_slot.get_attached() else {
|
||||
slot_guard.revert();
|
||||
anyhow::bail!("Tenant is not in attached state");
|
||||
};
|
||||
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, false).await {
|
||||
Ok(()) => {
|
||||
slot_guard.drop_old_value()?;
|
||||
}
|
||||
Err(_barrier) => {
|
||||
slot_guard.revert();
|
||||
anyhow::bail!("Cannot reset Tenant, already shutting down");
|
||||
}
|
||||
}
|
||||
|
||||
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
|
||||
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
|
||||
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
|
||||
|
||||
if drop_cache {
|
||||
tracing::info!("Dropping local file cache");
|
||||
|
||||
match tokio::fs::read_dir(&timelines_path).await {
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to list timelines while dropping cache: {}", e);
|
||||
}
|
||||
Ok(mut entries) => {
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
tokio::fs::remove_dir_all(entry.path()).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let shard_identity = config.shard;
|
||||
let tenant = tenant_spawn(
|
||||
self.conf,
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
self.resources.clone(),
|
||||
AttachedTenantConf::try_from(config)?,
|
||||
shard_identity,
|
||||
None,
|
||||
self.tenants,
|
||||
SpawnMode::Normal,
|
||||
&ctx,
|
||||
)?;
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1203,7 +1100,6 @@ pub(crate) enum GetActiveTenantError {
|
||||
/// then wait for up to `timeout` (minus however long we waited for the slot).
|
||||
pub(crate) async fn get_active_tenant_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
shard_selector: ShardSelector,
|
||||
timeout: Duration,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
||||
@@ -1212,17 +1108,15 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
Tenant(Arc<Tenant>),
|
||||
}
|
||||
|
||||
// TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
|
||||
// to decide which shard services the request)
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let wait_start = Instant::now();
|
||||
let deadline = wait_start + timeout;
|
||||
|
||||
let (wait_for, tenant_shard_id) = {
|
||||
let wait_for = {
|
||||
let locked = TENANTS.read().unwrap();
|
||||
|
||||
// Resolve TenantId to TenantShardId
|
||||
let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
|
||||
GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
|
||||
)?;
|
||||
|
||||
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
|
||||
.map_err(GetTenantError::MapState)?;
|
||||
match peek_slot {
|
||||
@@ -1232,7 +1126,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
// Fast path: we don't need to do any async waiting.
|
||||
return Ok(tenant.clone());
|
||||
}
|
||||
_ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
|
||||
_ => WaitFor::Tenant(tenant.clone()),
|
||||
}
|
||||
}
|
||||
Some(TenantSlot::Secondary) => {
|
||||
@@ -1240,9 +1134,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
)))
|
||||
}
|
||||
Some(TenantSlot::InProgress(barrier)) => {
|
||||
(WaitFor::Barrier(barrier.clone()), tenant_shard_id)
|
||||
}
|
||||
Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
|
||||
None => {
|
||||
return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
|
||||
tenant_id,
|
||||
@@ -1327,7 +1219,8 @@ pub(crate) async fn delete_tenant(
|
||||
// See https://github.com/neondatabase/neon/issues/5080
|
||||
|
||||
// TODO(sharding): make delete API sharding-aware
|
||||
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
|
||||
let mut slot_guard =
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
|
||||
|
||||
// unwrap is safe because we used MustExist mode when acquiring
|
||||
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
|
||||
@@ -1484,14 +1377,12 @@ pub(crate) async fn load_tenant(
|
||||
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
let new_tenant = tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
resources,
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
@@ -1581,14 +1472,12 @@ pub(crate) async fn attach_tenant(
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
let attached_tenant = tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_dir,
|
||||
resources,
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
@@ -1654,10 +1543,9 @@ pub enum TenantSlotUpsertError {
|
||||
MapState(#[from] TenantMapError),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[derive(Debug)]
|
||||
enum TenantSlotDropError {
|
||||
/// It is only legal to drop a TenantSlot if its contents are fully shut down
|
||||
#[error("Tenant was not shut down")]
|
||||
NotShutdown,
|
||||
}
|
||||
|
||||
@@ -1717,9 +1605,9 @@ impl SlotGuard {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get any value that was present in the slot before we acquired ownership
|
||||
/// Take any value that was present in the slot before we acquired ownership
|
||||
/// of it: in state transitions, this will be the old state.
|
||||
fn get_old_value(&self) -> &Option<TenantSlot> {
|
||||
fn get_old_value(&mut self) -> &Option<TenantSlot> {
|
||||
&self.old_value
|
||||
}
|
||||
|
||||
|
||||
@@ -188,7 +188,6 @@ use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
|
||||
pub(crate) use download::download_initdb_tar_zst;
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -1078,17 +1077,7 @@ impl RemoteTimelineClient {
|
||||
|
||||
let remaining_layers: Vec<RemotePath> = remaining
|
||||
.into_iter()
|
||||
.filter(|p| {
|
||||
if p == &latest_index {
|
||||
return false;
|
||||
}
|
||||
if let Some(name) = p.object_name() {
|
||||
if name == INITDB_PATH {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
})
|
||||
.filter(|p| p!= &latest_index)
|
||||
.inspect(|path| {
|
||||
if let Some(name) = path.object_name() {
|
||||
info!(%name, "deleting a file not referenced from index_part.json");
|
||||
|
||||
@@ -8,12 +8,11 @@ use std::future::Future;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use camino::Utf8Path;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::fs::{self, File, OpenOptions};
|
||||
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
@@ -21,15 +20,14 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::Generation;
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{
|
||||
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
||||
parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
};
|
||||
|
||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||
@@ -363,7 +361,7 @@ pub(super) async fn download_index_part(
|
||||
None => {
|
||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||
// attached pageservers did. Try to load from a no-generation path.
|
||||
tracing::debug!("No index_part.json* found");
|
||||
tracing::info!("No index_part.json* found");
|
||||
do_download_index_part(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
@@ -376,69 +374,6 @@ pub(super) async fn download_index_part(
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn download_initdb_tar_zst(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> Result<(Utf8PathBuf, File), DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
|
||||
|
||||
let timeline_path = conf.timelines_path(tenant_shard_id);
|
||||
|
||||
if !timeline_path.exists() {
|
||||
tokio::fs::create_dir_all(&timeline_path)
|
||||
.await
|
||||
.with_context(|| format!("timeline dir creation {timeline_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
}
|
||||
let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
|
||||
|
||||
let file = download_retry(
|
||||
|| async {
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&temp_path)
|
||||
.await
|
||||
.with_context(|| format!("tempfile creation {temp_path}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut download = storage.download(&remote_path).await?;
|
||||
|
||||
tokio::io::copy(&mut download.download_stream, &mut file)
|
||||
.await
|
||||
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
file.seek(std::io::SeekFrom::Start(0))
|
||||
.await
|
||||
.with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok(file)
|
||||
},
|
||||
&format!("download {remote_path}"),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
if temp_path.exists() {
|
||||
// Do a best-effort attempt at deleting the temporary file upon encountering an error.
|
||||
// We don't have async here nor do we want to pile on any extra errors.
|
||||
if let Err(e) = std::fs::remove_file(&temp_path) {
|
||||
warn!("error deleting temporary file {temp_path}: {e}");
|
||||
}
|
||||
}
|
||||
e
|
||||
})?;
|
||||
|
||||
Ok((temp_path, file))
|
||||
}
|
||||
|
||||
/// Helper function to handle retries for a download operation.
|
||||
///
|
||||
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
|
||||
|
||||
@@ -230,10 +230,6 @@ impl Layer {
|
||||
///
|
||||
/// It is up to the caller to collect more data from the previous layer and
|
||||
/// perform WAL redo, if necessary.
|
||||
///
|
||||
/// # Cancellation-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub(crate) async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
|
||||
@@ -44,7 +44,6 @@ pub(crate) enum BackgroundLoopKind {
|
||||
Eviction,
|
||||
ConsumptionMetricsCollectMetrics,
|
||||
ConsumptionMetricsSyntheticSizeWorker,
|
||||
InitialLogicalSizeCalculation,
|
||||
}
|
||||
|
||||
impl BackgroundLoopKind {
|
||||
|
||||
@@ -2,7 +2,7 @@ pub mod delete;
|
||||
mod eviction_task;
|
||||
mod init;
|
||||
pub mod layer_manager;
|
||||
pub(crate) mod logical_size;
|
||||
mod logical_size;
|
||||
pub mod span;
|
||||
pub mod uninit;
|
||||
mod walreceiver;
|
||||
@@ -18,29 +18,25 @@ use pageserver_api::{
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
|
||||
TimelineState,
|
||||
},
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
shard::TenantShardId,
|
||||
};
|
||||
use rand::Rng;
|
||||
use serde_with::serde_as;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::{
|
||||
runtime::Handle,
|
||||
sync::{oneshot, watch},
|
||||
sync::{oneshot, watch, TryAcquireError},
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{id::TenantTimelineId, sync::gate::Gate};
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
use std::{
|
||||
cmp::{max, min, Ordering},
|
||||
ops::ControlFlow,
|
||||
};
|
||||
|
||||
use crate::context::{
|
||||
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
||||
@@ -167,10 +163,6 @@ pub struct Timeline {
|
||||
/// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
|
||||
pub(crate) generation: Generation,
|
||||
|
||||
/// The detailed sharding information from our parent Tenant. This enables us to map keys
|
||||
/// to shards, and is constant through the lifetime of this Timeline.
|
||||
shard_identity: ShardIdentity,
|
||||
|
||||
pub pg_version: u32,
|
||||
|
||||
/// The tuple has two elements.
|
||||
@@ -306,6 +298,13 @@ pub struct Timeline {
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
|
||||
|
||||
/// Barrier to wait before doing initial logical size calculation. Used only during startup.
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
|
||||
/// Completion shared between all timelines loaded during startup; used to delay heavier
|
||||
/// background tasks until some logical sizes have been calculated.
|
||||
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
|
||||
|
||||
/// Load or creation time information about the disk_consistent_lsn and when the loading
|
||||
/// happened. Used for consumption metrics.
|
||||
pub(crate) loaded_at: (Lsn, SystemTime),
|
||||
@@ -454,11 +453,6 @@ pub enum LogicalSizeCalculationCause {
|
||||
TenantSizeHandler,
|
||||
}
|
||||
|
||||
pub enum GetLogicalSizePriority {
|
||||
User,
|
||||
Background,
|
||||
}
|
||||
|
||||
#[derive(enumset::EnumSetType)]
|
||||
pub(crate) enum CompactFlags {
|
||||
ForceRepartition,
|
||||
@@ -495,9 +489,6 @@ impl Timeline {
|
||||
/// an ancestor branch, for example, or waste a lot of cycles chasing the
|
||||
/// non-existing key.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn get(
|
||||
&self,
|
||||
key: Key,
|
||||
@@ -810,12 +801,7 @@ impl Timeline {
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
.build();
|
||||
|
||||
// 2. Compact
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
self.compact_level0(target_file_size, ctx).await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
// 3. Create new image layers for partitions that have been modified
|
||||
// 2. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let layers = self
|
||||
.create_image_layers(&partitioning, lsn, false, &image_ctx)
|
||||
@@ -827,6 +813,11 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Compact
|
||||
let timer = self.metrics.compact_time_histo.start_timer();
|
||||
self.compact_level0(target_file_size, ctx).await?;
|
||||
timer.stop_and_record();
|
||||
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
// should any new image layer been created, not uploading index_part will
|
||||
// result in a mismatch between remote_physical_size and layermap calculated
|
||||
@@ -858,6 +849,31 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve current logical size of the timeline.
|
||||
///
|
||||
/// The size could be lagging behind the actual number, in case
|
||||
/// the initial size calculation has not been run (gets triggered on the first size access).
|
||||
///
|
||||
/// return size and boolean flag that shows if the size is exact
|
||||
pub fn get_current_logical_size(
|
||||
self: &Arc<Self>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<(u64, bool)> {
|
||||
let current_size = self.current_logical_size.current_size()?;
|
||||
debug!("Current size: {current_size:?}");
|
||||
|
||||
let mut is_exact = true;
|
||||
let size = current_size.size();
|
||||
if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
|
||||
(current_size, self.current_logical_size.initial_part_end)
|
||||
{
|
||||
is_exact = false;
|
||||
self.try_spawn_size_init_task(initial_part_end, ctx);
|
||||
}
|
||||
|
||||
Ok((size, is_exact))
|
||||
}
|
||||
|
||||
/// Check if more than 'checkpoint_distance' of WAL has been accumulated in
|
||||
/// the in-memory layer, and initiate flushing it if so.
|
||||
///
|
||||
@@ -907,7 +923,6 @@ impl Timeline {
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
self.spawn_initial_logical_size_computation_task(ctx);
|
||||
self.launch_wal_receiver(ctx, broker_client);
|
||||
self.set_state(TimelineState::Active);
|
||||
self.launch_eviction_task(background_jobs_can_start);
|
||||
@@ -1021,6 +1036,17 @@ impl Timeline {
|
||||
error!("Not activating a Stopping timeline");
|
||||
}
|
||||
(_, new_state) => {
|
||||
if matches!(
|
||||
new_state,
|
||||
TimelineState::Stopping | TimelineState::Broken { .. }
|
||||
) {
|
||||
// drop the completion guard, if any; it might be holding off the completion
|
||||
// forever needlessly
|
||||
self.initial_logical_size_attempt
|
||||
.lock()
|
||||
.unwrap_or_else(|e| e.into_inner())
|
||||
.take();
|
||||
}
|
||||
self.state.send_replace(new_state);
|
||||
}
|
||||
}
|
||||
@@ -1339,10 +1365,11 @@ impl Timeline {
|
||||
timeline_id: TimelineId,
|
||||
tenant_shard_id: TenantShardId,
|
||||
generation: Generation,
|
||||
shard_identity: ShardIdentity,
|
||||
walredo_mgr: Arc<super::WalRedoManager>,
|
||||
resources: TimelineResources,
|
||||
pg_version: u32,
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
initial_logical_size_attempt: Option<completion::Completion>,
|
||||
state: TimelineState,
|
||||
cancel: CancellationToken,
|
||||
) -> Arc<Self> {
|
||||
@@ -1369,7 +1396,6 @@ impl Timeline {
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
generation,
|
||||
shard_identity,
|
||||
pg_version,
|
||||
layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
|
||||
wanted_image_layers: Mutex::new(None),
|
||||
@@ -1443,6 +1469,8 @@ impl Timeline {
|
||||
),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
|
||||
|
||||
initial_logical_size_can_start,
|
||||
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
||||
cancel,
|
||||
gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),
|
||||
|
||||
@@ -1754,91 +1782,38 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Retrieve current logical size of the timeline.
|
||||
///
|
||||
/// The size could be lagging behind the actual number, in case
|
||||
/// the initial size calculation has not been run (gets triggered on the first size access).
|
||||
///
|
||||
/// return size and boolean flag that shows if the size is exact
|
||||
pub(crate) fn get_current_logical_size(
|
||||
self: &Arc<Self>,
|
||||
priority: GetLogicalSizePriority,
|
||||
ctx: &RequestContext,
|
||||
) -> logical_size::CurrentLogicalSize {
|
||||
let current_size = self.current_logical_size.current_size();
|
||||
debug!("Current size: {current_size:?}");
|
||||
|
||||
match (current_size.accuracy(), priority) {
|
||||
(logical_size::Accuracy::Exact, _) => (), // nothing to do
|
||||
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
|
||||
// background task will eventually deliver an exact value, we're in no rush
|
||||
}
|
||||
(logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
|
||||
// background task is not ready, but user is asking for it now;
|
||||
// => make the background task skip the line
|
||||
// (The alternative would be to calculate the size here, but,
|
||||
// it can actually take a long time if the user has a lot of rels.
|
||||
// And we'll inevitable need it again; So, let the background task do the work.)
|
||||
match self
|
||||
.current_logical_size
|
||||
.cancel_wait_for_background_loop_concurrency_limit_semaphore
|
||||
.get()
|
||||
{
|
||||
Some(cancel) => cancel.cancel(),
|
||||
None => {
|
||||
let state = self.current_state();
|
||||
if matches!(
|
||||
state,
|
||||
TimelineState::Broken { .. } | TimelineState::Stopping
|
||||
) {
|
||||
|
||||
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
|
||||
// Don't make noise.
|
||||
} else {
|
||||
warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if let CurrentLogicalSize::Approximate(_) = ¤t_size {
|
||||
if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
|
||||
let first = self
|
||||
.current_logical_size
|
||||
.did_return_approximate_to_walreceiver
|
||||
.compare_exchange(
|
||||
false,
|
||||
true,
|
||||
AtomicOrdering::Relaxed,
|
||||
AtomicOrdering::Relaxed,
|
||||
)
|
||||
.is_ok();
|
||||
if first {
|
||||
crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
current_size
|
||||
}
|
||||
|
||||
fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
|
||||
let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
|
||||
// nothing to do for freshly created timelines;
|
||||
assert_eq!(
|
||||
self.current_logical_size.current_size().accuracy(),
|
||||
logical_size::Accuracy::Exact,
|
||||
);
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
|
||||
let state = self.current_state();
|
||||
if matches!(
|
||||
state,
|
||||
TimelineState::Broken { .. } | TimelineState::Stopping
|
||||
) {
|
||||
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
|
||||
return;
|
||||
}
|
||||
|
||||
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
|
||||
.try_acquire_owned()
|
||||
{
|
||||
Ok(permit) => permit,
|
||||
Err(TryAcquireError::NoPermits) => {
|
||||
// computation already ongoing or finished with success
|
||||
return;
|
||||
}
|
||||
Err(TryAcquireError::Closed) => unreachable!("we never call close"),
|
||||
};
|
||||
debug_assert!(self
|
||||
.current_logical_size
|
||||
.initial_logical_size
|
||||
.get()
|
||||
.is_none());
|
||||
|
||||
let cancel_wait_for_background_loop_concurrency_limit_semaphore = CancellationToken::new();
|
||||
let token = cancel_wait_for_background_loop_concurrency_limit_semaphore.clone();
|
||||
self.current_logical_size
|
||||
.cancel_wait_for_background_loop_concurrency_limit_semaphore.set(token)
|
||||
.expect("initial logical size calculation task must be spawned exactly once per Timeline object");
|
||||
|
||||
info!(
|
||||
"spawning logical size computation from context of task kind {:?}",
|
||||
ctx.task_kind()
|
||||
);
|
||||
// We need to start the computation task.
|
||||
// It gets a separate context since it will outlive the request that called this function.
|
||||
let self_clone = Arc::clone(self);
|
||||
let background_ctx = ctx.detached_child(
|
||||
TaskKind::InitialLogicalSizeCalculation,
|
||||
@@ -1853,152 +1828,89 @@ impl Timeline {
|
||||
false,
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
async move {
|
||||
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
self_clone
|
||||
.initial_logical_size_calculation_task(
|
||||
initial_part_end,
|
||||
cancel_wait_for_background_loop_concurrency_limit_semaphore,
|
||||
cancel,
|
||||
background_ctx,
|
||||
)
|
||||
.await;
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
|
||||
);
|
||||
}
|
||||
|
||||
async fn initial_logical_size_calculation_task(
|
||||
self: Arc<Self>,
|
||||
initial_part_end: Lsn,
|
||||
skip_concurrency_limiter: CancellationToken,
|
||||
cancel: CancellationToken,
|
||||
background_ctx: RequestContext,
|
||||
) {
|
||||
enum BackgroundCalculationError {
|
||||
Cancelled,
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
let try_once = |attempt: usize| {
|
||||
let background_ctx = &background_ctx;
|
||||
let self_ref = &self;
|
||||
let skip_concurrency_limiter = &skip_concurrency_limiter;
|
||||
async move {
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
|
||||
BackgroundLoopKind::InitialLogicalSizeCalculation,
|
||||
background_ctx,
|
||||
&cancel,
|
||||
);
|
||||
|
||||
use crate::metrics::initial_logical_size::StartCircumstances;
|
||||
let (_maybe_permit, circumstances) = tokio::select! {
|
||||
res = wait_for_permit => {
|
||||
match res {
|
||||
Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
|
||||
Err(RateLimitError::Cancelled) => {
|
||||
return Err(BackgroundCalculationError::Cancelled);
|
||||
}
|
||||
}
|
||||
}
|
||||
() = skip_concurrency_limiter.cancelled() => {
|
||||
// Some action that is part of a end user interaction requested logical size
|
||||
// => break out of the rate limit
|
||||
// TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
|
||||
// but then again what happens if they cancel; also, we should just be using
|
||||
// one runtime across the entire process, so, let's leave this for now.
|
||||
(None, StartCircumstances::SkippedConcurrencyLimiter)
|
||||
}
|
||||
// in case we were created during pageserver initialization, wait for
|
||||
// initialization to complete before proceeding. startup time init runs on the same
|
||||
// runtime.
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
|
||||
};
|
||||
|
||||
let metrics_guard = if attempt == 1 {
|
||||
crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
|
||||
} else {
|
||||
crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
|
||||
};
|
||||
// hold off background tasks from starting until all timelines get to try at least
|
||||
// once initial logical size calculation; though retry will rarely be useful.
|
||||
// holding off is done because heavier tasks execute blockingly on the same
|
||||
// runtime.
|
||||
//
|
||||
// dropping this at every outcome is probably better than trying to cling on to it,
|
||||
// delay will be terminated by a timeout regardless.
|
||||
let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
|
||||
|
||||
match self_ref
|
||||
.logical_size_calculation_task(
|
||||
initial_part_end,
|
||||
LogicalSizeCalculationCause::Initial,
|
||||
background_ctx,
|
||||
)
|
||||
let calculated_size = match self_clone
|
||||
.logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
|
||||
.await
|
||||
{
|
||||
Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
|
||||
Ok(s) => s,
|
||||
Err(CalculateLogicalSizeError::Cancelled) => {
|
||||
Err(BackgroundCalculationError::Cancelled)
|
||||
// Don't make noise, this is a common task.
|
||||
// In the unlikely case that there is another call to this function, we'll retry
|
||||
// because initial_logical_size is still None.
|
||||
info!("initial size calculation cancelled, likely timeline delete / tenant detach");
|
||||
return Ok(());
|
||||
}
|
||||
Err(CalculateLogicalSizeError::Other(err)) => {
|
||||
if let Some(PageReconstructError::AncestorStopping(_)) =
|
||||
if let Some(e @ PageReconstructError::AncestorStopping(_)) =
|
||||
err.root_cause().downcast_ref()
|
||||
{
|
||||
Err(BackgroundCalculationError::Cancelled)
|
||||
} else {
|
||||
Err(BackgroundCalculationError::Other(err))
|
||||
// This can happen if the timeline parent timeline switches to
|
||||
// Stopping state while we're still calculating the initial
|
||||
// timeline size for the child, for example if the tenant is
|
||||
// being detached or the pageserver is shut down. Like with
|
||||
// CalculateLogicalSizeError::Cancelled, don't make noise.
|
||||
info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
|
||||
return Ok(());
|
||||
}
|
||||
return Err(err.context("Failed to calculate logical size"));
|
||||
}
|
||||
};
|
||||
|
||||
// we cannot query current_logical_size.current_size() to know the current
|
||||
// *negative* value, only truncated to u64.
|
||||
let added = self_clone
|
||||
.current_logical_size
|
||||
.size_added_after_initial
|
||||
.load(AtomicOrdering::Relaxed);
|
||||
|
||||
let sum = calculated_size.saturating_add_signed(added);
|
||||
|
||||
// set the gauge value before it can be set in `update_current_logical_size`.
|
||||
self_clone.metrics.current_logical_size_gauge.set(sum);
|
||||
|
||||
match self_clone
|
||||
.current_logical_size
|
||||
.initial_logical_size
|
||||
.set(calculated_size)
|
||||
{
|
||||
Ok(()) => (),
|
||||
Err(_what_we_just_attempted_to_set) => {
|
||||
let existing_size = self_clone
|
||||
.current_logical_size
|
||||
.initial_logical_size
|
||||
.get()
|
||||
.expect("once_cell set was lost, then get failed, impossible.");
|
||||
// This shouldn't happen because the semaphore is initialized with 1.
|
||||
// But if it happens, just complain & report success so there are no further retries.
|
||||
error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let retrying = async {
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
attempt += 1;
|
||||
|
||||
match try_once(attempt).await {
|
||||
Ok(res) => return ControlFlow::Continue(res),
|
||||
Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
|
||||
Err(BackgroundCalculationError::Other(e)) => {
|
||||
warn!(attempt, "initial size calculation failed: {e:?}");
|
||||
// exponential back-off doesn't make sense at these long intervals;
|
||||
// use fixed retry interval with generous jitter instead
|
||||
let sleep_duration = Duration::from_secs(
|
||||
u64::try_from(
|
||||
// 1hour base
|
||||
(60_i64 * 60_i64)
|
||||
// 10min jitter
|
||||
+ rand::thread_rng().gen_range(-10 * 60..10 * 60),
|
||||
)
|
||||
.expect("10min < 1hour"),
|
||||
);
|
||||
tokio::time::sleep(sleep_duration).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let (calculated_size, metrics_guard) = tokio::select! {
|
||||
res = retrying => {
|
||||
match res {
|
||||
ControlFlow::Continue(calculated_size) => calculated_size,
|
||||
ControlFlow::Break(()) => return,
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// we cannot query current_logical_size.current_size() to know the current
|
||||
// *negative* value, only truncated to u64.
|
||||
let added = self
|
||||
.current_logical_size
|
||||
.size_added_after_initial
|
||||
.load(AtomicOrdering::Relaxed);
|
||||
|
||||
let sum = calculated_size.saturating_add_signed(added);
|
||||
|
||||
// set the gauge value before it can be set in `update_current_logical_size`.
|
||||
self.metrics.current_logical_size_gauge.set(sum);
|
||||
|
||||
self.current_logical_size
|
||||
.initial_logical_size
|
||||
.set((calculated_size, metrics_guard.calculation_result_saved()))
|
||||
.ok()
|
||||
.expect("only this task sets it");
|
||||
// now that `initial_logical_size.is_some()`, reduce permit count to 0
|
||||
// so that we prevent future callers from spawning this task
|
||||
permit.forget();
|
||||
Ok(())
|
||||
}.in_current_span(),
|
||||
);
|
||||
}
|
||||
|
||||
pub fn spawn_ondemand_logical_size_calculation(
|
||||
@@ -2036,9 +1948,6 @@ impl Timeline {
|
||||
receiver
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
#[instrument(skip_all)]
|
||||
async fn logical_size_calculation_task(
|
||||
self: &Arc<Self>,
|
||||
@@ -2076,10 +1985,6 @@ impl Timeline {
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
|
||||
/// especially if we need to download remote layers.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn calculate_logical_size(
|
||||
&self,
|
||||
up_to_lsn: Lsn,
|
||||
@@ -2144,14 +2049,16 @@ impl Timeline {
|
||||
// one value while current_logical_size is set to the
|
||||
// other.
|
||||
match logical_size.current_size() {
|
||||
CurrentLogicalSize::Exact(ref new_current_size) => self
|
||||
Ok(CurrentLogicalSize::Exact(new_current_size)) => self
|
||||
.metrics
|
||||
.current_logical_size_gauge
|
||||
.set(new_current_size.into()),
|
||||
CurrentLogicalSize::Approximate(_) => {
|
||||
.set(new_current_size),
|
||||
Ok(CurrentLogicalSize::Approximate(_)) => {
|
||||
// don't update the gauge yet, this allows us not to update the gauge back and
|
||||
// forth between the initial size calculation task.
|
||||
}
|
||||
// this is overflow
|
||||
Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2195,10 +2102,6 @@ impl Timeline {
|
||||
///
|
||||
/// This function takes the current timeline's locked LayerMap as an argument,
|
||||
/// so callers can avoid potential race conditions.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
async fn get_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
@@ -2447,9 +2350,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// # Cancel-safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
async fn lookup_cached_page(
|
||||
&self,
|
||||
key: &Key,
|
||||
@@ -2484,10 +2384,6 @@ impl Timeline {
|
||||
Ok(Arc::clone(ancestor))
|
||||
}
|
||||
|
||||
pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
|
||||
&self.shard_identity
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the latest layer for appending.
|
||||
///
|
||||
|
||||
@@ -21,6 +21,7 @@ use crate::{
|
||||
},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant,
|
||||
},
|
||||
InitializationOrder,
|
||||
};
|
||||
|
||||
use super::{Timeline, TimelineResources};
|
||||
@@ -406,6 +407,7 @@ impl DeleteTimelineFlow {
|
||||
local_metadata: &TimelineMetadata,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
|
||||
// RemoteTimelineClient is the only functioning part.
|
||||
@@ -418,6 +420,7 @@ impl DeleteTimelineFlow {
|
||||
remote_client,
|
||||
deletion_queue_client,
|
||||
},
|
||||
init_order,
|
||||
// Important. We dont pass ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
CreateTimelineCause::Delete,
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use anyhow::Context;
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use tokio::sync::Semaphore;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Internal structure to hold all data needed for logical size calculation.
|
||||
///
|
||||
@@ -22,17 +23,10 @@ pub(super) struct LogicalSize {
|
||||
///
|
||||
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
|
||||
/// the initial size at a different LSN.
|
||||
pub initial_logical_size: OnceCell<(
|
||||
u64,
|
||||
crate::metrics::initial_logical_size::FinishedCalculationGuard,
|
||||
)>,
|
||||
pub initial_logical_size: OnceCell<u64>,
|
||||
|
||||
/// Cancellation for the best-effort logical size calculation.
|
||||
///
|
||||
/// The token is kept in a once-cell so that we can error out if a higher priority
|
||||
/// request comes in *before* we have started the normal logical size calculation.
|
||||
pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
|
||||
OnceCell<CancellationToken>,
|
||||
/// Semaphore to track ongoing calculation of `initial_logical_size`.
|
||||
pub initial_size_computation: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
|
||||
pub initial_part_end: Option<Lsn>,
|
||||
@@ -58,57 +52,25 @@ pub(super) struct LogicalSize {
|
||||
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
|
||||
/// to modify this, it will also keep the prometheus metric in sync.
|
||||
pub size_added_after_initial: AtomicI64,
|
||||
|
||||
/// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
|
||||
pub(super) did_return_approximate_to_walreceiver: AtomicBool,
|
||||
}
|
||||
|
||||
/// Normalized current size, that the data in pageserver occupies.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) enum CurrentLogicalSize {
|
||||
pub(super) enum CurrentLogicalSize {
|
||||
/// The size is not yet calculated to the end, this is an intermediate result,
|
||||
/// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
|
||||
/// yet total logical size cannot be below 0.
|
||||
Approximate(Approximate),
|
||||
Approximate(u64),
|
||||
// Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
|
||||
// available for observation without any calculations.
|
||||
Exact(Exact),
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
||||
pub(crate) enum Accuracy {
|
||||
Approximate,
|
||||
Exact,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) struct Approximate(u64);
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) struct Exact(u64);
|
||||
|
||||
impl From<&Approximate> for u64 {
|
||||
fn from(value: &Approximate) -> Self {
|
||||
value.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&Exact> for u64 {
|
||||
fn from(val: &Exact) -> Self {
|
||||
val.0
|
||||
}
|
||||
Exact(u64),
|
||||
}
|
||||
|
||||
impl CurrentLogicalSize {
|
||||
pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
|
||||
match self {
|
||||
Self::Approximate(size) => size.into(),
|
||||
Self::Exact(size) => size.into(),
|
||||
}
|
||||
}
|
||||
pub(crate) fn accuracy(&self) -> Accuracy {
|
||||
match self {
|
||||
Self::Approximate(_) => Accuracy::Approximate,
|
||||
Self::Exact(_) => Accuracy::Exact,
|
||||
pub(super) fn size(&self) -> u64 {
|
||||
*match self {
|
||||
Self::Approximate(size) => size,
|
||||
Self::Exact(size) => size,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -116,42 +78,36 @@ impl CurrentLogicalSize {
|
||||
impl LogicalSize {
|
||||
pub(super) fn empty_initial() -> Self {
|
||||
Self {
|
||||
initial_logical_size: OnceCell::with_value((0, {
|
||||
crate::metrics::initial_logical_size::START_CALCULATION
|
||||
.first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
|
||||
.calculation_result_saved()
|
||||
})),
|
||||
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
|
||||
initial_logical_size: OnceCell::with_value(0),
|
||||
// initial_logical_size already computed, so, don't admit any calculations
|
||||
initial_size_computation: Arc::new(Semaphore::new(0)),
|
||||
initial_part_end: None,
|
||||
size_added_after_initial: AtomicI64::new(0),
|
||||
did_return_approximate_to_walreceiver: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
|
||||
Self {
|
||||
initial_logical_size: OnceCell::new(),
|
||||
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
|
||||
initial_size_computation: Arc::new(Semaphore::new(1)),
|
||||
initial_part_end: Some(compute_to),
|
||||
size_added_after_initial: AtomicI64::new(0),
|
||||
did_return_approximate_to_walreceiver: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn current_size(&self) -> CurrentLogicalSize {
|
||||
pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
|
||||
let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
|
||||
// ^^^ keep this type explicit so that the casts in this function break if
|
||||
// we change the type.
|
||||
match self.initial_logical_size.get() {
|
||||
Some((initial_size, _)) => {
|
||||
CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
|
||||
Some(initial_size) => {
|
||||
initial_size.checked_add_signed(size_increment)
|
||||
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
|
||||
.unwrap()))
|
||||
.map(CurrentLogicalSize::Exact)
|
||||
}
|
||||
None => {
|
||||
|
||||
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
|
||||
CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
|
||||
Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -165,7 +121,7 @@ impl LogicalSize {
|
||||
/// available for re-use. This doesn't contain the incremental part.
|
||||
pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
|
||||
match self.initial_part_end {
|
||||
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
|
||||
Some(v) if v == lsn => self.initial_logical_size.get().copied(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,15 +396,11 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
// Send the replication feedback message.
|
||||
// Regular standby_status_update fields are put into this message.
|
||||
let current_timeline_size = timeline
|
||||
.get_current_logical_size(
|
||||
crate::tenant::timeline::GetLogicalSizePriority::User,
|
||||
&ctx,
|
||||
)
|
||||
// FIXME: https://github.com/neondatabase/neon/issues/5963
|
||||
.size_dont_care_about_accuracy();
|
||||
let (timeline_logical_size, _) = timeline
|
||||
.get_current_logical_size(&ctx)
|
||||
.context("Status update creation failed to get current logical size")?;
|
||||
let status_update = PageserverFeedback {
|
||||
current_timeline_size,
|
||||
current_timeline_size: timeline_logical_size,
|
||||
last_received_lsn,
|
||||
disk_consistent_lsn,
|
||||
remote_consistent_lsn,
|
||||
|
||||
@@ -1437,16 +1437,7 @@ impl<'a> WalIngest<'a> {
|
||||
// record.
|
||||
// TODO: would be nice if to be more explicit about it
|
||||
let last_lsn = modification.lsn;
|
||||
|
||||
// Get current size and put rel creation if rel doesn't exist
|
||||
//
|
||||
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
|
||||
// check the cache too. This is because eagerly checking the cache results in
|
||||
// less work overall and 10% better performance. It's more work on cache miss
|
||||
// but cache miss is rare.
|
||||
let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
|
||||
nblocks
|
||||
} else if !self
|
||||
let old_nblocks = if !self
|
||||
.timeline
|
||||
.get_rel_exists(rel, last_lsn, true, ctx)
|
||||
.await?
|
||||
@@ -2124,7 +2115,7 @@ mod tests {
|
||||
.load()
|
||||
.await;
|
||||
let tline = tenant
|
||||
.bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
|
||||
.bootstrap_timeline(TIMELINE_ID, pg_version, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
@@ -34,6 +34,7 @@ use std::process::{Child, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
@@ -123,9 +124,7 @@ impl PostgresRedoManager {
|
||||
/// The WAL redo is handled by a separate thread, so this just sends a request
|
||||
/// to the thread and waits for response.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
/// CANCEL SAFETY: NOT CANCEL SAFE.
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: Key,
|
||||
@@ -158,6 +157,7 @@ impl PostgresRedoManager {
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
)
|
||||
.await
|
||||
};
|
||||
img = Some(result?);
|
||||
|
||||
@@ -178,6 +178,7 @@ impl PostgresRedoManager {
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -215,7 +216,7 @@ impl PostgresRedoManager {
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn apply_batch_postgres(
|
||||
async fn apply_batch_postgres(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
@@ -331,7 +332,12 @@ impl PostgresRedoManager {
|
||||
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
|
||||
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
|
||||
// This probably needs revisiting at some later point.
|
||||
let mut wait_done = proc.stderr_logger_task_done.clone();
|
||||
drop(proc);
|
||||
wait_done
|
||||
.wait_for(|v| *v)
|
||||
.await
|
||||
.expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
|
||||
} else if n_attempts != 0 {
|
||||
info!(n_attempts, "retried walredo succeeded");
|
||||
}
|
||||
@@ -643,6 +649,8 @@ struct WalRedoProcess {
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
stderr_logger_cancel: CancellationToken,
|
||||
stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
@@ -691,8 +699,6 @@ impl WalRedoProcess {
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
@@ -704,45 +710,69 @@ impl WalRedoProcess {
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
set_nonblock_or_log_err!(stderr)?;
|
||||
|
||||
let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
let stderr_logger_cancel = CancellationToken::new();
|
||||
let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
|
||||
tokio::sync::watch::channel(false);
|
||||
tokio::spawn({
|
||||
let stderr_logger_cancel = stderr_logger_cancel.clone();
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
let _ = stderr_logger_task_done_tx.send(true);
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
loop {
|
||||
// NB: we purposefully don't do a select! for the cancellation here.
|
||||
// The cancellation would likely cause us to miss stderr messages.
|
||||
// We can rely on this to return from .await because when we SIGKILL
|
||||
// the child, the writing end of the stderr pipe gets closed.
|
||||
match stderr.readable_mut().await {
|
||||
Ok(mut guard) => {
|
||||
let mut errbuf = [0; 16384];
|
||||
let res = guard.try_io(|fd| {
|
||||
use std::io::Read;
|
||||
fd.get_mut().read(&mut errbuf)
|
||||
});
|
||||
match res {
|
||||
Ok(Ok(0)) => {
|
||||
// it closed the stderr pipe
|
||||
break;
|
||||
}
|
||||
Ok(Ok(n)) => {
|
||||
// The message might not be split correctly into lines here. But this is
|
||||
// good enough, the important thing is to get the message to the log.
|
||||
let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
|
||||
error!(output, "received output");
|
||||
},
|
||||
Ok(Err(e)) => {
|
||||
error!(error = ?e, "read() error, waiting for cancellation");
|
||||
stderr_logger_cancel.cancelled().await;
|
||||
error!(error = ?e, "read() error, cancellation complete");
|
||||
break;
|
||||
}
|
||||
Err(e) => {
|
||||
let _e: tokio::io::unix::TryIoError = e;
|
||||
// the read() returned WouldBlock, that's expected
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
error!(error = ?e, "read() error, waiting for cancellation");
|
||||
stderr_logger_cancel.cancelled().await;
|
||||
error!(error = ?e, "read() error, cancellation complete");
|
||||
break;
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
|
||||
);
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
@@ -757,6 +787,8 @@ impl WalRedoProcess {
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
stderr_logger_cancel,
|
||||
stderr_logger_task_done: stderr_logger_task_done_rx,
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
@@ -997,6 +1029,7 @@ impl Drop for WalRedoProcess {
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
self.stderr_logger_cancel.cancel();
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include "access/xlog.h"
|
||||
#include "access/xlogutils.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/lwlock.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/pg_shmem.h"
|
||||
#include "c.h"
|
||||
@@ -36,8 +37,6 @@
|
||||
#include "walproposer.h"
|
||||
#include "neon_utils.h"
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#define PageStoreTrace DEBUG5
|
||||
|
||||
#define RECONNECT_INTERVAL_USEC 1000000
|
||||
@@ -70,7 +69,7 @@ int max_reconnect_attempts = 60;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
pthread_rwlock_t lock;
|
||||
LWLockId lock;
|
||||
pg_atomic_uint64 update_counter;
|
||||
char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
|
||||
} PagestoreShmemState;
|
||||
@@ -106,10 +105,10 @@ AssignPageserverConnstring(const char *newval, void *extra)
|
||||
{
|
||||
if(!PagestoreShmemIsValid())
|
||||
return;
|
||||
pthread_rwlock_wrlock(&pagestore_shared->lock);
|
||||
LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
|
||||
strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
|
||||
pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
|
||||
pthread_rwlock_unlock(&pagestore_shared->lock);
|
||||
LWLockRelease(pagestore_shared->lock);
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -120,24 +119,15 @@ CheckConnstringUpdated()
|
||||
return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
|
||||
}
|
||||
|
||||
/* Returns true if the connstring has changed and false if not */
|
||||
static bool
|
||||
static void
|
||||
ReloadConnstring()
|
||||
{
|
||||
if(!PagestoreShmemIsValid())
|
||||
return false;
|
||||
pthread_rwlock_rdlock(&pagestore_shared->lock);
|
||||
|
||||
if(strcmp(local_pageserver_connstring, pagestore_shared->pageserver_connstring) == 0)
|
||||
{
|
||||
pthread_rwlock_unlock(&pagestore_shared->lock);
|
||||
return false;
|
||||
}
|
||||
|
||||
return;
|
||||
LWLockAcquire(pagestore_shared->lock, LW_SHARED);
|
||||
strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
|
||||
pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
|
||||
pthread_rwlock_unlock(&pagestore_shared->lock);
|
||||
return true;
|
||||
LWLockRelease(pagestore_shared->lock);
|
||||
}
|
||||
|
||||
static bool
|
||||
@@ -300,6 +290,7 @@ pageserver_disconnect(void)
|
||||
*/
|
||||
if (connected)
|
||||
{
|
||||
neon_log(LOG, "dropping connection to page server due to error");
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
connected = false;
|
||||
@@ -320,12 +311,8 @@ pageserver_send(NeonRequest * request)
|
||||
|
||||
if(CheckConnstringUpdated())
|
||||
{
|
||||
bool should_disconnect = ReloadConnstring();
|
||||
if(should_disconnect)
|
||||
{
|
||||
neon_log(LOG, "pageserver_send disconnect because connstring changed");
|
||||
pageserver_disconnect();
|
||||
}
|
||||
pageserver_disconnect();
|
||||
ReloadConnstring();
|
||||
}
|
||||
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
@@ -497,12 +484,7 @@ PagestoreShmemInit(void)
|
||||
&found);
|
||||
if(!found)
|
||||
{
|
||||
pthread_rwlockattr_t attr;
|
||||
pthread_rwlockattr_init(&attr);
|
||||
pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
|
||||
pthread_rwlock_init(&pagestore_shared->lock, &attr);
|
||||
pthread_rwlockattr_destroy(&attr);
|
||||
|
||||
pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
|
||||
pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
|
||||
AssignPageserverConnstring(page_server_connstring, NULL);
|
||||
}
|
||||
|
||||
@@ -59,7 +59,6 @@
|
||||
#include "replication/walsender.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/fsm_internals.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "storage/md.h"
|
||||
#include "pgstat.h"
|
||||
@@ -2723,86 +2722,6 @@ smgr_init_neon(void)
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
|
||||
{
|
||||
BlockNumber relsize;
|
||||
/* Extend the relation if we know its size */
|
||||
if (get_cached_relsize(rinfo, forknum, &relsize))
|
||||
{
|
||||
if (relsize < blkno + 1)
|
||||
{
|
||||
update_cached_relsize(rinfo, forknum, blkno + 1);
|
||||
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Size was not cached. We populate the cache now, with the size of the
|
||||
* relation measured after this WAL record is applied.
|
||||
*
|
||||
* This length is later reused when we open the smgr to read the block,
|
||||
* which is fine and expected.
|
||||
*/
|
||||
|
||||
NeonResponse *response;
|
||||
NeonNblocksResponse *nbresponse;
|
||||
NeonNblocksRequest request = {
|
||||
.req = (NeonRequest) {
|
||||
.lsn = end_recptr,
|
||||
.latest = false,
|
||||
.tag = T_NeonNblocksRequest,
|
||||
},
|
||||
.rinfo = rinfo,
|
||||
.forknum = forknum,
|
||||
};
|
||||
|
||||
response = page_server_request(&request);
|
||||
|
||||
Assert(response->tag == T_NeonNblocksResponse);
|
||||
nbresponse = (NeonNblocksResponse *) response;
|
||||
|
||||
relsize = Max(nbresponse->n_blocks, blkno+1);
|
||||
|
||||
set_cached_relsize(rinfo, forknum, relsize);
|
||||
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
|
||||
|
||||
elog(SmgrTrace, "Set length to %d", relsize);
|
||||
}
|
||||
}
|
||||
|
||||
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
|
||||
|
||||
/*
|
||||
* TODO: May be it is better to make correspondent fgunctio from freespace.c public?
|
||||
*/
|
||||
static BlockNumber
|
||||
get_fsm_physical_block(BlockNumber heapblk)
|
||||
{
|
||||
BlockNumber pages;
|
||||
int leafno;
|
||||
int l;
|
||||
|
||||
/*
|
||||
* Calculate the logical page number of the first leaf page below the
|
||||
* given page.
|
||||
*/
|
||||
leafno = heapblk / SlotsPerFSMPage;
|
||||
|
||||
/* Count upper level nodes required to address the leaf page */
|
||||
pages = 0;
|
||||
for (l = 0; l < FSM_TREE_DEPTH; l++)
|
||||
{
|
||||
pages += leafno + 1;
|
||||
leafno /= SlotsPerFSMPage;
|
||||
}
|
||||
|
||||
/* Turn the page count into 0-based block number */
|
||||
return pages - 1;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Return whether we can skip the redo for this block.
|
||||
*
|
||||
@@ -2850,6 +2769,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
LWLock *partitionLock;
|
||||
Buffer buffer;
|
||||
bool no_redo_needed;
|
||||
BlockNumber relsize;
|
||||
|
||||
if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
|
||||
return true;
|
||||
@@ -2899,10 +2819,49 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
|
||||
LWLockRelease(partitionLock);
|
||||
|
||||
neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
|
||||
if (forknum == MAIN_FORKNUM)
|
||||
/* Extend the relation if we know its size */
|
||||
if (get_cached_relsize(rinfo, forknum, &relsize))
|
||||
{
|
||||
neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
|
||||
if (relsize < blkno + 1)
|
||||
{
|
||||
update_cached_relsize(rinfo, forknum, blkno + 1);
|
||||
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Size was not cached. We populate the cache now, with the size of the
|
||||
* relation measured after this WAL record is applied.
|
||||
*
|
||||
* This length is later reused when we open the smgr to read the block,
|
||||
* which is fine and expected.
|
||||
*/
|
||||
|
||||
NeonResponse *response;
|
||||
NeonNblocksResponse *nbresponse;
|
||||
NeonNblocksRequest request = {
|
||||
.req = (NeonRequest) {
|
||||
.lsn = end_recptr,
|
||||
.latest = false,
|
||||
.tag = T_NeonNblocksRequest,
|
||||
},
|
||||
.rinfo = rinfo,
|
||||
.forknum = forknum,
|
||||
};
|
||||
|
||||
response = page_server_request(&request);
|
||||
|
||||
Assert(response->tag == T_NeonNblocksResponse);
|
||||
nbresponse = (NeonNblocksResponse *) response;
|
||||
|
||||
Assert(nbresponse->n_blocks > blkno);
|
||||
|
||||
set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
|
||||
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
|
||||
|
||||
elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
|
||||
}
|
||||
|
||||
return no_redo_needed;
|
||||
}
|
||||
|
||||
20
poetry.lock
generated
20
poetry.lock
generated
@@ -1967,18 +1967,18 @@ pytest = [
|
||||
|
||||
[[package]]
|
||||
name = "pytest-rerunfailures"
|
||||
version = "13.0"
|
||||
version = "11.1.2"
|
||||
description = "pytest plugin to re-run tests to eliminate flaky failures"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"},
|
||||
{file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"},
|
||||
{file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
|
||||
{file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
packaging = ">=17.1"
|
||||
pytest = ">=7"
|
||||
pytest = ">=5.3"
|
||||
|
||||
[[package]]
|
||||
name = "pytest-split"
|
||||
@@ -2476,6 +2476,16 @@ files = [
|
||||
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
|
||||
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
|
||||
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
|
||||
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
|
||||
@@ -2697,4 +2707,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
|
||||
content-hash = "25ffa9ed98d890a3b85e6036792296a60bb705e8f9eaa1f07336501116a58756"
|
||||
|
||||
@@ -69,7 +69,6 @@ webpki-roots.workspace = true
|
||||
x509-parser.workspace = true
|
||||
native-tls.workspace = true
|
||||
postgres-native-tls.workspace = true
|
||||
smol_str.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
tokio-util.workspace = true
|
||||
|
||||
@@ -106,7 +106,7 @@ pub(super) async fn authenticate(
|
||||
reported_auth_ok: true,
|
||||
value: NodeInfo {
|
||||
config,
|
||||
aux: db_info.aux,
|
||||
aux: db_info.aux.into(),
|
||||
allow_self_signed_compute: false, // caller may override
|
||||
},
|
||||
})
|
||||
|
||||
@@ -284,5 +284,5 @@ async fn handle_client(
|
||||
let client = tokio::net::TcpStream::connect(destination).await?;
|
||||
|
||||
let metrics_aux: MetricsAuxInfo = Default::default();
|
||||
proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
|
||||
proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await
|
||||
}
|
||||
|
||||
@@ -103,7 +103,7 @@ struct ProxyCliArgs {
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
require_client_ip: bool,
|
||||
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_dynamic_rate_limiter: bool,
|
||||
/// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
|
||||
#[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use serde::Deserialize;
|
||||
use smol_str::SmolStr;
|
||||
use std::fmt;
|
||||
|
||||
/// Generic error response with human-readable description.
|
||||
@@ -89,11 +88,11 @@ impl fmt::Debug for DatabaseInfo {
|
||||
|
||||
/// Various labels for prometheus metrics.
|
||||
/// Also known as `ProxyMetricsAuxInfo` in the console.
|
||||
#[derive(Debug, Deserialize, Clone, Default)]
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
pub struct MetricsAuxInfo {
|
||||
pub endpoint_id: SmolStr,
|
||||
pub project_id: SmolStr,
|
||||
pub branch_id: SmolStr,
|
||||
pub endpoint_id: Box<str>,
|
||||
pub project_id: Box<str>,
|
||||
pub branch_id: Box<str>,
|
||||
}
|
||||
|
||||
impl MetricsAuxInfo {
|
||||
|
||||
@@ -229,7 +229,7 @@ pub struct NodeInfo {
|
||||
pub config: compute::ConnCfg,
|
||||
|
||||
/// Labels for proxy's metrics.
|
||||
pub aux: MetricsAuxInfo,
|
||||
pub aux: Arc<MetricsAuxInfo>,
|
||||
|
||||
/// Whether we should accept self-signed certificates (for testing)
|
||||
pub allow_self_signed_compute: bool,
|
||||
|
||||
@@ -144,7 +144,7 @@ impl Api {
|
||||
|
||||
let node = NodeInfo {
|
||||
config,
|
||||
aux: body.aux,
|
||||
aux: body.aux.into(),
|
||||
allow_self_signed_compute: false,
|
||||
};
|
||||
|
||||
|
||||
@@ -134,9 +134,9 @@ pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
|
||||
pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"proxy_control_plane_token_acquire_seconds",
|
||||
"semaphore_control_plane_token_acquire_seconds",
|
||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||
// largest bucket = 3^16 * 0.05ms = 2.15s
|
||||
// largest bucket = 3^16 * 0.00005ms = 2.15s
|
||||
exponential_buckets(0.00005, 3.0, 16).unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
@@ -877,11 +877,11 @@ async fn prepare_client_connection(
|
||||
pub async fn proxy_pass(
|
||||
client: impl AsyncRead + AsyncWrite + Unpin,
|
||||
compute: impl AsyncRead + AsyncWrite + Unpin,
|
||||
aux: MetricsAuxInfo,
|
||||
aux: &MetricsAuxInfo,
|
||||
) -> anyhow::Result<()> {
|
||||
let usage = USAGE_METRICS.register(Ids {
|
||||
endpoint_id: aux.endpoint_id.clone(),
|
||||
branch_id: aux.branch_id.clone(),
|
||||
endpoint_id: aux.endpoint_id.to_string(),
|
||||
branch_id: aux.branch_id.to_string(),
|
||||
});
|
||||
|
||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
|
||||
@@ -1032,7 +1032,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
// immediately after opening the connection.
|
||||
let (stream, read_buf) = stream.into_inner();
|
||||
node.stream.write_all(&read_buf).await?;
|
||||
proxy_pass(stream, node.stream, aux).await
|
||||
proxy_pass(stream, node.stream, &aux).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ use pbkdf2::{
|
||||
Params, Pbkdf2,
|
||||
};
|
||||
use pq_proto::StartupMessageParams;
|
||||
use smol_str::SmolStr;
|
||||
use std::{collections::HashMap, net::SocketAddr, sync::Arc};
|
||||
use std::{
|
||||
fmt,
|
||||
@@ -42,16 +41,16 @@ const MAX_CONNS_PER_ENDPOINT: usize = 20;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ConnInfo {
|
||||
pub username: SmolStr,
|
||||
pub dbname: SmolStr,
|
||||
pub hostname: SmolStr,
|
||||
pub password: SmolStr,
|
||||
pub options: Option<SmolStr>,
|
||||
pub username: String,
|
||||
pub dbname: String,
|
||||
pub hostname: String,
|
||||
pub password: String,
|
||||
pub options: Option<String>,
|
||||
}
|
||||
|
||||
impl ConnInfo {
|
||||
// hm, change to hasher to avoid cloning?
|
||||
pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
|
||||
pub fn db_and_user(&self) -> (String, String) {
|
||||
(self.dbname.clone(), self.username.clone())
|
||||
}
|
||||
}
|
||||
@@ -71,7 +70,7 @@ struct ConnPoolEntry {
|
||||
// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
|
||||
// Number of open connections is limited by the `max_conns_per_endpoint`.
|
||||
pub struct EndpointConnPool {
|
||||
pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
|
||||
pools: HashMap<(String, String), DbUserConnPool>,
|
||||
total_conns: usize,
|
||||
}
|
||||
|
||||
@@ -96,7 +95,7 @@ pub struct GlobalConnPool {
|
||||
//
|
||||
// That should be a fairly conteded map, so return reference to the per-endpoint
|
||||
// pool as early as possible and release the lock.
|
||||
global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
|
||||
global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,
|
||||
|
||||
/// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
|
||||
/// That seems like far too much effort, so we're using a relaxed increment counter instead.
|
||||
@@ -328,7 +327,7 @@ impl GlobalConnPool {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
|
||||
fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
|
||||
// fast path
|
||||
if let Some(pool) = self.global_pool.get(endpoint) {
|
||||
return pool.clone();
|
||||
@@ -469,7 +468,7 @@ async fn connect_to_compute_once(
|
||||
|
||||
let (client, mut connection) = config
|
||||
.user(&conn_info.username)
|
||||
.password(&*conn_info.password)
|
||||
.password(&conn_info.password)
|
||||
.dbname(&conn_info.dbname)
|
||||
.connect_timeout(timeout)
|
||||
.connect(tokio_postgres::NoTls)
|
||||
@@ -483,8 +482,8 @@ async fn connect_to_compute_once(
|
||||
info!(%conn_info, %session, "new connection");
|
||||
});
|
||||
let ids = Ids {
|
||||
endpoint_id: node_info.aux.endpoint_id.clone(),
|
||||
branch_id: node_info.aux.branch_id.clone(),
|
||||
endpoint_id: node_info.aux.endpoint_id.to_string(),
|
||||
branch_id: node_info.aux.branch_id.to_string(),
|
||||
};
|
||||
|
||||
tokio::spawn(
|
||||
|
||||
@@ -182,16 +182,16 @@ fn get_conn_info(
|
||||
|
||||
for (key, value) in pairs {
|
||||
if key == "options" {
|
||||
options = Some(value.into());
|
||||
options = Some(value.to_string());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ConnInfo {
|
||||
username: username.into(),
|
||||
dbname: dbname.into(),
|
||||
hostname: hostname.into(),
|
||||
password: password.into(),
|
||||
username: username.to_owned(),
|
||||
dbname: dbname.to_owned(),
|
||||
hostname: hostname.to_owned(),
|
||||
password: password.to_owned(),
|
||||
options,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_S
|
||||
use dashmap::{mapref::entry::Entry, DashMap};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use smol_str::SmolStr;
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
sync::{
|
||||
@@ -30,8 +29,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
/// because we enrich the event with project_id in the control-plane endpoint.
|
||||
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
|
||||
pub struct Ids {
|
||||
pub endpoint_id: SmolStr,
|
||||
pub branch_id: SmolStr,
|
||||
pub endpoint_id: String,
|
||||
pub branch_id: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -291,8 +290,8 @@ mod tests {
|
||||
|
||||
// register a new counter
|
||||
let counter = metrics.register(Ids {
|
||||
endpoint_id: "e1".into(),
|
||||
branch_id: "b1".into(),
|
||||
endpoint_id: "e1".to_string(),
|
||||
branch_id: "b1".to_string(),
|
||||
});
|
||||
|
||||
// the counter should be observed despite 0 egress
|
||||
|
||||
@@ -34,7 +34,7 @@ types-psutil = "^5.9.5.12"
|
||||
types-toml = "^0.10.8.6"
|
||||
pytest-httpserver = "^1.0.8"
|
||||
aiohttp = "3.9.0"
|
||||
pytest-rerunfailures = "^13.0"
|
||||
pytest-rerunfailures = "^11.1.2"
|
||||
types-pytest-lazy-fixture = "^0.6.3.3"
|
||||
pytest-split = "^0.8.1"
|
||||
zstandard = "^0.21.0"
|
||||
|
||||
@@ -178,11 +178,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
system_id: request_data.system_id.unwrap_or(0),
|
||||
wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32),
|
||||
};
|
||||
let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| {
|
||||
request_data
|
||||
.commit_lsn
|
||||
.segment_lsn(server_info.wal_seg_size as usize)
|
||||
});
|
||||
let local_start_lsn = request_data
|
||||
.local_start_lsn
|
||||
.unwrap_or(request_data.commit_lsn);
|
||||
GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -914,14 +914,9 @@ where
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Persist in-memory state of control file to disk.
|
||||
//
|
||||
// TODO: passing inmem_remote_consistent_lsn everywhere is ugly, better
|
||||
// separate state completely and give Arc to all those who need it.
|
||||
pub async fn persist_inmem(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
|
||||
let mut state = self.state.clone();
|
||||
state.remote_consistent_lsn = inmem_remote_consistent_lsn;
|
||||
self.persist_control_file(state).await
|
||||
/// Persist control file to disk, called only after timeline creation (bootstrap).
|
||||
pub async fn persist(&mut self) -> Result<()> {
|
||||
self.persist_control_file(self.state.clone()).await
|
||||
}
|
||||
|
||||
/// Persist in-memory state to the disk, taking other data from state.
|
||||
@@ -935,7 +930,7 @@ where
|
||||
|
||||
/// Persist control file if there is something to save and enough time
|
||||
/// passed after the last save.
|
||||
pub async fn maybe_persist_inmem_control_file(
|
||||
pub async fn maybe_persist_control_file(
|
||||
&mut self,
|
||||
inmem_remote_consistent_lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
@@ -948,7 +943,9 @@ where
|
||||
|| self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
|
||||
|| inmem_remote_consistent_lsn > self.state.remote_consistent_lsn;
|
||||
if need_persist {
|
||||
self.persist_inmem(inmem_remote_consistent_lsn).await?;
|
||||
let mut state = self.state.clone();
|
||||
state.remote_consistent_lsn = inmem_remote_consistent_lsn;
|
||||
self.persist_control_file(state).await?;
|
||||
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
|
||||
}
|
||||
Ok(())
|
||||
@@ -1067,6 +1064,8 @@ where
|
||||
|
||||
if sync_control_file {
|
||||
let mut state = self.state.clone();
|
||||
// Note: we could make remote_consistent_lsn update in cf common by
|
||||
// storing Arc to walsenders in Safekeeper.
|
||||
state.remote_consistent_lsn = new_remote_consistent_lsn;
|
||||
self.persist_control_file(state).await?;
|
||||
}
|
||||
|
||||
@@ -182,9 +182,8 @@ impl SharedState {
|
||||
}
|
||||
|
||||
/// Mark timeline active/inactive and return whether s3 offloading requires
|
||||
/// start/stop action. If timeline is deactivated, control file is persisted
|
||||
/// as maintenance task does that only for active timelines.
|
||||
async fn update_status(
|
||||
/// start/stop action.
|
||||
fn update_status(
|
||||
&mut self,
|
||||
num_computes: usize,
|
||||
remote_consistent_lsn: Lsn,
|
||||
@@ -192,15 +191,7 @@ impl SharedState {
|
||||
) -> bool {
|
||||
let is_active = self.is_active(num_computes, remote_consistent_lsn);
|
||||
if self.active != is_active {
|
||||
info!(
|
||||
"timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
|
||||
ttid, is_active, remote_consistent_lsn, self.sk.inmem.commit_lsn
|
||||
);
|
||||
if !is_active {
|
||||
if let Err(e) = self.sk.persist_inmem(remote_consistent_lsn).await {
|
||||
warn!("control file save in update_status failed: {:?}", e);
|
||||
}
|
||||
}
|
||||
info!("timeline {} active={} now", ttid, is_active);
|
||||
}
|
||||
self.active = is_active;
|
||||
self.is_wal_backup_action_pending(num_computes)
|
||||
@@ -447,7 +438,7 @@ impl Timeline {
|
||||
fs::create_dir_all(&self.timeline_dir).await?;
|
||||
|
||||
// Write timeline to disk and start background tasks.
|
||||
if let Err(e) = shared_state.sk.persist_inmem(Lsn::INVALID).await {
|
||||
if let Err(e) = shared_state.sk.persist().await {
|
||||
// Bootstrap failed, cancel timeline and remove timeline directory.
|
||||
self.cancel(shared_state);
|
||||
|
||||
@@ -520,14 +511,12 @@ impl Timeline {
|
||||
self.mutex.lock().await
|
||||
}
|
||||
|
||||
async fn update_status(&self, shared_state: &mut SharedState) -> bool {
|
||||
shared_state
|
||||
.update_status(
|
||||
self.walreceivers.get_num(),
|
||||
self.get_walsenders().get_remote_consistent_lsn(),
|
||||
self.ttid,
|
||||
)
|
||||
.await
|
||||
fn update_status(&self, shared_state: &mut SharedState) -> bool {
|
||||
shared_state.update_status(
|
||||
self.walreceivers.get_num(),
|
||||
self.get_walsenders().get_remote_consistent_lsn(),
|
||||
self.ttid,
|
||||
)
|
||||
}
|
||||
|
||||
/// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
|
||||
@@ -537,7 +526,7 @@ impl Timeline {
|
||||
}
|
||||
let is_wal_backup_action_pending: bool = {
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
self.update_status(&mut shared_state).await
|
||||
self.update_status(&mut shared_state)
|
||||
};
|
||||
if is_wal_backup_action_pending {
|
||||
// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
@@ -694,7 +683,7 @@ impl Timeline {
|
||||
shared_state.sk.record_safekeeper_info(&sk_info).await?;
|
||||
let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
|
||||
shared_state.peers_info.upsert(&peer_info);
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state);
|
||||
commit_lsn = shared_state.sk.inmem.commit_lsn;
|
||||
}
|
||||
self.commit_lsn_watch_tx.send(commit_lsn)?;
|
||||
@@ -839,7 +828,7 @@ impl Timeline {
|
||||
self.write_shared_state()
|
||||
.await
|
||||
.sk
|
||||
.maybe_persist_inmem_control_file(remote_consistent_lsn)
|
||||
.maybe_persist_control_file(remote_consistent_lsn)
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
@@ -3029,11 +3029,6 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
|
||||
"""Compute the working directory for an individual test."""
|
||||
test_name = request.node.name
|
||||
test_dir = top_output_dir / test_name.replace("/", "-")
|
||||
|
||||
# We rerun flaky tests multiple times, use a separate directory for each run.
|
||||
if (suffix := getattr(request.node, "execution_count", None)) is not None:
|
||||
test_dir = test_dir.parent / f"{test_dir.name}-{suffix}"
|
||||
|
||||
log.info(f"get_test_output_dir is {test_dir}")
|
||||
# make mypy happy
|
||||
assert isinstance(test_dir, Path)
|
||||
|
||||
@@ -260,14 +260,6 @@ class PageserverHttpClient(requests.Session):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_reset(self, tenant_id: TenantId, drop_cache: bool):
|
||||
params = {}
|
||||
if drop_cache:
|
||||
params["drop_cache"] = "true"
|
||||
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_delete(self, tenant_id: TenantId):
|
||||
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
|
||||
self.verbose_error(res)
|
||||
@@ -370,16 +362,12 @@ class PageserverHttpClient(requests.Session):
|
||||
new_timeline_id: TimelineId,
|
||||
ancestor_timeline_id: Optional[TimelineId] = None,
|
||||
ancestor_start_lsn: Optional[Lsn] = None,
|
||||
existing_initdb_timeline_id: Optional[TimelineId] = None,
|
||||
**kwargs,
|
||||
) -> Dict[Any, Any]:
|
||||
body: Dict[str, Any] = {
|
||||
"new_timeline_id": str(new_timeline_id),
|
||||
"ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
|
||||
"ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
|
||||
"existing_initdb_timeline_id": str(existing_initdb_timeline_id)
|
||||
if existing_initdb_timeline_id
|
||||
else None,
|
||||
}
|
||||
if pg_version != PgVersion.NOT_SET:
|
||||
body["pg_version"] = int(pg_version)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional
|
||||
|
||||
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
|
||||
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
@@ -235,14 +235,10 @@ if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
|
||||
|
||||
def assert_prefix_empty(
|
||||
neon_env_builder: "NeonEnvBuilder",
|
||||
prefix: Optional[str] = None,
|
||||
allowed_postfix: Optional[str] = None,
|
||||
):
|
||||
def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
|
||||
response = list_prefix(neon_env_builder, prefix)
|
||||
keys = response["KeyCount"]
|
||||
objects: List[ObjectTypeDef] = response.get("Contents", [])
|
||||
objects = response.get("Contents", [])
|
||||
common_prefixes = response.get("CommonPrefixes", [])
|
||||
|
||||
remote_storage = neon_env_builder.pageserver_remote_storage
|
||||
@@ -265,18 +261,7 @@ def assert_prefix_empty(
|
||||
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
|
||||
)
|
||||
|
||||
filtered_count = 0
|
||||
if allowed_postfix is None:
|
||||
filtered_count = len(objects)
|
||||
else:
|
||||
for _obj in objects:
|
||||
key: str = str(response.get("Key", []))
|
||||
if not (allowed_postfix.endswith(key)):
|
||||
filtered_count += 1
|
||||
|
||||
assert (
|
||||
filtered_count == 0
|
||||
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
|
||||
assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
|
||||
|
||||
|
||||
def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
|
||||
|
||||
@@ -48,6 +48,6 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st
|
||||
subprocess_capture(test_output_dir, build_cmd, check=True)
|
||||
|
||||
run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag]
|
||||
_, output, _ = subprocess_capture(test_output_dir, run_cmd, check=True, capture_stdout=True)
|
||||
basepath, _, _ = subprocess_capture(test_output_dir, run_cmd, check=True)
|
||||
|
||||
assert str(output).strip() == "1"
|
||||
assert Path(f"{basepath}.stdout").read_text().strip() == "1"
|
||||
|
||||
@@ -114,6 +114,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
|
||||
[
|
||||
".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
|
||||
".*Timeline got dropped without initializing, cleaning its files.*",
|
||||
".*Failed to load index_part from remote storage, failed creation?.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -143,13 +144,8 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
|
||||
), "pageserver should clean its temp timeline files on timeline creation failure"
|
||||
|
||||
|
||||
# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups
|
||||
@pytest.mark.parametrize("exit_or_return", ["return", "exit"])
|
||||
def test_timeline_init_break_before_checkpoint_recreate(
|
||||
neon_env_builder: NeonEnvBuilder, exit_or_return: str
|
||||
):
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
@@ -160,7 +156,6 @@ def test_timeline_init_break_before_checkpoint_recreate(
|
||||
]
|
||||
)
|
||||
|
||||
pageserver_http.tenant_create(env.initial_tenant)
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
timelines_dir = env.pageserver.timeline_dir(tenant_id)
|
||||
@@ -171,17 +166,13 @@ def test_timeline_init_break_before_checkpoint_recreate(
|
||||
timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b")
|
||||
|
||||
# Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
|
||||
failpoint = "before-checkpoint-new-timeline"
|
||||
pattern = failpoint
|
||||
if exit_or_return == "exit":
|
||||
# in reality a read error happens, but there are automatic retries which now fail because pageserver is dead
|
||||
pattern = "Connection aborted."
|
||||
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
|
||||
with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
|
||||
_ = env.neon_cli.create_timeline(
|
||||
"test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
|
||||
)
|
||||
|
||||
pageserver_http.configure_failpoints((failpoint, exit_or_return))
|
||||
with pytest.raises(Exception, match=pattern):
|
||||
_ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)
|
||||
|
||||
# Restart the page server (with the failpoint disabled)
|
||||
# Restart the page server
|
||||
env.pageserver.restart(immediate=True)
|
||||
|
||||
# Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
|
||||
@@ -195,9 +186,11 @@ def test_timeline_init_break_before_checkpoint_recreate(
|
||||
timeline_dirs == initial_timeline_dirs
|
||||
), "pageserver should clean its temp timeline files on timeline creation failure"
|
||||
|
||||
# Disable the failpoint again
|
||||
pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "off"))
|
||||
# creating the branch should have worked now
|
||||
new_timeline_id = TimelineId(
|
||||
pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"]
|
||||
new_timeline_id = env.neon_cli.create_timeline(
|
||||
"test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
|
||||
)
|
||||
|
||||
assert timeline_id == new_timeline_id
|
||||
|
||||
@@ -60,10 +60,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
|
||||
execute("SELECT count(*) FROM foo")
|
||||
assert fetchone() == (100000,)
|
||||
|
||||
# Reconfigure it using the same connstring just to make sure nothing breaks
|
||||
# as we have special handling for if the connstring doesn't change
|
||||
for _ in range(5):
|
||||
endpoint.reconfigure(pageserver_id=alt_pageserver_id)
|
||||
endpoint.reconfigure(pageserver_id=alt_pageserver_id)
|
||||
|
||||
# Verify that the neon.pageserver_connstring GUC is set to the correct thing
|
||||
execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
|
||||
|
||||
@@ -49,7 +49,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
|
||||
"compaction_period": "0s", # we want to control when compaction runs
|
||||
"checkpoint_timeout": "24h", # something we won't reach
|
||||
"checkpoint_distance": f"{50 * (1024**2)}", # something we won't reach, we checkpoint manually
|
||||
"image_creation_threshold": "100", # we want to control when image is created
|
||||
"image_creation_threshold": f"{image_creation_threshold}",
|
||||
"compaction_threshold": f"{l0_l1_threshold}",
|
||||
"compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
|
||||
}
|
||||
@@ -124,10 +124,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
|
||||
), "sanity check for what above loop is supposed to do"
|
||||
|
||||
# create the image layer from the future
|
||||
ps_http.patch_tenant_config_client_side(
|
||||
tenant_id, {"image_creation_threshold": image_creation_threshold}, None
|
||||
)
|
||||
assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
|
||||
ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
|
||||
assert (
|
||||
len(
|
||||
|
||||
@@ -384,7 +384,7 @@ def test_download_remote_layers_api(
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*download failed: downloading evicted layer file failed.*",
|
||||
f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed",
|
||||
f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@@ -106,6 +106,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
|
||||
# Initial tenant load should reflect the delay we injected
|
||||
("initial_tenant_load", lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p),
|
||||
# Subsequent steps should occur in expected order
|
||||
("initial_logical_sizes", lambda t, p: t > 0 and t >= p),
|
||||
("background_jobs_can_start", lambda t, p: t > 0 and t >= p),
|
||||
("complete", lambda t, p: t > 0 and t >= p),
|
||||
]
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
import random
|
||||
import time
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
|
||||
def test_physical_replication(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
n_records = 100000
|
||||
with env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
endpoint_id="primary",
|
||||
) as primary:
|
||||
with primary.connect() as p_con:
|
||||
with p_con.cursor() as p_cur:
|
||||
p_cur.execute(
|
||||
"CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))"
|
||||
)
|
||||
time.sleep(1)
|
||||
with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
|
||||
with primary.connect() as p_con:
|
||||
with p_con.cursor() as p_cur:
|
||||
with secondary.connect() as s_con:
|
||||
with s_con.cursor() as s_cur:
|
||||
for pk in range(n_records):
|
||||
p_cur.execute("insert into t (pk) values (%s)", (pk,))
|
||||
s_cur.execute(
|
||||
"select * from t where pk=%s", (random.randrange(1, n_records),)
|
||||
)
|
||||
@@ -603,12 +603,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
remote_timeline_path = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id)
|
||||
|
||||
filtered = [
|
||||
path
|
||||
for path in remote_timeline_path.iterdir()
|
||||
if not (path.name.endswith("initdb.tar.zst"))
|
||||
]
|
||||
assert len(filtered) == 0
|
||||
assert not list(remote_timeline_path.iterdir())
|
||||
|
||||
# timeline deletion should kill ongoing uploads, so, the metric will be gone
|
||||
assert get_queued_count(file_kind="index", op_kind="upload") is None
|
||||
|
||||
@@ -285,7 +285,6 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
allowed_postfix="initdb.tar.zst",
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import asyncio
|
||||
import enum
|
||||
import random
|
||||
import time
|
||||
from threading import Thread
|
||||
@@ -52,20 +51,11 @@ def do_gc_target(
|
||||
log.info("gc http thread returning")
|
||||
|
||||
|
||||
class ReattachMode(str, enum.Enum):
|
||||
REATTACH_EXPLICIT = "explicit"
|
||||
REATTACH_RESET = "reset"
|
||||
REATTACH_RESET_DROP = "reset"
|
||||
|
||||
|
||||
# Basic detach and re-attach test
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
|
||||
@pytest.mark.parametrize(
|
||||
"mode",
|
||||
[ReattachMode.REATTACH_EXPLICIT, ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP],
|
||||
)
|
||||
def test_tenant_reattach(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, mode: str
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
):
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
@@ -110,15 +100,8 @@ def test_tenant_reattach(
|
||||
ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
|
||||
)
|
||||
|
||||
if mode == ReattachMode.REATTACH_EXPLICIT:
|
||||
# Explicitly detach then attach the tenant as two separate API calls
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP):
|
||||
# Use the reset API to detach/attach in one shot
|
||||
pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP)
|
||||
else:
|
||||
raise NotImplementedError(mode)
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
pageserver_http.tenant_attach(tenant_id)
|
||||
|
||||
time.sleep(1) # for metrics propagation
|
||||
|
||||
|
||||
@@ -290,12 +290,10 @@ def test_pageserver_with_empty_tenants(
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
".*marking .* as locally complete, while it doesnt exist in remote index.*",
|
||||
".*load failed.*list timelines directory.*",
|
||||
]
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*marking .* as locally complete, while it doesnt exist in remote index.*"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*")
|
||||
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
|
||||
@@ -308,10 +308,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
)
|
||||
|
||||
timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
|
||||
|
||||
# Check local is empty
|
||||
assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0
|
||||
|
||||
assert not timeline_dir.exists()
|
||||
# Check no delete mark present
|
||||
assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()
|
||||
|
||||
|
||||
@@ -146,72 +146,6 @@ def wait_for_pageserver_catchup(endpoint_main: Endpoint, polling_interval=1, tim
|
||||
time.sleep(polling_interval)
|
||||
|
||||
|
||||
def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
client = env.pageserver.http_client()
|
||||
new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
|
||||
|
||||
wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
|
||||
|
||||
endpoint_main = env.endpoints.create(
|
||||
"test_timeline_size_quota_on_startup",
|
||||
# Set small limit for the test
|
||||
config_lines=["neon.max_cluster_size=30MB"],
|
||||
)
|
||||
endpoint_main.start()
|
||||
|
||||
log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch")
|
||||
|
||||
with closing(endpoint_main.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE TABLE foo (t text)")
|
||||
|
||||
# Insert many rows. This query must fail because of space limit
|
||||
try:
|
||||
for _i in range(5000):
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100) g
|
||||
"""
|
||||
)
|
||||
|
||||
# If we get here, the timeline size limit failed
|
||||
log.error("Query unexpectedly succeeded")
|
||||
raise AssertionError()
|
||||
|
||||
except psycopg2.errors.DiskFull as err:
|
||||
log.info(f"Query expectedly failed with: {err}")
|
||||
|
||||
# Restart endpoint that reached the limit to ensure that it doesn't fail on startup
|
||||
# i.e. the size limit is not enforced during startup.
|
||||
endpoint_main.stop()
|
||||
# don't skip pg_catalog updates - it runs CREATE EXTENSION neon
|
||||
# which is needed for neon.pg_cluster_size() to work
|
||||
endpoint_main.respec(skip_pg_catalog_updates=False)
|
||||
endpoint_main.start()
|
||||
|
||||
# ensure that the limit is enforced after startup
|
||||
with closing(endpoint_main.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# This query must fail because of space limit
|
||||
try:
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 100000) g
|
||||
"""
|
||||
)
|
||||
# If we get here, the timeline size limit failed
|
||||
log.error("Query unexpectedly succeeded")
|
||||
raise AssertionError()
|
||||
|
||||
except psycopg2.errors.DiskFull as err:
|
||||
log.info(f"Query expectedly failed with: {err}")
|
||||
|
||||
|
||||
def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
@@ -30,7 +30,6 @@ from fixtures.neon_fixtures import (
|
||||
Safekeeper,
|
||||
SafekeeperHttpClient,
|
||||
SafekeeperPort,
|
||||
last_flush_lsn_upload,
|
||||
)
|
||||
from fixtures.pageserver.utils import (
|
||||
timeline_delete_wait_completed,
|
||||
@@ -287,43 +286,29 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
|
||||
# wait until remote_consistent_lsn gets advanced on all safekeepers
|
||||
clients = [sk.http_client() for sk in env.safekeepers]
|
||||
stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
|
||||
log.info(f"statuses before insert: {stat_before}")
|
||||
log.info(f"statuses is {stat_before}")
|
||||
|
||||
endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'")
|
||||
|
||||
# wait for remote_consistent_lsn to reach flush_lsn, forcing it with checkpoint
|
||||
new_rcl = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
log.info(f"new_rcl: {new_rcl}")
|
||||
endpoint.stop()
|
||||
# force checkpoint in pageserver to advance remote_consistent_lsn
|
||||
wait_lsn_force_checkpoint(tenant_id, timeline_id, endpoint, env.pageserver)
|
||||
|
||||
# and wait till remote_consistent_lsn propagates to all safekeepers
|
||||
#
|
||||
# TODO: this executes long as timeline on safekeeper is immediately
|
||||
# deactivated once rcl reaches pageserver one, and thus we generally wait
|
||||
# till pageserver reconnects to all safekeepers one by one here. Timeline
|
||||
# status on safekeeper should take into account peers state as well.
|
||||
started_at = time.time()
|
||||
while True:
|
||||
stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
|
||||
if all([s_after.remote_consistent_lsn >= new_rcl for s_after in stat_after]):
|
||||
if all(
|
||||
s_after.remote_consistent_lsn > s_before.remote_consistent_lsn
|
||||
for s_after, s_before in zip(stat_after, stat_before)
|
||||
):
|
||||
break
|
||||
elapsed = time.time() - started_at
|
||||
if elapsed > 30:
|
||||
if elapsed > 20:
|
||||
raise RuntimeError(
|
||||
f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
|
||||
)
|
||||
time.sleep(1)
|
||||
|
||||
# Ensure that safekeepers don't lose remote_consistent_lsn on restart.
|
||||
# Control file is persisted each 5s. TODO: do that on shutdown and remove sleep.
|
||||
time.sleep(6)
|
||||
for sk in env.safekeepers:
|
||||
sk.stop()
|
||||
sk.start()
|
||||
stat_after_restart = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
|
||||
log.info(f"statuses after {stat_after_restart}")
|
||||
assert all([s.remote_consistent_lsn >= new_rcl for s in stat_after_restart])
|
||||
|
||||
|
||||
# Test that old WAL consumed by peers and pageserver is removed from safekeepers.
|
||||
@pytest.mark.parametrize("auth_enabled", [False, True])
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import sys
|
||||
import tarfile
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
@@ -126,43 +125,3 @@ def test_wal_restore_initdb(
|
||||
)
|
||||
log.info(f"original lsn: {original_lsn}, restored lsn: {restored_lsn}")
|
||||
assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
|
||||
|
||||
|
||||
def test_wal_restore_http(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
):
|
||||
env = neon_env_builder.init_start()
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
endpoint.safe_psql("create table t as select generate_series(1,300000)")
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
ps_client = env.pageserver.http_client()
|
||||
|
||||
# shut down the endpoint and delete the timeline from the pageserver
|
||||
endpoint.stop()
|
||||
|
||||
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
|
||||
|
||||
test_output_dir / "initdb.tar.zst"
|
||||
|
||||
(env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / "initdb.tar.zst")
|
||||
|
||||
ps_client.timeline_delete(tenant_id, timeline_id)
|
||||
time.sleep(2)
|
||||
|
||||
# verify that it is indeed deleted
|
||||
# TODO
|
||||
|
||||
# issue the restoration command
|
||||
ps_client.timeline_create(
|
||||
tenant_id=tenant_id,
|
||||
new_timeline_id=timeline_id,
|
||||
existing_initdb_timeline_id=timeline_id,
|
||||
pg_version=env.pg_version,
|
||||
)
|
||||
|
||||
# the table is back now!
|
||||
restored = env.endpoints.create_start("main")
|
||||
assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
|
||||
|
||||
@@ -13,10 +13,6 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter'
|
||||
- name: sql-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
@@ -50,77 +46,6 @@ files:
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
- filename: sql_exporter.yml
|
||||
content: |
|
||||
# Configuration for sql_exporter
|
||||
# Global defaults.
|
||||
global:
|
||||
# If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
|
||||
scrape_timeout: 10s
|
||||
# Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
|
||||
scrape_timeout_offset: 500ms
|
||||
# Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
|
||||
min_interval: 0s
|
||||
# Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
|
||||
# as will concurrent scrapes.
|
||||
max_connections: 1
|
||||
# Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
|
||||
# always be the same as max_connections.
|
||||
max_idle_connections: 1
|
||||
# Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
|
||||
# If 0, connections are not closed due to a connection's age.
|
||||
max_connection_lifetime: 5m
|
||||
|
||||
# The target to monitor and the collectors to execute on it.
|
||||
target:
|
||||
# Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
# the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable'
|
||||
|
||||
# Collectors (referenced by name) to execute on the target.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collectors: [neon_collector]
|
||||
|
||||
# Collector files specifies a list of globs. One collector definition is read from each matching file.
|
||||
# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
collector_files:
|
||||
- "neon_collector.yml"
|
||||
- filename: neon_collector.yml
|
||||
content: |
|
||||
collector_name: neon_collector
|
||||
metrics:
|
||||
- metric_name: lfc_misses
|
||||
type: gauge
|
||||
help: 'lfc_misses'
|
||||
key_labels:
|
||||
values: [lfc_misses]
|
||||
query: |
|
||||
select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
|
||||
|
||||
- metric_name: lfc_used
|
||||
type: gauge
|
||||
help: 'lfc_used'
|
||||
key_labels:
|
||||
values: [lfc_used]
|
||||
query: |
|
||||
select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
|
||||
|
||||
- metric_name: lfc_hits
|
||||
type: gauge
|
||||
help: 'lfc_hits'
|
||||
key_labels:
|
||||
values: [lfc_hits]
|
||||
query: |
|
||||
select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
|
||||
|
||||
- metric_name: lfc_writes
|
||||
type: gauge
|
||||
help: 'lfc_writes'
|
||||
key_labels:
|
||||
values: [lfc_writes]
|
||||
query: |
|
||||
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
|
||||
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
@@ -157,8 +82,6 @@ build: |
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter
|
||||
|
||||
FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
|
||||
|
||||
# Build pgbouncer
|
||||
#
|
||||
FROM debian:bullseye-slim AS pgbouncer
|
||||
@@ -193,19 +116,13 @@ merge: |
|
||||
|
||||
COPY cgconfig.conf /etc/cgconfig.conf
|
||||
COPY pgbouncer.ini /etc/pgbouncer.ini
|
||||
COPY sql_exporter.yml /etc/sql_exporter.yml
|
||||
COPY neon_collector.yml /etc/neon_collector.yml
|
||||
|
||||
RUN set -e \
|
||||
&& chown postgres:postgres /etc/pgbouncer.ini \
|
||||
&& chmod 0644 /etc/pgbouncer.ini \
|
||||
&& chmod 0644 /etc/cgconfig.conf \
|
||||
&& chmod 0644 /etc/sql_exporter.yml \
|
||||
&& chmod 0644 /etc/neon_collector.yml
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
|
||||
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
|
||||
COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
|
||||
|
||||
Reference in New Issue
Block a user