Compare commits


3 Commits

Author SHA1 Message Date
Alexander Bayandin 527727d0f5 Merge branch 'main' into vk/pg_embedding_v16_support 2023-09-12 20:09:12 +01:00
Alexander Bayandin d770e88886 Update checksum from pg_embedding source code 2023-09-12 20:08:20 +01:00
Vadim Kharitonov a34bdc6eee pg_embedding with Pg16 support 2023-09-11 13:02:52 +02:00
57 changed files with 1286 additions and 2910 deletions

View File

@@ -16,29 +16,21 @@ on:
# Actual magic happens here:
- labeled
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
BRANCH: "ci-run/pr-${{ github.event.pull_request.number }}"
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}
permissions: write-all
defaults:
run:
shell: bash -euo pipefail {0}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
jobs:
remove-label:
# Remove `approved-for-ci-run` label if the workflow is triggered by changes in a PR.
# The PR should be reviewed and labelled manually again.
permissions:
pull-requests: write # For `gh pr edit`
if: |
contains(fromJSON('["opened", "synchronize", "reopened", "closed"]'), github.event.action) &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
@@ -51,10 +43,6 @@ jobs:
create-or-update-pr-for-ci-run:
# Create local PR for an `approved-for-ci-run` labelled PR to run CI pipeline in it.
permissions:
pull-requests: write # for `gh pr edit`
# For `git push` and `gh pr create` we use CI_ACCESS_TOKEN
if: |
github.event.action == 'labeled' &&
contains(github.event.pull_request.labels.*.name, 'approved-for-ci-run')
@@ -87,7 +75,7 @@ jobs:
Feel free to review/comment/discuss the original PR #${PR_NUMBER}.
EOF
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --base main --json number --jq '.[].number')"
ALREADY_CREATED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --base main --json number --jq '.[].number')"
if [ -z "${ALREADY_CREATED}" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" create --title "CI run for PR #${PR_NUMBER}" \
--body-file "body.md" \
@@ -99,10 +87,6 @@ jobs:
cleanup:
# Close PRs and delete branches if the original PR is closed.
permissions:
contents: write # for `--delete-branch` flag in `gh pr close`
pull-requests: write # for `gh pr close`
if: |
github.event.action == 'closed' &&
github.event.pull_request.head.repo.full_name != github.repository
@@ -110,9 +94,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Close PR and delete `ci-run/pr-${{ env.PR_NUMBER }}` branch
run: |
CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${BRANCH} --json 'closed' --jq '.[].closed')"
- run: |
CLOSED="$(gh pr --repo ${GITHUB_REPOSITORY} list --head ${HEAD} --json 'closed' --jq '.[].closed')"
if [ "${CLOSED}" == "false" ]; then
gh pr --repo "${GITHUB_REPOSITORY}" close "${BRANCH}" --delete-branch
fi

View File

@@ -834,7 +834,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.17.10
VM_BUILDER_VERSION: v0.17.5
steps:
- name: Checkout

View File

@@ -124,21 +124,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y ninja-build python3-dev libncurses5 binutils clang
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PLV8_VERSION=3.1.5 \
export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \
;; \
"v16") \
export PLV8_VERSION=3.1.8 \
export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \
;; \
*) \
echo "Export the valid PG_VERSION variable" && exit 1 \
;; \
esac && \
wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \
echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.8.tar.gz -O plv8.tar.gz && \
echo "92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 plv8.tar.gz" | sha256sum --check && \
mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -429,7 +416,7 @@ RUN case "${PG_VERSION}" in \
;; \
"v16") \
export PG_HINT_PLAN_VERSION=16_1_6_0 \
export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \
export PG_HINT_PLAN_CHECKSUM=ce6a8040c78012000f5da7240caf6a971401412f41d33f930f09291e6c304b99 \
;; \
*) \
echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \
@@ -579,8 +566,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN case "${PG_VERSION}" in \
"v14" | "v15") \
export PG_EMBEDDING_VERSION=0.3.5 \
export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
export PG_EMBEDDING_VERSION=0.3.6 \
export PG_EMBEDDING_CHECKSUM=b2e2b359335d26987778c7fae0c9bcc8ebc3530fc214113be1ddbc8a136e52ac \
;; \
*) \
echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \

View File

@@ -630,27 +630,25 @@ impl ComputeNode {
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
#[instrument(skip_all)]
pub fn start_postgres(&self, pspec: &ParsedSpec) -> Result<std::process::Child> {
pub fn start_postgres(
&self,
storage_auth_token: Option<String>,
) -> Result<std::process::Child> {
let pgdata_path = Path::new(&self.pgdata);
let spec = &pspec.spec;
// Run postgres as a child process.
let mut pg_command = maybe_cgexec(&self.pgbin);
pg_command.args(["-D", &self.pgdata]);
let mut pg = maybe_cgexec(&self.pgbin)
.args(["-D", &self.pgdata])
.envs(if let Some(storage_auth_token) = &storage_auth_token {
vec![("NEON_AUTH_TOKEN", storage_auth_token)]
} else {
vec![]
})
.spawn()
.expect("cannot start postgres process");
if let Some(storage_auth_token) = &spec.storage_auth_token {
pg_command.env("NEON_AUTH_TOKEN", storage_auth_token);
}
if let Some(env_vars) = &spec.env_vars {
pg_command.envs(
env_vars
.iter()
.map(|(k, v)| (k, v.clone().unwrap_or(String::new()))),
);
}
let mut pg = pg_command.spawn().expect("cannot start postgres process");
wait_for_postgres(&mut pg, pgdata_path)?;
Ok(pg)
}
@@ -799,7 +797,7 @@ impl ComputeNode {
self.prepare_pgdata(&compute_state, extension_server_port)?;
let start_time = Utc::now();
let pg = self.start_postgres(pspec)?;
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
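For context on the start_postgres hunks above: a minimal, hypothetical sketch of how a compute spec's optional environment variables (the env_vars: Option<HashMap<String, Option<String>>> field that appears later in this compare) could be forwarded to the spawned postgres process with std::process::Command. The helper name spawn_with_env and its parameters are illustrative only, not part of the diff.

use std::collections::HashMap;
use std::process::{Child, Command};

// Hypothetical helper: forward the spec's optional env vars (and the storage auth
// token) to the child postgres process. `None` values become empty strings, as in
// the hunk above.
fn spawn_with_env(
    pgbin: &str,
    pgdata: &str,
    storage_auth_token: Option<&str>,
    env_vars: Option<&HashMap<String, Option<String>>>,
) -> std::io::Result<Child> {
    let mut cmd = Command::new(pgbin);
    cmd.args(["-D", pgdata]);
    if let Some(token) = storage_auth_token {
        cmd.env("NEON_AUTH_TOKEN", token);
    }
    if let Some(vars) = env_vars {
        cmd.envs(vars.iter().map(|(k, v)| (k, v.clone().unwrap_or_default())));
    }
    cmd.spawn()
}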

View File

@@ -500,7 +500,6 @@ impl Endpoint {
safekeeper_connstrings,
storage_auth_token: auth_token.clone(),
remote_extensions: None,
env_vars: None,
};
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;

View File

@@ -58,6 +58,7 @@ pub struct ComputeSpec {
pub pageserver_connstring: Option<String>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
#[serde(default)]
pub mode: ComputeMode,
@@ -67,9 +68,6 @@ pub struct ComputeSpec {
// information about available remote extensions
pub remote_extensions: Option<RemoteExtSpec>,
// Environment variables to set on the compute
pub env_vars: Option<HashMap<String, Option<String>>>,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize)]

View File

@@ -243,9 +243,5 @@
"public_extensions": [
"postgis"
]
},
"env_vars": {
"OPENAI_API_KEY" : "i hope heikki has a good day today",
"MY_NULL_ENV_VAR" : null
}
}
}
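A small, hedged sketch (assuming serde with the derive feature and serde_json are available) of how the env_vars shape in the spec JSON above maps onto Option<HashMap<String, Option<String>>>: a JSON null deserializes to None for that variable. The struct name SpecEnvOnly is made up for illustration.

use std::collections::HashMap;
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct SpecEnvOnly {
    // mirrors the `env_vars` field of ComputeSpec shown above
    env_vars: Option<HashMap<String, Option<String>>>,
}

fn main() {
    let json = r#"{ "env_vars": { "SOME_KEY": "some value", "MY_NULL_ENV_VAR": null } }"#;
    let spec: SpecEnvOnly = serde_json::from_str(json).unwrap();
    let vars = spec.env_vars.unwrap();
    assert_eq!(vars["SOME_KEY"].as_deref(), Some("some value"));
    assert_eq!(vars["MY_NULL_ENV_VAR"], None);
}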

View File

@@ -3,9 +3,9 @@
//!
use chrono::{DateTime, Utc};
use rand::Rng;
use serde::{Deserialize, Serialize};
use serde::Serialize;
#[derive(Serialize, serde::Deserialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
#[serde(tag = "type")]
pub enum EventType {
#[serde(rename = "absolute")]
@@ -27,8 +27,7 @@ impl EventType {
}
pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> {
// these can most likely be thought of as Range or RangeFull, at least pageserver creates
// incremental ranges where the stop and next start are equal.
// these can most likely be thought of as Range or RangeFull
use EventType::*;
match self {
Incremental {
@@ -42,25 +41,15 @@ impl EventType {
pub fn is_incremental(&self) -> bool {
matches!(self, EventType::Incremental { .. })
}
/// Returns the absolute time, or for incremental ranges, the stop time.
pub fn recorded_at(&self) -> &DateTime<Utc> {
use EventType::*;
match self {
Absolute { time } => time,
Incremental { stop_time, .. } => stop_time,
}
}
}
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra, Metric> {
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
pub struct Event<Extra> {
#[serde(flatten)]
#[serde(rename = "type")]
pub kind: EventType,
pub metric: Metric,
pub metric: &'static str,
pub idempotency_key: String,
pub value: u64,
@@ -69,38 +58,12 @@ pub struct Event<Extra, Metric> {
}
pub fn idempotency_key(node_id: &str) -> String {
IdempotencyKey::generate(node_id).to_string()
}
/// Downstream users will use these to detect upload retries.
pub struct IdempotencyKey<'a> {
now: chrono::DateTime<Utc>,
node_id: &'a str,
nonce: u16,
}
impl std::fmt::Display for IdempotencyKey<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}-{}-{:04}", self.now, self.node_id, self.nonce)
}
}
impl<'a> IdempotencyKey<'a> {
pub fn generate(node_id: &'a str) -> Self {
IdempotencyKey {
now: Utc::now(),
node_id,
nonce: rand::thread_rng().gen_range(0..=9999),
}
}
pub fn for_tests(now: DateTime<Utc>, node_id: &'a str, nonce: u16) -> Self {
IdempotencyKey {
now,
node_id,
nonce,
}
}
format!(
"{}-{}-{:04}",
Utc::now(),
node_id,
rand::thread_rng().gen_range(0..=9999)
)
}
pub const CHUNK_SIZE: usize = 1000;
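A hedged usage sketch of the IdempotencyKey added above: with fixed inputs via for_tests, the Display impl yields "<timestamp>-<node id>-<zero-padded nonce>", which is the string downstream consumers compare to detect upload retries. Assumes chrono and the IdempotencyKey type are in scope; the function name is illustrative.

use chrono::{TimeZone, Utc};

fn idempotency_key_format_example() {
    let now = Utc.with_ymd_and_hms(2023, 9, 12, 20, 0, 0).unwrap();
    let key = IdempotencyKey::for_tests(now, "pageserver-1", 42);
    // "{now}-{node_id}-{nonce:04}" per the Display impl above
    assert_eq!(key.to_string(), format!("{now}-pageserver-1-0042"));
}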

View File

@@ -137,12 +137,9 @@ pub const XLOG_HEAP_INSERT: u8 = 0x00;
pub const XLOG_HEAP_DELETE: u8 = 0x10;
pub const XLOG_HEAP_UPDATE: u8 = 0x20;
pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
pub const XLOG_HEAP_LOCK: u8 = 0x60;
pub const XLOG_HEAP_INIT_PAGE: u8 = 0x80;
pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
pub const XLOG_HEAP2_LOCK_UPDATED: u8 = 0x60;
pub const XLH_LOCK_ALL_FROZEN_CLEARED: u8 = 0x01;
pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;

View File

@@ -315,8 +315,12 @@ impl CgroupWatcher {
where
E: Stream<Item = Sequenced<u64>>,
{
// There are several actions we might take when receiving a `memory.high` event,
// such as freezing the cgroup, or increasing its `memory.high`. We don't
// want to do these things too often (because postgres needs to run, and
// we only have so much memory). These timers serve as rate limits for this.
let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
let mut last_memory_high_increase_at: Option<Instant> = None;
let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO));
let mut events = pin!(events);
// Are we waiting to be upscaled? Could be true if we request upscale due
@@ -328,8 +332,6 @@ impl CgroupWatcher {
upscale = upscales.recv() => {
let Sequenced { seqnum, data } = upscale
.context("failed to listen on upscale notification channel")?;
waiting_on_upscale = false;
last_memory_high_increase_at = None;
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
}
@@ -394,17 +396,12 @@ impl CgroupWatcher {
.send(())
.await
.context("failed to request upscale")?;
waiting_on_upscale = true;
continue;
}
// Shoot, we can't freeze and we're still waiting on upscale,
// increase memory.high to reduce throttling
let can_increase_memory_high = match last_memory_high_increase_at {
None => true,
Some(t) => t.elapsed() > self.config.memory_high_increase_every,
};
if can_increase_memory_high {
if wait_to_increase_memory_high.is_elapsed() {
info!(
"received memory.high event, \
but too soon to refreeze and already requested upscale \
@@ -440,11 +437,12 @@ impl CgroupWatcher {
);
self.set_high_bytes(new_high)
.context("failed to set memory.high")?;
last_memory_high_increase_at = Some(Instant::now());
continue;
wait_to_increase_memory_high
.as_mut()
.reset(Instant::now() + self.config.memory_high_increase_every)
}
info!("received memory.high event, but can't do anything");
// we can't do anything
}
};
}
@@ -561,7 +559,14 @@ impl CgroupWatcher {
/// Setting these values also affects the thresholds for receiving usage alerts.
#[derive(Debug)]
pub struct MemoryLimits {
pub high: u64,
high: u64,
max: u64,
}
impl MemoryLimits {
pub fn new(high: u64, max: u64) -> Self {
Self { max, high }
}
}
// Methods for manipulating the actual cgroup
@@ -638,7 +643,12 @@ impl CgroupWatcher {
/// Set cgroup memory.high and memory.max.
pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
info!(limits.high, path = self.path(), "writing new memory limits",);
info!(
limits.high,
limits.max,
path = self.path(),
"writing new memory limits",
);
self.memory()
.context("failed to get memory subsystem while setting memory limits")?
.set_mem(cgroups_rs::memory::SetMemory {
@@ -647,7 +657,7 @@ impl CgroupWatcher {
high: Some(MaxValue::Value(
u64::min(limits.high, i64::MAX as u64) as i64
)),
max: None,
max: Some(MaxValue::Value(u64::min(limits.max, i64::MAX as u64) as i64)),
})
.context("failed to set memory limits")
}
@@ -655,7 +665,7 @@ impl CgroupWatcher {
/// Given some amount of available memory, set the desired cgroup memory limits
pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
let new_high = self.config.calculate_memory_high_value(available_memory);
let limits = MemoryLimits { high: new_high };
let limits = MemoryLimits::new(new_high, available_memory);
info!(
path = self.path(),
memory = ?limits,
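One side of the hunks above rate-limits memory.high reactions with an Option<Instant> cooldown, the other replaces it with a pinned tokio sleep timer. A minimal standalone sketch of the Option<Instant> variant (names are illustrative) shows the idea in isolation:

use std::time::{Duration, Instant};

// Illustrative cooldown guard: act at most once per `every`, mirroring the
// `last_memory_high_increase_at` / `memory_high_increase_every` check above.
struct Cooldown {
    every: Duration,
    last_acted_at: Option<Instant>,
}

impl Cooldown {
    fn try_act(&mut self) -> bool {
        let allowed = match self.last_acted_at {
            None => true,
            Some(t) => t.elapsed() > self.every,
        };
        if allowed {
            self.last_acted_at = Some(Instant::now());
        }
        allowed
    }
}

The pinned tokio::time::sleep version on the other side of the hunk expresses the same thing as a timer that is reset() after each action and checked with is_elapsed().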

View File

@@ -257,11 +257,12 @@ impl Runner {
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
}
let limits = MemoryLimits {
let limits = MemoryLimits::new(
// new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be 0 here
// since it is properly initialized in the previous cgroup if let block
high: new_cgroup_mem_high,
};
new_cgroup_mem_high,
available_memory,
);
cgroup
.set_limits(&limits)
.context("failed to set cgroup memory limits")?;
@@ -327,9 +328,7 @@ impl Runner {
name = cgroup.path(),
"updating cgroup memory.high",
);
let limits = MemoryLimits {
high: new_cgroup_mem_high,
};
let limits = MemoryLimits::new(new_cgroup_mem_high, available_memory);
cgroup
.set_limits(&limits)
.context("failed to set file cache size")?;

View File

@@ -80,11 +80,11 @@ enum-map.workspace = true
enumset.workspace = true
strum.workspace = true
strum_macros.workspace = true
tempfile.workspace = true
[dev-dependencies]
criterion.workspace = true
hex-literal.workspace = true
tempfile.workspace = true
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
[[bench]]

View File

@@ -518,9 +518,6 @@ fn start_pageserver(
// creates a child context with the right DownloadBehavior.
DownloadBehavior::Error,
);
let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
task_mgr::spawn(
crate::BACKGROUND_RUNTIME.handle(),
TaskKind::MetricsCollection,
@@ -547,7 +544,6 @@ fn start_pageserver(
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))

View File

@@ -64,7 +64,7 @@ pub mod defaults {
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

View File

@@ -1,54 +1,188 @@
//!
//! Periodically collect consumption metrics for all active tenants
//! and push them to a HTTP endpoint.
//! Cache metrics to send only the updated ones.
//!
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::{mgr, LogicalSizeCalculationCause};
use consumption_metrics::EventType;
use anyhow;
use chrono::{DateTime, Utc};
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
use pageserver_api::models::TenantState;
use reqwest::Url;
use serde::Serialize;
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tracing::*;
use utils::id::NodeId;
mod metrics;
use metrics::MetricsKey;
mod disk_cache;
mod upload;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
/// Basically a key-value pair, but usually in a Vec except for [`Cache`].
///
/// This is as opposed to `consumption_metrics::Event` which is the externally communicated form.
/// Difference is basically the missing idempotency key, which lives only for the duration of
/// upload attempts.
type RawMetric = (MetricsKey, (EventType, u64));
#[serde_as]
#[derive(Serialize, Debug, Clone, Copy)]
struct Ids {
#[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId,
#[serde_as(as = "Option<DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
timeline_id: Option<TimelineId>,
}
/// Caches the [`RawMetric`]s
///
/// In practice, during startup the last sent values are stored here to be used in calculating new
/// ones. After a successful upload the sent values are written back to this cache. This used to be
/// used for deduplication, but that is no longer needed.
type Cache = HashMap<MetricsKey, (EventType, u64)>;
/// Key that uniquely identifies the object this metric describes.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MetricsKey {
tenant_id: TenantId,
timeline_id: Option<TimelineId>,
metric: &'static str,
}
impl MetricsKey {
const fn absolute_values(self) -> AbsoluteValueFactory {
AbsoluteValueFactory(self)
}
const fn incremental_values(self) -> IncrementalValueFactory {
IncrementalValueFactory(self)
}
}
/// Helper type which each individual metric kind can return to produce only absolute values.
struct AbsoluteValueFactory(MetricsKey);
impl AbsoluteValueFactory {
fn at(self, time: DateTime<Utc>, val: u64) -> (MetricsKey, (EventType, u64)) {
let key = self.0;
(key, (EventType::Absolute { time }, val))
}
}
/// Helper type which each individual metric kind can return to produce only incremental values.
struct IncrementalValueFactory(MetricsKey);
impl IncrementalValueFactory {
#[allow(clippy::wrong_self_convention)]
fn from_previous_up_to(
self,
prev_end: DateTime<Utc>,
up_to: DateTime<Utc>,
val: u64,
) -> (MetricsKey, (EventType, u64)) {
let key = self.0;
// cannot assert prev_end < up_to because these are realtime clock based
(
key,
(
EventType::Incremental {
start_time: prev_end,
stop_time: up_to,
},
val,
),
)
}
fn key(&self) -> &MetricsKey {
&self.0
}
}
// the static part of a MetricsKey
impl MetricsKey {
/// Absolute value of [`Timeline::get_last_record_lsn`].
///
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "written_size",
}
.absolute_values()
}
/// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
/// previously sent, starting from the previously sent incremental time range ending at the
/// latest absolute measurement.
const fn written_size_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> IncrementalValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
// the name here is correctly about data, not size, because that is what the
// downstream pipeline expects
metric: "written_data_bytes_delta",
}
.incremental_values()
}
/// Exact [`Timeline::get_current_logical_size`].
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
const fn timeline_logical_size(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: "timeline_logical_size",
}
.absolute_values()
}
/// [`Tenant::remote_size`]
///
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: "remote_storage_size",
}
.absolute_values()
}
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
///
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: "resident_size",
}
.absolute_values()
}
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
///
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: "synthetic_storage_size",
}
.absolute_values()
}
}
/// Main thread that serves metrics collection
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_interval: Duration,
_cached_metric_collection_interval: Duration,
cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: PathBuf,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
tracing::warn!(
"cached_metric_collection_interval is no longer used, please set it to zero."
)
}
let mut ticker = tokio::time::interval(metric_collection_interval);
info!("starting collect_metrics");
// spin up background worker that calculates tenant sizes
let worker_ctx =
@@ -68,216 +202,543 @@ pub async fn collect_metrics(
},
);
let path: Arc<PathBuf> = Arc::new(local_disk_storage);
let cancel = task_mgr::shutdown_token();
let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);
let mut cached_metrics = tokio::select! {
_ = cancel.cancelled() => return Ok(()),
ret = restore_and_reschedule => ret,
};
// define client here to reuse it for all requests
let client = reqwest::ClientBuilder::new()
.timeout(DEFAULT_HTTP_REPORTING_TIMEOUT)
.build()
.expect("Failed to create http client with timeout");
let node_id = node_id.to_string();
// reminder: ticker is ready immediately
let mut ticker = tokio::time::interval(metric_collection_interval);
let mut cached_metrics = HashMap::new();
let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
loop {
let tick_at = tokio::select! {
_ = cancel.cancelled() => return Ok(()),
tick_at = ticker.tick() => tick_at,
};
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
info!("collect_metrics received cancellation request");
return Ok(());
},
tick_at = ticker.tick() => {
// these are point in time, with variable "now"
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
// send cached metrics every cached_metric_collection_interval
let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
if metrics.is_empty() {
continue;
if send_cached {
prev_iteration_time = std::time::Instant::now();
}
collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
metric_collection_interval,
"consumption_metrics_collect_metrics",
);
}
}
let metrics = Arc::new(metrics);
// why not race cancellation here? because we are one of the last tasks, and if we are
// already here, better to try to flush the new values.
let flush = async {
match disk_cache::flush_metrics_to_disk(&metrics, &path).await {
Ok(()) => {
tracing::debug!("flushed metrics to disk");
}
Err(e) => {
// idea here is that if someone creates a directory as our path, then they
// might notice it from the logs before shutdown and remove it
tracing::error!("failed to persist metrics to {path:?}: {e:#}");
}
}
};
let upload = async {
let res = upload::upload_metrics(
&client,
metric_collection_endpoint,
&cancel,
&node_id,
&metrics,
&mut cached_metrics,
)
.await;
if let Err(e) = res {
// serialization error which should never happen
tracing::error!("failed to upload due to {e:#}");
}
};
// let these run concurrently
let (_, _) = tokio::join!(flush, upload);
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
metric_collection_interval,
"consumption_metrics_collect_metrics",
);
}
}
/// Called on the first iteration in an attempt to join the metric uploading schedule from the
/// previous pageserver session. The pageserver is supposed to upload at intervals regardless of restarts.
/// One iteration of metrics collection
///
/// Cancellation safe.
async fn restore_and_reschedule(
path: &Arc<PathBuf>,
metric_collection_interval: Duration,
) -> Cache {
let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {
Ok(found_some) => {
// there is no min needed because we write these sequentially in
// collect_all_metrics
let earlier_metric_at = found_some
.iter()
.map(|(_, (et, _))| et.recorded_at())
.copied()
.next();
/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
/// Cache metrics to avoid sending the same metrics multiple times.
///
/// This function handles all errors internally
/// and doesn't break iteration if just one tenant fails.
///
/// TODO
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
async fn collect_metrics_iteration(
client: &reqwest::Client,
cached_metrics: &mut HashMap<MetricsKey, (EventType, u64)>,
metric_collection_endpoint: &reqwest::Url,
node_id: NodeId,
ctx: &RequestContext,
send_cached: bool,
) {
let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new();
trace!(
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
metric_collection_endpoint
);
let cached = found_some.into_iter().collect::<Cache>();
(cached, earlier_metric_at)
}
Err(e) => {
use std::io::{Error, ErrorKind};
let root = e.root_cause();
let maybe_ioerr = root.downcast_ref::<Error>();
let is_not_found = maybe_ioerr.is_some_and(|e| e.kind() == ErrorKind::NotFound);
if !is_not_found {
tracing::info!("failed to read any previous metrics from {path:?}: {e:#}");
}
(HashMap::new(), None)
// get list of tenants
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(err) => {
error!("failed to list tenants: {:?}", err);
return;
}
};
if let Some(earlier_metric_at) = earlier_metric_at {
let earlier_metric_at: SystemTime = earlier_metric_at.into();
// iterate through list of Active tenants and collect metrics
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
let error = reschedule(earlier_metric_at, metric_collection_interval).await;
if let Some(error) = error {
if error.as_secs() >= 60 {
tracing::info!(
error_ms = error.as_millis(),
"startup scheduling error due to restart"
)
let tenant = match mgr::get_tenant(tenant_id, true).await {
Ok(tenant) => tenant,
Err(err) => {
// It is possible that tenant was deleted between
// `list_tenants` and `get_tenant`, so just warn about it.
warn!("failed to get tenant {tenant_id:?}: {err:?}");
continue;
}
};
let mut tenant_resident_size = 0;
// iterate through list of timelines in tenant
for timeline in tenant.list_timelines() {
// collect per-timeline metrics only for active timelines
let timeline_id = timeline.timeline_id;
match TimelineSnapshot::collect(&timeline, ctx) {
Ok(Some(snap)) => {
snap.to_metrics(
tenant_id,
timeline_id,
Utc::now(),
&mut current_metrics,
cached_metrics,
);
}
Ok(None) => {}
Err(e) => {
error!(
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
timeline.timeline_id
);
continue;
}
}
tenant_resident_size += timeline.resident_physical_size();
}
current_metrics
.push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size()));
current_metrics
.push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size));
// Note that this metric is calculated in a separate bgworker
// Here we only use cached value, which may lag behind the real latest one
let synthetic_size = tenant.cached_synthetic_size();
if synthetic_size != 0 {
// only send non-zeroes because otherwise these show up as errors in logs
current_metrics
.push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size));
}
}
cached
// Filter metrics, unless we want to send all metrics, including cached ones.
// See: https://github.com/neondatabase/neon/issues/3485
if !send_cached {
current_metrics.retain(|(curr_key, (kind, curr_val))| {
if kind.is_incremental() {
// incremental values (currently only written_size_delta) should not get any cache
// deduplication because they will be used by upstream for "is still alive."
true
} else {
match cached_metrics.get(curr_key) {
Some((_, val)) => val != curr_val,
None => true,
}
}
});
}
if current_metrics.is_empty() {
trace!("no new metrics to send");
return;
}
// Send metrics.
// Split into chunks of 1000 metrics to avoid exceeding the max request size
let chunks = current_metrics.chunks(CHUNK_SIZE);
let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);
let node_id = node_id.to_string();
for chunk in chunks {
chunk_to_send.clear();
// enrich metrics with type, timestamp and idempotency key before sending
chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event {
kind: *when,
metric: curr_key.metric,
idempotency_key: idempotency_key(&node_id),
value: *curr_val,
extra: Ids {
tenant_id: curr_key.tenant_id,
timeline_id: curr_key.timeline_id,
},
}));
const MAX_RETRIES: u32 = 3;
for attempt in 0..MAX_RETRIES {
let res = client
.post(metric_collection_endpoint.clone())
.json(&EventChunk {
events: (&chunk_to_send).into(),
})
.send()
.await;
match res {
Ok(res) => {
if res.status().is_success() {
// update cached metrics after they were sent successfully
for (curr_key, curr_val) in chunk.iter() {
cached_metrics.insert(curr_key.clone(), *curr_val);
}
} else {
error!("metrics endpoint refused the sent metrics: {:?}", res);
for metric in chunk_to_send
.iter()
.filter(|metric| metric.value > (1u64 << 40))
{
// Report if the metric value is suspiciously large
error!("potentially abnormal metric value: {:?}", metric);
}
}
break;
}
Err(err) if err.is_timeout() => {
error!(attempt, "timeout sending metrics, retrying immediately");
continue;
}
Err(err) => {
error!(attempt, ?err, "failed to send metrics");
break;
}
}
}
}
}
async fn reschedule(
earlier_metric_at: SystemTime,
metric_collection_interval: Duration,
) -> Option<Duration> {
let now = SystemTime::now();
match now.duration_since(earlier_metric_at) {
Ok(from_last_send) if from_last_send < metric_collection_interval => {
let sleep_for = metric_collection_interval - from_last_send;
/// Internal type to make timeline metric production testable.
///
/// As this value type contains all of the information needed from a timeline to produce the
/// metrics, it can easily be created with different values in test.
struct TimelineSnapshot {
loaded_at: (Lsn, SystemTime),
last_record_lsn: Lsn,
current_exact_logical_size: Option<u64>,
}
let deadline = std::time::Instant::now() + sleep_for;
impl TimelineSnapshot {
/// Collect the metrics from an actual timeline.
///
/// Fails currently only when [`Timeline::get_current_logical_size`] fails.
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
fn collect(
t: &Arc<crate::tenant::Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<Option<Self>> {
use anyhow::Context;
tokio::time::sleep_until(deadline.into()).await;
if !t.is_active() {
// no collection for broken or stopping needed, we will still keep the cached values
// though at the caller.
Ok(None)
} else {
let loaded_at = t.loaded_at;
let last_record_lsn = t.get_last_record_lsn();
let now = std::time::Instant::now();
let current_exact_logical_size = {
let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
let res = span
.in_scope(|| t.get_current_logical_size(ctx))
.context("get_current_logical_size");
match res? {
// Only send timeline logical size when it is fully calculated.
(size, is_exact) if is_exact => Some(size),
(_, _) => None,
}
};
// executor threads might be busy, add extra measurements
Some(if now < deadline {
deadline - now
} else {
now - deadline
})
Ok(Some(TimelineSnapshot {
loaded_at,
last_record_lsn,
current_exact_logical_size,
}))
}
Ok(from_last_send) => Some(from_last_send.saturating_sub(metric_collection_interval)),
Err(_) => {
tracing::warn!(
?now,
?earlier_metric_at,
"oldest recorded metric is in future; first values will come out with inconsistent timestamps"
);
earlier_metric_at.duration_since(now).ok()
}
/// Produce the timeline consumption metrics into the `metrics` argument.
fn to_metrics(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
metrics: &mut Vec<(MetricsKey, (EventType, u64))>,
cache: &HashMap<MetricsKey, (EventType, u64)>,
) {
let timeline_written_size = u64::from(self.last_record_lsn);
let (key, written_size_now) =
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
// last_record_lsn can only go up, right now at least, TODO: #2592 or related
// features might change this.
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
// use this when available, because in a stream of incremental values, it will be
// accurate, whereas when last_record_lsn stops moving, we will only cache the last
// one of those.
let last_stop_time = cache
.get(written_size_delta_key.key())
.map(|(until, _val)| {
until
.incremental_timerange()
.expect("never create EventType::Absolute for written_size_delta")
.end
});
// by default, use the last sent written_size as the basis for
// calculating the delta. if we don't yet have one, use the load time value.
let prev = cache
.get(&key)
.map(|(prev_at, prev)| {
// use the prev time from our last incremental update, or default to latest
// absolute update on the first round.
let prev_at = prev_at
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let prev_at = last_stop_time.unwrap_or(prev_at);
(*prev_at, *prev)
})
.unwrap_or_else(|| {
// if we don't have a previous point of comparison, compare to the load time
// lsn.
let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
});
// written_size_bytes_delta
metrics.extend(
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
let up_to = written_size_now
.0
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta);
Some(key_value)
} else {
None
},
);
// written_size
metrics.push((key, written_size_now));
if let Some(size) = self.current_exact_logical_size {
metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size));
}
}
}
/// Calculate synthetic size for each active tenant
async fn calculate_synthetic_size_worker(
pub async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
// reminder: ticker is ready immediately
let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
loop {
let tick_at = tokio::select! {
_ = task_mgr::shutdown_watcher() => return Ok(()),
tick_at = ticker.tick() => tick_at,
};
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
return Ok(());
},
tick_at = ticker.tick() => {
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(e) => {
warn!("cannot get tenant list: {e:#}");
continue;
}
};
let tenants = match mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(e) => {
warn!("cannot get tenant list: {e:#}");
continue;
}
};
// iterate through list of Active tenants and collect metrics
for (tenant_id, tenant_state) in tenants {
for (tenant_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}
if tenant_state != TenantState::Active {
continue;
}
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
{
if let Err(e) = tenant.calculate_synthetic_size(
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
ctx).await {
error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
}
}
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
}
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
synthetic_size_calculation_interval,
"consumption_metrics_synthetic_size_worker",
);
}
}
crate::tenant::tasks::warn_when_period_overrun(
tick_at.elapsed(),
synthetic_size_calculation_interval,
"consumption_metrics_synthetic_size_worker",
);
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::time::SystemTime;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::consumption_metrics::MetricsKey;
use super::TimelineSnapshot;
use chrono::{DateTime, Utc};
#[test]
fn startup_collected_timeline_metrics_before_advancing() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let mut metrics = Vec::new();
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
let now = DateTime::<Utc>::from(SystemTime::now());
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
snap.loaded_at.1.into(),
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn startup_collected_timeline_metrics_second_round() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, before, init] = time_backwards();
let now = DateTime::<Utc>::from(now);
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let mut metrics = Vec::new();
let cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
]);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id)
.from_previous_up_to(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, just_before, before, init] = time_backwards();
let now = DateTime::<Utc>::from(now);
let just_before = DateTime::<Utc>::from(just_before);
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let mut metrics = Vec::new();
let cache = HashMap::from([
// at t=before was the last time the last_record_lsn changed
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
// end time of this event is used for the next ones
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
before,
just_before,
0,
),
]);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to(
just_before,
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
times[0] = std::time::SystemTime::now();
for behind in 1..N {
times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
}
times
}
}
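A small arithmetic sketch of the written_size / written_data_bytes_delta relationship exercised by the tests above: the delta event spans from the previous point (the cached absolute value or, on the first round, loaded_at) up to now, and its value is the difference of the absolute written sizes; checked_sub guards against a cached value that is ahead of the current LSN. The function name is illustrative.

// value of the delta event between two absolute `written_size` samples
fn written_size_delta(prev_written: u64, written_now: u64) -> Option<u64> {
    // `None` means the cached value was ahead of us (e.g. a restart lost an
    // in-memory layer), in which case no positive delta can be reported here.
    written_now.checked_sub(prev_written)
}

fn main() {
    assert_eq!(written_size_delta(0x20000, 0x23000), Some(0x3000));
    assert_eq!(written_size_delta(0x23000, 0x20000), None);
}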

View File

@@ -1,117 +0,0 @@
use anyhow::Context;
use std::path::PathBuf;
use std::sync::Arc;
use super::RawMetric;
pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result<Vec<RawMetric>> {
// do not add context to each error, callsite will log with full path
let span = tracing::Span::current();
tokio::task::spawn_blocking(move || {
let _e = span.entered();
if let Some(parent) = path.parent() {
if let Err(e) = scan_and_delete_with_same_prefix(&path) {
tracing::info!("failed to cleanup temporary files in {parent:?}: {e:#}");
}
}
let mut file = std::fs::File::open(&*path)?;
let reader = std::io::BufReader::new(&mut file);
anyhow::Ok(serde_json::from_reader::<_, Vec<RawMetric>>(reader)?)
})
.await
.context("read metrics join error")
.and_then(|x| x)
}
fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
let it = std::fs::read_dir(path.parent().expect("caller checked"))?;
let prefix = path.file_name().expect("caller checked").to_string_lossy();
for entry in it {
let entry = entry?;
if !entry.metadata()?.is_file() {
continue;
}
let file_name = entry.file_name();
if path.file_name().unwrap() == file_name {
// do not remove our actual file
continue;
}
let file_name = file_name.to_string_lossy();
if !file_name.starts_with(&*prefix) {
continue;
}
let path = entry.path();
if let Err(e) = std::fs::remove_file(&path) {
tracing::warn!("cleaning up old tempfile {file_name:?} failed: {e:#}");
} else {
tracing::info!("cleaned up old tempfile {file_name:?}");
}
}
Ok(())
}
pub(super) async fn flush_metrics_to_disk(
current_metrics: &Arc<Vec<RawMetric>>,
path: &Arc<PathBuf>,
) -> anyhow::Result<()> {
use std::io::Write;
anyhow::ensure!(path.parent().is_some(), "path must have parent: {path:?}");
anyhow::ensure!(
path.file_name().is_some(),
"path must have filename: {path:?}"
);
let span = tracing::Span::current();
tokio::task::spawn_blocking({
let current_metrics = current_metrics.clone();
let path = path.clone();
move || {
let _e = span.entered();
let parent = path.parent().expect("existence checked");
let file_name = path.file_name().expect("existence checked");
let mut tempfile = tempfile::Builder::new()
.prefix(file_name)
.suffix(".tmp")
.tempfile_in(parent)?;
tracing::debug!("using tempfile {:?}", tempfile.path());
// write out all of the raw metrics, to be read out later on restart as cached values
{
let mut writer = std::io::BufWriter::new(&mut tempfile);
serde_json::to_writer(&mut writer, &*current_metrics)
.context("serialize metrics")?;
writer
.into_inner()
.map_err(|_| anyhow::anyhow!("flushing metrics failed"))?;
}
tempfile.flush()?;
tempfile.as_file().sync_all()?;
fail::fail_point!("before-persist-last-metrics-collected");
drop(tempfile.persist(&*path).map_err(|e| e.error)?);
let f = std::fs::File::open(path.parent().unwrap())?;
f.sync_all()?;
anyhow::Ok(())
}
})
.await
.with_context(|| format!("write metrics to {path:?} join error"))
.and_then(|x| x.with_context(|| format!("write metrics to {path:?}")))
}
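A hedged, standalone sketch of the atomic-write pattern used by flush_metrics_to_disk above (assumes the tempfile and anyhow crates; the helper name is made up): write a tempfile next to the target, fsync it, rename it over the target, then fsync the directory so the rename itself is durable.

use std::io::Write;
use std::path::Path;

fn write_atomically(path: &Path, bytes: &[u8]) -> anyhow::Result<()> {
    let parent = path.parent().expect("path must have a parent");
    let file_name = path.file_name().expect("path must have a file name");

    // tempfile in the same directory so the final rename stays on one filesystem
    let mut tmp = tempfile::Builder::new()
        .prefix(file_name)
        .suffix(".tmp")
        .tempfile_in(parent)?;
    tmp.write_all(bytes)?;
    tmp.as_file().sync_all()?;

    // rename over the final path; the old contents stay intact on failure
    tmp.persist(path).map_err(|e| e.error)?;

    // make the rename itself durable
    std::fs::File::open(parent)?.sync_all()?;
    Ok(())
}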

View File

@@ -1,455 +0,0 @@
use crate::context::RequestContext;
use anyhow::Context;
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
use serde_with::serde_as;
use std::{sync::Arc, time::SystemTime};
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use super::{Cache, RawMetric};
/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
/// instead of static str.
// Do not rename any of these without first consulting with data team and partner
// management.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(super) enum Name {
/// Timeline last_record_lsn, absolute
#[serde(rename = "written_size")]
WrittenSize,
/// Timeline last_record_lsn, incremental
#[serde(rename = "written_data_bytes_delta")]
WrittenSizeDelta,
/// Timeline logical size
#[serde(rename = "timeline_logical_size")]
LogicalSize,
/// Tenant remote size
#[serde(rename = "remote_storage_size")]
RemoteSize,
/// Tenant resident size
#[serde(rename = "resident_size")]
ResidentSize,
/// Tenant synthetic size
#[serde(rename = "synthetic_storage_size")]
SyntheticSize,
}
/// Key that uniquely identifies the object this metric describes.
///
/// This is a denormalization done at the MetricsKey const methods; these should not be constructed
/// elsewhere.
#[serde_with::serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(crate) struct MetricsKey {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>,
pub(super) metric: Name,
}
impl MetricsKey {
const fn absolute_values(self) -> AbsoluteValueFactory {
AbsoluteValueFactory(self)
}
const fn incremental_values(self) -> IncrementalValueFactory {
IncrementalValueFactory(self)
}
}
/// Helper type which each individual metric kind can return to produce only absolute values.
struct AbsoluteValueFactory(MetricsKey);
impl AbsoluteValueFactory {
const fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
let key = self.0;
(key, (EventType::Absolute { time }, val))
}
fn key(&self) -> &MetricsKey {
&self.0
}
}
/// Helper type which each individual metric kind can return to produce only incremental values.
struct IncrementalValueFactory(MetricsKey);
impl IncrementalValueFactory {
#[allow(clippy::wrong_self_convention)]
const fn from_until(
self,
prev_end: DateTime<Utc>,
up_to: DateTime<Utc>,
val: u64,
) -> RawMetric {
let key = self.0;
// cannot assert prev_end < up_to because these are realtime clock based
let when = EventType::Incremental {
start_time: prev_end,
stop_time: up_to,
};
(key, (when, val))
}
fn key(&self) -> &MetricsKey {
&self.0
}
}
// the static part of a MetricsKey
impl MetricsKey {
/// Absolute value of [`Timeline::get_last_record_lsn`].
///
/// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn
const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::WrittenSize,
}
.absolute_values()
}
/// Values will be the difference of the latest [`MetricsKey::written_size`] to what we
/// previously sent, starting from the previously sent incremental time range ending at the
/// latest absolute measurement.
const fn written_size_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> IncrementalValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::WrittenSizeDelta,
}
.incremental_values()
}
/// Exact [`Timeline::get_current_logical_size`].
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
const fn timeline_logical_size(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: Some(timeline_id),
metric: Name::LogicalSize,
}
.absolute_values()
}
/// [`Tenant::remote_size`]
///
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: Name::RemoteSize,
}
.absolute_values()
}
/// Sum of [`Timeline::resident_physical_size`] for each `Tenant`.
///
/// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size
const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: Name::ResidentSize,
}
.absolute_values()
}
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
///
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
/// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
timeline_id: None,
metric: Name::SyntheticSize,
}
.absolute_values()
}
}
pub(super) async fn collect_all_metrics(
cached_metrics: &Cache,
ctx: &RequestContext,
) -> Vec<RawMetric> {
use pageserver_api::models::TenantState;
let started_at = std::time::Instant::now();
let tenants = match crate::tenant::mgr::list_tenants().await {
Ok(tenants) => tenants,
Err(err) => {
tracing::error!("failed to list tenants: {:?}", err);
return vec![];
}
};
let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
if state != TenantState::Active {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.await
.ok()
.map(|tenant| (id, tenant))
}
});
let res = collect(tenants, cached_metrics, ctx).await;
tracing::info!(
elapsed_ms = started_at.elapsed().as_millis(),
total = res.len(),
"collected metrics"
);
res
}
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<RawMetric>
where
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
{
let mut current_metrics: Vec<RawMetric> = Vec::new();
let mut tenants = std::pin::pin!(tenants);
while let Some((tenant_id, tenant)) = tenants.next().await {
let mut tenant_resident_size = 0;
for timeline in tenant.list_timelines() {
let timeline_id = timeline.timeline_id;
match TimelineSnapshot::collect(&timeline, ctx) {
Ok(Some(snap)) => {
snap.to_metrics(
tenant_id,
timeline_id,
Utc::now(),
&mut current_metrics,
cache,
);
}
Ok(None) => {}
Err(e) => {
tracing::error!(
"failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}",
timeline.timeline_id
);
continue;
}
}
tenant_resident_size += timeline.resident_physical_size();
}
let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
}
current_metrics
}
/// In-between abstraction to allow testing metrics without actual Tenants.
struct TenantSnapshot {
resident_size: u64,
remote_size: u64,
synthetic_size: u64,
}
impl TenantSnapshot {
/// Collect tenant status to have metrics created out of it.
///
/// `resident_size` is calculated from the timelines we had access to for other metrics, so we
/// cannot just list timelines here.
fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
TenantSnapshot {
resident_size,
remote_size: t.remote_size(),
// Note that this metric is calculated in a separate bgworker
// Here we only use cached value, which may lag behind the real latest one
synthetic_size: t.cached_synthetic_size(),
}
}
fn to_metrics(
&self,
tenant_id: TenantId,
now: DateTime<Utc>,
cached: &Cache,
metrics: &mut Vec<RawMetric>,
) {
let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
let resident_size = MetricsKey::resident_size(tenant_id).at(now, self.resident_size);
let synthetic_size = {
let factory = MetricsKey::synthetic_size(tenant_id);
let mut synthetic_size = self.synthetic_size;
if synthetic_size == 0 {
if let Some((_, value)) = cached.get(factory.key()) {
// use the latest value from previous session
synthetic_size = *value;
}
}
if synthetic_size != 0 {
// only send non-zeroes because otherwise these show up as errors in logs
Some(factory.at(now, synthetic_size))
} else {
None
}
};
metrics.extend(
[Some(remote_size), Some(resident_size), synthetic_size]
.into_iter()
.flatten(),
);
}
}
/// Internal type to make timeline metric production testable.
///
/// As this value type contains all of the information needed from a timeline to produce the
/// metrics, it can easily be created with different values in test.
struct TimelineSnapshot {
loaded_at: (Lsn, SystemTime),
last_record_lsn: Lsn,
current_exact_logical_size: Option<u64>,
}
impl TimelineSnapshot {
/// Collect the metrics from an actual timeline.
///
/// Fails currently only when [`Timeline::get_current_logical_size`] fails.
///
/// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size
fn collect(
t: &Arc<crate::tenant::Timeline>,
ctx: &RequestContext,
) -> anyhow::Result<Option<Self>> {
if !t.is_active() {
// no collection for broken or stopping needed, we will still keep the cached values
// though at the caller.
Ok(None)
} else {
let loaded_at = t.loaded_at;
let last_record_lsn = t.get_last_record_lsn();
let current_exact_logical_size = {
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
let res = span
.in_scope(|| t.get_current_logical_size(ctx))
.context("get_current_logical_size");
match res? {
// Only send timeline logical size when it is fully calculated.
(size, is_exact) if is_exact => Some(size),
(_, _) => None,
}
};
Ok(Some(TimelineSnapshot {
loaded_at,
last_record_lsn,
current_exact_logical_size,
}))
}
}
/// Produce the timeline consumption metrics into the `metrics` argument.
fn to_metrics(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
metrics: &mut Vec<RawMetric>,
cache: &Cache,
) {
let timeline_written_size = u64::from(self.last_record_lsn);
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
let last_stop_time = cache
.get(written_size_delta_key.key())
.map(|(until, _val)| {
until
.incremental_timerange()
.expect("never create EventType::Absolute for written_size_delta")
.end
});
let (key, written_size_now) =
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
// by default, use the last sent written_size as the basis for
// calculating the delta. if we don't yet have one, use the load time value.
let prev = cache
.get(&key)
.map(|(prev_at, prev)| {
// use the prev time from our last incremental update, or default to latest
// absolute update on the first round.
let prev_at = prev_at
.absolute_time()
.expect("never create EventType::Incremental for written_size");
let prev_at = last_stop_time.unwrap_or(prev_at);
(*prev_at, *prev)
})
.unwrap_or_else(|| {
// if we don't have a previous point of comparison, compare to the load time
// lsn.
let (disk_consistent_lsn, loaded_at) = &self.loaded_at;
(DateTime::from(*loaded_at), disk_consistent_lsn.0)
});
let up_to = now;
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
// written_size_delta
metrics.push(key_value);
// written_size
metrics.push((key, written_size_now));
} else {
// the cached value was ahead of us, report zero until we've caught up
metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0));
// the cached value was ahead of us, report the same until we've caught up
metrics.push((key, (written_size_now.0, prev.1)));
}
{
let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
let current_or_previous = self
.current_exact_logical_size
.or_else(|| cache.get(factory.key()).map(|(_, val)| *val));
if let Some(size) = current_or_previous {
metrics.push(factory.at(now, size));
}
}
}
}
#[cfg(test)]
mod tests;
#[cfg(test)]
pub(crate) use tests::metric_examples;
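A hedged, in-module usage sketch (assuming serde_json) of the Name enum defined above: the serde renames make it round-trip as the exact wire-format strings the downstream pipeline expects, for example "written_data_bytes_delta" for Name::WrittenSizeDelta.

fn name_wire_format_example() {
    // serializes to the renamed wire string, not the Rust identifier
    let json = serde_json::to_string(&Name::WrittenSizeDelta).unwrap();
    assert_eq!(json, "\"written_data_bytes_delta\"");

    // and parses back from the wire string
    let parsed: Name = serde_json::from_str("\"synthetic_storage_size\"").unwrap();
    assert_eq!(parsed, Name::SyntheticSize);
}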

View File

@@ -1,297 +0,0 @@
use super::*;
use std::collections::HashMap;
use std::time::SystemTime;
use utils::lsn::Lsn;
#[test]
fn startup_collected_timeline_metrics_before_advancing() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let mut metrics = Vec::new();
let cache = HashMap::new();
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, SystemTime::now()),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
let now = DateTime::<Utc>::from(SystemTime::now());
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
snap.loaded_at.1.into(),
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn startup_collected_timeline_metrics_second_round() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, before, init] = time_backwards();
let now = DateTime::<Utc>::from(now);
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let mut metrics = Vec::new();
let cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
]);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, just_before, before, init] = time_backwards();
let now = DateTime::<Utc>::from(now);
let just_before = DateTime::<Utc>::from(just_before);
let before = DateTime::<Utc>::from(before);
let initdb_lsn = Lsn(0x10000);
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
let mut metrics = Vec::new();
let cache = HashMap::from([
// at t=before was the last time the last_record_lsn changed
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
// end time of this event is used for the next ones
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0),
]);
let snap = TimelineSnapshot {
loaded_at: (disk_consistent_lsn, init),
last_record_lsn: disk_consistent_lsn,
current_exact_logical_size: Some(0x42000),
};
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(just_before, now, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000)
]
);
}
#[test]
fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
// it can happen that we lose the in-memory layer but have previously sent metrics;
// the reported written_size should never go backwards
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [later, now, at_restart] = time_backwards();
// FIXME: tests would be so much easier if we did not need to juggle back and forth
// SystemTime and DateTime::<Utc> ... Could do the conversion only at upload time?
let now = DateTime::<Utc>::from(now);
let later = DateTime::<Utc>::from(later);
let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
let way_before = before_restart - std::time::Duration::from_secs(10 * 60);
let before_restart = DateTime::<Utc>::from(before_restart);
let way_before = DateTime::<Utc>::from(way_before);
let snap = TimelineSnapshot {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
current_exact_logical_size: None,
};
let mut cache = HashMap::from([
MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
way_before,
before_restart,
// not taken into account, but the timestamps are important
999_999_999,
),
]);
let mut metrics = Vec::new();
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
before_restart,
now,
0
),
MetricsKey::written_size(tenant_id, timeline_id).at(now, 100),
]
);
// now if we cache these metrics, and re-run while "still in recovery"
cache.extend(metrics.drain(..));
// "still in recovery", because our snapshot did not change
snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
assert_eq!(
metrics,
&[
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(now, later, 0),
MetricsKey::written_size(tenant_id, timeline_id).at(later, 100),
]
);
}
#[test]
fn post_restart_current_exact_logical_size_uses_cached() {
let tenant_id = TenantId::generate();
let timeline_id = TimelineId::generate();
let [now, at_restart] = time_backwards();
let now = DateTime::<Utc>::from(now);
let before_restart = at_restart - std::time::Duration::from_secs(5 * 60);
let before_restart = DateTime::<Utc>::from(before_restart);
let snap = TimelineSnapshot {
loaded_at: (Lsn(50), at_restart),
last_record_lsn: Lsn(50),
current_exact_logical_size: None,
};
let cache = HashMap::from([
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100)
]);
let mut metrics = Vec::new();
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
metrics.retain(|(key, _)| key.metric == Name::LogicalSize);
assert_eq!(
metrics,
&[MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 100)]
);
}
#[test]
fn post_restart_synthetic_size_uses_cached_if_available() {
let tenant_id = TenantId::generate();
let ts = TenantSnapshot {
resident_size: 1000,
remote_size: 1000,
// not yet calculated
synthetic_size: 0,
};
let now = SystemTime::now();
let before_restart = DateTime::<Utc>::from(now - std::time::Duration::from_secs(5 * 60));
let now = DateTime::<Utc>::from(now);
let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]);
let mut metrics = Vec::new();
ts.to_metrics(tenant_id, now, &cached, &mut metrics);
assert_eq!(
metrics,
&[
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
MetricsKey::resident_size(tenant_id).at(now, 1000),
MetricsKey::synthetic_size(tenant_id).at(now, 1000),
]
);
}
#[test]
fn post_restart_synthetic_size_is_not_sent_when_not_cached() {
let tenant_id = TenantId::generate();
let ts = TenantSnapshot {
resident_size: 1000,
remote_size: 1000,
// not yet calculated
synthetic_size: 0,
};
let now = SystemTime::now();
let now = DateTime::<Utc>::from(now);
let cached = HashMap::new();
let mut metrics = Vec::new();
ts.to_metrics(tenant_id, now, &cached, &mut metrics);
assert_eq!(
metrics,
&[
MetricsKey::remote_storage_size(tenant_id).at(now, 1000),
MetricsKey::resident_size(tenant_id).at(now, 1000),
// no synthetic size here
]
);
}
fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
let mut times = [std::time::SystemTime::UNIX_EPOCH; N];
times[0] = std::time::SystemTime::now();
for behind in 1..N {
times[behind] = times[0] - std::time::Duration::from_secs(behind as u64);
}
times
}
pub(crate) const fn metric_examples(
tenant_id: TenantId,
timeline_id: TimelineId,
now: DateTime<Utc>,
before: DateTime<Utc>,
) -> [RawMetric; 6] {
[
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0),
MetricsKey::remote_storage_size(tenant_id).at(now, 0),
MetricsKey::resident_size(tenant_id).at(now, 0),
MetricsKey::synthetic_size(tenant_id).at(now, 1),
]
}

View File

@@ -1,443 +0,0 @@
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use serde_with::serde_as;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use super::{metrics::Name, Cache, MetricsKey, RawMetric};
use utils::id::{TenantId, TimelineId};
/// How the metrics from the pageserver are identified.
#[serde_with::serde_as]
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
struct Ids {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>,
}
#[tracing::instrument(skip_all, fields(metrics_total = %metrics.len()))]
pub(super) async fn upload_metrics(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
cancel: &CancellationToken,
node_id: &str,
metrics: &[RawMetric],
cached_metrics: &mut Cache,
) -> anyhow::Result<()> {
let mut uploaded = 0;
let mut failed = 0;
let started_at = std::time::Instant::now();
let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id);
while let Some(res) = iter.next() {
let (chunk, body) = res?;
let event_bytes = body.len();
let is_last = iter.len() == 0;
let res = upload(client, metric_collection_endpoint, body, cancel, is_last)
.instrument(tracing::info_span!(
"upload",
%event_bytes,
uploaded,
total = metrics.len(),
))
.await;
match res {
Ok(()) => {
for (curr_key, curr_val) in chunk {
cached_metrics.insert(*curr_key, *curr_val);
}
uploaded += chunk.len();
}
Err(_) => {
// failure(s) have already been logged
//
// however this is an inconsistency: if we crash here, we will start with the
// values as uploaded. in practice, the rejections no longer happen.
failed += chunk.len();
}
}
}
let elapsed = started_at.elapsed();
tracing::info!(
uploaded,
failed,
elapsed_ms = elapsed.as_millis(),
"done sending metrics"
);
Ok(())
}
// The return type is quite ugly, but it lets us test the chunked serialization in isolation
fn serialize_in_chunks<'a, F>(
chunk_size: usize,
input: &'a [RawMetric],
factory: F,
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
where
F: KeyGen<'a> + 'a,
{
use bytes::BufMut;
struct Iter<'a, F> {
inner: std::slice::Chunks<'a, RawMetric>,
chunk_size: usize,
// write to a BytesMut so that we can cheaply clone the frozen Bytes for retries
buffer: bytes::BytesMut,
// a chunk's worth of events is reused to produce each serialized document
scratch: Vec<Event<Ids, Name>>,
factory: F,
}
impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> {
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
fn next(&mut self) -> Option<Self::Item> {
let chunk = self.inner.next()?;
if self.scratch.is_empty() {
// first round: create events with N strings
self.scratch.extend(
chunk
.iter()
.map(|raw_metric| raw_metric.as_event(&self.factory.generate())),
);
} else {
// next rounds: update_in_place to reuse allocations
assert_eq!(self.scratch.len(), self.chunk_size);
self.scratch
.iter_mut()
.zip(chunk.iter())
.for_each(|(slot, raw_metric)| {
raw_metric.update_in_place(slot, &self.factory.generate())
});
}
let res = serde_json::to_writer(
(&mut self.buffer).writer(),
&EventChunk {
events: (&self.scratch[..chunk.len()]).into(),
},
);
match res {
Ok(()) => Some(Ok((chunk, self.buffer.split().freeze()))),
Err(e) => Some(Err(e)),
}
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {}
let buffer = bytes::BytesMut::new();
let inner = input.chunks(chunk_size);
let scratch = Vec::new();
Iter {
inner,
chunk_size,
buffer,
scratch,
factory,
}
}
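
The iterator above serializes every chunk into one shared BytesMut and hands out buffer.split().freeze(), so a retry only clones a reference-counted Bytes instead of re-serializing. A self-contained sketch of that buffer-reuse pattern, with illustrative names and plain Vec<u32> payloads instead of RawMetric:

use bytes::{BufMut, BytesMut};

fn serialize_chunks(chunks: &[Vec<u32>]) -> Result<Vec<bytes::Bytes>, serde_json::Error> {
    let mut buffer = BytesMut::new();
    let mut out = Vec::with_capacity(chunks.len());
    for chunk in chunks {
        // write the JSON document into the shared buffer
        serde_json::to_writer((&mut buffer).writer(), chunk)?;
        // split() detaches what was written; the BytesMut is reused next round
        out.push(buffer.split().freeze());
    }
    Ok(out)
}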
trait RawMetricExt {
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name>;
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>);
}
impl RawMetricExt for RawMetric {
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name> {
let MetricsKey {
metric,
tenant_id,
timeline_id,
} = self.0;
let (kind, value) = self.1;
Event {
kind,
metric,
idempotency_key: key.to_string(),
value,
extra: Ids {
tenant_id,
timeline_id,
},
}
}
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>) {
use std::fmt::Write;
let MetricsKey {
metric,
tenant_id,
timeline_id,
} = self.0;
let (kind, value) = self.1;
*event = Event {
kind,
metric,
idempotency_key: {
event.idempotency_key.clear();
write!(event.idempotency_key, "{key}").unwrap();
std::mem::take(&mut event.idempotency_key)
},
value,
extra: Ids {
tenant_id,
timeline_id,
},
};
}
}
trait KeyGen<'a>: Copy {
fn generate(&self) -> IdempotencyKey<'a>;
}
impl<'a> KeyGen<'a> for &'a str {
fn generate(&self) -> IdempotencyKey<'a> {
IdempotencyKey::generate(self)
}
}
enum UploadError {
Rejected(reqwest::StatusCode),
Reqwest(reqwest::Error),
Cancelled,
}
impl std::fmt::Debug for UploadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// use the same impl as Display because backoff::retry logs this with both
std::fmt::Display::fmt(self, f)
}
}
impl std::fmt::Display for UploadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
use UploadError::*;
match self {
Rejected(code) => write!(f, "server rejected the metrics with {code}"),
Reqwest(e) => write!(f, "request failed: {e}"),
Cancelled => write!(f, "cancelled"),
}
}
}
impl UploadError {
fn is_reject(&self) -> bool {
matches!(self, UploadError::Rejected(_))
}
}
// this is consumed by the test verifiers
static LAST_IN_BATCH: reqwest::header::HeaderName =
reqwest::header::HeaderName::from_static("pageserver-metrics-last-upload-in-batch");
async fn upload(
client: &reqwest::Client,
metric_collection_endpoint: &reqwest::Url,
body: bytes::Bytes,
cancel: &CancellationToken,
is_last: bool,
) -> Result<(), UploadError> {
let warn_after = 3;
let max_attempts = 10;
let res = utils::backoff::retry(
move || {
let body = body.clone();
async move {
let res = client
.post(metric_collection_endpoint.clone())
.header(reqwest::header::CONTENT_TYPE, "application/json")
.header(
LAST_IN_BATCH.clone(),
if is_last { "true" } else { "false" },
)
.body(body)
.send()
.await;
let res = res.and_then(|res| res.error_for_status());
// 10 redirects are normally allowed, so we don't need to worry about 3xx
match res {
Ok(_response) => Ok(()),
Err(e) => {
let status = e.status().filter(|s| s.is_client_error());
if let Some(status) = status {
// rejection used to be a thing when the server could reject a
// whole batch of metrics if one metric was bad.
Err(UploadError::Rejected(status))
} else {
Err(UploadError::Reqwest(e))
}
}
}
}
},
UploadError::is_reject,
warn_after,
max_attempts,
"upload consumption_metrics",
utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
)
.await;
match &res {
Ok(_) => {}
Err(e) if e.is_reject() => {
// permanent errors currently do not get logged by backoff::retry
// display alternate has no effect, but keeping it here for easier pattern matching.
tracing::error!("failed to upload metrics: {e:#}");
}
Err(_) => {
// these have been logged already
}
}
res
}
#[cfg(test)]
mod tests {
use super::*;
use chrono::{DateTime, Utc};
use once_cell::sync::Lazy;
#[test]
fn chunked_serialization() {
let examples = metric_samples();
assert!(examples.len() > 1);
let factory = FixedGen::new(Utc::now(), "1", 42);
// need to use Event here because serde_json::Value uses the default (unordered) hashmap,
// not a linked hashmap
#[derive(serde::Deserialize)]
struct EventChunk {
events: Vec<Event<Ids, Name>>,
}
let correct = serialize_in_chunks(examples.len(), &examples, factory)
.map(|res| res.unwrap().1)
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
.collect::<Vec<_>>();
for chunk_size in 1..examples.len() {
let actual = serialize_in_chunks(chunk_size, &examples, factory)
.map(|res| res.unwrap().1)
.flat_map(|body| serde_json::from_slice::<EventChunk>(&body).unwrap().events)
.collect::<Vec<_>>();
// if these are equal, it means that the multi-chunk version works as well
assert_eq!(correct, actual);
}
}
#[derive(Clone, Copy)]
struct FixedGen<'a>(chrono::DateTime<chrono::Utc>, &'a str, u16);
impl<'a> FixedGen<'a> {
fn new(now: chrono::DateTime<chrono::Utc>, node_id: &'a str, nonce: u16) -> Self {
FixedGen(now, node_id, nonce)
}
}
impl<'a> KeyGen<'a> for FixedGen<'a> {
fn generate(&self) -> IdempotencyKey<'a> {
IdempotencyKey::for_tests(self.0, self.1, self.2)
}
}
static SAMPLES_NOW: Lazy<DateTime<Utc>> = Lazy::new(|| {
DateTime::parse_from_rfc3339("2023-09-15T00:00:00.123456789Z")
.unwrap()
.into()
});
#[test]
fn metric_image_stability() {
// it is important that these strings stay as they are
let examples = [
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"written_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"incremental","start_time":"2023-09-14T00:00:00.123456789Z","stop_time":"2023-09-15T00:00:00.123456789Z","metric":"written_data_bytes_delta","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"timeline_logical_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000","timeline_id":"ffffffffffffffffffffffffffffffff"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"remote_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"resident_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":0,"tenant_id":"00000000000000000000000000000000"}"#,
),
(
line!(),
r#"{"type":"absolute","time":"2023-09-15T00:00:00.123456789Z","metric":"synthetic_storage_size","idempotency_key":"2023-09-15 00:00:00.123456789 UTC-1-0000","value":1,"tenant_id":"00000000000000000000000000000000"}"#,
),
];
let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
let examples = examples.into_iter().zip(metric_samples());
for ((line, expected), (key, (kind, value))) in examples {
let e = consumption_metrics::Event {
kind,
metric: key.metric,
idempotency_key: idempotency_key.to_string(),
value,
extra: Ids {
tenant_id: key.tenant_id,
timeline_id: key.timeline_id,
},
};
let actual = serde_json::to_string(&e).unwrap();
assert_eq!(expected, actual, "example for {kind:?} from line {line}");
}
}
fn metric_samples() -> [RawMetric; 6] {
let tenant_id = TenantId::from_array([0; 16]);
let timeline_id = TimelineId::from_array([0xff; 16]);
let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
.unwrap()
.into();
let [now, before] = [*SAMPLES_NOW, before];
super::super::metrics::metric_examples(tenant_id, timeline_id, now, before)
}
}

View File

@@ -444,7 +444,6 @@ impl<'a> WalIngest<'a> {
// need to clear the corresponding bits in the visibility map.
let mut new_heap_blkno: Option<u32> = None;
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
match self.timeline.pg_version {
14 => {
@@ -471,20 +470,14 @@ impl<'a> WalIngest<'a> {
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to a different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP_LOCK {
let xlrec = v14::XlHeapLock::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -504,12 +497,6 @@ impl<'a> WalIngest<'a> {
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
let xlrec = v14::XlHeapLockUpdated::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
}
}
} else {
bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -539,20 +526,14 @@ impl<'a> WalIngest<'a> {
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to a different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP_LOCK {
let xlrec = v15::XlHeapLock::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -572,12 +553,6 @@ impl<'a> WalIngest<'a> {
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
let xlrec = v15::XlHeapLockUpdated::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
}
}
} else {
bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -607,20 +582,14 @@ impl<'a> WalIngest<'a> {
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to a different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP_LOCK {
let xlrec = v16::XlHeapLock::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
@@ -640,12 +609,6 @@ impl<'a> WalIngest<'a> {
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
new_heap_blkno = Some(decoded.blocks[0].blkno);
}
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
let xlrec = v16::XlHeapLockUpdated::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
}
}
} else {
bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
@@ -653,6 +616,7 @@ impl<'a> WalIngest<'a> {
}
_ => {}
}
// FIXME: What about XLOG_HEAP_LOCK and XLOG_HEAP2_LOCK_UPDATED?
// Clear the VM bits if required.
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
@@ -696,7 +660,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -712,7 +676,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -726,7 +690,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -753,8 +717,6 @@ impl<'a> WalIngest<'a> {
// need to clear the corresponding bits in the visibility map.
let mut new_heap_blkno: Option<u32> = None;
let mut old_heap_blkno: Option<u32> = None;
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
match self.timeline.pg_version {
@@ -783,14 +745,14 @@ impl<'a> WalIngest<'a> {
// we can't validate the remaining number of bytes without parsing
// the tuple data.
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
old_heap_blkno = Some(decoded.blocks[0].blkno);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
// non-HOT update where the new tuple goes to a different page than
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
// set.
new_heap_blkno = Some(decoded.blocks[0].blkno);
new_heap_blkno = Some(decoded.blocks[1].blkno);
}
}
pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
@@ -810,11 +772,7 @@ impl<'a> WalIngest<'a> {
}
}
pg_constants::XLOG_NEON_HEAP_LOCK => {
let xlrec = v16::rm_neon::XlNeonHeapLock::decode(buf);
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
old_heap_blkno = Some(decoded.blocks[0].blkno);
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
}
/* XLOG_NEON_HEAP_LOCK doesn't need special care */
}
info => bail!("Unknown WAL record type for Neon RMGR: {}", info),
}
@@ -825,6 +783,8 @@ impl<'a> WalIngest<'a> {
),
}
// FIXME: What about XLOG_NEON_HEAP_LOCK?
// Clear the VM bits if required.
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
let vm_rel = RelTag {
@@ -867,7 +827,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -883,7 +843,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno,
old_heap_blkno: None,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
@@ -897,7 +857,7 @@ impl<'a> WalIngest<'a> {
NeonWalRecord::ClearVisibilityMapFlags {
new_heap_blkno: None,
old_heap_blkno,
flags,
flags: pg_constants::VISIBILITYMAP_VALID_BITS,
},
ctx,
)
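
One side of this diff clears all visibility-map bits unconditionally, while the other selects between pg_constants::VISIBILITYMAP_VALID_BITS and pg_constants::VISIBILITYMAP_ALL_FROZEN depending on which WAL record touched the heap page. A compact sketch of the record-dependent selection, with a made-up HeapRecordKind enum standing in for the decoded record types (constant values mirror PostgreSQL's visibilitymap.h):

enum HeapRecordKind {
    Insert,
    Delete,
    Update,
    Lock,
    LockUpdated,
}

fn vm_flags_to_clear(kind: &HeapRecordKind) -> u8 {
    const VISIBILITYMAP_ALL_FROZEN: u8 = 0x02;
    const VISIBILITYMAP_VALID_BITS: u8 = 0x03;
    match kind {
        // a tuple change invalidates both the all-visible and all-frozen bits
        HeapRecordKind::Insert | HeapRecordKind::Delete | HeapRecordKind::Update => {
            VISIBILITYMAP_VALID_BITS
        }
        // locking a tuple only needs the all-frozen bit cleared
        HeapRecordKind::Lock | HeapRecordKind::LockUpdated => VISIBILITYMAP_ALL_FROZEN,
    }
}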

View File

@@ -219,66 +219,20 @@ pub mod v14 {
old_offnum: buf.get_u16_le(),
old_infobits_set: buf.get_u8(),
flags: buf.get_u8(),
t_cid: buf.get_u32_le(),
t_cid: buf.get_u32(),
new_xmax: buf.get_u32_le(),
new_offnum: buf.get_u16_le(),
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapLock {
pub locking_xid: TransactionId,
pub offnum: OffsetNumber,
pub _padding: u16,
pub t_cid: u32,
pub infobits_set: u8,
pub flags: u8,
}
impl XlHeapLock {
pub fn decode(buf: &mut Bytes) -> XlHeapLock {
XlHeapLock {
locking_xid: buf.get_u32_le(),
offnum: buf.get_u16_le(),
_padding: buf.get_u16_le(),
t_cid: buf.get_u32_le(),
infobits_set: buf.get_u8(),
flags: buf.get_u8(),
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapLockUpdated {
pub xmax: TransactionId,
pub offnum: OffsetNumber,
pub infobits_set: u8,
pub flags: u8,
}
impl XlHeapLockUpdated {
pub fn decode(buf: &mut Bytes) -> XlHeapLockUpdated {
XlHeapLockUpdated {
xmax: buf.get_u32_le(),
offnum: buf.get_u16_le(),
infobits_set: buf.get_u8(),
flags: buf.get_u8(),
}
}
}
}
pub mod v15 {
pub use super::v14::{
XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapLockUpdated, XlHeapMultiInsert, XlHeapUpdate,
};
pub use super::v14::{XlHeapDelete, XlHeapInsert, XlHeapMultiInsert, XlHeapUpdate};
}
pub mod v16 {
pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert};
pub use super::v14::{XlHeapInsert, XlHeapMultiInsert};
use bytes::{Buf, Bytes};
use postgres_ffi::{OffsetNumber, TransactionId};
@@ -324,26 +278,6 @@ pub mod v16 {
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapLock {
pub locking_xid: TransactionId,
pub offnum: OffsetNumber,
pub infobits_set: u8,
pub flags: u8,
}
impl XlHeapLock {
pub fn decode(buf: &mut Bytes) -> XlHeapLock {
XlHeapLock {
locking_xid: buf.get_u32_le(),
offnum: buf.get_u16_le(),
infobits_set: buf.get_u8(),
flags: buf.get_u8(),
}
}
}
/* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. */
pub mod rm_neon {
use bytes::{Buf, Bytes};
@@ -432,28 +366,6 @@ pub mod v16 {
}
}
}
#[repr(C)]
#[derive(Debug)]
pub struct XlNeonHeapLock {
pub locking_xid: TransactionId,
pub t_cid: u32,
pub offnum: OffsetNumber,
pub infobits_set: u8,
pub flags: u8,
}
impl XlNeonHeapLock {
pub fn decode(buf: &mut Bytes) -> XlNeonHeapLock {
XlNeonHeapLock {
locking_xid: buf.get_u32_le(),
t_cid: buf.get_u32_le(),
offnum: buf.get_u16_le(),
infobits_set: buf.get_u8(),
flags: buf.get_u8(),
}
}
}
}
}

View File

@@ -153,7 +153,7 @@ lfc_ensure_opened(void)
return false;
}
}
return true;
return false;
}
static void
@@ -222,9 +222,8 @@ lfc_change_limit_hook(int newval, void *extra)
/*
* The stats collector detaches shared memory, so we should not try to access shared memory here.
* Parallel workers first assign the default value (0), so do not perform truncation in parallel workers.
* The Postmaster can handle SIGHUP and it has access to shared memory (UsedShmemSegAddr != NULL), but has no PGPROC.
*/
if (!lfc_ctl || !MyProc || !UsedShmemSegAddr || IsParallelWorker())
if (!lfc_ctl || !UsedShmemSegAddr || IsParallelWorker())
return;
/* Open cache file if not done yet */
@@ -641,7 +640,6 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ);
if (rc != BLCKSZ)
{
LWLockRelease(lfc_lock);
lfc_disable("write");
}
else
@@ -652,8 +650,9 @@ lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
dlist_push_tail(&lfc_ctl->lru, &entry->lru_node);
entry->bitmap[chunk_offs >> 5] |= (1 << (chunk_offs & 31));
LWLockRelease(lfc_lock);
}
LWLockRelease(lfc_lock);
}
/*

View File

@@ -121,7 +121,7 @@ async fn collect_metrics_iteration(
let current_metrics = gather_proxy_io_bytes_per_client();
let metrics_to_send: Vec<Event<Ids, &'static str>> = current_metrics
let metrics_to_send: Vec<Event<Ids>> = current_metrics
.iter()
.filter_map(|(curr_key, (curr_val, curr_time))| {
let mut start_time = *curr_time;

View File

@@ -1,7 +1,6 @@
pytest_plugins = (
"fixtures.pg_version",
"fixtures.parametrize",
"fixtures.httpserver",
"fixtures.neon_fixtures",
"fixtures.benchmark_fixture",
"fixtures.pg_stats",

View File

@@ -1,45 +0,0 @@
from typing import Tuple
import pytest
from pytest_httpserver import HTTPServer
# TODO: mypy fails with:
# Module "fixtures.neon_fixtures" does not explicitly export attribute "PortDistributor" [attr-defined]
# from fixtures.neon_fixtures import PortDistributor
# compared to the fixtures from pytest_httpserver with the same names, these are
# always function-scoped, so you can check and stop the server in tests.
@pytest.fixture(scope="function")
def httpserver_ssl_context():
return None
@pytest.fixture(scope="function")
def make_httpserver(httpserver_listen_address, httpserver_ssl_context):
host, port = httpserver_listen_address
if not host:
host = HTTPServer.DEFAULT_LISTEN_HOST
if not port:
port = HTTPServer.DEFAULT_LISTEN_PORT
server = HTTPServer(host=host, port=port, ssl_context=httpserver_ssl_context)
server.start()
yield server
server.clear()
if server.is_running():
server.stop()
@pytest.fixture(scope="function")
def httpserver(make_httpserver):
server = make_httpserver
yield server
server.clear()
@pytest.fixture(scope="function")
def httpserver_listen_address(port_distributor) -> Tuple[str, int]:
port = port_distributor.get_port()
return ("localhost", port)

View File

@@ -223,6 +223,12 @@ def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistrib
return PortDistributor(base_port=worker_base_port, port_number=worker_port_num)
@pytest.fixture(scope="session")
def httpserver_listen_address(port_distributor: PortDistributor):
port = port_distributor.get_port()
return ("localhost", port)
@pytest.fixture(scope="function")
def default_broker(
port_distributor: PortDistributor,
@@ -847,6 +853,18 @@ class NeonEnv:
"""Get list of safekeeper endpoints suitable for safekeepers GUC"""
return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers)
def timeline_dir(
self, tenant_id: TenantId, timeline_id: TimelineId, pageserver_id: Optional[int] = None
) -> Path:
"""Get a timeline directory's path based on the repo directory of the test environment"""
return (
self.tenant_dir(tenant_id, pageserver_id=pageserver_id) / "timelines" / str(timeline_id)
)
def tenant_dir(self, tenant_id: TenantId, pageserver_id: Optional[int] = None) -> Path:
"""Get a tenant directory's path based on the repo directory of the test environment"""
return self.get_pageserver(pageserver_id).workdir / "tenants" / str(tenant_id)
def get_pageserver_version(self) -> str:
bin_pageserver = str(self.neon_binpath / "pageserver")
res = subprocess.run(
@@ -1568,21 +1586,6 @@ class NeonPageserver(PgProtocol):
'.*registered custom resource manager "neon".*',
]
def timeline_dir(self, tenant_id: TenantId, timeline_id: Optional[TimelineId] = None) -> Path:
"""Get a timeline directory's path based on the repo directory of the test environment"""
if timeline_id is None:
return self.tenant_dir(tenant_id) / "timelines"
return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
def tenant_dir(
self,
tenant_id: Optional[TenantId] = None,
) -> Path:
"""Get a tenant directory's path based on the repo directory of the test environment"""
if tenant_id is None:
return self.workdir / "tenants"
return self.workdir / "tenants" / str(tenant_id)
def start(
self,
overrides: Tuple[str, ...] = (),
@@ -2133,28 +2136,6 @@ class NeonProxy(PgProtocol):
def _wait_until_ready(self):
requests.get(f"http://{self.host}:{self.http_port}/v1/status")
def http_query(self, query, args, **kwargs):
# TODO maybe use default values if not provided
user = kwargs["user"]
password = kwargs["password"]
expected_code = kwargs.get("expected_code")
connstr = f"postgresql://{user}:{password}@{self.domain}:{self.proxy_port}/postgres"
response = requests.post(
f"https://{self.domain}:{self.external_http_port}/sql",
data=json.dumps({"query": query, "params": args}),
headers={
"Content-Type": "application/sql",
"Neon-Connection-String": connstr,
"Neon-Pool-Opt-In": "true",
},
verify=str(self.test_output_dir / "proxy.crt"),
)
if expected_code is not None:
assert response.status_code == kwargs["expected_code"], f"response: {response.json()}"
return response.json()
def get_metrics(self) -> str:
request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics")
request_result.raise_for_status()

View File

@@ -236,27 +236,15 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
response = list_prefix(neon_env_builder, prefix)
keys = response["KeyCount"]
objects = response.get("Contents", [])
common_prefixes = response.get("CommonPrefixes", [])
remote_storage = neon_env_builder.pageserver_remote_storage
is_mock_s3 = isinstance(remote_storage, S3Storage) and not remote_storage.cleanup
if is_mock_s3:
if keys == 1 and len(objects) == 0 and len(common_prefixes) == 1:
# this has been seen in the wild in tests, with the contradictory logging linked below
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
# this seems like a mock_s3 issue
log.warn(
f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
)
keys = 0
elif keys != 0 and len(objects) == 0:
# this has been seen in one case with mock_s3:
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
# looking at the moto impl, it might be that there's a race where the common prefix (subdirectory) does not go away with deletes
log.warn(
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
)
if keys != 0 and len(objects) == 0:
# this has been seen in one case with mock_s3:
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
# looking at the moto impl, it might be that there's a race where the common prefix (subdirectory) does not go away with deletes
common_prefixes = response.get("CommonPrefixes", [])
log.warn(
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
)
assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"

View File

@@ -115,8 +115,6 @@ class S3Storage:
prefix_in_bucket: str
client: S3Client
cleanup: bool
"""Is this MOCK_S3 (false) or REAL_S3 (true)"""
real: bool
endpoint: Optional[str] = None
def access_env_vars(self) -> Dict[str, str]:
@@ -267,7 +265,6 @@ class RemoteStorageKind(str, enum.Enum):
prefix_in_bucket="",
client=client,
cleanup=False,
real=False,
)
assert self == RemoteStorageKind.REAL_S3
@@ -303,7 +300,6 @@ class RemoteStorageKind(str, enum.Enum):
prefix_in_bucket=prefix_in_bucket,
client=client,
cleanup=True,
real=True,
)

View File

@@ -44,7 +44,7 @@ def measure_recovery_time(env: NeonCompare):
# Stop pageserver and remove tenant data
env.env.pageserver.stop()
timeline_dir = env.env.pageserver.timeline_dir(env.tenant, env.timeline)
timeline_dir = env.env.timeline_dir(env.tenant, env.timeline)
shutil.rmtree(timeline_dir)
# Start pageserver

View File

@@ -135,7 +135,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]
@@ -166,7 +166,7 @@ def test_timeline_create_break_after_uninit_mark(neon_env_builder: NeonEnvBuilde
tenant_id = env.initial_tenant
timelines_dir = env.pageserver.timeline_dir(tenant_id)
timelines_dir = env.pageserver.workdir / "tenants" / str(tenant_id) / "timelines"
old_tenant_timelines = env.neon_cli.list_timelines(tenant_id)
initial_timeline_dirs = [d for d in timelines_dir.iterdir()]

View File

@@ -20,7 +20,7 @@ from fixtures.pageserver.utils import (
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.pg_version import PgVersion
from fixtures.pg_version import PgVersion, skip_on_postgres
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser
from fixtures.types import Lsn
@@ -151,6 +151,7 @@ def test_create_snapshot(
shutil.copytree(test_output_dir, compatibility_snapshot_dir)
@skip_on_postgres(PgVersion.V16, reason="TODO: Enable after the first Postgres 16 release")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")
@@ -208,6 +209,7 @@ def test_backward_compatibility(
), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
@skip_on_postgres(PgVersion.V16, reason="TODO: Enable after the first Postgres 16 release")
@check_ondisk_data_compatibility_if_enabled
@pytest.mark.xdist_group("compatibility")
@pytest.mark.order(after="test_create_snapshot")

View File

@@ -1,22 +1,16 @@
import os
import pathlib
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
from fixtures.pg_version import PgVersion
from fixtures.utils import query_scalar
#
# Test CREATE DATABASE when there have been relmapper changes
#
@pytest.mark.parametrize("strategy", ["file_copy", "wal_log"])
def test_createdb(neon_simple_env: NeonEnv, strategy: str):
def test_createdb(neon_simple_env: NeonEnv):
env = neon_simple_env
if env.pg_version == PgVersion.V14 and strategy == "wal_log":
pytest.skip("wal_log strategy not supported on PostgreSQL 14")
env.neon_cli.create_branch("test_createdb", "empty")
endpoint = env.endpoints.create_start("test_createdb")
@@ -26,10 +20,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str):
# Cause a 'relmapper' change in the original branch
cur.execute("VACUUM FULL pg_class")
if env.pg_version == PgVersion.V14:
cur.execute("CREATE DATABASE foodb")
else:
cur.execute(f"CREATE DATABASE foodb STRATEGY={strategy}")
cur.execute("CREATE DATABASE foodb")
lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()")

View File

@@ -42,11 +42,12 @@ def handle_role(dbs, roles, operation):
raise ValueError("Invalid op")
def ddl_forward_handler(
request: Request, dbs: Dict[str, str], roles: Dict[str, str], ddl: "DdlForwardingContext"
) -> Response:
fail = False
def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response:
log.info(f"Received request with data {request.get_data(as_text=True)}")
if ddl.fail:
if fail:
log.info("FAILING")
return Response(status=500, response="Failed just cuz")
if request.json is None:
@@ -71,7 +72,6 @@ class DdlForwardingContext:
self.port = port
self.dbs: Dict[str, str] = {}
self.roles: Dict[str, str] = {}
self.fail = False
endpoint = "/management/api/v2/roles_and_databases"
ddl_url = f"http://{host}:{port}{endpoint}"
self.pg.configure(
@@ -82,7 +82,7 @@ class DdlForwardingContext:
)
log.info(f"Listening on {ddl_url}")
self.server.expect_request(endpoint, method="PATCH").respond_with_handler(
lambda request: ddl_forward_handler(request, self.dbs, self.roles, self)
lambda request: ddl_forward_handler(request, self.dbs, self.roles)
)
def __enter__(self):
@@ -103,9 +103,6 @@ class DdlForwardingContext:
def wait(self, timeout=3):
self.server.wait(timeout=timeout)
def failures(self, bool):
self.fail = bool
def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]:
res = self.send(query)
self.wait(timeout=timeout)
@@ -206,9 +203,9 @@ def test_ddl_forwarding(ddl: DdlForwardingContext):
assert ddl.dbs == {"stork": "cork"}
with pytest.raises(psycopg2.InternalError):
ddl.failures(True)
global fail
fail = True
cur.execute("CREATE DATABASE failure WITH OWNER=cork")
ddl.wait()
ddl.failures(False)
conn.close()

View File

@@ -417,7 +417,7 @@ def poor_mans_du(
largest_layer = 0
smallest_layer = None
for tenant_id, timeline_id in timelines:
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
total = 0
for file in timeline_dir.iterdir():

View File

@@ -15,45 +15,45 @@ from fixtures.types import TimelineId
# Test configuration
#
# Create a table with {NUM_ROWS} rows, and perform {UPDATES_TO_PERFORM} random
# UPDATEs on it, using {NUM_CONNECTIONS} separate connections.
NUM_CONNECTIONS = 10
NUM_ROWS = 100000
UPDATES_TO_PERFORM = 10000
# Create a table with {num_rows} rows, and perform {updates_to_perform} random
# UPDATEs on it, using {num_connections} separate connections.
num_connections = 10
num_rows = 100000
updates_to_perform = 10000
updates_performed = 0
# Run random UPDATEs on test table
async def update_table(endpoint: Endpoint):
global updates_performed
pg_conn = await endpoint.connect_async()
while updates_performed < updates_to_perform:
updates_performed += 1
id = random.randrange(1, num_rows)
await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}")
# Perform aggressive GC with 0 horizon
async def gc(env: NeonEnv, timeline: TimelineId):
pageserver_http = env.pageserver.http_client()
loop = asyncio.get_running_loop()
def do_gc():
pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
with concurrent.futures.ThreadPoolExecutor() as pool:
while updates_performed < updates_to_perform:
await loop.run_in_executor(pool, do_gc)
# At the same time, run UPDATEs and GC
async def update_and_gc(env: NeonEnv, endpoint: Endpoint, timeline: TimelineId):
workers = []
updates_performed = 0
# Perform aggressive GC with 0 horizon
async def gc(env: NeonEnv, timeline: TimelineId):
pageserver_http = env.pageserver.http_client()
nonlocal updates_performed
global UPDATES_TO_PERFORM
loop = asyncio.get_running_loop()
def do_gc():
pageserver_http.timeline_checkpoint(env.initial_tenant, timeline)
pageserver_http.timeline_gc(env.initial_tenant, timeline, 0)
with concurrent.futures.ThreadPoolExecutor() as pool:
while updates_performed < UPDATES_TO_PERFORM:
await loop.run_in_executor(pool, do_gc)
# Run random UPDATEs on test table
async def update_table(endpoint: Endpoint):
pg_conn = await endpoint.connect_async()
nonlocal updates_performed
while updates_performed < UPDATES_TO_PERFORM:
updates_performed += 1
id = random.randrange(1, NUM_ROWS)
await pg_conn.fetchrow(f"UPDATE foo SET counter = counter + 1 WHERE id = {id}")
for _ in range(NUM_CONNECTIONS):
for _ in range(num_connections):
workers.append(asyncio.create_task(update_table(endpoint)))
workers.append(asyncio.create_task(gc(env, timeline)))
@@ -81,7 +81,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
f"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, {NUM_ROWS}) g
FROM generate_series(1, {num_rows}) g
"""
)
cur.execute("CREATE INDEX ON foo(id)")
@@ -91,7 +91,7 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
cur.execute("SELECT COUNT(*), SUM(counter) FROM foo")
r = cur.fetchone()
assert r is not None
assert r == (NUM_ROWS, UPDATES_TO_PERFORM)
assert r == (num_rows, updates_to_perform)
#
@@ -99,7 +99,6 @@ def test_gc_aggressive(neon_env_builder: NeonEnvBuilder):
def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind):
# Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
num_index_uploads = 0
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
@@ -161,5 +160,5 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder, remote_storage_kind:
log.info(f"{num_index_uploads} index uploads after GC iteration {i}")
after = num_index_uploads
log.info(f"{after - before} new index uploads during test")
log.info(f"{after-before} new index uploads during test")
assert after - before < 5

View File

@@ -271,7 +271,7 @@ def _import(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)

View File

@@ -55,7 +55,7 @@ def test_basic_eviction(
for sk in env.safekeepers:
sk.stop()
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_path = env.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
@@ -243,7 +243,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
assert by_kind["Image"] > 0
assert by_kind["Delta"] > 0
assert by_kind["InMemory"] == 0
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
log.info("resident layers count before eviction: %s", len(resident_layers))
log.info("evict all layers")
@@ -251,7 +251,7 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder):
def ensure_resident_and_remote_size_metrics():
log.info("ensure that all the layers are gone")
resident_layers = list(env.pageserver.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
resident_layers = list(env.timeline_dir(tenant_id, timeline_id).glob("*-*_*"))
# we have disabled all background loops, so, this should hold
assert len(resident_layers) == 0

View File

@@ -38,7 +38,7 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
new_temp_layer_files = list(
filter(
lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
[path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()],
[path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
)
)
@@ -84,7 +84,7 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv):
new_temp_layer_files = list(
filter(
lambda file: str(file).endswith(NeonPageserver.TEMP_FILE_SUFFIX),
[path for path in env.pageserver.timeline_dir(tenant_id, timeline_id).iterdir()],
[path for path in env.timeline_dir(tenant_id, timeline_id).iterdir()],
)
)

View File

@@ -1,44 +0,0 @@
import threading
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin
#
# Test resizing the local file cache while a pgbench workload is running
#
@pytest.mark.timeout(600)
def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
env = neon_simple_env
env.neon_cli.create_branch("test_lfc_resize", "empty")
endpoint = env.endpoints.create_start(
"test_lfc_resize",
config_lines=[
"neon.file_cache_path='file.cache'",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
],
)
n_resize = 10
scale = 10
log.info("postgres is running on 'test_lfc_resize' branch")
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", "-c4", f"-T{n_resize}", "-Mprepared", connstr])
thread = threading.Thread(target=run_pgbench, args=(endpoint.connstr(),), daemon=True)
thread.start()
conn = endpoint.connect()
cur = conn.cursor()
for i in range(n_resize):
cur.execute(f"alter system set neon.file_cache_size_limit='{i*10}MB'")
cur.execute("select pg_reload_conf()")
time.sleep(1)
thread.join()

View File

@@ -0,0 +1,265 @@
#
# Test for collecting metrics from pageserver and proxy.
# Use mock HTTP server to receive metrics and verify that they look sane.
#
import time
from pathlib import Path
from typing import Iterator
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
PSQL,
NeonEnvBuilder,
NeonProxy,
VanillaPostgres,
wait_for_last_flush_lsn,
)
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
# ==============================================================================
# Storage metrics tests
# ==============================================================================
initial_tenant = TenantId.generate()
remote_uploaded = 0
checks = {
"written_size": lambda value: value > 0,
"resident_size": lambda value: value >= 0,
# the >= 0 check here is to avoid a race condition when we receive metrics before
# remote_uploaded is updated
"remote_storage_size": lambda value: value > 0 if remote_uploaded > 0 else value >= 0,
# logical size may lag behind the actual size, so allow 0 here
"timeline_logical_size": lambda value: value >= 0,
}
metric_kinds_checked = set([])
#
# verify that metrics look minimally sane
#
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
log.info("received events:")
log.info(events)
for event in events:
assert event["tenant_id"] == str(
initial_tenant
), "Expecting metrics only from the initial tenant"
metric_name = event["metric"]
check = checks.get(metric_name)
# calm down mypy
if check is not None:
assert check(event["value"]), f"{metric_name} isn't valid"
global metric_kinds_checked
metric_kinds_checked.add(metric_name)
return Response(status=200)
@pytest.mark.parametrize(
"remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
)
def test_metric_collection(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
remote_storage_kind: RemoteStorageKind,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr, we will use the manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = (
f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
"""
+ "tenant_config={pitr_interval = '0 sec'}"
)
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
# Set the initial tenant of the test, which we expect the metrics to come from
global initial_tenant
initial_tenant = neon_env_builder.initial_tenant
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start()
# The order of fixture shutdown is not specified; if the http server goes down
# before the pageserver, the pageserver log might contain such errors at the end.
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
timeline_id = env.neon_cli.create_branch("test_metric_collection")
endpoint = env.endpoints.create_start("test_metric_collection")
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
# Helper function that gets the number of remote ops of the given kind from the metrics
def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
ps_metrics = env.pageserver.http_client().get_metrics()
total = 0.0
for sample in ps_metrics.query_all(
name="pageserver_remote_operation_seconds_count",
filter={
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
):
total += sample[2]
return int(total)
# upload some data to remote storage
if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http = env.pageserver.http_client()
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
global remote_uploaded
remote_uploaded = get_num_remote_ops("index", "upload")
assert remote_uploaded > 0
# wait longer than collecting interval and check that all requests are served
time.sleep(3)
httpserver.check()
global metric_kinds_checked, checks
expected_checks = set(checks.keys())
assert len(metric_kinds_checked) == len(
checks
), f"Expected to receive and check all kind of metrics, but {expected_checks - metric_kinds_checked} got uncovered"
# ==============================================================================
# Proxy metrics tests
# ==============================================================================
def proxy_metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
log.info("received events:")
log.info(events)
# perform basic sanity checks
for event in events:
assert event["metric"] == "proxy_io_bytes_per_client"
assert event["endpoint_id"] == "test_endpoint_id"
assert event["value"] >= 0
assert event["stop_time"] >= event["start_time"]
return Response(status=200)
@pytest.fixture(scope="function")
def proxy_with_metric_collector(
port_distributor: PortDistributor,
neon_binpath: Path,
httpserver_listen_address,
test_output_dir: Path,
) -> Iterator[NeonProxy]:
"""Neon proxy that routes through link auth and has metric collection enabled."""
http_port = port_distributor.get_port()
proxy_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()
external_http_port = port_distributor.get_port()
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
metric_collection_interval = "5s"
with NeonProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
proxy_port=proxy_port,
http_port=http_port,
mgmt_port=mgmt_port,
external_http_port=external_http_port,
metric_collection_endpoint=metric_collection_endpoint,
metric_collection_interval=metric_collection_interval,
auth_backend=NeonProxy.Link(),
) as proxy:
proxy.start()
yield proxy
@pytest.mark.asyncio
async def test_proxy_metric_collection(
httpserver: HTTPServer,
proxy_with_metric_collector: NeonProxy,
vanilla_pg: VanillaPostgres,
):
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
proxy_metrics_handler
)
# generate some load so that metrics are produced;
# sleep for 5 seconds to give the metric collector time to collect them
psql = await PSQL(
host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port
).run(
"create table tbl as select * from generate_series(0,1000); select pg_sleep(5); select 42"
)
base_uri = proxy_with_metric_collector.link_auth_uri
link = await NeonProxy.find_auth_link(base_uri, psql)
psql_session_id = NeonProxy.get_session_id(base_uri, link)
await NeonProxy.activate_link_auth(vanilla_pg, proxy_with_metric_collector, psql_session_id)
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
# generate some load so that metrics are produced;
# sleep for 5 seconds to give the metric collector time to collect them
psql = await PSQL(
host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port
).run("insert into tbl select * from generate_series(0,1000); select pg_sleep(5); select 42")
link = await NeonProxy.find_auth_link(base_uri, psql)
psql_session_id = NeonProxy.get_session_id(base_uri, link)
await NeonProxy.activate_link_auth(
vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=False
)
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
httpserver.check()

View File

@@ -3,6 +3,7 @@
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, DefaultDict, Dict, Tuple
import pytest
@@ -114,7 +115,7 @@ def test_ondemand_download_large_rel(
env.pageserver.stop()
# remove all the layer files
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
layer.unlink()
@@ -236,7 +237,7 @@ def test_ondemand_download_timetravel(
env.pageserver.stop()
# remove all the layer files
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer}")
layer.unlink()
@@ -300,7 +301,6 @@ def test_ondemand_download_timetravel(
# they are present only in the remote storage, only locally, or both.
# It should not change.
assert filled_current_physical == get_api_current_physical_size()
endpoint_old.stop()
#
@@ -323,8 +323,8 @@ def test_download_remote_layers_api(
"compaction_period": "0s",
# small checkpoint distance to create more delta layer files
"checkpoint_distance": f"{1 * 1024 ** 2}", # 1 MB
"compaction_threshold": "999999",
"image_creation_threshold": "999999",
"compaction_threshold": "1",
"image_creation_threshold": "1",
"compaction_target_size": f"{1 * 1024 ** 2}", # 1 MB
}
)
@@ -357,20 +357,8 @@ def test_download_remote_layers_api(
tenant_id, timeline_id, "pageserver_resident_physical_size"
)
# Shut down safekeepers before starting the pageserver.
# If we don't, they might stream us more WAL.
for sk in env.safekeepers:
sk.stop()
# it is sad that we cannot flush an in-memory layer without compaction, but
# we work around it with a very high layer0 count and image layer creation
# threshold
client.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload_queue_empty(client, tenant_id, timeline_id)
filled_current_physical = get_api_current_physical_size()
log.info(f"filled_current_physical: {filled_current_physical}")
log.info(filled_current_physical)
filled_size = get_resident_physical_size()
log.info(f"filled_size: {filled_size}")
assert filled_current_physical == filled_size, "we don't yet do layer eviction"
@@ -378,10 +366,18 @@ def test_download_remote_layers_api(
env.pageserver.stop()
# remove all the layer files
for layer in env.pageserver.tenant_dir().glob("*/timelines/*/*-*_*"):
# XXX only delete some of the layer files, to show that it really just downloads all the layers
for layer in (Path(env.pageserver.workdir) / "tenants").glob("*/timelines/*/*-*_*"):
log.info(f"unlinking layer {layer.name}")
layer.unlink()
# Shut down safekeepers before starting the pageserver.
# If we don't, the tenant's walreceiver handler will trigger the
# logical size computation task, and that downloads layers,
# which makes our assertions on size fail.
for sk in env.safekeepers:
sk.stop(immediate=True)
##### Second start, restore the data and ensure it's the same
env.pageserver.start(extra_env_vars={"FAILPOINTS": "remote-storage-download-pre-rename=return"})
env.pageserver.allowed_errors.extend(
@@ -395,21 +391,32 @@ def test_download_remote_layers_api(
###### Phase 1: exercise download error code path
# the comparison here only requires the size to be at least the previous size, because it's possible we received WAL after last_flush_lsn_upload;
# a witnessed difference of, for example, 29827072 (filled_current_physical) to 29868032 (here) is no good reason to fail the test.
this_time = get_api_current_physical_size()
assert (
filled_current_physical == this_time
filled_current_physical <= this_time
), "current_physical_size is sum of loaded layer sizes, independent of whether local or remote"
if filled_current_physical != this_time:
log.info(
f"fixing up filled_current_physical from {filled_current_physical} to {this_time} ({this_time - filled_current_physical})"
)
filled_current_physical = this_time
post_unlink_size = get_resident_physical_size()
log.info(f"post_unlink_size: {post_unlink_size}")
assert (
post_unlink_size < filled_size
), "we just deleted layers and didn't cause anything to re-download them yet"
assert filled_size - post_unlink_size > 5 * (
1024**2
), "we may be downloading some layers as part of tenant activation"
# issue downloads that we know will fail
info = client.timeline_download_remote_layers(
tenant_id,
timeline_id,
# allow some concurrency to unveil potential concurrency bugs
max_concurrent_downloads=10,
errors_ok=True,
at_least_one_download=False,
@@ -418,9 +425,9 @@ def test_download_remote_layers_api(
assert info["state"] == "Completed"
assert info["total_layer_count"] > 0
assert info["successful_download_count"] == 0
# can't assert == total_layer_count because timeline_detail also tries to
# download layers for logical size, but this might not always hold.
assert info["failed_download_count"] > 0
assert (
info["failed_download_count"] > 0
) # can't assert == total_layer_count because attach + tenant status downloads some layers
assert (
info["total_layer_count"]
== info["successful_download_count"] + info["failed_download_count"]
@@ -429,6 +436,7 @@ def test_download_remote_layers_api(
assert (
get_resident_physical_size() == post_unlink_size
), "didn't download anything new due to failpoint"
# would be nice to assert that the layers in the layer map are still RemoteLayer
##### Retry, this time without failpoints
client.configure_failpoints(("remote-storage-download-pre-rename", "off"))

View File

@@ -1,481 +0,0 @@
import json
import time
from dataclasses import dataclass
from pathlib import Path
from queue import SimpleQueue
from typing import Any, Dict, Set
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
wait_for_last_flush_lsn,
)
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@pytest.mark.parametrize(
"remote_storage_kind", [RemoteStorageKind.NOOP, RemoteStorageKind.LOCAL_FS]
)
def test_metric_collection(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
remote_storage_kind: RemoteStorageKind,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but that would make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
return Response(status=200)
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr; we will use manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
log.info(f"test_metric_collection endpoint is {metric_collection_endpoint}")
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
# the httpserver is shut down before the pageserver during a passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
# Helper function that gets the number of remote ops of the given kind from the metrics
def get_num_remote_ops(file_kind: str, op_kind: str) -> int:
ps_metrics = env.pageserver.http_client().get_metrics()
total = 0.0
for sample in ps_metrics.query_all(
name="pageserver_remote_operation_seconds_count",
filter={
"file_kind": str(file_kind),
"op_kind": str(op_kind),
},
):
total += sample[2]
return int(total)
remote_uploaded = 0
# upload some data to remote storage
if remote_storage_kind == RemoteStorageKind.LOCAL_FS:
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http = env.pageserver.http_client()
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
pageserver_http.timeline_gc(tenant_id, timeline_id, 10000)
remote_uploaded = get_num_remote_ops("index", "upload")
assert remote_uploaded > 0
# we expect uploads at 1Hz; on busy runners this could be too optimistic,
# so give it 5s. we only want to get the upload that follows the "ready" value.
timeout = 5
# these strings in the upload queue allow synchronizing the uploads
# with the main test execution
uploads.put("ready")
# note that this verifier graph should live across restarts as long as the
# cache file lives
v = MetricsVerifier()
while True:
events = uploads.get(timeout=timeout)
if events == "ready":
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
break
else:
(events, is_last) = events
v.ingest(events, is_last)
if "synthetic_storage_size" not in v.accepted_event_names():
log.info("waiting for synthetic storage size to be calculated and uploaded...")
rounds = 0
while "synthetic_storage_size" not in v.accepted_event_names():
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
rounds += 1
assert rounds < 10, "did not get synthetic_storage_size in 10 uploads"
# once we have it in verifiers, it will assert that future batches will contain it
env.pageserver.stop()
time.sleep(1)
uploads.put("ready")
env.pageserver.start()
while True:
events = uploads.get(timeout=timeout)
if events == "ready":
(events, is_last) = uploads.get(timeout=timeout * 3)
v.ingest(events, is_last)
(events, is_last) = uploads.get(timeout=timeout)
v.ingest(events, is_last)
break
else:
(events, is_last) = events
v.ingest(events, is_last)
httpserver.check()
def test_metric_collection_cleans_up_tempfile(
httpserver: HTTPServer,
neon_env_builder: NeonEnvBuilder,
httpserver_listen_address,
):
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
# this should be Union[str, Tuple[List[Any], bool]], but that would make unpacking much more verbose
uploads: SimpleQueue[Any] = SimpleQueue()
def metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
is_last = request.headers["pageserver-metrics-last-upload-in-batch"]
assert is_last in ["true", "false"]
uploads.put((events, is_last == "true"))
return Response(status=200)
# Require collecting metrics frequently, since we change
# the timeline and want something to be logged about it.
#
# Disable time-based pitr; we will use manual GC calls
# to trigger remote storage operations in a controlled way
neon_env_builder.pageserver_config_override = f"""
metric_collection_interval="1s"
metric_collection_endpoint="{metric_collection_endpoint}"
cached_metric_collection_interval="0s"
synthetic_size_calculation_interval="3s"
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
metrics_handler
)
# spin up neon, after http server is ready
env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"})
pageserver_http = env.pageserver.http_client()
# the httpserver is shut down before the pageserver during a passing run
env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("CREATE TABLE foo (id int, counter int, t text)")
cur.execute(
"""
INSERT INTO foo
SELECT g, 0, 'long string to consume some space' || g
FROM generate_series(1, 100000) g
"""
)
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
# we expect uploads at 1Hz; on busy runners this could be too optimistic,
# so give it 5s. we only want to get the upload that follows the "ready" value.
timeout = 5
# these strings in the upload queue allow synchronizing the uploads
# with the main test execution
uploads.put("ready")
while True:
events = uploads.get(timeout=timeout)
if events == "ready":
(events, _) = uploads.get(timeout=timeout)
break
# should really configure an env?
pageserver_http.configure_failpoints(("before-persist-last-metrics-collected", "exit"))
time.sleep(3)
env.pageserver.stop()
initially = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json")
assert (
len(initially.matching) == 2
), f"expecting actual file and tempfile, but not found: {initially.matching}"
uploads.put("ready")
env.pageserver.start()
while True:
events = uploads.get(timeout=timeout * 3)
if events == "ready":
(events, _) = uploads.get(timeout=timeout)
break
env.pageserver.stop()
later = iterate_pageserver_workdir(env.pageserver.workdir, "last_consumption_metrics.json")
# it is possible we shut down the pageserver at exactly the right time, so the old tempfile
# is gone, but we also have a new one.
only = set(["last_consumption_metrics.json"])
assert (
initially.matching.intersection(later.matching) == only
), "only initial tempfile should had been removed"
assert initially.other.issuperset(later.other), "no other files should had been removed"
@dataclass
class PrefixPartitionedFiles:
matching: Set[str]
other: Set[str]
def iterate_pageserver_workdir(path: Path, prefix: str) -> PrefixPartitionedFiles:
"""
Iterates the files in the workdir, returns two sets:
- files with the prefix
- files without the prefix
"""
matching = set()
other = set()
for entry in path.iterdir():
if not entry.is_file():
continue
if not entry.name.startswith(prefix):
other.add(entry.name)
else:
matching.add(entry.name)
return PrefixPartitionedFiles(matching, other)
class MetricsVerifier:
"""
A graph of per-tenant, per-timeline verifiers, with one verifier for each
metric
"""
def __init__(self):
self.tenants: Dict[TenantId, TenantMetricsVerifier] = {}
pass
def ingest(self, events, is_last):
stringified = json.dumps(events, indent=2)
log.info(f"ingesting: {stringified}")
for event in events:
id = TenantId(event["tenant_id"])
if id not in self.tenants:
self.tenants[id] = TenantMetricsVerifier(id)
self.tenants[id].ingest(event)
if is_last:
for t in self.tenants.values():
t.post_batch()
def accepted_event_names(self) -> Set[str]:
names: Set[str] = set()
for t in self.tenants.values():
names = names.union(t.accepted_event_names())
return names
class TenantMetricsVerifier:
def __init__(self, id: TenantId):
self.id = id
self.timelines: Dict[TimelineId, TimelineMetricsVerifier] = {}
self.state: Dict[str, Any] = {}
def ingest(self, event):
assert TenantId(event["tenant_id"]) == self.id
if "timeline_id" in event:
id = TimelineId(event["timeline_id"])
if id not in self.timelines:
self.timelines[id] = TimelineMetricsVerifier(self.id, id)
self.timelines[id].ingest(event)
else:
name = event["metric"]
if name not in self.state:
self.state[name] = PER_METRIC_VERIFIERS[name]()
self.state[name].ingest(event, self)
def post_batch(self):
for v in self.state.values():
v.post_batch(self)
for tl in self.timelines.values():
tl.post_batch(self)
def accepted_event_names(self) -> Set[str]:
names = set(self.state.keys())
for t in self.timelines.values():
names = names.union(t.accepted_event_names())
return names
class TimelineMetricsVerifier:
def __init__(self, tenant_id: TenantId, timeline_id: TimelineId):
self.id = timeline_id
self.state: Dict[str, Any] = {}
def ingest(self, event):
name = event["metric"]
if name not in self.state:
self.state[name] = PER_METRIC_VERIFIERS[name]()
self.state[name].ingest(event, self)
def post_batch(self, parent):
for v in self.state.values():
v.post_batch(self)
def accepted_event_names(self) -> Set[str]:
return set(self.state.keys())
class CannotVerifyAnything:
"""We can only assert types, but rust already has types, so no need."""
def __init__(self):
pass
def ingest(self, event, parent):
pass
def post_batch(self, parent):
pass
class WrittenDataVerifier:
def __init__(self):
self.values = []
pass
def ingest(self, event, parent):
self.values.append(event["value"])
def post_batch(self, parent):
pass
class WrittenDataDeltaVerifier:
def __init__(self):
self.value = None
self.sum = 0
self.timerange = None
pass
def ingest(self, event, parent):
assert event["type"] == "incremental"
self.value = event["value"]
self.sum += event["value"]
start = event["start_time"]
stop = event["stop_time"]
timerange = (start, stop)
if self.timerange is not None:
# this holds across restarts
assert self.timerange[1] == timerange[0], "time ranges should be continuous"
self.timerange = timerange
def post_batch(self, parent):
absolute = parent.state["written_size"]
if len(absolute.values) == 1:
# in tests this comes up as initdb execution, so we can have 0 or
# about 30MB on the first event. it is not consistent.
assert self.value is not None
else:
assert self.value == absolute.values[-1] - absolute.values[-2]
# sounds like this should hold, but it will not for branches -- probably related to timing
# assert self.sum == absolute.latest
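# Hypothetical numbers for the invariant checked in post_batch above: if consecutive
# absolute written_size values were 100 and 130, the incremental
# written_data_bytes_delta event covering the same interval is expected to carry
# value 130 - 100 == 30.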
class SyntheticSizeVerifier:
def __init__(self):
self.prev = None
self.value = None
pass
def ingest(self, event, parent):
assert isinstance(parent, TenantMetricsVerifier)
assert event["type"] == "absolute"
value = event["value"]
self.value = value
def post_batch(self, parent):
if self.prev is not None:
# this is assuming no one goes and deletes the cache file
assert (
self.value is not None
), "after calculating first synthetic size, cached or more recent should be sent"
self.prev = self.value
self.value = None
PER_METRIC_VERIFIERS = {
"remote_storage_size": CannotVerifyAnything,
"resident_size": CannotVerifyAnything,
"written_size": WrittenDataVerifier,
"written_data_bytes_delta": WrittenDataDeltaVerifier,
"timeline_logical_size": CannotVerifyAnything,
"synthetic_storage_size": SyntheticSizeVerifier,
}
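# A minimal, hand-written sketch (not used by the tests) of driving the verifier
# graph above. The event below is hypothetical: it only carries the fields the
# verifiers read, and the tenant id is an arbitrary 32-character hex string that
# TenantId is assumed to accept; the real test feeds in the batches received by
# the mock http server.
def example_drive_metrics_verifier():
    v = MetricsVerifier()
    batch = [
        {
            "tenant_id": "3fa85f64b1d94c6eb9f1c9d9cbb0a000",  # hypothetical tenant id
            "metric": "synthetic_storage_size",
            "type": "absolute",  # tenant-scoped absolute metric, handled by SyntheticSizeVerifier
            "value": 1234,
        }
    ]
    # is_last=True marks the final upload in the batch and triggers post_batch() checks
    v.ingest(batch, is_last=True)
    assert "synthetic_storage_size" in v.accepted_event_names()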

View File

@@ -346,13 +346,23 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
def get_pid(status: int, pw: str) -> Any:
return static_proxy.http_query(
"SELECT pid FROM pg_stat_activity WHERE state = 'active'",
[],
user="http_auth",
password=pw,
expected_code=status,
connstr = (
f"postgresql://http_auth:{pw}@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
)
response = requests.post(
f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
data=json.dumps(
{"query": "SELECT pid FROM pg_stat_activity WHERE state = 'active'", "params": []}
),
headers={
"Content-Type": "application/sql",
"Neon-Connection-String": connstr,
"Neon-Pool-Opt-In": "true",
},
verify=str(static_proxy.test_output_dir / "proxy.crt"),
)
assert response.status_code == status
return response.json()
pid1 = get_pid(200, "http")["rows"][0]["pid"]
@@ -377,23 +387,3 @@ def test_sql_over_http_pool(static_proxy: NeonProxy):
# old password should not work
res = get_pid(400, "http")
assert "password authentication failed for user" in res["message"]
# Beginning a transaction should not impact the next query,
# which might come from a completely different client.
@pytest.mark.xfail(reason="not implemented")
def test_http_pool_begin(static_proxy: NeonProxy):
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
def query(status: int, query: str, *args) -> Any:
static_proxy.http_query(
query,
args,
user="http_auth",
password="http",
expected_code=status,
)
query(200, "BEGIN;")
query(400, "garbage-lol(&(&(&(&") # Intentional error to break the transaction
query(200, "SELECT 1;") # Query that should succeed regardless of the transaction

View File

@@ -1,113 +0,0 @@
from pathlib import Path
from typing import Iterator
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
PSQL,
NeonProxy,
VanillaPostgres,
)
from fixtures.port_distributor import PortDistributor
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
def proxy_metrics_handler(request: Request) -> Response:
if request.json is None:
return Response(status=400)
events = request.json["events"]
log.info("received events:")
log.info(events)
# perform basic sanity checks
for event in events:
assert event["metric"] == "proxy_io_bytes_per_client"
assert event["endpoint_id"] == "test_endpoint_id"
assert event["value"] >= 0
assert event["stop_time"] >= event["start_time"]
return Response(status=200)
@pytest.fixture(scope="function")
def proxy_with_metric_collector(
port_distributor: PortDistributor,
neon_binpath: Path,
httpserver_listen_address,
test_output_dir: Path,
) -> Iterator[NeonProxy]:
"""Neon proxy that routes through link auth and has metric collection enabled."""
http_port = port_distributor.get_port()
proxy_port = port_distributor.get_port()
mgmt_port = port_distributor.get_port()
external_http_port = port_distributor.get_port()
(host, port) = httpserver_listen_address
metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events"
metric_collection_interval = "5s"
with NeonProxy(
neon_binpath=neon_binpath,
test_output_dir=test_output_dir,
proxy_port=proxy_port,
http_port=http_port,
mgmt_port=mgmt_port,
external_http_port=external_http_port,
metric_collection_endpoint=metric_collection_endpoint,
metric_collection_interval=metric_collection_interval,
auth_backend=NeonProxy.Link(),
) as proxy:
proxy.start()
yield proxy
@pytest.mark.asyncio
async def test_proxy_metric_collection(
httpserver: HTTPServer,
proxy_with_metric_collector: NeonProxy,
vanilla_pg: VanillaPostgres,
):
# mock http server that returns OK for the metrics
httpserver.expect_request("/billing/api/v1/usage_events", method="POST").respond_with_handler(
proxy_metrics_handler
)
# generate some load so that metrics are produced;
# sleep for 5 seconds to give the metric collector time to collect them
psql = await PSQL(
host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port
).run(
"create table tbl as select * from generate_series(0,1000); select pg_sleep(5); select 42"
)
base_uri = proxy_with_metric_collector.link_auth_uri
link = await NeonProxy.find_auth_link(base_uri, psql)
psql_session_id = NeonProxy.get_session_id(base_uri, link)
await NeonProxy.activate_link_auth(vanilla_pg, proxy_with_metric_collector, psql_session_id)
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
# generate some load so that metrics are produced;
# sleep for 5 seconds to give the metric collector time to collect them
psql = await PSQL(
host=proxy_with_metric_collector.host, port=proxy_with_metric_collector.proxy_port
).run("insert into tbl select * from generate_series(0,1000); select pg_sleep(5); select 42")
link = await NeonProxy.find_auth_link(base_uri, psql)
psql_session_id = NeonProxy.get_session_id(base_uri, link)
await NeonProxy.activate_link_auth(
vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=False
)
assert psql.stdout is not None
out = (await psql.stdout.read()).decode("utf-8").strip()
assert out == "42"
httpserver.check()

View File

@@ -6,6 +6,7 @@ import queue
import shutil
import threading
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pytest
@@ -136,7 +137,7 @@ def test_remote_storage_backup_and_restore(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -352,7 +353,7 @@ def test_remote_storage_upload_queue_retries(
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -487,7 +488,7 @@ def test_remote_timeline_client_calls_started_metric(
env.pageserver.stop(immediate=True)
env.endpoints.stop_all()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -532,7 +533,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_path = env.timeline_dir(tenant_id, timeline_id)
client = env.pageserver.http_client()
@@ -703,9 +704,7 @@ def test_empty_branch_remote_storage_upload_on_restart(
# index upload is now hitting the failpoint, it should block the shutdown
env.pageserver.stop(immediate=True)
local_metadata = (
env.pageserver.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
)
local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
assert local_metadata.is_file()
assert isinstance(env.pageserver_remote_storage, LocalFsStorage)

View File

@@ -299,7 +299,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder):
# tenant is created with defaults, as in without config file
(tenant_id, timeline_id) = env.neon_cli.create_tenant()
config_path = env.pageserver.tenant_dir(tenant_id) / "config"
config_path = env.pageserver.workdir / "tenants" / str(tenant_id) / "config"
assert config_path.exists(), "config file is always initially created"
http_client = env.pageserver.http_client()

View File

@@ -89,7 +89,7 @@ def test_tenant_delete_smoke(
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
tenant_path = env.pageserver.tenant_dir(tenant_id)
tenant_path = env.tenant_dir(tenant_id=tenant_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():
@@ -269,7 +269,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
tenant_dir = env.pageserver.tenant_dir(tenant_id)
tenant_dir = env.tenant_dir(tenant_id)
# Check local is empty
assert not tenant_dir.exists()
@@ -366,7 +366,7 @@ def test_tenant_delete_is_resumed_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = env.pageserver.workdir / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -379,7 +379,7 @@ def test_tenant_delete_is_resumed_on_attach(
wait_tenant_status_404(ps_http, tenant_id, iterations)
# we shouldn't have created the tenant dir on disk
tenant_path = env.pageserver.tenant_dir(tenant_id)
tenant_path = env.tenant_dir(tenant_id=tenant_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():

View File

@@ -119,6 +119,65 @@ def test_tenant_reattach(
num_connections = 10
num_rows = 100000
updates_to_perform = 0
updates_started = 0
updates_finished = 0
# Run random UPDATEs on test table. On failure, try again.
async def update_table(pg_conn: asyncpg.Connection):
global updates_started, updates_finished, updates_to_perform
while updates_started < updates_to_perform or updates_to_perform == 0:
updates_started += 1
id = random.randrange(1, num_rows)
# Loop to retry until the UPDATE succeeds
while True:
try:
await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
updates_finished += 1
if updates_finished % 1000 == 0:
log.info(f"update {updates_finished} / {updates_to_perform}")
break
except asyncpg.PostgresError as e:
# Received error from Postgres. Log it, sleep a little, and continue
log.info(f"UPDATE error: {e}")
await asyncio.sleep(0.1)
async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
global updates_started, updates_finished, updates_to_perform
# Wait until we have performed some updates
wait_until(20, 0.5, lambda: updates_finished > 500)
log.info("Detaching tenant")
pageserver_http.tenant_detach(tenant_id)
await asyncio.sleep(1)
log.info("Re-attaching tenant")
pageserver_http.tenant_attach(tenant_id)
log.info("Re-attach finished")
# Continue with 5000 more updates
updates_to_perform = updates_started + 5000
# async guts of the test_tenant_reattach_while_busy test
async def reattach_while_busy(
env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId
):
workers = []
for _ in range(num_connections):
pg_conn = await endpoint.connect_async()
workers.append(asyncio.create_task(update_table(pg_conn)))
workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
await asyncio.gather(*workers)
assert updates_finished == updates_to_perform
# Detach and re-attach tenant, while compute is busy running queries.
#
@@ -167,62 +226,6 @@ def test_tenant_reattach_while_busy(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
):
updates_started = 0
updates_finished = 0
updates_to_perform = 0
# Run random UPDATEs on test table. On failure, try again.
async def update_table(pg_conn: asyncpg.Connection):
nonlocal updates_started, updates_finished, updates_to_perform
while updates_started < updates_to_perform or updates_to_perform == 0:
updates_started += 1
id = random.randrange(1, num_rows)
# Loop to retry until the UPDATE succeeds
while True:
try:
await pg_conn.fetchrow(f"UPDATE t SET counter = counter + 1 WHERE id = {id}")
updates_finished += 1
if updates_finished % 1000 == 0:
log.info(f"update {updates_finished} / {updates_to_perform}")
break
except asyncpg.PostgresError as e:
# Received error from Postgres. Log it, sleep a little, and continue
log.info(f"UPDATE error: {e}")
await asyncio.sleep(0.1)
async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
nonlocal updates_started, updates_finished, updates_to_perform
# Wait until we have performed some updates
wait_until(20, 0.5, lambda: updates_finished > 500)
log.info("Detaching tenant")
pageserver_http.tenant_detach(tenant_id)
await asyncio.sleep(1)
log.info("Re-attaching tenant")
pageserver_http.tenant_attach(tenant_id)
log.info("Re-attach finished")
# Continue with 5000 more updates
updates_to_perform = updates_started + 5000
# async guts of the test_tenant_reattach_while_busy test
async def reattach_while_busy(
env: NeonEnv, endpoint: Endpoint, pageserver_http: PageserverHttpClient, tenant_id: TenantId
):
nonlocal updates_to_perform, updates_finished
workers = []
for _ in range(num_connections):
pg_conn = await endpoint.connect_async()
workers.append(asyncio.create_task(update_table(pg_conn)))
workers.append(asyncio.create_task(sleep_and_reattach(pageserver_http, tenant_id)))
await asyncio.gather(*workers)
assert updates_finished == updates_to_perform
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start()
@@ -286,7 +289,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -329,7 +332,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
log.info("gc thread returned")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
with pytest.raises(
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
@@ -354,7 +357,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -383,7 +386,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
log.info("ignored tenant detached without error")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
@@ -410,7 +413,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
@@ -427,7 +430,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
log.info("regular tenant detached without error")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
@@ -528,7 +531,7 @@ def test_ignored_tenant_reattach(
pageserver_http = env.pageserver.http_client()
ignored_tenant_id, _ = env.neon_cli.create_tenant()
tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
tenant_dir = env.pageserver.workdir / "tenants" / str(ignored_tenant_id)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_ignore.sort()
timelines_before_ignore = [
@@ -619,7 +622,7 @@ def test_ignored_tenant_download_missing_layers(
# ignore the tenant and remove its layers
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
layers_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name.startswith("00000"):
@@ -672,7 +675,7 @@ def test_ignored_tenant_stays_broken_without_metadata(
# ignore the tenant and remove its metadata
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
metadata_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name == "metadata":

View File

@@ -216,7 +216,7 @@ def switch_pg_to_new_pageserver(
endpoint.start()
timeline_to_detach_local_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_to_detach_local_path = env.timeline_dir(tenant_id, timeline_id)
files_before_detach = os.listdir(timeline_to_detach_local_path)
assert (
"metadata" in files_before_detach
@@ -561,7 +561,7 @@ def test_emergency_relocate_with_branches_slow_replay(
# simpler than initializing a new one from scratch, but the effect on the single tenant
# is the same.
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
shutil.rmtree(env.pageserver.workdir / "tenants" / str(tenant_id))
env.pageserver.start()
# This fail point will pause the WAL ingestion on the main branch, after the
@@ -709,7 +709,7 @@ def test_emergency_relocate_with_branches_createdb(
# Kill the pageserver, remove the tenant directory, and restart
env.pageserver.stop(immediate=True)
shutil.rmtree(env.pageserver.tenant_dir(tenant_id))
shutil.rmtree(env.pageserver.workdir / "tenants" / str(tenant_id))
env.pageserver.start()
# Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original

View File

@@ -27,7 +27,7 @@ from prometheus_client.samples import Sample
def test_tenant_creation_fails(neon_simple_env: NeonEnv):
tenants_dir = neon_simple_env.pageserver.tenant_dir()
tenants_dir = Path(neon_simple_env.pageserver.workdir) / "tenants"
initial_tenants = sorted(
map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines())
)
@@ -320,7 +320,13 @@ def test_pageserver_with_empty_tenants(
)
files_in_timelines_dir = sum(
1 for _p in Path.iterdir(env.pageserver.timeline_dir(tenant_with_empty_timelines))
1
for _p in Path.iterdir(
Path(env.pageserver.workdir)
/ "tenants"
/ str(tenant_with_empty_timelines)
/ "timelines"
)
)
assert (
files_in_timelines_dir == 0
@@ -331,7 +337,9 @@ def test_pageserver_with_empty_tenants(
env.pageserver.stop()
tenant_without_timelines_dir = env.initial_tenant
shutil.rmtree(env.pageserver.timeline_dir(tenant_without_timelines_dir))
shutil.rmtree(
Path(env.pageserver.workdir) / "tenants" / str(tenant_without_timelines_dir) / "timelines"
)
env.pageserver.start()

View File

@@ -179,7 +179,9 @@ def test_tenants_attached_after_download(
env.pageserver.stop()
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = (
Path(env.pageserver.workdir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
)
local_layer_deleted = False
for path in Path.iterdir(timeline_dir):
if path.name.startswith("00000"):
@@ -257,7 +259,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
env.endpoints.stop_all()
env.pageserver.stop()
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = env.timeline_dir(tenant_id, timeline_id)
local_layer_truncated = None
for path in Path.iterdir(timeline_dir):
if path.name.startswith("00000"):

View File

@@ -32,9 +32,7 @@ def test_threshold_based_eviction(
synthetic_size_calculation_interval="2s"
metric_collection_endpoint="http://{host}:{port}/nonexistent"
"""
metrics_refused_log_line = (
".*metrics_collection:.* upload consumption_metrics (still failed|failed, will retry).*"
)
metrics_refused_log_line = ".*metrics endpoint refused the sent metrics.*/nonexistent.*"
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(metrics_refused_log_line)

View File

@@ -3,6 +3,7 @@ import os
import queue
import shutil
import threading
from pathlib import Path
import pytest
import requests
@@ -71,7 +72,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
"test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
)
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id)
timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(parent_timeline_id)
)
with pytest.raises(
PageserverApiException, match="Cannot delete timeline which has child timelines"
@@ -82,7 +89,13 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
assert exc.value.status_code == 412
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(leaf_timeline_id)
)
assert timeline_path.exists()
# retry deletes when compaction or gc is running in pageserver
@@ -323,7 +336,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
),
)
timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id)
# Check local is empty
assert not timeline_dir.exists()
# Check no delete mark present
@@ -403,7 +416,7 @@ def test_timeline_resurrection_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -454,7 +467,13 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
"test_timeline_delete_fail_before_local_delete",
)
leaf_timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id)
leaf_timeline_path = (
env.pageserver.workdir
/ "tenants"
/ str(env.initial_tenant)
/ "timelines"
/ str(leaf_timeline_id)
)
ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
timeline_info = wait_until_timeline_state(
@@ -902,7 +921,7 @@ def test_timeline_delete_resumed_on_attach(
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
dir_to_clear = Path(env.pageserver.workdir) / "tenants"
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
@@ -914,7 +933,7 @@ def test_timeline_delete_resumed_on_attach(
# delete should be resumed
wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations)
tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
tenant_path = env.timeline_dir(tenant_id=tenant_id, timeline_id=timeline_id)
assert not tenant_path.exists()
if remote_storage_kind in available_s3_storages():

View File

@@ -518,7 +518,7 @@ def test_timeline_size_metrics(
).value
# assert that the physical size metric matches the actual physical size on disk
timeline_path = env.pageserver.timeline_dir(env.initial_tenant, new_timeline_id)
timeline_path = env.timeline_dir(env.initial_tenant, new_timeline_id)
assert tl_physical_size_metric == get_timeline_dir_size(timeline_path)
# Check that the logical size metric is sane, and matches
@@ -658,7 +658,7 @@ def get_physical_size_values(
)
res.api_current_physical = detail["current_physical_size"]
timeline_path = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_path = env.timeline_dir(tenant_id, timeline_id)
res.python_timelinedir_layerfiles_physical = get_timeline_dir_size(timeline_path)
return res

View File

@@ -19,40 +19,18 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
# Install extension containing function needed for test
cur.execute("CREATE EXTENSION neon_test_utils")
# Create a test table for a few different scenarios and freeze it to set the VM bits.
# Create a test table and freeze it to set the VM bit.
cur.execute("CREATE TABLE vmtest_delete (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_delete VALUES (1)")
cur.execute("VACUUM FREEZE vmtest_delete")
cur.execute("CREATE TABLE vmtest_hot_update (id integer PRIMARY KEY, filler text)")
cur.execute("INSERT INTO vmtest_hot_update VALUES (1, 'x')")
cur.execute("VACUUM FREEZE vmtest_hot_update")
cur.execute("CREATE TABLE vmtest_cold_update (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_cold_update SELECT g FROM generate_series(1, 1000) g")
cur.execute("VACUUM FREEZE vmtest_cold_update")
cur.execute(
"CREATE TABLE vmtest_cold_update2 (id integer PRIMARY KEY, filler text) WITH (fillfactor=100)"
)
cur.execute("INSERT INTO vmtest_cold_update2 SELECT g, '' FROM generate_series(1, 1000) g")
cur.execute("VACUUM FREEZE vmtest_cold_update2")
cur.execute("CREATE TABLE vmtest_update (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g")
cur.execute("VACUUM FREEZE vmtest_update")
# DELETE and UPDATE the rows.
cur.execute("DELETE FROM vmtest_delete WHERE id = 1")
cur.execute("UPDATE vmtest_hot_update SET filler='x' WHERE id = 1")
cur.execute("UPDATE vmtest_cold_update SET id = 5000 WHERE id = 1")
# Clear the VM bit on the last page with an INSERT. Then clear the VM bit on
# the page where row 1 is (block 0), by doing an UPDATE. The UPDATE is a
# cold update, and the new tuple goes to the last page, which already had
# its VM bit cleared. The point is that the UPDATE *only* clears the VM bit
# on the page containing the old tuple. We had a bug where we got the old
# and new pages mixed up, and that only shows up when one of the bits is
# cleared, but not the other one.
cur.execute("INSERT INTO vmtest_cold_update2 VALUES (9999, 'x')")
# Clears the VM bit on the old page
cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1")
cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1")
# Branch at this point, to test that later
fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear")
@@ -72,13 +50,9 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
"""
)
cur.execute("SELECT id FROM vmtest_delete WHERE id = 1")
cur.execute("SELECT * FROM vmtest_delete WHERE id = 1")
assert cur.fetchall() == []
cur.execute("SELECT id FROM vmtest_hot_update WHERE id = 1")
assert cur.fetchall() == [(1,)]
cur.execute("SELECT id FROM vmtest_cold_update WHERE id = 1")
assert cur.fetchall() == []
cur.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1")
cur.execute("SELECT * FROM vmtest_update WHERE id = 1")
assert cur.fetchall() == []
cur.close()
@@ -103,111 +77,7 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv):
"""
)
cur_new.execute("SELECT id FROM vmtest_delete WHERE id = 1")
cur_new.execute("SELECT * FROM vmtest_delete WHERE id = 1")
assert cur_new.fetchall() == []
cur_new.execute("SELECT id FROM vmtest_hot_update WHERE id = 1")
assert cur_new.fetchall() == [(1,)]
cur_new.execute("SELECT id FROM vmtest_cold_update WHERE id = 1")
cur_new.execute("SELECT * FROM vmtest_update WHERE id = 1")
assert cur_new.fetchall() == []
cur_new.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1")
assert cur_new.fetchall() == []
#
# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK
# record.
#
def test_vm_bit_clear_on_heap_lock(neon_simple_env: NeonEnv):
env = neon_simple_env
env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock", "empty")
endpoint = env.endpoints.create_start(
"test_vm_bit_clear_on_heap_lock",
config_lines=[
"log_autovacuum_min_duration = 0",
# Perform anti-wraparound vacuuming aggressively
"autovacuum_naptime='1 s'",
"autovacuum_freeze_max_age = 1000000",
],
)
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
# Install extension containing function needed for test
cur.execute("CREATE EXTENSION neon_test_utils")
cur.execute("SELECT pg_switch_wal()")
# Create a test table and freeze it to set the all-frozen VM bit on all pages.
cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)")
cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g")
cur.execute("VACUUM FREEZE vmtest_lock")
# Lock a row. This clears the all-frozen VM bit for that page.
cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE")
# Remember the XID. We will use it later to verify that we have consumed a lot of
# XIDs after this.
cur.execute("select pg_current_xact_id()")
locking_xid = cur.fetchall()[0][0]
# Stop and restart postgres, to clear the buffer cache.
#
# NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages
# in a "clean" way. Our neon extension will write a full-page image of the VM
# page, and we want to avoid that.
endpoint.stop()
endpoint.start()
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
tup = cur.fetchall()
xmax_before = tup[0][1]
# Consume a lot of XIDs, so that anti-wraparound autovacuum kicks
# in and the clog gets truncated. We set autovacuum_freeze_max_age to a very
# low value, so it doesn't take all that many XIDs for autovacuum to kick in.
for i in range(1000):
cur.execute(
"""
CREATE TEMP TABLE othertable (i int) ON COMMIT DROP;
do $$
begin
for i in 1..100000 loop
-- Use a begin-exception block to generate a new subtransaction on each iteration
begin
insert into othertable values (i);
exception when others then
raise 'not expected %', sqlerrm;
end;
end loop;
end;
$$;
"""
)
cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 ")
tup = cur.fetchall()
log.info(f"tuple = {tup}")
xmax = tup[0][1]
assert xmax == xmax_before
if i % 50 == 0:
cur.execute("select datfrozenxid from pg_database where datname='postgres'")
datfrozenxid = cur.fetchall()[0][0]
if datfrozenxid > locking_xid:
break
cur.execute("select pg_current_xact_id()")
curr_xid = cur.fetchall()[0][0]
assert int(curr_xid) - int(locking_xid) >= 100000
# Now, if the VM all-frozen bit was not correctly cleared on
# replay, we will try to fetch the status of the XID that was
# already truncated away.
#
# ERROR: could not access status of transaction 1027
cur.execute("select xmin, xmax, * from vmtest_lock where id = 40000 for update")
tup = cur.fetchall()
log.info(f"tuple = {tup}")

View File

@@ -14,8 +14,6 @@ from pathlib import Path
from typing import Any, List, Optional
import psycopg2
import psycopg2.errors
import psycopg2.extras
import pytest
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
@@ -262,7 +260,7 @@ def test_restarts(neon_env_builder: NeonEnvBuilder):
else:
failed_node.start()
failed_node = None
assert query_scalar(cur, "SELECT sum(key) FROM t") == (n_inserts * (n_inserts + 1)) // 2
assert query_scalar(cur, "SELECT sum(key) FROM t") == 500500
# Test that safekeepers push their info to the broker and learn peer status from it

View File

@@ -43,7 +43,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
tenant_id, _ = env.neon_cli.create_tenant()
# assert tenant exists on disk
assert (env.pageserver.tenant_dir(tenant_id)).exists()
assert (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -101,7 +101,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
pytest.fail(f"could not detach tenant: {last_error}")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
assert not (env.pageserver.workdir / "tenants" / str(tenant_id)).exists()
# Pageserver schedules kill+wait of the WAL redo process to the background runtime,
# asynchronously to tenant detach. Cut it some slack to complete kill+wait before