Compare commits

..

1 Commits

Author SHA1 Message Date
BodoBolero
87e59ab82b add sudo package to build-tools image 2025-05-16 17:13:15 +02:00
64 changed files with 655 additions and 2168 deletions

View File

@@ -963,7 +963,7 @@ jobs:
fi
- name: Verify docker-compose example and test extensions
timeout-minutes: 60
timeout-minutes: 20
env:
TAG: >-
${{

62
Cargo.lock generated
View File

@@ -3794,16 +3794,6 @@ version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]]
name = "neon-shmem"
version = "0.1.0"
dependencies = [
"nix 0.30.1",
"tempfile",
"thiserror 1.0.69",
"workspace_hack",
]
[[package]]
name = "never-say-never"
version = "6.6.666"
@@ -3898,16 +3888,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -4192,12 +4172,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -4302,7 +4276,6 @@ dependencies = [
"enumset",
"fail",
"futures",
"hashlink",
"hex",
"hex-literal",
"http-utils",
@@ -4451,16 +4424,6 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "pageserver_page_api"
version = "0.1.0"
dependencies = [
"prost 0.13.3",
"tonic",
"tonic-build",
"workspace_hack",
]
[[package]]
name = "papaya"
version = "0.2.1"
@@ -5255,7 +5218,6 @@ dependencies = [
"tracing-log",
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-test",
"tracing-utils",
"try-lock",
"typed-json",
@@ -7706,7 +7668,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -7720,27 +7681,6 @@ dependencies = [
"tracing-serde",
]
[[package]]
name = "tracing-test"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "557b891436fe0d5e0e363427fc7f217abf9ccd510d5136549847bdcbcd011d68"
dependencies = [
"tracing-core",
"tracing-subscriber",
"tracing-test-macro",
]
[[package]]
name = "tracing-test-macro"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04659ddb06c87d233c566112c1c9c5b9e98256d9af50ec3bc9c8327f873a7568"
dependencies = [
"quote",
"syn 2.0.100",
]
[[package]]
name = "tracing-utils"
version = "0.1.0"
@@ -8542,7 +8482,6 @@ dependencies = [
"log",
"memchr",
"nix 0.26.4",
"nix 0.30.1",
"nom",
"num",
"num-bigint",
@@ -8593,7 +8532,6 @@ dependencies = [
"tracing",
"tracing-core",
"tracing-log",
"tracing-subscriber",
"url",
"uuid",
"zeroize",

View File

@@ -9,7 +9,6 @@ members = [
"pageserver/ctl",
"pageserver/client",
"pageserver/pagebench",
"pageserver/page_api",
"proxy",
"safekeeper",
"safekeeper/client",
@@ -24,7 +23,6 @@ members = [
"libs/postgres_ffi",
"libs/safekeeper_api",
"libs/desim",
"libs/neon-shmem",
"libs/utils",
"libs/consumption_metrics",
"libs/postgres_backend",
@@ -129,7 +127,7 @@ md5 = "0.7.0"
measured = { version = "0.0.22", features=["lasso"] }
measured-process = { version = "0.0.22" }
memoffset = "0.9"
nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] }
nix = { version = "0.30.1", features = ["dir", "fs", "process", "socket", "signal", "poll"] }
# Do not update to >= 7.0.0, at least. The update will have a significant impact
# on compute startup metrics (start_postgres_ms), >= 25% degradation.
notify = "6.0.0"
@@ -253,7 +251,6 @@ pageserver = { path = "./pageserver" }
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
pageserver_client = { path = "./pageserver/client" }
pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
pageserver_page_api = { path = "./pageserver/page_api" }
postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }

View File

@@ -144,6 +144,7 @@ RUN set -e \
openssh-client \
parallel \
pkg-config \
sudo \
unzip \
wget \
xz-utils \

View File

@@ -1439,38 +1439,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control
#########################################################################################
# Layer "pg_tracing"
# compile pg_tracing extension
#
#########################################################################################
FROM build-deps AS pg_tracing-src
ARG PG_VERSION
WORKDIR /ext-src
RUN case "${PG_VERSION:?}" in \
"v14" | "v15") \
echo "pg_tracing not supported on this PostgreSQL version." && exit 0 \
;; \
*) \
;; \
esac && \
wget https://github.com/DataDog/pg_tracing/archive/refs/tags/v0.1.3.tar.gz -O pg_tracing.tar.gz && \
echo "d0a7cca7279bb29601ba6c4c1aaeb3a44d71e6afa3b78aae1e3b7269e688f907 pg_tracing.tar.gz" | sha256sum --check && \
mkdir pg_tracing-src && cd pg_tracing-src && tar xzf ../pg_tracing.tar.gz --strip-components=1 -C .
FROM pg-build AS pg_tracing-build
COPY --from=pg_tracing-src /ext-src/ /ext-src/
WORKDIR /ext-src/pg_tracing-src
RUN case "${PG_VERSION:?}" in \
"v14" | "v15") \
echo "pg_tracing not supported on this PostgreSQL version." && exit 0 \
;; \
*) \
;; \
esac && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
#########################################################################################
#
# Layer "pg_mooncake"
@@ -1701,7 +1669,6 @@ COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_tracing-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1878,7 +1845,6 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/
#COPY --from=wal2json-src /ext-src/ /ext-src/
COPY --from=pg_ivm-src /ext-src/ /ext-src/
COPY --from=pg_partman-src /ext-src/ /ext-src/
COPY --from=pg_tracing-src /ext-src/ /ext-src/
#COPY --from=pg_mooncake-src /ext-src/ /ext-src/
COPY --from=pg_repack-src /ext-src/ /ext-src/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/

View File

@@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644
RelationGetRelationName(index));
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(RelationGetSmgr(index));
+ smgr_start_unlogged_build(index->rd_smgr);
+#endif
+
initRumState(&buildstate.rumstate, index);
@@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644
rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
+#endif
+
/*
@@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644
}
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+ smgr_end_unlogged_build(index->rd_smgr);
+#endif
+
/*

View File

@@ -213,10 +213,8 @@ impl Escaping for PgIdent {
// Find the first suitable tag that is not present in the string.
// Postgres' max role/DB name length is 63 bytes, so even in the
// worst case it won't take long. Outer tag is always `tag + "x"`,
// so if `tag` is not present in the string, `outer_tag` is not
// present in the string either.
while self.contains(&tag.to_string()) {
// worst case it won't take long.
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
tag += "x";
outer_tag = tag.clone() + "x";
}

View File

@@ -71,14 +71,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
("name$$$", ("$x$name$$$$x$", "xx")),
("name$$$$", ("$x$name$$$$$x$", "xx")),
("name$x$", ("$xx$name$x$$xx$", "xxx")),
("x", ("$xx$x$xx$", "xxx")),
("xx", ("$xxx$xx$xxx$", "xxxx")),
("$x", ("$xx$$x$xx$", "xxx")),
("x$", ("$xx$x$$xx$", "xxx")),
("$x$", ("$xx$$x$$xx$", "xxx")),
("xx$", ("$xxx$xx$$xxx$", "xxxx")),
("$xx", ("$xxx$$xx$xxx$", "xxxx")),
("$xx$", ("$xxx$$xx$$xxx$", "xxxx")),
];
for (input, expected) in test_cases {

View File

@@ -546,11 +546,6 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("Falied to parse 'sampling_ratio'")?,
relsize_snapshot_cache_capacity: settings
.remove("relsize snapshot cache capacity")
.map(|x| x.parse::<usize>())
.transpose()
.context("Falied to parse 'relsize_snapshot_cache_capacity' as integer")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")

View File

@@ -462,8 +462,6 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
if var(REAL_S3_ENV).is_ok() {
assert!(body.contains("remote_storage_s3_deleted_objects_total"));
}
#[cfg(target_os = "linux")]
assert!(body.contains("process_threads"));
}

View File

@@ -1,13 +0,0 @@
[package]
name = "neon-shmem"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
thiserror.workspace = true
nix.workspace=true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[target.'cfg(target_os = "macos")'.dependencies]
tempfile = "3.14.0"

View File

@@ -1,418 +0,0 @@
//! Shared memory utilities for neon communicator
use std::num::NonZeroUsize;
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};
use nix::errno::Errno;
use nix::sys::mman::MapFlags;
use nix::sys::mman::ProtFlags;
use nix::sys::mman::mmap as nix_mmap;
use nix::sys::mman::munmap as nix_munmap;
use nix::unistd::ftruncate as nix_ftruncate;
/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
/// specified at creation.
///
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
/// future.
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
max_size: usize,
// Pointer to the beginning of the shared memory area. The header is stored there.
shared_ptr: NonNull<SharedStruct>,
// Pointer to the beginning of the user data
pub data_ptr: NonNull<u8>,
}
/// This is stored at the beginning in the shared memory area.
struct SharedStruct {
max_size: usize,
/// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
current_size: AtomicUsize,
}
const RESIZE_IN_PROGRESS: usize = 1 << 63;
const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
/// Error type returned by the ShmemHandle functions.
#[derive(thiserror::Error, Debug)]
#[error("{msg}: {errno}")]
pub struct Error {
pub msg: String,
pub errno: Errno,
}
impl Error {
fn new(msg: &str, errno: Errno) -> Error {
Error {
msg: msg.to_string(),
errno,
}
}
}
impl ShmemHandle {
/// Create a new shared memory area. To communicate between processes, the processes need to be
/// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
///
/// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
/// processes can continue using it, however.
pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
// create the backing anonymous file.
let fd = create_backing_file(name)?;
Self::new_with_fd(fd, initial_size, max_size)
}
fn new_with_fd(
fd: OwnedFd,
initial_size: usize,
max_size: usize,
) -> Result<ShmemHandle, Error> {
// We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
// is a little larger than this because of the SharedStruct header. Make the upper limit
// somewhat smaller than that, because with anything close to that, you'll run out of
// memory anyway.
if max_size >= 1 << 48 {
panic!("max size {} too large", max_size);
}
if initial_size > max_size {
panic!("initial size {initial_size} larger than max size {max_size}");
}
// The actual initial / max size is the one given by the caller, plus the size of
// 'SharedStruct'.
let initial_size = HEADER_SIZE + initial_size;
let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
// Reserve address space for it with mmap
//
// TODO: Use MAP_HUGETLB if possible
let start_ptr = unsafe {
nix_mmap(
None,
max_size,
ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
MapFlags::MAP_SHARED,
&fd,
0,
)
}
.map_err(|e| Error::new("mmap failed: {e}", e))?;
// Reserve space for the initial size
enlarge_file(fd.as_fd(), initial_size as u64)?;
// Initialize the header
let shared: NonNull<SharedStruct> = start_ptr.cast();
unsafe {
shared.write(SharedStruct {
max_size: max_size.into(),
current_size: AtomicUsize::new(initial_size),
})
};
// The user data begins after the header
let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
Ok(ShmemHandle {
fd,
max_size: max_size.into(),
shared_ptr: shared,
data_ptr,
})
}
// return reference to the header
fn shared(&self) -> &SharedStruct {
unsafe { self.shared_ptr.as_ref() }
}
/// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
/// when creating the area.
///
/// This may only be called from one process/thread concurrently. We detect that case
/// and return an Error.
pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
let new_size = new_size + HEADER_SIZE;
let shared = self.shared();
if new_size > self.max_size {
panic!(
"new size ({} is greater than max size ({})",
new_size, self.max_size
);
}
assert_eq!(self.max_size, shared.max_size);
// Lock the area by setting the bit in 'current_size'
//
// Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
// and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
// since this is not performance-critical, better safe than sorry .
let mut old_size = shared.current_size.load(Ordering::Acquire);
loop {
if (old_size & RESIZE_IN_PROGRESS) != 0 {
return Err(Error::new(
"concurrent resize detected",
Errno::UnknownErrno,
));
}
match shared.current_size.compare_exchange(
old_size,
new_size,
Ordering::Acquire,
Ordering::Relaxed,
) {
Ok(_) => break,
Err(x) => old_size = x,
}
}
// Ok, we got the lock.
//
// NB: If anything goes wrong, we *must* clear the bit!
let result = {
use std::cmp::Ordering::{Equal, Greater, Less};
match new_size.cmp(&old_size) {
Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
}),
Equal => Ok(()),
Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
}
};
// Unlock
shared.current_size.store(
if result.is_ok() { new_size } else { old_size },
Ordering::Release,
);
result
}
/// Returns the current user-visible size of the shared memory segment.
///
/// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
/// responsibility not to access the area beyond the current size.
pub fn current_size(&self) -> usize {
let total_current_size =
self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
total_current_size - HEADER_SIZE
}
}
impl Drop for ShmemHandle {
fn drop(&mut self) {
// SAFETY: The pointer was obtained from mmap() with the given size.
// We unmap the entire region.
let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
// The fd is dropped automatically by OwnedFd.
}
}
/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
/// development and testing, but in production we want the file to stay in memory.
///
/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
#[allow(unused_variables)]
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
#[cfg(not(target_os = "macos"))]
{
nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
.map_err(|e| Error::new("memfd_create failed: {e}", e))
}
#[cfg(target_os = "macos")]
{
let file = tempfile::tempfile().map_err(|e| {
Error::new(
"could not create temporary file to back shmem area: {e}",
nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
)
})?;
Ok(OwnedFd::from(file))
}
}
fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
// Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
// we don't get a segfault later when trying to actually use it.
#[cfg(not(target_os = "macos"))]
{
nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
Error::new(
"could not grow shmem segment, posix_fallocate failed: {e}",
e,
)
})
}
// As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
#[cfg(target_os = "macos")]
{
nix::unistd::ftruncate(fd, size as i64)
.map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
}
}
#[cfg(test)]
mod tests {
use super::*;
use nix::unistd::ForkResult;
use std::ops::Range;
/// check that all bytes in given range have the expected value.
fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
for i in range {
let b = unsafe { *(ptr.add(i)) };
assert_eq!(expected, b, "unexpected byte at offset {}", i);
}
}
/// Write 'b' to all bytes in the given range
fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
}
// simple single-process test of growing and shrinking
#[test]
fn test_shmem_resize() -> Result<(), Error> {
let max_size = 1024 * 1024;
let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
assert_eq!(init_struct.current_size(), 0);
// Initial grow
let size1 = 10000;
init_struct.set_size(size1).unwrap();
assert_eq!(init_struct.current_size(), size1);
// Write some data
let data_ptr = init_struct.data_ptr.as_ptr();
write_range(data_ptr, 0xAA, 0..size1);
assert_range(data_ptr, 0xAA, 0..size1);
// Shrink
let size2 = 5000;
init_struct.set_size(size2).unwrap();
assert_eq!(init_struct.current_size(), size2);
// Grow again
let size3 = 20000;
init_struct.set_size(size3).unwrap();
assert_eq!(init_struct.current_size(), size3);
// Try to read it. The area that was shrunk and grown again should read as all zeros now
assert_range(data_ptr, 0xAA, 0..5000);
assert_range(data_ptr, 0, 5000..size1);
// Try to grow beyond max_size
//let size4 = max_size + 1;
//assert!(init_struct.set_size(size4).is_err());
// Dropping init_struct should unmap the memory
drop(init_struct);
Ok(())
}
/// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
/// but is stored in the shared memory area and works across processes. It's implemented by
/// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
struct SimpleBarrier {
num_procs: usize,
count: AtomicUsize,
}
impl SimpleBarrier {
unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
unsafe {
*ptr = SimpleBarrier {
num_procs,
count: AtomicUsize::new(0),
}
}
}
pub fn wait(&self) {
let old = self.count.fetch_add(1, Ordering::Relaxed);
let generation = old / self.num_procs;
let mut current = old + 1;
while current < (generation + 1) * self.num_procs {
std::thread::sleep(std::time::Duration::from_millis(10));
current = self.count.load(Ordering::Relaxed);
}
}
}
#[test]
fn test_multi_process() {
// Initialize
let max_size = 1_000_000_000_000;
let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
let ptr = init_struct.data_ptr.as_ptr();
// Store the SimpleBarrier in the first 1k of the area.
init_struct.set_size(10000).unwrap();
let barrier_ptr: *mut SimpleBarrier = unsafe {
ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
.cast()
};
unsafe { SimpleBarrier::init(barrier_ptr, 2) };
let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
// Fork another test process. The code after this runs in both processes concurrently.
let fork_result = unsafe { nix::unistd::fork().unwrap() };
// In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
if fork_result.is_parent() {
write_range(ptr, 0xAA, 1000..2000);
} else {
write_range(ptr, 0xBB, 2000..3000);
}
barrier.wait();
// Verify the contents. (in both processes)
assert_range(ptr, 0xAA, 1000..2000);
assert_range(ptr, 0xBB, 2000..3000);
// Grow, from the child this time
let size = 10_000_000;
if !fork_result.is_parent() {
init_struct.set_size(size).unwrap();
}
barrier.wait();
// make some writes at the end
if fork_result.is_parent() {
write_range(ptr, 0xAA, (size - 10)..size);
} else {
write_range(ptr, 0xBB, (size - 20)..(size - 10));
}
barrier.wait();
// Verify the contents. (This runs in both processes)
assert_range(ptr, 0, (size - 1000)..(size - 20));
assert_range(ptr, 0xBB, (size - 20)..(size - 10));
assert_range(ptr, 0xAA, (size - 10)..size);
if let ForkResult::Parent { child } = fork_result {
nix::sys::wait::waitpid(child, None).unwrap();
}
}
}

View File

@@ -235,7 +235,7 @@ pub enum PageServiceProtocolPipelinedBatchingStrategy {
ScatteredLsn,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case")]
pub enum GetVectoredConcurrentIo {
/// The read path is fully sequential: layers are visited
@@ -491,8 +491,6 @@ pub struct TenantConfigToml {
/// Tenant level performance sampling ratio override. Controls the ratio of get page requests
/// that will get perf sampling for the tenant.
pub sampling_ratio: Option<Ratio>,
/// Capacity of relsize snapshot cache (used by replicas).
pub relsize_snapshot_cache_capacity: usize,
}
pub mod defaults {
@@ -732,7 +730,6 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
pub const DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY: usize = 1000;
}
impl Default for TenantConfigToml {
@@ -790,7 +787,6 @@ impl Default for TenantConfigToml {
gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
sampling_ratio: None,
relsize_snapshot_cache_capacity: DEFAULT_RELSIZE_SNAPSHOT_CACHE_CAPACITY,
}
}
}

View File

@@ -630,8 +630,6 @@ pub struct TenantConfigPatch {
pub gc_compaction_ratio_percent: FieldPatch<u64>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub sampling_ratio: FieldPatch<Option<Ratio>>,
#[serde(skip_serializing_if = "FieldPatch::is_noop")]
pub relsize_snapshot_cache_capacity: FieldPatch<usize>,
}
/// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -761,9 +759,6 @@ pub struct TenantConfig {
#[serde(skip_serializing_if = "Option::is_none")]
pub sampling_ratio: Option<Option<Ratio>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub relsize_snapshot_cache_capacity: Option<usize>,
}
impl TenantConfig {
@@ -809,7 +804,6 @@ impl TenantConfig {
mut gc_compaction_initial_threshold_kb,
mut gc_compaction_ratio_percent,
mut sampling_ratio,
mut relsize_snapshot_cache_capacity,
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -911,9 +905,6 @@ impl TenantConfig {
.gc_compaction_ratio_percent
.apply(&mut gc_compaction_ratio_percent);
patch.sampling_ratio.apply(&mut sampling_ratio);
patch
.relsize_snapshot_cache_capacity
.apply(&mut relsize_snapshot_cache_capacity);
Ok(Self {
checkpoint_distance,
@@ -953,7 +944,6 @@ impl TenantConfig {
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
sampling_ratio,
relsize_snapshot_cache_capacity,
})
}
@@ -1062,9 +1052,6 @@ impl TenantConfig {
.gc_compaction_ratio_percent
.unwrap_or(global_conf.gc_compaction_ratio_percent),
sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
relsize_snapshot_cache_capacity: self
.relsize_snapshot_cache_capacity
.unwrap_or(global_conf.relsize_snapshot_cache_capacity),
}
}
}

View File

@@ -86,27 +86,6 @@ pub struct DbError {
}
impl DbError {
pub fn new_test_error(code: SqlState, message: String) -> Self {
DbError {
severity: "ERROR".to_string(),
parsed_severity: Some(Severity::Error),
code,
message,
detail: None,
hint: None,
position: None,
where_: None,
schema: None,
table: None,
column: None,
datatype: None,
constraint: None,
file: None,
line: None,
routine: None,
}
}
pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
let mut severity = None;
let mut parsed_severity = None;

View File

@@ -30,7 +30,6 @@ crc32c.workspace = true
either.workspace = true
fail.workspace = true
futures.workspace = true
hashlink.workspace = true
hex.workspace = true
humantime.workspace = true
humantime-serde.workspace = true

View File

@@ -1,13 +0,0 @@
[package]
name = "pageserver_page_api"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
prost.workspace = true
tonic.workspace = true
workspace_hack.workspace = true
[build-dependencies]
tonic-build.workspace = true

View File

@@ -1,13 +0,0 @@
use std::env;
use std::path::PathBuf;
/// Generates Rust code from .proto Protobuf schemas, along with a binary file
/// descriptor set for Protobuf schema reflection.
fn main() -> Result<(), Box<dyn std::error::Error>> {
let out_dir = PathBuf::from(env::var("OUT_DIR")?);
tonic_build::configure()
.bytes(["."])
.file_descriptor_set_path(out_dir.join("page_api_descriptor.bin"))
.compile_protos(&["proto/page_service.proto"], &["proto"])
.map_err(|err| err.into())
}

View File

@@ -1,233 +0,0 @@
// Page service, presented by pageservers for computes.
//
// This is the compute read path. It primarily serves page versions at given
// LSNs, but also base backups, SLRU segments, and relation metadata.
//
// EXPERIMENTAL: this is still under development and subject to change.
//
// Request metadata headers:
// - authorization: JWT token ("Bearer <token>"), if auth is enabled
// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980")
// - neon-shard-id: shard ID, as <number><count> in hex ("0b10" = shard 11 of 16, 0-based)
// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e")
//
// The service can be accessed via e.g. grpcurl:
//
// ```
// grpcurl \
// -plaintext \
// -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \
// -H "neon-shard-id: 0b10" \
// -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \
// -H "authorization: Bearer $JWT" \
// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}'
// localhost:51051 page_api.PageService/CheckRelExists
// ```
//
// TODO: consider adding neon-compute-mode ("primary", "static", "replica").
// However, this will require reconnecting when changing modes.
//
// TODO: write implementation guidance on
// - Health checks
// - Tracing, OpenTelemetry
// - Compression
syntax = "proto3";
package page_api;
service PageService {
// Returns whether a relation exists.
rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse);
// Fetches a base backup.
rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk);
// Returns the total size of a database, as # of bytes.
rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse);
// Fetches pages.
//
// This is implemented as a bidirectional streaming RPC for performance. Unary
// requests incur costs for e.g. HTTP/2 stream setup, header parsing,
// authentication, and so on -- with streaming, we only pay these costs during
// the initial stream setup. This ~doubles throughput in benchmarks. Other
// RPCs use regular unary requests, since they are not as frequent and
// performance-critical, and this simplifies implementation.
//
// NB: a status response (e.g. errors) will terminate the stream. The stream
// may be shared by e.g. multiple Postgres backends, so we should avoid this.
// Most errors are therefore sent as GetPageResponse.status instead.
rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse);
// Returns the size of a relation, as # of blocks.
rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse);
// Fetches an SLRU segment.
rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse);
}
// The LSN a request should read at.
message ReadLsn {
// The request's read LSN. Required.
uint64 request_lsn = 1;
// If given, the caller guarantees that the page has not been modified since
// this LSN. Must be smaller than or equal to request_lsn. This allows the
// Pageserver to serve an old page without waiting for the request LSN to
// arrive. Valid for all request types.
//
// It is undefined behaviour to make a request such that the page was, in
// fact, modified between request_lsn and not_modified_since_lsn. The
// Pageserver might detect it and return an error, or it might return the old
// page version or the new page version. Setting not_modified_since_lsn equal
// to request_lsn is always safe, but can lead to unnecessary waiting.
uint64 not_modified_since_lsn = 2;
}
// A relation identifier.
message RelTag {
uint32 spc_oid = 1;
uint32 db_oid = 2;
uint32 rel_number = 3;
uint32 fork_number = 4;
}
// Checks whether a relation exists, at the given LSN. Only valid on shard 0,
// other shards will error.
message CheckRelExistsRequest {
ReadLsn read_lsn = 1;
RelTag rel = 2;
}
message CheckRelExistsResponse {
bool exists = 1;
}
// Requests a base backup at a given LSN.
message GetBaseBackupRequest {
// The LSN to fetch a base backup at.
ReadLsn read_lsn = 1;
// If true, logical replication slots will not be created.
bool replica = 2;
}
// Base backup response chunk, returned as an ordered stream.
message GetBaseBackupResponseChunk {
// A basebackup data chunk. The size is undefined, but bounded by the 4 MB
// gRPC message size limit.
bytes chunk = 1;
}
// Requests the size of a database, as # of bytes. Only valid on shard 0, other
// shards will error.
message GetDbSizeRequest {
ReadLsn read_lsn = 1;
uint32 db_oid = 2;
}
message GetDbSizeResponse {
uint64 num_bytes = 1;
}
// Requests one or more pages.
message GetPageRequest {
// A request ID. Will be included in the response. Should be unique for
// in-flight requests on the stream.
uint64 request_id = 1;
// The request class.
GetPageClass request_class = 2;
// The LSN to read at.
ReadLsn read_lsn = 3;
// The relation to read from.
RelTag rel = 4;
// Page numbers to read. Must belong to the remote shard.
//
// Multiple pages will be executed as a single batch by the Pageserver,
// amortizing layer access costs and parallelizing them. This may increase the
// latency of any individual request, but improves the overall latency and
// throughput of the batch as a whole.
//
// TODO: this causes an allocation in the common single-block case. The sender
// can use a SmallVec to stack-allocate it, but Prost will always deserialize
// into a heap-allocated Vec. Consider optimizing this.
//
// TODO: we might be able to avoid a sort or something if we mandate that these
// are always in order. But we can't currenly rely on this on the server, because
// of compatibility with the libpq protocol handler.
repeated uint32 block_number = 5;
}
// A GetPageRequest class. Primarily intended for observability, but may also be
// used for prioritization in the future.
enum GetPageClass {
// Unknown class. For forwards compatibility: used when the client sends a
// class that the server doesn't know about.
GET_PAGE_CLASS_UNKNOWN = 0;
// A normal request. This is the default.
GET_PAGE_CLASS_NORMAL = 1;
// A prefetch request. NB: can only be classified on pg < 18.
GET_PAGE_CLASS_PREFETCH = 2;
// A background request (e.g. vacuum).
GET_PAGE_CLASS_BACKGROUND = 3;
}
// A GetPage response.
//
// A batch response will contain all of the requested pages. We could eagerly
// emit individual pages as soon as they are ready, but on a readv() Postgres
// holds buffer pool locks on all pages in the batch and we'll only return once
// the entire batch is ready, so no one can make use of the individual pages.
message GetPageResponse {
// The original request's ID.
uint64 request_id = 1;
// The response status code.
GetPageStatus status = 2;
// A string describing the status, if any.
string reason = 3;
// The 8KB page images, in the same order as the request. Empty if status != OK.
repeated bytes page_image = 4;
}
// A GetPageResponse status code. Since we use a bidirectional stream, we don't
// want to send errors as gRPC statuses, since this would terminate the stream.
enum GetPageStatus {
// Unknown status. For forwards compatibility: used when the server sends a
// status code that the client doesn't know about.
GET_PAGE_STATUS_UNKNOWN = 0;
// The request was successful.
GET_PAGE_STATUS_OK = 1;
// The page did not exist. The tenant/timeline/shard has already been
// validated during stream setup.
GET_PAGE_STATUS_NOT_FOUND = 2;
// The request was invalid.
GET_PAGE_STATUS_INVALID = 3;
// The tenant is rate limited. Slow down and retry later.
GET_PAGE_STATUS_SLOW_DOWN = 4;
// TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a
// layer download. This could free up the server task to process other
// requests while the layer download is in progress.
}
// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on
// shard 0, other shards will error.
message GetRelSizeRequest {
ReadLsn read_lsn = 1;
RelTag rel = 2;
}
message GetRelSizeResponse {
uint32 num_blocks = 1;
}
// Requests an SLRU segment. Only valid on shard 0, other shards will error.
message GetSlruSegmentRequest {
ReadLsn read_lsn = 1;
uint32 kind = 2;
uint32 segno = 3;
}
// Returns an SLRU segment.
//
// These are up 32 pages (256 KB), so we can send them as a single response.
message GetSlruSegmentResponse {
bytes segment = 1;
}

View File

@@ -1,19 +0,0 @@
//! This crate provides the Pageserver's page API. It contains:
//!
//! * proto/page_service.proto: the Protobuf schema for the page API.
//! * proto: auto-generated Protobuf types for gRPC.
//!
//! This crate is used by both the client and the server. Try to keep it slim.
// Code generated by protobuf.
pub mod proto {
tonic::include_proto!("page_api");
/// File descriptor set for Protobuf schema reflection. This allows using
/// e.g. grpcurl with the API.
pub const FILE_DESCRIPTOR_SET: &[u8] =
tonic::include_file_descriptor_set!("page_api_descriptor");
pub use page_service_client::PageServiceClient;
pub use page_service_server::{PageService, PageServiceServer};
}

View File

@@ -144,7 +144,7 @@ where
replica,
ctx,
io_concurrency: IoConcurrency::spawn_from_conf(
timeline.conf.get_vectored_concurrent_io,
timeline.conf,
timeline
.gate
.enter()
@@ -343,7 +343,7 @@ where
// Gather non-relational files from object storage pages.
let slru_partitions = self
.timeline
.get_slru_keyspace(Version::at(self.lsn), self.ctx)
.get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
.await?
.partition(
self.timeline.get_shard_identity(),
@@ -378,7 +378,7 @@ where
// Otherwise only include init forks of unlogged relations.
let rels = self
.timeline
.list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?;
for &rel in rels.iter() {
// Send init fork as main fork to provide well formed empty
@@ -517,7 +517,7 @@ where
async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> Result<(), BasebackupError> {
let nblocks = self
.timeline
.get_rel_size(src, Version::at(self.lsn), self.ctx)
.get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
.await?;
// If the relation is empty, create an empty file
@@ -577,7 +577,7 @@ where
let relmap_img = if has_relmap_file {
let img = self
.timeline
.get_relmap_file(spcnode, dbnode, Version::at(self.lsn), self.ctx)
.get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?;
if img.len()
@@ -631,7 +631,7 @@ where
if !has_relmap_file
&& self
.timeline
.list_rels(spcnode, dbnode, Version::at(self.lsn), self.ctx)
.list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
.await?
.is_empty()
{

View File

@@ -544,23 +544,6 @@ impl PageServerConf {
ratio.numerator, ratio.denominator
)
);
let url = Url::parse(&tracing_config.export_config.endpoint)
.map_err(anyhow::Error::msg)
.with_context(|| {
format!(
"tracing endpoint URL is invalid : {}",
tracing_config.export_config.endpoint
)
})?;
ensure!(
url.scheme() == "http" || url.scheme() == "https",
format!(
"tracing endpoint URL must start with http:// or https://: {}",
tracing_config.export_config.endpoint
)
);
}
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
@@ -677,25 +660,4 @@ mod tests {
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
.expect("parse_and_validate");
}
#[test]
fn test_config_tracing_endpoint_is_invalid() {
let input = r#"
control_plane_api = "http://localhost:6666"
[tracing]
sampling_ratio = { numerator = 1, denominator = 0 }
[tracing.export_config]
endpoint = "localhost:4317"
protocol = "http-binary"
timeout = "1ms"
"#;
let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
.expect("config has valid fields");
let workdir = Utf8PathBuf::from("/nonexistent");
PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
.expect_err("parse_and_validate should fail for endpoint without scheme");
}
}

View File

@@ -449,7 +449,7 @@ async fn build_timeline_info_common(
// Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
// actually trimmed data to), which can pass each other when PITR is changed.
let min_readable_lsn = std::cmp::max(
timeline.get_gc_cutoff_lsn().unwrap_or_default(),
timeline.get_gc_cutoff_lsn(),
*timeline.get_applied_gc_cutoff_lsn(),
);
@@ -3199,7 +3199,7 @@ async fn list_aux_files(
.await?;
let io_concurrency = IoConcurrency::spawn_from_conf(
state.conf.get_vectored_concurrent_io,
state.conf,
timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
);

View File

@@ -843,50 +843,23 @@ pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_LATEST_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
pub(crate) static RELSIZE_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_relsize_latest_cache_entries",
"Number of entries in the latest relation size cache",
"pageserver_relsize_cache_entries",
"Number of entries in the relation size cache",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_LATEST_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static RELSIZE_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!("pageserver_relsize_cache_hits", "Relation size cache hits",)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_latest_cache_hits",
"Latest relation size cache hits",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_LATEST_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_latest_cache_misses",
"Relation size latest cache misses",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_SNAPSHOT_CACHE_ENTRIES: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_relsize_snapshot_cache_entries",
"Number of entries in the pitr relation size cache",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_SNAPSHOT_CACHE_HITS: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_snapshot_cache_hits",
"Pitr relation size cache hits",
)
.expect("failed to define a metric")
});
pub(crate) static RELSIZE_SNAPSHOT_CACHE_MISSES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_relsize_snapshot_cache_misses",
"Relation size snapshot cache misses",
"pageserver_relsize_cache_misses",
"Relation size cache misses",
)
.expect("failed to define a metric")
});
@@ -1066,15 +1039,6 @@ pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|
.expect("Failed to register pageserver_tenant_synthetic_cached_size_bytes metric")
});
pub(crate) static TENANT_OFFLOADED_TIMELINES: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_offloaded_timelines",
"Number of offloaded timelines of a tenant",
&["tenant_id", "shard_id"]
)
.expect("Failed to register pageserver_tenant_offloaded_timelines metric")
});
pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"pageserver_eviction_iteration_duration_seconds_global",
@@ -3560,14 +3524,11 @@ impl TimelineMetrics {
}
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
let tid = tenant_shard_id.tenant_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
// Only shard zero deals in synthetic sizes
if tenant_shard_id.is_shard_zero() {
let tid = tenant_shard_id.tenant_id.to_string();
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
}
let _ = TENANT_OFFLOADED_TIMELINES.remove_label_values(&[&tid, &shard_id]);
tenant_throttling::remove_tenant_metrics(tenant_shard_id);

View File

@@ -18,7 +18,7 @@ use itertools::Itertools;
use jsonwebtoken::TokenData;
use once_cell::sync::OnceCell;
use pageserver_api::config::{
GetVectoredConcurrentIo, PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
};
use pageserver_api::key::rel_block_to_key;
@@ -62,7 +62,7 @@ use crate::metrics::{
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
SmgrOpTimer, TimelineMetrics,
};
use crate::pgdatadir_mapping::{LsnRange, Version};
use crate::pgdatadir_mapping::Version;
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -331,10 +331,10 @@ async fn page_service_conn_main(
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(
conf,
tenant_manager,
auth,
pipelining_config,
conf.get_vectored_concurrent_io,
perf_span_fields,
connection_ctx,
cancel.clone(),
@@ -371,6 +371,7 @@ async fn page_service_conn_main(
}
struct PageServerHandler {
conf: &'static PageServerConf,
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
@@ -388,7 +389,6 @@ struct PageServerHandler {
timeline_handles: Option<TimelineHandles>,
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
gate_guard: GateGuard,
}
@@ -642,7 +642,7 @@ impl std::fmt::Display for BatchedPageStreamError {
struct BatchedGetPageRequest {
req: PagestreamGetPageRequest,
timer: SmgrOpTimer,
lsn_range: LsnRange,
effective_request_lsn: Lsn,
ctx: RequestContext,
}
@@ -764,12 +764,12 @@ impl BatchedFeMessage {
match batching_strategy {
PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
if let Some(last_in_batch) = accum_pages.last() {
if last_in_batch.lsn_range.effective_lsn
!= this_pages[0].lsn_range.effective_lsn
if last_in_batch.effective_request_lsn
!= this_pages[0].effective_request_lsn
{
trace!(
accum_lsn = %last_in_batch.lsn_range.effective_lsn,
this_lsn = %this_pages[0].lsn_range.effective_lsn,
accum_lsn = %last_in_batch.effective_request_lsn,
this_lsn = %this_pages[0].effective_request_lsn,
"stopping batching because LSN changed"
);
@@ -784,15 +784,15 @@ impl BatchedFeMessage {
let same_page_different_lsn = accum_pages.iter().any(|batched| {
batched.req.rel == this_pages[0].req.rel
&& batched.req.blkno == this_pages[0].req.blkno
&& batched.lsn_range.effective_lsn
!= this_pages[0].lsn_range.effective_lsn
&& batched.effective_request_lsn
!= this_pages[0].effective_request_lsn
});
if same_page_different_lsn {
trace!(
rel=%this_pages[0].req.rel,
blkno=%this_pages[0].req.blkno,
lsn=%this_pages[0].lsn_range.effective_lsn,
lsn=%this_pages[0].effective_request_lsn,
"stopping batching because same page was requested at different LSNs"
);
@@ -844,16 +844,17 @@ impl BatchedFeMessage {
impl PageServerHandler {
#[allow(clippy::too_many_arguments)]
pub fn new(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
pipelining_config: PageServicePipeliningConfig,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
perf_span_fields: ConnectionPerfSpanFields,
connection_ctx: RequestContext,
cancel: CancellationToken,
gate_guard: GateGuard,
) -> Self {
PageServerHandler {
conf,
auth,
claims: None,
connection_ctx,
@@ -861,7 +862,6 @@ impl PageServerHandler {
timeline_handles: Some(TimelineHandles::new(tenant_manager)),
cancel,
pipelining_config,
get_vectored_concurrent_io,
gate_guard,
}
}
@@ -1158,7 +1158,7 @@ impl PageServerHandler {
.await?;
// We're holding the Handle
let effective_lsn = match Self::effective_request_lsn(
let effective_request_lsn = match Self::effective_request_lsn(
&shard,
shard.get_last_record_lsn(),
req.hdr.request_lsn,
@@ -1177,10 +1177,7 @@ impl PageServerHandler {
pages: smallvec::smallvec![BatchedGetPageRequest {
req,
timer,
lsn_range: LsnRange {
effective_lsn,
request_lsn: req.hdr.request_lsn
},
effective_request_lsn,
ctx,
}],
// The executor grabs the batch when it becomes idle.
@@ -1281,7 +1278,7 @@ impl PageServerHandler {
}
#[instrument(level = tracing::Level::DEBUG, skip_all)]
async fn pagestream_handle_batched_message<IO>(
async fn pagesteam_handle_batched_message<IO>(
&mut self,
pgb_writer: &mut PostgresBackend<IO>,
batch: BatchedFeMessage,
@@ -1626,7 +1623,7 @@ impl PageServerHandler {
}
let io_concurrency = IoConcurrency::spawn_from_conf(
self.get_vectored_concurrent_io,
self.conf,
match self.gate_guard.try_clone() {
Ok(guard) => guard,
Err(_) => {
@@ -1736,7 +1733,7 @@ impl PageServerHandler {
};
let result = self
.pagestream_handle_batched_message(
.pagesteam_handle_batched_message(
pgb_writer,
msg,
io_concurrency.clone(),
@@ -1912,7 +1909,7 @@ impl PageServerHandler {
return Err(e);
}
};
self.pagestream_handle_batched_message(
self.pagesteam_handle_batched_message(
pgb_writer,
batch,
io_concurrency.clone(),
@@ -2130,14 +2127,7 @@ impl PageServerHandler {
.await?;
let exists = timeline
.get_rel_exists(
req.rel,
Version::LsnRange(LsnRange {
effective_lsn: lsn,
request_lsn: req.hdr.request_lsn,
}),
ctx,
)
.get_rel_exists(req.rel, Version::Lsn(lsn), ctx)
.await?;
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -2164,14 +2154,7 @@ impl PageServerHandler {
.await?;
let n_blocks = timeline
.get_rel_size(
req.rel,
Version::LsnRange(LsnRange {
effective_lsn: lsn,
request_lsn: req.hdr.request_lsn,
}),
ctx,
)
.get_rel_size(req.rel, Version::Lsn(lsn), ctx)
.await?;
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
@@ -2198,15 +2181,7 @@ impl PageServerHandler {
.await?;
let total_blocks = timeline
.get_db_size(
DEFAULTTABLESPACE_OID,
req.dbnode,
Version::LsnRange(LsnRange {
effective_lsn: lsn,
request_lsn: req.hdr.request_lsn,
}),
ctx,
)
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, Version::Lsn(lsn), ctx)
.await?;
let db_size = total_blocks as i64 * BLCKSZ as i64;
@@ -2239,7 +2214,7 @@ impl PageServerHandler {
// Ignore error (trace buffer may be full or tracer may have disconnected).
_ = page_trace.try_send(PageTraceEvent {
key,
effective_lsn: batch.lsn_range.effective_lsn,
effective_lsn: batch.effective_request_lsn,
time,
});
}
@@ -2254,7 +2229,7 @@ impl PageServerHandler {
perf_instrument = true;
}
req.lsn_range.effective_lsn
req.effective_request_lsn
})
.max()
.expect("batch is never empty");
@@ -2308,7 +2283,7 @@ impl PageServerHandler {
(
&p.req.rel,
&p.req.blkno,
p.lsn_range,
p.effective_request_lsn,
p.ctx.attached_child(),
)
}),

View File

@@ -43,9 +43,7 @@ use crate::aux_file;
use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::metrics::{
RELSIZE_CACHE_MISSES_OLD, RELSIZE_LATEST_CACHE_ENTRIES, RELSIZE_LATEST_CACHE_HITS,
RELSIZE_LATEST_CACHE_MISSES, RELSIZE_SNAPSHOT_CACHE_ENTRIES, RELSIZE_SNAPSHOT_CACHE_HITS,
RELSIZE_SNAPSHOT_CACHE_MISSES,
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
};
use crate::span::{
debug_assert_current_span_has_tenant_and_timeline_id,
@@ -92,28 +90,6 @@ pub enum LsnForTimestamp {
NoData(Lsn),
}
/// Each request to page server contains LSN range: `not_modified_since..request_lsn`.
/// See comments libs/pageserver_api/src/models.rs.
/// Based on this range and `last_record_lsn` PS calculates `effective_lsn`.
/// But to distinguish requests from primary and replicas we need also to pass `request_lsn`.
#[derive(Debug, Clone, Copy, Default)]
pub struct LsnRange {
pub effective_lsn: Lsn,
pub request_lsn: Lsn,
}
impl LsnRange {
pub fn at(lsn: Lsn) -> LsnRange {
LsnRange {
effective_lsn: lsn,
request_lsn: lsn,
}
}
pub fn is_latest(&self) -> bool {
self.request_lsn == Lsn::MAX
}
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum CalculateLogicalSizeError {
#[error("cancelled")]
@@ -226,13 +202,13 @@ impl Timeline {
io_concurrency: IoConcurrency,
) -> Result<Bytes, PageReconstructError> {
match version {
Version::LsnRange(lsns) => {
Version::Lsn(effective_lsn) => {
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
let res = self
.get_rel_page_at_lsn_batched(
pages
.iter()
.map(|(tag, blknum)| (tag, blknum, lsns, ctx.attached_child())),
pages.iter().map(|(tag, blknum)| {
(tag, blknum, effective_lsn, ctx.attached_child())
}),
io_concurrency.clone(),
ctx,
)
@@ -270,7 +246,7 @@ impl Timeline {
/// The ordering of the returned vec corresponds to the ordering of `pages`.
pub(crate) async fn get_rel_page_at_lsn_batched(
&self,
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, LsnRange, RequestContext)>,
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
io_concurrency: IoConcurrency,
ctx: &RequestContext,
) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -289,7 +265,7 @@ impl Timeline {
let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
HashMap::with_capacity(pages.len());
for (response_slot_idx, (tag, blknum, lsns, ctx)) in pages.enumerate() {
for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
if tag.relnode == 0 {
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
RelationError::InvalidRelnode.into(),
@@ -298,7 +274,7 @@ impl Timeline {
slots_filled += 1;
continue;
}
let lsn = lsns.effective_lsn;
let nblocks = {
let ctx = RequestContextBuilder::from(&ctx)
.perf_span(|crnt_perf_span| {
@@ -313,7 +289,7 @@ impl Timeline {
.attached_child();
match self
.get_rel_size(*tag, Version::LsnRange(lsns), &ctx)
.get_rel_size(*tag, Version::Lsn(lsn), &ctx)
.maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
.await
{
@@ -494,7 +470,7 @@ impl Timeline {
));
}
if let Some(nblocks) = self.get_cached_rel_size(&tag, version) {
if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
return Ok(nblocks);
}
@@ -512,7 +488,7 @@ impl Timeline {
let mut buf = version.get(self, key, ctx).await?;
let nblocks = buf.get_u32_le();
self.update_cached_rel_size(tag, version, nblocks);
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
Ok(nblocks)
}
@@ -534,7 +510,7 @@ impl Timeline {
}
// first try to lookup relation in cache
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version) {
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
return Ok(true);
}
// then check if the database was already initialized.
@@ -610,7 +586,7 @@ impl Timeline {
// scan directory listing (new), merge with the old results
let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
@@ -656,7 +632,7 @@ impl Timeline {
) -> Result<Bytes, PageReconstructError> {
assert!(self.tenant_shard_id.is_shard_zero());
let n_blocks = self
.get_slru_segment_size(kind, segno, Version::at(lsn), ctx)
.get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
.await?;
let keyspace = KeySpace::single(
@@ -669,7 +645,7 @@ impl Timeline {
);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
@@ -891,11 +867,11 @@ impl Timeline {
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
) -> Result<T, PageReconstructError> {
for segno in self
.list_slru_segments(SlruKind::Clog, Version::at(probe_lsn), ctx)
.list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
.await?
{
let nblocks = self
.get_slru_segment_size(SlruKind::Clog, segno, Version::at(probe_lsn), ctx)
.get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
.await?;
let keyspace = KeySpace::single(
@@ -909,7 +885,7 @@ impl Timeline {
);
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.conf,
self.gate
.enter()
.map_err(|_| PageReconstructError::Cancelled)?,
@@ -1161,7 +1137,7 @@ impl Timeline {
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self
.list_rels(*spcnode, *dbnode, Version::at(lsn), ctx)
.list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
.await?
{
if self.cancel.is_cancelled() {
@@ -1236,7 +1212,7 @@ impl Timeline {
result.add_key(rel_dir_to_key(spcnode, dbnode));
let mut rels: Vec<RelTag> = self
.list_rels(spcnode, dbnode, Version::at(lsn), ctx)
.list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
.await?
.into_iter()
.collect();
@@ -1353,75 +1329,59 @@ impl Timeline {
Ok((dense_keyspace, sparse_keyspace))
}
/// Get cached size of relation. There are two caches: one for primary updates, it captures the latest state of
/// of the timeline and snapshot cache, which key includes LSN and so can be used by replicas to get relation size
/// at the particular LSN (snapshot).
pub fn get_cached_rel_size(&self, tag: &RelTag, version: Version<'_>) -> Option<BlockNumber> {
let lsn = version.get_lsn();
{
let rel_size_cache = self.rel_size_latest_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
if lsn >= *cached_lsn {
RELSIZE_LATEST_CACHE_HITS.inc();
return Some(*nblocks);
}
RELSIZE_CACHE_MISSES_OLD.inc();
/// Get cached size of relation if it not updated after specified LSN
pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
if lsn >= *cached_lsn {
RELSIZE_CACHE_HITS.inc();
return Some(*nblocks);
}
RELSIZE_CACHE_MISSES_OLD.inc();
}
{
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
if let Some(nblock) = rel_size_cache.get(&(lsn, *tag)) {
RELSIZE_SNAPSHOT_CACHE_HITS.inc();
return Some(*nblock);
}
}
if version.is_latest() {
RELSIZE_LATEST_CACHE_MISSES.inc();
} else {
RELSIZE_SNAPSHOT_CACHE_MISSES.inc();
}
RELSIZE_CACHE_MISSES.inc();
None
}
/// Update cached relation size if there is no more recent update
pub fn update_cached_rel_size(&self, tag: RelTag, version: Version<'_>, nblocks: BlockNumber) {
let lsn = version.get_lsn();
if version.is_latest() {
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
match rel_size_cache.entry(tag) {
hash_map::Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
*cached_lsn = (lsn, nblocks);
}
}
hash_map::Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
RELSIZE_LATEST_CACHE_ENTRIES.inc();
pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if lsn < rel_size_cache.complete_as_of {
// Do not cache old values. It's safe to cache the size on read, as long as
// the read was at an LSN since we started the WAL ingestion. Reasoning: we
// never evict values from the cache, so if the relation size changed after
// 'lsn', the new value is already in the cache.
return;
}
match rel_size_cache.map.entry(tag) {
hash_map::Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
*cached_lsn = (lsn, nblocks);
}
}
} else {
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
if rel_size_cache.capacity() != 0 {
rel_size_cache.insert((lsn, tag), nblocks);
RELSIZE_SNAPSHOT_CACHE_ENTRIES.set(rel_size_cache.len() as u64);
hash_map::Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
RELSIZE_CACHE_ENTRIES.inc();
}
}
}
/// Store cached relation size
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
if rel_size_cache.insert(tag, (lsn, nblocks)).is_none() {
RELSIZE_LATEST_CACHE_ENTRIES.inc();
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
RELSIZE_CACHE_ENTRIES.inc();
}
}
/// Remove cached relation size
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_latest_cache.write().unwrap();
if rel_size_cache.remove(tag).is_some() {
RELSIZE_LATEST_CACHE_ENTRIES.dec();
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
if rel_size_cache.map.remove(tag).is_some() {
RELSIZE_CACHE_ENTRIES.dec();
}
}
}
@@ -1625,10 +1585,7 @@ impl DatadirModification<'_> {
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
if let Some(nblocks) = self
.tline
.get_cached_rel_size(&rel, Version::Modified(self))
{
if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
Ok(nblocks)
} else if !self
.tline
@@ -2710,7 +2667,7 @@ pub struct DatadirModificationStats {
/// timeline to not miss the latest updates.
#[derive(Clone, Copy)]
pub enum Version<'a> {
LsnRange(LsnRange),
Lsn(Lsn),
Modified(&'a DatadirModification<'a>),
}
@@ -2722,7 +2679,7 @@ impl Version<'_> {
ctx: &RequestContext,
) -> Result<Bytes, PageReconstructError> {
match self {
Version::LsnRange(lsns) => timeline.get(key, lsns.effective_lsn, ctx).await,
Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
Version::Modified(modification) => modification.get(key, ctx).await,
}
}
@@ -2744,26 +2701,12 @@ impl Version<'_> {
}
}
pub fn is_latest(&self) -> bool {
fn get_lsn(&self) -> Lsn {
match self {
Version::LsnRange(lsns) => lsns.is_latest(),
Version::Modified(_) => true,
}
}
pub fn get_lsn(&self) -> Lsn {
match self {
Version::LsnRange(lsns) => lsns.effective_lsn,
Version::Lsn(lsn) => *lsn,
Version::Modified(modification) => modification.lsn,
}
}
pub fn at(lsn: Lsn) -> Self {
Version::LsnRange(LsnRange {
effective_lsn: lsn,
request_lsn: lsn,
})
}
}
//--- Metadata structs stored in key-value pairs in the repository.

View File

@@ -86,8 +86,8 @@ use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
use crate::l0_flush::L0FlushGlobalState;
use crate::metrics::{
BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_OFFLOADED_TIMELINES,
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
};
use crate::task_mgr::TaskKind;
use crate::tenant::config::LocationMode;
@@ -3348,13 +3348,6 @@ impl TenantShard {
activated_timelines += 1;
}
let tid = self.tenant_shard_id.tenant_id.to_string();
let shard_id = self.tenant_shard_id.shard_slug().to_string();
let offloaded_timeline_count = timelines_offloaded_accessor.len();
TENANT_OFFLOADED_TIMELINES
.with_label_values(&[&tid, &shard_id])
.set(offloaded_timeline_count as u64);
self.state.send_modify(move |current_state| {
assert!(
matches!(current_state, TenantState::Activating(_)),
@@ -4594,7 +4587,7 @@ impl TenantShard {
target.cutoffs = GcCutoffs {
space: space_cutoff,
time: None,
time: Lsn::INVALID,
};
}
}
@@ -4678,7 +4671,7 @@ impl TenantShard {
if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) {
target.within_ancestor_pitr =
Some(timeline.get_ancestor_lsn()) >= ancestor_gc_cutoffs.time;
timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.time;
}
}
@@ -4691,15 +4684,13 @@ impl TenantShard {
} else {
0
});
if let Some(time_cutoff) = target.cutoffs.time {
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(time_cutoff)
.unwrap_or_default()
.0,
);
}
timeline.metrics.pitr_history_size.set(
timeline
.get_last_record_lsn()
.checked_sub(target.cutoffs.time)
.unwrap_or(Lsn(0))
.0,
);
// Apply the cutoffs we found to the Timeline's GcInfo. Why might we _not_ have cutoffs for a timeline?
// - this timeline was created while we were finding cutoffs
@@ -4708,8 +4699,8 @@ impl TenantShard {
let original_cutoffs = target.cutoffs.clone();
// GC cutoffs should never go back
target.cutoffs = GcCutoffs {
space: cutoffs.space.max(original_cutoffs.space),
time: cutoffs.time.max(original_cutoffs.time),
space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
}
}
}
@@ -5569,14 +5560,6 @@ impl TenantShard {
}
}
// Update metrics
let tid = self.tenant_shard_id.to_string();
let shard_id = self.tenant_shard_id.shard_slug().to_string();
let set_key = &[tid.as_str(), shard_id.as_str()][..];
TENANT_OFFLOADED_TIMELINES
.with_label_values(set_key)
.set(manifest.offloaded_timelines.len() as u64);
// Upload the manifest. Remote storage does no retries internally, so retry here.
match backoff::retry(
|| async {
@@ -8613,10 +8596,8 @@ mod tests {
lsn: Lsn,
ctx: &RequestContext,
) -> Result<Option<Bytes>, GetVectoredError> {
let io_concurrency = IoConcurrency::spawn_from_conf(
tline.conf.get_vectored_concurrent_io,
tline.gate.enter().unwrap(),
);
let io_concurrency =
IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
let mut res = tline
@@ -8954,7 +8935,7 @@ mod tests {
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Some(Lsn(0x30));
guard.cutoffs.time = Lsn(0x30);
guard.cutoffs.space = Lsn(0x30);
}
@@ -9062,7 +9043,7 @@ mod tests {
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Some(Lsn(0x40));
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline
@@ -9480,7 +9461,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -9564,7 +9545,7 @@ mod tests {
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Some(Lsn(0x40));
guard.cutoffs.time = Lsn(0x40);
guard.cutoffs.space = Lsn(0x40);
}
tline
@@ -10035,7 +10016,7 @@ mod tests {
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -10098,7 +10079,7 @@ mod tests {
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time.unwrap_or_default()
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
@@ -10176,7 +10157,7 @@ mod tests {
.await;
// Update GC info
let mut guard = tline.gc_info.write().unwrap();
guard.cutoffs.time = Some(Lsn(0x38));
guard.cutoffs.time = Lsn(0x38);
guard.cutoffs.space = Lsn(0x38);
}
tline
@@ -10284,7 +10265,7 @@ mod tests {
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -10347,7 +10328,7 @@ mod tests {
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time.unwrap_or_default()
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
@@ -10533,7 +10514,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Some(Lsn(0x10)),
time: Lsn(0x10),
space: Lsn(0x10),
},
leases: Default::default(),
@@ -10553,7 +10534,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Some(Lsn(0x50)),
time: Lsn(0x50),
space: Lsn(0x50),
},
leases: Default::default(),
@@ -11274,7 +11255,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -11663,7 +11644,7 @@ mod tests {
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -11726,7 +11707,7 @@ mod tests {
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time.unwrap_or_default()
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
@@ -11915,7 +11896,7 @@ mod tests {
(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No),
],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),
@@ -11978,7 +11959,7 @@ mod tests {
let verify_result = || async {
let gc_horizon = {
let gc_info = tline.gc_info.read().unwrap();
gc_info.cutoffs.time.unwrap_or_default()
gc_info.cutoffs.time
};
for idx in 0..10 {
assert_eq!(
@@ -12241,7 +12222,7 @@ mod tests {
*guard = GcInfo {
retain_lsns: vec![],
cutoffs: GcCutoffs {
time: Some(Lsn(0x30)),
time: Lsn(0x30),
space: Lsn(0x30),
},
leases: Default::default(),

View File

@@ -235,7 +235,7 @@ pub(super) async fn gather_inputs(
// than our internal space cutoff. This means that if someone drops a database and waits for their
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
// the space cutoff.
let mut next_pitr_cutoff = gc_info.cutoffs.time.unwrap_or_default(); // TODO: handle None
let mut next_pitr_cutoff = gc_info.cutoffs.time;
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {

View File

@@ -31,7 +31,6 @@ pub use inmemory_layer::InMemoryLayer;
pub(crate) use layer::{EvictionError, Layer, ResidentLayer};
pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
pub use layer_name::{DeltaLayerName, ImageLayerName, LayerName};
use pageserver_api::config::GetVectoredConcurrentIo;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::record::NeonWalRecord;
@@ -44,6 +43,7 @@ use self::inmemory_layer::InMemoryLayerFileId;
use super::PageReconstructError;
use super::layer_map::InMemoryLayerDesc;
use super::timeline::{GetVectoredError, ReadPath};
use crate::config::PageServerConf;
use crate::context::{
AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
};
@@ -318,10 +318,11 @@ impl IoConcurrency {
}
pub(crate) fn spawn_from_conf(
conf: GetVectoredConcurrentIo,
conf: &'static PageServerConf,
gate_guard: GateGuard,
) -> IoConcurrency {
let selected = match conf {
use pageserver_api::config::GetVectoredConcurrentIo;
let selected = match conf.get_vectored_concurrent_io {
GetVectoredConcurrentIo::Sequential => SelectedIoConcurrency::Sequential,
GetVectoredConcurrentIo::SidecarTask => SelectedIoConcurrency::SidecarTask(gate_guard),
};

View File

@@ -63,28 +63,7 @@ pub struct InMemoryLayer {
opened_at: Instant,
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The [`IndexEntry`] is an offset into the
/// ephemeral file where the page version is stored.
///
/// We use a separate lock for the index to reduce the critical section
/// during which reads cannot be planned.
///
/// If you need access to both the index and the underlying file at the same time,
/// respect the following locking order to avoid deadlocks:
/// 1. [`InMemoryLayer::inner`]
/// 2. [`InMemoryLayer::index`]
///
/// Note that the file backing [`InMemoryLayer::inner`] is append-only,
/// so it is not necessary to hold simultaneous locks on index.
/// This avoids holding index locks across IO, and is crucial for avoiding read tail latency.
/// In particular:
/// 1. It is safe to read and release [`InMemoryLayer::index`] before locking and reading from [`InMemoryLayer::inner`].
/// 2. It is safe to write and release [`InMemoryLayer::inner`] before locking and updating [`InMemoryLayer::index`].
index: RwLock<BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>>,
/// The above fields never change, except for `end_lsn`, which is only set once,
/// and `index` (see rationale there).
/// The above fields never change, except for `end_lsn`, which is only set once.
/// All other changing parts are in `inner`, and protected by a mutex.
inner: RwLock<InMemoryLayerInner>,
@@ -102,6 +81,11 @@ impl std::fmt::Debug for InMemoryLayer {
}
pub struct InMemoryLayerInner {
/// All versions of all pages in the layer are kept here. Indexed
/// by block number and LSN. The [`IndexEntry`] is an offset into the
/// ephemeral file where the page version is stored.
index: BTreeMap<CompactKey, VecMap<Lsn, IndexEntry>>,
/// The values are stored in a serialized format in this file.
/// Each serialized Value is preceded by a 'u32' length field.
/// PerSeg::page_versions map stores offsets into this file.
@@ -121,7 +105,7 @@ const MAX_SUPPORTED_BLOB_LEN_BITS: usize = {
trailing_ones
};
/// See [`InMemoryLayer::index`].
/// See [`InMemoryLayerInner::index`].
///
/// For memory efficiency, the data is packed into a u64.
///
@@ -441,7 +425,7 @@ impl InMemoryLayer {
.page_content_kind(PageContentKind::InMemoryLayer)
.attached_child();
let index = self.index.read().await;
let inner = self.inner.read().await;
struct ValueRead {
entry_lsn: Lsn,
@@ -451,7 +435,10 @@ impl InMemoryLayer {
let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
for range in keyspace.ranges.iter() {
for (key, vec_map) in index.range(range.start.to_compact()..range.end.to_compact()) {
for (key, vec_map) in inner
.index
.range(range.start.to_compact()..range.end.to_compact())
{
let key = Key::from_compact(*key);
let slice = vec_map.slice_range(lsn_range.clone());
@@ -479,7 +466,7 @@ impl InMemoryLayer {
}
}
}
drop(index); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
drop(inner); // release the lock before we spawn the IO; if it's serial-mode IO we will deadlock on the read().await below
let read_from = Arc::clone(self);
let read_ctx = ctx.attached_child();
reconstruct_state
@@ -586,8 +573,8 @@ impl InMemoryLayer {
start_lsn,
end_lsn: OnceLock::new(),
opened_at: Instant::now(),
index: RwLock::new(BTreeMap::new()),
inner: RwLock::new(InMemoryLayerInner {
index: BTreeMap::new(),
file,
resource_units: GlobalResourceUnits::new(),
}),
@@ -605,39 +592,31 @@ impl InMemoryLayer {
serialized_batch: SerializedValueBatch,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let (base_offset, metadata) = {
let mut inner = self.inner.write().await;
self.assert_writable();
let mut inner = self.inner.write().await;
self.assert_writable();
let base_offset = inner.file.len();
let base_offset = inner.file.len();
let SerializedValueBatch {
raw,
metadata,
max_lsn: _,
len: _,
} = serialized_batch;
let SerializedValueBatch {
raw,
metadata,
max_lsn: _,
len: _,
} = serialized_batch;
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
// Write the batch to the file
inner.file.write_raw(&raw, ctx).await?;
let new_size = inner.file.len();
let expected_new_len = base_offset
.checked_add(raw.len().into_u64())
// write_raw would error if we were to overflow u64.
// also IndexEntry and higher levels in
//the code don't allow the file to grow that large
.unwrap();
assert_eq!(new_size, expected_new_len);
inner.resource_units.maybe_publish_size(new_size);
(base_offset, metadata)
};
let expected_new_len = base_offset
.checked_add(raw.len().into_u64())
// write_raw would error if we were to overflow u64.
// also IndexEntry and higher levels in
//the code don't allow the file to grow that large
.unwrap();
assert_eq!(new_size, expected_new_len);
// Update the index with the new entries
let mut index = self.index.write().await;
for meta in metadata {
let SerializedValueMeta {
key,
@@ -660,7 +639,7 @@ impl InMemoryLayer {
will_init,
})?;
let vec_map = index.entry(key).or_default();
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
if old.is_some() {
// This should not break anything, but is unexpected: ingestion code aims to filter out
@@ -679,6 +658,8 @@ impl InMemoryLayer {
);
}
inner.resource_units.maybe_publish_size(new_size);
Ok(())
}
@@ -699,18 +680,6 @@ impl InMemoryLayer {
/// Records the end_lsn for non-dropped layers.
/// `end_lsn` is exclusive
///
/// A note on locking:
/// The current API of [`InMemoryLayer`] does not ensure that there's no ongoing
/// writes while freezing the layer. This is enforced at a higher level via
/// [`crate::tenant::Timeline::write_lock`]. Freeze might be called via two code paths:
/// 1. Via the active [`crate::tenant::timeline::TimelineWriter`]. This holds the
/// Timeline::write_lock for its lifetime. The rolling is handled in
/// [`crate::tenant::timeline::TimelineWriter::put_batch`]. It's a &mut self function
/// so can't be called from different threads.
/// 2. In the background via [`crate::tenant::Timeline::maybe_freeze_ephemeral_layer`].
/// This only proceeds if try_lock on Timeline::write_lock succeeds (i.e. there's no active writer),
/// hence there can be no concurrent writes
pub async fn freeze(&self, end_lsn: Lsn) {
assert!(
self.start_lsn < end_lsn,
@@ -731,8 +700,8 @@ impl InMemoryLayer {
#[cfg(debug_assertions)]
{
let index = self.index.read().await;
for vec_map in index.values() {
let inner = self.inner.write().await;
for vec_map in inner.index.values() {
for (lsn, _) in vec_map.as_slice() {
assert!(*lsn < end_lsn);
}
@@ -755,11 +724,14 @@ impl InMemoryLayer {
) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
// Grab the lock in read-mode. We hold it over the I/O, but because this
// layer is not writeable anymore, no one should be trying to acquire the
// write lock on it, so we shouldn't block anyone. See the comment on
// [`InMemoryLayer::freeze`] to understand how locking between the append path
// and layer flushing works.
// write lock on it, so we shouldn't block anyone. There's one exception
// though: another thread might have grabbed a reference to this layer
// in `get_layer_for_write' just before the checkpointer called
// `freeze`, and then `write_to_disk` on it. When the thread gets the
// lock, it will see that it's not writeable anymore and retry, but it
// would have to wait until we release it. That race condition is very
// rare though, so we just accept the potential latency hit for now.
let inner = self.inner.read().await;
let index = self.index.read().await;
use l0_flush::Inner;
let _concurrency_permit = match l0_flush_global_state {
@@ -771,9 +743,13 @@ impl InMemoryLayer {
let key_count = if let Some(key_range) = key_range {
let key_range = key_range.start.to_compact()..key_range.end.to_compact();
index.iter().filter(|(k, _)| key_range.contains(k)).count()
inner
.index
.iter()
.filter(|(k, _)| key_range.contains(k))
.count()
} else {
index.len()
inner.index.len()
};
if key_count == 0 {
return Ok(None);
@@ -796,7 +772,7 @@ impl InMemoryLayer {
let file_contents = inner.file.load_to_io_buf(ctx).await?;
let file_contents = file_contents.freeze();
for (key, vec_map) in index.iter() {
for (key, vec_map) in inner.index.iter() {
// Write all page versions
for (lsn, entry) in vec_map
.as_slice()

View File

@@ -14,7 +14,6 @@ pub mod span;
pub mod uninit;
mod walreceiver;
use hashlink::LruCache;
use std::array;
use std::cmp::{max, min};
use std::collections::btree_map::Entry;
@@ -198,6 +197,16 @@ pub struct TimelineResources {
pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
}
/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
/// ingestion considerably, because WAL ingestion needs to check on most records if the record
/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end
/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the
/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
pub(crate) struct RelSizeCache {
pub(crate) complete_as_of: Lsn,
pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
}
pub struct Timeline {
pub(crate) conf: &'static PageServerConf,
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
@@ -356,8 +365,7 @@ pub struct Timeline {
pub walreceiver: Mutex<Option<WalReceiver>>,
/// Relation size cache
pub(crate) rel_size_latest_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
pub(crate) rel_size_snapshot_cache: Mutex<LruCache<(Lsn, RelTag), BlockNumber>>,
pub(crate) rel_size_cache: RwLock<RelSizeCache>,
download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
@@ -529,24 +537,29 @@ impl GcInfo {
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
/// between time-based and space-based retention for observability and consumption metrics purposes.
#[derive(Clone, Debug, Default)]
#[derive(Debug, Clone)]
pub(crate) struct GcCutoffs {
/// Calculated from the [`pageserver_api::models::TenantConfig::gc_horizon`], this LSN indicates how much
/// history we must keep to retain a specified number of bytes of WAL.
pub(crate) space: Lsn,
/// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates
/// how much history we must keep to enable reading back at least the PITR interval duration.
///
/// None indicates that the PITR cutoff has not been computed. A PITR interval of 0 will yield
/// Some(last_record_lsn).
pub(crate) time: Option<Lsn>,
/// Calculated from [`pageserver_api::models::TenantConfig::pitr_interval`], this LSN indicates how much
/// history we must keep to enable reading back at least the PITR interval duration.
pub(crate) time: Lsn,
}
impl Default for GcCutoffs {
fn default() -> Self {
Self {
space: Lsn::INVALID,
time: Lsn::INVALID,
}
}
}
impl GcCutoffs {
fn select_min(&self) -> Lsn {
// NB: if we haven't computed the PITR cutoff yet, we can't GC anything.
self.space.min(self.time.unwrap_or_default())
std::cmp::min(self.space, self.time)
}
}
@@ -1083,14 +1096,11 @@ impl Timeline {
/// Get the bytes written since the PITR cutoff on this branch, and
/// whether this branch's ancestor_lsn is within its parent's PITR.
pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) {
// TODO: for backwards compatibility, we return the full history back to 0 when the PITR
// cutoff has not yet been initialized. This should return None instead, but this is exposed
// in external HTTP APIs and callers may not handle a null value.
let gc_info = self.gc_info.read().unwrap();
let history = self
.get_last_record_lsn()
.checked_sub(gc_info.cutoffs.time.unwrap_or_default())
.unwrap_or_default()
.checked_sub(gc_info.cutoffs.time)
.unwrap_or(Lsn(0))
.0;
(history, gc_info.within_ancestor_pitr)
}
@@ -1100,10 +1110,9 @@ impl Timeline {
self.applied_gc_cutoff_lsn.read()
}
/// Read timeline's planned GC cutoff: this is the logical end of history that users are allowed
/// to read (based on configured PITR), even if physically we have more history. Returns None
/// if the PITR cutoff has not yet been initialized.
pub(crate) fn get_gc_cutoff_lsn(&self) -> Option<Lsn> {
/// Read timeline's planned GC cutoff: this is the logical end of history that users
/// are allowed to read (based on configured PITR), even if physically we have more history.
pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
self.gc_info.read().unwrap().cutoffs.time
}
@@ -2811,13 +2820,6 @@ impl Timeline {
self.remote_client.update_config(&new_conf.location);
let mut rel_size_cache = self.rel_size_snapshot_cache.lock().unwrap();
if let Some(new_capacity) = new_conf.tenant_conf.relsize_snapshot_cache_capacity {
if new_capacity != rel_size_cache.capacity() {
rel_size_cache.set_capacity(new_capacity);
}
}
self.metrics
.evictions_with_low_residence_duration
.write()
@@ -2876,14 +2878,6 @@ impl Timeline {
ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded);
}
let relsize_snapshot_cache_capacity = {
let loaded_tenant_conf = tenant_conf.load();
loaded_tenant_conf
.tenant_conf
.relsize_snapshot_cache_capacity
.unwrap_or(conf.default_tenant_conf.relsize_snapshot_cache_capacity)
};
Arc::new_cyclic(|myself| {
let metrics = Arc::new(TimelineMetrics::new(
&tenant_shard_id,
@@ -2975,8 +2969,10 @@ impl Timeline {
last_image_layer_creation_check_instant: Mutex::new(None),
last_received_wal: Mutex::new(None),
rel_size_latest_cache: RwLock::new(HashMap::new()),
rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
rel_size_cache: RwLock::new(RelSizeCache {
complete_as_of: disk_consistent_lsn,
map: HashMap::new(),
}),
download_all_remote_layers_task_info: RwLock::new(None),
@@ -3534,7 +3530,7 @@ impl Timeline {
};
let io_concurrency = IoConcurrency::spawn_from_conf(
self_ref.conf.get_vectored_concurrent_io,
self_ref.conf,
self_ref
.gate
.enter()
@@ -5563,7 +5559,7 @@ impl Timeline {
});
let io_concurrency = IoConcurrency::spawn_from_conf(
self.conf.get_vectored_concurrent_io,
self.conf,
self.gate
.enter()
.map_err(|_| CreateImageLayersError::Cancelled)?,
@@ -6234,12 +6230,14 @@ impl Timeline {
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
if cfg!(test) && pitr == Duration::ZERO {
if cfg!(test) {
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
return Ok(GcCutoffs {
time: Some(self.get_last_record_lsn()),
space: space_cutoff,
});
if pitr == Duration::ZERO {
return Ok(GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
});
}
}
// Calculate a time-based limit on how much to retain:
@@ -6253,14 +6251,14 @@ impl Timeline {
// PITR is not set. Retain the size-based limit, or the default time retention,
// whichever requires less data.
GcCutoffs {
time: Some(self.get_last_record_lsn()),
time: self.get_last_record_lsn(),
space: std::cmp::max(time_cutoff, space_cutoff),
}
}
(Duration::ZERO, None) => {
// PITR is not set, and time lookup failed
GcCutoffs {
time: Some(self.get_last_record_lsn()),
time: self.get_last_record_lsn(),
space: space_cutoff,
}
}
@@ -6268,7 +6266,7 @@ impl Timeline {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: Some(*self.get_applied_gc_cutoff_lsn()),
time: *self.get_applied_gc_cutoff_lsn(),
space: space_cutoff,
}
}
@@ -6276,7 +6274,7 @@ impl Timeline {
// PITR interval is set and we looked up timestamp successfully. Ignore
// size based retention and make time cutoff authoritative
GcCutoffs {
time: Some(time_cutoff),
time: time_cutoff,
space: time_cutoff,
}
}
@@ -6329,7 +6327,7 @@ impl Timeline {
)
};
let mut new_gc_cutoff = space_cutoff.min(time_cutoff.unwrap_or_default());
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
let standby_horizon = self.standby_horizon.load();
// Hold GC for the standby, but as a safety guard do it only within some
// reasonable lag.
@@ -6378,7 +6376,7 @@ impl Timeline {
async fn gc_timeline(
&self,
space_cutoff: Lsn,
time_cutoff: Option<Lsn>, // None if uninitialized
time_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
max_lsn_with_valid_lease: Option<Lsn>,
new_gc_cutoff: Lsn,
@@ -6397,12 +6395,6 @@ impl Timeline {
return Ok(result);
}
let Some(time_cutoff) = time_cutoff else {
// The GC cutoff should have been computed by now, but let's be defensive.
info!("Nothing to GC: time_cutoff not yet computed");
return Ok(result);
};
// We need to ensure that no one tries to read page versions or create
// branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
// for details. This will block until the old value is no longer in use.

View File

@@ -1526,7 +1526,7 @@ impl Timeline {
info!(
"starting shard ancestor compaction, rewriting {} layers and dropping {} layers, \
checked {layers_checked}/{layers_total} layers \
(latest_gc_cutoff={} pitr_cutoff={:?})",
(latest_gc_cutoff={} pitr_cutoff={})",
layers_to_rewrite.len(),
drop_layers.len(),
*latest_gc_cutoff,

View File

@@ -188,7 +188,7 @@ pub(crate) async fn generate_tombstone_image_layer(
"removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
);
let io_concurrency = IoConcurrency::spawn_from_conf(
detached.conf.get_vectored_concurrent_io,
detached.conf,
detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
);
let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);

View File

@@ -1684,31 +1684,31 @@ mod tests {
// The relation was created at LSN 2, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.await?,
false
);
assert!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.await
.is_err()
);
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
1
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.await?,
3
);
@@ -1719,7 +1719,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x20)),
Version::Lsn(Lsn(0x20)),
&ctx,
io_concurrency.clone()
)
@@ -1733,7 +1733,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x30)),
Version::Lsn(Lsn(0x30)),
&ctx,
io_concurrency.clone()
)
@@ -1747,7 +1747,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x40)),
Version::Lsn(Lsn(0x40)),
&ctx,
io_concurrency.clone()
)
@@ -1760,7 +1760,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1,
Version::at(Lsn(0x40)),
Version::Lsn(Lsn(0x40)),
&ctx,
io_concurrency.clone()
)
@@ -1774,7 +1774,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -1787,7 +1787,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -1800,7 +1800,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
2,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -1820,7 +1820,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
.await?,
2
);
@@ -1829,7 +1829,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x60)),
Version::Lsn(Lsn(0x60)),
&ctx,
io_concurrency.clone()
)
@@ -1842,7 +1842,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1,
Version::at(Lsn(0x60)),
Version::Lsn(Lsn(0x60)),
&ctx,
io_concurrency.clone()
)
@@ -1854,7 +1854,7 @@ mod tests {
// should still see the truncated block with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.await?,
3
);
@@ -1863,7 +1863,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
2,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -1880,7 +1880,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x68)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), &ctx)
.await?,
0
);
@@ -1893,7 +1893,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x70)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), &ctx)
.await?,
2
);
@@ -1902,7 +1902,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
0,
Version::at(Lsn(0x70)),
Version::Lsn(Lsn(0x70)),
&ctx,
io_concurrency.clone()
)
@@ -1915,7 +1915,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1,
Version::at(Lsn(0x70)),
Version::Lsn(Lsn(0x70)),
&ctx,
io_concurrency.clone()
)
@@ -1932,7 +1932,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.await?,
1501
);
@@ -1942,7 +1942,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blk,
Version::at(Lsn(0x80)),
Version::Lsn(Lsn(0x80)),
&ctx,
io_concurrency.clone()
)
@@ -1956,7 +1956,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
1500,
Version::at(Lsn(0x80)),
Version::Lsn(Lsn(0x80)),
&ctx,
io_concurrency.clone()
)
@@ -1990,13 +1990,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
1
);
@@ -2011,7 +2011,7 @@ mod tests {
// Check that rel is not visible anymore
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x30)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), &ctx)
.await?,
false
);
@@ -2029,13 +2029,13 @@ mod tests {
// Check that rel exists and size is correct
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x40)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), &ctx)
.await?,
1
);
@@ -2077,26 +2077,26 @@ mod tests {
// The relation was created at LSN 20, not visible at LSN 1 yet.
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.await?,
false
);
assert!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x10)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), &ctx)
.await
.is_err()
);
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x20)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), &ctx)
.await?,
relsize
);
@@ -2110,7 +2110,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blkno,
Version::at(lsn),
Version::Lsn(lsn),
&ctx,
io_concurrency.clone()
)
@@ -2131,7 +2131,7 @@ mod tests {
// Check reported size and contents after truncation
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x60)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), &ctx)
.await?,
1
);
@@ -2144,7 +2144,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blkno,
Version::at(Lsn(0x60)),
Version::Lsn(Lsn(0x60)),
&ctx,
io_concurrency.clone()
)
@@ -2157,7 +2157,7 @@ mod tests {
// should still see all blocks with older LSN
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x50)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), &ctx)
.await?,
relsize
);
@@ -2169,7 +2169,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blkno,
Version::at(Lsn(0x50)),
Version::Lsn(Lsn(0x50)),
&ctx,
io_concurrency.clone()
)
@@ -2193,13 +2193,13 @@ mod tests {
assert_eq!(
tline
.get_rel_exists(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
.get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.await?,
true
);
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(0x80)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), &ctx)
.await?,
relsize
);
@@ -2212,7 +2212,7 @@ mod tests {
.get_rel_page_at_lsn(
TESTREL_A,
blkno,
Version::at(Lsn(0x80)),
Version::Lsn(Lsn(0x80)),
&ctx,
io_concurrency.clone()
)
@@ -2250,7 +2250,7 @@ mod tests {
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.await?,
RELSEG_SIZE + 1
);
@@ -2264,7 +2264,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.await?,
RELSEG_SIZE
);
@@ -2279,7 +2279,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.await?,
RELSEG_SIZE - 1
);
@@ -2297,7 +2297,7 @@ mod tests {
m.commit(&ctx).await?;
assert_eq!(
tline
.get_rel_size(TESTREL_A, Version::at(Lsn(lsn)), &ctx)
.get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), &ctx)
.await?,
size as BlockNumber
);

View File

@@ -936,44 +936,6 @@ lfc_prewarm_main(Datum main_arg)
lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
}
void
lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
BufferTag tag;
FileCacheEntry *entry;
uint32 hash;
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
CopyNRelFileInfoToBufTag(tag, rinfo);
tag.forkNum = forkNum;
CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
if (LFC_ENABLED())
{
for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk)
{
tag.blockNum = blkno;
hash = get_hash_value(lfc_hash, &tag);
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
if (entry != NULL)
{
for (int i = 0; i < lfc_blocks_per_chunk; i++)
{
if (GET_STATE(entry, i) == AVAILABLE)
{
lfc_ctl->used_pages -= 1;
SET_STATE(entry, i, UNAVAILABLE);
}
}
}
}
}
LWLockRelease(lfc_lock);
}
/*
* Check if page is present in the cache.

View File

@@ -28,7 +28,6 @@ typedef struct FileCacheState
extern bool lfc_store_prefetch_result;
/* functions for local file cache */
extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blkno, const void *const *buffers,
BlockNumber nblocks);

View File

@@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
#define InvalidRelFileNumber InvalidOid
#define SMgrRelGetRelInfo(reln) \
#define SMgrRelGetRelInfo(reln) \
(reln->smgr_rnode.node)
#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
@@ -148,12 +148,6 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
#endif
#define NRelFileInfoInvalidate(rinfo) do { \
NInfoGetSpcOid(rinfo) = InvalidOid; \
NInfoGetDbOid(rinfo) = InvalidOid; \
NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \
} while (0)
#if PG_MAJORVERSION_NUM < 17
#define ProcNumber BackendId
#define INVALID_PROC_NUMBER InvalidBackendId

View File

@@ -108,7 +108,7 @@ typedef enum
UNLOGGED_BUILD_NOT_PERMANENT
} UnloggedBuildPhase;
static NRelFileInfo unlogged_build_rel_info;
static SMgrRelation unlogged_build_rel = NULL;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
@@ -912,19 +912,16 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
{
case 0:
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
{
mdextend(reln, forkNum, blkno, buffer, skipFsync);
return;
}
break;
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
mdextend(reln, forkNum, blkno, buffer, skipFsync);
/* Update LFC in case of unlogged index build */
if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
return;
default:
@@ -1006,19 +1003,21 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
{
case 0:
neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
{
mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
return;
}
break;
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync);
/* Update LFC in case of unlogged index build */
if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
{
for (int i = 0; i < nblocks; i++)
{
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
}
}
return;
default:
@@ -1388,14 +1387,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
{
case 0:
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
{
mdread(reln, forkNum, blkno, buffer);
return;
}
break;
case RELPERSISTENCE_TEMP:
@@ -1481,14 +1474,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
{
case 0:
neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence");
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
{
mdreadv(reln, forknum, blocknum, buffers, nblocks);
return;
}
break;
case RELPERSISTENCE_TEMP:
@@ -1621,15 +1608,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
{
#if PG_MAJORVERSION_NUM >= 17
mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
#else
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
return;
}
break;
case RELPERSISTENCE_TEMP:
@@ -1639,6 +1617,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
#else
mdwrite(reln, forknum, blocknum, buffer, skipFsync);
#endif
/* Update LFC in case of unlogged index build */
if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1699,16 +1680,14 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
{
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
return;
}
break;
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
/* Update LFC in case of unlogged index build */
if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2)
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
return;
default:
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
@@ -1744,10 +1723,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
{
return mdnblocks(reln, forknum);
}
break;
case RELPERSISTENCE_TEMP:
@@ -1817,11 +1792,6 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
break;
case RELPERSISTENCE_PERMANENT:
if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)))
{
mdtruncate(reln, forknum, old_blocks, nblocks);
return;
}
break;
case RELPERSISTENCE_TEMP:
@@ -1960,6 +1930,7 @@ neon_start_unlogged_build(SMgrRelation reln)
*/
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
neon_log(ERROR, "unlogged relation build is already in progress");
Assert(unlogged_build_rel == NULL);
ereport(SmgrTrace,
(errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u",
@@ -1976,7 +1947,7 @@ neon_start_unlogged_build(SMgrRelation reln)
case RELPERSISTENCE_TEMP:
case RELPERSISTENCE_UNLOGGED:
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_rel = reln;
unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
#ifdef DEBUG_COMPARE_LOCAL
if (!IsParallelWorker())
@@ -1997,9 +1968,12 @@ neon_start_unlogged_build(SMgrRelation reln)
neon_log(ERROR, "cannot perform unlogged index build, index is not empty ");
#endif
unlogged_build_rel_info = InfoFromSMgrRel(reln);
unlogged_build_rel = reln;
unlogged_build_phase = UNLOGGED_BUILD_PHASE_1;
/* Make the relation look like it's unlogged */
reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED;
/*
* Create the local file. In a parallel build, the leader is expected to
* call this first and do it.
@@ -2026,16 +2000,17 @@ neon_start_unlogged_build(SMgrRelation reln)
static void
neon_finish_unlogged_build_phase_1(SMgrRelation reln)
{
Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u",
RelFileInfoFmt((unlogged_build_rel_info)))));
RelFileInfoFmt(InfoFromSMgrRel(reln)))));
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
return;
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1);
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
/*
* In a parallel build, (only) the leader process performs the 2nd
@@ -2043,7 +2018,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
*/
if (IsParallelWorker())
{
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
}
else
@@ -2064,11 +2039,11 @@ neon_end_unlogged_build(SMgrRelation reln)
{
NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln);
Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln)));
Assert(unlogged_build_rel == reln);
ereport(SmgrTrace,
(errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u",
RelFileInfoFmt(unlogged_build_rel_info))));
RelFileInfoFmt(InfoFromNInfoB(rinfob)))));
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
{
@@ -2076,6 +2051,7 @@ neon_end_unlogged_build(SMgrRelation reln)
BlockNumber nblocks;
Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2);
Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED);
/*
* Update the last-written LSN cache.
@@ -2096,6 +2072,9 @@ neon_end_unlogged_build(SMgrRelation reln)
InfoFromNInfoB(rinfob),
MAIN_FORKNUM);
/* Make the relation look permanent again */
reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
/* Remove local copy */
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
@@ -2104,8 +2083,6 @@ neon_end_unlogged_build(SMgrRelation reln)
forknum);
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
mdclose(reln, forknum);
#ifndef DEBUG_COMPARE_LOCAL
/* use isRedo == true, so that we drop it immediately */
@@ -2116,7 +2093,7 @@ neon_end_unlogged_build(SMgrRelation reln)
mdunlink(rinfob, INIT_FORKNUM, true);
#endif
}
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
}
@@ -2189,7 +2166,7 @@ AtEOXact_neon(XactEvent event, void *arg)
* Forget about any build we might have had in progress. The local
* file will be unlinked by smgrDoPendingDeletes()
*/
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
break;
@@ -2201,7 +2178,7 @@ AtEOXact_neon(XactEvent event, void *arg)
case XACT_EVENT_PRE_PREPARE:
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS)
{
NRelFileInfoInvalidate(unlogged_build_rel_info);
unlogged_build_rel = NULL;
unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),

32
poetry.lock generated
View File

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand.
[[package]]
name = "aiohappyeyeballs"
@@ -1145,19 +1145,18 @@ dotenv = ["python-dotenv"]
[[package]]
name = "flask-cors"
version = "6.0.0"
description = "A Flask extension simplifying CORS support"
version = "5.0.0"
description = "A Flask extension adding a decorator for CORS support"
optional = false
python-versions = "<4.0,>=3.9"
python-versions = "*"
groups = ["main"]
files = [
{file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"},
{file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"},
{file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"},
{file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"},
]
[package.dependencies]
flask = ">=0.9"
Werkzeug = ">=0.7"
Flask = ">=0.9"
[[package]]
name = "frozenlist"
@@ -3170,24 +3169,19 @@ pbr = "*"
[[package]]
name = "setuptools"
version = "78.1.1"
version = "70.0.0"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.9"
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "setuptools-78.1.1-py3-none-any.whl", hash = "sha256:c3a9c4211ff4c309edb8b8c4f1cbfa7ae324c4ba9f91ff254e3d305b9fd54561"},
{file = "setuptools-78.1.1.tar.gz", hash = "sha256:fcc17fd9cd898242f6b4adfaca46137a9edef687f43e6f78469692a5e70d851d"},
{file = "setuptools-70.0.0-py3-none-any.whl", hash = "sha256:54faa7f2e8d2d11bcd2c07bed282eef1046b5c080d1c32add737d7b5817b1ad4"},
{file = "setuptools-70.0.0.tar.gz", hash = "sha256:f211a66637b8fa059bb28183da127d4e86396c991a942b028c6650d4319c3fd0"},
]
[package.extras]
check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
cover = ["pytest-cov"]
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
enabler = ["pytest-enabler (>=2.2)"]
test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.1)", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"

View File

@@ -127,4 +127,3 @@ rstest.workspace = true
walkdir.workspace = true
rand_distr = "0.4"
tokio-postgres.workspace = true
tracing-test = "0.2"

View File

@@ -80,22 +80,10 @@ impl std::fmt::Display for Backend<'_, ()> {
.field(&endpoint.url())
.finish(),
#[cfg(any(test, feature = "testing"))]
ControlPlaneClient::PostgresMock(endpoint) => {
let url = endpoint.url();
match url::Url::parse(url) {
Ok(mut url) => {
let _ = url.set_password(Some("_redacted_"));
let url = url.as_str();
fmt.debug_tuple("ControlPlane::PostgresMock")
.field(&url)
.finish()
}
Err(_) => fmt
.debug_tuple("ControlPlane::PostgresMock")
.field(&url)
.finish(),
}
}
ControlPlaneClient::PostgresMock(endpoint) => fmt
.debug_tuple("ControlPlane::PostgresMock")
.field(&endpoint.url())
.finish(),
#[cfg(test)]
ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(),
},

View File

@@ -1,13 +1,9 @@
#[cfg(any(test, feature = "testing"))]
use std::env;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::pin::pin;
use std::sync::Arc;
use std::time::Duration;
#[cfg(any(test, feature = "testing"))]
use anyhow::Context;
use anyhow::{bail, ensure};
use arc_swap::ArcSwapOption;
use futures::future::Either;
@@ -39,8 +35,6 @@ use crate::scram::threadpool::ThreadPool;
use crate::serverless::GlobalConnPoolOptions;
use crate::serverless::cancel_set::CancelSet;
use crate::tls::client_config::compute_client_config_with_root_certs;
#[cfg(any(test, feature = "testing"))]
use crate::url::ApiUrl;
use crate::{auth, control_plane, http, serverless, usage_metrics};
project_git_version!(GIT_VERSION);
@@ -167,11 +161,8 @@ struct ProxyCliArgs {
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)]
redis_rps_limit: Vec<RateBucketInfo>,
/// Cancellation channel size (max queue size for redis kv client)
#[clap(long, default_value_t = 1024)]
#[clap(long, default_value = "1024")]
cancellation_ch_size: usize,
/// Cancellation ops batch size for redis
#[clap(long, default_value_t = 8)]
cancellation_batch_size: usize,
/// cache for `allowed_ips` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
allowed_ips_cache: String,
@@ -551,12 +542,7 @@ pub async fn run() -> anyhow::Result<()> {
if let Some(mut redis_kv_client) = redis_kv_client {
maintenance_tasks.spawn(async move {
redis_kv_client.try_connect().await?;
handle_cancel_messages(
&mut redis_kv_client,
rx_cancel,
args.cancellation_batch_size,
)
.await?;
handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
drop(redis_kv_client);
@@ -783,13 +769,7 @@ fn build_auth_backend(
#[cfg(any(test, feature = "testing"))]
AuthBackendType::Postgres => {
let mut url: ApiUrl = args.auth_endpoint.parse()?;
if url.password().is_none() {
let password = env::var("PGPASSWORD")
.with_context(|| "auth-endpoint does not contain a password and environment variable `PGPASSWORD` is not set")?;
url.set_password(Some(&password))
.expect("Failed to set password");
}
let url = args.auth_endpoint.parse()?;
let api = control_plane::client::mock::MockControlPlane::new(
url,
!args.is_private_access_proxy,

View File

@@ -30,6 +30,8 @@ use crate::tls::postgres_rustls::MakeRustlsConnect;
type IpSubnetKey = IpNet;
const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
const BATCH_SIZE: usize = 8;
// Message types for sending through mpsc channel
pub enum CancelKeyOp {
@@ -229,13 +231,12 @@ impl CancelReplyOp {
pub async fn handle_cancel_messages(
client: &mut RedisKVClient,
mut rx: mpsc::Receiver<CancelKeyOp>,
batch_size: usize,
) -> anyhow::Result<()> {
let mut batch = Vec::with_capacity(batch_size);
let mut pipeline = Pipeline::with_capacity(batch_size);
let mut batch = Vec::with_capacity(BATCH_SIZE);
let mut pipeline = Pipeline::with_capacity(BATCH_SIZE);
loop {
if rx.recv_many(&mut batch, batch_size).await == 0 {
if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
warn!("shutting down cancellation queue");
break Ok(());
}
@@ -366,7 +367,8 @@ impl CancellationHandler {
return Err(CancelError::InternalError);
};
tx.try_send(op)
tx.send_timeout(op, REDIS_SEND_TIMEOUT)
.await
.map_err(|e| {
tracing::warn!("failed to send GetCancelData for {key}: {e}");
})
@@ -568,7 +570,7 @@ impl Session {
}
// Send the store key op to the cancellation handler and set TTL for the key
pub(crate) fn write_cancel_key(
pub(crate) async fn write_cancel_key(
&self,
cancel_closure: CancelClosure,
) -> Result<(), CancelError> {
@@ -594,14 +596,14 @@ impl Session {
expire: CANCEL_KEY_TTL,
};
let _ = tx.try_send(op).map_err(|e| {
let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
let key = self.key;
tracing::warn!("failed to send StoreCancelKey for {key}: {e}");
});
Ok(())
}
pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> {
pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> {
let Some(tx) = &self.cancellation_handler.tx else {
tracing::warn!("cancellation handler is not available");
return Err(CancelError::InternalError);
@@ -617,7 +619,7 @@ impl Session {
.guard(RedisMsgKind::HDel),
};
let _ = tx.try_send(op).map_err(|e| {
let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| {
let key = self.key;
tracing::warn!("failed to send RemoveCancelKey for {key}: {e}");
});

View File

@@ -244,7 +244,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session = cancellation_handler_clone.get_key();
session.write_cancel_key(node.cancel_closure.clone())?;
session
.write_cancel_key(node.cancel_closure.clone())
.await?;
prepare_client_connection(&node, *session.key(), &mut stream).await?;

View File

@@ -1,11 +1,13 @@
use std::cell::RefCell;
use std::cell::{Cell, RefCell};
use std::collections::HashMap;
use std::sync::Arc;
use std::hash::BuildHasher;
use std::sync::atomic::{AtomicU32, Ordering};
use std::{env, io};
use std::{array, env, fmt, io};
use chrono::{DateTime, Utc};
use indexmap::IndexSet;
use opentelemetry::trace::TraceContextExt;
use scopeguard::defer;
use serde::ser::{SerializeMap, Serializer};
use tracing::subscriber::Interest;
use tracing::{Event, Metadata, Span, Subscriber, callsite, span};
@@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields};
use tracing_subscriber::layer::{Context, Layer};
use tracing_subscriber::prelude::*;
use tracing_subscriber::registry::{LookupSpan, SpanRef};
use try_lock::TryLock;
/// Initialize logging and OpenTelemetry tracing and exporter.
///
@@ -52,7 +55,7 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
StderrWriter {
stderr: std::io::stderr(),
},
&["request_id", "session_id", "conn_id"],
["request_id", "session_id", "conn_id"],
))
} else {
None
@@ -180,65 +183,50 @@ impl Clock for RealClock {
/// Name of the field used by tracing crate to store the event message.
const MESSAGE_FIELD: &str = "message";
/// Tracing used to enforce that spans/events have no more than 32 fields.
/// It seems this is no longer the case, but it's still documented in some places.
/// Generally, we shouldn't expect more than 32 fields anyway, so we can try and
/// rely on it for some (minor) performance gains.
const MAX_TRACING_FIELDS: usize = 32;
thread_local! {
/// Protects against deadlocks and double panics during log writing.
/// The current panic handler will use tracing to log panic information.
static REENTRANCY_GUARD: Cell<bool> = const { Cell::new(false) };
/// Thread-local instance with per-thread buffer for log writing.
static EVENT_FORMATTER: RefCell<EventFormatter> = const { RefCell::new(EventFormatter::new()) };
static EVENT_FORMATTER: RefCell<EventFormatter> = RefCell::new(EventFormatter::new());
/// Cached OS thread ID.
static THREAD_ID: u64 = gettid::gettid();
}
/// Map for values fixed at callsite registration.
// We use papaya here because registration rarely happens post-startup.
// papaya is good for read-heavy workloads.
//
// We use rustc_hash here because callsite::Identifier will always be an integer with low-bit entropy,
// since it's always a pointer to static mutable data. rustc_hash was designed for low-bit entropy.
type CallsiteMap<T> =
papaya::HashMap<callsite::Identifier, T, std::hash::BuildHasherDefault<rustc_hash::FxHasher>>;
/// Implements tracing layer to handle events specific to logging.
struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
struct JsonLoggingLayer<C: Clock, W: MakeWriter, const F: usize> {
clock: C,
skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
callsite_ids: papaya::HashMap<callsite::Identifier, CallsiteId>,
writer: W,
/// tracks which fields of each **event** are duplicates
skipped_field_indices: CallsiteMap<SkippedFieldIndices>,
span_info: CallsiteMap<CallsiteSpanInfo>,
/// Fields we want to keep track of in a separate json object.
extract_fields: &'static [&'static str],
// We use a const generic and arrays to bypass one heap allocation.
extract_fields: IndexSet<&'static str>,
_marker: std::marker::PhantomData<[&'static str; F]>,
}
impl<C: Clock, W: MakeWriter> JsonLoggingLayer<C, W> {
fn new(clock: C, writer: W, extract_fields: &'static [&'static str]) -> Self {
impl<C: Clock, W: MakeWriter, const F: usize> JsonLoggingLayer<C, W, F> {
fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self {
JsonLoggingLayer {
clock,
skipped_field_indices: CallsiteMap::default(),
span_info: CallsiteMap::default(),
skipped_field_indices: papaya::HashMap::default(),
callsite_ids: papaya::HashMap::default(),
writer,
extract_fields,
extract_fields: IndexSet::from_iter(extract_fields),
_marker: std::marker::PhantomData,
}
}
#[inline]
fn span_info(&self, metadata: &'static Metadata<'static>) -> CallsiteSpanInfo {
self.span_info
fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId {
*self
.callsite_ids
.pin()
.get_or_insert_with(metadata.callsite(), || {
CallsiteSpanInfo::new(metadata, self.extract_fields)
})
.clone()
.get_or_insert_with(cs, CallsiteId::next)
}
}
impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
impl<S, C: Clock + 'static, W: MakeWriter + 'static, const F: usize> Layer<S>
for JsonLoggingLayer<C, W, F>
where
S: Subscriber + for<'a> LookupSpan<'a>,
{
@@ -249,25 +237,35 @@ where
// early, before OTel machinery, and add as event extension.
let now = self.clock.now();
let res: io::Result<()> = EVENT_FORMATTER.with(|f| {
let mut borrow = f.try_borrow_mut();
let formatter = match borrow.as_deref_mut() {
Ok(formatter) => formatter,
// If the thread local formatter is borrowed,
// then we likely hit an edge case were we panicked during formatting.
// We allow the logging to proceed with an uncached formatter.
Err(_) => &mut EventFormatter::new(),
};
let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
if entered.get() {
let mut formatter = EventFormatter::new();
formatter.format::<S, F>(
now,
event,
&ctx,
&self.skipped_field_indices,
&self.callsite_ids,
&self.extract_fields,
)?;
self.writer.make_writer().write_all(formatter.buffer())
} else {
entered.set(true);
defer!(entered.set(false););
formatter.reset();
formatter.format(
now,
event,
&ctx,
&self.skipped_field_indices,
self.extract_fields,
)?;
self.writer.make_writer().write_all(formatter.buffer())
EVENT_FORMATTER.with_borrow_mut(move |formatter| {
formatter.reset();
formatter.format::<S, F>(
now,
event,
&ctx,
&self.skipped_field_indices,
&self.callsite_ids,
&self.extract_fields,
)?;
self.writer.make_writer().write_all(formatter.buffer())
})
}
});
// In case logging fails we generate a simpler JSON object.
@@ -289,48 +287,50 @@ where
/// Registers a SpanFields instance as span extension.
fn on_new_span(&self, attrs: &span::Attributes<'_>, id: &span::Id, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("span must exist");
let fields = SpanFields::default();
fields.record_fields(attrs);
let mut fields = SpanFields::new(self.span_info(span.metadata()));
attrs.record(&mut fields);
// This could deadlock when there's a panic somewhere in the tracing
// event handling and a read or write guard is still held. This includes
// the OTel subscriber.
let mut exts = span.extensions_mut();
// This is a new span: the extensions should not be locked
// unless some layer spawned a thread to process this span.
// I don't think any layers do that.
span.extensions_mut().insert(fields);
exts.insert(fields);
}
fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
let span = ctx.span(id).expect("span must exist");
// assumption: `on_record` is rarely called.
// assumption: a span being updated by one thread,
// and formatted by another thread is even rarer.
let mut ext = span.extensions_mut();
if let Some(fields) = ext.get_mut::<SpanFields>() {
values.record(fields);
let ext = span.extensions();
if let Some(data) = ext.get::<SpanFields>() {
data.record_fields(values);
}
}
/// Called (lazily) roughly once per event/span instance. We quickly check
/// for duplicate field names and record duplicates as skippable. Last field wins.
/// Called (lazily) whenever a new log call is executed. We quickly check
/// for duplicate field names and record duplicates as skippable. Last one
/// wins.
fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
debug_assert!(
metadata.fields().len() <= MAX_TRACING_FIELDS,
"callsite {metadata:?} has too many fields."
);
if !metadata.is_event() {
// register the span info.
self.span_info(metadata);
self.callsite_id(metadata.callsite());
// Must not be never because we wouldn't get trace and span data.
return Interest::always();
}
let mut field_indices = SkippedFieldIndices::default();
let mut seen_fields = HashMap::new();
let mut seen_fields = HashMap::<&'static str, usize>::new();
for field in metadata.fields() {
if let Some(old_index) = seen_fields.insert(field.name(), field.index()) {
field_indices.set(old_index);
use std::collections::hash_map::Entry;
match seen_fields.entry(field.name()) {
Entry::Vacant(entry) => {
// field not seen yet
entry.insert(field.index());
}
Entry::Occupied(mut entry) => {
// replace currently stored index
let old_index = entry.insert(field.index());
// ... and append it to list of skippable indices
field_indices.push(old_index);
}
}
}
@@ -344,113 +344,110 @@ where
}
}
/// Any span info that is fixed to a particular callsite. Not variable between span instances.
#[derive(Clone)]
struct CallsiteSpanInfo {
/// index of each field to extract. usize::MAX if not found.
extract: Arc<[usize]>,
#[derive(Copy, Clone, Debug, Default)]
#[repr(transparent)]
struct CallsiteId(u32);
/// tracks the fixed "callsite ID" for each span.
/// note: this is not stable between runs.
normalized_name: Arc<str>,
}
impl CallsiteSpanInfo {
fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self {
impl CallsiteId {
#[inline]
fn next() -> Self {
// Start at 1 to reserve 0 for default.
static COUNTER: AtomicU32 = AtomicU32::new(1);
CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed))
}
}
let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect();
// get all the indices of span fields we want to focus
let extract = extract_fields
.iter()
// use rposition, since we want last match wins.
.map(|f1| names.iter().rposition(|f2| f1 == f2).unwrap_or(usize::MAX))
.collect();
// normalized_name is unique for each callsite, but it is not
// unified across separate proxy instances.
// todo: can we do better here?
let cid = COUNTER.fetch_add(1, Ordering::Relaxed);
let normalized_name = format!("{}#{cid}", metadata.name()).into();
Self {
extract,
normalized_name,
}
impl fmt::Display for CallsiteId {
#[inline]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
self.0.fmt(f)
}
}
/// Stores span field values recorded during the spans lifetime.
#[derive(Default)]
struct SpanFields {
values: [serde_json::Value; MAX_TRACING_FIELDS],
/// cached span info so we can avoid extra hashmap lookups in the hot path.
span_info: CallsiteSpanInfo,
// TODO: Switch to custom enum with lasso::Spur for Strings?
fields: papaya::HashMap<&'static str, serde_json::Value>,
}
impl SpanFields {
fn new(span_info: CallsiteSpanInfo) -> Self {
Self {
span_info,
values: [const { serde_json::Value::Null }; MAX_TRACING_FIELDS],
}
#[inline]
fn record_fields<R: tracing_subscriber::field::RecordFields>(&self, fields: R) {
fields.record(&mut SpanFieldsRecorder {
fields: self.fields.pin(),
});
}
}
impl tracing::field::Visit for SpanFields {
/// Implements a tracing field visitor to convert and store values.
struct SpanFieldsRecorder<'m, S, G> {
fields: papaya::HashMapRef<'m, &'static str, serde_json::Value, S, G>,
}
impl<S: BuildHasher, G: papaya::Guard> tracing::field::Visit for SpanFieldsRecorder<'_, S, G> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
if let Ok(value) = i64::try_from(value) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
} else {
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
#[inline]
fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
if let Ok(value) = u64::try_from(value) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
} else {
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
#[inline]
fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
self.values[field.index()] = serde_json::Value::from(value);
self.fields
.insert(field.name(), serde_json::Value::from(value));
}
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
self.values[field.index()] = serde_json::Value::from(format!("{value:?}"));
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value:?}")));
}
#[inline]
@@ -459,33 +456,38 @@ impl tracing::field::Visit for SpanFields {
field: &tracing::field::Field,
value: &(dyn std::error::Error + 'static),
) {
self.values[field.index()] = serde_json::Value::from(format!("{value}"));
self.fields
.insert(field.name(), serde_json::Value::from(format!("{value}")));
}
}
/// List of field indices skipped during logging. Can list duplicate fields or
/// metafields not meant to be logged.
#[derive(Copy, Clone, Default)]
#[derive(Clone, Default)]
struct SkippedFieldIndices {
// 32-bits is large enough for `MAX_TRACING_FIELDS`
bits: u32,
bits: u64,
}
impl SkippedFieldIndices {
#[inline]
fn is_empty(self) -> bool {
fn is_empty(&self) -> bool {
self.bits == 0
}
#[inline]
fn set(&mut self, index: usize) {
debug_assert!(index <= 32, "index out of bounds of 32-bit set");
self.bits |= 1 << index;
fn push(&mut self, index: usize) {
self.bits |= 1u64
.checked_shl(index as u32)
.expect("field index too large");
}
#[inline]
fn contains(self, index: usize) -> bool {
self.bits & (1 << index) != 0
fn contains(&self, index: usize) -> bool {
self.bits
& 1u64
.checked_shl(index as u32)
.expect("field index too large")
!= 0
}
}
@@ -497,7 +499,7 @@ struct EventFormatter {
impl EventFormatter {
#[inline]
const fn new() -> Self {
fn new() -> Self {
EventFormatter {
logline_buffer: Vec::new(),
}
@@ -513,13 +515,14 @@ impl EventFormatter {
self.logline_buffer.clear();
}
fn format<S>(
fn format<S, const F: usize>(
&mut self,
now: DateTime<Utc>,
event: &Event<'_>,
ctx: &Context<'_, S>,
skipped_field_indices: &CallsiteMap<SkippedFieldIndices>,
extract_fields: &'static [&'static str],
skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
callsite_ids: &papaya::HashMap<callsite::Identifier, CallsiteId>,
extract_fields: &IndexSet<&'static str>,
) -> io::Result<()>
where
S: Subscriber + for<'a> LookupSpan<'a>,
@@ -530,11 +533,8 @@ impl EventFormatter {
let normalized_meta = event.normalized_metadata();
let meta = normalized_meta.as_ref().unwrap_or_else(|| event.metadata());
let skipped_field_indices = skipped_field_indices
.pin()
.get(&meta.callsite())
.copied()
.unwrap_or_default();
let skipped_field_indices = skipped_field_indices.pin();
let skipped_field_indices = skipped_field_indices.get(&meta.callsite());
let mut serialize = || {
let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);
@@ -565,11 +565,9 @@ impl EventFormatter {
}
let spans = SerializableSpans {
// collect all spans from parent to root.
spans: ctx
.event_span(event)
.map_or(vec![], |parent| parent.scope().collect()),
extracted: ExtractedSpanFields::new(extract_fields),
ctx,
callsite_ids,
extract: ExtractedSpanFields::<'_, F>::new(extract_fields),
};
serializer.serialize_entry("spans", &spans)?;
@@ -622,9 +620,9 @@ impl EventFormatter {
}
}
if spans.extracted.has_values() {
if spans.extract.has_values() {
// TODO: add fields from event, too?
serializer.serialize_entry("extract", &spans.extracted)?;
serializer.serialize_entry("extract", &spans.extract)?;
}
serializer.end()
@@ -637,15 +635,15 @@ impl EventFormatter {
}
/// Extracts the message field that's mixed will other fields.
struct MessageFieldExtractor<S: serde::ser::SerializeMap> {
struct MessageFieldExtractor<'a, S: serde::ser::SerializeMap> {
serializer: S,
skipped_field_indices: SkippedFieldIndices,
skipped_field_indices: Option<&'a SkippedFieldIndices>,
state: Option<Result<(), S::Error>>,
}
impl<S: serde::ser::SerializeMap> MessageFieldExtractor<S> {
impl<'a, S: serde::ser::SerializeMap> MessageFieldExtractor<'a, S> {
#[inline]
fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
Self {
serializer,
skipped_field_indices,
@@ -667,11 +665,13 @@ impl<S: serde::ser::SerializeMap> MessageFieldExtractor<S> {
fn accept_field(&self, field: &tracing::field::Field) -> bool {
self.state.is_none()
&& field.name() == MESSAGE_FIELD
&& !self.skipped_field_indices.contains(field.index())
&& !self
.skipped_field_indices
.is_some_and(|i| i.contains(field.index()))
}
}
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<S> {
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<'_, S> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
if self.accept_field(field) {
@@ -751,14 +751,14 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtracto
/// can be skipped.
// This is entirely optional and only cosmetic, though maybe helps a
// bit during log parsing in dashboards when there's no field with empty object.
struct FieldsPresent(pub bool, SkippedFieldIndices);
struct FieldsPresent<'a>(pub bool, Option<&'a SkippedFieldIndices>);
// Even though some methods have an overhead (error, bytes) it is assumed the
// compiler won't include this since we ignore the value entirely.
impl tracing::field::Visit for FieldsPresent {
impl tracing::field::Visit for FieldsPresent<'_> {
#[inline]
fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
if !self.1.contains(field.index())
if !self.1.is_some_and(|i| i.contains(field.index()))
&& field.name() != MESSAGE_FIELD
&& !field.name().starts_with("log.")
{
@@ -768,7 +768,10 @@ impl tracing::field::Visit for FieldsPresent {
}
/// Serializes the fields directly supplied with a log event.
struct SerializableEventFields<'a, 'event>(&'a tracing::Event<'event>, SkippedFieldIndices);
struct SerializableEventFields<'a, 'event>(
&'a tracing::Event<'event>,
Option<&'a SkippedFieldIndices>,
);
impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
@@ -785,15 +788,15 @@ impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
}
/// A tracing field visitor that skips the message field.
struct MessageFieldSkipper<S: serde::ser::SerializeMap> {
struct MessageFieldSkipper<'a, S: serde::ser::SerializeMap> {
serializer: S,
skipped_field_indices: SkippedFieldIndices,
skipped_field_indices: Option<&'a SkippedFieldIndices>,
state: Result<(), S::Error>,
}
impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
impl<'a, S: serde::ser::SerializeMap> MessageFieldSkipper<'a, S> {
#[inline]
fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
fn new(serializer: S, skipped_field_indices: Option<&'a SkippedFieldIndices>) -> Self {
Self {
serializer,
skipped_field_indices,
@@ -806,7 +809,9 @@ impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
self.state.is_ok()
&& field.name() != MESSAGE_FIELD
&& !field.name().starts_with("log.")
&& !self.skipped_field_indices.contains(field.index())
&& !self
.skipped_field_indices
.is_some_and(|i| i.contains(field.index()))
}
#[inline]
@@ -816,7 +821,7 @@ impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
}
}
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<S> {
impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<'_, S> {
#[inline]
fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
if self.accept_field(field) {
@@ -900,17 +905,18 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<
/// with the span names as keys. To prevent collision we append a numberic value
/// to the name. Also, collects any span fields we're interested in. Last one
/// wins.
struct SerializableSpans<'ctx, S>
struct SerializableSpans<'a, 'ctx, Span, const F: usize>
where
S: for<'lookup> LookupSpan<'lookup>,
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
{
spans: Vec<SpanRef<'ctx, S>>,
extracted: ExtractedSpanFields,
ctx: &'a Context<'ctx, Span>,
callsite_ids: &'a papaya::HashMap<callsite::Identifier, CallsiteId>,
extract: ExtractedSpanFields<'a, F>,
}
impl<S> serde::ser::Serialize for SerializableSpans<'_, S>
impl<Span, const F: usize> serde::ser::Serialize for SerializableSpans<'_, '_, Span, F>
where
S: for<'lookup> LookupSpan<'lookup>,
Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
{
fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
where
@@ -918,22 +924,25 @@ where
{
let mut serializer = serializer.serialize_map(None)?;
for span in self.spans.iter().rev() {
let ext = span.extensions();
if let Some(leaf_span) = self.ctx.lookup_current() {
for span in leaf_span.scope().from_root() {
// Append a numeric callsite ID to the span name to keep the name unique
// in the JSON object.
let cid = self
.callsite_ids
.pin()
.get(&span.metadata().callsite())
.copied()
.unwrap_or_default();
// all spans should have this extension.
let Some(fields) = ext.get() else { continue };
// Loki turns the # into an underscore during field name concatenation.
serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?;
self.extracted.layer_span(fields);
let SpanFields { values, span_info } = fields;
serializer.serialize_entry(
&*span_info.normalized_name,
&SerializableSpanFields {
fields: span.metadata().fields(),
values,
},
)?;
serializer.serialize_value(&SerializableSpanFields {
span: &span,
extract: &self.extract,
})?;
}
}
serializer.end()
@@ -941,77 +950,80 @@ where
}
/// Serializes the span fields as object.
struct SerializableSpanFields<'span> {
fields: &'span tracing::field::FieldSet,
values: &'span [serde_json::Value; MAX_TRACING_FIELDS],
struct SerializableSpanFields<'a, 'span, Span, const F: usize>
where
Span: for<'lookup> LookupSpan<'lookup>,
{
span: &'a SpanRef<'span, Span>,
extract: &'a ExtractedSpanFields<'a, F>,
}
impl serde::ser::Serialize for SerializableSpanFields<'_> {
impl<Span, const F: usize> serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F>
where
Span: for<'lookup> LookupSpan<'lookup>,
{
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
{
let mut serializer = serializer.serialize_map(None)?;
for (field, value) in std::iter::zip(self.fields, self.values) {
if value.is_null() {
continue;
let ext = self.span.extensions();
if let Some(data) = ext.get::<SpanFields>() {
for (name, value) in &data.fields.pin() {
serializer.serialize_entry(name, value)?;
// TODO: replace clone with reference, if possible.
self.extract.set(name, value.clone());
}
serializer.serialize_entry(field.name(), value)?;
}
serializer.end()
}
}
struct ExtractedSpanFields {
names: &'static [&'static str],
values: RefCell<Vec<serde_json::Value>>,
struct ExtractedSpanFields<'a, const F: usize> {
names: &'a IndexSet<&'static str>,
// TODO: replace TryLock with something local thread and interior mutability.
// serde API doesn't let us use `mut`.
values: TryLock<([Option<serde_json::Value>; F], bool)>,
}
impl ExtractedSpanFields {
fn new(names: &'static [&'static str]) -> Self {
impl<'a, const F: usize> ExtractedSpanFields<'a, F> {
fn new(names: &'a IndexSet<&'static str>) -> Self {
ExtractedSpanFields {
names,
values: RefCell::new(vec![serde_json::Value::Null; names.len()]),
values: TryLock::new((array::from_fn(|_| Option::default()), false)),
}
}
fn layer_span(&self, fields: &SpanFields) {
let mut v = self.values.borrow_mut();
let SpanFields { values, span_info } = fields;
// extract the fields
for (i, &j) in span_info.extract.iter().enumerate() {
let Some(value) = values.get(j) else { continue };
if !value.is_null() {
// TODO: replace clone with reference, if possible.
v[i] = value.clone();
}
#[inline]
fn set(&self, name: &'static str, value: serde_json::Value) {
if let Some((index, _)) = self.names.get_full(name) {
let mut fields = self.values.try_lock().expect("thread-local use");
fields.0[index] = Some(value);
fields.1 = true;
}
}
#[inline]
fn has_values(&self) -> bool {
self.values.borrow().iter().any(|v| !v.is_null())
self.values.try_lock().expect("thread-local use").1
}
}
impl serde::ser::Serialize for ExtractedSpanFields {
impl<const F: usize> serde::ser::Serialize for ExtractedSpanFields<'_, F> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::ser::Serializer,
{
let mut serializer = serializer.serialize_map(None)?;
let values = self.values.borrow();
for (key, value) in std::iter::zip(self.names, &*values) {
if value.is_null() {
continue;
let values = self.values.try_lock().expect("thread-local use");
for (i, value) in values.0.iter().enumerate() {
if let Some(value) = value {
let key = self.names[i];
serializer.serialize_entry(key, value)?;
}
serializer.serialize_entry(key, value)?;
}
serializer.end()
@@ -1020,6 +1032,7 @@ impl serde::ser::Serialize for ExtractedSpanFields {
#[cfg(test)]
mod tests {
use std::marker::PhantomData;
use std::sync::{Arc, Mutex, MutexGuard};
use assert_json_diff::assert_json_eq;
@@ -1068,9 +1081,10 @@ mod tests {
let log_layer = JsonLoggingLayer {
clock: clock.clone(),
skipped_field_indices: papaya::HashMap::default(),
span_info: papaya::HashMap::default(),
callsite_ids: papaya::HashMap::default(),
writer: buffer.clone(),
extract_fields: &["x"],
extract_fields: IndexSet::from_iter(["x"]),
_marker: PhantomData::<[&'static str; 1]>,
};
let registry = tracing_subscriber::Registry::default().with(log_layer);

View File

@@ -383,7 +383,9 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
let session = cancellation_handler_clone.get_key();
session.write_cancel_key(node.cancel_closure.clone())?;
session
.write_cancel_key(node.cancel_closure.clone())
.await?;
prepare_client_connection(&node, *session.key(), &mut stream).await?;

View File

@@ -94,7 +94,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database");
}
drop(self.cancel.remove_cancel_key()); // we don't need a result. If the queue is full, we just log the error
drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error
res
}

View File

@@ -48,7 +48,7 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
use postgres_client::error::SqlState;
// Here are errors that happens after the user successfully authenticated to the database.
// TODO: there are pgbouncer errors that should be retried, but they are not listed here.
let non_retriable_pg_errors = matches!(
!matches!(
self.code(),
&SqlState::TOO_MANY_CONNECTIONS
| &SqlState::OUT_OF_MEMORY
@@ -56,20 +56,8 @@ impl ShouldRetryWakeCompute for postgres_client::error::DbError {
| &SqlState::T_R_SERIALIZATION_FAILURE
| &SqlState::INVALID_CATALOG_NAME
| &SqlState::INVALID_SCHEMA_NAME
| &SqlState::INVALID_PARAMETER_VALUE,
);
if non_retriable_pg_errors {
return false;
}
// PGBouncer errors that should not trigger a wake_compute retry.
if self.code() == &SqlState::PROTOCOL_VIOLATION {
// Source for the error message:
// https://github.com/pgbouncer/pgbouncer/blob/f15997fe3effe3a94ba8bcc1ea562e6117d1a131/src/client.c#L1070
return !self
.message()
.contains("no more connections allowed (max_client_conn)");
}
true
| &SqlState::INVALID_PARAMETER_VALUE
)
}
}
@@ -122,55 +110,3 @@ pub(crate) fn retry_after(num_retries: u32, config: RetryConfig) -> time::Durati
.base_delay
.mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
}
#[cfg(test)]
mod tests {
use super::ShouldRetryWakeCompute;
use postgres_client::error::{DbError, SqlState};
#[test]
fn should_retry_wake_compute_for_db_error() {
// These SQLStates should NOT trigger a wake_compute retry.
let non_retry_states = [
SqlState::TOO_MANY_CONNECTIONS,
SqlState::OUT_OF_MEMORY,
SqlState::SYNTAX_ERROR,
SqlState::T_R_SERIALIZATION_FAILURE,
SqlState::INVALID_CATALOG_NAME,
SqlState::INVALID_SCHEMA_NAME,
SqlState::INVALID_PARAMETER_VALUE,
];
for state in non_retry_states {
let err = DbError::new_test_error(state.clone(), "oops".to_string());
assert!(
!err.should_retry_wake_compute(),
"State {state:?} unexpectedly retried"
);
}
// Errors coming from pgbouncer should not trigger a wake_compute retry
let non_retry_pgbouncer_errors = ["no more connections allowed (max_client_conn)"];
for error in non_retry_pgbouncer_errors {
let err = DbError::new_test_error(SqlState::PROTOCOL_VIOLATION, error.to_string());
assert!(
!err.should_retry_wake_compute(),
"PGBouncer error {error:?} unexpectedly retried"
);
}
// These SQLStates should trigger a wake_compute retry.
let retry_states = [
SqlState::CONNECTION_FAILURE,
SqlState::CONNECTION_EXCEPTION,
SqlState::CONNECTION_DOES_NOT_EXIST,
SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
];
for state in retry_states {
let err = DbError::new_test_error(state.clone(), "oops".to_string());
assert!(
err.should_retry_wake_compute(),
"State {state:?} unexpectedly skipped retry"
);
}
}
}

View File

@@ -15,7 +15,6 @@ use rstest::rstest;
use rustls::crypto::ring;
use rustls::pki_types;
use tokio::io::DuplexStream;
use tracing_test::traced_test;
use super::connect_compute::ConnectMechanism;
use super::retry::CouldRetry;
@@ -382,14 +381,8 @@ enum ConnectAction {
WakeFail,
WakeRetry,
Connect,
// connect_once -> Err, could_retry = true, should_retry_wake_compute = true
Retry,
// connect_once -> Err, could_retry = true, should_retry_wake_compute = false
RetryNoWake,
// connect_once -> Err, could_retry = false, should_retry_wake_compute = true
Fail,
// connect_once -> Err, could_retry = false, should_retry_wake_compute = false
FailNoWake,
}
#[derive(Clone)]
@@ -431,7 +424,6 @@ struct TestConnection;
#[derive(Debug)]
struct TestConnectError {
retryable: bool,
wakeable: bool,
kind: crate::error::ErrorKind,
}
@@ -456,7 +448,7 @@ impl CouldRetry for TestConnectError {
}
impl ShouldRetryWakeCompute for TestConnectError {
fn should_retry_wake_compute(&self) -> bool {
self.wakeable
true
}
}
@@ -479,22 +471,10 @@ impl ConnectMechanism for TestConnectMechanism {
ConnectAction::Connect => Ok(TestConnection),
ConnectAction::Retry => Err(TestConnectError {
retryable: true,
wakeable: true,
kind: ErrorKind::Compute,
}),
ConnectAction::RetryNoWake => Err(TestConnectError {
retryable: true,
wakeable: false,
kind: ErrorKind::Compute,
}),
ConnectAction::Fail => Err(TestConnectError {
retryable: false,
wakeable: true,
kind: ErrorKind::Compute,
}),
ConnectAction::FailNoWake => Err(TestConnectError {
retryable: false,
wakeable: false,
kind: ErrorKind::Compute,
}),
x => panic!("expecting action {x:?}, connect is called instead"),
@@ -729,92 +709,3 @@ async fn wake_non_retry() {
.unwrap_err();
mechanism.verify();
}
#[tokio::test]
#[traced_test]
async fn fail_but_wake_invalidates_cache() {
let ctx = RequestContext::test();
let mech = TestConnectMechanism::new(vec![
ConnectAction::Wake,
ConnectAction::Fail,
ConnectAction::Wake,
ConnectAction::Connect,
]);
let user = helper_create_connect_info(&mech);
let cfg = config();
connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
.await
.unwrap();
assert!(logs_contain(
"invalidating stalled compute node info cache entry"
));
}
#[tokio::test]
#[traced_test]
async fn fail_no_wake_skips_cache_invalidation() {
let ctx = RequestContext::test();
let mech = TestConnectMechanism::new(vec![
ConnectAction::Wake,
ConnectAction::FailNoWake,
ConnectAction::Connect,
]);
let user = helper_create_connect_info(&mech);
let cfg = config();
connect_to_compute(&ctx, &mech, &user, cfg.retry, &cfg)
.await
.unwrap();
assert!(!logs_contain(
"invalidating stalled compute node info cache entry"
));
}
#[tokio::test]
#[traced_test]
async fn retry_but_wake_invalidates_cache() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestContext::test();
// Wake → Retry (retryable + wakeable) → Wake → Connect
let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
let user_info = helper_create_connect_info(&mechanism);
let cfg = config();
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
.await
.unwrap();
mechanism.verify();
// Because Retry has wakeable=true, we should see invalidate_cache
assert!(logs_contain(
"invalidating stalled compute node info cache entry"
));
}
#[tokio::test]
#[traced_test]
async fn retry_no_wake_skips_invalidation() {
let _ = env_logger::try_init();
use ConnectAction::*;
let ctx = RequestContext::test();
// Wake → RetryNoWake (retryable + NOT wakeable)
let mechanism = TestConnectMechanism::new(vec![Wake, RetryNoWake]);
let user_info = helper_create_connect_info(&mechanism);
let cfg = config();
connect_to_compute(&ctx, &mechanism, &user_info, cfg.retry, &cfg)
.await
.unwrap_err();
mechanism.verify();
// Because RetryNoWake has wakeable=false, we must NOT see invalidate_cache
assert!(!logs_contain(
"invalidating stalled compute node info cache entry"
));
}

View File

@@ -13,19 +13,22 @@ pub(crate) struct Pbkdf2 {
// inspired from <https://github.com/neondatabase/rust-postgres/blob/20031d7a9ee1addeae6e0968e3899ae6bf01cee2/postgres-protocol/src/authentication/sasl.rs#L36-L61>
impl Pbkdf2 {
pub(crate) fn start(str: &[u8], salt: &[u8], iterations: u32) -> Self {
// key the HMAC and derive the first block in-place
let mut hmac =
let hmac =
Hmac::<Sha256>::new_from_slice(str).expect("HMAC is able to accept all key sizes");
hmac.update(salt);
hmac.update(&1u32.to_be_bytes());
let init_block = hmac.finalize_reset().into_bytes();
let prev = hmac
.clone()
.chain_update(salt)
.chain_update(1u32.to_be_bytes())
.finalize()
.into_bytes();
Self {
hmac,
// one iteration spent above
// one consumed for the hash above
iterations: iterations - 1,
hi: init_block,
prev: init_block,
hi: prev,
prev,
}
}
@@ -41,17 +44,14 @@ impl Pbkdf2 {
iterations,
} = self;
// only do up to 4096 iterations per turn for fairness
// only do 4096 iterations per turn before sharing the thread for fairness
let n = (*iterations).clamp(0, 4096);
for _ in 0..n {
hmac.update(prev);
let block = hmac.finalize_reset().into_bytes();
*prev = hmac.clone().chain_update(*prev).finalize().into_bytes();
for (hi_byte, &b) in hi.iter_mut().zip(block.iter()) {
*hi_byte ^= b;
for (hi, prev) in hi.iter_mut().zip(*prev) {
*hi ^= prev;
}
*prev = block;
}
*iterations -= n;

View File

@@ -43,12 +43,6 @@ impl std::ops::Deref for ApiUrl {
}
}
impl std::ops::DerefMut for ApiUrl {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl std::fmt::Display for ApiUrl {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)

View File

@@ -184,7 +184,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
"pageserver_evictions_with_low_residence_duration_total",
"pageserver_aux_file_estimated_size",
"pageserver_valid_lsn_lease_count",
"pageserver_tenant_offloaded_timelines",
counter("pageserver_tenant_throttling_count_accounted_start"),
counter("pageserver_tenant_throttling_count_accounted_finish"),
counter("pageserver_tenant_throttling_wait_usecs_sum"),

View File

@@ -103,7 +103,7 @@ class AbstractNeonCli:
else:
stdout = ""
log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}")
log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}")
raise
indent = " "

View File

@@ -187,7 +187,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
"args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
},
"rel_size_v2_enabled": True,
"relsize_snapshot_cache_capacity": 10000,
"gc_compaction_enabled": True,
"gc_compaction_verification": False,
"gc_compaction_initial_threshold_kb": 1024000,

View File

@@ -19,16 +19,6 @@ TEST_ROLE_NAMES = [
{"name": "role$"},
{"name": "role$$"},
{"name": "role$x$"},
{"name": "x"},
{"name": "xx"},
{"name": "$x"},
{"name": "x$"},
{"name": "$x$"},
{"name": "xx$"},
{"name": "$xx"},
{"name": "$xx$"},
# 63 bytes is the limit for role/DB names in Postgres
{"name": "x" * 63},
]
TEST_DB_NAMES = [
@@ -84,43 +74,6 @@ TEST_DB_NAMES = [
"name": "db name$x$",
"owner": "role$x$",
},
{
"name": "x",
"owner": "x",
},
{
"name": "xx",
"owner": "xx",
},
{
"name": "$x",
"owner": "$x",
},
{
"name": "x$",
"owner": "x$",
},
{
"name": "$x$",
"owner": "$x$",
},
{
"name": "xx$",
"owner": "xx$",
},
{
"name": "$xx",
"owner": "$xx",
},
{
"name": "$xx$",
"owner": "$xx$",
},
# 63 bytes is the limit for role/DB names in Postgres
{
"name": "x" * 63,
"owner": "x" * 63,
},
]
@@ -193,10 +146,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
"""
Test that compute_ctl can create and work with databases and roles
with special characters (whitespaces, %, tabs, etc.) in the name.
Also use `drop_subscriptions_before_start: true`. We do not actually
have any subscriptions in this test, so it should be no-op, but it
i) simulates the case when we create a second dev branch together with
a new project creation, and ii) just generally stresses more code paths.
"""
env = neon_simple_env
@@ -210,7 +159,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
**{
"spec": {
"skip_pg_catalog_updates": False,
"drop_subscriptions_before_start": True,
"cluster": {
"roles": TEST_ROLE_NAMES,
"databases": TEST_DB_NAMES,
@@ -254,7 +202,6 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
**{
"spec": {
"skip_pg_catalog_updates": False,
"drop_subscriptions_before_start": True,
"cluster": {
"roles": [],
"databases": [],

View File

@@ -510,7 +510,7 @@ def list_elegible_layers(
except KeyError:
# Unexpected: tests should call this when pageservers are in a quiet state such that the layer map
# matches what's on disk.
log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}")
raise
return list(c for c in candidates if is_visible(c))
@@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
except:
# On assertion failures, log some details to help with debugging
heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}")
raise
# Scrub the remote storage

View File

@@ -27,9 +27,8 @@ from contextlib import closing
import psycopg2
import pytest
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, PgBin, wait_for_last_flush_lsn, wait_replica_caughtup
from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from fixtures.utils import query_scalar, skip_on_postgres, wait_until
@@ -696,110 +695,3 @@ def test_replica_start_with_too_many_unused_xids(neon_simple_env: NeonEnv):
with secondary.cursor() as secondary_cur:
secondary_cur.execute("select count(*) from t")
assert secondary_cur.fetchone() == (n_restarts,)
def test_ephemeral_endpoints_vacuum(neon_simple_env: NeonEnv, pg_bin: PgBin):
env = neon_simple_env
endpoint = env.endpoints.create_start("main")
sql = """
CREATE TABLE CHAR_TBL(f1 char(4));
CREATE TABLE FLOAT8_TBL(f1 float8);
CREATE TABLE INT2_TBL(f1 int2);
CREATE TABLE INT4_TBL(f1 int4);
CREATE TABLE INT8_TBL(q1 int8, q2 int8);
CREATE TABLE POINT_TBL(f1 point);
CREATE TABLE TEXT_TBL (f1 text);
CREATE TABLE VARCHAR_TBL(f1 varchar(4));
CREATE TABLE onek (unique1 int4);
CREATE TABLE onek2 AS SELECT * FROM onek;
CREATE TABLE tenk1 (unique1 int4);
CREATE TABLE tenk2 AS SELECT * FROM tenk1;
CREATE TABLE person (name text, age int4,location point);
CREATE TABLE emp (salary int4, manager name) INHERITS (person);
CREATE TABLE student (gpa float8) INHERITS (person);
CREATE TABLE stud_emp ( percent int4) INHERITS (emp, student);
CREATE TABLE road (name text,thepath path);
CREATE TABLE ihighway () INHERITS (road);
CREATE TABLE shighway(surface text) INHERITS (road);
CREATE TABLE BOOLTBL3 (d text, b bool, o int);
CREATE TABLE booltbl4(isfalse bool, istrue bool, isnul bool);
DROP TABLE BOOLTBL3;
DROP TABLE BOOLTBL4;
CREATE TABLE ceil_floor_round (a numeric);
DROP TABLE ceil_floor_round;
CREATE TABLE width_bucket_test (operand_num numeric, operand_f8 float8);
DROP TABLE width_bucket_test;
CREATE TABLE num_input_test (n1 numeric);
CREATE TABLE num_variance (a numeric);
INSERT INTO num_variance VALUES (0);
CREATE TABLE snapshot_test (nr integer, snap txid_snapshot);
CREATE TABLE guid1(guid_field UUID, text_field TEXT DEFAULT(now()));
CREATE TABLE guid2(guid_field UUID, text_field TEXT DEFAULT(now()));
CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field);
CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field);
TRUNCATE guid1;
DROP TABLE guid1;
DROP TABLE guid2 CASCADE;
CREATE TABLE numrange_test (nr NUMRANGE);
CREATE INDEX numrange_test_btree on numrange_test(nr);
CREATE TABLE numrange_test2(nr numrange);
CREATE INDEX numrange_test2_hash_idx on numrange_test2 using hash (nr);
INSERT INTO numrange_test2 VALUES('[, 5)');
CREATE TABLE textrange_test (tr text);
CREATE INDEX textrange_test_btree on textrange_test(tr);
CREATE TABLE test_range_gist(ir int4range);
CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
DROP INDEX test_range_gist_idx;
CREATE INDEX test_range_gist_idx on test_range_gist using gist (ir);
CREATE TABLE test_range_spgist(ir int4range);
CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
DROP INDEX test_range_spgist_idx;
CREATE INDEX test_range_spgist_idx on test_range_spgist using spgist (ir);
CREATE TABLE test_range_elem(i int4);
CREATE INDEX test_range_elem_idx on test_range_elem (i);
CREATE INDEX ON test_range_elem using spgist(int4range(i,i+10));
DROP TABLE test_range_elem;
CREATE TABLE test_range_excl(room int4range, speaker int4range, during tsrange, exclude using gist (room with =, during with &&), exclude using gist (speaker with =, during with &&));
CREATE TABLE f_test(f text, i int);
CREATE TABLE i8r_array (f1 int, f2 text);
CREATE TYPE arrayrange as range (subtype=int4[]);
CREATE TYPE two_ints as (a int, b int);
DROP TYPE two_ints cascade;
CREATE TABLE text_support_test (t text);
CREATE TABLE TEMP_FLOAT (f1 FLOAT8);
CREATE TABLE TEMP_INT4 (f1 INT4);
CREATE TABLE TEMP_INT2 (f1 INT2);
CREATE TABLE TEMP_GROUP (f1 INT4, f2 INT4, f3 FLOAT8);
CREATE TABLE POLYGON_TBL(f1 polygon);
CREATE TABLE quad_poly_tbl (id int, p polygon);
INSERT INTO quad_poly_tbl SELECT (x - 1) * 100 + y, polygon(circle(point(x * 10, y * 10), 1 + (x + y) % 10)) FROM generate_series(1, 200) x, generate_series(1, 100) y;
CREATE TABLE quad_poly_tbl_ord_seq2 AS SELECT 1 FROM quad_poly_tbl;
CREATE TABLE quad_poly_tbl_ord_idx2 AS SELECT 1 FROM quad_poly_tbl;
"""
with endpoint.cursor() as cur:
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
env.endpoints.create_start(branch_name="main", lsn=lsn)
log.info(f"lsn: {lsn}")
for line in sql.split("\n"):
if len(line.strip()) == 0 or line.startswith("--"):
continue
cur.execute(line)
lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
env.endpoints.create_start(branch_name="main", lsn=lsn)
log.info(f"lsn: {lsn}")
cur.execute("VACUUM FULL pg_class;")
for ep in env.endpoints.endpoints:
log.info(f"{ep.endpoint_id} / {ep.pg_port}")
pg_dump_command = ["pg_dumpall", "-f", f"/tmp/dump-{ep.endpoint_id}.sql"]
env_vars = {
"PGPORT": str(ep.pg_port),
"PGUSER": endpoint.default_options["user"],
"PGHOST": endpoint.default_options["host"],
}
pg_bin.run_capture(pg_dump_command, env=env_vars)

View File

@@ -193,11 +193,6 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
"test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent"
)
offloaded_count = ps_http.get_metric_value(
"pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
)
assert offloaded_count == 0
ps_http.timeline_archival_config(
tenant_id,
leaf_timeline_id,
@@ -249,11 +244,6 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b
wait_until(leaf_offloaded)
wait_until(parent_offloaded)
offloaded_count = ps_http.get_metric_value(
"pageserver_tenant_offloaded_timelines", {"tenant_id": f"{tenant_id}"}
)
assert offloaded_count == 2
# Offloaded child timelines should still prevent deletion
with pytest.raises(
PageserverApiException,

View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.5",
"8be779fd3ab9e87206da96a7e4842ef1abf04f44"
"e5374b72997b0afc8374137674e873f7a558120a"
],
"v16": [
"16.9",
"0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198"
"15710a76b7d07912110fcbbaf0c8ad6d7e5a9fbc"
],
"v15": [
"15.13",
"de7640f55da07512834d5cc40c4b3fb376b5f04f"
"daa81cffcf063c54b29a9aabdb6604625f675ad0"
],
"v14": [
"14.18",
"55c0d45abe6467c02084c2192bca117eda6ce1e7"
"4cca6f8083483dda9e12eae292cf788d45bd561f"
]
}

View File

@@ -60,8 +60,7 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st
libc = { version = "0.2", features = ["extra_traits", "use_std"] }
log = { version = "0.4", default-features = false, features = ["std"] }
memchr = { version = "2" }
nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" }
nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] }
nix = { version = "0.26" }
nom = { version = "7" }
num = { version = "0.4" }
num-bigint = { version = "0.4" }
@@ -107,7 +106,6 @@ tower = { version = "0.4", default-features = false, features = ["balance", "buf
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
tracing-log = { version = "0.2" }
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
url = { version = "2", features = ["serde"] }
uuid = { version = "1", features = ["serde", "v4", "v7"] }
zeroize = { version = "1", features = ["derive", "serde"] }