mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 05:00:38 +00:00
Compare commits
25 Commits
conrad/pro
...
test-pull-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bb45db3982 | ||
|
|
597125e124 | ||
|
|
e71d20d392 | ||
|
|
aa0554fd1e | ||
|
|
b853f78136 | ||
|
|
6ad99826c1 | ||
|
|
311ee793b9 | ||
|
|
ad472bd4a1 | ||
|
|
c51db1db61 | ||
|
|
34c1295594 | ||
|
|
b593e51eae | ||
|
|
4c4cb80186 | ||
|
|
92273b6d5e | ||
|
|
e74e7aac93 | ||
|
|
4cca5cdb12 | ||
|
|
9d425b54f7 | ||
|
|
ec790870d5 | ||
|
|
4d7111f240 | ||
|
|
b1fd086c0c | ||
|
|
b6eea65597 | ||
|
|
c42c28b339 | ||
|
|
e4837b0a5a | ||
|
|
14c4fae64a | ||
|
|
cc70fc802d | ||
|
|
fa07097f2f |
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -21,3 +21,5 @@ config-variables:
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
- DEV_AWS_OIDC_ROLE_ARN
|
||||
- BENCHMARK_INGEST_TARGET_PROJECTID
|
||||
- PGREGRESS_PG16_PROJECT_ID
|
||||
- PGREGRESS_PG17_PROJECT_ID
|
||||
|
||||
32
.github/workflows/cloud-regress.yml
vendored
32
.github/workflows/cloud-regress.yml
vendored
@@ -23,11 +23,14 @@ jobs:
|
||||
regress:
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 16
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
pg-version: [16, 17]
|
||||
|
||||
runs-on: us-east-2
|
||||
container:
|
||||
@@ -40,9 +43,11 @@ jobs:
|
||||
submodules: true
|
||||
|
||||
- name: Patch the test
|
||||
env:
|
||||
PG_VERSION: ${{matrix.pg-version}}
|
||||
run: |
|
||||
cd "vendor/postgres-v${DEFAULT_PG_VERSION}"
|
||||
patch -p1 < "../../compute/patches/cloud_regress_pg${DEFAULT_PG_VERSION}.patch"
|
||||
cd "vendor/postgres-v${PG_VERSION}"
|
||||
patch -p1 < "../../compute/patches/cloud_regress_pg${PG_VERSION}.patch"
|
||||
|
||||
- name: Generate a random password
|
||||
id: pwgen
|
||||
@@ -55,8 +60,9 @@ jobs:
|
||||
- name: Change tests according to the generated password
|
||||
env:
|
||||
DBPASS: ${{ steps.pwgen.outputs.DBPASS }}
|
||||
PG_VERSION: ${{matrix.pg-version}}
|
||||
run: |
|
||||
cd vendor/postgres-v"${DEFAULT_PG_VERSION}"/src/test/regress
|
||||
cd vendor/postgres-v"${PG_VERSION}"/src/test/regress
|
||||
for fname in sql/*.sql expected/*.out; do
|
||||
sed -i.bak s/NEON_PASSWORD_PLACEHOLDER/"'${DBPASS}'"/ "${fname}"
|
||||
done
|
||||
@@ -73,15 +79,29 @@ jobs:
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create a new branch
|
||||
id: create-branch
|
||||
uses: ./.github/actions/neon-branch-create
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
|
||||
|
||||
- name: Run the regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: cloud_regress
|
||||
pg_version: ${{ env.DEFAULT_PG_VERSION }}
|
||||
pg_version: ${{matrix.pg-version}}
|
||||
extra_params: -m remote_cluster
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ secrets.PG_REGRESS_CONNSTR }}
|
||||
BENCHMARK_CONNSTR: ${{steps.create-branch.outputs.dsn}}
|
||||
|
||||
- name: Delete branch
|
||||
uses: ./.github/actions/neon-branch-delete
|
||||
with:
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
project_id: ${{ vars[format('PGREGRESS_PG{0}_PROJECT_ID', matrix.pg-version)] }}
|
||||
branch_id: ${{steps.create-branch.outputs.branch_id}}
|
||||
|
||||
- name: Create Allure report
|
||||
id: create-allure-report
|
||||
|
||||
33
CODEOWNERS
33
CODEOWNERS
@@ -1,16 +1,29 @@
|
||||
/.github/ @neondatabase/developer-productivity
|
||||
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/proxy/ @neondatabase/proxy
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/storage
|
||||
# Autoscaling
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
/pageserver/ @neondatabase/storage
|
||||
|
||||
# DevProd
|
||||
/.github/ @neondatabase/developer-productivity
|
||||
|
||||
# Compute
|
||||
/pgxn/ @neondatabase/compute
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
|
||||
/vendor/ @neondatabase/compute
|
||||
/compute/ @neondatabase/compute
|
||||
/compute_tools/ @neondatabase/compute
|
||||
|
||||
# Proxy
|
||||
/libs/proxy/ @neondatabase/proxy
|
||||
/proxy/ @neondatabase/proxy
|
||||
|
||||
# Storage
|
||||
/pageserver/ @neondatabase/storage
|
||||
/safekeeper/ @neondatabase/storage
|
||||
/storage_controller @neondatabase/storage
|
||||
/storage_scrubber @neondatabase/storage
|
||||
/vendor/ @neondatabase/compute
|
||||
/libs/pageserver_api/ @neondatabase/storage
|
||||
/libs/remote_storage/ @neondatabase/storage
|
||||
/libs/safekeeper_api/ @neondatabase/storage
|
||||
|
||||
# Shared
|
||||
/pgxn/neon/ @neondatabase/compute @neondatabase/storage
|
||||
/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane
|
||||
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage
|
||||
|
||||
515
Cargo.lock
generated
515
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
10
Cargo.toml
10
Cargo.toml
@@ -51,10 +51,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
arc-swap = "1.6"
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
|
||||
atomic-take = "1.1.0"
|
||||
azure_core = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
|
||||
azure_identity = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage_blobs = { version = "0.19", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
@@ -216,6 +212,12 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
|
||||
|
||||
## Azure SDK crates
|
||||
azure_core = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls", "hmac_rust"] }
|
||||
azure_identity = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
azure_storage_blobs = { git = "https://github.com/neondatabase/azure-sdk-for-rust.git", branch = "neon", default-features = false, features = ["enable_reqwest_rustls"] }
|
||||
|
||||
## Local libraries
|
||||
compute_api = { version = "0.1", path = "./libs/compute_api/" }
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
|
||||
@@ -115,7 +115,7 @@ RUN set -e \
|
||||
|
||||
# Keep the version the same as in compute/compute-node.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
ENV SQL_EXPORTER_VERSION=0.13.1
|
||||
ENV SQL_EXPORTER_VERSION=0.16.0
|
||||
RUN curl -fsSL \
|
||||
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
|
||||
--output sql_exporter.tar.gz \
|
||||
|
||||
@@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
|
||||
|
||||
# Keep the version the same as in build-tools.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
|
||||
FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
4047
compute/patches/cloud_regress_pg17.patch
Normal file
4047
compute/patches/cloud_regress_pg17.patch
Normal file
File diff suppressed because it is too large
Load Diff
@@ -810,7 +810,7 @@ impl Endpoint {
|
||||
}
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
.timeout(Duration::from_secs(120))
|
||||
.build()
|
||||
.unwrap();
|
||||
let response = client
|
||||
|
||||
@@ -42,6 +42,7 @@ allow = [
|
||||
"MPL-2.0",
|
||||
"OpenSSL",
|
||||
"Unicode-DFS-2016",
|
||||
"Unicode-3.0",
|
||||
]
|
||||
confidence-threshold = 0.8
|
||||
exceptions = [
|
||||
|
||||
@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
|
||||
}
|
||||
}
|
||||
|
||||
/// Scheduling policy enables us to selectively disable some automatic actions that the
|
||||
/// controller performs on a tenant shard. This is only set to a non-default value by
|
||||
/// human intervention, and it is reset to the default value (Active) when the tenant's
|
||||
/// placement policy is modified away from Attached.
|
||||
///
|
||||
/// The typical use of a non-Active scheduling policy is one of:
|
||||
/// - Pinnning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
|
||||
/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
|
||||
///
|
||||
/// If you're not sure which policy to use to pin a shard to its current location, you probably
|
||||
/// want Pause.
|
||||
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
|
||||
pub enum ShardSchedulingPolicy {
|
||||
// Normal mode: the tenant's scheduled locations may be updated at will, including
|
||||
|
||||
@@ -4,18 +4,23 @@ use crate::config::Host;
|
||||
use crate::config::SslMode;
|
||||
use crate::connection::{Request, RequestMessages};
|
||||
|
||||
use crate::types::{Oid, Type};
|
||||
use crate::query::RowStream;
|
||||
use crate::simple_query::SimpleQueryStream;
|
||||
|
||||
use crate::types::{Oid, ToSql, Type};
|
||||
|
||||
use crate::{
|
||||
simple_query, CancelToken, Error, ReadyForQueryStatus, Statement, Transaction,
|
||||
TransactionBuilder,
|
||||
prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
|
||||
SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder,
|
||||
};
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use futures_util::{future, ready};
|
||||
use futures_util::{future, ready, TryStreamExt};
|
||||
use parking_lot::Mutex;
|
||||
use postgres_protocol2::message::{backend::Message, frontend};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
@@ -50,7 +55,7 @@ impl Responses {
|
||||
/// A cache of type info and prepared statements for fetching type info
|
||||
/// (corresponding to the queries in the [prepare] module).
|
||||
#[derive(Default)]
|
||||
pub(crate) struct CachedTypeInfo {
|
||||
struct CachedTypeInfo {
|
||||
/// A statement for basic information for a type from its
|
||||
/// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its
|
||||
/// fallback).
|
||||
@@ -66,45 +71,13 @@ pub(crate) struct CachedTypeInfo {
|
||||
/// Cache of types already looked up.
|
||||
types: HashMap<Oid, Type>,
|
||||
}
|
||||
impl CachedTypeInfo {
|
||||
pub(crate) fn typeinfo(&mut self) -> Option<&Statement> {
|
||||
self.typeinfo.as_ref()
|
||||
}
|
||||
|
||||
pub(crate) fn set_typeinfo(&mut self, statement: Statement) -> &Statement {
|
||||
self.typeinfo.insert(statement)
|
||||
}
|
||||
|
||||
pub(crate) fn typeinfo_composite(&mut self) -> Option<&Statement> {
|
||||
self.typeinfo_composite.as_ref()
|
||||
}
|
||||
|
||||
pub(crate) fn set_typeinfo_composite(&mut self, statement: Statement) -> &Statement {
|
||||
self.typeinfo_composite.insert(statement)
|
||||
}
|
||||
|
||||
pub(crate) fn typeinfo_enum(&mut self) -> Option<&Statement> {
|
||||
self.typeinfo_enum.as_ref()
|
||||
}
|
||||
|
||||
pub(crate) fn set_typeinfo_enum(&mut self, statement: Statement) -> &Statement {
|
||||
self.typeinfo_enum.insert(statement)
|
||||
}
|
||||
|
||||
pub(crate) fn type_(&mut self, oid: Oid) -> Option<Type> {
|
||||
self.types.get(&oid).cloned()
|
||||
}
|
||||
|
||||
pub(crate) fn set_type(&mut self, oid: Oid, type_: &Type) {
|
||||
self.types.insert(oid, type_.clone());
|
||||
}
|
||||
}
|
||||
|
||||
pub struct InnerClient {
|
||||
sender: mpsc::UnboundedSender<Request>,
|
||||
cached_typeinfo: Mutex<CachedTypeInfo>,
|
||||
|
||||
/// A buffer to use when writing out postgres commands.
|
||||
buffer: BytesMut,
|
||||
buffer: Mutex<BytesMut>,
|
||||
}
|
||||
|
||||
impl InnerClient {
|
||||
@@ -119,14 +92,47 @@ impl InnerClient {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn typeinfo(&self) -> Option<Statement> {
|
||||
self.cached_typeinfo.lock().typeinfo.clone()
|
||||
}
|
||||
|
||||
pub fn set_typeinfo(&self, statement: &Statement) {
|
||||
self.cached_typeinfo.lock().typeinfo = Some(statement.clone());
|
||||
}
|
||||
|
||||
pub fn typeinfo_composite(&self) -> Option<Statement> {
|
||||
self.cached_typeinfo.lock().typeinfo_composite.clone()
|
||||
}
|
||||
|
||||
pub fn set_typeinfo_composite(&self, statement: &Statement) {
|
||||
self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone());
|
||||
}
|
||||
|
||||
pub fn typeinfo_enum(&self) -> Option<Statement> {
|
||||
self.cached_typeinfo.lock().typeinfo_enum.clone()
|
||||
}
|
||||
|
||||
pub fn set_typeinfo_enum(&self, statement: &Statement) {
|
||||
self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone());
|
||||
}
|
||||
|
||||
pub fn type_(&self, oid: Oid) -> Option<Type> {
|
||||
self.cached_typeinfo.lock().types.get(&oid).cloned()
|
||||
}
|
||||
|
||||
pub fn set_type(&self, oid: Oid, type_: &Type) {
|
||||
self.cached_typeinfo.lock().types.insert(oid, type_.clone());
|
||||
}
|
||||
|
||||
/// Call the given function with a buffer to be used when writing out
|
||||
/// postgres commands.
|
||||
pub fn with_buf<F, R>(&mut self, f: F) -> R
|
||||
pub fn with_buf<F, R>(&self, f: F) -> R
|
||||
where
|
||||
F: FnOnce(&mut BytesMut) -> R,
|
||||
{
|
||||
let r = f(&mut self.buffer);
|
||||
self.buffer.clear();
|
||||
let mut buffer = self.buffer.lock();
|
||||
let r = f(&mut buffer);
|
||||
buffer.clear();
|
||||
r
|
||||
}
|
||||
}
|
||||
@@ -144,8 +150,7 @@ pub struct SocketConfig {
|
||||
/// The client is one half of what is returned when a connection is established. Users interact with the database
|
||||
/// through this client object.
|
||||
pub struct Client {
|
||||
pub(crate) inner: InnerClient,
|
||||
pub(crate) cached_typeinfo: CachedTypeInfo,
|
||||
inner: Arc<InnerClient>,
|
||||
|
||||
socket_config: SocketConfig,
|
||||
ssl_mode: SslMode,
|
||||
@@ -162,11 +167,11 @@ impl Client {
|
||||
secret_key: i32,
|
||||
) -> Client {
|
||||
Client {
|
||||
inner: InnerClient {
|
||||
inner: Arc::new(InnerClient {
|
||||
sender,
|
||||
cached_typeinfo: Default::default(),
|
||||
buffer: Default::default(),
|
||||
},
|
||||
cached_typeinfo: Default::default(),
|
||||
}),
|
||||
|
||||
socket_config,
|
||||
ssl_mode,
|
||||
@@ -180,6 +185,161 @@ impl Client {
|
||||
self.process_id
|
||||
}
|
||||
|
||||
pub(crate) fn inner(&self) -> &Arc<InnerClient> {
|
||||
&self.inner
|
||||
}
|
||||
|
||||
/// Creates a new prepared statement.
|
||||
///
|
||||
/// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
|
||||
/// which are set when executed. Prepared statements can only be used with the connection that created them.
|
||||
pub async fn prepare(&self, query: &str) -> Result<Statement, Error> {
|
||||
self.prepare_typed(query, &[]).await
|
||||
}
|
||||
|
||||
/// Like `prepare`, but allows the types of query parameters to be explicitly specified.
|
||||
///
|
||||
/// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be
|
||||
/// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`.
|
||||
pub async fn prepare_typed(
|
||||
&self,
|
||||
query: &str,
|
||||
parameter_types: &[Type],
|
||||
) -> Result<Statement, Error> {
|
||||
prepare::prepare(&self.inner, query, parameter_types).await
|
||||
}
|
||||
|
||||
/// Executes a statement, returning a vector of the resulting rows.
|
||||
///
|
||||
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
|
||||
/// provided, 1-indexed.
|
||||
///
|
||||
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
|
||||
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
|
||||
/// with the `prepare` method.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the number of parameters provided does not match the number expected.
|
||||
pub async fn query<T>(
|
||||
&self,
|
||||
statement: &T,
|
||||
params: &[&(dyn ToSql + Sync)],
|
||||
) -> Result<Vec<Row>, Error>
|
||||
where
|
||||
T: ?Sized + ToStatement,
|
||||
{
|
||||
self.query_raw(statement, slice_iter(params))
|
||||
.await?
|
||||
.try_collect()
|
||||
.await
|
||||
}
|
||||
|
||||
/// The maximally flexible version of [`query`].
|
||||
///
|
||||
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
|
||||
/// provided, 1-indexed.
|
||||
///
|
||||
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
|
||||
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
|
||||
/// with the `prepare` method.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the number of parameters provided does not match the number expected.
|
||||
///
|
||||
/// [`query`]: #method.query
|
||||
pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<RowStream, Error>
|
||||
where
|
||||
T: ?Sized + ToStatement,
|
||||
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
|
||||
I::IntoIter: ExactSizeIterator,
|
||||
{
|
||||
let statement = statement.__convert().into_statement(self).await?;
|
||||
query::query(&self.inner, statement, params).await
|
||||
}
|
||||
|
||||
/// Pass text directly to the Postgres backend to allow it to sort out typing itself and
|
||||
/// to save a roundtrip
|
||||
pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
|
||||
where
|
||||
S: AsRef<str>,
|
||||
I: IntoIterator<Item = Option<S>>,
|
||||
I::IntoIter: ExactSizeIterator,
|
||||
{
|
||||
query::query_txt(&self.inner, statement, params).await
|
||||
}
|
||||
|
||||
/// Executes a statement, returning the number of rows modified.
|
||||
///
|
||||
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
|
||||
/// provided, 1-indexed.
|
||||
///
|
||||
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
|
||||
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
|
||||
/// with the `prepare` method.
|
||||
///
|
||||
/// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the number of parameters provided does not match the number expected.
|
||||
pub async fn execute<T>(
|
||||
&self,
|
||||
statement: &T,
|
||||
params: &[&(dyn ToSql + Sync)],
|
||||
) -> Result<u64, Error>
|
||||
where
|
||||
T: ?Sized + ToStatement,
|
||||
{
|
||||
self.execute_raw(statement, slice_iter(params)).await
|
||||
}
|
||||
|
||||
/// The maximally flexible version of [`execute`].
|
||||
///
|
||||
/// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
|
||||
/// provided, 1-indexed.
|
||||
///
|
||||
/// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
|
||||
/// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
|
||||
/// with the `prepare` method.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the number of parameters provided does not match the number expected.
|
||||
///
|
||||
/// [`execute`]: #method.execute
|
||||
pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<u64, Error>
|
||||
where
|
||||
T: ?Sized + ToStatement,
|
||||
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
|
||||
I::IntoIter: ExactSizeIterator,
|
||||
{
|
||||
let statement = statement.__convert().into_statement(self).await?;
|
||||
query::execute(self.inner(), statement, params).await
|
||||
}
|
||||
|
||||
/// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
|
||||
///
|
||||
/// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that
|
||||
/// point. The simple query protocol returns the values in rows as strings rather than in their binary encodings,
|
||||
/// so the associated row type doesn't work with the `FromSql` trait. Rather than simply returning a list of the
|
||||
/// rows, this method returns a list of an enum which indicates either the completion of one of the commands,
|
||||
/// or a row of data. This preserves the framing between the separate statements in the request.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// Prepared statements should be use for any query which contains user-specified data, as they provided the
|
||||
/// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
|
||||
/// them to this method!
|
||||
pub async fn simple_query(&self, query: &str) -> Result<Vec<SimpleQueryMessage>, Error> {
|
||||
self.simple_query_raw(query).await?.try_collect().await
|
||||
}
|
||||
|
||||
pub(crate) async fn simple_query_raw(&self, query: &str) -> Result<SimpleQueryStream, Error> {
|
||||
simple_query::simple_query(self.inner(), query).await
|
||||
}
|
||||
|
||||
/// Executes a sequence of SQL statements using the simple query protocol.
|
||||
///
|
||||
/// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that
|
||||
@@ -190,8 +350,8 @@ impl Client {
|
||||
/// Prepared statements should be use for any query which contains user-specified data, as they provided the
|
||||
/// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass
|
||||
/// them to this method!
|
||||
pub async fn batch_execute(&mut self, query: &str) -> Result<ReadyForQueryStatus, Error> {
|
||||
simple_query::batch_execute(&mut self.inner, query).await
|
||||
pub async fn batch_execute(&self, query: &str) -> Result<ReadyForQueryStatus, Error> {
|
||||
simple_query::batch_execute(self.inner(), query).await
|
||||
}
|
||||
|
||||
/// Begins a new database transaction.
|
||||
@@ -199,7 +359,7 @@ impl Client {
|
||||
/// The transaction will roll back by default - use the `commit` method to commit it.
|
||||
pub async fn transaction(&mut self) -> Result<Transaction<'_>, Error> {
|
||||
struct RollbackIfNotDone<'me> {
|
||||
client: &'me mut Client,
|
||||
client: &'me Client,
|
||||
done: bool,
|
||||
}
|
||||
|
||||
@@ -209,13 +369,13 @@ impl Client {
|
||||
return;
|
||||
}
|
||||
|
||||
let buf = self.client.inner.with_buf(|buf| {
|
||||
let buf = self.client.inner().with_buf(|buf| {
|
||||
frontend::query("ROLLBACK", buf).unwrap();
|
||||
buf.split().freeze()
|
||||
});
|
||||
let _ = self
|
||||
.client
|
||||
.inner
|
||||
.inner()
|
||||
.send(RequestMessages::Single(FrontendMessage::Raw(buf)));
|
||||
}
|
||||
}
|
||||
@@ -230,7 +390,7 @@ impl Client {
|
||||
client: self,
|
||||
done: false,
|
||||
};
|
||||
cleaner.client.batch_execute("BEGIN").await?;
|
||||
self.batch_execute("BEGIN").await?;
|
||||
cleaner.done = true;
|
||||
}
|
||||
|
||||
@@ -256,6 +416,11 @@ impl Client {
|
||||
}
|
||||
}
|
||||
|
||||
/// Query for type information
|
||||
pub async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
|
||||
crate::prepare::get_type(&self.inner, oid).await
|
||||
}
|
||||
|
||||
/// Determines if the connection to the server has already closed.
|
||||
///
|
||||
/// In that case, all future queries will fail.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::query::{self, RowStream};
|
||||
use crate::query::RowStream;
|
||||
use crate::types::Type;
|
||||
use crate::{Client, Error, Transaction};
|
||||
use async_trait::async_trait;
|
||||
@@ -13,32 +13,33 @@ mod private {
|
||||
/// This trait is "sealed", and cannot be implemented outside of this crate.
|
||||
#[async_trait]
|
||||
pub trait GenericClient: private::Sealed {
|
||||
async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
|
||||
/// Like `Client::query_raw_txt`.
|
||||
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
|
||||
where
|
||||
S: AsRef<str> + Sync + Send,
|
||||
I: IntoIterator<Item = Option<S>> + Sync + Send,
|
||||
I::IntoIter: ExactSizeIterator + Sync + Send;
|
||||
|
||||
/// Query for type information
|
||||
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error>;
|
||||
async fn get_type(&self, oid: Oid) -> Result<Type, Error>;
|
||||
}
|
||||
|
||||
impl private::Sealed for Client {}
|
||||
|
||||
#[async_trait]
|
||||
impl GenericClient for Client {
|
||||
async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
|
||||
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
|
||||
where
|
||||
S: AsRef<str> + Sync + Send,
|
||||
I: IntoIterator<Item = Option<S>> + Sync + Send,
|
||||
I::IntoIter: ExactSizeIterator + Sync + Send,
|
||||
{
|
||||
query::query_txt(&mut self.inner, statement, params).await
|
||||
self.query_raw_txt(statement, params).await
|
||||
}
|
||||
|
||||
/// Query for type information
|
||||
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
|
||||
crate::prepare::get_type(&mut self.inner, &mut self.cached_typeinfo, oid).await
|
||||
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
|
||||
self.get_type(oid).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,18 +48,17 @@ impl private::Sealed for Transaction<'_> {}
|
||||
#[async_trait]
|
||||
#[allow(clippy::needless_lifetimes)]
|
||||
impl GenericClient for Transaction<'_> {
|
||||
async fn query_raw_txt<S, I>(&mut self, statement: &str, params: I) -> Result<RowStream, Error>
|
||||
async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
|
||||
where
|
||||
S: AsRef<str> + Sync + Send,
|
||||
I: IntoIterator<Item = Option<S>> + Sync + Send,
|
||||
I::IntoIter: ExactSizeIterator + Sync + Send,
|
||||
{
|
||||
query::query_txt(&mut self.client().inner, statement, params).await
|
||||
self.query_raw_txt(statement, params).await
|
||||
}
|
||||
|
||||
/// Query for type information
|
||||
async fn get_type(&mut self, oid: Oid) -> Result<Type, Error> {
|
||||
let client = self.client();
|
||||
crate::prepare::get_type(&mut client.inner, &mut client.cached_typeinfo, oid).await
|
||||
async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
|
||||
self.client().get_type(oid).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -10,10 +10,11 @@ use crate::error::DbError;
|
||||
pub use crate::error::Error;
|
||||
pub use crate::generic_client::GenericClient;
|
||||
pub use crate::query::RowStream;
|
||||
pub use crate::row::Row;
|
||||
pub use crate::row::{Row, SimpleQueryRow};
|
||||
pub use crate::simple_query::SimpleQueryStream;
|
||||
pub use crate::statement::{Column, Statement};
|
||||
pub use crate::tls::NoTls;
|
||||
// pub use crate::to_statement::ToStatement;
|
||||
pub use crate::to_statement::ToStatement;
|
||||
pub use crate::transaction::Transaction;
|
||||
pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
|
||||
use crate::types::ToSql;
|
||||
@@ -64,7 +65,7 @@ pub mod row;
|
||||
mod simple_query;
|
||||
mod statement;
|
||||
pub mod tls;
|
||||
// mod to_statement;
|
||||
mod to_statement;
|
||||
mod transaction;
|
||||
mod transaction_builder;
|
||||
pub mod types;
|
||||
@@ -97,6 +98,7 @@ impl Notification {
|
||||
/// An asynchronous message from the server.
|
||||
#[allow(clippy::large_enum_variant)]
|
||||
#[derive(Debug, Clone)]
|
||||
#[non_exhaustive]
|
||||
pub enum AsyncMessage {
|
||||
/// A notice.
|
||||
///
|
||||
@@ -108,6 +110,18 @@ pub enum AsyncMessage {
|
||||
Notification(Notification),
|
||||
}
|
||||
|
||||
/// Message returned by the `SimpleQuery` stream.
|
||||
#[derive(Debug)]
|
||||
#[non_exhaustive]
|
||||
pub enum SimpleQueryMessage {
|
||||
/// A row of data.
|
||||
Row(SimpleQueryRow),
|
||||
/// A statement in the query has completed.
|
||||
///
|
||||
/// The number of rows modified or selected is returned.
|
||||
CommandComplete(u64),
|
||||
}
|
||||
|
||||
fn slice_iter<'a>(
|
||||
s: &'a [&'a (dyn ToSql + Sync)],
|
||||
) -> impl ExactSizeIterator<Item = &'a (dyn ToSql + Sync)> + 'a {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::client::{CachedTypeInfo, InnerClient};
|
||||
use crate::client::InnerClient;
|
||||
use crate::codec::FrontendMessage;
|
||||
use crate::connection::RequestMessages;
|
||||
use crate::error::SqlState;
|
||||
@@ -7,13 +7,14 @@ use crate::{query, slice_iter};
|
||||
use crate::{Column, Error, Statement};
|
||||
use bytes::Bytes;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use futures_util::{pin_mut, StreamExt, TryStreamExt};
|
||||
use futures_util::{pin_mut, TryStreamExt};
|
||||
use log::debug;
|
||||
use postgres_protocol2::message::backend::Message;
|
||||
use postgres_protocol2::message::frontend;
|
||||
use std::future::Future;
|
||||
use std::pin::{pin, Pin};
|
||||
use std::pin::Pin;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
pub(crate) const TYPEINFO_QUERY: &str = "\
|
||||
SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid
|
||||
@@ -58,8 +59,7 @@ ORDER BY attnum
|
||||
static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
|
||||
|
||||
pub async fn prepare(
|
||||
client: &mut InnerClient,
|
||||
cache: &mut CachedTypeInfo,
|
||||
client: &Arc<InnerClient>,
|
||||
query: &str,
|
||||
types: &[Type],
|
||||
) -> Result<Statement, Error> {
|
||||
@@ -86,7 +86,7 @@ pub async fn prepare(
|
||||
let mut parameters = vec![];
|
||||
let mut it = parameter_description.parameters();
|
||||
while let Some(oid) = it.next().map_err(Error::parse)? {
|
||||
let type_ = get_type(client, cache, oid).await?;
|
||||
let type_ = get_type(client, oid).await?;
|
||||
parameters.push(type_);
|
||||
}
|
||||
|
||||
@@ -94,30 +94,24 @@ pub async fn prepare(
|
||||
if let Some(row_description) = row_description {
|
||||
let mut it = row_description.fields();
|
||||
while let Some(field) = it.next().map_err(Error::parse)? {
|
||||
let type_ = get_type(client, cache, field.type_oid()).await?;
|
||||
let type_ = get_type(client, field.type_oid()).await?;
|
||||
let column = Column::new(field.name().to_string(), type_, field);
|
||||
columns.push(column);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Statement::new(name, parameters, columns))
|
||||
Ok(Statement::new(client, name, parameters, columns))
|
||||
}
|
||||
|
||||
fn prepare_rec<'a>(
|
||||
client: &'a mut InnerClient,
|
||||
cache: &'a mut CachedTypeInfo,
|
||||
client: &'a Arc<InnerClient>,
|
||||
query: &'a str,
|
||||
types: &'a [Type],
|
||||
) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
|
||||
Box::pin(prepare(client, cache, query, types))
|
||||
Box::pin(prepare(client, query, types))
|
||||
}
|
||||
|
||||
fn encode(
|
||||
client: &mut InnerClient,
|
||||
name: &str,
|
||||
query: &str,
|
||||
types: &[Type],
|
||||
) -> Result<Bytes, Error> {
|
||||
fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
|
||||
if types.is_empty() {
|
||||
debug!("preparing query {}: {}", name, query);
|
||||
} else {
|
||||
@@ -132,20 +126,16 @@ fn encode(
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn get_type(
|
||||
client: &mut InnerClient,
|
||||
cache: &mut CachedTypeInfo,
|
||||
oid: Oid,
|
||||
) -> Result<Type, Error> {
|
||||
pub async fn get_type(client: &Arc<InnerClient>, oid: Oid) -> Result<Type, Error> {
|
||||
if let Some(type_) = Type::from_oid(oid) {
|
||||
return Ok(type_);
|
||||
}
|
||||
|
||||
if let Some(type_) = cache.type_(oid) {
|
||||
if let Some(type_) = client.type_(oid) {
|
||||
return Ok(type_);
|
||||
}
|
||||
|
||||
let stmt = typeinfo_statement(client, cache).await?;
|
||||
let stmt = typeinfo_statement(client).await?;
|
||||
|
||||
let rows = query::query(client, stmt, slice_iter(&[&oid])).await?;
|
||||
pin_mut!(rows);
|
||||
@@ -155,141 +145,118 @@ pub async fn get_type(
|
||||
None => return Err(Error::unexpected_message()),
|
||||
};
|
||||
|
||||
let name: String = row.try_get(stmt.columns(), 0)?;
|
||||
let type_: i8 = row.try_get(stmt.columns(), 1)?;
|
||||
let elem_oid: Oid = row.try_get(stmt.columns(), 2)?;
|
||||
let rngsubtype: Option<Oid> = row.try_get(stmt.columns(), 3)?;
|
||||
let basetype: Oid = row.try_get(stmt.columns(), 4)?;
|
||||
let schema: String = row.try_get(stmt.columns(), 5)?;
|
||||
let relid: Oid = row.try_get(stmt.columns(), 6)?;
|
||||
let name: String = row.try_get(0)?;
|
||||
let type_: i8 = row.try_get(1)?;
|
||||
let elem_oid: Oid = row.try_get(2)?;
|
||||
let rngsubtype: Option<Oid> = row.try_get(3)?;
|
||||
let basetype: Oid = row.try_get(4)?;
|
||||
let schema: String = row.try_get(5)?;
|
||||
let relid: Oid = row.try_get(6)?;
|
||||
|
||||
let kind = if type_ == b'e' as i8 {
|
||||
let variants = get_enum_variants(client, cache, oid).await?;
|
||||
let variants = get_enum_variants(client, oid).await?;
|
||||
Kind::Enum(variants)
|
||||
} else if type_ == b'p' as i8 {
|
||||
Kind::Pseudo
|
||||
} else if basetype != 0 {
|
||||
let type_ = get_type_rec(client, cache, basetype).await?;
|
||||
let type_ = get_type_rec(client, basetype).await?;
|
||||
Kind::Domain(type_)
|
||||
} else if elem_oid != 0 {
|
||||
let type_ = get_type_rec(client, cache, elem_oid).await?;
|
||||
let type_ = get_type_rec(client, elem_oid).await?;
|
||||
Kind::Array(type_)
|
||||
} else if relid != 0 {
|
||||
let fields = get_composite_fields(client, cache, relid).await?;
|
||||
let fields = get_composite_fields(client, relid).await?;
|
||||
Kind::Composite(fields)
|
||||
} else if let Some(rngsubtype) = rngsubtype {
|
||||
let type_ = get_type_rec(client, cache, rngsubtype).await?;
|
||||
let type_ = get_type_rec(client, rngsubtype).await?;
|
||||
Kind::Range(type_)
|
||||
} else {
|
||||
Kind::Simple
|
||||
};
|
||||
|
||||
let type_ = Type::new(name, oid, kind, schema);
|
||||
cache.set_type(oid, &type_);
|
||||
client.set_type(oid, &type_);
|
||||
|
||||
Ok(type_)
|
||||
}
|
||||
|
||||
fn get_type_rec<'a>(
|
||||
client: &'a mut InnerClient,
|
||||
cache: &'a mut CachedTypeInfo,
|
||||
client: &'a Arc<InnerClient>,
|
||||
oid: Oid,
|
||||
) -> Pin<Box<dyn Future<Output = Result<Type, Error>> + Send + 'a>> {
|
||||
Box::pin(get_type(client, cache, oid))
|
||||
Box::pin(get_type(client, oid))
|
||||
}
|
||||
|
||||
async fn typeinfo_statement<'c>(
|
||||
client: &mut InnerClient,
|
||||
cache: &'c mut CachedTypeInfo,
|
||||
) -> Result<&'c Statement, Error> {
|
||||
if cache.typeinfo().is_some() {
|
||||
// needed to get around a borrow checker limitation
|
||||
return Ok(cache.typeinfo().unwrap());
|
||||
async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
|
||||
if let Some(stmt) = client.typeinfo() {
|
||||
return Ok(stmt);
|
||||
}
|
||||
|
||||
let stmt = match prepare_rec(client, cache, TYPEINFO_QUERY, &[]).await {
|
||||
let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await {
|
||||
Ok(stmt) => stmt,
|
||||
Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => {
|
||||
prepare_rec(client, cache, TYPEINFO_FALLBACK_QUERY, &[]).await?
|
||||
prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await?
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
Ok(cache.set_typeinfo(stmt))
|
||||
client.set_typeinfo(&stmt);
|
||||
Ok(stmt)
|
||||
}
|
||||
|
||||
async fn get_enum_variants(
|
||||
client: &mut InnerClient,
|
||||
cache: &mut CachedTypeInfo,
|
||||
oid: Oid,
|
||||
) -> Result<Vec<String>, Error> {
|
||||
let stmt = typeinfo_enum_statement(client, cache).await?;
|
||||
async fn get_enum_variants(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<String>, Error> {
|
||||
let stmt = typeinfo_enum_statement(client).await?;
|
||||
|
||||
let mut out = vec![];
|
||||
|
||||
let mut rows = pin!(query::query(client, stmt, slice_iter(&[&oid])).await?);
|
||||
while let Some(row) = rows.next().await {
|
||||
out.push(row?.try_get(stmt.columns(), 0)?)
|
||||
}
|
||||
Ok(out)
|
||||
query::query(client, stmt, slice_iter(&[&oid]))
|
||||
.await?
|
||||
.and_then(|row| async move { row.try_get(0) })
|
||||
.try_collect()
|
||||
.await
|
||||
}
|
||||
|
||||
async fn typeinfo_enum_statement<'c>(
|
||||
client: &mut InnerClient,
|
||||
cache: &'c mut CachedTypeInfo,
|
||||
) -> Result<&'c Statement, Error> {
|
||||
if cache.typeinfo_enum().is_some() {
|
||||
// needed to get around a borrow checker limitation
|
||||
return Ok(cache.typeinfo_enum().unwrap());
|
||||
async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
|
||||
if let Some(stmt) = client.typeinfo_enum() {
|
||||
return Ok(stmt);
|
||||
}
|
||||
|
||||
let stmt = match prepare_rec(client, cache, TYPEINFO_ENUM_QUERY, &[]).await {
|
||||
let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await {
|
||||
Ok(stmt) => stmt,
|
||||
Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => {
|
||||
prepare_rec(client, cache, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
|
||||
prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
|
||||
}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
Ok(cache.set_typeinfo_enum(stmt))
|
||||
client.set_typeinfo_enum(&stmt);
|
||||
Ok(stmt)
|
||||
}
|
||||
|
||||
async fn get_composite_fields(
|
||||
client: &mut InnerClient,
|
||||
cache: &mut CachedTypeInfo,
|
||||
oid: Oid,
|
||||
) -> Result<Vec<Field>, Error> {
|
||||
let stmt = typeinfo_composite_statement(client, cache).await?;
|
||||
async fn get_composite_fields(client: &Arc<InnerClient>, oid: Oid) -> Result<Vec<Field>, Error> {
|
||||
let stmt = typeinfo_composite_statement(client).await?;
|
||||
|
||||
let mut rows = pin!(query::query(client, stmt, slice_iter(&[&oid])).await?);
|
||||
|
||||
let mut oids = vec![];
|
||||
while let Some(row) = rows.next().await {
|
||||
let row = row?;
|
||||
let name = row.try_get(stmt.columns(), 0)?;
|
||||
let oid = row.try_get(stmt.columns(), 1)?;
|
||||
oids.push((name, oid));
|
||||
}
|
||||
let rows = query::query(client, stmt, slice_iter(&[&oid]))
|
||||
.await?
|
||||
.try_collect::<Vec<_>>()
|
||||
.await?;
|
||||
|
||||
let mut fields = vec![];
|
||||
for (name, oid) in oids {
|
||||
let type_ = get_type_rec(client, cache, oid).await?;
|
||||
for row in rows {
|
||||
let name = row.try_get(0)?;
|
||||
let oid = row.try_get(1)?;
|
||||
let type_ = get_type_rec(client, oid).await?;
|
||||
fields.push(Field::new(name, type_));
|
||||
}
|
||||
|
||||
Ok(fields)
|
||||
}
|
||||
|
||||
async fn typeinfo_composite_statement<'c>(
|
||||
client: &mut InnerClient,
|
||||
cache: &'c mut CachedTypeInfo,
|
||||
) -> Result<&'c Statement, Error> {
|
||||
if cache.typeinfo_composite().is_some() {
|
||||
// needed to get around a borrow checker limitation
|
||||
return Ok(cache.typeinfo_composite().unwrap());
|
||||
async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<Statement, Error> {
|
||||
if let Some(stmt) = client.typeinfo_composite() {
|
||||
return Ok(stmt);
|
||||
}
|
||||
|
||||
let stmt = prepare_rec(client, cache, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
|
||||
let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
|
||||
|
||||
Ok(cache.set_typeinfo_composite(stmt))
|
||||
client.set_typeinfo_composite(&stmt);
|
||||
Ok(stmt)
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ use postgres_types2::{Format, ToSql, Type};
|
||||
use std::fmt;
|
||||
use std::marker::PhantomPinned;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
|
||||
struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]);
|
||||
@@ -25,10 +26,10 @@ impl fmt::Debug for BorrowToSqlParamsDebug<'_> {
|
||||
}
|
||||
|
||||
pub async fn query<'a, I>(
|
||||
client: &mut InnerClient,
|
||||
statement: &Statement,
|
||||
client: &InnerClient,
|
||||
statement: Statement,
|
||||
params: I,
|
||||
) -> Result<RawRowStream, Error>
|
||||
) -> Result<RowStream, Error>
|
||||
where
|
||||
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
|
||||
I::IntoIter: ExactSizeIterator,
|
||||
@@ -40,12 +41,13 @@ where
|
||||
statement.name(),
|
||||
BorrowToSqlParamsDebug(params.as_slice()),
|
||||
);
|
||||
encode(client, statement, params)?
|
||||
encode(client, &statement, params)?
|
||||
} else {
|
||||
encode(client, statement, params)?
|
||||
encode(client, &statement, params)?
|
||||
};
|
||||
let responses = start(client, buf).await?;
|
||||
Ok(RawRowStream {
|
||||
Ok(RowStream {
|
||||
statement,
|
||||
responses,
|
||||
command_tag: None,
|
||||
status: ReadyForQueryStatus::Unknown,
|
||||
@@ -55,7 +57,7 @@ where
|
||||
}
|
||||
|
||||
pub async fn query_txt<S, I>(
|
||||
client: &mut InnerClient,
|
||||
client: &Arc<InnerClient>,
|
||||
query: &str,
|
||||
params: I,
|
||||
) -> Result<RowStream, Error>
|
||||
@@ -155,6 +157,49 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn execute<'a, I>(
|
||||
client: &InnerClient,
|
||||
statement: Statement,
|
||||
params: I,
|
||||
) -> Result<u64, Error>
|
||||
where
|
||||
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
|
||||
I::IntoIter: ExactSizeIterator,
|
||||
{
|
||||
let buf = if log_enabled!(Level::Debug) {
|
||||
let params = params.into_iter().collect::<Vec<_>>();
|
||||
debug!(
|
||||
"executing statement {} with parameters: {:?}",
|
||||
statement.name(),
|
||||
BorrowToSqlParamsDebug(params.as_slice()),
|
||||
);
|
||||
encode(client, &statement, params)?
|
||||
} else {
|
||||
encode(client, &statement, params)?
|
||||
};
|
||||
let mut responses = start(client, buf).await?;
|
||||
|
||||
let mut rows = 0;
|
||||
loop {
|
||||
match responses.next().await? {
|
||||
Message::DataRow(_) => {}
|
||||
Message::CommandComplete(body) => {
|
||||
rows = body
|
||||
.tag()
|
||||
.map_err(Error::parse)?
|
||||
.rsplit(' ')
|
||||
.next()
|
||||
.unwrap()
|
||||
.parse()
|
||||
.unwrap_or(0);
|
||||
}
|
||||
Message::EmptyQueryResponse => rows = 0,
|
||||
Message::ReadyForQuery(_) => return Ok(rows),
|
||||
_ => return Err(Error::unexpected_message()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
|
||||
let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
|
||||
|
||||
@@ -166,11 +211,7 @@ async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
|
||||
Ok(responses)
|
||||
}
|
||||
|
||||
pub fn encode<'a, I>(
|
||||
client: &mut InnerClient,
|
||||
statement: &Statement,
|
||||
params: I,
|
||||
) -> Result<Bytes, Error>
|
||||
pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result<Bytes, Error>
|
||||
where
|
||||
I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
|
||||
I::IntoIter: ExactSizeIterator,
|
||||
@@ -255,7 +296,11 @@ impl Stream for RowStream {
|
||||
loop {
|
||||
match ready!(this.responses.poll_next(cx)?) {
|
||||
Message::DataRow(body) => {
|
||||
return Poll::Ready(Some(Ok(Row::new(body, *this.output_format)?)))
|
||||
return Poll::Ready(Some(Ok(Row::new(
|
||||
this.statement.clone(),
|
||||
body,
|
||||
*this.output_format,
|
||||
)?)))
|
||||
}
|
||||
Message::EmptyQueryResponse | Message::PortalSuspended => {}
|
||||
Message::CommandComplete(body) => {
|
||||
@@ -293,41 +338,3 @@ impl RowStream {
|
||||
self.status
|
||||
}
|
||||
}
|
||||
|
||||
pin_project! {
|
||||
/// A stream of table rows.
|
||||
pub struct RawRowStream {
|
||||
responses: Responses,
|
||||
command_tag: Option<String>,
|
||||
output_format: Format,
|
||||
status: ReadyForQueryStatus,
|
||||
#[pin]
|
||||
_p: PhantomPinned,
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for RawRowStream {
|
||||
type Item = Result<Row, Error>;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
let this = self.project();
|
||||
loop {
|
||||
match ready!(this.responses.poll_next(cx)?) {
|
||||
Message::DataRow(body) => {
|
||||
return Poll::Ready(Some(Ok(Row::new(body, *this.output_format)?)))
|
||||
}
|
||||
Message::EmptyQueryResponse | Message::PortalSuspended => {}
|
||||
Message::CommandComplete(body) => {
|
||||
if let Ok(tag) = body.tag() {
|
||||
*this.command_tag = Some(tag.to_string());
|
||||
}
|
||||
}
|
||||
Message::ReadyForQuery(status) => {
|
||||
*this.status = status.into();
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
_ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,103 @@
|
||||
//! Rows.
|
||||
|
||||
use crate::row::sealed::{AsName, Sealed};
|
||||
use crate::simple_query::SimpleColumn;
|
||||
use crate::statement::Column;
|
||||
use crate::types::{FromSql, Type, WrongType};
|
||||
use crate::Error;
|
||||
use crate::{Error, Statement};
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use postgres_protocol2::message::backend::DataRowBody;
|
||||
use postgres_types2::{Format, WrongFormat};
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
|
||||
mod sealed {
|
||||
pub trait Sealed {}
|
||||
|
||||
pub trait AsName {
|
||||
fn as_name(&self) -> &str;
|
||||
}
|
||||
}
|
||||
|
||||
impl AsName for Column {
|
||||
fn as_name(&self) -> &str {
|
||||
self.name()
|
||||
}
|
||||
}
|
||||
|
||||
impl AsName for String {
|
||||
fn as_name(&self) -> &str {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait implemented by types that can index into columns of a row.
|
||||
///
|
||||
/// This cannot be implemented outside of this crate.
|
||||
pub trait RowIndex: Sealed {
|
||||
#[doc(hidden)]
|
||||
fn __idx<T>(&self, columns: &[T]) -> Option<usize>
|
||||
where
|
||||
T: AsName;
|
||||
}
|
||||
|
||||
impl Sealed for usize {}
|
||||
|
||||
impl RowIndex for usize {
|
||||
#[inline]
|
||||
fn __idx<T>(&self, columns: &[T]) -> Option<usize>
|
||||
where
|
||||
T: AsName,
|
||||
{
|
||||
if *self >= columns.len() {
|
||||
None
|
||||
} else {
|
||||
Some(*self)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Sealed for str {}
|
||||
|
||||
impl RowIndex for str {
|
||||
#[inline]
|
||||
fn __idx<T>(&self, columns: &[T]) -> Option<usize>
|
||||
where
|
||||
T: AsName,
|
||||
{
|
||||
if let Some(idx) = columns.iter().position(|d| d.as_name() == self) {
|
||||
return Some(idx);
|
||||
};
|
||||
|
||||
// FIXME ASCII-only case insensitivity isn't really the right thing to
|
||||
// do. Postgres itself uses a dubious wrapper around tolower and JDBC
|
||||
// uses the US locale.
|
||||
columns
|
||||
.iter()
|
||||
.position(|d| d.as_name().eq_ignore_ascii_case(self))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Sealed for &T where T: ?Sized + Sealed {}
|
||||
|
||||
impl<T> RowIndex for &T
|
||||
where
|
||||
T: ?Sized + RowIndex,
|
||||
{
|
||||
#[inline]
|
||||
fn __idx<U>(&self, columns: &[U]) -> Option<usize>
|
||||
where
|
||||
U: AsName,
|
||||
{
|
||||
T::__idx(*self, columns)
|
||||
}
|
||||
}
|
||||
|
||||
/// A row of data returned from the database by a query.
|
||||
pub struct Row {
|
||||
statement: Statement,
|
||||
output_format: Format,
|
||||
body: DataRowBody,
|
||||
ranges: Vec<Option<Range<usize>>>,
|
||||
@@ -18,33 +105,80 @@ pub struct Row {
|
||||
|
||||
impl fmt::Debug for Row {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("Row").finish()
|
||||
f.debug_struct("Row")
|
||||
.field("columns", &self.columns())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Row {
|
||||
pub(crate) fn new(
|
||||
// statement: Statement,
|
||||
statement: Statement,
|
||||
body: DataRowBody,
|
||||
output_format: Format,
|
||||
) -> Result<Row, Error> {
|
||||
let ranges = body.ranges().collect().map_err(Error::parse)?;
|
||||
Ok(Row {
|
||||
statement,
|
||||
body,
|
||||
ranges,
|
||||
output_format,
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn try_get<'a, T>(&'a self, columns: &[Column], idx: usize) -> Result<T, Error>
|
||||
/// Returns information about the columns of data in the row.
|
||||
pub fn columns(&self) -> &[Column] {
|
||||
self.statement.columns()
|
||||
}
|
||||
|
||||
/// Determines if the row contains no values.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Returns the number of values in the row.
|
||||
pub fn len(&self) -> usize {
|
||||
self.columns().len()
|
||||
}
|
||||
|
||||
/// Deserializes a value from the row.
|
||||
///
|
||||
/// The value can be specified either by its numeric index in the row, or by its column name.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the index is out of bounds or if the value cannot be converted to the specified type.
|
||||
pub fn get<'a, I, T>(&'a self, idx: I) -> T
|
||||
where
|
||||
I: RowIndex + fmt::Display,
|
||||
T: FromSql<'a>,
|
||||
{
|
||||
let Some(column) = columns.get(idx) else {
|
||||
return Err(Error::column(idx.to_string()));
|
||||
match self.get_inner(&idx) {
|
||||
Ok(ok) => ok,
|
||||
Err(err) => panic!("error retrieving column {}: {}", idx, err),
|
||||
}
|
||||
}
|
||||
|
||||
/// Like `Row::get`, but returns a `Result` rather than panicking.
|
||||
pub fn try_get<'a, I, T>(&'a self, idx: I) -> Result<T, Error>
|
||||
where
|
||||
I: RowIndex + fmt::Display,
|
||||
T: FromSql<'a>,
|
||||
{
|
||||
self.get_inner(&idx)
|
||||
}
|
||||
|
||||
fn get_inner<'a, I, T>(&'a self, idx: &I) -> Result<T, Error>
|
||||
where
|
||||
I: RowIndex + fmt::Display,
|
||||
T: FromSql<'a>,
|
||||
{
|
||||
let idx = match idx.__idx(self.columns()) {
|
||||
Some(idx) => idx,
|
||||
None => return Err(Error::column(idx.to_string())),
|
||||
};
|
||||
|
||||
let ty = column.type_();
|
||||
let ty = self.columns()[idx].type_();
|
||||
if !T::accepts(ty) {
|
||||
return Err(Error::from_sql(
|
||||
Box::new(WrongType::new::<T>(ty.clone())),
|
||||
@@ -82,3 +216,85 @@ impl Row {
|
||||
self.body.buffer().len()
|
||||
}
|
||||
}
|
||||
|
||||
impl AsName for SimpleColumn {
|
||||
fn as_name(&self) -> &str {
|
||||
self.name()
|
||||
}
|
||||
}
|
||||
|
||||
/// A row of data returned from the database by a simple query.
|
||||
#[derive(Debug)]
|
||||
pub struct SimpleQueryRow {
|
||||
columns: Arc<[SimpleColumn]>,
|
||||
body: DataRowBody,
|
||||
ranges: Vec<Option<Range<usize>>>,
|
||||
}
|
||||
|
||||
impl SimpleQueryRow {
|
||||
#[allow(clippy::new_ret_no_self)]
|
||||
pub(crate) fn new(
|
||||
columns: Arc<[SimpleColumn]>,
|
||||
body: DataRowBody,
|
||||
) -> Result<SimpleQueryRow, Error> {
|
||||
let ranges = body.ranges().collect().map_err(Error::parse)?;
|
||||
Ok(SimpleQueryRow {
|
||||
columns,
|
||||
body,
|
||||
ranges,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns information about the columns of data in the row.
|
||||
pub fn columns(&self) -> &[SimpleColumn] {
|
||||
&self.columns
|
||||
}
|
||||
|
||||
/// Determines if the row contains no values.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Returns the number of values in the row.
|
||||
pub fn len(&self) -> usize {
|
||||
self.columns.len()
|
||||
}
|
||||
|
||||
/// Returns a value from the row.
|
||||
///
|
||||
/// The value can be specified either by its numeric index in the row, or by its column name.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Panics if the index is out of bounds or if the value cannot be converted to the specified type.
|
||||
pub fn get<I>(&self, idx: I) -> Option<&str>
|
||||
where
|
||||
I: RowIndex + fmt::Display,
|
||||
{
|
||||
match self.get_inner(&idx) {
|
||||
Ok(ok) => ok,
|
||||
Err(err) => panic!("error retrieving column {}: {}", idx, err),
|
||||
}
|
||||
}
|
||||
|
||||
/// Like `SimpleQueryRow::get`, but returns a `Result` rather than panicking.
|
||||
pub fn try_get<I>(&self, idx: I) -> Result<Option<&str>, Error>
|
||||
where
|
||||
I: RowIndex + fmt::Display,
|
||||
{
|
||||
self.get_inner(&idx)
|
||||
}
|
||||
|
||||
fn get_inner<I>(&self, idx: &I) -> Result<Option<&str>, Error>
|
||||
where
|
||||
I: RowIndex + fmt::Display,
|
||||
{
|
||||
let idx = match idx.__idx(&self.columns) {
|
||||
Some(idx) => idx,
|
||||
None => return Err(Error::column(idx.to_string())),
|
||||
};
|
||||
|
||||
let buf = self.ranges[idx].clone().map(|r| &self.body.buffer()[r]);
|
||||
FromSql::from_sql_nullable(&Type::TEXT, buf).map_err(|e| Error::from_sql(e, idx))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,14 +1,52 @@
|
||||
use crate::client::InnerClient;
|
||||
use crate::client::{InnerClient, Responses};
|
||||
use crate::codec::FrontendMessage;
|
||||
use crate::connection::RequestMessages;
|
||||
use crate::{Error, ReadyForQueryStatus};
|
||||
use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow};
|
||||
use bytes::Bytes;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
use futures_util::{ready, Stream};
|
||||
use log::debug;
|
||||
use pin_project_lite::pin_project;
|
||||
use postgres_protocol2::message::backend::Message;
|
||||
use postgres_protocol2::message::frontend;
|
||||
use std::marker::PhantomPinned;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
|
||||
/// Information about a column of a single query row.
|
||||
#[derive(Debug)]
|
||||
pub struct SimpleColumn {
|
||||
name: String,
|
||||
}
|
||||
|
||||
impl SimpleColumn {
|
||||
pub(crate) fn new(name: String) -> SimpleColumn {
|
||||
SimpleColumn { name }
|
||||
}
|
||||
|
||||
/// Returns the name of the column.
|
||||
pub fn name(&self) -> &str {
|
||||
&self.name
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn simple_query(client: &InnerClient, query: &str) -> Result<SimpleQueryStream, Error> {
|
||||
debug!("executing simple query: {}", query);
|
||||
|
||||
let buf = encode(client, query)?;
|
||||
let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;
|
||||
|
||||
Ok(SimpleQueryStream {
|
||||
responses,
|
||||
columns: None,
|
||||
status: ReadyForQueryStatus::Unknown,
|
||||
_p: PhantomPinned,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn batch_execute(
|
||||
client: &mut InnerClient,
|
||||
client: &InnerClient,
|
||||
query: &str,
|
||||
) -> Result<ReadyForQueryStatus, Error> {
|
||||
debug!("executing statement batch: {}", query);
|
||||
@@ -28,9 +66,77 @@ pub async fn batch_execute(
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn encode(client: &mut InnerClient, query: &str) -> Result<Bytes, Error> {
|
||||
pub(crate) fn encode(client: &InnerClient, query: &str) -> Result<Bytes, Error> {
|
||||
client.with_buf(|buf| {
|
||||
frontend::query(query, buf).map_err(Error::encode)?;
|
||||
Ok(buf.split().freeze())
|
||||
})
|
||||
}
|
||||
|
||||
pin_project! {
|
||||
/// A stream of simple query results.
|
||||
pub struct SimpleQueryStream {
|
||||
responses: Responses,
|
||||
columns: Option<Arc<[SimpleColumn]>>,
|
||||
status: ReadyForQueryStatus,
|
||||
#[pin]
|
||||
_p: PhantomPinned,
|
||||
}
|
||||
}
|
||||
|
||||
impl SimpleQueryStream {
|
||||
/// Returns if the connection is ready for querying, with the status of the connection.
|
||||
///
|
||||
/// This might be available only after the stream has been exhausted.
|
||||
pub fn ready_status(&self) -> ReadyForQueryStatus {
|
||||
self.status
|
||||
}
|
||||
}
|
||||
|
||||
impl Stream for SimpleQueryStream {
|
||||
type Item = Result<SimpleQueryMessage, Error>;
|
||||
|
||||
fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
|
||||
let this = self.project();
|
||||
loop {
|
||||
match ready!(this.responses.poll_next(cx)?) {
|
||||
Message::CommandComplete(body) => {
|
||||
let rows = body
|
||||
.tag()
|
||||
.map_err(Error::parse)?
|
||||
.rsplit(' ')
|
||||
.next()
|
||||
.unwrap()
|
||||
.parse()
|
||||
.unwrap_or(0);
|
||||
return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(rows))));
|
||||
}
|
||||
Message::EmptyQueryResponse => {
|
||||
return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(0))));
|
||||
}
|
||||
Message::RowDescription(body) => {
|
||||
let columns = body
|
||||
.fields()
|
||||
.map(|f| Ok(SimpleColumn::new(f.name().to_string())))
|
||||
.collect::<Vec<_>>()
|
||||
.map_err(Error::parse)?
|
||||
.into();
|
||||
|
||||
*this.columns = Some(columns);
|
||||
}
|
||||
Message::DataRow(body) => {
|
||||
let row = match &this.columns {
|
||||
Some(columns) => SimpleQueryRow::new(columns.clone(), body)?,
|
||||
None => return Poll::Ready(Some(Err(Error::unexpected_message()))),
|
||||
};
|
||||
return Poll::Ready(Some(Ok(SimpleQueryMessage::Row(row))));
|
||||
}
|
||||
Message::ReadyForQuery(s) => {
|
||||
*this.status = s.into();
|
||||
return Poll::Ready(None);
|
||||
}
|
||||
_ => return Poll::Ready(Some(Err(Error::unexpected_message()))),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,33 +1,64 @@
|
||||
use crate::client::InnerClient;
|
||||
use crate::codec::FrontendMessage;
|
||||
use crate::connection::RequestMessages;
|
||||
use crate::types::Type;
|
||||
use postgres_protocol2::{message::backend::Field, Oid};
|
||||
use std::fmt;
|
||||
use postgres_protocol2::{
|
||||
message::{backend::Field, frontend},
|
||||
Oid,
|
||||
};
|
||||
use std::{
|
||||
fmt,
|
||||
sync::{Arc, Weak},
|
||||
};
|
||||
|
||||
struct StatementInner {
|
||||
client: Weak<InnerClient>,
|
||||
name: String,
|
||||
params: Vec<Type>,
|
||||
columns: Vec<Column>,
|
||||
}
|
||||
|
||||
impl Drop for StatementInner {
|
||||
fn drop(&mut self) {
|
||||
if let Some(client) = self.client.upgrade() {
|
||||
let buf = client.with_buf(|buf| {
|
||||
frontend::close(b'S', &self.name, buf).unwrap();
|
||||
frontend::sync(buf);
|
||||
buf.split().freeze()
|
||||
});
|
||||
let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A prepared statement.
|
||||
///
|
||||
/// Prepared statements can only be used with the connection that created them.
|
||||
pub struct Statement(StatementInner);
|
||||
#[derive(Clone)]
|
||||
pub struct Statement(Arc<StatementInner>);
|
||||
|
||||
impl Statement {
|
||||
pub(crate) fn new(name: String, params: Vec<Type>, columns: Vec<Column>) -> Statement {
|
||||
Statement(StatementInner {
|
||||
pub(crate) fn new(
|
||||
inner: &Arc<InnerClient>,
|
||||
name: String,
|
||||
params: Vec<Type>,
|
||||
columns: Vec<Column>,
|
||||
) -> Statement {
|
||||
Statement(Arc::new(StatementInner {
|
||||
client: Arc::downgrade(inner),
|
||||
name,
|
||||
params,
|
||||
columns,
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
|
||||
Statement(StatementInner {
|
||||
Statement(Arc::new(StatementInner {
|
||||
client: Weak::new(),
|
||||
name: String::new(),
|
||||
params,
|
||||
columns,
|
||||
})
|
||||
}))
|
||||
}
|
||||
|
||||
pub(crate) fn name(&self) -> &str {
|
||||
|
||||
57
libs/proxy/tokio-postgres2/src/to_statement.rs
Normal file
57
libs/proxy/tokio-postgres2/src/to_statement.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use crate::to_statement::private::{Sealed, ToStatementType};
|
||||
use crate::Statement;
|
||||
|
||||
mod private {
|
||||
use crate::{Client, Error, Statement};
|
||||
|
||||
pub trait Sealed {}
|
||||
|
||||
pub enum ToStatementType<'a> {
|
||||
Statement(&'a Statement),
|
||||
Query(&'a str),
|
||||
}
|
||||
|
||||
impl<'a> ToStatementType<'a> {
|
||||
pub async fn into_statement(self, client: &Client) -> Result<Statement, Error> {
|
||||
match self {
|
||||
ToStatementType::Statement(s) => Ok(s.clone()),
|
||||
ToStatementType::Query(s) => client.prepare(s).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait abstracting over prepared and unprepared statements.
|
||||
///
|
||||
/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which
|
||||
/// was prepared previously.
|
||||
///
|
||||
/// This trait is "sealed" and cannot be implemented by anything outside this crate.
|
||||
pub trait ToStatement: Sealed {
|
||||
#[doc(hidden)]
|
||||
fn __convert(&self) -> ToStatementType<'_>;
|
||||
}
|
||||
|
||||
impl ToStatement for Statement {
|
||||
fn __convert(&self) -> ToStatementType<'_> {
|
||||
ToStatementType::Statement(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Sealed for Statement {}
|
||||
|
||||
impl ToStatement for str {
|
||||
fn __convert(&self) -> ToStatementType<'_> {
|
||||
ToStatementType::Query(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Sealed for str {}
|
||||
|
||||
impl ToStatement for String {
|
||||
fn __convert(&self) -> ToStatementType<'_> {
|
||||
ToStatementType::Query(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl Sealed for String {}
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::codec::FrontendMessage;
|
||||
use crate::connection::RequestMessages;
|
||||
use crate::query::RowStream;
|
||||
use crate::{CancelToken, Client, Error, ReadyForQueryStatus};
|
||||
use postgres_protocol2::message::frontend;
|
||||
|
||||
@@ -18,13 +19,13 @@ impl Drop for Transaction<'_> {
|
||||
return;
|
||||
}
|
||||
|
||||
let buf = self.client.inner.with_buf(|buf| {
|
||||
let buf = self.client.inner().with_buf(|buf| {
|
||||
frontend::query("ROLLBACK", buf).unwrap();
|
||||
buf.split().freeze()
|
||||
});
|
||||
let _ = self
|
||||
.client
|
||||
.inner
|
||||
.inner()
|
||||
.send(RequestMessages::Single(FrontendMessage::Raw(buf)));
|
||||
}
|
||||
}
|
||||
@@ -51,13 +52,23 @@ impl<'a> Transaction<'a> {
|
||||
self.client.batch_execute("ROLLBACK").await
|
||||
}
|
||||
|
||||
/// Like `Client::query_raw_txt`.
|
||||
pub async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
|
||||
where
|
||||
S: AsRef<str>,
|
||||
I: IntoIterator<Item = Option<S>>,
|
||||
I::IntoIter: ExactSizeIterator,
|
||||
{
|
||||
self.client.query_raw_txt(statement, params).await
|
||||
}
|
||||
|
||||
/// Like `Client::cancel_token`.
|
||||
pub fn cancel_token(&self) -> CancelToken {
|
||||
self.client.cancel_token()
|
||||
}
|
||||
|
||||
/// Returns a reference to the underlying `Client`.
|
||||
pub fn client(&mut self) -> &mut Client {
|
||||
pub fn client(&self) -> &Client {
|
||||
self.client
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,15 +8,14 @@ use std::io;
|
||||
use std::num::NonZeroU32;
|
||||
use std::pin::Pin;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Context;
|
||||
use anyhow::Result;
|
||||
use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
|
||||
use azure_core::{Continuable, RetryOptions};
|
||||
use azure_identity::DefaultAzureCredential;
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::blob::CopyStatus;
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
@@ -76,8 +75,9 @@ impl AzureBlobStorage {
|
||||
let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
|
||||
StorageCredentials::access_key(account.clone(), access_key)
|
||||
} else {
|
||||
let token_credential = DefaultAzureCredential::default();
|
||||
StorageCredentials::token_credential(Arc::new(token_credential))
|
||||
let token_credential = azure_identity::create_default_credential()
|
||||
.context("trying to obtain Azure default credentials")?;
|
||||
StorageCredentials::token_credential(token_credential)
|
||||
};
|
||||
|
||||
// we have an outer retry
|
||||
@@ -624,6 +624,10 @@ impl RemoteStorage for AzureBlobStorage {
|
||||
res
|
||||
}
|
||||
|
||||
fn max_keys_per_delete(&self) -> usize {
|
||||
super::MAX_KEYS_PER_DELETE_AZURE
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
|
||||
@@ -70,7 +70,14 @@ pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
/// As defined in S3 docs
|
||||
pub const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
///
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObjects.html>
|
||||
pub const MAX_KEYS_PER_DELETE_S3: usize = 1000;
|
||||
|
||||
/// As defined in Azure docs
|
||||
///
|
||||
/// <https://learn.microsoft.com/en-us/rest/api/storageservices/blob-batch>
|
||||
pub const MAX_KEYS_PER_DELETE_AZURE: usize = 256;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
@@ -340,6 +347,14 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Returns the maximum number of keys that a call to [`Self::delete_objects`] can delete without chunking
|
||||
///
|
||||
/// The value returned is only an optimization hint, One can pass larger number of objects to
|
||||
/// `delete_objects` as well.
|
||||
///
|
||||
/// The value is guaranteed to be >= 1.
|
||||
fn max_keys_per_delete(&self) -> usize;
|
||||
|
||||
/// Deletes all objects matching the given prefix.
|
||||
///
|
||||
/// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will
|
||||
@@ -533,6 +548,16 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
/// [`RemoteStorage::max_keys_per_delete`]
|
||||
pub fn max_keys_per_delete(&self) -> usize {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.max_keys_per_delete(),
|
||||
Self::AwsS3(s) => s.max_keys_per_delete(),
|
||||
Self::AzureBlob(s) => s.max_keys_per_delete(),
|
||||
Self::Unreliable(s) => s.max_keys_per_delete(),
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::delete_prefix`]
|
||||
pub async fn delete_prefix(
|
||||
&self,
|
||||
|
||||
@@ -573,6 +573,10 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn max_keys_per_delete(&self) -> usize {
|
||||
super::MAX_KEYS_PER_DELETE_S3
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
|
||||
@@ -48,7 +48,7 @@ use crate::{
|
||||
metrics::{start_counting_cancelled_wait, start_measuring_requests},
|
||||
support::PermitCarrying,
|
||||
ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
|
||||
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE,
|
||||
RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE_S3,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
@@ -355,7 +355,7 @@ impl S3Bucket {
|
||||
let kind = RequestKind::Delete;
|
||||
let mut cancel = std::pin::pin!(cancel.cancelled());
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
|
||||
for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let req = self
|
||||
@@ -832,6 +832,10 @@ impl RemoteStorage for S3Bucket {
|
||||
self.delete_oids(&permit, &delete_objects, cancel).await
|
||||
}
|
||||
|
||||
fn max_keys_per_delete(&self) -> usize {
|
||||
MAX_KEYS_PER_DELETE_S3
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths, cancel).await
|
||||
|
||||
@@ -203,6 +203,10 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn max_keys_per_delete(&self) -> usize {
|
||||
self.inner.max_keys_per_delete()
|
||||
}
|
||||
|
||||
async fn copy(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use remote_storage::RemotePath;
|
||||
use remote_storage::TimeoutOrCancel;
|
||||
use remote_storage::MAX_KEYS_PER_DELETE;
|
||||
use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
@@ -131,7 +130,8 @@ impl Deleter {
|
||||
}
|
||||
|
||||
pub(super) async fn background(&mut self) -> Result<(), DeletionQueueError> {
|
||||
self.accumulator.reserve(MAX_KEYS_PER_DELETE);
|
||||
let max_keys_per_delete = self.remote_storage.max_keys_per_delete();
|
||||
self.accumulator.reserve(max_keys_per_delete);
|
||||
|
||||
loop {
|
||||
if self.cancel.is_cancelled() {
|
||||
@@ -156,14 +156,14 @@ impl Deleter {
|
||||
|
||||
match msg {
|
||||
DeleterMessage::Delete(mut list) => {
|
||||
while !list.is_empty() || self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
if self.accumulator.len() == MAX_KEYS_PER_DELETE {
|
||||
while !list.is_empty() || self.accumulator.len() == max_keys_per_delete {
|
||||
if self.accumulator.len() == max_keys_per_delete {
|
||||
self.flush().await?;
|
||||
// If we have received this number of keys, proceed with attempting to execute
|
||||
assert_eq!(self.accumulator.len(), 0);
|
||||
}
|
||||
|
||||
let available_slots = MAX_KEYS_PER_DELETE - self.accumulator.len();
|
||||
let available_slots = max_keys_per_delete - self.accumulator.len();
|
||||
let take_count = std::cmp::min(available_slots, list.len());
|
||||
for path in list.drain(list.len() - take_count..) {
|
||||
self.accumulator.push(path);
|
||||
|
||||
@@ -2036,15 +2036,23 @@ async fn timeline_compact_handler(
|
||||
parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")?
|
||||
.unwrap_or(false);
|
||||
|
||||
let sub_compaction = compact_request
|
||||
.as_ref()
|
||||
.map(|r| r.sub_compaction)
|
||||
.unwrap_or(false);
|
||||
let options = CompactOptions {
|
||||
compact_range: compact_request
|
||||
.as_ref()
|
||||
.and_then(|r| r.compact_range.clone()),
|
||||
compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn),
|
||||
flags,
|
||||
sub_compaction,
|
||||
};
|
||||
|
||||
let scheduled = compact_request.map(|r| r.scheduled).unwrap_or(false);
|
||||
let scheduled = compact_request
|
||||
.as_ref()
|
||||
.map(|r| r.scheduled)
|
||||
.unwrap_or(false);
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
@@ -2053,7 +2061,7 @@ async fn timeline_compact_handler(
|
||||
let tenant = state
|
||||
.tenant_manager
|
||||
.get_attached_tenant_shard(tenant_shard_id)?;
|
||||
let rx = tenant.schedule_compaction(timeline_id, options).await;
|
||||
let rx = tenant.schedule_compaction(timeline_id, options).await.map_err(ApiError::InternalServerError)?;
|
||||
if wait_until_scheduled_compaction_done {
|
||||
// It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction.
|
||||
rx.await.ok();
|
||||
|
||||
@@ -1223,31 +1223,60 @@ pub(crate) mod virtual_file_io_engine {
|
||||
});
|
||||
}
|
||||
|
||||
pub(crate) struct SmgrOpTimer {
|
||||
pub(crate) struct SmgrOpTimer(Option<SmgrOpTimerInner>);
|
||||
pub(crate) struct SmgrOpTimerInner {
|
||||
global_latency_histo: Histogram,
|
||||
|
||||
// Optional because not all op types are tracked per-timeline
|
||||
per_timeline_latency_histo: Option<Histogram>,
|
||||
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
|
||||
start: Instant,
|
||||
throttled: Duration,
|
||||
op: SmgrQueryType,
|
||||
}
|
||||
|
||||
pub(crate) struct SmgrOpFlushInProgress {
|
||||
base: Instant,
|
||||
global_micros: IntCounter,
|
||||
per_timeline_micros: IntCounter,
|
||||
}
|
||||
|
||||
impl SmgrOpTimer {
|
||||
pub(crate) fn deduct_throttle(&mut self, throttle: &Option<Duration>) {
|
||||
let Some(throttle) = throttle else {
|
||||
return;
|
||||
};
|
||||
self.throttled += *throttle;
|
||||
let inner = self.0.as_mut().expect("other public methods consume self");
|
||||
inner.throttled += *throttle;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for SmgrOpTimer {
|
||||
fn drop(&mut self) {
|
||||
let elapsed = self.start.elapsed();
|
||||
pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress {
|
||||
let (flush_start, inner) = self
|
||||
.smgr_op_end()
|
||||
.expect("this method consume self, and the only other caller is drop handler");
|
||||
let SmgrOpTimerInner {
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
..
|
||||
} = inner;
|
||||
SmgrOpFlushInProgress {
|
||||
base: flush_start,
|
||||
global_micros: global_flush_in_progress_micros,
|
||||
per_timeline_micros: per_timeline_flush_in_progress_micros,
|
||||
}
|
||||
}
|
||||
|
||||
let elapsed = match elapsed.checked_sub(self.throttled) {
|
||||
/// Returns `None`` if this method has already been called, `Some` otherwise.
|
||||
fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> {
|
||||
let inner = self.0.take()?;
|
||||
|
||||
let now = Instant::now();
|
||||
let elapsed = now - inner.start;
|
||||
|
||||
let elapsed = match elapsed.checked_sub(inner.throttled) {
|
||||
Some(elapsed) => elapsed,
|
||||
None => {
|
||||
use utils::rate_limit::RateLimit;
|
||||
@@ -1258,9 +1287,9 @@ impl Drop for SmgrOpTimer {
|
||||
})))
|
||||
});
|
||||
let mut guard = LOGGED.lock().unwrap();
|
||||
let rate_limit = &mut guard[self.op];
|
||||
let rate_limit = &mut guard[inner.op];
|
||||
rate_limit.call(|| {
|
||||
warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
|
||||
warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time");
|
||||
});
|
||||
elapsed // un-throttled time, more info than just saturating to 0
|
||||
}
|
||||
@@ -1268,10 +1297,54 @@ impl Drop for SmgrOpTimer {
|
||||
|
||||
let elapsed = elapsed.as_secs_f64();
|
||||
|
||||
self.global_latency_histo.observe(elapsed);
|
||||
if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo {
|
||||
inner.global_latency_histo.observe(elapsed);
|
||||
if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo {
|
||||
per_timeline_getpage_histo.observe(elapsed);
|
||||
}
|
||||
|
||||
Some((now, inner))
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for SmgrOpTimer {
|
||||
fn drop(&mut self) {
|
||||
self.smgr_op_end();
|
||||
}
|
||||
}
|
||||
|
||||
impl SmgrOpFlushInProgress {
|
||||
pub(crate) async fn measure<Fut, O>(mut self, mut fut: Fut) -> O
|
||||
where
|
||||
Fut: std::future::Future<Output = O>,
|
||||
{
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
let now = Instant::now();
|
||||
// Whenever observe_guard gets called, or dropped,
|
||||
// it adds the time elapsed since its last call to metrics.
|
||||
// Last call is tracked in `now`.
|
||||
let mut observe_guard = scopeguard::guard(
|
||||
|| {
|
||||
let elapsed = now - self.base;
|
||||
self.global_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
self.per_timeline_micros
|
||||
.inc_by(u64::try_from(elapsed.as_micros()).unwrap());
|
||||
self.base = now;
|
||||
},
|
||||
|mut observe| {
|
||||
observe();
|
||||
},
|
||||
);
|
||||
|
||||
loop {
|
||||
match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
|
||||
Ok(v) => return v,
|
||||
Err(_timeout) => {
|
||||
(*observe_guard)();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1302,6 +1375,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
|
||||
per_timeline_getpage_latency: Histogram,
|
||||
global_batch_size: Histogram,
|
||||
per_timeline_batch_size: Histogram,
|
||||
global_flush_in_progress_micros: IntCounter,
|
||||
per_timeline_flush_in_progress_micros: IntCounter,
|
||||
}
|
||||
|
||||
static SMGR_QUERY_STARTED_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
@@ -1464,6 +1539,26 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
|
||||
.set(value.try_into().unwrap());
|
||||
}
|
||||
|
||||
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_page_service_pagestream_flush_in_progress_micros",
|
||||
"Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \
|
||||
If the flush is particularly slow, this counter will be updated periodically to make slow flushes \
|
||||
easily discoverable in monitoring. \
|
||||
Hence, this is NOT a completion latency historgram.",
|
||||
&["tenant_id", "shard_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_page_service_pagestream_flush_in_progress_micros_global",
|
||||
"Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
impl SmgrQueryTimePerTimeline {
|
||||
pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||
@@ -1504,6 +1599,12 @@ impl SmgrQueryTimePerTimeline {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let global_flush_in_progress_micros =
|
||||
PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
|
||||
let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
Self {
|
||||
global_started,
|
||||
global_latency,
|
||||
@@ -1511,6 +1612,8 @@ impl SmgrQueryTimePerTimeline {
|
||||
per_timeline_getpage_started,
|
||||
global_batch_size,
|
||||
per_timeline_batch_size,
|
||||
global_flush_in_progress_micros,
|
||||
per_timeline_flush_in_progress_micros,
|
||||
}
|
||||
}
|
||||
pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer {
|
||||
@@ -1523,13 +1626,17 @@ impl SmgrQueryTimePerTimeline {
|
||||
None
|
||||
};
|
||||
|
||||
SmgrOpTimer {
|
||||
SmgrOpTimer(Some(SmgrOpTimerInner {
|
||||
global_latency_histo: self.global_latency[op as usize].clone(),
|
||||
per_timeline_latency_histo,
|
||||
start: started_at,
|
||||
op,
|
||||
throttled: Duration::ZERO,
|
||||
}
|
||||
global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(),
|
||||
per_timeline_flush_in_progress_micros: self
|
||||
.per_timeline_flush_in_progress_micros
|
||||
.clone(),
|
||||
}))
|
||||
}
|
||||
|
||||
pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
|
||||
@@ -2204,6 +2311,15 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
|
||||
.expect("failed to define a metric"),
|
||||
});
|
||||
|
||||
pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_timeline_wal_records_received",
|
||||
"Number of WAL records received per shard",
|
||||
&["tenant_id", "shard_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
@@ -2431,6 +2547,7 @@ pub(crate) struct TimelineMetrics {
|
||||
pub evictions_with_low_residence_duration: std::sync::RwLock<EvictionsWithLowResidenceDuration>,
|
||||
/// Number of valid LSN leases.
|
||||
pub valid_lsn_lease_count_gauge: UIntGauge,
|
||||
pub wal_records_received: IntCounter,
|
||||
shutdown: std::sync::atomic::AtomicBool,
|
||||
}
|
||||
|
||||
@@ -2588,6 +2705,10 @@ impl TimelineMetrics {
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED
|
||||
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
shard_id,
|
||||
@@ -2620,6 +2741,7 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
valid_lsn_lease_count_gauge,
|
||||
wal_records_received,
|
||||
shutdown: std::sync::atomic::AtomicBool::default(),
|
||||
}
|
||||
}
|
||||
@@ -2757,6 +2879,16 @@ impl TimelineMetrics {
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[
|
||||
tenant_id,
|
||||
shard_id,
|
||||
timeline_id,
|
||||
]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1017,10 +1017,8 @@ impl PageServerHandler {
|
||||
// Map handler result to protocol behavior.
|
||||
// Some handler errors cause exit from pagestream protocol.
|
||||
// Other handler errors are sent back as an error message and we stay in pagestream protocol.
|
||||
let mut timers: smallvec::SmallVec<[_; 1]> =
|
||||
smallvec::SmallVec::with_capacity(handler_results.len());
|
||||
for handler_result in handler_results {
|
||||
let response_msg = match handler_result {
|
||||
let (response_msg, timer) = match handler_result {
|
||||
Err(e) => match &e {
|
||||
PageStreamError::Shutdown => {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
@@ -1044,34 +1042,66 @@ impl PageServerHandler {
|
||||
span.in_scope(|| {
|
||||
error!("error reading relation or page version: {full:#}")
|
||||
});
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
(
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
}),
|
||||
None, // TODO: measure errors
|
||||
)
|
||||
}
|
||||
},
|
||||
Ok((response_msg, timer)) => {
|
||||
// Extending the lifetime of the timers so observations on drop
|
||||
// include the flush time.
|
||||
timers.push(timer);
|
||||
response_msg
|
||||
}
|
||||
Ok((response_msg, timer)) => (response_msg, Some(timer)),
|
||||
};
|
||||
|
||||
//
|
||||
// marshal & transmit response message
|
||||
//
|
||||
|
||||
pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
|
||||
}
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
}
|
||||
res = pgb_writer.flush() => {
|
||||
res?;
|
||||
|
||||
// We purposefully don't count flush time into the timer.
|
||||
//
|
||||
// The reason is that current compute client will not perform protocol processing
|
||||
// if the postgres backend process is doing things other than `->smgr_read()`.
|
||||
// This is especially the case for prefetch.
|
||||
//
|
||||
// If the compute doesn't read from the connection, eventually TCP will backpressure
|
||||
// all the way into our flush call below.
|
||||
//
|
||||
// The timer's underlying metric is used for a storage-internal latency SLO and
|
||||
// we don't want to include latency in it that we can't control.
|
||||
// And as pointed out above, in this case, we don't control the time that flush will take.
|
||||
let flushing_timer =
|
||||
timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing());
|
||||
|
||||
// what we want to do
|
||||
let flush_fut = pgb_writer.flush();
|
||||
// metric for how long flushing takes
|
||||
let flush_fut = match flushing_timer {
|
||||
Some(flushing_timer) => {
|
||||
futures::future::Either::Left(flushing_timer.measure(flush_fut))
|
||||
}
|
||||
None => futures::future::Either::Right(flush_fut),
|
||||
};
|
||||
// do it while respecting cancellation
|
||||
let _: () = async move {
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
return Err(QueryError::Shutdown)
|
||||
}
|
||||
res = flush_fut => {
|
||||
res?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
// and log the info! line inside the request span
|
||||
.instrument(span.clone())
|
||||
.await?;
|
||||
}
|
||||
drop(timers);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -49,6 +49,7 @@ use timeline::import_pgdata;
|
||||
use timeline::offload::offload_timeline;
|
||||
use timeline::CompactFlags;
|
||||
use timeline::CompactOptions;
|
||||
use timeline::CompactionError;
|
||||
use timeline::ShutdownMode;
|
||||
use tokio::io::BufReader;
|
||||
use tokio::sync::watch;
|
||||
@@ -2987,10 +2988,16 @@ impl Tenant {
|
||||
if has_pending_l0_compaction_task {
|
||||
Some(true)
|
||||
} else {
|
||||
let has_pending_scheduled_compaction_task;
|
||||
let mut has_pending_scheduled_compaction_task;
|
||||
let next_scheduled_compaction_task = {
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) {
|
||||
if !tline_pending_tasks.is_empty() {
|
||||
info!(
|
||||
"{} tasks left in the compaction schedule queue",
|
||||
tline_pending_tasks.len()
|
||||
);
|
||||
}
|
||||
let next_task = tline_pending_tasks.pop_front();
|
||||
has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty();
|
||||
next_task
|
||||
@@ -3007,6 +3014,41 @@ impl Tenant {
|
||||
.contains(CompactFlags::EnhancedGcBottomMostCompaction)
|
||||
{
|
||||
warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options);
|
||||
} else if next_scheduled_compaction_task.options.sub_compaction {
|
||||
info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
|
||||
let jobs = timeline
|
||||
.gc_compaction_split_jobs(next_scheduled_compaction_task.options)
|
||||
.await
|
||||
.map_err(CompactionError::Other)?;
|
||||
if jobs.is_empty() {
|
||||
info!("no jobs to run, skipping scheduled compaction task");
|
||||
} else {
|
||||
has_pending_scheduled_compaction_task = true;
|
||||
let jobs_len = jobs.len();
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
let tline_pending_tasks = guard.entry(*timeline_id).or_default();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
tline_pending_tasks.push_back(if idx == jobs_len - 1 {
|
||||
ScheduledCompactionTask {
|
||||
options: job,
|
||||
// The last job in the queue sends the signal and releases the gc guard
|
||||
result_tx: next_scheduled_compaction_task
|
||||
.result_tx
|
||||
.take(),
|
||||
gc_block: next_scheduled_compaction_task
|
||||
.gc_block
|
||||
.take(),
|
||||
}
|
||||
} else {
|
||||
ScheduledCompactionTask {
|
||||
options: job,
|
||||
result_tx: None,
|
||||
gc_block: None,
|
||||
}
|
||||
});
|
||||
}
|
||||
info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len);
|
||||
}
|
||||
} else {
|
||||
let _ = timeline
|
||||
.compact_with_options(
|
||||
@@ -3062,15 +3104,22 @@ impl Tenant {
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
options: CompactOptions,
|
||||
) -> tokio::sync::oneshot::Receiver<()> {
|
||||
) -> anyhow::Result<tokio::sync::oneshot::Receiver<()>> {
|
||||
let gc_guard = match self.gc_block.start().await {
|
||||
Ok(guard) => guard,
|
||||
Err(e) => {
|
||||
bail!("cannot run gc-compaction because gc is blocked: {}", e);
|
||||
}
|
||||
};
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
let mut guard = self.scheduled_compaction_tasks.lock().unwrap();
|
||||
let tline_pending_tasks = guard.entry(timeline_id).or_default();
|
||||
tline_pending_tasks.push_back(ScheduledCompactionTask {
|
||||
options,
|
||||
result_tx: Some(tx),
|
||||
gc_block: Some(gc_guard),
|
||||
});
|
||||
rx
|
||||
Ok(rx)
|
||||
}
|
||||
|
||||
// Call through to all timelines to freeze ephemeral layers if needed. Usually
|
||||
@@ -4457,7 +4506,12 @@ impl Tenant {
|
||||
// - this timeline was created while we were finding cutoffs
|
||||
// - lsn for timestamp search fails for this timeline repeatedly
|
||||
if let Some(cutoffs) = gc_cutoffs.get(&timeline.timeline_id) {
|
||||
target.cutoffs = cutoffs.clone();
|
||||
let original_cutoffs = target.cutoffs.clone();
|
||||
// GC cutoffs should never go back
|
||||
target.cutoffs = GcCutoffs {
|
||||
space: Lsn(cutoffs.space.0.max(original_cutoffs.space.0)),
|
||||
time: Lsn(cutoffs.time.0.max(original_cutoffs.time.0)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8117,6 +8171,12 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x30);
|
||||
@@ -8219,6 +8279,12 @@ mod tests {
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x40))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
@@ -8599,6 +8665,12 @@ mod tests {
|
||||
.await?
|
||||
};
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -8680,6 +8752,12 @@ mod tests {
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x40))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x40);
|
||||
@@ -9127,6 +9205,12 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9244,7 +9328,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: dryrun_flags,
|
||||
compact_range: None,
|
||||
compact_below_lsn: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
@@ -9269,6 +9353,12 @@ mod tests {
|
||||
|
||||
// increase GC horizon and compact again
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x38))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
guard.cutoffs.time = Lsn(0x38);
|
||||
@@ -9364,6 +9454,12 @@ mod tests {
|
||||
)
|
||||
.await?;
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9481,7 +9577,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: dryrun_flags,
|
||||
compact_range: None,
|
||||
compact_below_lsn: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
@@ -9608,6 +9704,12 @@ mod tests {
|
||||
branch_tline.add_extra_test_dense_keyspace(KeySpace::single(get_key(0)..get_key(10)));
|
||||
|
||||
{
|
||||
parent_tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x10))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = parent_tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9622,6 +9724,12 @@ mod tests {
|
||||
}
|
||||
|
||||
{
|
||||
branch_tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x50))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = branch_tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9951,6 +10059,12 @@ mod tests {
|
||||
.await?;
|
||||
|
||||
{
|
||||
tline
|
||||
.latest_gc_cutoff_lsn
|
||||
.lock_for_write()
|
||||
.store_and_unlock(Lsn(0x30))
|
||||
.wait()
|
||||
.await;
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
@@ -9973,7 +10087,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(0)..get_key(2)).into()),
|
||||
compact_below_lsn: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
@@ -10020,7 +10134,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(2)..get_key(4)).into()),
|
||||
compact_below_lsn: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
@@ -10072,7 +10186,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(4)..get_key(9)).into()),
|
||||
compact_below_lsn: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
@@ -10123,7 +10237,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(9)..get_key(10)).into()),
|
||||
compact_below_lsn: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
@@ -10179,7 +10293,7 @@ mod tests {
|
||||
CompactOptions {
|
||||
flags: EnumSet::new(),
|
||||
compact_range: Some((get_key(0)..get_key(10)).into()),
|
||||
compact_below_lsn: None,
|
||||
..Default::default()
|
||||
},
|
||||
&ctx,
|
||||
)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
|
||||
use utils::id::TimelineId;
|
||||
|
||||
@@ -20,7 +20,7 @@ pub(crate) struct GcBlock {
|
||||
/// Do not add any more features taking and forbidding taking this lock. It should be
|
||||
/// `tokio::sync::Notify`, but that is rarely used. On the other side, [`GcBlock::insert`]
|
||||
/// synchronizes with gc attempts by locking and unlocking this mutex.
|
||||
blocking: tokio::sync::Mutex<()>,
|
||||
blocking: Arc<tokio::sync::Mutex<()>>,
|
||||
}
|
||||
|
||||
impl GcBlock {
|
||||
@@ -30,7 +30,7 @@ impl GcBlock {
|
||||
/// it's ending, or if not currently possible, a value describing the reasons why not.
|
||||
///
|
||||
/// Cancellation safe.
|
||||
pub(super) async fn start(&self) -> Result<Guard<'_>, BlockingReasons> {
|
||||
pub(super) async fn start(&self) -> Result<Guard, BlockingReasons> {
|
||||
let reasons = {
|
||||
let g = self.reasons.lock().unwrap();
|
||||
|
||||
@@ -44,7 +44,7 @@ impl GcBlock {
|
||||
Err(reasons)
|
||||
} else {
|
||||
Ok(Guard {
|
||||
_inner: self.blocking.lock().await,
|
||||
_inner: self.blocking.clone().lock_owned().await,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -170,8 +170,8 @@ impl GcBlock {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) struct Guard<'a> {
|
||||
_inner: tokio::sync::MutexGuard<'a, ()>,
|
||||
pub(crate) struct Guard {
|
||||
_inner: tokio::sync::OwnedMutexGuard<()>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
|
||||
@@ -785,6 +785,9 @@ pub(crate) struct CompactRequest {
|
||||
/// Whether the compaction job should be scheduled.
|
||||
#[serde(default)]
|
||||
pub scheduled: bool,
|
||||
/// Whether the compaction job should be split across key ranges.
|
||||
#[serde(default)]
|
||||
pub sub_compaction: bool,
|
||||
}
|
||||
|
||||
#[serde_with::serde_as]
|
||||
@@ -814,6 +817,9 @@ pub(crate) struct CompactOptions {
|
||||
/// If set, the compaction will only compact the LSN below this value.
|
||||
/// This option is only used by GC compaction.
|
||||
pub compact_below_lsn: Option<Lsn>,
|
||||
/// Enable sub-compaction (split compaction job across key ranges).
|
||||
/// This option is only used by GC compaction.
|
||||
pub sub_compaction: bool,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Timeline {
|
||||
@@ -1637,6 +1643,7 @@ impl Timeline {
|
||||
flags,
|
||||
compact_range: None,
|
||||
compact_below_lsn: None,
|
||||
sub_compaction: false,
|
||||
},
|
||||
ctx,
|
||||
)
|
||||
|
||||
@@ -10,8 +10,8 @@ use std::sync::Arc;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode,
|
||||
RecordedDuration, Timeline,
|
||||
CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder,
|
||||
ImageLayerCreationMode, RecordedDuration, Timeline,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
@@ -29,7 +29,6 @@ use utils::id::TimelineId;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache;
|
||||
use crate::statvfs::Statvfs;
|
||||
use crate::tenant::checks::check_valid_layermap;
|
||||
use crate::tenant::remote_timeline_client::WaitCompletionError;
|
||||
use crate::tenant::storage_layer::batch_split_writer::{
|
||||
BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter,
|
||||
@@ -42,7 +41,7 @@ use crate::tenant::storage_layer::{
|
||||
use crate::tenant::timeline::ImageLayerCreationOutcome;
|
||||
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
|
||||
use crate::tenant::timeline::{Layer, ResidentLayer};
|
||||
use crate::tenant::{DeltaLayer, MaybeOffloaded};
|
||||
use crate::tenant::{gc_block, DeltaLayer, MaybeOffloaded};
|
||||
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
|
||||
use pageserver_api::config::tenant_conf_defaults::{
|
||||
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
|
||||
@@ -64,9 +63,12 @@ use super::CompactionError;
|
||||
const COMPACTION_DELTA_THRESHOLD: usize = 5;
|
||||
|
||||
/// A scheduled compaction task.
|
||||
pub struct ScheduledCompactionTask {
|
||||
pub(crate) struct ScheduledCompactionTask {
|
||||
pub options: CompactOptions,
|
||||
/// The channel to send the compaction result. If this is a subcompaction, the last compaction job holds the sender.
|
||||
pub result_tx: Option<tokio::sync::oneshot::Sender<()>>,
|
||||
/// Hold the GC block. If this is a subcompaction, the last compaction job holds the gc block guard.
|
||||
pub gc_block: Option<gc_block::Guard>,
|
||||
}
|
||||
|
||||
pub struct GcCompactionJobDescription {
|
||||
@@ -1752,6 +1754,115 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of
|
||||
/// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much
|
||||
/// ad-hoc information about gc compaction itself.
|
||||
pub(crate) async fn gc_compaction_split_jobs(
|
||||
self: &Arc<Self>,
|
||||
options: CompactOptions,
|
||||
) -> anyhow::Result<Vec<CompactOptions>> {
|
||||
if !options.sub_compaction {
|
||||
return Ok(vec![options]);
|
||||
}
|
||||
let compact_range = options.compact_range.clone().unwrap_or(CompactRange {
|
||||
start: Key::MIN,
|
||||
end: Key::MAX,
|
||||
});
|
||||
let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn {
|
||||
compact_below_lsn
|
||||
} else {
|
||||
*self.get_latest_gc_cutoff_lsn() // use the real gc cutoff
|
||||
};
|
||||
let mut compact_jobs = Vec::new();
|
||||
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
|
||||
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
|
||||
let Ok(partition) = self.partitioning.try_lock() else {
|
||||
bail!("failed to acquire partition lock");
|
||||
};
|
||||
let ((dense_ks, sparse_ks), _) = &*partition;
|
||||
// Truncate the key range to be within user specified compaction range.
|
||||
fn truncate_to(
|
||||
source_start: &Key,
|
||||
source_end: &Key,
|
||||
target_start: &Key,
|
||||
target_end: &Key,
|
||||
) -> Option<(Key, Key)> {
|
||||
let start = source_start.max(target_start);
|
||||
let end = source_end.min(target_end);
|
||||
if start < end {
|
||||
Some((*start, *end))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
let mut split_key_ranges = Vec::new();
|
||||
let ranges = dense_ks
|
||||
.parts
|
||||
.iter()
|
||||
.map(|partition| partition.ranges.iter())
|
||||
.chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter()))
|
||||
.flatten()
|
||||
.cloned()
|
||||
.collect_vec();
|
||||
for range in ranges.iter() {
|
||||
let Some((start, end)) = truncate_to(
|
||||
&range.start,
|
||||
&range.end,
|
||||
&compact_range.start,
|
||||
&compact_range.end,
|
||||
) else {
|
||||
continue;
|
||||
};
|
||||
split_key_ranges.push((start, end));
|
||||
}
|
||||
split_key_ranges.sort();
|
||||
let guard = self.layers.read().await;
|
||||
let layer_map = guard.layer_map()?;
|
||||
let mut current_start = None;
|
||||
// Split compaction job to about 2GB each
|
||||
const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future
|
||||
let ranges_num = split_key_ranges.len();
|
||||
for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() {
|
||||
if current_start.is_none() {
|
||||
current_start = Some(start);
|
||||
}
|
||||
let start = current_start.unwrap();
|
||||
if start >= end {
|
||||
// We have already processed this partition.
|
||||
continue;
|
||||
}
|
||||
let res = layer_map.range_search(start..end, compact_below_lsn);
|
||||
let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::<u64>();
|
||||
if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 {
|
||||
let mut compact_options = options.clone();
|
||||
// Try to extend the compaction range so that we include at least one full layer file.
|
||||
let extended_end = res
|
||||
.found
|
||||
.keys()
|
||||
.map(|layer| layer.layer.key_range.end)
|
||||
.min();
|
||||
// It is possible that the search range does not contain any layer files when we reach the end of the loop.
|
||||
// In this case, we simply use the specified key range end.
|
||||
let end = if let Some(extended_end) = extended_end {
|
||||
extended_end.max(end)
|
||||
} else {
|
||||
end
|
||||
};
|
||||
info!(
|
||||
"splitting compaction job: {}..{}, estimated_size={}",
|
||||
start, end, total_size
|
||||
);
|
||||
compact_options.compact_range = Some(CompactRange { start, end });
|
||||
compact_options.compact_below_lsn = Some(compact_below_lsn);
|
||||
compact_options.sub_compaction = false;
|
||||
compact_jobs.push(compact_options);
|
||||
current_start = Some(end);
|
||||
}
|
||||
}
|
||||
drop(guard);
|
||||
Ok(compact_jobs)
|
||||
}
|
||||
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
///
|
||||
/// The current implementation picks all delta + image layers that are below or intersecting with
|
||||
@@ -1774,6 +1885,36 @@ impl Timeline {
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
if options.sub_compaction {
|
||||
info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs");
|
||||
let jobs = self.gc_compaction_split_jobs(options).await?;
|
||||
let jobs_len = jobs.len();
|
||||
for (idx, job) in jobs.into_iter().enumerate() {
|
||||
info!(
|
||||
"running enhanced gc bottom-most compaction, sub-compaction {}/{}",
|
||||
idx + 1,
|
||||
jobs_len
|
||||
);
|
||||
self.compact_with_gc_inner(cancel, job, ctx).await?;
|
||||
}
|
||||
if jobs_len == 0 {
|
||||
info!("no jobs to run, skipping gc bottom-most compaction");
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
self.compact_with_gc_inner(cancel, options, ctx).await
|
||||
}
|
||||
|
||||
async fn compact_with_gc_inner(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
assert!(
|
||||
!options.sub_compaction,
|
||||
"sub-compaction should be handled by the outer function"
|
||||
);
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
@@ -1823,7 +1964,11 @@ impl Timeline {
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
let mut retain_lsns_below_horizon = Vec::new();
|
||||
let gc_cutoff = {
|
||||
let real_gc_cutoff = gc_info.cutoffs.select_min();
|
||||
// Currently, gc-compaction only kicks in after the legacy gc has updated the gc_cutoff.
|
||||
// Therefore, it can only clean up data that cannot be cleaned up with legacy gc, instead of
|
||||
// cleaning everything that theoritically it could. In the future, it should use `self.gc_info`
|
||||
// to get the truth data.
|
||||
let real_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
|
||||
// The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for
|
||||
// each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use
|
||||
// the real cutoff.
|
||||
@@ -1943,14 +2088,15 @@ impl Timeline {
|
||||
|
||||
// Step 1: construct a k-merge iterator over all layers.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let layer_names = job_desc
|
||||
.selected_layers
|
||||
.iter()
|
||||
.map(|layer| layer.layer_desc().layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
|
||||
}
|
||||
// disable the check for now because we need to adjust the check for partial compactions, will enable later.
|
||||
// let layer_names = job_desc
|
||||
// .selected_layers
|
||||
// .iter()
|
||||
// .map(|layer| layer.layer_desc().layer_name())
|
||||
// .collect_vec();
|
||||
// if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
// warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
|
||||
// }
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = job_desc
|
||||
.selected_layers
|
||||
|
||||
@@ -369,6 +369,13 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
// advances it to its end LSN. 0 is just an initialization placeholder.
|
||||
let mut modification = timeline.begin_modification(Lsn(0));
|
||||
|
||||
if !records.is_empty() {
|
||||
timeline
|
||||
.metrics
|
||||
.wal_records_received
|
||||
.inc_by(records.len() as u64);
|
||||
}
|
||||
|
||||
for interpreted in records {
|
||||
if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
|
||||
&& uncommitted_records > 0
|
||||
@@ -510,6 +517,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
|
||||
// Ingest the records without immediately committing them.
|
||||
timeline.metrics.wal_records_received.inc();
|
||||
let ingested = walingest
|
||||
.ingest_record(interpreted, &mut modification, &ctx)
|
||||
.await
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "libpq/pqformat.h"
|
||||
#include "miscadmin.h"
|
||||
#include "pgstat.h"
|
||||
#include "portability/instr_time.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/ipc.h"
|
||||
@@ -118,6 +119,11 @@ typedef struct
|
||||
*/
|
||||
PSConnectionState state;
|
||||
PGconn *conn;
|
||||
|
||||
/* request / response counters for debugging */
|
||||
uint64 nrequests_sent;
|
||||
uint64 nresponses_received;
|
||||
|
||||
/*---
|
||||
* WaitEventSet containing:
|
||||
* - WL_SOCKET_READABLE on 'conn'
|
||||
@@ -628,6 +634,8 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
}
|
||||
|
||||
shard->state = PS_Connected;
|
||||
shard->nrequests_sent = 0;
|
||||
shard->nresponses_received = 0;
|
||||
}
|
||||
/* FALLTHROUGH */
|
||||
case PS_Connected:
|
||||
@@ -656,6 +664,27 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer)
|
||||
int ret;
|
||||
PageServer *shard = &page_servers[shard_no];
|
||||
PGconn *pageserver_conn = shard->conn;
|
||||
instr_time now,
|
||||
start_ts,
|
||||
since_start,
|
||||
last_log_ts,
|
||||
since_last_log;
|
||||
bool logged = false;
|
||||
|
||||
/*
|
||||
* As a debugging aid, if we don't get a response for a long time, print a
|
||||
* log message.
|
||||
*
|
||||
* 10 s is a very generous threshold, normally we expect a response in a
|
||||
* few milliseconds. We have metrics to track latencies in normal ranges,
|
||||
* but in the cases that take exceptionally long, it's useful to log the
|
||||
* exact timestamps.
|
||||
*/
|
||||
#define LOG_INTERVAL_US UINT64CONST(10 * 1000000)
|
||||
|
||||
INSTR_TIME_SET_CURRENT(now);
|
||||
start_ts = last_log_ts = now;
|
||||
INSTR_TIME_SET_ZERO(since_last_log);
|
||||
|
||||
retry:
|
||||
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
|
||||
@@ -663,9 +692,12 @@ retry:
|
||||
if (ret == 0)
|
||||
{
|
||||
WaitEvent event;
|
||||
long timeout;
|
||||
|
||||
timeout = Min(0, LOG_INTERVAL_US - INSTR_TIME_GET_MICROSEC(since_last_log));
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
(void) WaitEventSetWait(shard->wes_read, -1L, &event, 1,
|
||||
(void) WaitEventSetWait(shard->wes_read, timeout, &event, 1,
|
||||
WAIT_EVENT_NEON_PS_READ);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
@@ -684,9 +716,40 @@ retry:
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Print a message to the log if a long time has passed with no
|
||||
* response.
|
||||
*/
|
||||
INSTR_TIME_SET_CURRENT(now);
|
||||
since_last_log = now;
|
||||
INSTR_TIME_SUBTRACT(since_last_log, last_log_ts);
|
||||
if (INSTR_TIME_GET_MICROSEC(since_last_log) >= LOG_INTERVAL_US)
|
||||
{
|
||||
since_start = now;
|
||||
INSTR_TIME_SUBTRACT(since_start, start_ts);
|
||||
neon_shard_log(shard_no, LOG, "no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses)",
|
||||
INSTR_TIME_GET_DOUBLE(since_start),
|
||||
shard->nrequests_sent, shard->nresponses_received);
|
||||
last_log_ts = now;
|
||||
logged = true;
|
||||
}
|
||||
|
||||
goto retry;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we logged earlier that the response is taking a long time, log
|
||||
* another message when the response is finally received.
|
||||
*/
|
||||
if (logged)
|
||||
{
|
||||
INSTR_TIME_SET_CURRENT(now);
|
||||
since_start = now;
|
||||
INSTR_TIME_SUBTRACT(since_start, start_ts);
|
||||
neon_shard_log(shard_no, LOG, "received response from pageserver after %0.3f s",
|
||||
INSTR_TIME_GET_DOUBLE(since_start));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -786,6 +849,7 @@ pageserver_send(shardno_t shard_no, NeonRequest *request)
|
||||
* PGRES_POLLING_WRITING state. It's kinda dirty to disconnect at this
|
||||
* point, but on the grand scheme of things it's only a small issue.
|
||||
*/
|
||||
shard->nrequests_sent++;
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
@@ -878,6 +942,7 @@ pageserver_receive(shardno_t shard_no)
|
||||
neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc);
|
||||
}
|
||||
|
||||
shard->nresponses_received++;
|
||||
return (NeonResponse *) resp;
|
||||
}
|
||||
|
||||
|
||||
@@ -423,7 +423,11 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
* ensuring we have received all but the last n requests (n = newsize).
|
||||
*/
|
||||
if (MyPState->n_requests_inflight > newsize)
|
||||
prefetch_wait_for(MyPState->ring_unused - newsize);
|
||||
{
|
||||
Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
|
||||
prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
|
||||
Assert(MyPState->n_requests_inflight <= newsize);
|
||||
}
|
||||
|
||||
/* construct the new PrefetchState, and copy over the memory contexts */
|
||||
newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
|
||||
@@ -438,7 +442,6 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
newPState->ring_last = newsize;
|
||||
newPState->ring_unused = newsize;
|
||||
newPState->ring_receive = newsize;
|
||||
newPState->ring_flush = newsize;
|
||||
newPState->max_shard_no = MyPState->max_shard_no;
|
||||
memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
|
||||
|
||||
@@ -489,6 +492,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
}
|
||||
newPState->n_unused -= 1;
|
||||
}
|
||||
newPState->ring_flush = newPState->ring_receive;
|
||||
|
||||
MyNeonCounters->getpage_prefetches_buffered =
|
||||
MyPState->n_responses_buffered;
|
||||
@@ -498,6 +502,7 @@ readahead_buffer_resize(int newsize, void *extra)
|
||||
for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
|
||||
{
|
||||
PrefetchRequest *slot = GetPrfSlot(end);
|
||||
Assert(slot->status != PRFS_REQUESTED);
|
||||
if (slot->status == PRFS_RECEIVED)
|
||||
{
|
||||
pfree(slot->response);
|
||||
@@ -610,6 +615,9 @@ prefetch_read(PrefetchRequest *slot)
|
||||
{
|
||||
NeonResponse *response;
|
||||
MemoryContext old;
|
||||
BufferTag buftag;
|
||||
shardno_t shard_no;
|
||||
uint64 my_ring_index;
|
||||
|
||||
Assert(slot->status == PRFS_REQUESTED);
|
||||
Assert(slot->response == NULL);
|
||||
@@ -623,11 +631,29 @@ prefetch_read(PrefetchRequest *slot)
|
||||
slot->status, slot->response,
|
||||
(long)slot->my_ring_index, (long)MyPState->ring_receive);
|
||||
|
||||
/*
|
||||
* Copy the request info so that if an error happens and the prefetch
|
||||
* queue is flushed during the receive call, we can print the original
|
||||
* values in the error message
|
||||
*/
|
||||
buftag = slot->buftag;
|
||||
shard_no = slot->shard_no;
|
||||
my_ring_index = slot->my_ring_index;
|
||||
|
||||
old = MemoryContextSwitchTo(MyPState->errctx);
|
||||
response = (NeonResponse *) page_server->receive(slot->shard_no);
|
||||
response = (NeonResponse *) page_server->receive(shard_no);
|
||||
MemoryContextSwitchTo(old);
|
||||
if (response)
|
||||
{
|
||||
/* The slot should still be valid */
|
||||
if (slot->status != PRFS_REQUESTED ||
|
||||
slot->response != NULL ||
|
||||
slot->my_ring_index != MyPState->ring_receive)
|
||||
neon_shard_log(shard_no, ERROR,
|
||||
"Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
|
||||
slot->status, slot->response,
|
||||
(long) slot->my_ring_index, (long) MyPState->ring_receive);
|
||||
|
||||
/* update prefetch state */
|
||||
MyPState->n_responses_buffered += 1;
|
||||
MyPState->n_requests_inflight -= 1;
|
||||
@@ -642,11 +668,15 @@ prefetch_read(PrefetchRequest *slot)
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_shard_log(slot->shard_no, LOG,
|
||||
/*
|
||||
* Note: The slot might no longer be valid, if the connection was lost
|
||||
* and the prefetch queue was flushed during the receive call
|
||||
*/
|
||||
neon_shard_log(shard_no, LOG,
|
||||
"No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
|
||||
(long)slot->my_ring_index,
|
||||
RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)),
|
||||
slot->buftag.forkNum, slot->buftag.blockNum);
|
||||
(long) my_ring_index,
|
||||
RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
|
||||
buftag.forkNum, buftag.blockNum);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,7 +115,8 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
|
||||
IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()),
|
||||
};
|
||||
if !self.limiter.lock().unwrap().check(subnet_key, 1) {
|
||||
tracing::debug!("Rate limit exceeded. Skipping cancellation message");
|
||||
// log only the subnet part of the IP address to know which subnet is rate limited
|
||||
tracing::warn!("Rate limit exceeded. Skipping cancellation message, {subnet_key}");
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.cancellation_requests_total
|
||||
|
||||
@@ -163,32 +163,36 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(ctx, stream, tls, record_handshake_error);
|
||||
|
||||
let (mut stream, params) =
|
||||
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session_id = ctx.session_id();
|
||||
let peer_ip = ctx.peer_addr();
|
||||
async move {
|
||||
drop(
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
peer_ip,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
)
|
||||
.await,
|
||||
);
|
||||
}
|
||||
});
|
||||
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
|
||||
.await??
|
||||
{
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session_id = ctx.session_id();
|
||||
let peer_ip = ctx.peer_addr();
|
||||
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
|
||||
cancel_span.follows_from(tracing::Span::current());
|
||||
async move {
|
||||
drop(
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
peer_ip,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
)
|
||||
.instrument(cancel_span)
|
||||
.await,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
drop(pause);
|
||||
|
||||
ctx.set_db_options(params.clone());
|
||||
|
||||
@@ -272,32 +272,36 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(ctx, stream, mode.handshake_tls(tls), record_handshake_error);
|
||||
|
||||
let (mut stream, params) =
|
||||
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session_id = ctx.session_id();
|
||||
let peer_ip = ctx.peer_addr();
|
||||
async move {
|
||||
drop(
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
peer_ip,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
)
|
||||
.await,
|
||||
);
|
||||
}
|
||||
});
|
||||
let (mut stream, params) = match tokio::time::timeout(config.handshake_timeout, do_handshake)
|
||||
.await??
|
||||
{
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(cancel_key_data) => {
|
||||
// spawn a task to cancel the session, but don't wait for it
|
||||
cancellations.spawn({
|
||||
let cancellation_handler_clone = Arc::clone(&cancellation_handler);
|
||||
let session_id = ctx.session_id();
|
||||
let peer_ip = ctx.peer_addr();
|
||||
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?session_id);
|
||||
cancel_span.follows_from(tracing::Span::current());
|
||||
async move {
|
||||
drop(
|
||||
cancellation_handler_clone
|
||||
.cancel_session(
|
||||
cancel_key_data,
|
||||
session_id,
|
||||
peer_ip,
|
||||
config.authentication_config.ip_allowlist_check_enabled,
|
||||
)
|
||||
.instrument(cancel_span)
|
||||
.await,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
drop(pause);
|
||||
|
||||
ctx.set_db_options(params.clone());
|
||||
|
||||
@@ -13,6 +13,7 @@ use crate::cache::project_info::ProjectInfoCache;
|
||||
use crate::cancellation::{CancelMap, CancellationHandler};
|
||||
use crate::intern::{ProjectIdInt, RoleNameInt};
|
||||
use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
|
||||
use tracing::Instrument;
|
||||
|
||||
const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
|
||||
pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates";
|
||||
@@ -143,6 +144,8 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
let peer_addr = cancel_session
|
||||
.peer_addr
|
||||
.unwrap_or(std::net::IpAddr::V4(std::net::Ipv4Addr::UNSPECIFIED));
|
||||
let cancel_span = tracing::span!(parent: None, tracing::Level::INFO, "cancel_session", session_id = ?cancel_session.session_id);
|
||||
cancel_span.follows_from(tracing::Span::current());
|
||||
// This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message.
|
||||
match self
|
||||
.cancellation_handler
|
||||
@@ -152,6 +155,7 @@ impl<C: ProjectInfoCache + Send + Sync + 'static> MessageHandler<C> {
|
||||
peer_addr,
|
||||
cancel_session.peer_addr.is_some(),
|
||||
)
|
||||
.instrument(cancel_span)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {}
|
||||
|
||||
@@ -340,7 +340,7 @@ impl PoolingBackend {
|
||||
debug!("setting up backend session state");
|
||||
|
||||
// initiates the auth session
|
||||
if let Err(e) = client.batch_execute("select auth.init();").await {
|
||||
if let Err(e) = client.execute("select auth.init()", &[]).await {
|
||||
discard.discard();
|
||||
return Err(e.into());
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use smallvec::SmallVec;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, info_span, Instrument};
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
#[cfg(test)]
|
||||
use {
|
||||
super::conn_pool_lib::GlobalConnPoolOptions,
|
||||
@@ -125,10 +125,13 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
|
||||
|
||||
match message {
|
||||
Some(Ok(AsyncMessage::Notice(notice))) => {
|
||||
debug!(%session_id, "notice: {}", notice);
|
||||
info!(%session_id, "notice: {}", notice);
|
||||
}
|
||||
Some(Ok(AsyncMessage::Notification(notif))) => {
|
||||
debug!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
warn!(%session_id, "unknown message");
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(%session_id, "connection error: {}", e);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use postgres_client::types::{Kind, Type};
|
||||
use postgres_client::{Column, Row};
|
||||
use postgres_client::Row;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
//
|
||||
@@ -77,14 +77,14 @@ pub(crate) enum JsonConversionError {
|
||||
//
|
||||
pub(crate) fn pg_text_row_to_json(
|
||||
row: &Row,
|
||||
columns: &[Column],
|
||||
c_types: &[Type],
|
||||
columns: &[Type],
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> Result<Value, JsonConversionError> {
|
||||
let iter = columns
|
||||
let iter = row
|
||||
.columns()
|
||||
.iter()
|
||||
.zip(c_types)
|
||||
.zip(columns)
|
||||
.enumerate()
|
||||
.map(|(i, (column, typ))| {
|
||||
let name = column.name();
|
||||
|
||||
@@ -23,13 +23,14 @@ use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding};
|
||||
use p256::ecdsa::{Signature, SigningKey};
|
||||
use parking_lot::RwLock;
|
||||
use postgres_client::tls::NoTlsStream;
|
||||
use postgres_client::types::ToSql;
|
||||
use postgres_client::AsyncMessage;
|
||||
use serde_json::value::RawValue;
|
||||
use signature::Signer;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, info_span, Instrument};
|
||||
use tracing::{debug, error, info, info_span, warn, Instrument};
|
||||
|
||||
use super::backend::HttpConnError;
|
||||
use super::conn_pool_lib::{
|
||||
@@ -228,10 +229,13 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
|
||||
|
||||
match message {
|
||||
Some(Ok(AsyncMessage::Notice(notice))) => {
|
||||
debug!(%session_id, "notice: {}", notice);
|
||||
info!(%session_id, "notice: {}", notice);
|
||||
}
|
||||
Some(Ok(AsyncMessage::Notification(notif))) => {
|
||||
debug!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
warn!(%session_id, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
warn!(%session_id, "unknown message");
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(%session_id, "connection error: {}", e);
|
||||
@@ -283,11 +287,12 @@ impl ClientInnerCommon<postgres_client::Client> {
|
||||
let token = resign_jwt(&local_data.key, payload, local_data.jti)?;
|
||||
|
||||
// initiates the auth session
|
||||
// the token contains only `[a-zA-Z1-9_-\.]+` so it cannot escape the string literal formatting.
|
||||
self.inner.batch_execute("discard all").await?;
|
||||
self.inner
|
||||
.batch_execute(&format!(
|
||||
"discard all; select auth.jwt_session_init('{token}');"
|
||||
))
|
||||
.execute(
|
||||
"select auth.jwt_session_init($1)",
|
||||
&[&&*token as &(dyn ToSql + Sync)],
|
||||
)
|
||||
.await?;
|
||||
|
||||
let pid = self.inner.get_process_id();
|
||||
|
||||
@@ -797,13 +797,7 @@ impl QueryData {
|
||||
let cancel_token = inner.cancel_token();
|
||||
|
||||
let res = match select(
|
||||
pin!(query_to_json(
|
||||
config,
|
||||
&mut *inner,
|
||||
self,
|
||||
&mut 0,
|
||||
parsed_headers
|
||||
)),
|
||||
pin!(query_to_json(config, &*inner, self, &mut 0, parsed_headers)),
|
||||
pin!(cancel.cancelled()),
|
||||
)
|
||||
.await
|
||||
@@ -887,7 +881,7 @@ impl BatchQueryData {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
|
||||
let mut transaction = builder.start().await.inspect_err(|_| {
|
||||
let transaction = builder.start().await.inspect_err(|_| {
|
||||
// if we cannot start a transaction, we should return immediately
|
||||
// and not return to the pool. connection is clearly broken
|
||||
discard.discard();
|
||||
@@ -896,7 +890,7 @@ impl BatchQueryData {
|
||||
let json_output = match query_batch(
|
||||
config,
|
||||
cancel.child_token(),
|
||||
&mut transaction,
|
||||
&transaction,
|
||||
self,
|
||||
parsed_headers,
|
||||
)
|
||||
@@ -940,7 +934,7 @@ impl BatchQueryData {
|
||||
async fn query_batch(
|
||||
config: &'static HttpConfig,
|
||||
cancel: CancellationToken,
|
||||
transaction: &mut Transaction<'_>,
|
||||
transaction: &Transaction<'_>,
|
||||
queries: BatchQueryData,
|
||||
parsed_headers: HttpHeaders,
|
||||
) -> Result<String, SqlOverHttpError> {
|
||||
@@ -978,7 +972,7 @@ async fn query_batch(
|
||||
|
||||
async fn query_to_json<T: GenericClient>(
|
||||
config: &'static HttpConfig,
|
||||
client: &mut T,
|
||||
client: &T,
|
||||
data: QueryData,
|
||||
current_size: &mut usize,
|
||||
parsed_headers: HttpHeaders,
|
||||
@@ -1033,7 +1027,7 @@ async fn query_to_json<T: GenericClient>(
|
||||
|
||||
let columns_len = row_stream.columns().len();
|
||||
let mut fields = Vec::with_capacity(columns_len);
|
||||
let mut c_types = Vec::with_capacity(columns_len);
|
||||
let mut columns = Vec::with_capacity(columns_len);
|
||||
|
||||
for c in row_stream.columns() {
|
||||
fields.push(json!({
|
||||
@@ -1045,7 +1039,7 @@ async fn query_to_json<T: GenericClient>(
|
||||
"dataTypeModifier": c.type_modifier(),
|
||||
"format": "text",
|
||||
}));
|
||||
c_types.push(client.get_type(c.type_oid()).await?);
|
||||
columns.push(client.get_type(c.type_oid()).await?);
|
||||
}
|
||||
|
||||
let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
|
||||
@@ -1053,15 +1047,7 @@ async fn query_to_json<T: GenericClient>(
|
||||
// convert rows to JSON
|
||||
let rows = rows
|
||||
.iter()
|
||||
.map(|row| {
|
||||
pg_text_row_to_json(
|
||||
row,
|
||||
row_stream.columns(),
|
||||
&c_types,
|
||||
parsed_headers.raw_output,
|
||||
array_mode,
|
||||
)
|
||||
})
|
||||
.map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
// Resulting JSON format is based on the format of node-postgres result.
|
||||
|
||||
@@ -83,14 +83,20 @@ impl Env {
|
||||
node_id: NodeId,
|
||||
ttid: TenantTimelineId,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let conf = self.make_conf(node_id);
|
||||
let conf = Arc::new(self.make_conf(node_id));
|
||||
let timeline_dir = get_timeline_dir(&conf, &ttid);
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
|
||||
let safekeeper = self.make_safekeeper(node_id, ttid).await?;
|
||||
let shared_state = SharedState::new(StateSK::Loaded(safekeeper));
|
||||
|
||||
let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state);
|
||||
let timeline = Timeline::new(
|
||||
ttid,
|
||||
&timeline_dir,
|
||||
&remote_path,
|
||||
shared_state,
|
||||
conf.clone(),
|
||||
);
|
||||
timeline.bootstrap(
|
||||
&mut timeline.write_shared_state().await,
|
||||
&conf,
|
||||
|
||||
@@ -338,7 +338,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
};
|
||||
|
||||
let conf = SafeKeeperConf {
|
||||
let conf = Arc::new(SafeKeeperConf {
|
||||
workdir,
|
||||
my_id: id,
|
||||
listen_pg_addr: args.listen_pg,
|
||||
@@ -368,7 +368,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
control_file_save_interval: args.control_file_save_interval,
|
||||
partial_backup_concurrency: args.partial_backup_concurrency,
|
||||
eviction_min_resident: args.eviction_min_resident,
|
||||
};
|
||||
});
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
let _sentry_guard = init_sentry(
|
||||
@@ -382,7 +382,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
/// complete, e.g. panicked, inner is error produced by task itself.
|
||||
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
|
||||
|
||||
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
|
||||
// fsync the datadir to make sure we have a consistent state on disk.
|
||||
if !conf.no_sync {
|
||||
let dfd = File::open(&conf.workdir).context("open datadir for syncfs")?;
|
||||
@@ -428,9 +428,11 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
e
|
||||
})?;
|
||||
|
||||
let global_timelines = Arc::new(GlobalTimelines::new(conf.clone()));
|
||||
|
||||
// Register metrics collector for active timelines. It's important to do this
|
||||
// after daemonizing, otherwise process collector will be upset.
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone());
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
wal_backup::init_remote_storage(&conf).await;
|
||||
@@ -447,9 +449,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
.then(|| Handle::try_current().expect("no runtime in main"));
|
||||
|
||||
// Load all timelines from disk to memory.
|
||||
GlobalTimelines::init(conf.clone()).await?;
|
||||
global_timelines.init().await?;
|
||||
|
||||
let conf_ = conf.clone();
|
||||
// Run everything in current thread rt, if asked.
|
||||
if conf.current_thread_runtime {
|
||||
info!("running in current thread runtime");
|
||||
@@ -459,14 +460,16 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
|
||||
.spawn(wal_service::task_main(
|
||||
conf_,
|
||||
conf.clone(),
|
||||
pg_listener,
|
||||
Scope::SafekeeperData,
|
||||
global_timelines.clone(),
|
||||
))
|
||||
// wrap with task name for error reporting
|
||||
.map(|res| ("WAL service main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_service_handle));
|
||||
|
||||
let global_timelines_ = global_timelines.clone();
|
||||
let timeline_housekeeping_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
|
||||
@@ -474,40 +477,45 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24);
|
||||
loop {
|
||||
tokio::time::sleep(TOMBSTONE_TTL).await;
|
||||
GlobalTimelines::housekeeping(&TOMBSTONE_TTL);
|
||||
global_timelines_.housekeeping(&TOMBSTONE_TTL);
|
||||
}
|
||||
})
|
||||
.map(|res| ("Timeline map housekeeping".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(timeline_housekeeping_handle));
|
||||
|
||||
if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
|
||||
let conf_ = conf.clone();
|
||||
let wal_service_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
|
||||
.spawn(wal_service::task_main(
|
||||
conf_,
|
||||
conf.clone(),
|
||||
pg_listener_tenant_only,
|
||||
Scope::Tenant,
|
||||
global_timelines.clone(),
|
||||
))
|
||||
// wrap with task name for error reporting
|
||||
.map(|res| ("WAL service tenant only main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_service_handle));
|
||||
}
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let http_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| HTTP_RUNTIME.handle())
|
||||
.spawn(http::task_main(conf_, http_listener))
|
||||
.spawn(http::task_main(
|
||||
conf.clone(),
|
||||
http_listener,
|
||||
global_timelines.clone(),
|
||||
))
|
||||
.map(|res| ("HTTP service main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(http_handle));
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let broker_task_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| BROKER_RUNTIME.handle())
|
||||
.spawn(broker::task_main(conf_).instrument(info_span!("broker")))
|
||||
.spawn(
|
||||
broker::task_main(conf.clone(), global_timelines.clone())
|
||||
.instrument(info_span!("broker")),
|
||||
)
|
||||
.map(|res| ("broker main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(broker_task_handle));
|
||||
|
||||
|
||||
@@ -39,14 +39,17 @@ const RETRY_INTERVAL_MSEC: u64 = 1000;
|
||||
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
||||
|
||||
/// Push once in a while data about all active timelines to the broker.
|
||||
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
async fn push_loop(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
if conf.disable_periodic_broker_push {
|
||||
info!("broker push_loop is disabled, doing nothing...");
|
||||
futures::future::pending::<()>().await; // sleep forever
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let active_timelines_set = GlobalTimelines::get_global_broker_active_set();
|
||||
let active_timelines_set = global_timelines.get_global_broker_active_set();
|
||||
|
||||
let mut client =
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
|
||||
@@ -87,8 +90,13 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
|
||||
/// Subscribe and fetch all the interesting data from the broker.
|
||||
#[instrument(name = "broker_pull", skip_all)]
|
||||
async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
|
||||
let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?;
|
||||
async fn pull_loop(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
stats: Arc<BrokerStats>,
|
||||
) -> Result<()> {
|
||||
let mut client =
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
|
||||
|
||||
// TODO: subscribe only to local timelines instead of all
|
||||
let request = SubscribeSafekeeperInfoRequest {
|
||||
@@ -113,7 +121,7 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
|
||||
let ttid = parse_proto_ttid(proto_ttid)?;
|
||||
if let Ok(tli) = GlobalTimelines::get(ttid) {
|
||||
if let Ok(tli) = global_timelines.get(ttid) {
|
||||
// Note that we also receive *our own* info. That's
|
||||
// important, as it is used as an indication of live
|
||||
// connection to the broker.
|
||||
@@ -135,7 +143,11 @@ async fn pull_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()>
|
||||
|
||||
/// Process incoming discover requests. This is done in a separate task to avoid
|
||||
/// interfering with the normal pull/push loops.
|
||||
async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<()> {
|
||||
async fn discover_loop(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
stats: Arc<BrokerStats>,
|
||||
) -> Result<()> {
|
||||
let mut client =
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
|
||||
|
||||
@@ -171,7 +183,7 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("missing tenant_timeline_id"))?;
|
||||
let ttid = parse_proto_ttid(proto_ttid)?;
|
||||
if let Ok(tli) = GlobalTimelines::get(ttid) {
|
||||
if let Ok(tli) = global_timelines.get(ttid) {
|
||||
// we received a discovery request for a timeline we know about
|
||||
discover_counter.inc();
|
||||
|
||||
@@ -210,7 +222,10 @@ async fn discover_loop(conf: SafeKeeperConf, stats: Arc<BrokerStats>) -> Result<
|
||||
bail!("end of stream");
|
||||
}
|
||||
|
||||
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
pub async fn task_main(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("started, broker endpoint {:?}", conf.broker_endpoint);
|
||||
|
||||
let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
|
||||
@@ -261,13 +276,13 @@ pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
},
|
||||
_ = ticker.tick() => {
|
||||
if push_handle.is_none() {
|
||||
push_handle = Some(tokio::spawn(push_loop(conf.clone())));
|
||||
push_handle = Some(tokio::spawn(push_loop(conf.clone(), global_timelines.clone())));
|
||||
}
|
||||
if pull_handle.is_none() {
|
||||
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), stats.clone())));
|
||||
pull_handle = Some(tokio::spawn(pull_loop(conf.clone(), global_timelines.clone(), stats.clone())));
|
||||
}
|
||||
if discover_handle.is_none() {
|
||||
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), stats.clone())));
|
||||
discover_handle = Some(tokio::spawn(discover_loop(conf.clone(), global_timelines.clone(), stats.clone())));
|
||||
}
|
||||
},
|
||||
_ = &mut stats_task => {}
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
|
||||
use std::sync::Arc;
|
||||
use tokio::{
|
||||
fs::OpenOptions,
|
||||
io::{AsyncSeekExt, AsyncWriteExt},
|
||||
@@ -14,7 +12,7 @@ use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
use crate::{
|
||||
control_file::FileStorage,
|
||||
state::TimelinePersistentState,
|
||||
timeline::{Timeline, TimelineError, WalResidentTimeline},
|
||||
timeline::{TimelineError, WalResidentTimeline},
|
||||
timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline},
|
||||
wal_backup::copy_s3_segments,
|
||||
wal_storage::{wal_file_paths, WalReader},
|
||||
@@ -25,16 +23,19 @@ use crate::{
|
||||
const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64;
|
||||
|
||||
pub struct Request {
|
||||
pub source: Arc<Timeline>,
|
||||
pub source_ttid: TenantTimelineId,
|
||||
pub until_lsn: Lsn,
|
||||
pub destination_ttid: TenantTimelineId,
|
||||
}
|
||||
|
||||
pub async fn handle_request(request: Request) -> Result<()> {
|
||||
pub async fn handle_request(
|
||||
request: Request,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Result<()> {
|
||||
// TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :(
|
||||
// if LSN will point to the middle of a WAL record, timeline will be in "broken" state
|
||||
|
||||
match GlobalTimelines::get(request.destination_ttid) {
|
||||
match global_timelines.get(request.destination_ttid) {
|
||||
// timeline already exists. would be good to check that this timeline is the copy
|
||||
// of the source timeline, but it isn't obvious how to do that
|
||||
Ok(_) => return Ok(()),
|
||||
@@ -46,9 +47,10 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
let source_tli = request.source.wal_residence_guard().await?;
|
||||
let source = global_timelines.get(request.source_ttid)?;
|
||||
let source_tli = source.wal_residence_guard().await?;
|
||||
|
||||
let conf = &GlobalTimelines::get_global_config();
|
||||
let conf = &global_timelines.get_global_config();
|
||||
let ttid = request.destination_ttid;
|
||||
|
||||
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
|
||||
@@ -127,7 +129,7 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
|
||||
copy_s3_segments(
|
||||
wal_seg_size,
|
||||
&request.source.ttid,
|
||||
&request.source_ttid,
|
||||
&request.destination_ttid,
|
||||
first_segment,
|
||||
first_ondisk_segment,
|
||||
@@ -158,7 +160,9 @@ pub async fn handle_request(request: Request) -> Result<()> {
|
||||
|
||||
// now we have a ready timeline in a temp directory
|
||||
validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?;
|
||||
GlobalTimelines::load_temp_timeline(request.destination_ttid, &tli_dir_path, true).await?;
|
||||
global_timelines
|
||||
.load_temp_timeline(request.destination_ttid, &tli_dir_path, true)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -207,23 +207,23 @@ pub struct FileInfo {
|
||||
}
|
||||
|
||||
/// Build debug dump response, using the provided [`Args`] filters.
|
||||
pub async fn build(args: Args) -> Result<Response> {
|
||||
pub async fn build(args: Args, global_timelines: Arc<GlobalTimelines>) -> Result<Response> {
|
||||
let start_time = Utc::now();
|
||||
let timelines_count = GlobalTimelines::timelines_count();
|
||||
let config = GlobalTimelines::get_global_config();
|
||||
let timelines_count = global_timelines.timelines_count();
|
||||
let config = global_timelines.get_global_config();
|
||||
|
||||
let ptrs_snapshot = if args.tenant_id.is_some() && args.timeline_id.is_some() {
|
||||
// If both tenant_id and timeline_id are specified, we can just get the
|
||||
// timeline directly, without taking a snapshot of the whole list.
|
||||
let ttid = TenantTimelineId::new(args.tenant_id.unwrap(), args.timeline_id.unwrap());
|
||||
if let Ok(tli) = GlobalTimelines::get(ttid) {
|
||||
if let Ok(tli) = global_timelines.get(ttid) {
|
||||
vec![tli]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
} else {
|
||||
// Otherwise, take a snapshot of the whole list.
|
||||
GlobalTimelines::get_all()
|
||||
global_timelines.get_all()
|
||||
};
|
||||
|
||||
let mut timelines = Vec::new();
|
||||
@@ -344,12 +344,12 @@ fn get_wal_last_modified(path: &Utf8Path) -> Result<Option<DateTime<Utc>>> {
|
||||
|
||||
/// Converts SafeKeeperConf to Config, filtering out the fields that are not
|
||||
/// supposed to be exposed.
|
||||
fn build_config(config: SafeKeeperConf) -> Config {
|
||||
fn build_config(config: Arc<SafeKeeperConf>) -> Config {
|
||||
Config {
|
||||
id: config.my_id,
|
||||
workdir: config.workdir.into(),
|
||||
listen_pg_addr: config.listen_pg_addr,
|
||||
listen_http_addr: config.listen_http_addr,
|
||||
workdir: config.workdir.clone().into(),
|
||||
listen_pg_addr: config.listen_pg_addr.clone(),
|
||||
listen_http_addr: config.listen_http_addr.clone(),
|
||||
no_sync: config.no_sync,
|
||||
max_offloader_lag_bytes: config.max_offloader_lag_bytes,
|
||||
wal_backup_enabled: config.wal_backup_enabled,
|
||||
|
||||
@@ -33,7 +33,7 @@ use utils::{
|
||||
|
||||
/// Safekeeper handler of postgres commands
|
||||
pub struct SafekeeperPostgresHandler {
|
||||
pub conf: SafeKeeperConf,
|
||||
pub conf: Arc<SafeKeeperConf>,
|
||||
/// assigned application name
|
||||
pub appname: Option<String>,
|
||||
pub tenant_id: Option<TenantId>,
|
||||
@@ -43,6 +43,7 @@ pub struct SafekeeperPostgresHandler {
|
||||
pub protocol: Option<PostgresClientProtocol>,
|
||||
/// Unique connection id is logged in spans for observability.
|
||||
pub conn_id: ConnectionId,
|
||||
pub global_timelines: Arc<GlobalTimelines>,
|
||||
/// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
|
||||
auth: Option<(Scope, Arc<JwtAuth>)>,
|
||||
claims: Option<Claims>,
|
||||
@@ -314,10 +315,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
|
||||
impl SafekeeperPostgresHandler {
|
||||
pub fn new(
|
||||
conf: SafeKeeperConf,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
conn_id: u32,
|
||||
io_metrics: Option<TrafficMetrics>,
|
||||
auth: Option<(Scope, Arc<JwtAuth>)>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Self {
|
||||
SafekeeperPostgresHandler {
|
||||
conf,
|
||||
@@ -331,6 +333,7 @@ impl SafekeeperPostgresHandler {
|
||||
claims: None,
|
||||
auth,
|
||||
io_metrics,
|
||||
global_timelines,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -360,7 +363,7 @@ impl SafekeeperPostgresHandler {
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
) -> Result<(), QueryError> {
|
||||
// Get timeline, handling "not found" error
|
||||
let tli = match GlobalTimelines::get(self.ttid) {
|
||||
let tli = match self.global_timelines.get(self.ttid) {
|
||||
Ok(tli) => Ok(Some(tli)),
|
||||
Err(TimelineError::NotFound(_)) => Ok(None),
|
||||
Err(e) => Err(QueryError::Other(e.into())),
|
||||
@@ -394,7 +397,10 @@ impl SafekeeperPostgresHandler {
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
) -> Result<(), QueryError> {
|
||||
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.get(self.ttid)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
|
||||
let lsn = if self.is_walproposer_recovery() {
|
||||
// walproposer should get all local WAL until flush_lsn
|
||||
|
||||
@@ -3,14 +3,16 @@ pub mod routes;
|
||||
pub use routes::make_router;
|
||||
|
||||
pub use safekeeper_api::models;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
pub async fn task_main(
|
||||
conf: SafeKeeperConf,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
http_listener: std::net::TcpListener,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
let router = make_router(conf)
|
||||
let router = make_router(conf, global_timelines)
|
||||
.build()
|
||||
.map_err(|err| anyhow::anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
|
||||
@@ -66,6 +66,13 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
fn get_global_timelines(request: &Request<Body>) -> Arc<GlobalTimelines> {
|
||||
request
|
||||
.data::<Arc<GlobalTimelines>>()
|
||||
.expect("unknown state type")
|
||||
.clone()
|
||||
}
|
||||
|
||||
/// Same as TermLsn, but serializes LSN using display serializer
|
||||
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
@@ -123,9 +130,11 @@ async fn tenant_delete_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
|
||||
// Using an `InternalServerError` should be fixed when the types support it
|
||||
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id, only_local)
|
||||
let delete_info = global_timelines
|
||||
.delete_force_all_for_tenant(&tenant_id, only_local)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(
|
||||
@@ -156,7 +165,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
.commit_lsn
|
||||
.segment_lsn(server_info.wal_seg_size as usize)
|
||||
});
|
||||
GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
global_timelines
|
||||
.create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -167,7 +178,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
/// Note: it is possible to do the same with debug_dump.
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
let res: Vec<TenantTimelineId> = GlobalTimelines::get_all()
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let res: Vec<TenantTimelineId> = global_timelines
|
||||
.get_all()
|
||||
.iter()
|
||||
.map(|tli| tli.ttid)
|
||||
.collect();
|
||||
@@ -182,7 +195,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let (inmem, state) = tli.get_state().await;
|
||||
let flush_lsn = tli.get_flush_lsn().await;
|
||||
|
||||
@@ -233,9 +247,11 @@ async fn timeline_delete_handler(mut request: Request<Body>) -> Result<Response<
|
||||
let only_local = parse_query_param(&request, "only_local")?.unwrap_or(false);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
|
||||
// error handling here when we're able to.
|
||||
let resp = GlobalTimelines::delete(&ttid, only_local)
|
||||
let resp = global_timelines
|
||||
.delete(&ttid, only_local)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, resp)
|
||||
@@ -247,8 +263,9 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let data: pull_timeline::Request = json_request(&mut request).await?;
|
||||
let conf = get_conf(&request);
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
|
||||
let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone())
|
||||
let resp = pull_timeline::handle_request(data, conf.sk_auth_token.clone(), global_timelines)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, resp)
|
||||
@@ -263,7 +280,8 @@ async fn timeline_snapshot_handler(request: Request<Body>) -> Result<Response<Bo
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
// To stream the body use wrap_stream which wants Stream of Result<Bytes>,
|
||||
// so create the chan and write to it in another task.
|
||||
@@ -293,19 +311,19 @@ async fn timeline_copy_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let request_data: TimelineCopyRequest = json_request(&mut request).await?;
|
||||
let ttid = TenantTimelineId::new(
|
||||
let source_ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "source_timeline_id")?,
|
||||
);
|
||||
|
||||
let source = GlobalTimelines::get(ttid)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
|
||||
copy_timeline::handle_request(copy_timeline::Request{
|
||||
source,
|
||||
source_ttid,
|
||||
until_lsn: request_data.until_lsn,
|
||||
destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id),
|
||||
})
|
||||
.instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
|
||||
destination_ttid: TenantTimelineId::new(source_ttid.tenant_id, request_data.target_timeline_id),
|
||||
}, global_timelines)
|
||||
.instrument(info_span!("copy_timeline", from=%source_ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn))
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -322,7 +340,8 @@ async fn patch_control_file_handler(
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let patch_request: patch_control_file::Request = json_request(&mut request).await?;
|
||||
let response = patch_control_file::handle_request(tli, patch_request)
|
||||
@@ -341,7 +360,8 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
|
||||
let tli = GlobalTimelines::get(ttid)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid)?;
|
||||
tli.write_shared_state()
|
||||
.await
|
||||
.sk
|
||||
@@ -359,6 +379,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let from_lsn: Option<Lsn> = parse_query_param(&request, "from_lsn")?;
|
||||
let until_lsn: Option<Lsn> = parse_query_param(&request, "until_lsn")?;
|
||||
|
||||
@@ -371,7 +392,7 @@ async fn timeline_digest_handler(request: Request<Body>) -> Result<Response<Body
|
||||
)))?,
|
||||
};
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let tli = tli
|
||||
.wal_residence_guard()
|
||||
.await
|
||||
@@ -393,7 +414,8 @@ async fn timeline_backup_partial_reset(request: Request<Body>) -> Result<Respons
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let response = tli
|
||||
.backup_partial_reset()
|
||||
@@ -415,7 +437,8 @@ async fn timeline_term_bump_handler(
|
||||
|
||||
let request_data: TimelineTermBumpRequest = json_request(&mut request).await?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
let response = tli
|
||||
.term_bump(request_data.term)
|
||||
.await
|
||||
@@ -452,7 +475,8 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
standby_horizon: sk_info.standby_horizon.0,
|
||||
};
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
let tli = global_timelines.get(ttid).map_err(ApiError::from)?;
|
||||
tli.record_safekeeper_info(proto_sk_info)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
@@ -506,6 +530,8 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
let dump_term_history = dump_term_history.unwrap_or(true);
|
||||
let dump_wal_last_modified = dump_wal_last_modified.unwrap_or(dump_all);
|
||||
|
||||
let global_timelines = get_global_timelines(&request);
|
||||
|
||||
let args = debug_dump::Args {
|
||||
dump_all,
|
||||
dump_control_file,
|
||||
@@ -517,7 +543,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
let resp = debug_dump::build(args)
|
||||
let resp = debug_dump::build(args, global_timelines)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
@@ -570,7 +596,10 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
}
|
||||
|
||||
/// Safekeeper http router.
|
||||
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
pub fn make_router(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router();
|
||||
if conf.http_auth.is_some() {
|
||||
router = router.middleware(auth_middleware(|request| {
|
||||
@@ -592,7 +621,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
// located nearby (/safekeeper/src/http/openapi_spec.yaml).
|
||||
let auth = conf.http_auth.clone();
|
||||
router
|
||||
.data(Arc::new(conf))
|
||||
.data(conf)
|
||||
.data(global_timelines)
|
||||
.data(auth)
|
||||
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
|
||||
.get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
|
||||
|
||||
@@ -11,7 +11,6 @@ use postgres_backend::QueryError;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::safekeeper::{AcceptorProposerMessage, AppendResponse, ServerInfo};
|
||||
@@ -21,7 +20,6 @@ use crate::safekeeper::{
|
||||
use crate::safekeeper::{Term, TermHistory, TermLsn};
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::timeline::WalResidentTimeline;
|
||||
use crate::GlobalTimelines;
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_ffi::encode_logical_message;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
@@ -70,7 +68,7 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
info!("JSON_CTRL request: {append_request:?}");
|
||||
|
||||
// need to init safekeeper state before AppendRequest
|
||||
let tli = prepare_safekeeper(spg.ttid, append_request.pg_version).await?;
|
||||
let tli = prepare_safekeeper(spg, append_request.pg_version).await?;
|
||||
|
||||
// if send_proposer_elected is true, we need to update local history
|
||||
if append_request.send_proposer_elected {
|
||||
@@ -99,20 +97,22 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
/// Prepare safekeeper to process append requests without crashes,
|
||||
/// by sending ProposerGreeting with default server.wal_seg_size.
|
||||
async fn prepare_safekeeper(
|
||||
ttid: TenantTimelineId,
|
||||
spg: &SafekeeperPostgresHandler,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<WalResidentTimeline> {
|
||||
let tli = GlobalTimelines::create(
|
||||
ttid,
|
||||
ServerInfo {
|
||||
pg_version,
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
system_id: 0,
|
||||
},
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await?;
|
||||
let tli = spg
|
||||
.global_timelines
|
||||
.create(
|
||||
spg.ttid,
|
||||
ServerInfo {
|
||||
pg_version,
|
||||
wal_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
system_id: 0,
|
||||
},
|
||||
Lsn::INVALID,
|
||||
Lsn::INVALID,
|
||||
)
|
||||
.await?;
|
||||
|
||||
tli.wal_residence_guard().await
|
||||
}
|
||||
|
||||
@@ -455,6 +455,7 @@ pub struct FullTimelineInfo {
|
||||
|
||||
/// Collects metrics for all active timelines.
|
||||
pub struct TimelineCollector {
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
descs: Vec<Desc>,
|
||||
commit_lsn: GenericGaugeVec<AtomicU64>,
|
||||
backup_lsn: GenericGaugeVec<AtomicU64>,
|
||||
@@ -478,14 +479,8 @@ pub struct TimelineCollector {
|
||||
active_timelines_count: IntGauge,
|
||||
}
|
||||
|
||||
impl Default for TimelineCollector {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl TimelineCollector {
|
||||
pub fn new() -> TimelineCollector {
|
||||
pub fn new(global_timelines: Arc<GlobalTimelines>) -> TimelineCollector {
|
||||
let mut descs = Vec::new();
|
||||
|
||||
let commit_lsn = GenericGaugeVec::new(
|
||||
@@ -676,6 +671,7 @@ impl TimelineCollector {
|
||||
descs.extend(active_timelines_count.desc().into_iter().cloned());
|
||||
|
||||
TimelineCollector {
|
||||
global_timelines,
|
||||
descs,
|
||||
commit_lsn,
|
||||
backup_lsn,
|
||||
@@ -728,17 +724,18 @@ impl Collector for TimelineCollector {
|
||||
self.written_wal_seconds.reset();
|
||||
self.flushed_wal_seconds.reset();
|
||||
|
||||
let timelines_count = GlobalTimelines::get_all().len();
|
||||
let timelines_count = self.global_timelines.get_all().len();
|
||||
let mut active_timelines_count = 0;
|
||||
|
||||
// Prometheus Collector is sync, and data is stored under async lock. To
|
||||
// bridge the gap with a crutch, collect data in spawned thread with
|
||||
// local tokio runtime.
|
||||
let global_timelines = self.global_timelines.clone();
|
||||
let infos = std::thread::spawn(|| {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
rt.block_on(collect_timeline_metrics())
|
||||
rt.block_on(collect_timeline_metrics(global_timelines))
|
||||
})
|
||||
.join()
|
||||
.expect("collect_timeline_metrics thread panicked");
|
||||
@@ -857,9 +854,9 @@ impl Collector for TimelineCollector {
|
||||
}
|
||||
}
|
||||
|
||||
async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
|
||||
async fn collect_timeline_metrics(global_timelines: Arc<GlobalTimelines>) -> Vec<FullTimelineInfo> {
|
||||
let mut res = vec![];
|
||||
let active_timelines = GlobalTimelines::get_global_broker_active_set().get_all();
|
||||
let active_timelines = global_timelines.get_global_broker_active_set().get_all();
|
||||
|
||||
for tli in active_timelines {
|
||||
if let Some(info) = tli.info_for_metrics().await {
|
||||
|
||||
@@ -409,8 +409,9 @@ pub struct DebugDumpResponse {
|
||||
pub async fn handle_request(
|
||||
request: Request,
|
||||
sk_auth_token: Option<SecretString>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Result<Response> {
|
||||
let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
|
||||
let existing_tli = global_timelines.get(TenantTimelineId::new(
|
||||
request.tenant_id,
|
||||
request.timeline_id,
|
||||
));
|
||||
@@ -453,13 +454,14 @@ pub async fn handle_request(
|
||||
assert!(status.tenant_id == request.tenant_id);
|
||||
assert!(status.timeline_id == request.timeline_id);
|
||||
|
||||
pull_timeline(status, safekeeper_host, sk_auth_token).await
|
||||
pull_timeline(status, safekeeper_host, sk_auth_token, global_timelines).await
|
||||
}
|
||||
|
||||
async fn pull_timeline(
|
||||
status: TimelineStatus,
|
||||
host: String,
|
||||
sk_auth_token: Option<SecretString>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Result<Response> {
|
||||
let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id);
|
||||
info!(
|
||||
@@ -472,7 +474,7 @@ async fn pull_timeline(
|
||||
status.acceptor_state.epoch
|
||||
);
|
||||
|
||||
let conf = &GlobalTimelines::get_global_config();
|
||||
let conf = &global_timelines.get_global_config();
|
||||
|
||||
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
|
||||
|
||||
@@ -531,7 +533,9 @@ async fn pull_timeline(
|
||||
assert!(status.commit_lsn <= status.flush_lsn);
|
||||
|
||||
// Finally, load the timeline.
|
||||
let _tli = GlobalTimelines::load_temp_timeline(ttid, &tli_dir_path, false).await?;
|
||||
let _tli = global_timelines
|
||||
.load_temp_timeline(ttid, &tli_dir_path, false)
|
||||
.await?;
|
||||
|
||||
Ok(Response {
|
||||
safekeeper_host: host,
|
||||
|
||||
@@ -267,6 +267,7 @@ impl SafekeeperPostgresHandler {
|
||||
pgb_reader: &mut pgb_reader,
|
||||
peer_addr,
|
||||
acceptor_handle: &mut acceptor_handle,
|
||||
global_timelines: self.global_timelines.clone(),
|
||||
};
|
||||
|
||||
// Read first message and create timeline if needed.
|
||||
@@ -331,6 +332,7 @@ struct NetworkReader<'a, IO> {
|
||||
// WalAcceptor is spawned when we learn server info from walproposer and
|
||||
// create timeline; handle is put here.
|
||||
acceptor_handle: &'a mut Option<JoinHandle<anyhow::Result<()>>>,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
}
|
||||
|
||||
impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
@@ -350,10 +352,11 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
system_id: greeting.system_id,
|
||||
wal_seg_size: greeting.wal_seg_size,
|
||||
};
|
||||
let tli =
|
||||
GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.await
|
||||
.context("create timeline")?;
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID)
|
||||
.await
|
||||
.context("create timeline")?;
|
||||
tli.wal_residence_guard().await?
|
||||
}
|
||||
_ => {
|
||||
|
||||
@@ -10,7 +10,6 @@ use crate::timeline::WalResidentTimeline;
|
||||
use crate::wal_reader_stream::WalReaderStreamBuilder;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::wal_storage::WalReader;
|
||||
use crate::GlobalTimelines;
|
||||
use anyhow::{bail, Context as AnyhowContext};
|
||||
use bytes::Bytes;
|
||||
use futures::future::Either;
|
||||
@@ -400,7 +399,10 @@ impl SafekeeperPostgresHandler {
|
||||
start_pos: Lsn,
|
||||
term: Option<Term>,
|
||||
) -> Result<(), QueryError> {
|
||||
let tli = GlobalTimelines::get(self.ttid).map_err(|e| QueryError::Other(e.into()))?;
|
||||
let tli = self
|
||||
.global_timelines
|
||||
.get(self.ttid)
|
||||
.map_err(|e| QueryError::Other(e.into()))?;
|
||||
let residence_guard = tli.wal_residence_guard().await?;
|
||||
|
||||
if let Err(end) = self
|
||||
|
||||
@@ -44,8 +44,8 @@ use crate::wal_backup_partial::PartialRemoteSegment;
|
||||
|
||||
use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS};
|
||||
use crate::wal_storage::{Storage as wal_storage_iface, WalReader};
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::{debug_dump, timeline_manager, wal_storage};
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
/// Things safekeeper should know about timeline state on peers.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@@ -467,6 +467,7 @@ pub struct Timeline {
|
||||
walreceivers: Arc<WalReceivers>,
|
||||
timeline_dir: Utf8PathBuf,
|
||||
manager_ctl: ManagerCtl,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
|
||||
/// Hold this gate from code that depends on the Timeline's non-shut-down state. While holding
|
||||
/// this gate, you must respect [`Timeline::cancel`]
|
||||
@@ -489,6 +490,7 @@ impl Timeline {
|
||||
timeline_dir: &Utf8Path,
|
||||
remote_path: &RemotePath,
|
||||
shared_state: SharedState,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
) -> Arc<Self> {
|
||||
let (commit_lsn_watch_tx, commit_lsn_watch_rx) =
|
||||
watch::channel(shared_state.sk.state().commit_lsn);
|
||||
@@ -516,6 +518,7 @@ impl Timeline {
|
||||
gate: Default::default(),
|
||||
cancel: CancellationToken::default(),
|
||||
manager_ctl: ManagerCtl::new(),
|
||||
conf,
|
||||
broker_active: AtomicBool::new(false),
|
||||
wal_backup_active: AtomicBool::new(false),
|
||||
last_removed_segno: AtomicU64::new(0),
|
||||
@@ -524,11 +527,14 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Load existing timeline from disk.
|
||||
pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result<Arc<Timeline>> {
|
||||
pub fn load_timeline(
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
ttid: TenantTimelineId,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered();
|
||||
|
||||
let shared_state = SharedState::restore(conf, &ttid)?;
|
||||
let timeline_dir = get_timeline_dir(conf, &ttid);
|
||||
let shared_state = SharedState::restore(conf.as_ref(), &ttid)?;
|
||||
let timeline_dir = get_timeline_dir(conf.as_ref(), &ttid);
|
||||
let remote_path = remote_timeline_path(&ttid)?;
|
||||
|
||||
Ok(Timeline::new(
|
||||
@@ -536,6 +542,7 @@ impl Timeline {
|
||||
&timeline_dir,
|
||||
&remote_path,
|
||||
shared_state,
|
||||
conf,
|
||||
))
|
||||
}
|
||||
|
||||
@@ -604,8 +611,7 @@ impl Timeline {
|
||||
// it is cancelled, so WAL storage won't be opened again.
|
||||
shared_state.sk.close_wal_store();
|
||||
|
||||
let conf = GlobalTimelines::get_global_config();
|
||||
if !only_local && conf.is_wal_backup_enabled() {
|
||||
if !only_local && self.conf.is_wal_backup_enabled() {
|
||||
// Note: we concurrently delete remote storage data from multiple
|
||||
// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
|
||||
// do some retries anyway.
|
||||
@@ -951,7 +957,7 @@ impl WalResidentTimeline {
|
||||
|
||||
pub async fn get_walreader(&self, start_lsn: Lsn) -> Result<WalReader> {
|
||||
let (_, persisted_state) = self.get_state().await;
|
||||
let enable_remote_read = GlobalTimelines::get_global_config().is_wal_backup_enabled();
|
||||
let enable_remote_read = self.conf.is_wal_backup_enabled();
|
||||
|
||||
WalReader::new(
|
||||
&self.ttid,
|
||||
@@ -1061,7 +1067,6 @@ impl ManagerTimeline {
|
||||
|
||||
/// Try to switch state Offloaded->Present.
|
||||
pub(crate) async fn switch_to_present(&self) -> anyhow::Result<()> {
|
||||
let conf = GlobalTimelines::get_global_config();
|
||||
let mut shared = self.write_shared_state().await;
|
||||
|
||||
// trying to restore WAL storage
|
||||
@@ -1069,7 +1074,7 @@ impl ManagerTimeline {
|
||||
&self.ttid,
|
||||
&self.timeline_dir,
|
||||
shared.sk.state(),
|
||||
conf.no_sync,
|
||||
self.conf.no_sync,
|
||||
)?;
|
||||
|
||||
// updating control file
|
||||
@@ -1096,7 +1101,7 @@ impl ManagerTimeline {
|
||||
// now we can switch shared.sk to Present, shouldn't fail
|
||||
let prev_sk = std::mem::replace(&mut shared.sk, StateSK::Empty);
|
||||
let cfile_state = prev_sk.take_state();
|
||||
shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, conf.my_id)?);
|
||||
shared.sk = StateSK::Loaded(SafeKeeper::new(cfile_state, wal_store, self.conf.my_id)?);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -13,7 +13,6 @@ use crate::{control_file, wal_storage, SafeKeeperConf};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::Serialize;
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
@@ -42,23 +41,16 @@ struct GlobalTimelinesState {
|
||||
// this map is dropped on restart.
|
||||
tombstones: HashMap<TenantTimelineId, Instant>,
|
||||
|
||||
conf: Option<SafeKeeperConf>,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
broker_active_set: Arc<TimelinesSet>,
|
||||
global_rate_limiter: RateLimiter,
|
||||
}
|
||||
|
||||
impl GlobalTimelinesState {
|
||||
/// Get configuration, which must be set once during init.
|
||||
fn get_conf(&self) -> &SafeKeeperConf {
|
||||
self.conf
|
||||
.as_ref()
|
||||
.expect("GlobalTimelinesState conf is not initialized")
|
||||
}
|
||||
|
||||
/// Get dependencies for a timeline constructor.
|
||||
fn get_dependencies(&self) -> (SafeKeeperConf, Arc<TimelinesSet>, RateLimiter) {
|
||||
fn get_dependencies(&self) -> (Arc<SafeKeeperConf>, Arc<TimelinesSet>, RateLimiter) {
|
||||
(
|
||||
self.get_conf().clone(),
|
||||
self.conf.clone(),
|
||||
self.broker_active_set.clone(),
|
||||
self.global_rate_limiter.clone(),
|
||||
)
|
||||
@@ -82,35 +74,39 @@ impl GlobalTimelinesState {
|
||||
}
|
||||
}
|
||||
|
||||
static TIMELINES_STATE: Lazy<Mutex<GlobalTimelinesState>> = Lazy::new(|| {
|
||||
Mutex::new(GlobalTimelinesState {
|
||||
timelines: HashMap::new(),
|
||||
tombstones: HashMap::new(),
|
||||
conf: None,
|
||||
broker_active_set: Arc::new(TimelinesSet::default()),
|
||||
global_rate_limiter: RateLimiter::new(1, 1),
|
||||
})
|
||||
});
|
||||
|
||||
/// A zero-sized struct used to manage access to the global timelines map.
|
||||
pub struct GlobalTimelines;
|
||||
/// A struct used to manage access to the global timelines map.
|
||||
pub struct GlobalTimelines {
|
||||
state: Mutex<GlobalTimelinesState>,
|
||||
}
|
||||
|
||||
impl GlobalTimelines {
|
||||
/// Create a new instance of the global timelines map.
|
||||
pub fn new(conf: Arc<SafeKeeperConf>) -> Self {
|
||||
Self {
|
||||
state: Mutex::new(GlobalTimelinesState {
|
||||
timelines: HashMap::new(),
|
||||
tombstones: HashMap::new(),
|
||||
conf,
|
||||
broker_active_set: Arc::new(TimelinesSet::default()),
|
||||
global_rate_limiter: RateLimiter::new(1, 1),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Inject dependencies needed for the timeline constructors and load all timelines to memory.
|
||||
pub async fn init(conf: SafeKeeperConf) -> Result<()> {
|
||||
pub async fn init(&self) -> Result<()> {
|
||||
// clippy isn't smart enough to understand that drop(state) releases the
|
||||
// lock, so use explicit block
|
||||
let tenants_dir = {
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
let mut state = self.state.lock().unwrap();
|
||||
state.global_rate_limiter = RateLimiter::new(
|
||||
conf.partial_backup_concurrency,
|
||||
state.conf.partial_backup_concurrency,
|
||||
DEFAULT_EVICTION_CONCURRENCY,
|
||||
);
|
||||
state.conf = Some(conf);
|
||||
|
||||
// Iterate through all directories and load tenants for all directories
|
||||
// named as a valid tenant_id.
|
||||
state.get_conf().workdir.clone()
|
||||
state.conf.workdir.clone()
|
||||
};
|
||||
let mut tenant_count = 0;
|
||||
for tenants_dir_entry in std::fs::read_dir(&tenants_dir)
|
||||
@@ -122,7 +118,7 @@ impl GlobalTimelines {
|
||||
TenantId::from_str(tenants_dir_entry.file_name().to_str().unwrap_or(""))
|
||||
{
|
||||
tenant_count += 1;
|
||||
GlobalTimelines::load_tenant_timelines(tenant_id).await?;
|
||||
self.load_tenant_timelines(tenant_id).await?;
|
||||
}
|
||||
}
|
||||
Err(e) => error!(
|
||||
@@ -135,7 +131,7 @@ impl GlobalTimelines {
|
||||
info!(
|
||||
"found {} tenants directories, successfully loaded {} timelines",
|
||||
tenant_count,
|
||||
TIMELINES_STATE.lock().unwrap().timelines.len()
|
||||
self.state.lock().unwrap().timelines.len()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -143,13 +139,13 @@ impl GlobalTimelines {
|
||||
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
|
||||
/// errors if any.
|
||||
///
|
||||
/// It is async, but TIMELINES_STATE lock is sync and there is no important
|
||||
/// It is async, but self.state lock is sync and there is no important
|
||||
/// reason to make it async (it is always held for a short while), so we
|
||||
/// just lock and unlock it for each timeline -- this function is called
|
||||
/// during init when nothing else is running, so this is fine.
|
||||
async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> {
|
||||
async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> {
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter) = {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
let state = self.state.lock().unwrap();
|
||||
state.get_dependencies()
|
||||
};
|
||||
|
||||
@@ -163,10 +159,10 @@ impl GlobalTimelines {
|
||||
TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or(""))
|
||||
{
|
||||
let ttid = TenantTimelineId::new(tenant_id, timeline_id);
|
||||
match Timeline::load_timeline(&conf, ttid) {
|
||||
match Timeline::load_timeline(conf.clone(), ttid) {
|
||||
Ok(tli) => {
|
||||
let mut shared_state = tli.write_shared_state().await;
|
||||
TIMELINES_STATE
|
||||
self.state
|
||||
.lock()
|
||||
.unwrap()
|
||||
.timelines
|
||||
@@ -200,29 +196,30 @@ impl GlobalTimelines {
|
||||
}
|
||||
|
||||
/// Get the number of timelines in the map.
|
||||
pub fn timelines_count() -> usize {
|
||||
TIMELINES_STATE.lock().unwrap().timelines.len()
|
||||
pub fn timelines_count(&self) -> usize {
|
||||
self.state.lock().unwrap().timelines.len()
|
||||
}
|
||||
|
||||
/// Get the global safekeeper config.
|
||||
pub fn get_global_config() -> SafeKeeperConf {
|
||||
TIMELINES_STATE.lock().unwrap().get_conf().clone()
|
||||
pub fn get_global_config(&self) -> Arc<SafeKeeperConf> {
|
||||
self.state.lock().unwrap().conf.clone()
|
||||
}
|
||||
|
||||
pub fn get_global_broker_active_set() -> Arc<TimelinesSet> {
|
||||
TIMELINES_STATE.lock().unwrap().broker_active_set.clone()
|
||||
pub fn get_global_broker_active_set(&self) -> Arc<TimelinesSet> {
|
||||
self.state.lock().unwrap().broker_active_set.clone()
|
||||
}
|
||||
|
||||
/// Create a new timeline with the given id. If the timeline already exists, returns
|
||||
/// an existing timeline.
|
||||
pub(crate) async fn create(
|
||||
&self,
|
||||
ttid: TenantTimelineId,
|
||||
server_info: ServerInfo,
|
||||
commit_lsn: Lsn,
|
||||
local_start_lsn: Lsn,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let (conf, _, _) = {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
let state = self.state.lock().unwrap();
|
||||
if let Ok(timeline) = state.get(&ttid) {
|
||||
// Timeline already exists, return it.
|
||||
return Ok(timeline);
|
||||
@@ -245,7 +242,7 @@ impl GlobalTimelines {
|
||||
let state =
|
||||
TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?;
|
||||
control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?;
|
||||
let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
||||
let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?;
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -261,13 +258,14 @@ impl GlobalTimelines {
|
||||
/// 2) move the directory and load the timeline
|
||||
/// 3) take lock again and insert the timeline into the global map.
|
||||
pub async fn load_temp_timeline(
|
||||
&self,
|
||||
ttid: TenantTimelineId,
|
||||
tmp_path: &Utf8PathBuf,
|
||||
check_tombstone: bool,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// Check for existence and mark that we're creating it.
|
||||
let (conf, broker_active_set, partial_backup_rate_limiter) = {
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
let mut state = self.state.lock().unwrap();
|
||||
match state.timelines.get(&ttid) {
|
||||
Some(GlobalMapTimeline::CreationInProgress) => {
|
||||
bail!(TimelineError::CreationInProgress(ttid));
|
||||
@@ -295,10 +293,10 @@ impl GlobalTimelines {
|
||||
};
|
||||
|
||||
// Do the actual move and reflect the result in the map.
|
||||
match GlobalTimelines::install_temp_timeline(ttid, tmp_path, &conf).await {
|
||||
match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await {
|
||||
Ok(timeline) => {
|
||||
let mut timeline_shared_state = timeline.write_shared_state().await;
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
let mut state = self.state.lock().unwrap();
|
||||
assert!(matches!(
|
||||
state.timelines.get(&ttid),
|
||||
Some(GlobalMapTimeline::CreationInProgress)
|
||||
@@ -319,7 +317,7 @@ impl GlobalTimelines {
|
||||
}
|
||||
Err(e) => {
|
||||
// Init failed, remove the marker from the map
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
let mut state = self.state.lock().unwrap();
|
||||
assert!(matches!(
|
||||
state.timelines.get(&ttid),
|
||||
Some(GlobalMapTimeline::CreationInProgress)
|
||||
@@ -334,10 +332,10 @@ impl GlobalTimelines {
|
||||
async fn install_temp_timeline(
|
||||
ttid: TenantTimelineId,
|
||||
tmp_path: &Utf8PathBuf,
|
||||
conf: &SafeKeeperConf,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let tenant_path = get_tenant_dir(conf, &ttid.tenant_id);
|
||||
let timeline_path = get_timeline_dir(conf, &ttid);
|
||||
let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id);
|
||||
let timeline_path = get_timeline_dir(conf.as_ref(), &ttid);
|
||||
|
||||
// We must have already checked that timeline doesn't exist in the map,
|
||||
// but there might be existing datadir: if timeline is corrupted it is
|
||||
@@ -382,9 +380,9 @@ impl GlobalTimelines {
|
||||
/// Get a timeline from the global map. If it's not present, it doesn't exist on disk,
|
||||
/// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid,
|
||||
/// i.e. loaded in memory and not cancelled.
|
||||
pub(crate) fn get(ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
|
||||
pub(crate) fn get(&self, ttid: TenantTimelineId) -> Result<Arc<Timeline>, TimelineError> {
|
||||
let tli_res = {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
let state = self.state.lock().unwrap();
|
||||
state.get(&ttid)
|
||||
};
|
||||
match tli_res {
|
||||
@@ -399,8 +397,8 @@ impl GlobalTimelines {
|
||||
}
|
||||
|
||||
/// Returns all timelines. This is used for background timeline processes.
|
||||
pub fn get_all() -> Vec<Arc<Timeline>> {
|
||||
let global_lock = TIMELINES_STATE.lock().unwrap();
|
||||
pub fn get_all(&self) -> Vec<Arc<Timeline>> {
|
||||
let global_lock = self.state.lock().unwrap();
|
||||
global_lock
|
||||
.timelines
|
||||
.values()
|
||||
@@ -419,8 +417,8 @@ impl GlobalTimelines {
|
||||
|
||||
/// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
|
||||
/// and that's why it can return cancelled timelines, to retry deleting them.
|
||||
fn get_all_for_tenant(tenant_id: TenantId) -> Vec<Arc<Timeline>> {
|
||||
let global_lock = TIMELINES_STATE.lock().unwrap();
|
||||
fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec<Arc<Timeline>> {
|
||||
let global_lock = self.state.lock().unwrap();
|
||||
global_lock
|
||||
.timelines
|
||||
.values()
|
||||
@@ -435,11 +433,12 @@ impl GlobalTimelines {
|
||||
/// Cancels timeline, then deletes the corresponding data directory.
|
||||
/// If only_local, doesn't remove WAL segments in remote storage.
|
||||
pub(crate) async fn delete(
|
||||
&self,
|
||||
ttid: &TenantTimelineId,
|
||||
only_local: bool,
|
||||
) -> Result<TimelineDeleteForceResult> {
|
||||
let tli_res = {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
let state = self.state.lock().unwrap();
|
||||
|
||||
if state.tombstones.contains_key(ttid) {
|
||||
// Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do.
|
||||
@@ -472,7 +471,7 @@ impl GlobalTimelines {
|
||||
}
|
||||
Err(_) => {
|
||||
// Timeline is not memory, but it may still exist on disk in broken state.
|
||||
let dir_path = get_timeline_dir(TIMELINES_STATE.lock().unwrap().get_conf(), ttid);
|
||||
let dir_path = get_timeline_dir(self.state.lock().unwrap().conf.as_ref(), ttid);
|
||||
let dir_existed = delete_dir(dir_path)?;
|
||||
|
||||
Ok(TimelineDeleteForceResult {
|
||||
@@ -485,7 +484,7 @@ impl GlobalTimelines {
|
||||
// Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones
|
||||
// are used to prevent still-running computes from re-creating the same timeline when they send data,
|
||||
// and to speed up repeated deletion calls by avoiding re-listing objects.
|
||||
TIMELINES_STATE.lock().unwrap().delete(*ttid);
|
||||
self.state.lock().unwrap().delete(*ttid);
|
||||
|
||||
result
|
||||
}
|
||||
@@ -497,17 +496,18 @@ impl GlobalTimelines {
|
||||
///
|
||||
/// If only_local, doesn't remove WAL segments in remote storage.
|
||||
pub async fn delete_force_all_for_tenant(
|
||||
&self,
|
||||
tenant_id: &TenantId,
|
||||
only_local: bool,
|
||||
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
|
||||
info!("deleting all timelines for tenant {}", tenant_id);
|
||||
let to_delete = Self::get_all_for_tenant(*tenant_id);
|
||||
let to_delete = self.get_all_for_tenant(*tenant_id);
|
||||
|
||||
let mut err = None;
|
||||
|
||||
let mut deleted = HashMap::new();
|
||||
for tli in &to_delete {
|
||||
match Self::delete(&tli.ttid, only_local).await {
|
||||
match self.delete(&tli.ttid, only_local).await {
|
||||
Ok(result) => {
|
||||
deleted.insert(tli.ttid, result);
|
||||
}
|
||||
@@ -529,15 +529,15 @@ impl GlobalTimelines {
|
||||
// so the directory may be not empty. In this case timelines will have bad state
|
||||
// and timeline background jobs can panic.
|
||||
delete_dir(get_tenant_dir(
|
||||
TIMELINES_STATE.lock().unwrap().get_conf(),
|
||||
self.state.lock().unwrap().conf.as_ref(),
|
||||
tenant_id,
|
||||
))?;
|
||||
|
||||
Ok(deleted)
|
||||
}
|
||||
|
||||
pub fn housekeeping(tombstone_ttl: &Duration) {
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
pub fn housekeeping(&self, tombstone_ttl: &Duration) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
|
||||
// We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted
|
||||
// timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
//!
|
||||
use anyhow::{Context, Result};
|
||||
use postgres_backend::QueryError;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_io_timeout::TimeoutReader;
|
||||
@@ -11,9 +12,9 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::{auth::Scope, measured_stream::MeasuredStream};
|
||||
|
||||
use crate::handler::SafekeeperPostgresHandler;
|
||||
use crate::metrics::TrafficMetrics;
|
||||
use crate::SafeKeeperConf;
|
||||
use crate::{handler::SafekeeperPostgresHandler, GlobalTimelines};
|
||||
use postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
@@ -22,9 +23,10 @@ use postgres_backend::{AuthType, PostgresBackend};
|
||||
/// to any tenant are allowed) or Tenant (only tokens giving access to specific
|
||||
/// tenant are allowed). Doesn't matter if auth is disabled in conf.
|
||||
pub async fn task_main(
|
||||
conf: SafeKeeperConf,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
pg_listener: std::net::TcpListener,
|
||||
allowed_auth_scope: Scope,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Tokio's from_std won't do this for us, per its comment.
|
||||
pg_listener.set_nonblocking(true)?;
|
||||
@@ -37,10 +39,10 @@ pub async fn task_main(
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let conf = conf.clone();
|
||||
let conn_id = issue_connection_id(&mut connection_count);
|
||||
|
||||
let global_timelines = global_timelines.clone();
|
||||
tokio::spawn(
|
||||
async move {
|
||||
if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope).await {
|
||||
if let Err(err) = handle_socket(socket, conf, conn_id, allowed_auth_scope, global_timelines).await {
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
}
|
||||
@@ -53,9 +55,10 @@ pub async fn task_main(
|
||||
///
|
||||
async fn handle_socket(
|
||||
socket: TcpStream,
|
||||
conf: SafeKeeperConf,
|
||||
conf: Arc<SafeKeeperConf>,
|
||||
conn_id: ConnectionId,
|
||||
allowed_auth_scope: Scope,
|
||||
global_timelines: Arc<GlobalTimelines>,
|
||||
) -> Result<(), QueryError> {
|
||||
socket.set_nodelay(true)?;
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
@@ -96,8 +99,13 @@ async fn handle_socket(
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
|
||||
let mut conn_handler =
|
||||
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
|
||||
let mut conn_handler = SafekeeperPostgresHandler::new(
|
||||
conf,
|
||||
conn_id,
|
||||
Some(traffic_metrics.clone()),
|
||||
auth_pair,
|
||||
global_timelines,
|
||||
);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
|
||||
@@ -636,6 +636,13 @@ impl Persistence {
|
||||
.into_boxed(),
|
||||
};
|
||||
|
||||
// Clear generation_pageserver if we are moving into a state where we won't have
|
||||
// any attached pageservers.
|
||||
let input_generation_pageserver = match input_placement_policy {
|
||||
None | Some(PlacementPolicy::Attached(_)) => None,
|
||||
Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None),
|
||||
};
|
||||
|
||||
#[derive(AsChangeset)]
|
||||
#[diesel(table_name = crate::schema::tenant_shards)]
|
||||
struct ShardUpdate {
|
||||
@@ -643,6 +650,7 @@ impl Persistence {
|
||||
placement_policy: Option<String>,
|
||||
config: Option<String>,
|
||||
scheduling_policy: Option<String>,
|
||||
generation_pageserver: Option<Option<i64>>,
|
||||
}
|
||||
|
||||
let update = ShardUpdate {
|
||||
@@ -655,6 +663,7 @@ impl Persistence {
|
||||
.map(|c| serde_json::to_string(&c).unwrap()),
|
||||
scheduling_policy: input_scheduling_policy
|
||||
.map(|p| serde_json::to_string(&p).unwrap()),
|
||||
generation_pageserver: input_generation_pageserver,
|
||||
};
|
||||
|
||||
query.set(update).execute(conn)?;
|
||||
|
||||
@@ -513,6 +513,9 @@ struct ShardUpdate {
|
||||
|
||||
/// If this is None, generation is not updated.
|
||||
generation: Option<Generation>,
|
||||
|
||||
/// If this is None, scheduling policy is not updated.
|
||||
scheduling_policy: Option<ShardSchedulingPolicy>,
|
||||
}
|
||||
|
||||
enum StopReconciliationsReason {
|
||||
@@ -2376,6 +2379,23 @@ impl Service {
|
||||
}
|
||||
};
|
||||
|
||||
// Ordinarily we do not update scheduling policy, but when making major changes
|
||||
// like detaching or demoting to secondary-only, we need to force the scheduling
|
||||
// mode to Active, or the caller's expected outcome (detach it) will not happen.
|
||||
let scheduling_policy = match req.config.mode {
|
||||
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
|
||||
// Special case: when making major changes like detaching or demoting to secondary-only,
|
||||
// we need to force the scheduling mode to Active, or nothing will happen.
|
||||
Some(ShardSchedulingPolicy::Active)
|
||||
}
|
||||
LocationConfigMode::AttachedMulti
|
||||
| LocationConfigMode::AttachedSingle
|
||||
| LocationConfigMode::AttachedStale => {
|
||||
// While attached, continue to respect whatever the existing scheduling mode is.
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let mut create = true;
|
||||
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
|
||||
// Saw an existing shard: this is not a creation
|
||||
@@ -2401,6 +2421,7 @@ impl Service {
|
||||
placement_policy: placement_policy.clone(),
|
||||
tenant_config: req.config.tenant_conf.clone(),
|
||||
generation: set_generation,
|
||||
scheduling_policy,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -2497,6 +2518,7 @@ impl Service {
|
||||
placement_policy,
|
||||
tenant_config,
|
||||
generation,
|
||||
scheduling_policy,
|
||||
} in &updates
|
||||
{
|
||||
self.persistence
|
||||
@@ -2505,7 +2527,7 @@ impl Service {
|
||||
Some(placement_policy.clone()),
|
||||
Some(tenant_config.clone()),
|
||||
*generation,
|
||||
None,
|
||||
*scheduling_policy,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
@@ -2521,6 +2543,7 @@ impl Service {
|
||||
placement_policy,
|
||||
tenant_config,
|
||||
generation: update_generation,
|
||||
scheduling_policy,
|
||||
} in updates
|
||||
{
|
||||
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
|
||||
@@ -2539,6 +2562,10 @@ impl Service {
|
||||
shard.generation = Some(generation);
|
||||
}
|
||||
|
||||
if let Some(scheduling_policy) = scheduling_policy {
|
||||
shard.set_scheduling_policy(scheduling_policy);
|
||||
}
|
||||
|
||||
shard.schedule(scheduler, &mut schedule_context)?;
|
||||
|
||||
let maybe_waiter = self.maybe_reconcile_shard(shard, nodes);
|
||||
@@ -2992,9 +3019,17 @@ impl Service {
|
||||
|
||||
let TenantPolicyRequest {
|
||||
placement,
|
||||
scheduling,
|
||||
mut scheduling,
|
||||
} = req;
|
||||
|
||||
if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement {
|
||||
// When someone configures a tenant to detach, we force the scheduling policy to enable
|
||||
// this to take effect.
|
||||
if scheduling.is_none() {
|
||||
scheduling = Some(ShardSchedulingPolicy::Active);
|
||||
}
|
||||
}
|
||||
|
||||
self.persistence
|
||||
.update_tenant_shard(
|
||||
TenantFilter::Tenant(tenant_id),
|
||||
|
||||
@@ -459,12 +459,10 @@ pub async fn get_timeline_objects(
|
||||
Ok(list.keys)
|
||||
}
|
||||
|
||||
const MAX_KEYS_PER_DELETE: usize = 1000;
|
||||
|
||||
/// Drain a buffer of keys into DeleteObjects requests
|
||||
///
|
||||
/// If `drain` is true, drains keys completely; otherwise stops when <
|
||||
/// MAX_KEYS_PER_DELETE keys are left.
|
||||
/// `max_keys_per_delete`` keys are left.
|
||||
/// `num_deleted` returns number of deleted keys.
|
||||
async fn do_delete(
|
||||
remote_client: &GenericRemoteStorage,
|
||||
@@ -474,9 +472,10 @@ async fn do_delete(
|
||||
progress_tracker: &mut DeletionProgressTracker,
|
||||
) -> anyhow::Result<()> {
|
||||
let cancel = CancellationToken::new();
|
||||
while (!keys.is_empty() && drain) || (keys.len() >= MAX_KEYS_PER_DELETE) {
|
||||
let max_keys_per_delete = remote_client.max_keys_per_delete();
|
||||
while (!keys.is_empty() && drain) || (keys.len() >= max_keys_per_delete) {
|
||||
let request_keys =
|
||||
keys.split_off(keys.len() - (std::cmp::min(MAX_KEYS_PER_DELETE, keys.len())));
|
||||
keys.split_off(keys.len() - (std::cmp::min(max_keys_per_delete, keys.len())));
|
||||
|
||||
let request_keys: Vec<RemotePath> = request_keys.into_iter().map(|o| o.key).collect();
|
||||
|
||||
@@ -617,7 +616,7 @@ pub async fn purge_garbage(
|
||||
}
|
||||
|
||||
objects_to_delete.append(&mut object_list);
|
||||
if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
|
||||
if objects_to_delete.len() >= remote_client.max_keys_per_delete() {
|
||||
do_delete(
|
||||
&remote_client,
|
||||
&mut objects_to_delete,
|
||||
|
||||
@@ -86,6 +86,8 @@ enum Command {
|
||||
/// For safekeeper node_kind only, json list of timelines and their lsn info
|
||||
#[arg(long, default_value = None)]
|
||||
timeline_lsns: Option<String>,
|
||||
#[arg(long, default_value_t = false)]
|
||||
verbose: bool,
|
||||
},
|
||||
TenantSnapshot {
|
||||
#[arg(long = "tenant-id")]
|
||||
@@ -166,6 +168,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
dump_db_connstr,
|
||||
dump_db_table,
|
||||
timeline_lsns,
|
||||
verbose,
|
||||
} => {
|
||||
if let NodeKind::Safekeeper = node_kind {
|
||||
let db_or_list = match (timeline_lsns, dump_db_connstr) {
|
||||
@@ -203,6 +206,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
tenant_ids,
|
||||
json,
|
||||
post_to_storcon,
|
||||
verbose,
|
||||
cli.exit_code,
|
||||
)
|
||||
.await
|
||||
@@ -313,6 +317,7 @@ pub async fn run_cron_job(
|
||||
Vec::new(),
|
||||
true,
|
||||
post_to_storcon,
|
||||
false, // default to non-verbose mode
|
||||
exit_code,
|
||||
)
|
||||
.await?;
|
||||
@@ -362,12 +367,13 @@ pub async fn scan_pageserver_metadata_cmd(
|
||||
tenant_shard_ids: Vec<TenantShardId>,
|
||||
json: bool,
|
||||
post_to_storcon: bool,
|
||||
verbose: bool,
|
||||
exit_code: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
if controller_client.is_none() && post_to_storcon {
|
||||
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
|
||||
}
|
||||
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids).await {
|
||||
match scan_pageserver_metadata(bucket_config.clone(), tenant_shard_ids, verbose).await {
|
||||
Err(e) => {
|
||||
tracing::error!("Failed: {e}");
|
||||
Err(e)
|
||||
|
||||
@@ -21,8 +21,12 @@ pub struct MetadataSummary {
|
||||
tenant_count: usize,
|
||||
timeline_count: usize,
|
||||
timeline_shard_count: usize,
|
||||
with_errors: HashSet<TenantShardTimelineId>,
|
||||
with_warnings: HashSet<TenantShardTimelineId>,
|
||||
/// Tenant-shard timeline (key) mapping to errors. The key has to be a string because it will be serialized to a JSON.
|
||||
/// The key is generated using `TenantShardTimelineId::to_string()`.
|
||||
with_errors: HashMap<String, Vec<String>>,
|
||||
/// Tenant-shard timeline (key) mapping to warnings. The key has to be a string because it will be serialized to a JSON.
|
||||
/// The key is generated using `TenantShardTimelineId::to_string()`.
|
||||
with_warnings: HashMap<String, Vec<String>>,
|
||||
with_orphans: HashSet<TenantShardTimelineId>,
|
||||
indices_by_version: HashMap<usize, usize>,
|
||||
|
||||
@@ -52,7 +56,12 @@ impl MetadataSummary {
|
||||
}
|
||||
}
|
||||
|
||||
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
|
||||
fn update_analysis(
|
||||
&mut self,
|
||||
id: &TenantShardTimelineId,
|
||||
analysis: &TimelineAnalysis,
|
||||
verbose: bool,
|
||||
) {
|
||||
if analysis.is_healthy() {
|
||||
self.healthy_tenant_shards.insert(id.tenant_shard_id);
|
||||
} else {
|
||||
@@ -61,11 +70,17 @@ impl MetadataSummary {
|
||||
}
|
||||
|
||||
if !analysis.errors.is_empty() {
|
||||
self.with_errors.insert(*id);
|
||||
let entry = self.with_errors.entry(id.to_string()).or_default();
|
||||
if verbose {
|
||||
entry.extend(analysis.errors.iter().cloned());
|
||||
}
|
||||
}
|
||||
|
||||
if !analysis.warnings.is_empty() {
|
||||
self.with_warnings.insert(*id);
|
||||
let entry = self.with_warnings.entry(id.to_string()).or_default();
|
||||
if verbose {
|
||||
entry.extend(analysis.warnings.iter().cloned());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,6 +135,7 @@ Index versions: {version_summary}
|
||||
pub async fn scan_pageserver_metadata(
|
||||
bucket_config: BucketConfig,
|
||||
tenant_ids: Vec<TenantShardId>,
|
||||
verbose: bool,
|
||||
) -> anyhow::Result<MetadataSummary> {
|
||||
let (remote_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?;
|
||||
|
||||
@@ -164,6 +180,7 @@ pub async fn scan_pageserver_metadata(
|
||||
mut tenant_objects: TenantObjectListing,
|
||||
timelines: Vec<(TenantShardTimelineId, RemoteTimelineBlobData)>,
|
||||
highest_shard_count: ShardCount,
|
||||
verbose: bool,
|
||||
) {
|
||||
summary.tenant_count += 1;
|
||||
|
||||
@@ -203,7 +220,7 @@ pub async fn scan_pageserver_metadata(
|
||||
Some(data),
|
||||
)
|
||||
.await;
|
||||
summary.update_analysis(&ttid, &analysis);
|
||||
summary.update_analysis(&ttid, &analysis, verbose);
|
||||
|
||||
timeline_ids.insert(ttid.timeline_id);
|
||||
} else {
|
||||
@@ -271,10 +288,6 @@ pub async fn scan_pageserver_metadata(
|
||||
summary.update_data(&data);
|
||||
|
||||
match tenant_id {
|
||||
None => {
|
||||
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
|
||||
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
|
||||
}
|
||||
Some(prev_tenant_id) => {
|
||||
if prev_tenant_id != ttid.tenant_shard_id.tenant_id {
|
||||
// New tenant: analyze this tenant's timelines, clear accumulated tenant_timeline_results
|
||||
@@ -287,6 +300,7 @@ pub async fn scan_pageserver_metadata(
|
||||
tenant_objects,
|
||||
timelines,
|
||||
highest_shard_count,
|
||||
verbose,
|
||||
)
|
||||
.instrument(info_span!("analyze-tenant", tenant = %prev_tenant_id))
|
||||
.await;
|
||||
@@ -296,6 +310,10 @@ pub async fn scan_pageserver_metadata(
|
||||
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
tenant_id = Some(ttid.tenant_shard_id.tenant_id);
|
||||
highest_shard_count = highest_shard_count.max(ttid.tenant_shard_id.shard_count);
|
||||
}
|
||||
}
|
||||
|
||||
match &data.blob_data {
|
||||
@@ -326,6 +344,7 @@ pub async fn scan_pageserver_metadata(
|
||||
tenant_objects,
|
||||
tenant_timeline_results,
|
||||
highest_shard_count,
|
||||
verbose,
|
||||
)
|
||||
.instrument(info_span!("analyze-tenant", tenant = %tenant_id))
|
||||
.await;
|
||||
|
||||
21
test_runner/cloud_regress/README.md
Normal file
21
test_runner/cloud_regress/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# How to run the `pg_regress` tests on a cloud Neon instance.
|
||||
|
||||
* Create a Neon project on staging.
|
||||
* Grant the superuser privileges to the DB user.
|
||||
* (Optional) create a branch for testing
|
||||
* Configure the endpoint by updating the control-plane database with the following settings:
|
||||
* `Timeone`: `America/Los_Angeles`
|
||||
* `DateStyle`: `Postgres,MDY`
|
||||
* `compute_query_id`: `off`
|
||||
* Checkout the actual `Neon` sources
|
||||
* Patch the sql and expected files for the specific PostgreSQL version, e.g. for v17:
|
||||
```bash
|
||||
$ cd vendor/postgres-v17
|
||||
$ patch -p1 <../../compute/patches/cloud_regress_pg17.patch
|
||||
```
|
||||
* Set the environment variable `BENCHMARK_CONNSTR` to the connection URI of your project.
|
||||
* Set the environment variable `PG_VERSION` to the version of your project.
|
||||
* Run
|
||||
```bash
|
||||
$ pytest -m remote_cluster -k cloud_regress
|
||||
```
|
||||
@@ -5,68 +5,15 @@ Run the regression tests on the cloud instance of Neon
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import psycopg2
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import RemotePostgres
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def setup(remote_pg: RemotePostgres):
|
||||
"""
|
||||
Setup and teardown of the tests
|
||||
"""
|
||||
with psycopg2.connect(remote_pg.connstr()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
log.info("Creating the extension")
|
||||
cur.execute("CREATE EXTENSION IF NOT EXISTS regress_so")
|
||||
conn.commit()
|
||||
# TODO: Migrate to branches and remove this code
|
||||
log.info("Looking for subscriptions in the regress database")
|
||||
cur.execute(
|
||||
"SELECT subname FROM pg_catalog.pg_subscription WHERE "
|
||||
"subdbid = (SELECT oid FROM pg_catalog.pg_database WHERE datname='regression');"
|
||||
)
|
||||
if cur.rowcount > 0:
|
||||
with psycopg2.connect(
|
||||
dbname="regression",
|
||||
host=remote_pg.default_options["host"],
|
||||
user=remote_pg.default_options["user"],
|
||||
password=remote_pg.default_options["password"],
|
||||
) as regress_conn:
|
||||
with regress_conn.cursor() as regress_cur:
|
||||
for sub in cur:
|
||||
regress_cur.execute(f"ALTER SUBSCRIPTION {sub[0]} DISABLE")
|
||||
regress_cur.execute(
|
||||
f"ALTER SUBSCRIPTION {sub[0]} SET (slot_name = NONE)"
|
||||
)
|
||||
regress_cur.execute(f"DROP SUBSCRIPTION {sub[0]}")
|
||||
regress_conn.commit()
|
||||
|
||||
yield
|
||||
# TODO: Migrate to branches and remove this code
|
||||
log.info("Looking for extra roles...")
|
||||
with psycopg2.connect(remote_pg.connstr()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
"SELECT rolname FROM pg_catalog.pg_roles WHERE oid > 16384 AND rolname <> 'neondb_owner'"
|
||||
)
|
||||
roles: list[Any] = []
|
||||
for role in cur:
|
||||
log.info("Role found: %s", role[0])
|
||||
roles.append(role[0])
|
||||
for role in roles:
|
||||
cur.execute(f"DROP ROLE {role}")
|
||||
conn.commit()
|
||||
|
||||
|
||||
@pytest.mark.timeout(7200)
|
||||
@pytest.mark.remote_cluster
|
||||
def test_cloud_regress(
|
||||
setup,
|
||||
remote_pg: RemotePostgres,
|
||||
pg_version: PgVersion,
|
||||
pg_distrib_dir: Path,
|
||||
|
||||
@@ -175,6 +175,8 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
|
||||
counter("pageserver_tenant_throttling_count_accounted_finish"),
|
||||
counter("pageserver_tenant_throttling_wait_usecs_sum"),
|
||||
counter("pageserver_tenant_throttling_count"),
|
||||
counter("pageserver_timeline_wal_records_received"),
|
||||
counter("pageserver_page_service_pagestream_flush_in_progress_micros"),
|
||||
*histogram("pageserver_page_service_batch_size"),
|
||||
*PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
|
||||
# "pageserver_directory_entries_count", -- only used if above a certain threshold
|
||||
|
||||
@@ -4556,6 +4556,7 @@ class StorageScrubber:
|
||||
def __init__(self, env: NeonEnv, log_dir: Path):
|
||||
self.env = env
|
||||
self.log_dir = log_dir
|
||||
self.allowed_errors: list[str] = []
|
||||
|
||||
def scrubber_cli(
|
||||
self, args: list[str], timeout, extra_env: dict[str, str] | None = None
|
||||
@@ -4633,19 +4634,70 @@ class StorageScrubber:
|
||||
if timeline_lsns is not None:
|
||||
args.append("--timeline-lsns")
|
||||
args.append(json.dumps(timeline_lsns))
|
||||
if node_kind == NodeKind.PAGESERVER:
|
||||
args.append("--verbose")
|
||||
stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env)
|
||||
|
||||
try:
|
||||
summary = json.loads(stdout)
|
||||
# summary does not contain "with_warnings" if node_kind is the safekeeper
|
||||
no_warnings = "with_warnings" not in summary or not summary["with_warnings"]
|
||||
healthy = not summary["with_errors"] and no_warnings
|
||||
healthy = self._check_run_healthy(summary)
|
||||
return healthy, summary
|
||||
except:
|
||||
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
|
||||
log.error(stdout)
|
||||
raise
|
||||
|
||||
def _check_line_allowed(self, line: str) -> bool:
|
||||
for a in self.allowed_errors:
|
||||
try:
|
||||
if re.match(a, line):
|
||||
return True
|
||||
except re.error:
|
||||
log.error(f"Invalid regex: '{a}'")
|
||||
raise
|
||||
return False
|
||||
|
||||
def _check_line_list_allowed(self, lines: list[str]) -> bool:
|
||||
for line in lines:
|
||||
if not self._check_line_allowed(line):
|
||||
return False
|
||||
return True
|
||||
|
||||
def _check_run_healthy(self, summary: dict[str, Any]) -> bool:
|
||||
# summary does not contain "with_warnings" if node_kind is the safekeeper
|
||||
healthy = True
|
||||
with_warnings = summary.get("with_warnings", None)
|
||||
if with_warnings is not None:
|
||||
if isinstance(with_warnings, list):
|
||||
if len(with_warnings) > 0:
|
||||
# safekeeper scan_metadata output is a list of tenants
|
||||
healthy = False
|
||||
else:
|
||||
for _, warnings in with_warnings.items():
|
||||
assert (
|
||||
len(warnings) > 0
|
||||
), "with_warnings value should not be empty, running without verbose mode?"
|
||||
if not self._check_line_list_allowed(warnings):
|
||||
healthy = False
|
||||
break
|
||||
if not healthy:
|
||||
return healthy
|
||||
with_errors = summary.get("with_errors", None)
|
||||
if with_errors is not None:
|
||||
if isinstance(with_errors, list):
|
||||
if len(with_errors) > 0:
|
||||
# safekeeper scan_metadata output is a list of tenants
|
||||
healthy = False
|
||||
else:
|
||||
for _, errors in with_errors.items():
|
||||
assert (
|
||||
len(errors) > 0
|
||||
), "with_errors value should not be empty, running without verbose mode?"
|
||||
if not self._check_line_list_allowed(errors):
|
||||
healthy = False
|
||||
break
|
||||
return healthy
|
||||
|
||||
def tenant_snapshot(self, tenant_id: TenantId, output_path: Path):
|
||||
stdout = self.scrubber_cli(
|
||||
["tenant-snapshot", "--tenant-id", str(tenant_id), "--output-path", str(output_path)],
|
||||
|
||||
@@ -54,23 +54,15 @@ def wait_for_upload(
|
||||
tenant: TenantId | TenantShardId,
|
||||
timeline: TimelineId,
|
||||
lsn: Lsn,
|
||||
timeout=20,
|
||||
):
|
||||
"""waits for local timeline upload up to specified lsn"""
|
||||
"""Waits for local timeline upload up to specified LSN"""
|
||||
|
||||
current_lsn = Lsn(0)
|
||||
for i in range(20):
|
||||
current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
|
||||
if current_lsn >= lsn:
|
||||
log.info("wait finished")
|
||||
return
|
||||
lr_lsn = last_record_lsn(pageserver_http, tenant, timeline)
|
||||
log.info(
|
||||
f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}"
|
||||
)
|
||||
time.sleep(1)
|
||||
raise Exception(
|
||||
f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}"
|
||||
)
|
||||
def is_uploaded():
|
||||
remote_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
|
||||
assert remote_lsn >= lsn, f"remote_consistent_lsn at {remote_lsn}"
|
||||
|
||||
wait_until(is_uploaded, name=f"upload to {lsn}", timeout=timeout)
|
||||
|
||||
|
||||
def _tenant_in_expected_state(tenant_info: dict[str, Any], expected_state: str):
|
||||
|
||||
142
test_runner/performance/test_ingest_insert_bulk.py
Normal file
142
test_runner/performance/test_ingest_insert_bulk.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import pytest
|
||||
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.utils import (
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
wait_for_upload_queue_empty,
|
||||
)
|
||||
from fixtures.remote_storage import s3_storage
|
||||
|
||||
|
||||
@pytest.mark.timeout(900)
|
||||
@pytest.mark.parametrize("size", [8, 1024, 8192])
|
||||
@pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"])
|
||||
@pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"])
|
||||
@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"])
|
||||
def test_ingest_insert_bulk(
|
||||
request: pytest.FixtureRequest,
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
zenbenchmark: NeonBenchmarker,
|
||||
fsync: bool,
|
||||
backpressure: bool,
|
||||
s3: bool,
|
||||
size: int,
|
||||
):
|
||||
"""
|
||||
Benchmarks ingestion of 5 GB of sequential insert WAL. Measures ingestion and S3 upload
|
||||
separately. Also does a Safekeeper→Pageserver re-ingestion to measure Pageserver ingestion in
|
||||
isolation.
|
||||
"""
|
||||
|
||||
CONCURRENCY = 1 # 1 is optimal without fsync or backpressure
|
||||
VOLUME = 5 * 1024**3
|
||||
rows = VOLUME // (size + 64) # +64 roughly accounts for per-row WAL overhead
|
||||
|
||||
neon_env_builder.safekeepers_enable_fsync = fsync
|
||||
|
||||
if s3:
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
# NB: don't use S3 for Safekeeper. It doesn't affect throughput (no backpressure), but it
|
||||
# would compete with Pageserver for bandwidth.
|
||||
# neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
|
||||
|
||||
neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main",
|
||||
config_lines=[
|
||||
f"fsync = {fsync}",
|
||||
"max_replication_apply_lag = 0",
|
||||
f"max_replication_flush_lag = {'10GB' if backpressure else '0'}",
|
||||
# NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB.
|
||||
f"max_replication_write_lag = {'500MB' if backpressure else '0'}",
|
||||
],
|
||||
)
|
||||
endpoint.safe_psql("create extension neon")
|
||||
|
||||
# Wait for the timeline to be propagated to the pageserver.
|
||||
wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline)
|
||||
|
||||
# Ingest rows.
|
||||
log.info("Ingesting data")
|
||||
start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
|
||||
|
||||
def insert_rows(endpoint, table, count, value):
|
||||
with endpoint.connect().cursor() as cur:
|
||||
cur.execute("set statement_timeout = 0")
|
||||
cur.execute(f"create table {table} (id int, data bytea)")
|
||||
cur.execute(f"insert into {table} values (generate_series(1, {count}), %s)", (value,))
|
||||
|
||||
with zenbenchmark.record_duration("upload"):
|
||||
with zenbenchmark.record_duration("ingest"):
|
||||
with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool:
|
||||
for i in range(CONCURRENCY):
|
||||
# Write a random value for all rows. This is sufficient to prevent compression,
|
||||
# e.g. in TOAST. Randomly generating every row is too slow.
|
||||
value = random.randbytes(size)
|
||||
worker_rows = rows / CONCURRENCY
|
||||
pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value)
|
||||
|
||||
end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0])
|
||||
|
||||
# Wait for pageserver to ingest the WAL.
|
||||
client = env.pageserver.http_client()
|
||||
wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
|
||||
|
||||
# Wait for pageserver S3 upload. Checkpoint to flush the last in-memory layer.
|
||||
client.timeline_checkpoint(
|
||||
env.initial_tenant,
|
||||
env.initial_timeline,
|
||||
compact=False,
|
||||
wait_until_flushed=False,
|
||||
)
|
||||
wait_for_upload(client, env.initial_tenant, env.initial_timeline, end_lsn, timeout=600)
|
||||
|
||||
# Empty out upload queue for next benchmark.
|
||||
wait_for_upload_queue_empty(client, env.initial_tenant, env.initial_timeline)
|
||||
|
||||
backpressure_time = endpoint.safe_psql("select backpressure_throttling_time()")[0][0]
|
||||
|
||||
# Now that all data is ingested, delete and recreate the tenant in the pageserver. This will
|
||||
# reingest all the WAL directly from the safekeeper. This gives us a baseline of how fast the
|
||||
# pageserver can ingest this WAL in isolation.
|
||||
status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant)
|
||||
assert status is not None
|
||||
|
||||
endpoint.stop() # avoid spurious getpage errors
|
||||
client.tenant_delete(env.initial_tenant)
|
||||
env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0])
|
||||
|
||||
with zenbenchmark.record_duration("recover"):
|
||||
log.info("Recovering WAL into pageserver")
|
||||
client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline)
|
||||
wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn)
|
||||
|
||||
# Emit metrics.
|
||||
wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024))
|
||||
zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
|
||||
zenbenchmark.record("row_count", rows, "rows", MetricReport.TEST_PARAM)
|
||||
zenbenchmark.record("concurrency", CONCURRENCY, "clients", MetricReport.TEST_PARAM)
|
||||
zenbenchmark.record(
|
||||
"backpressure_time", backpressure_time // 1000, "ms", MetricReport.LOWER_IS_BETTER
|
||||
)
|
||||
|
||||
props = {p["name"]: p["value"] for _, p in request.node.user_properties}
|
||||
for name in ("ingest", "upload", "recover"):
|
||||
throughput = int(wal_written_mb / props[name])
|
||||
zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER)
|
||||
|
||||
# Pageserver shutdown will likely get stuck on the upload queue, just shut it down immediately.
|
||||
env.stop(immediate=True)
|
||||
@@ -153,19 +153,20 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
if i % 10 == 0:
|
||||
log.info(f"Running churn round {i}/{churn_rounds} ...")
|
||||
|
||||
ps_http.timeline_compact(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
enhanced_gc_bottom_most_compaction=True,
|
||||
body={
|
||||
"scheduled": True,
|
||||
"compact_range": {
|
||||
"start": "000000000000000000000000000000000000",
|
||||
# skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this
|
||||
"end": "010000000000000000000000000000000000",
|
||||
# Run gc-compaction every 10 rounds to ensure the test doesn't take too long time.
|
||||
ps_http.timeline_compact(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
enhanced_gc_bottom_most_compaction=True,
|
||||
body={
|
||||
"scheduled": True,
|
||||
"sub_compaction": True,
|
||||
"compact_range": {
|
||||
"start": "000000000000000000000000000000000000",
|
||||
"end": "030000000000000000000000000000000000",
|
||||
},
|
||||
},
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
workload.churn_rows(row_count, env.pageserver.id)
|
||||
|
||||
@@ -177,6 +178,10 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
log.info("Validating at workload end ...")
|
||||
workload.validate(env.pageserver.id)
|
||||
|
||||
# Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction.
|
||||
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)
|
||||
ps_http.timeline_gc(tenant_id, timeline_id, None)
|
||||
|
||||
|
||||
# Stripe sizes in number of pages.
|
||||
TINY_STRIPES = 16
|
||||
|
||||
@@ -215,7 +215,7 @@ if SQL_EXPORTER is None:
|
||||
#
|
||||
# The "host" network mode allows sql_exporter to talk to the
|
||||
# endpoint which is running on the host.
|
||||
super().__init__("docker.io/burningalchemist/sql_exporter:0.13.1", network_mode="host")
|
||||
super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host")
|
||||
|
||||
self.__logs_dir = logs_dir
|
||||
self.__port = port
|
||||
|
||||
124
test_runner/regress/test_nbtree_pagesplit_cycleid.py
Normal file
124
test_runner/regress/test_nbtree_pagesplit_cycleid.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import threading
|
||||
import time
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
BTREE_NUM_CYCLEID_PAGES = """
|
||||
WITH raw_pages AS (
|
||||
SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page
|
||||
FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno
|
||||
),
|
||||
parsed_pages AS (
|
||||
/* cycle ID is the last 2 bytes of the btree page */
|
||||
SELECT blkno, SUBSTRING(page FROM 8191 FOR 2) as cycle_id
|
||||
FROM raw_pages
|
||||
)
|
||||
SELECT count(*),
|
||||
encode(cycle_id, 'hex')
|
||||
FROM parsed_pages
|
||||
WHERE encode(cycle_id, 'hex') != '0000'
|
||||
GROUP BY encode(cycle_id, 'hex');
|
||||
"""
|
||||
|
||||
|
||||
def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
ses1 = endpoint.connect().cursor()
|
||||
ses1.execute("ALTER SYSTEM SET autovacuum = off;")
|
||||
ses1.execute("ALTER SYSTEM SET enable_seqscan = off;")
|
||||
ses1.execute("ALTER SYSTEM SET full_page_writes = off;")
|
||||
ses1.execute("SELECT pg_reload_conf();")
|
||||
ses1.execute("CREATE EXTENSION neon_test_utils;")
|
||||
# prepare a large index
|
||||
ses1.execute("CREATE TABLE t(id integer GENERATED ALWAYS AS IDENTITY, txt text);")
|
||||
ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);")
|
||||
ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;")
|
||||
|
||||
ses1.execute("SELECT neon_xlogflush();")
|
||||
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
|
||||
pages = ses1.fetchall()
|
||||
assert (
|
||||
len(pages) == 0
|
||||
), f"0 back splits with cycle ID expected, real {len(pages)} first {pages[0]}"
|
||||
# Delete enough tuples to clear the first index page.
|
||||
# (there are up to 407 rows per 8KiB page; 406 for non-rightmost leafs.
|
||||
ses1.execute("DELETE FROM t WHERE id <= 406;")
|
||||
# Make sure the page is cleaned up
|
||||
ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
|
||||
|
||||
# Do another delete-then-indexcleanup cycle, to move the pages from
|
||||
# "dead" to "reusable"
|
||||
ses1.execute("DELETE FROM t WHERE id <= 446;")
|
||||
ses1.execute("VACUUM (FREEZE, INDEX_CLEANUP ON) t;")
|
||||
|
||||
# Make sure the vacuum we're about to trigger in s3 has cleanup work to do
|
||||
ses1.execute("DELETE FROM t WHERE id <= 610;")
|
||||
|
||||
# Flush wal, for checking purposes
|
||||
ses1.execute("SELECT neon_xlogflush();")
|
||||
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
|
||||
pages = ses1.fetchall()
|
||||
assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead"
|
||||
|
||||
ses2 = endpoint.connect().cursor()
|
||||
ses3 = endpoint.connect().cursor()
|
||||
|
||||
# Session 2 pins a btree page, which prevents vacuum from processing that
|
||||
# page, thus allowing us to reliably split pages while a concurrent vacuum
|
||||
# is running.
|
||||
ses2.execute("BEGIN;")
|
||||
ses2.execute(
|
||||
"DECLARE foo NO SCROLL CURSOR FOR SELECT row_number() over () FROM t ORDER BY id ASC"
|
||||
)
|
||||
ses2.execute("FETCH FROM foo;") # pins the leaf page with id 611
|
||||
wait_evt = threading.Event()
|
||||
|
||||
# Session 3 runs the VACUUM command. Note that this will block, and
|
||||
# therefore must run on another thread.
|
||||
# We rely on this running quickly enough to hit the pinned page from
|
||||
# session 2 by the time we start other work again in session 1, but
|
||||
# technically there is a race where the thread (and/or PostgreSQL process)
|
||||
# don't get to that pinned page with vacuum until >2s after evt.set() was
|
||||
# called, and session 1 thus might already have split pages.
|
||||
def vacuum_freeze_t(ses3, evt: threading.Event):
|
||||
# Begin parallel vacuum that should hit the index
|
||||
evt.set()
|
||||
# this'll hang until s2 fetches enough new data from its cursor.
|
||||
# this is technically a race with the time.sleep(2) below, but if this
|
||||
# command doesn't hit
|
||||
ses3.execute("VACUUM (FREEZE, INDEX_CLEANUP on, DISABLE_PAGE_SKIPPING on) t;")
|
||||
|
||||
ses3t = threading.Thread(target=vacuum_freeze_t, args=(ses3, wait_evt))
|
||||
ses3t.start()
|
||||
wait_evt.wait()
|
||||
# Make extra sure we got the thread started and vacuum is stuck, by waiting
|
||||
# some time even after wait_evt got set. This isn't truly reliable (it is
|
||||
# possible
|
||||
time.sleep(2)
|
||||
|
||||
# Insert 2 pages worth of new data.
|
||||
# This should reuse the one empty page, plus another page at the end of
|
||||
# the index relation; with split ordering
|
||||
# old_blk -> blkno=1 -> old_blk + 1.
|
||||
# As this is run while vacuum in session 3 is happening, these splits
|
||||
# should receive cycle IDs where applicable.
|
||||
ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 812) i;")
|
||||
# unpin the btree page, allowing s3's vacuum to complete
|
||||
ses2.execute("FETCH ALL FROM foo;")
|
||||
ses2.execute("ROLLBACK;")
|
||||
# flush WAL to make sure PS is up-to-date
|
||||
ses1.execute("SELECT neon_xlogflush();")
|
||||
# check that our expectations are correct
|
||||
ses1.execute(BTREE_NUM_CYCLEID_PAGES)
|
||||
pages = ses1.fetchall()
|
||||
assert (
|
||||
len(pages) == 1 and pages[0][0] == 3
|
||||
), f"3 page splits with cycle ID expected; actual {pages}"
|
||||
|
||||
# final cleanup
|
||||
ses3t.join()
|
||||
ses1.close()
|
||||
ses2.close()
|
||||
ses3.close()
|
||||
@@ -3230,3 +3230,55 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr
|
||||
# Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
|
||||
env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
|
||||
raise
|
||||
|
||||
|
||||
@run_only_on_default_postgres("Postgres version makes no difference here")
|
||||
def test_storage_controller_detached_stopped(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
Test that detaching a tenant while it has scheduling policy set to Paused or Stop works
|
||||
"""
|
||||
|
||||
remote_storage_kind = s3_storage()
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
neon_env_builder.num_pageservers = 1
|
||||
|
||||
env = neon_env_builder.init_configs()
|
||||
env.start()
|
||||
virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True)
|
||||
|
||||
tenant_id = TenantId.generate()
|
||||
env.storage_controller.tenant_create(
|
||||
tenant_id,
|
||||
shard_count=1,
|
||||
)
|
||||
|
||||
assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1
|
||||
|
||||
# Disable scheduling: ordinarily this would prevent the tenant's configuration being
|
||||
# reconciled to pageservers, but this should be overridden when detaching.
|
||||
env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*")
|
||||
env.storage_controller.tenant_policy_update(
|
||||
tenant_id,
|
||||
{"scheduling": "Stop"},
|
||||
)
|
||||
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
# Detach the tenant
|
||||
virtual_ps_http.tenant_location_conf(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Detached",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": None,
|
||||
},
|
||||
)
|
||||
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
# Confirm the detach happened
|
||||
assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == []
|
||||
|
||||
@@ -572,4 +572,10 @@ def test_scrubber_scan_pageserver_metadata(
|
||||
unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
|
||||
assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)
|
||||
|
||||
neon_env_builder.disable_scrub_on_exit()
|
||||
healthy, _ = env.storage_scrubber.scan_metadata()
|
||||
assert not healthy
|
||||
env.storage_scrubber.allowed_errors.append(".*not present in remote storage.*")
|
||||
healthy, _ = env.storage_scrubber.scan_metadata()
|
||||
assert healthy
|
||||
|
||||
neon_env_builder.disable_scrub_on_exit() # We already ran scrubber, no need to do an extra run
|
||||
|
||||
@@ -4,7 +4,7 @@ import time
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn
|
||||
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin, fork_at_current_lsn
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
@@ -292,3 +292,76 @@ def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder):
|
||||
tup = cur.fetchall()
|
||||
log.info(f"tuple = {tup}")
|
||||
cur.execute("commit transaction")
|
||||
|
||||
|
||||
def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
"""
|
||||
Runs pgbench across a few databases on a sharded tenant, then performs a visibility map
|
||||
consistency check. Regression test for https://github.com/neondatabase/neon/issues/9914.
|
||||
"""
|
||||
|
||||
# Use a large number of shards with small stripe sizes, to ensure the visibility
|
||||
# map will end up on non-zero shards.
|
||||
SHARD_COUNT = 8
|
||||
STRIPE_SIZE = 32 # in 8KB pages
|
||||
PGBENCH_RUNS = 4
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_shard_count=SHARD_COUNT, initial_tenant_shard_stripe_size=STRIPE_SIZE
|
||||
)
|
||||
endpoint = env.endpoints.create_start(
|
||||
"main",
|
||||
config_lines=[
|
||||
"shared_buffers = 64MB",
|
||||
],
|
||||
)
|
||||
|
||||
# Run pgbench in 4 different databases, to exercise different shards.
|
||||
dbnames = [f"pgbench{i}" for i in range(PGBENCH_RUNS)]
|
||||
for i, dbname in enumerate(dbnames):
|
||||
log.info(f"pgbench run {i+1}/{PGBENCH_RUNS}")
|
||||
endpoint.safe_psql(f"create database {dbname}")
|
||||
connstr = endpoint.connstr(dbname=dbname)
|
||||
# pgbench -i will automatically vacuum the tables. This creates the visibility map.
|
||||
pg_bin.run(["pgbench", "-i", "-s", "10", connstr])
|
||||
# Freeze the tuples to set the initial frozen bit.
|
||||
endpoint.safe_psql("vacuum freeze", dbname=dbname)
|
||||
# Run pgbench.
|
||||
pg_bin.run(["pgbench", "-c", "32", "-j", "8", "-T", "10", connstr])
|
||||
|
||||
# Restart the endpoint to flush the compute page cache. We want to make sure we read VM pages
|
||||
# from storage, not cache.
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
|
||||
# Check that the visibility map matches the heap contents for pg_accounts (the main table).
|
||||
for dbname in dbnames:
|
||||
log.info(f"Checking visibility map for {dbname}")
|
||||
with endpoint.cursor(dbname=dbname) as cur:
|
||||
cur.execute("create extension pg_visibility")
|
||||
|
||||
cur.execute("select count(*) from pg_check_visible('pgbench_accounts')")
|
||||
row = cur.fetchone()
|
||||
assert row is not None
|
||||
assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)"
|
||||
|
||||
cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')")
|
||||
row = cur.fetchone()
|
||||
assert row is not None
|
||||
assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)"
|
||||
|
||||
# Vacuum and freeze the tables, and check that the visibility map is still accurate.
|
||||
for dbname in dbnames:
|
||||
log.info(f"Vacuuming and checking visibility map for {dbname}")
|
||||
with endpoint.cursor(dbname=dbname) as cur:
|
||||
cur.execute("vacuum freeze")
|
||||
|
||||
cur.execute("select count(*) from pg_check_visible('pgbench_accounts')")
|
||||
row = cur.fetchone()
|
||||
assert row is not None
|
||||
assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)"
|
||||
|
||||
cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')")
|
||||
row = cur.fetchone()
|
||||
assert row is not None
|
||||
assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)"
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 373f9decad...13ff324150
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 972e325e62...8736b10c1d
2
vendor/postgres-v16
vendored
2
vendor/postgres-v16
vendored
Submodule vendor/postgres-v16 updated: dff6615a8e...81428621f7
2
vendor/postgres-v17
vendored
2
vendor/postgres-v17
vendored
Submodule vendor/postgres-v17 updated: a10d95be67...01fa3c4866
8
vendor/revisions.json
vendored
8
vendor/revisions.json
vendored
@@ -1,18 +1,18 @@
|
||||
{
|
||||
"v17": [
|
||||
"17.2",
|
||||
"a10d95be67265e0f10a422ba0457f5a7af01de71"
|
||||
"01fa3c48664ca030cfb69bb4a350aa9df4691d88"
|
||||
],
|
||||
"v16": [
|
||||
"16.6",
|
||||
"dff6615a8e48a10bb17a03fa3c00635f1ace7a92"
|
||||
"81428621f7c04aed03671cf80a928e0a36d92505"
|
||||
],
|
||||
"v15": [
|
||||
"15.10",
|
||||
"972e325e62b455957adbbdd8580e31275bb5b8c9"
|
||||
"8736b10c1d93d11b9c0489872dd529c4c0f5338f"
|
||||
],
|
||||
"v14": [
|
||||
"14.15",
|
||||
"373f9decad933d2d46f321231032ae8b0da81acd"
|
||||
"13ff324150fceaac72920e01742addc053db9462"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt",
|
||||
digest = { version = "0.10", features = ["mac", "oid", "std"] }
|
||||
either = { version = "1" }
|
||||
fail = { version = "0.5", default-features = false, features = ["failpoints"] }
|
||||
form_urlencoded = { version = "1" }
|
||||
futures-channel = { version = "0.3", features = ["sink"] }
|
||||
futures-executor = { version = "0.3" }
|
||||
futures-io = { version = "0.3" }
|
||||
@@ -78,6 +79,7 @@ sha2 = { version = "0.10", features = ["asm", "oid"] }
|
||||
signature = { version = "2", default-features = false, features = ["digest", "rand_core", "std"] }
|
||||
smallvec = { version = "1", default-features = false, features = ["const_new", "write"] }
|
||||
spki = { version = "0.7", default-features = false, features = ["pem", "std"] }
|
||||
stable_deref_trait = { version = "1" }
|
||||
subtle = { version = "2" }
|
||||
sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] }
|
||||
tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] }
|
||||
@@ -105,6 +107,7 @@ anyhow = { version = "1", features = ["backtrace"] }
|
||||
bytes = { version = "1", features = ["serde"] }
|
||||
cc = { version = "1", default-features = false, features = ["parallel"] }
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
|
||||
displaydoc = { version = "0.2" }
|
||||
either = { version = "1" }
|
||||
getrandom = { version = "0.2", default-features = false, features = ["std"] }
|
||||
half = { version = "2", default-features = false, features = ["num-traits"] }
|
||||
|
||||
Reference in New Issue
Block a user