mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-17 21:20:37 +00:00
Compare commits
28 Commits
sk-basic-b
...
problame/2
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
91cec9ba48 | ||
|
|
d9f89f828d | ||
|
|
74df4a7b76 | ||
|
|
799db161d3 | ||
|
|
47380be12d | ||
|
|
c7b02ce8ec | ||
|
|
4010adf653 | ||
|
|
e10a7ee391 | ||
|
|
e8c9a51273 | ||
|
|
3c3ee8f3e8 | ||
|
|
6928a34f59 | ||
|
|
bc684e9d3b | ||
|
|
08532231ee | ||
|
|
79137a089f | ||
|
|
e3cb715e8a | ||
|
|
c70bf9150f | ||
|
|
8e4da52069 | ||
|
|
2ff1a5cecd | ||
|
|
ec8dcc2231 | ||
|
|
b844c6f0c7 | ||
|
|
6a85a06e1b | ||
|
|
b04a6acd6c | ||
|
|
0c7b89235c | ||
|
|
1e9a50bca8 | ||
|
|
511e730cc0 | ||
|
|
c1148dc9ac | ||
|
|
8253cf1931 | ||
|
|
3a82430432 |
@@ -1,27 +1,28 @@
|
||||
*
|
||||
|
||||
!rust-toolchain.toml
|
||||
!Cargo.toml
|
||||
# Files
|
||||
!Cargo.lock
|
||||
!Cargo.toml
|
||||
!Makefile
|
||||
!rust-toolchain.toml
|
||||
!scripts/combine_control_files.py
|
||||
!scripts/ninstall.sh
|
||||
!vm-cgconfig.conf
|
||||
|
||||
# Directories
|
||||
!.cargo/
|
||||
!.config/
|
||||
!control_plane/
|
||||
!compute_tools/
|
||||
!control_plane/
|
||||
!libs/
|
||||
!neon_local/
|
||||
!pageserver/
|
||||
!patches/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!safekeeper/
|
||||
!s3_scrubber/
|
||||
!safekeeper/
|
||||
!storage_broker/
|
||||
!trace/
|
||||
!vendor/postgres-v14/
|
||||
!vendor/postgres-v15/
|
||||
!vendor/postgres-v16/
|
||||
!vendor/postgres-*/
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
!scripts/ninstall.sh
|
||||
!scripts/combine_control_files.py
|
||||
!vm-cgconfig.conf
|
||||
|
||||
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -508,7 +508,7 @@ jobs:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
|
||||
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: std-fs
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
|
||||
54
Cargo.lock
generated
54
Cargo.lock
generated
@@ -285,7 +285,6 @@ dependencies = [
|
||||
"metrics",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"postgres_backend",
|
||||
"postgres_connection",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2736,6 +2735,12 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.1.4"
|
||||
@@ -2832,6 +2837,9 @@ dependencies = [
|
||||
"libc",
|
||||
"once_cell",
|
||||
"prometheus",
|
||||
"rand 0.8.5",
|
||||
"rand_distr",
|
||||
"twox-hash",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -3057,6 +3065,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4071,6 +4080,8 @@ dependencies = [
|
||||
"sync_wrapper",
|
||||
"task-local-extensions",
|
||||
"thiserror",
|
||||
"tikv-jemalloc-ctl",
|
||||
"tikv-jemallocator",
|
||||
"tls-listener",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
@@ -4171,6 +4182,16 @@ dependencies = [
|
||||
"getrandom 0.2.11",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_distr"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
@@ -5511,6 +5532,37 @@ dependencies = [
|
||||
"ordered-float 2.10.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tikv-jemalloc-ctl"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "619bfed27d807b54f7f776b9430d4f8060e66ee138a28632ca898584d462c31c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"paste",
|
||||
"tikv-jemalloc-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tikv-jemalloc-sys"
|
||||
version = "0.5.4+5.3.0-patched"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9402443cb8fd499b6f327e40565234ff34dbda27460c5b47db0db77443dd85d1"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tikv-jemallocator"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "965fe0c26be5c56c94e38ba547249074803efd52adfb66de62107d95aab3eaca"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"tikv-jemalloc-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.21"
|
||||
|
||||
@@ -149,6 +149,8 @@ tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tikv-jemallocator = "0.5"
|
||||
tikv-jemalloc-ctl = "0.5"
|
||||
tls-listener = { version = "0.7", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
@@ -165,6 +167,7 @@ tracing = "0.1"
|
||||
tracing-error = "0.2.0"
|
||||
tracing-opentelemetry = "0.20.0"
|
||||
tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
|
||||
twox-hash = { version = "1.6.3", default-features = false }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
|
||||
@@ -53,6 +53,7 @@ RUN set -e \
|
||||
--bin pagectl \
|
||||
--bin safekeeper \
|
||||
--bin storage_broker \
|
||||
--bin attachment_service \
|
||||
--bin proxy \
|
||||
--bin neon_local \
|
||||
--locked --release \
|
||||
@@ -80,6 +81,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/attachment_service /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
|
||||
|
||||
@@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
COPY patches/pgvector.patch /pgvector.patch
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /pgvector.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
|
||||
@@ -520,8 +523,7 @@ RUN apt-get update && \
|
||||
libboost-regex1.74-dev \
|
||||
libboost-serialization1.74-dev \
|
||||
libboost-system1.74-dev \
|
||||
libeigen3-dev \
|
||||
libfreetype6-dev
|
||||
libeigen3-dev
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \
|
||||
@@ -547,6 +549,7 @@ RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.
|
||||
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
|
||||
-D RDK_INSTALL_INTREE=OFF \
|
||||
-D RDK_INSTALL_COMIC_FONTS=OFF \
|
||||
-D RDK_BUILD_FREETYPE_SUPPORT=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -901,7 +904,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost*, libfreetype6, and zlib1g for rdkit
|
||||
# libboost* for rdkit
|
||||
# ca-certificates for communicating with s3 by compute_ctl
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
@@ -914,7 +917,6 @@ RUN apt update && \
|
||||
libboost-serialization1.74.0 \
|
||||
libboost-system1.74.0 \
|
||||
libossp-uuid16 \
|
||||
libfreetype6 \
|
||||
libgeos-c1v5 \
|
||||
libgdal28 \
|
||||
libproj19 \
|
||||
@@ -926,7 +928,6 @@ RUN apt update && \
|
||||
libcurl4-openssl-dev \
|
||||
locales \
|
||||
procps \
|
||||
zlib1g \
|
||||
ca-certificates && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
@@ -758,6 +758,14 @@ BEGIN
|
||||
END LOOP;
|
||||
END $$;
|
||||
"#,
|
||||
r#"
|
||||
DO $$
|
||||
BEGIN
|
||||
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
|
||||
EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
|
||||
@@ -21,10 +21,6 @@ tokio.workspace = true
|
||||
tokio-util.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
# TODO: remove this after DB persistence is added, it is only used for
|
||||
# a parsing function when loading pageservers from neon_local LocalEnv
|
||||
postgres_backend.workspace = true
|
||||
|
||||
diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
|
||||
|
||||
utils = { path = "../../libs/utils/" }
|
||||
|
||||
@@ -2,13 +2,17 @@ use crate::reconciler::ReconcileError;
|
||||
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
|
||||
use hyper::{Body, Request, Response};
|
||||
use hyper::{StatusCode, Uri};
|
||||
use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver_api::models::{
|
||||
TenantCreateRequest, TenantLocationConfigRequest, TimelineCreateRequest,
|
||||
};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use utils::auth::SwappableJwtAuth;
|
||||
use utils::http::endpoint::{auth_middleware, request_span};
|
||||
use utils::http::request::parse_request_param;
|
||||
use utils::id::TenantId;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use utils::{
|
||||
http::{
|
||||
@@ -112,6 +116,78 @@ async fn handle_tenant_create(
|
||||
json_response(StatusCode::OK, service.tenant_create(create_req).await?)
|
||||
}
|
||||
|
||||
// For tenant and timeline deletions, which both implement an "initially return 202, then 404 once
|
||||
// we're done" semantic, we wrap with a retry loop to expose a simpler API upstream. This avoids
|
||||
// needing to track a "deleting" state for tenants.
|
||||
async fn deletion_wrapper<R, F>(service: Arc<Service>, f: F) -> Result<Response<Body>, ApiError>
|
||||
where
|
||||
R: std::future::Future<Output = Result<StatusCode, ApiError>> + Send + 'static,
|
||||
F: Fn(Arc<Service>) -> R + Send + Sync + 'static,
|
||||
{
|
||||
let started_at = Instant::now();
|
||||
// To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion
|
||||
// completed.
|
||||
let mut retry_period = Duration::from_secs(1);
|
||||
// On subsequent retries, wait longer.
|
||||
let max_retry_period = Duration::from_secs(5);
|
||||
// Enable callers with a 30 second request timeout to reliably get a response
|
||||
let max_wait = Duration::from_secs(25);
|
||||
|
||||
loop {
|
||||
let status = f(service.clone()).await?;
|
||||
match status {
|
||||
StatusCode::ACCEPTED => {
|
||||
tracing::info!("Deletion accepted, waiting to try again...");
|
||||
tokio::time::sleep(retry_period).await;
|
||||
retry_period = max_retry_period;
|
||||
}
|
||||
StatusCode::NOT_FOUND => {
|
||||
tracing::info!("Deletion complete");
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
_ => {
|
||||
tracing::warn!("Unexpected status {status}");
|
||||
return json_response(status, ());
|
||||
}
|
||||
}
|
||||
|
||||
let now = Instant::now();
|
||||
if now + retry_period > started_at + max_wait {
|
||||
tracing::info!("Deletion timed out waiting for 404");
|
||||
// REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of
|
||||
// the pageserver's swagger definition for this endpoint, and has the same desired
|
||||
// effect of causing the control plane to retry later.
|
||||
return json_response(StatusCode::CONFLICT, ());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_tenant_location_config(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let config_req = json_request::<TenantLocationConfigRequest>(&mut req).await?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
service
|
||||
.tenant_location_config(tenant_id, config_req)
|
||||
.await?,
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service.tenant_delete(tenant_id).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_create(
|
||||
service: Arc<Service>,
|
||||
mut req: Request<Body>,
|
||||
@@ -126,6 +202,63 @@ async fn handle_tenant_timeline_create(
|
||||
)
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_delete(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
|
||||
|
||||
deletion_wrapper(service, move |service| async move {
|
||||
service.tenant_timeline_delete(tenant_id, timeline_id).await
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn handle_tenant_timeline_passthrough(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
|
||||
|
||||
let Some(path) = req.uri().path_and_query() else {
|
||||
// This should never happen, our request router only calls us if there is a path
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
|
||||
};
|
||||
|
||||
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
|
||||
|
||||
// Find the node that holds shard zero
|
||||
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
|
||||
|
||||
// Callers will always pass an unsharded tenant ID. Before proxying, we must
|
||||
// rewrite this to a shard-aware shard zero ID.
|
||||
let path = format!("{}", path);
|
||||
let tenant_str = tenant_id.to_string();
|
||||
let tenant_shard_str = format!("{}", tenant_shard_id);
|
||||
let path = path.replace(&tenant_str, &tenant_shard_str);
|
||||
|
||||
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
|
||||
let resp = client.get_raw(path).await.map_err(|_e|
|
||||
// FIXME: give ApiError a proper Unavailable variant. We return 503 here because
|
||||
// if we can't successfully send a request to the pageserver, we aren't available.
|
||||
ApiError::ShuttingDown)?;
|
||||
|
||||
// We have a reqwest::Response, would like a http::Response
|
||||
let mut builder = hyper::Response::builder()
|
||||
.status(resp.status())
|
||||
.version(resp.version());
|
||||
for (k, v) in resp.headers() {
|
||||
builder = builder.header(k, v);
|
||||
}
|
||||
|
||||
let response = builder
|
||||
.body(Body::wrap_stream(resp.bytes_stream()))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
async fn handle_tenant_locate(
|
||||
service: Arc<Service>,
|
||||
req: Request<Body>,
|
||||
@@ -141,6 +274,11 @@ async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>,
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let state = get_state(&req);
|
||||
json_response(StatusCode::OK, state.service.node_list().await?)
|
||||
}
|
||||
|
||||
async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let node_id: NodeId = parse_request_param(&req, "node_id")?;
|
||||
let config_req = json_request::<NodeConfigureRequest>(&mut req).await?;
|
||||
@@ -226,26 +364,64 @@ pub fn make_router(
|
||||
|
||||
router
|
||||
.data(Arc::new(HttpState::new(service, auth)))
|
||||
// Non-prefixed generic endpoints (status, metrics)
|
||||
.get("/status", |r| request_span(r, handle_status))
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
|
||||
.post("/inspect", |r| request_span(r, handle_inspect))
|
||||
.post("/node", |r| request_span(r, handle_node_register))
|
||||
.put("/node/:node_id/config", |r| {
|
||||
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
|
||||
.post("/upcall/v1/re-attach", |r| {
|
||||
request_span(r, handle_re_attach)
|
||||
})
|
||||
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
|
||||
// Test/dev/debug endpoints
|
||||
.post("/debug/v1/attach-hook", |r| {
|
||||
request_span(r, handle_attach_hook)
|
||||
})
|
||||
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
|
||||
.get("/control/v1/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
})
|
||||
// Node operations
|
||||
.post("/control/v1/node", |r| {
|
||||
request_span(r, handle_node_register)
|
||||
})
|
||||
.get("/control/v1/node", |r| request_span(r, handle_node_list))
|
||||
.put("/control/v1/node/:node_id/config", |r| {
|
||||
request_span(r, handle_node_configure)
|
||||
})
|
||||
// Tenant Shard operations
|
||||
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
// Tenant operations
|
||||
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
|
||||
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
|
||||
.post("/v1/tenant", |r| {
|
||||
tenant_service_handler(r, handle_tenant_create)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_delete)
|
||||
})
|
||||
.put("/v1/tenant/:tenant_id/location_config", |r| {
|
||||
tenant_service_handler(r, handle_tenant_location_config)
|
||||
})
|
||||
// Tenant Shard operations (low level/maintenance)
|
||||
.put("/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
})
|
||||
// Timeline operations
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_delete)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/timeline", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_create)
|
||||
})
|
||||
.get("/tenant/:tenant_id/locate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_locate)
|
||||
// Tenant detail GET passthrough to shard zero
|
||||
.get("/v1/tenant/:tenant_id*", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
.put("/tenant/:tenant_shard_id/migrate", |r| {
|
||||
tenant_service_handler(r, handle_tenant_shard_migrate)
|
||||
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
|
||||
// timeline GET APIs will be implicitly included.
|
||||
.get("/v1/tenant/:tenant_id/timeline*", |r| {
|
||||
tenant_service_handler(r, handle_tenant_timeline_passthrough)
|
||||
})
|
||||
// Path aliases for tests_forward_compatibility
|
||||
// TODO: remove these in future PR
|
||||
|
||||
@@ -9,7 +9,6 @@ use diesel::prelude::*;
|
||||
use diesel::Connection;
|
||||
use pageserver_api::models::TenantConfig;
|
||||
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
|
||||
use postgres_connection::parse_host_port;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{NodeId, TenantId};
|
||||
@@ -129,51 +128,11 @@ impl Persistence {
|
||||
})
|
||||
.await?;
|
||||
|
||||
if nodes.is_empty() {
|
||||
return self.list_nodes_local_env().await;
|
||||
}
|
||||
|
||||
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
|
||||
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
/// Shim for automated compatibility tests: load nodes from LocalEnv instead of database
|
||||
pub(crate) async fn list_nodes_local_env(&self) -> DatabaseResult<Vec<Node>> {
|
||||
// Enable test_backward_compatibility to work by populating our list of
|
||||
// nodes from LocalEnv when it is not present in persistent storage. Otherwise at
|
||||
// first startup in the compat test, we may have shards but no nodes.
|
||||
use control_plane::local_env::LocalEnv;
|
||||
let env = LocalEnv::load_config().map_err(|e| DatabaseError::Logical(format!("{e}")))?;
|
||||
tracing::info!(
|
||||
"Loading {} pageserver nodes from LocalEnv",
|
||||
env.pageservers.len()
|
||||
);
|
||||
let mut nodes = Vec::new();
|
||||
for ps_conf in env.pageservers {
|
||||
let (pg_host, pg_port) =
|
||||
parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
|
||||
.expect("Unable to parse listen_http_addr");
|
||||
let node = Node {
|
||||
id: ps_conf.id,
|
||||
listen_pg_addr: pg_host.to_string(),
|
||||
listen_pg_port: pg_port.unwrap_or(5432),
|
||||
listen_http_addr: http_host.to_string(),
|
||||
listen_http_port: http_port.unwrap_or(80),
|
||||
availability: NodeAvailability::Active,
|
||||
scheduling: NodeSchedulingPolicy::Active,
|
||||
};
|
||||
|
||||
// Synchronize database with what we learn from LocalEnv
|
||||
self.insert_node(&node).await?;
|
||||
|
||||
nodes.push(node);
|
||||
}
|
||||
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
/// At startup, load the high level state for shards, such as their config + policy. This will
|
||||
/// be enriched at runtime with state discovered on pageservers.
|
||||
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
|
||||
|
||||
@@ -21,6 +21,7 @@ use pageserver_api::{
|
||||
models,
|
||||
models::{
|
||||
LocationConfig, LocationConfigMode, ShardParameters, TenantConfig, TenantCreateRequest,
|
||||
TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation,
|
||||
TimelineCreateRequest, TimelineInfo,
|
||||
},
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId},
|
||||
@@ -30,14 +31,14 @@ use utils::{
|
||||
completion::Barrier,
|
||||
generation::Generation,
|
||||
http::error::ApiError,
|
||||
id::{NodeId, TenantId},
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
seqwait::SeqWait,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
compute_hook::ComputeHook,
|
||||
node::Node,
|
||||
persistence::{DatabaseError, Persistence, TenantShardPersistence},
|
||||
persistence::{DatabaseError, NodePersistence, Persistence, TenantShardPersistence},
|
||||
scheduler::Scheduler,
|
||||
tenant_state::{
|
||||
IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError,
|
||||
@@ -635,7 +636,7 @@ impl Service {
|
||||
shard_number: tenant_shard_id.shard_number.0 as i32,
|
||||
shard_count: tenant_shard_id.shard_count.0 as i32,
|
||||
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
|
||||
generation: 0,
|
||||
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
|
||||
generation_pageserver: i64::MAX,
|
||||
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
|
||||
config: serde_json::to_string(&create_req.config).unwrap(),
|
||||
@@ -677,6 +678,7 @@ impl Service {
|
||||
})?;
|
||||
|
||||
response_shards.push(TenantCreateResponseShard {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: entry
|
||||
.get()
|
||||
.intent
|
||||
@@ -709,6 +711,7 @@ impl Service {
|
||||
})?;
|
||||
|
||||
response_shards.push(TenantCreateResponseShard {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: state
|
||||
.intent
|
||||
.attached
|
||||
@@ -742,14 +745,257 @@ impl Service {
|
||||
(waiters, response_shards)
|
||||
};
|
||||
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
|
||||
self.await_waiters(waiters).await?;
|
||||
|
||||
Ok(TenantCreateResponse {
|
||||
shards: response_shards,
|
||||
})
|
||||
}
|
||||
|
||||
/// Helper for functions that reconcile a number of shards, and would like to do a timeout-bounded
|
||||
/// wait for reconciliation to complete before responding.
|
||||
async fn await_waiters(
|
||||
&self,
|
||||
waiters: Vec<ReconcilerWaiter>,
|
||||
) -> Result<(), ReconcileWaitError> {
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(30)).unwrap();
|
||||
for waiter in waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await?;
|
||||
}
|
||||
Ok(TenantCreateResponse {
|
||||
shards: response_shards,
|
||||
})
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
|
||||
/// - Call with mode Attached* to upsert the tenant.
|
||||
/// - Call with mode Detached to switch to PlacementPolicy::Detached
|
||||
///
|
||||
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
|
||||
/// secondary locations.
|
||||
pub(crate) async fn tenant_location_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
req: TenantLocationConfigRequest,
|
||||
) -> Result<TenantLocationConfigResponse, ApiError> {
|
||||
if req.tenant_id.shard_count.0 > 1 {
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"This API is for importing single-sharded or unsharded tenants"
|
||||
)));
|
||||
}
|
||||
|
||||
let mut waiters = Vec::new();
|
||||
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
|
||||
let maybe_create = {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let result_tx = locked.result_tx.clone();
|
||||
let compute_hook = locked.compute_hook.clone();
|
||||
let pageservers = locked.nodes.clone();
|
||||
|
||||
let mut scheduler = Scheduler::new(&locked.tenants, &locked.nodes);
|
||||
|
||||
// Maybe we have existing shards
|
||||
let mut create = true;
|
||||
for (shard_id, shard) in locked
|
||||
.tenants
|
||||
.range_mut(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
// Saw an existing shard: this is not a creation
|
||||
create = false;
|
||||
|
||||
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
|
||||
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
|
||||
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
|
||||
// cloud control plane into this service.
|
||||
|
||||
// Use location config mode as an indicator of policy: if they ask for
|
||||
// attached we go to default HA attached mode. If they ask for secondary
|
||||
// we go to secondary-only mode. If they ask for detached we detach.
|
||||
match req.config.mode {
|
||||
LocationConfigMode::Detached => {
|
||||
shard.policy = PlacementPolicy::Detached;
|
||||
}
|
||||
LocationConfigMode::Secondary => {
|
||||
// TODO: implement secondary-only mode.
|
||||
todo!();
|
||||
}
|
||||
LocationConfigMode::AttachedMulti
|
||||
| LocationConfigMode::AttachedSingle
|
||||
| LocationConfigMode::AttachedStale => {
|
||||
// TODO: persistence for changes in policy
|
||||
if pageservers.len() > 1 {
|
||||
shard.policy = PlacementPolicy::Double(1)
|
||||
} else {
|
||||
// Convenience for dev/test: if we just have one pageserver, import
|
||||
// tenants into Single mode so that scheduling will succeed.
|
||||
shard.policy = PlacementPolicy::Single
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
shard.schedule(&mut scheduler)?;
|
||||
|
||||
let maybe_waiter = shard.maybe_reconcile(
|
||||
result_tx.clone(),
|
||||
&pageservers,
|
||||
&compute_hook,
|
||||
&self.config,
|
||||
&self.persistence,
|
||||
);
|
||||
if let Some(waiter) = maybe_waiter {
|
||||
waiters.push(waiter);
|
||||
}
|
||||
|
||||
if let Some(node_id) = shard.intent.attached {
|
||||
result.shards.push(TenantShardLocation {
|
||||
shard_id: *shard_id,
|
||||
node_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if create {
|
||||
// Validate request mode
|
||||
match req.config.mode {
|
||||
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
|
||||
// When using this API to onboard an existing tenant to this service, it must start in
|
||||
// an attached state, because we need the request to come with a generation
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Imported tenant must be in attached mode"
|
||||
)));
|
||||
}
|
||||
|
||||
LocationConfigMode::AttachedMulti
|
||||
| LocationConfigMode::AttachedSingle
|
||||
| LocationConfigMode::AttachedStale => {
|
||||
// Pass
|
||||
}
|
||||
}
|
||||
|
||||
// Validate request generation
|
||||
let Some(generation) = req.config.generation else {
|
||||
// We can only import attached tenants, because we need the request to come with a generation
|
||||
return Err(ApiError::BadRequest(anyhow::anyhow!(
|
||||
"Generation is mandatory when importing tenant"
|
||||
)));
|
||||
};
|
||||
|
||||
// Synthesize a creation request
|
||||
Some(TenantCreateRequest {
|
||||
new_tenant_id: TenantShardId::unsharded(tenant_id),
|
||||
generation: Some(generation),
|
||||
shard_parameters: ShardParameters {
|
||||
// Must preserve the incoming shard_count to distinguish unsharded (0)
|
||||
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
|
||||
count: req.tenant_id.shard_count,
|
||||
// We only import un-sharded or single-sharded tenants, so stripe
|
||||
// size can be made up arbitrarily here.
|
||||
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
|
||||
},
|
||||
config: req.config.tenant_conf,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(create_req) = maybe_create {
|
||||
let create_resp = self.tenant_create(create_req).await?;
|
||||
result.shards = create_resp
|
||||
.shards
|
||||
.into_iter()
|
||||
.map(|s| TenantShardLocation {
|
||||
node_id: s.node_id,
|
||||
shard_id: s.shard_id,
|
||||
})
|
||||
.collect();
|
||||
} else {
|
||||
// This was an update, wait for reconciliation
|
||||
self.await_waiters(waiters).await?;
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
|
||||
// TODO: refactor into helper
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in
|
||||
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
let node_id = shard.intent.attached.ok_or_else(|| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
|
||||
})?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
targets.push((*tenant_shard_id, node.clone()));
|
||||
}
|
||||
targets
|
||||
};
|
||||
|
||||
// TODO: error out if the tenant is not attached anywhere.
|
||||
|
||||
// Phase 1: delete on the pageservers
|
||||
let mut any_pending = false;
|
||||
for (tenant_shard_id, node) in targets {
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
// TODO: this, like many other places, requires proper retry handling for 503, timeout: those should not
|
||||
// surface immediately as an error to our caller.
|
||||
let status = client.tenant_delete(tenant_shard_id).await.map_err(|e| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Error deleting shard {tenant_shard_id} on node {}: {e}",
|
||||
node.id
|
||||
))
|
||||
})?;
|
||||
tracing::info!(
|
||||
"Shard {tenant_shard_id} on node {}, delete returned {}",
|
||||
node.id,
|
||||
status
|
||||
);
|
||||
if status == StatusCode::ACCEPTED {
|
||||
any_pending = true;
|
||||
}
|
||||
}
|
||||
|
||||
if any_pending {
|
||||
// Caller should call us again later. When we eventually see 404s from
|
||||
// all the shards, we may proceed to delete our records of the tenant.
|
||||
tracing::info!(
|
||||
"Tenant {} has some shards pending deletion, returning 202",
|
||||
tenant_id
|
||||
);
|
||||
return Ok(StatusCode::ACCEPTED);
|
||||
}
|
||||
|
||||
// Fall through: deletion of the tenant on pageservers is complete, we may proceed to drop
|
||||
// our in-memory state and database state.
|
||||
|
||||
// Ordering: we delete persistent state first: if we then
|
||||
// crash, we will drop the in-memory state.
|
||||
|
||||
// Drop persistent state.
|
||||
self.persistence.delete_tenant(tenant_id).await?;
|
||||
|
||||
// Drop in-memory state
|
||||
{
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
locked
|
||||
.tenants
|
||||
.retain(|tenant_shard_id, _shard| tenant_shard_id.tenant_id != tenant_id);
|
||||
tracing::info!(
|
||||
"Deleted tenant {tenant_id}, now have {} tenants",
|
||||
locked.tenants.len()
|
||||
);
|
||||
};
|
||||
|
||||
// Success is represented as 404, to imitate the existing pageserver deletion API
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_timeline_create(
|
||||
@@ -759,25 +1005,15 @@ impl Service {
|
||||
) -> Result<TimelineInfo, ApiError> {
|
||||
let mut timeline_info = None;
|
||||
|
||||
let ensure_waiters = {
|
||||
let locked = self.inner.write().unwrap();
|
||||
tracing::info!(
|
||||
"Creating timeline {}/{}, have {} pageservers",
|
||||
tenant_id,
|
||||
create_req.new_timeline_id,
|
||||
locked.nodes.len()
|
||||
);
|
||||
tracing::info!(
|
||||
"Creating timeline {}/{}",
|
||||
tenant_id,
|
||||
create_req.new_timeline_id,
|
||||
);
|
||||
|
||||
self.ensure_attached(locked, tenant_id)
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
};
|
||||
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
|
||||
for waiter in ensure_waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await?;
|
||||
}
|
||||
self.ensure_attached_wait(tenant_id).await?;
|
||||
|
||||
// TODO: refuse to do this if shard splitting is in progress
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
@@ -848,6 +1084,111 @@ impl Service {
|
||||
Ok(timeline_info.expect("targets cannot be empty"))
|
||||
}
|
||||
|
||||
pub(crate) async fn tenant_timeline_delete(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<StatusCode, ApiError> {
|
||||
tracing::info!("Deleting timeline {}/{}", tenant_id, timeline_id,);
|
||||
|
||||
self.ensure_attached_wait(tenant_id).await?;
|
||||
|
||||
// TODO: refuse to do this if shard splitting is in progress
|
||||
let targets = {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let mut targets = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in
|
||||
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
|
||||
{
|
||||
let node_id = shard.intent.attached.ok_or_else(|| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("Shard not scheduled"))
|
||||
})?;
|
||||
let node = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.expect("Pageservers may not be deleted while referenced");
|
||||
|
||||
targets.push((*tenant_shard_id, node.clone()));
|
||||
}
|
||||
targets
|
||||
};
|
||||
|
||||
if targets.is_empty() {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant not found").into(),
|
||||
));
|
||||
}
|
||||
|
||||
// TODO: call into shards concurrently
|
||||
let mut any_pending = false;
|
||||
for (tenant_shard_id, node) in targets {
|
||||
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
|
||||
|
||||
tracing::info!(
|
||||
"Deleting timeline on shard {}/{}, attached to node {}",
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
node.id
|
||||
);
|
||||
|
||||
let status = client
|
||||
.timeline_delete(tenant_shard_id, timeline_id)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Error deleting timeline {timeline_id} on {tenant_shard_id} on node {}: {e}",
|
||||
node.id
|
||||
))
|
||||
})?;
|
||||
|
||||
if status == StatusCode::ACCEPTED {
|
||||
any_pending = true;
|
||||
}
|
||||
}
|
||||
|
||||
if any_pending {
|
||||
Ok(StatusCode::ACCEPTED)
|
||||
} else {
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
}
|
||||
}
|
||||
|
||||
/// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
|
||||
/// function looks it up and returns the url. If the tenant isn't found, returns Err(ApiError::NotFound)
|
||||
pub(crate) fn tenant_shard0_baseurl(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(String, TenantShardId), ApiError> {
|
||||
let locked = self.inner.read().unwrap();
|
||||
let Some((tenant_shard_id, shard)) = locked
|
||||
.tenants
|
||||
.range(TenantShardId::tenant_range(tenant_id))
|
||||
.next()
|
||||
else {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Tenant {tenant_id} not found").into(),
|
||||
));
|
||||
};
|
||||
|
||||
// TODO: should use the ID last published to compute_hook, rather than the intent: the intent might
|
||||
// point to somewhere we haven't attached yet.
|
||||
let Some(node_id) = shard.intent.attached else {
|
||||
return Err(ApiError::Conflict(
|
||||
"Cannot call timeline API on non-attached tenant".to_string(),
|
||||
));
|
||||
};
|
||||
|
||||
let Some(node) = locked.nodes.get(&node_id) else {
|
||||
// This should never happen
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"Shard refers to nonexistent node"
|
||||
)));
|
||||
};
|
||||
|
||||
Ok((node.base_url(), *tenant_shard_id))
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_locate(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
@@ -993,6 +1334,20 @@ impl Service {
|
||||
Ok(TenantShardMigrateResponse {})
|
||||
}
|
||||
|
||||
/// List all nodes known to the service, in their persistent (serializable) form.
pub(crate) async fn node_list(&self) -> Result<Vec<NodePersistence>, ApiError> {
    // Reading from storage rather than in-memory state is convenient: it avoids taking
    // the big lock and converting Node to a serializable structure.
    let stored = self.persistence.list_nodes().await?;

    Ok(stored
        .into_iter()
        .map(|node| node.to_persistent())
        .collect())
}
|
||||
|
||||
pub(crate) async fn node_register(
|
||||
&self,
|
||||
register_req: NodeRegisterRequest,
|
||||
@@ -1166,7 +1521,7 @@ impl Service {
|
||||
/// Helper for methods that will try and call pageserver APIs for
|
||||
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
|
||||
/// is attached somewhere.
|
||||
fn ensure_attached(
|
||||
fn ensure_attached_schedule(
|
||||
&self,
|
||||
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
|
||||
tenant_id: TenantId,
|
||||
@@ -1196,6 +1551,23 @@ impl Service {
|
||||
Ok(waiters)
|
||||
}
|
||||
|
||||
async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> {
|
||||
let ensure_waiters = {
|
||||
let locked = self.inner.write().unwrap();
|
||||
|
||||
self.ensure_attached_schedule(locked, tenant_id)
|
||||
.map_err(ApiError::InternalServerError)?
|
||||
};
|
||||
|
||||
let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap();
|
||||
for waiter in ensure_waiters {
|
||||
let timeout = deadline.duration_since(Instant::now());
|
||||
waiter.wait_timeout(timeout).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check all tenants for pending reconciliation work, and reconcile those in need
|
||||
///
|
||||
/// Returns how many reconciliation tasks were started
|
||||
|
||||
@@ -17,6 +17,7 @@ use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{env, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
use utils::{
|
||||
auth::{Claims, Scope},
|
||||
id::{NodeId, TenantId},
|
||||
@@ -59,6 +60,7 @@ pub struct InspectResponse {
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateResponseShard {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
pub generation: u32,
|
||||
}
|
||||
@@ -523,13 +525,15 @@ impl AttachmentService {
|
||||
RQ: Serialize + Sized,
|
||||
RS: DeserializeOwned + Sized,
|
||||
{
|
||||
let url = self
|
||||
.env
|
||||
.control_plane_api
|
||||
.clone()
|
||||
.unwrap()
|
||||
.join(&path)
|
||||
.unwrap();
|
||||
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
|
||||
// for general purpose API access.
|
||||
let listen_url = self.env.control_plane_api.clone().unwrap();
|
||||
let url = Url::from_str(&format!(
|
||||
"http://{}:{}/{path}",
|
||||
listen_url.host_str().unwrap(),
|
||||
listen_url.port().unwrap()
|
||||
))
|
||||
.unwrap();
|
||||
|
||||
let mut builder = self.client.request(method, url);
|
||||
if let Some(body) = body {
|
||||
@@ -566,7 +570,7 @@ impl AttachmentService {
|
||||
let response = self
|
||||
.dispatch::<_, AttachHookResponse>(
|
||||
Method::POST,
|
||||
"attach-hook".to_string(),
|
||||
"debug/v1/attach-hook".to_string(),
|
||||
Some(request),
|
||||
)
|
||||
.await?;
|
||||
@@ -582,7 +586,11 @@ impl AttachmentService {
|
||||
let request = InspectRequest { tenant_shard_id };
|
||||
|
||||
let response = self
|
||||
.dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
|
||||
.dispatch::<_, InspectResponse>(
|
||||
Method::POST,
|
||||
"debug/v1/inspect".to_string(),
|
||||
Some(request),
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(response.attachment)
|
||||
@@ -599,8 +607,12 @@ impl AttachmentService {
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
|
||||
self.dispatch::<(), _>(Method::GET, format!("tenant/{tenant_id}/locate"), None)
|
||||
.await
|
||||
self.dispatch::<(), _>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}/locate"),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
@@ -622,7 +634,7 @@ impl AttachmentService {
|
||||
|
||||
#[instrument(skip_all, fields(node_id=%req.node_id))]
|
||||
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
|
||||
self.dispatch::<_, ()>(Method::POST, "node".to_string(), Some(req))
|
||||
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -630,7 +642,7 @@ impl AttachmentService {
|
||||
pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
|
||||
self.dispatch::<_, ()>(
|
||||
Method::PUT,
|
||||
format!("node/{}/config", req.node_id),
|
||||
format!("control/v1/node/{}/config", req.node_id),
|
||||
Some(req),
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -51,7 +51,7 @@ project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "15";
|
||||
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
|
||||
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
|
||||
|
||||
fn default_conf(num_pageservers: u16) -> String {
|
||||
let mut template = format!(
|
||||
|
||||
@@ -9,5 +9,10 @@ prometheus.workspace = true
|
||||
libc.workspace = true
|
||||
once_cell.workspace = true
|
||||
chrono.workspace = true
|
||||
twox-hash.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8"
|
||||
rand_distr = "0.4.3"
|
||||
|
||||
523
libs/metrics/src/hll.rs
Normal file
523
libs/metrics/src/hll.rs
Normal file
@@ -0,0 +1,523 @@
|
||||
//! HyperLogLog is an algorithm for the count-distinct problem,
|
||||
//! approximating the number of distinct elements in a multiset.
|
||||
//! Calculating the exact cardinality of the distinct elements
|
||||
//! of a multiset requires an amount of memory proportional to
|
||||
//! the cardinality, which is impractical for very large data sets.
|
||||
//! Probabilistic cardinality estimators, such as the HyperLogLog algorithm,
|
||||
//! use significantly less memory than this, but can only approximate the cardinality.
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
|
||||
sync::{atomic::AtomicU8, Arc, RwLock},
|
||||
};
|
||||
|
||||
use prometheus::{
|
||||
core::{self, Describer},
|
||||
proto, Opts,
|
||||
};
|
||||
use twox_hash::xxh3;
|
||||
|
||||
/// Build an [`HyperLogLogVec`] with `$N` shards and register it with the default registry.
///
/// Two forms are accepted:
/// - `register_hll_vec!(N, opts, label_names)`
/// - `register_hll_vec!(N, name, help, label_names)`
///
/// Evaluates to the registration result, carrying the vec on success. Panics if the vec
/// itself cannot be constructed.
#[macro_export(local_inner_macros)]
macro_rules! register_hll_vec {
    ($N:literal, $OPTS:expr, $LABELS_NAMES:expr $(,)?) => {{
        let collector = $crate::HyperLogLogVec::<$N>::new($OPTS, $LABELS_NAMES).unwrap();
        $crate::register(Box::new(collector.clone())).map(|_| collector)
    }};

    ($N:literal, $NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
        $crate::register_hll_vec!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
    }};
}
|
||||
|
||||
/// Create an [`HyperLogLog`] with `$N` shards and register it with the default registry.
///
/// Two forms are accepted:
/// - `register_hll!(N, opts)`
/// - `register_hll!(N, name, help)`
///
/// Evaluates to the registration result, carrying the sketch on success. Panics if the
/// sketch itself cannot be constructed.
#[macro_export(local_inner_macros)]
macro_rules! register_hll {
    ($N:literal, $OPTS:expr $(,)?) => {{
        let hll = $crate::HyperLogLog::<$N>::with_opts($OPTS).unwrap();
        $crate::register(Box::new(hll.clone())).map(|_| hll)
    }};

    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
        // An unlabelled sketch has no label names: forward only the shard count and options.
        $crate::register_hll!($N, $crate::opts!($NAME, $HELP))
    }};
}
|
||||
|
||||
/// HLL is a probabilistic cardinality measure.
|
||||
///
|
||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// If you want an estimate over time, you can use the following query:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (
|
||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||
/// ) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||
///
|
||||
/// ```promql
|
||||
/// # LinearCounting(m, V) = m log (m / V)
|
||||
/// shards_count * ln(shards_count /
|
||||
/// # calculate V = how many shards contain a 0
|
||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLogVec<const N: usize> {
|
||||
core: Arc<HyperLogLogVecCore<N>>,
|
||||
}
|
||||
|
||||
struct HyperLogLogVecCore<const N: usize> {
|
||||
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
|
||||
pub desc: core::Desc,
|
||||
pub opts: Opts,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
for child in self.core.children.read().unwrap().values() {
|
||||
child.core.collect_into(&mut metrics);
|
||||
}
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogVec<N> {
|
||||
/// Create a new [`HyperLogLogVec`] based on the provided
|
||||
/// [`Opts`] and partitioned by the given label names. At least one label name must be
|
||||
/// provided.
|
||||
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
|
||||
let opts = opts.variable_labels(variable_names);
|
||||
|
||||
let desc = opts.describe()?;
|
||||
let v = HyperLogLogVecCore {
|
||||
children: RwLock::new(HashMap::default()),
|
||||
desc,
|
||||
opts,
|
||||
};
|
||||
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
|
||||
/// of label values (same order as the VariableLabels in Desc). If that combination of
|
||||
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
|
||||
///
|
||||
/// An error is returned if the number of label values is not the same as the
|
||||
/// number of VariableLabels in Desc.
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
self.core.get_metric_with_label_values(vals)
|
||||
}
|
||||
|
||||
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
|
||||
/// occurs.
|
||||
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
|
||||
self.get_metric_with_label_values(vals).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogVecCore<N> {
|
||||
pub fn get_metric_with_label_values(
|
||||
&self,
|
||||
vals: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let h = self.hash_label_values(vals)?;
|
||||
|
||||
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
self.get_or_create_metric(h, vals)
|
||||
}
|
||||
|
||||
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
|
||||
if vals.len() != self.desc.variable_labels.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: self.desc.variable_labels.len(),
|
||||
got: vals.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let mut h = xxh3::Hash64::default();
|
||||
for val in vals {
|
||||
h.write(val.as_bytes());
|
||||
}
|
||||
|
||||
Ok(h.finish())
|
||||
}
|
||||
|
||||
fn get_or_create_metric(
|
||||
&self,
|
||||
hash: u64,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<HyperLogLog<N>> {
|
||||
let mut children = self.children.write().unwrap();
|
||||
// Check exist first.
|
||||
if let Some(metric) = children.get(&hash).cloned() {
|
||||
return Ok(metric);
|
||||
}
|
||||
|
||||
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
|
||||
children.insert(hash, metric.clone());
|
||||
Ok(metric)
|
||||
}
|
||||
}
|
||||
|
||||
/// HLL is a probabilistic cardinality measure.
|
||||
///
|
||||
/// How to use this time-series for a metric name `my_metrics_total_hll`:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// If you want an estimate over time, you can use the following query:
|
||||
///
|
||||
/// ```promql
|
||||
/// # harmonic mean
|
||||
/// 1 / (
|
||||
/// sum (
|
||||
/// 2 ^ -(
|
||||
/// # HLL merge operation
|
||||
/// max (
|
||||
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
|
||||
/// ) by (hll_shard, other_labels...)
|
||||
/// )
|
||||
/// ) without (hll_shard)
|
||||
/// )
|
||||
/// * alpha
|
||||
/// * shards_count
|
||||
/// * shards_count
|
||||
/// ```
|
||||
///
|
||||
/// In the case of low cardinality, you might want to use the linear counting approximation:
|
||||
///
|
||||
/// ```promql
|
||||
/// # LinearCounting(m, V) = m log (m / V)
|
||||
/// shards_count * ln(shards_count /
|
||||
/// # calculate V = how many shards contain a 0
|
||||
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
|
||||
/// )
|
||||
/// ```
|
||||
///
|
||||
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
|
||||
#[derive(Clone)]
|
||||
pub struct HyperLogLog<const N: usize> {
|
||||
core: Arc<HyperLogLogCore<N>>,
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLog<N> {
|
||||
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
|
||||
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
|
||||
assert!(N.is_power_of_two());
|
||||
let opts = Opts::new(name, help);
|
||||
Self::with_opts(opts)
|
||||
}
|
||||
|
||||
/// Create a [`HyperLogLog`] with the `opts` options.
|
||||
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
|
||||
Self::with_opts_and_label_values(&opts, &[])
|
||||
}
|
||||
|
||||
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
|
||||
let desc = opts.describe()?;
|
||||
let labels = make_label_pairs(&desc, label_values)?;
|
||||
|
||||
let v = HyperLogLogCore {
|
||||
shards: [0; N].map(AtomicU8::new),
|
||||
desc,
|
||||
labels,
|
||||
};
|
||||
Ok(Self { core: Arc::new(v) })
|
||||
}
|
||||
|
||||
pub fn measure(&self, item: &impl Hash) {
|
||||
// changing the hasher will break compatibility with previous measurements.
|
||||
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
|
||||
}
|
||||
|
||||
fn record(&self, hash: u64) {
|
||||
let p = N.ilog2() as u8;
|
||||
let j = hash & (N as u64 - 1);
|
||||
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
|
||||
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
struct HyperLogLogCore<const N: usize> {
|
||||
shards: [AtomicU8; N],
|
||||
desc: core::Desc,
|
||||
labels: Vec<proto::LabelPair>,
|
||||
}
|
||||
|
||||
impl<const N: usize> core::Collector for HyperLogLog<N> {
|
||||
fn desc(&self) -> Vec<&core::Desc> {
|
||||
vec![&self.core.desc]
|
||||
}
|
||||
|
||||
fn collect(&self) -> Vec<proto::MetricFamily> {
|
||||
let mut m = proto::MetricFamily::default();
|
||||
m.set_name(self.core.desc.fq_name.clone());
|
||||
m.set_help(self.core.desc.help.clone());
|
||||
m.set_field_type(proto::MetricType::GAUGE);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
self.core.collect_into(&mut metrics);
|
||||
m.set_metric(metrics);
|
||||
|
||||
vec![m]
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> HyperLogLogCore<N> {
|
||||
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
|
||||
self.shards.iter().enumerate().for_each(|(i, x)| {
|
||||
let mut shard_label = proto::LabelPair::default();
|
||||
shard_label.set_name("hll_shard".to_owned());
|
||||
shard_label.set_value(format!("{i}"));
|
||||
|
||||
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
|
||||
|
||||
// This seems like it would be a race condition,
|
||||
// but HLL is not impacted by a write in one shard happening in between.
|
||||
// This is because in PromQL we will be implementing a harmonic mean of all buckets.
|
||||
// we will also merge samples in a time series using `max by (hll_shard)`.
|
||||
|
||||
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
|
||||
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
|
||||
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
let mut m = proto::Metric::default();
|
||||
let mut c = proto::Gauge::default();
|
||||
c.set_value(v as f64);
|
||||
m.set_gauge(c);
|
||||
|
||||
let mut labels = Vec::with_capacity(self.labels.len() + 1);
|
||||
labels.extend_from_slice(&self.labels);
|
||||
labels.push(shard_label);
|
||||
|
||||
m.set_label(labels);
|
||||
metrics.push(m);
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn make_label_pairs(
|
||||
desc: &core::Desc,
|
||||
label_values: &[&str],
|
||||
) -> prometheus::Result<Vec<proto::LabelPair>> {
|
||||
if desc.variable_labels.len() != label_values.len() {
|
||||
return Err(prometheus::Error::InconsistentCardinality {
|
||||
expect: desc.variable_labels.len(),
|
||||
got: label_values.len(),
|
||||
});
|
||||
}
|
||||
|
||||
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
|
||||
if total_len == 0 {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
if desc.variable_labels.is_empty() {
|
||||
return Ok(desc.const_label_pairs.clone());
|
||||
}
|
||||
|
||||
let mut label_pairs = Vec::with_capacity(total_len);
|
||||
for (i, n) in desc.variable_labels.iter().enumerate() {
|
||||
let mut label_pair = proto::LabelPair::default();
|
||||
label_pair.set_name(n.clone());
|
||||
label_pair.set_value(label_values[i].to_owned());
|
||||
label_pairs.push(label_pair);
|
||||
}
|
||||
|
||||
for label_pair in &desc.const_label_pairs {
|
||||
label_pairs.push(label_pair.clone());
|
||||
}
|
||||
label_pairs.sort();
|
||||
Ok(label_pairs)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use prometheus::{proto, Opts};
    use rand::{rngs::StdRng, Rng, SeedableRng};
    use rand_distr::{Distribution, Zipf};

    use crate::HyperLogLogVec;

    /// Drain every child sketch of `hll` into one flat list of per-shard samples.
    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
        let children = hll.core.children.read().unwrap();
        let mut samples = vec![];
        for child in children.values() {
            child.core.collect_into(&mut samples);
        }
        samples
    }

    /// Merge the sketches accepted by `filter` (max per shard) and compute the HLL estimate.
    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
        let mut buckets = [0.0; 32];
        // Each child contributes 32 consecutive samples, one per shard.
        for group in metrics.chunks_exact(32).filter(|group| filter(&group[0])) {
            for (bucket, sample) in buckets.iter_mut().zip(group) {
                *bucket = f64::max(*bucket, sample.get_gauge().get_value());
            }
        }

        let harmonic_sum = buckets.into_iter().map(|f| 2.0f64.powf(-f)).sum::<f64>();
        harmonic_sum.recip() * 0.697 * 32.0 * 32.0
    }

    /// Feed `n` samples of `dist` into each of two labelled sketches.
    ///
    /// Returns the exact distinct counts and the estimates, each as `[a ∪ b, a, b]`.
    fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();

        let mut samples = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
        let mut seen_a = HashSet::new();
        let mut seen_b = HashSet::new();

        for x in samples.by_ref().take(n) {
            let bits = x.to_bits();
            seen_a.insert(bits);
            hll.with_label_values(&["a"]).measure(&bits);
        }
        for x in samples.by_ref().take(n) {
            let bits = x.to_bits();
            seen_b.insert(bits);
            hll.with_label_values(&["b"]).measure(&bits);
        }
        let union = &seen_a | &seen_b;

        let metrics = collect(&hll);
        let est_all = get_cardinality(&metrics, |_| true);
        let est_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
        let est_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");

        (
            [union.len(), seen_a.len(), seen_b.len()],
            [est_all, est_a, est_b],
        )
    }

    /// Assert that `value` lies strictly between `low` and `high`.
    fn assert_between(value: f64, low: f64, high: f64) {
        assert!(low < value && value < high);
    }

    #[test]
    fn test_cardinality_small() {
        let (actual, estimate) = test_cardinality(100, Zipf::new(100, 1.2f64).unwrap());

        assert_eq!(actual, [46, 30, 32]);
        assert_between(estimate[0], 51.3, 51.4);
        assert_between(estimate[1], 44.0, 44.1);
        assert_between(estimate[2], 39.0, 39.1);
    }

    #[test]
    fn test_cardinality_medium() {
        let (actual, estimate) = test_cardinality(10000, Zipf::new(10000, 1.2f64).unwrap());

        assert_eq!(actual, [2529, 1618, 1629]);
        assert_between(estimate[0], 2309.1, 2309.2);
        assert_between(estimate[1], 1566.6, 1566.7);
        assert_between(estimate[2], 1629.5, 1629.6);
    }

    #[test]
    fn test_cardinality_large() {
        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(1_000_000, 1.2f64).unwrap());

        assert_eq!(actual, [129077, 79579, 79630]);
        assert_between(estimate[0], 126067.2, 126067.3);
        assert_between(estimate[1], 83076.8, 83076.9);
        assert_between(estimate[2], 64251.2, 64251.3);
    }

    #[test]
    fn test_cardinality_small2() {
        let (actual, estimate) = test_cardinality(100, Zipf::new(200, 0.8f64).unwrap());

        assert_eq!(actual, [92, 58, 60]);
        assert_between(estimate[0], 116.1, 116.2);
        assert_between(estimate[1], 81.7, 81.8);
        assert_between(estimate[2], 69.3, 69.4);
    }

    #[test]
    fn test_cardinality_medium2() {
        let (actual, estimate) = test_cardinality(10000, Zipf::new(20000, 0.8f64).unwrap());

        assert_eq!(actual, [8201, 5131, 5051]);
        assert_between(estimate[0], 6846.4, 6846.5);
        assert_between(estimate[1], 5239.1, 5239.2);
        assert_between(estimate[2], 4292.8, 4292.9);
    }

    #[test]
    fn test_cardinality_large2() {
        let (actual, estimate) = test_cardinality(1_000_000, Zipf::new(2_000_000, 0.8f64).unwrap());

        assert_eq!(actual, [777847, 482069, 482246]);
        assert_between(estimate[0], 699437.4, 699437.5);
        assert_between(estimate[1], 374948.9, 374949.0);
        assert_between(estimate[2], 434609.7, 434609.8);
    }
}
|
||||
@@ -28,7 +28,9 @@ use prometheus::{Registry, Result};
|
||||
pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
mod hll;
|
||||
pub mod metric_vec_duration;
|
||||
pub use hll::{HyperLogLog, HyperLogLogVec};
|
||||
|
||||
pub type UIntGauge = GenericGauge<AtomicU64>;
|
||||
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
|
||||
|
||||
@@ -364,6 +364,19 @@ pub struct TenantLocationConfigRequest {
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantShardLocation {
|
||||
pub shard_id: TenantShardId,
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigResponse {
|
||||
pub shards: Vec<TenantShardLocation>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantConfigRequest {
|
||||
|
||||
@@ -207,10 +207,16 @@ pub fn find_end_of_wal(
|
||||
let seg_offs = curr_lsn.segment_offset(wal_seg_size);
|
||||
segment.seek(SeekFrom::Start(seg_offs as u64))?;
|
||||
// loop inside segment
|
||||
loop {
|
||||
while curr_lsn.segment_number(wal_seg_size) == segno {
|
||||
let bytes_read = segment.read(&mut buf)?;
|
||||
if bytes_read == 0 {
|
||||
break; // EOF
|
||||
debug!(
|
||||
"find_end_of_wal reached end at {:?}, EOF in segment {:?} at offset {}",
|
||||
result,
|
||||
seg_file_path,
|
||||
curr_lsn.segment_offset(wal_seg_size)
|
||||
);
|
||||
return Ok(result);
|
||||
}
|
||||
curr_lsn += bytes_read as u64;
|
||||
decoder.feed_bytes(&buf[0..bytes_read]);
|
||||
|
||||
@@ -646,7 +646,7 @@ impl RemoteStorage for S3Bucket {
|
||||
let timestamp = DateTime::from(timestamp);
|
||||
let done_if_after = DateTime::from(done_if_after);
|
||||
|
||||
tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
|
||||
tracing::info!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let prefix = prefix
|
||||
@@ -657,75 +657,108 @@ impl RemoteStorage for S3Bucket {
|
||||
let max_retries = 10;
|
||||
let is_permanent = |_e: &_| false;
|
||||
|
||||
let list = backoff::retry(
|
||||
|| async {
|
||||
Ok(self
|
||||
.client
|
||||
.list_object_versions()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(prefix.clone())
|
||||
.send()
|
||||
.await?)
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"listing object versions for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
|
||||
)
|
||||
.await?;
|
||||
let mut key_marker = None;
|
||||
let mut version_id_marker = None;
|
||||
let mut versions_and_deletes = Vec::new();
|
||||
|
||||
if list.is_truncated().unwrap_or_default() {
|
||||
anyhow::bail!("Received truncated ListObjectVersions response for prefix={prefix:?}");
|
||||
loop {
|
||||
let response = backoff::retry(
|
||||
|| async {
|
||||
Ok(self
|
||||
.client
|
||||
.list_object_versions()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(prefix.clone())
|
||||
.set_key_marker(key_marker.clone())
|
||||
.set_version_id_marker(version_id_marker.clone())
|
||||
.send()
|
||||
.await?)
|
||||
},
|
||||
is_permanent,
|
||||
warn_threshold,
|
||||
max_retries,
|
||||
"listing object versions for time_travel_recover",
|
||||
backoff::Cancel::new(cancel.clone(), || anyhow!("Cancelled")),
|
||||
)
|
||||
.await?;
|
||||
|
||||
tracing::trace!(
|
||||
" Got List response version_id_marker={:?}, key_marker={:?}",
|
||||
response.version_id_marker,
|
||||
response.key_marker
|
||||
);
|
||||
let versions = response
|
||||
.versions
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.map(VerOrDelete::from_version);
|
||||
let deletes = response
|
||||
.delete_markers
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.map(VerOrDelete::from_delete_marker);
|
||||
itertools::process_results(versions.chain(deletes), |n_vds| {
|
||||
versions_and_deletes.extend(n_vds)
|
||||
})?;
|
||||
fn none_if_empty(v: Option<String>) -> Option<String> {
|
||||
v.filter(|v| !v.is_empty())
|
||||
}
|
||||
version_id_marker = none_if_empty(response.next_version_id_marker);
|
||||
key_marker = none_if_empty(response.next_key_marker);
|
||||
if version_id_marker.is_none() {
|
||||
// The final response is not supposed to be truncated
|
||||
if response.is_truncated.unwrap_or_default() {
|
||||
anyhow::bail!(
|
||||
"Received truncated ListObjectVersions response for prefix={prefix:?}"
|
||||
);
|
||||
}
|
||||
break;
|
||||
}
|
||||
// Limit the number of versions/deletions, mostly so that we don't
|
||||
// keep requesting forever if the list is too long, as we'd put the
|
||||
// list in RAM.
|
||||
// Building a list of 100k entries that reaches the limit roughly takes
|
||||
// 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
|
||||
const COMPLEXITY_LIMIT: usize = 100_000;
|
||||
if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
|
||||
anyhow::bail!(
|
||||
"Limit for number of versions/deletions exceeded for prefix={prefix:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let mut versions_deletes = list
|
||||
.versions()
|
||||
.iter()
|
||||
.map(VerOrDelete::Version)
|
||||
.chain(list.delete_markers().iter().map(VerOrDelete::DeleteMarker))
|
||||
.collect::<Vec<_>>();
|
||||
// Work on the list of references instead of the objects directly,
|
||||
// otherwise we get lifetime errors in the sort_by_key call below.
|
||||
let mut versions_and_deletes = versions_and_deletes.iter().collect::<Vec<_>>();
|
||||
|
||||
versions_deletes.sort_by_key(|vd| (vd.key(), vd.last_modified()));
|
||||
versions_and_deletes.sort_by_key(|vd| (&vd.key, &vd.last_modified));
|
||||
|
||||
let mut vds_for_key = HashMap::<_, Vec<_>>::new();
|
||||
|
||||
for vd in versions_deletes {
|
||||
let last_modified = vd.last_modified();
|
||||
let version_id = vd.version_id();
|
||||
let key = vd.key();
|
||||
let (Some(last_modified), Some(version_id), Some(key)) =
|
||||
(last_modified, version_id, key)
|
||||
else {
|
||||
anyhow::bail!(
|
||||
"One (or more) of last_modified, key, and id is None. \
|
||||
Is versioning enabled in the bucket? last_modified={:?} key={:?} version_id={:?}",
|
||||
last_modified, key, version_id,
|
||||
);
|
||||
};
|
||||
for vd in &versions_and_deletes {
|
||||
let VerOrDelete {
|
||||
version_id, key, ..
|
||||
} = &vd;
|
||||
if version_id == "null" {
|
||||
anyhow::bail!("Received ListVersions response for key={key} with version_id='null', \
|
||||
indicating either disabled versioning, or legacy objects with null version id values");
|
||||
}
|
||||
tracing::trace!(
|
||||
"Parsing version key={key} version_id={version_id} is_delete={}",
|
||||
matches!(vd, VerOrDelete::DeleteMarker(_))
|
||||
"Parsing version key={key} version_id={version_id} kind={:?}",
|
||||
vd.kind
|
||||
);
|
||||
|
||||
vds_for_key
|
||||
.entry(key)
|
||||
.or_default()
|
||||
.push((vd, last_modified, version_id));
|
||||
vds_for_key.entry(key).or_default().push(vd);
|
||||
}
|
||||
for (key, versions) in vds_for_key {
|
||||
let (last_vd, last_last_modified, _version_id) = versions.last().unwrap();
|
||||
if last_last_modified > &&done_if_after {
|
||||
let last_vd = versions.last().unwrap();
|
||||
if last_vd.last_modified > done_if_after {
|
||||
tracing::trace!("Key {key} has version later than done_if_after, skipping");
|
||||
continue;
|
||||
}
|
||||
// the version we want to restore to.
|
||||
let version_to_restore_to =
|
||||
match versions.binary_search_by_key(×tamp, |tpl| *tpl.1) {
|
||||
match versions.binary_search_by_key(×tamp, |tpl| tpl.last_modified) {
|
||||
Ok(v) => v,
|
||||
Err(e) => e,
|
||||
};
|
||||
@@ -743,7 +776,11 @@ impl RemoteStorage for S3Bucket {
|
||||
do_delete = true;
|
||||
} else {
|
||||
match &versions[version_to_restore_to - 1] {
|
||||
(VerOrDelete::Version(_), _last_modified, version_id) => {
|
||||
VerOrDelete {
|
||||
kind: VerOrDeleteKind::Version,
|
||||
version_id,
|
||||
..
|
||||
} => {
|
||||
tracing::trace!("Copying old version {version_id} for {key}...");
|
||||
// Restore the state to the last version by copying
|
||||
let source_id =
|
||||
@@ -768,13 +805,16 @@ impl RemoteStorage for S3Bucket {
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
(VerOrDelete::DeleteMarker(_), _last_modified, _version_id) => {
|
||||
VerOrDelete {
|
||||
kind: VerOrDeleteKind::DeleteMarker,
|
||||
..
|
||||
} => {
|
||||
do_delete = true;
|
||||
}
|
||||
}
|
||||
};
|
||||
if do_delete {
|
||||
if matches!(last_vd, VerOrDelete::DeleteMarker(_)) {
|
||||
if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
|
||||
// Key has since been deleted (but there was some history), no need to do anything
|
||||
tracing::trace!("Key {key} already deleted, skipping.");
|
||||
} else {
|
||||
@@ -811,29 +851,59 @@ fn start_measuring_requests(
|
||||
})
|
||||
}
|
||||
|
||||
enum VerOrDelete<'a> {
|
||||
Version(&'a ObjectVersion),
|
||||
DeleteMarker(&'a DeleteMarkerEntry),
|
||||
// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
|
||||
struct VerOrDelete {
|
||||
kind: VerOrDeleteKind,
|
||||
last_modified: DateTime,
|
||||
version_id: String,
|
||||
key: String,
|
||||
}
|
||||
|
||||
impl<'a> VerOrDelete<'a> {
|
||||
fn last_modified(&self) -> Option<&'a DateTime> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.last_modified(),
|
||||
VerOrDelete::DeleteMarker(v) => v.last_modified(),
|
||||
}
|
||||
#[derive(Debug)]
|
||||
enum VerOrDeleteKind {
|
||||
Version,
|
||||
DeleteMarker,
|
||||
}
|
||||
|
||||
impl VerOrDelete {
|
||||
fn with_kind(
|
||||
kind: VerOrDeleteKind,
|
||||
last_modified: Option<DateTime>,
|
||||
version_id: Option<String>,
|
||||
key: Option<String>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let lvk = (last_modified, version_id, key);
|
||||
let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
|
||||
anyhow::bail!(
|
||||
"One (or more) of last_modified, key, and id is None. \
|
||||
Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
|
||||
lvk.0,
|
||||
lvk.1,
|
||||
lvk.2,
|
||||
);
|
||||
};
|
||||
Ok(Self {
|
||||
kind,
|
||||
last_modified,
|
||||
version_id,
|
||||
key,
|
||||
})
|
||||
}
|
||||
fn version_id(&self) -> Option<&'a str> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.version_id(),
|
||||
VerOrDelete::DeleteMarker(v) => v.version_id(),
|
||||
}
|
||||
fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
|
||||
Self::with_kind(
|
||||
VerOrDeleteKind::Version,
|
||||
v.last_modified,
|
||||
v.version_id,
|
||||
v.key,
|
||||
)
|
||||
}
|
||||
fn key(&self) -> Option<&'a str> {
|
||||
match self {
|
||||
VerOrDelete::Version(v) => v.key(),
|
||||
VerOrDelete::DeleteMarker(v) => v.key(),
|
||||
}
|
||||
fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
|
||||
Self::with_kind(
|
||||
VerOrDeleteKind::DeleteMarker,
|
||||
v.last_modified,
|
||||
v.version_id,
|
||||
v.key,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -112,6 +112,55 @@ pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Erro
|
||||
tokio::fs::File::open(path.as_ref()).await?.sync_all().await
|
||||
}
|
||||
|
||||
pub async fn fsync_async_opt(
|
||||
path: impl AsRef<Utf8Path>,
|
||||
do_fsync: bool,
|
||||
) -> Result<(), std::io::Error> {
|
||||
if do_fsync {
|
||||
fsync_async(path.as_ref()).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Like postgres' durable_rename, renames file issuing fsyncs do make it
|
||||
/// durable. After return, file and rename are guaranteed to be persisted.
|
||||
///
|
||||
/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make
|
||||
/// contents durable; 2) its directory entry to make rename durable 3) again to
|
||||
/// already renamed file, which is not required by standards but postgres does
|
||||
/// it, let's stick to that. Postgres additionally fsyncs newpath *before*
|
||||
/// rename if it exists to ensure that at least one of the files survives, but
|
||||
/// current callers don't need that.
|
||||
///
|
||||
/// virtual_file.rs has similar code, but it doesn't use vfs.
|
||||
///
|
||||
/// Useful links: <https://lwn.net/Articles/457667/>
|
||||
/// <https://www.postgresql.org/message-id/flat/56583BDD.9060302%402ndquadrant.com>
|
||||
/// <https://thunk.org/tytso/blog/2009/03/15/dont-fear-the-fsync/>
|
||||
pub async fn durable_rename(
|
||||
old_path: impl AsRef<Utf8Path>,
|
||||
new_path: impl AsRef<Utf8Path>,
|
||||
do_fsync: bool,
|
||||
) -> io::Result<()> {
|
||||
// first fsync the file
|
||||
fsync_async_opt(old_path.as_ref(), do_fsync).await?;
|
||||
|
||||
// Time to do the real deal.
|
||||
tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?;
|
||||
|
||||
// Postgres'ish fsync of renamed file.
|
||||
fsync_async_opt(new_path.as_ref(), do_fsync).await?;
|
||||
|
||||
// Now fsync the parent
|
||||
let parent = match new_path.as_ref().parent() {
|
||||
Some(p) => p,
|
||||
None => Utf8Path::new("./"), // assume current dir if there is no parent
|
||||
};
|
||||
fsync_async_opt(parent, do_fsync).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -69,6 +69,25 @@ impl Client {
|
||||
resp.json().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// Get an arbitrary path and returning a streaming Response. This function is suitable
|
||||
/// for pass-through/proxy use cases where we don't care what the response content looks
|
||||
/// like.
|
||||
///
|
||||
/// Use/add one of the properly typed methods below if you know aren't proxying, and
|
||||
/// know what kind of response you expect.
|
||||
pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
|
||||
debug_assert!(path.starts_with('/'));
|
||||
let uri = format!("{}{}", self.mgmt_api_endpoint, path);
|
||||
|
||||
let req = self.client.request(Method::GET, uri);
|
||||
let req = if let Some(value) = &self.authorization_header {
|
||||
req.header(reqwest::header::AUTHORIZATION, value)
|
||||
} else {
|
||||
req
|
||||
};
|
||||
req.send().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn tenant_details(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
@@ -171,6 +190,25 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// The tenant deletion API can return 202 if deletion is incomplete, or
|
||||
/// 404 if it is complete. Callers are responsible for checking the status
|
||||
/// code and retrying. Error codes other than 404 will return Err().
|
||||
pub async fn tenant_delete(&self, tenant_shard_id: TenantShardId) -> Result<StatusCode> {
|
||||
let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
|
||||
|
||||
match self.request(Method::DELETE, &uri, ()).await {
|
||||
Err(Error::ApiError(status_code, msg)) => {
|
||||
if status_code == StatusCode::NOT_FOUND {
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
} else {
|
||||
Err(Error::ApiError(status_code, msg))
|
||||
}
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
Ok(response) => Ok(response.status()),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_config(&self, req: &TenantConfigRequest) -> Result<()> {
|
||||
let uri = format!("{}/v1/tenant/config", self.mgmt_api_endpoint);
|
||||
self.request(Method::PUT, &uri, req).await?;
|
||||
@@ -234,6 +272,32 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
/// The timeline deletion API can return 201 if deletion is incomplete, or
|
||||
/// 403 if it is complete. Callers are responsible for checking the status
|
||||
/// code and retrying. Error codes other than 403 will return Err().
|
||||
pub async fn timeline_delete(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<StatusCode> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
match self.request(Method::DELETE, &uri, ()).await {
|
||||
Err(Error::ApiError(status_code, msg)) => {
|
||||
if status_code == StatusCode::NOT_FOUND {
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
} else {
|
||||
Err(Error::ApiError(status_code, msg))
|
||||
}
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
Ok(response) => Ok(response.status()),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{}/reset",
|
||||
|
||||
@@ -97,23 +97,86 @@ pub enum EvictionOrder {
|
||||
|
||||
/// Order the layers to be evicted by how recently they have been accessed relatively within
|
||||
/// the set of resident layers of a tenant.
|
||||
///
|
||||
/// This strategy will evict layers more fairly but is untested.
|
||||
RelativeAccessed {
|
||||
#[serde(default)]
|
||||
/// Determines if the tenant with most layers should lose first.
|
||||
///
|
||||
/// Having this enabled is currently the only reasonable option, because the order in which
|
||||
/// we read tenants is deterministic. If we find the need to use this as `false`, we need
|
||||
/// to ensure nondeterminism by adding in a random number to break the
|
||||
/// `relative_last_activity==0.0` ties.
|
||||
#[serde(default = "default_highest_layer_count_loses_first")]
|
||||
highest_layer_count_loses_first: bool,
|
||||
},
|
||||
}
|
||||
|
||||
/// Serde default for `highest_layer_count_loses_first`: the option is enabled unless the
/// configuration says otherwise.
fn default_highest_layer_count_loses_first() -> bool {
    true
}
|
||||
|
||||
impl EvictionOrder {
|
||||
/// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
|
||||
/// counts should be the first ones to have their layers evicted.
|
||||
fn highest_layer_count_loses_first(&self) -> bool {
|
||||
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
|
||||
use EvictionOrder::*;
|
||||
|
||||
match self {
|
||||
EvictionOrder::AbsoluteAccessed => false,
|
||||
EvictionOrder::RelativeAccessed {
|
||||
AbsoluteAccessed => {
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.last_activity_ts)
|
||||
});
|
||||
}
|
||||
RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating tenants
|
||||
/// layers in **most** recently used order.
|
||||
fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
|
||||
use EvictionOrder::*;
|
||||
|
||||
match self {
|
||||
AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
|
||||
RelativeAccessed {
|
||||
highest_layer_count_loses_first,
|
||||
} => *highest_layer_count_loses_first,
|
||||
} => {
|
||||
// keeping the -1 or not decides if every tenant should lose their least recently accessed
|
||||
// layer OR if this should happen in the order of having highest layer count:
|
||||
let fudge = if *highest_layer_count_loses_first {
|
||||
// relative_last_activity vs. tenant layer count:
|
||||
// - 0.1..=1.0 (10 layers)
|
||||
// - 0.01..=1.0 (100 layers)
|
||||
// - 0.001..=1.0 (1000 layers)
|
||||
//
|
||||
// leading to evicting less of the smallest tenants.
|
||||
0
|
||||
} else {
|
||||
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
|
||||
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
|
||||
// be that less than 10k layer evictions is enough, so we would not need to evict from
|
||||
// all tenants.
|
||||
//
|
||||
// as the tenant ordering is now deterministic this could hit the same tenants
|
||||
// disproportionately on multiple invocations. alternative could be to remember how many
|
||||
// layers did we evict last time from this tenant, and inject that as an additional
|
||||
// fudge here.
|
||||
1
|
||||
};
|
||||
|
||||
let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
|
||||
let divider = total as f32;
|
||||
|
||||
// most recently used is always (total - 0) / divider == 1.0
|
||||
// least recently used depends on the fudge:
|
||||
// - (total - 1) - (total - 1) / total => 0 / total
|
||||
// - total - (total - 1) / total => 1 / total
|
||||
let distance = (total - index) as f32;
|
||||
|
||||
finite_f32::FiniteF32::try_from_normalized(distance / divider)
|
||||
.unwrap_or_else(|val| {
|
||||
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
|
||||
finite_f32::FiniteF32::ZERO
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -389,52 +452,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
|
||||
let selection = select_victims(&candidates, usage_pre);
|
||||
|
||||
let mut candidates = candidates;
|
||||
|
||||
let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
|
||||
// we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
|
||||
// for comparison here. this is a temporary measure to develop alternatives.
|
||||
use std::fmt::Write;
|
||||
|
||||
let mut summary_buf = String::with_capacity(256);
|
||||
|
||||
{
|
||||
let absolute_summary = candidates
|
||||
.iter()
|
||||
.take(selection.amount)
|
||||
.map(|(_, candidate)| candidate)
|
||||
.collect::<summary::EvictionSummary>();
|
||||
|
||||
write!(summary_buf, "{absolute_summary}").expect("string grows");
|
||||
|
||||
info!("absolute accessed selection summary: {summary_buf}");
|
||||
}
|
||||
|
||||
candidates.sort_unstable_by_key(|(partition, candidate)| {
|
||||
(*partition, candidate.relative_last_activity)
|
||||
});
|
||||
|
||||
let selection = select_victims(&candidates, usage_pre);
|
||||
|
||||
{
|
||||
summary_buf.clear();
|
||||
|
||||
let relative_summary = candidates
|
||||
.iter()
|
||||
.take(selection.amount)
|
||||
.map(|(_, candidate)| candidate)
|
||||
.collect::<summary::EvictionSummary>();
|
||||
|
||||
write!(summary_buf, "{relative_summary}").expect("string grows");
|
||||
|
||||
info!("relative accessed selection summary: {summary_buf}");
|
||||
}
|
||||
|
||||
selection
|
||||
} else {
|
||||
selection
|
||||
};
|
||||
|
||||
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
|
||||
|
||||
// phase2: evict layers
|
||||
@@ -835,54 +852,12 @@ async fn collect_eviction_candidates(
|
||||
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
|
||||
let mut cumsum: i128 = 0;
|
||||
|
||||
// keeping the -1 or not decides if every tenant should lose their least recently accessed
|
||||
// layer OR if this should happen in the order of having highest layer count:
|
||||
let fudge = if eviction_order.highest_layer_count_loses_first() {
|
||||
// relative_age vs. tenant layer count:
|
||||
// - 0.1..=1.0 (10 layers)
|
||||
// - 0.01..=1.0 (100 layers)
|
||||
// - 0.001..=1.0 (1000 layers)
|
||||
//
|
||||
// leading to evicting less of the smallest tenants.
|
||||
0
|
||||
} else {
|
||||
// use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
|
||||
// layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
|
||||
// be that less than 10k layer evictions is enough, so we would not need to evict from
|
||||
// all tenants.
|
||||
//
|
||||
// as the tenant ordering is now deterministic this could hit the same tenants
|
||||
// disproportionately on multiple invocations. alternative could be to remember how many
|
||||
// layers did we evict last time from this tenant, and inject that as an additional
|
||||
// fudge here.
|
||||
1
|
||||
};
|
||||
|
||||
let total = tenant_candidates
|
||||
.len()
|
||||
.checked_sub(fudge)
|
||||
.filter(|&x| x > 0)
|
||||
// support 0 or 1 resident layer tenants as well
|
||||
.unwrap_or(1);
|
||||
let divider = total as f32;
|
||||
let total = tenant_candidates.len();
|
||||
|
||||
for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
|
||||
// as we iterate this reverse sorted list, the most recently accessed layer will always
|
||||
// be 1.0; this is for us to evict it last.
|
||||
candidate.relative_last_activity = if matches!(
|
||||
eviction_order,
|
||||
EvictionOrder::RelativeAccessed { .. }
|
||||
) {
|
||||
// another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
|
||||
// similarly for u16. unsure how it would help.
|
||||
finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
|
||||
.unwrap_or_else(|val| {
|
||||
tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
|
||||
finite_f32::FiniteF32::ZERO
|
||||
})
|
||||
} else {
|
||||
finite_f32::FiniteF32::ZERO
|
||||
};
|
||||
candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
|
||||
|
||||
let partition = if cumsum > min_resident_size as i128 {
|
||||
MinResidentSizePartition::Above
|
||||
@@ -927,10 +902,7 @@ async fn collect_eviction_candidates(
|
||||
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
|
||||
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
|
||||
|
||||
// always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
|
||||
// will sort later by candidate.relative_last_activity to compare evictions.
|
||||
candidates
|
||||
.sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
|
||||
eviction_order.sort(&mut candidates);
|
||||
|
||||
Ok(EvictionCandidates::Finished(candidates))
|
||||
}
|
||||
@@ -1070,6 +1042,12 @@ pub(crate) mod finite_f32 {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FiniteF32> for f32 {
|
||||
fn from(value: FiniteF32) -> f32 {
|
||||
value.0
|
||||
}
|
||||
}
|
||||
|
||||
impl FiniteF32 {
|
||||
pub const ZERO: FiniteF32 = FiniteF32(0.0);
|
||||
|
||||
@@ -1082,136 +1060,9 @@ pub(crate) mod finite_f32 {
|
||||
Err(value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mod summary {
|
||||
use super::finite_f32::FiniteF32;
|
||||
use super::{EvictionCandidate, LayerCount};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::time::SystemTime;
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub(super) struct EvictionSummary {
|
||||
evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
|
||||
total: LayerCount,
|
||||
|
||||
last_absolute: Option<SystemTime>,
|
||||
last_relative: Option<FiniteF32>,
|
||||
}
|
||||
|
||||
impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
|
||||
fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
|
||||
let mut summary = EvictionSummary::default();
|
||||
for item in iter {
|
||||
let counts = summary
|
||||
.evicted_per_tenant
|
||||
.entry(*item.layer.get_tenant_shard_id())
|
||||
.or_default();
|
||||
|
||||
let sz = item.layer.get_file_size();
|
||||
|
||||
counts.file_sizes += sz;
|
||||
counts.count += 1;
|
||||
|
||||
summary.total.file_sizes += sz;
|
||||
summary.total.count += 1;
|
||||
|
||||
summary.last_absolute = Some(item.last_activity_ts);
|
||||
summary.last_relative = Some(item.relative_last_activity);
|
||||
}
|
||||
|
||||
summary
|
||||
}
|
||||
}
|
||||
|
||||
struct SiBytesAmount(u64);
|
||||
|
||||
impl std::fmt::Display for SiBytesAmount {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.0 < 1024 {
|
||||
return write!(f, "{}B", self.0);
|
||||
}
|
||||
|
||||
let mut tmp = self.0;
|
||||
let mut ch = 0;
|
||||
let suffixes = b"KMGTPE";
|
||||
|
||||
while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
|
||||
tmp /= 1024;
|
||||
ch += 1;
|
||||
}
|
||||
|
||||
let ch = suffixes[ch] as char;
|
||||
|
||||
write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for EvictionSummary {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
// wasteful, but it's for testing
|
||||
|
||||
let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
|
||||
|
||||
for (tenant_shard_id, count) in &self.evicted_per_tenant {
|
||||
sorted
|
||||
.entry(count.count)
|
||||
.or_default()
|
||||
.push((*tenant_shard_id, count.file_sizes));
|
||||
}
|
||||
|
||||
let total_file_sizes = SiBytesAmount(self.total.file_sizes);
|
||||
|
||||
writeln!(
|
||||
f,
|
||||
"selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
|
||||
self.total.count, self.last_absolute, self.last_relative,
|
||||
)?;
|
||||
|
||||
for (count, per_tenant) in sorted.iter().rev().take(10) {
|
||||
write!(f, "- {count} layers: ")?;
|
||||
|
||||
if per_tenant.len() < 3 {
|
||||
for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
let bytes = SiBytesAmount(*bytes);
|
||||
write!(f, "{tenant_shard_id} ({bytes})")?;
|
||||
}
|
||||
} else {
|
||||
let num_tenants = per_tenant.len();
|
||||
let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
|
||||
let total_bytes = SiBytesAmount(total_bytes);
|
||||
let layers = num_tenants * count;
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{num_tenants} tenants {total_bytes} in total {layers} layers",
|
||||
)?;
|
||||
}
|
||||
|
||||
writeln!(f)?;
|
||||
}
|
||||
|
||||
if sorted.len() > 10 {
|
||||
let (rem_count, rem_bytes) = sorted
|
||||
.iter()
|
||||
.rev()
|
||||
.map(|(count, per_tenant)| {
|
||||
(
|
||||
count,
|
||||
per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
|
||||
)
|
||||
})
|
||||
.fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
|
||||
let rem_bytes = SiBytesAmount(rem_bytes);
|
||||
writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
pub fn into_inner(self) -> f32 {
|
||||
self.into()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1336,3 +1187,40 @@ mod filesystem_level_usage {
|
||||
assert!(!usage.has_pressure());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Evaluates `relative_last_activity(len, i)` for every `i` in `0..len`.
    fn relative_activities(order: &EvictionOrder, len: usize) -> Vec<f32> {
        (0..len)
            .map(|index| order.relative_last_activity(len, index).into_inner())
            .collect()
    }

    /// True when every element is strictly smaller than the one before it.
    fn strictly_decreasing(values: &[f32]) -> bool {
        values.windows(2).all(|pair| pair[1] < pair[0])
    }

    /// With `highest_layer_count_loses_first: false` the values span the full 1.0..=0.0 range.
    #[test]
    fn relative_equal_bounds() {
        let order = EvictionOrder::RelativeAccessed {
            highest_layer_count_loses_first: false,
        };

        let v = relative_activities(&order, 10);

        assert_eq!(v.first(), Some(&1.0));
        assert_eq!(v.last(), Some(&0.0));
        assert!(strictly_decreasing(&v));
    }

    /// With `highest_layer_count_loses_first: true` the smallest value for 10 layers is 0.1.
    #[test]
    fn relative_spare_bounds() {
        let order = EvictionOrder::RelativeAccessed {
            highest_layer_count_loses_first: true,
        };

        let v = relative_activities(&order, 10);

        assert_eq!(v.first(), Some(&1.0));
        assert_eq!(v.last(), Some(&0.1));
        assert!(strictly_decreasing(&v));
    }
}
|
||||
|
||||
@@ -419,12 +419,6 @@ paths:
|
||||
type: string
|
||||
format: date-time
|
||||
description: A timestamp to get the LSN
|
||||
- name: version
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
description: The version of the endpoint to use
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
@@ -674,6 +668,10 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant is now in requested state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLocationConfigResponse"
|
||||
"503":
|
||||
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
|
||||
content:
|
||||
@@ -1426,6 +1424,27 @@ components:
|
||||
$ref: '#/components/schemas/SecondaryConfig'
|
||||
tenant_conf:
|
||||
$ref: '#/components/schemas/TenantConfig'
|
||||
TenantLocationConfigResponse:
|
||||
type: object
|
||||
required:
|
||||
- shards
|
||||
properties:
|
||||
shards:
|
||||
description: Pageservers where this tenant's shards are attached. Not populated for secondary locations.
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantShardLocation"
|
||||
TenantShardLocation:
|
||||
type: object
|
||||
required:
|
||||
- node_id
|
||||
- shard_id
|
||||
properties:
|
||||
node_id:
|
||||
description: Pageserver node ID where this shard is attached
|
||||
type: integer
|
||||
shard_id:
|
||||
description: Tenant shard ID of the shard
|
||||
type: string
|
||||
SecondaryConfig:
|
||||
type: object
|
||||
properties:
|
||||
|
||||
@@ -17,6 +17,8 @@ use metrics::launch_timestamp::LaunchTimestamp;
|
||||
use pageserver_api::models::LocationConfigListResponse;
|
||||
use pageserver_api::models::ShardParameters;
|
||||
use pageserver_api::models::TenantDetails;
|
||||
use pageserver_api::models::TenantLocationConfigResponse;
|
||||
use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
@@ -1356,7 +1358,7 @@ async fn put_tenant_location_config_handler(
|
||||
let location_conf =
|
||||
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
|
||||
|
||||
state
|
||||
let attached = state
|
||||
.tenant_manager
|
||||
.upsert_location(
|
||||
tenant_shard_id,
|
||||
@@ -1365,7 +1367,8 @@ async fn put_tenant_location_config_handler(
|
||||
tenant::SpawnMode::Normal,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
.await?
|
||||
.is_some();
|
||||
|
||||
if let Some(_flush_ms) = flush {
|
||||
match state
|
||||
@@ -1384,7 +1387,18 @@ async fn put_tenant_location_config_handler(
|
||||
tracing::info!("No flush requested when configuring");
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
// This API returns a vector of pageservers where the tenant is attached: this is
|
||||
// primarily for use in the sharding service. For compatibility, we also return this
|
||||
// when called directly on a pageserver, but the payload is always zero or one shards.
|
||||
let mut response = TenantLocationConfigResponse { shards: Vec::new() };
|
||||
if attached {
|
||||
response.shards.push(TenantShardLocation {
|
||||
shard_id: tenant_shard_id,
|
||||
node_id: state.conf.id,
|
||||
})
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
async fn list_location_config_handler(
|
||||
|
||||
@@ -368,6 +368,16 @@ impl From<WaitLsnError> for PageStreamError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<WaitLsnError> for QueryError {
|
||||
fn from(value: WaitLsnError) -> Self {
|
||||
match value {
|
||||
e @ WaitLsnError::Timeout(_) => Self::Other(anyhow::Error::new(e)),
|
||||
WaitLsnError::Shutdown => Self::Shutdown,
|
||||
WaitLsnError::BadState => Self::Reconnect,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServerHandler {
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -1139,7 +1149,7 @@ impl PageServerHandler {
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<()>
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
@@ -1404,7 +1414,7 @@ where
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
anyhow::Ok(())
|
||||
Result::<(), QueryError>::Ok(())
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
@@ -1678,6 +1688,7 @@ impl From<GetActiveTenantError> for QueryError {
|
||||
| GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
|
||||
QueryError::Shutdown
|
||||
}
|
||||
e @ GetActiveTenantError::NotFound(_) => QueryError::NotFound(format!("{e}").into()),
|
||||
e => QueryError::Other(anyhow::anyhow!(e)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3778,6 +3778,11 @@ async fn run_initdb(
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.env("DYLD_LIBRARY_PATH", &initdb_lib_dir)
|
||||
.stdin(std::process::Stdio::null())
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
// we would be interested in the stderr output, if there was any
|
||||
.stderr(std::process::Stdio::piped())
|
||||
.spawn()?;
|
||||
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
|
||||
@@ -51,7 +51,10 @@ use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use std::collections::VecDeque;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::iter::Peekable;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -144,11 +147,221 @@ impl Drop for BatchedUpdates<'_> {
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
#[derive(Eq, PartialEq, Debug)]
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<PersistentLayerDesc>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
pub struct OrderedSearchResult(SearchResult);
|
||||
|
||||
impl Ord for OrderedSearchResult {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
self.0.lsn_floor.cmp(&other.0.lsn_floor)
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for OrderedSearchResult {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for OrderedSearchResult {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.0.lsn_floor == other.0.lsn_floor
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for OrderedSearchResult {}
|
||||
|
||||
pub struct RangeSearchResult {
|
||||
pub found: BTreeMap<OrderedSearchResult, KeySpaceAccum>,
|
||||
pub not_found: KeySpaceAccum,
|
||||
}
|
||||
|
||||
impl RangeSearchResult {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
found: BTreeMap::new(),
|
||||
not_found: KeySpaceAccum::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Collector for results of range search queries on the LayerMap.
|
||||
/// It should be provided with two iterators for the delta and image coverage
|
||||
/// that contain all the changes for layers which intersect the range.
|
||||
struct RangeSearchCollector<Iter>
|
||||
where
|
||||
Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
|
||||
{
|
||||
delta_coverage: Peekable<Iter>,
|
||||
image_coverage: Peekable<Iter>,
|
||||
key_range: Range<Key>,
|
||||
end_lsn: Lsn,
|
||||
|
||||
current_delta: Option<Arc<PersistentLayerDesc>>,
|
||||
current_image: Option<Arc<PersistentLayerDesc>>,
|
||||
|
||||
result: RangeSearchResult,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum NextLayerType {
|
||||
Delta(i128),
|
||||
Image(i128),
|
||||
Both(i128),
|
||||
}
|
||||
|
||||
impl NextLayerType {
|
||||
fn next_change_at_key(&self) -> Key {
|
||||
match self {
|
||||
NextLayerType::Delta(at) => Key::from_i128(*at),
|
||||
NextLayerType::Image(at) => Key::from_i128(*at),
|
||||
NextLayerType::Both(at) => Key::from_i128(*at),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Iter> RangeSearchCollector<Iter>
|
||||
where
|
||||
Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
|
||||
{
|
||||
fn new(
|
||||
key_range: Range<Key>,
|
||||
end_lsn: Lsn,
|
||||
delta_coverage: Iter,
|
||||
image_coverage: Iter,
|
||||
) -> Self {
|
||||
Self {
|
||||
delta_coverage: delta_coverage.peekable(),
|
||||
image_coverage: image_coverage.peekable(),
|
||||
key_range,
|
||||
end_lsn,
|
||||
current_delta: None,
|
||||
current_image: None,
|
||||
result: RangeSearchResult::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the collector. Collection is implemented via a two pointer algorithm.
|
||||
/// One pointer tracks the start of the current range and the other tracks
|
||||
/// the beginning of the next range which will overlap with the next change
|
||||
/// in coverage across both image and delta.
|
||||
fn collect(mut self) -> RangeSearchResult {
|
||||
let next_layer_type = self.choose_next_layer_type();
|
||||
let mut current_range_start = match next_layer_type {
|
||||
None => {
|
||||
// No changes for the range
|
||||
self.pad_range(self.key_range.clone());
|
||||
return self.result;
|
||||
}
|
||||
Some(layer_type) if self.key_range.end <= layer_type.next_change_at_key() => {
|
||||
// Changes only after the end of the range
|
||||
self.pad_range(self.key_range.clone());
|
||||
return self.result;
|
||||
}
|
||||
Some(layer_type) => {
|
||||
// Changes for the range exist. Record anything before the first
|
||||
// coverage change as not found.
|
||||
let coverage_start = layer_type.next_change_at_key();
|
||||
let range_before = self.key_range.start..coverage_start;
|
||||
self.pad_range(range_before);
|
||||
|
||||
self.advance(&layer_type);
|
||||
coverage_start
|
||||
}
|
||||
};
|
||||
|
||||
while current_range_start < self.key_range.end {
|
||||
let next_layer_type = self.choose_next_layer_type();
|
||||
match next_layer_type {
|
||||
Some(t) => {
|
||||
let current_range_end = t.next_change_at_key();
|
||||
self.add_range(current_range_start..current_range_end);
|
||||
current_range_start = current_range_end;
|
||||
|
||||
self.advance(&t);
|
||||
}
|
||||
None => {
|
||||
self.add_range(current_range_start..self.key_range.end);
|
||||
current_range_start = self.key_range.end;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.result
|
||||
}
|
||||
|
||||
/// Mark a range as not found (i.e. no layers intersect it)
|
||||
fn pad_range(&mut self, key_range: Range<Key>) {
|
||||
if !key_range.is_empty() {
|
||||
self.result.not_found.add_range(key_range);
|
||||
}
|
||||
}
|
||||
|
||||
/// Select the appropriate layer for the given range and update
|
||||
/// the collector.
|
||||
fn add_range(&mut self, covered_range: Range<Key>) {
|
||||
let selected = LayerMap::select_layer(
|
||||
self.current_delta.clone(),
|
||||
self.current_image.clone(),
|
||||
self.end_lsn,
|
||||
);
|
||||
|
||||
match selected {
|
||||
Some(search_result) => self
|
||||
.result
|
||||
.found
|
||||
.entry(OrderedSearchResult(search_result))
|
||||
.or_default()
|
||||
.add_range(covered_range),
|
||||
None => self.pad_range(covered_range),
|
||||
}
|
||||
}
|
||||
|
||||
/// Move to the next coverage change.
|
||||
fn advance(&mut self, layer_type: &NextLayerType) {
|
||||
match layer_type {
|
||||
NextLayerType::Delta(_) => {
|
||||
let (_, layer) = self.delta_coverage.next().unwrap();
|
||||
self.current_delta = layer;
|
||||
}
|
||||
NextLayerType::Image(_) => {
|
||||
let (_, layer) = self.image_coverage.next().unwrap();
|
||||
self.current_image = layer;
|
||||
}
|
||||
NextLayerType::Both(_) => {
|
||||
let (_, image_layer) = self.image_coverage.next().unwrap();
|
||||
let (_, delta_layer) = self.delta_coverage.next().unwrap();
|
||||
|
||||
self.current_image = image_layer;
|
||||
self.current_delta = delta_layer;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pick the next coverage change: the one at the lesser key or both if they're aligned.
|
||||
fn choose_next_layer_type(&mut self) -> Option<NextLayerType> {
|
||||
let next_delta_at = self.delta_coverage.peek().map(|(key, _)| key);
|
||||
let next_image_at = self.image_coverage.peek().map(|(key, _)| key);
|
||||
|
||||
match (next_delta_at, next_image_at) {
|
||||
(None, None) => None,
|
||||
(Some(next_delta_at), None) => Some(NextLayerType::Delta(*next_delta_at)),
|
||||
(None, Some(next_image_at)) => Some(NextLayerType::Image(*next_image_at)),
|
||||
(Some(next_delta_at), Some(next_image_at)) if next_image_at < next_delta_at => {
|
||||
Some(NextLayerType::Image(*next_image_at))
|
||||
}
|
||||
(Some(next_delta_at), Some(next_image_at)) if next_delta_at < next_image_at => {
|
||||
Some(NextLayerType::Delta(*next_delta_at))
|
||||
}
|
||||
(Some(next_delta_at), Some(_)) => Some(NextLayerType::Both(*next_delta_at)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Find the latest layer (by lsn.end) that covers the given
|
||||
@@ -186,7 +399,18 @@ impl LayerMap {
|
||||
let latest_delta = version.delta_coverage.query(key.to_i128());
|
||||
let latest_image = version.image_coverage.query(key.to_i128());
|
||||
|
||||
match (latest_delta, latest_image) {
|
||||
Self::select_layer(latest_delta, latest_image, end_lsn)
|
||||
}
|
||||
|
||||
fn select_layer(
|
||||
delta_layer: Option<Arc<PersistentLayerDesc>>,
|
||||
image_layer: Option<Arc<PersistentLayerDesc>>,
|
||||
end_lsn: Lsn,
|
||||
) -> Option<SearchResult> {
|
||||
assert!(delta_layer.as_ref().map_or(true, |l| l.is_delta()));
|
||||
assert!(image_layer.as_ref().map_or(true, |l| !l.is_delta()));
|
||||
|
||||
match (delta_layer, image_layer) {
|
||||
(None, None) => None,
|
||||
(None, Some(image)) => {
|
||||
let lsn_floor = image.get_lsn_range().start;
|
||||
@@ -223,6 +447,17 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> Option<RangeSearchResult> {
|
||||
let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
|
||||
|
||||
let raw_range = key_range.start.to_i128()..key_range.end.to_i128();
|
||||
let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
|
||||
let image_changes = version.image_coverage.range_overlaps(&raw_range);
|
||||
|
||||
let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
|
||||
Some(collector.collect())
|
||||
}
|
||||
|
||||
/// Start a batch of updates, applied on drop
|
||||
pub fn batch_update(&mut self) -> BatchedUpdates<'_> {
|
||||
BatchedUpdates { layer_map: self }
|
||||
@@ -631,3 +866,126 @@ impl LayerMap {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[derive(Clone)]
|
||||
struct LayerDesc {
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
is_delta: bool,
|
||||
}
|
||||
|
||||
fn create_layer_map(layers: Vec<LayerDesc>) -> LayerMap {
|
||||
let mut layer_map = LayerMap::default();
|
||||
|
||||
for layer in layers {
|
||||
layer_map.insert_historic_noflush(PersistentLayerDesc::new_test(
|
||||
layer.key_range,
|
||||
layer.lsn_range,
|
||||
layer.is_delta,
|
||||
));
|
||||
}
|
||||
|
||||
layer_map.flush_updates();
|
||||
layer_map
|
||||
}
|
||||
|
||||
fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
|
||||
assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
|
||||
let lhs: Vec<_> = lhs
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
|
||||
.collect();
|
||||
let rhs: Vec<_> = rhs
|
||||
.found
|
||||
.into_iter()
|
||||
.map(|(search_result, accum)| (search_result.0, accum.to_keyspace()))
|
||||
.collect();
|
||||
|
||||
assert_eq!(lhs, rhs);
|
||||
}
|
||||
|
||||
fn brute_force_range_search(
|
||||
layer_map: &LayerMap,
|
||||
key_range: Range<Key>,
|
||||
end_lsn: Lsn,
|
||||
) -> RangeSearchResult {
|
||||
let mut range_search_result = RangeSearchResult::new();
|
||||
|
||||
let mut key = key_range.start;
|
||||
while key != key_range.end {
|
||||
let res = layer_map.search(key, end_lsn);
|
||||
match res {
|
||||
Some(res) => {
|
||||
range_search_result
|
||||
.found
|
||||
.entry(OrderedSearchResult(res))
|
||||
.or_default()
|
||||
.add_key(key);
|
||||
}
|
||||
None => {
|
||||
range_search_result.not_found.add_key(key);
|
||||
}
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
range_search_result
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ranged_search_on_empty_layer_map() {
|
||||
let layer_map = LayerMap::default();
|
||||
let range = Key::from_i128(100)..Key::from_i128(200);
|
||||
|
||||
let res = layer_map.range_search(range, Lsn(100));
|
||||
assert!(res.is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ranged_search() {
|
||||
let layers = vec![
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(15)..Key::from_i128(50),
|
||||
lsn_range: Lsn(0)..Lsn(5),
|
||||
is_delta: false,
|
||||
},
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(10)..Key::from_i128(20),
|
||||
lsn_range: Lsn(5)..Lsn(20),
|
||||
is_delta: true,
|
||||
},
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(15)..Key::from_i128(25),
|
||||
lsn_range: Lsn(20)..Lsn(30),
|
||||
is_delta: true,
|
||||
},
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(35)..Key::from_i128(40),
|
||||
lsn_range: Lsn(25)..Lsn(35),
|
||||
is_delta: true,
|
||||
},
|
||||
LayerDesc {
|
||||
key_range: Key::from_i128(35)..Key::from_i128(40),
|
||||
lsn_range: Lsn(35)..Lsn(40),
|
||||
is_delta: false,
|
||||
},
|
||||
];
|
||||
|
||||
let layer_map = create_layer_map(layers.clone());
|
||||
for start in 0..60 {
|
||||
for end in (start + 1)..60 {
|
||||
let range = Key::from_i128(start)..Key::from_i128(end);
|
||||
let result = layer_map.range_search(range.clone(), Lsn(100)).unwrap();
|
||||
let expected = brute_force_range_search(&layer_map, range, Lsn(100));
|
||||
|
||||
assert_range_search_result_eq(result, expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,6 +129,42 @@ impl<Value: Clone> LayerCoverage<Value> {
|
||||
.map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone())))
|
||||
}
|
||||
|
||||
/// Returns an iterator which includes all coverage changes for layers that intersect
|
||||
/// with the provided range.
|
||||
pub fn range_overlaps(
|
||||
&self,
|
||||
key_range: &Range<i128>,
|
||||
) -> impl Iterator<Item = (i128, Option<Value>)> + '_
|
||||
where
|
||||
Value: Eq,
|
||||
{
|
||||
let first_change = self.query(key_range.start);
|
||||
match first_change {
|
||||
Some(change) => {
|
||||
// If the start of the range is covered, we have to deal with two cases:
|
||||
// 1. Start of the range is aligned with the start of a layer.
|
||||
// In this case the return of `self.range` will contain the layer which aligns with the start of the key range.
|
||||
// We advance said iterator to avoid duplicating the first change.
|
||||
// 2. Start of the range is not aligned with the start of a layer.
|
||||
let range = key_range.start..key_range.end;
|
||||
let mut range_coverage = self.range(range).peekable();
|
||||
if range_coverage
|
||||
.peek()
|
||||
.is_some_and(|c| c.1.as_ref() == Some(&change))
|
||||
{
|
||||
range_coverage.next();
|
||||
}
|
||||
itertools::Either::Left(
|
||||
std::iter::once((key_range.start, Some(change))).chain(range_coverage),
|
||||
)
|
||||
}
|
||||
None => {
|
||||
let range = key_range.start..key_range.end;
|
||||
let coverage = self.range(range);
|
||||
itertools::Either::Right(coverage)
|
||||
}
|
||||
}
|
||||
}
|
||||
/// O(1) clone
|
||||
pub fn clone(&self) -> Self {
|
||||
Self {
|
||||
|
||||
@@ -55,13 +55,13 @@ impl PersistentLayerDesc {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn new_test(key_range: Range<Key>) -> Self {
|
||||
pub fn new_test(key_range: Range<Key>, lsn_range: Range<Lsn>, is_delta: bool) -> Self {
|
||||
Self {
|
||||
tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
|
||||
timeline_id: TimelineId::generate(),
|
||||
key_range,
|
||||
lsn_range: Lsn(0)..Lsn(1),
|
||||
is_delta: false,
|
||||
lsn_range,
|
||||
is_delta,
|
||||
file_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1363,16 +1363,22 @@ impl WalIngest {
|
||||
self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
|
||||
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
|
||||
mbr.xid
|
||||
let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| {
|
||||
if let Some(max_xid) = acc {
|
||||
if mbr.xid.wrapping_sub(max_xid) as i32 > 0 {
|
||||
Some(mbr.xid)
|
||||
} else {
|
||||
acc
|
||||
}
|
||||
} else {
|
||||
acc
|
||||
Some(mbr.xid)
|
||||
}
|
||||
});
|
||||
|
||||
if self.checkpoint.update_next_xid(max_mbr_xid) {
|
||||
self.checkpoint_modified = true;
|
||||
if let Some(max_xid) = max_mbr_xid {
|
||||
if self.checkpoint.update_next_xid(max_xid) {
|
||||
self.checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ struct ProcessOutput {
|
||||
pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
last_successful_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
|
||||
}
|
||||
|
||||
@@ -193,7 +193,7 @@ impl PostgresRedoManager {
|
||||
PostgresRedoManager {
|
||||
tenant_shard_id,
|
||||
conf,
|
||||
last_redo_at: std::sync::Mutex::default(),
|
||||
last_successful_redo_at: std::sync::Mutex::default(),
|
||||
redo_process: RwLock::new(None),
|
||||
}
|
||||
}
|
||||
@@ -202,9 +202,21 @@ impl PostgresRedoManager {
|
||||
/// rely on our owner calling this function periodically in its own housekeeping
|
||||
/// loops.
|
||||
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
|
||||
if let Ok(g) = self.last_redo_at.try_lock() {
|
||||
if let Some(last_redo_at) = *g {
|
||||
if last_redo_at.elapsed() >= idle_timeout {
|
||||
if let Ok(g) = self.last_successful_redo_at.try_lock() {
|
||||
if let Some(last_successful_redo_at) = *g {
|
||||
// Kill the walredo process if
|
||||
// - it has been unused for `idle_timeout`
|
||||
// - it has been used, but, without success.
|
||||
// The former is just good housekeeping.
|
||||
// The latter adds robustness for the case where something is wrong
|
||||
// with the walredo process.
|
||||
//
|
||||
// Note that we don't want to kill the process immediately on each redo failure.
|
||||
// The reason is that the redo failure could be caused by corrupted or malicious data.
|
||||
// We don't want to get into a kill-respawn loop in that case.
|
||||
// So, we piggy-back on the quiescing mechanism,
|
||||
// resulting in a max kill-respawn frequency of `1/idle_timeout`.
|
||||
if last_successful_redo_at.elapsed() >= idle_timeout {
|
||||
drop(g);
|
||||
let mut guard = self.redo_process.write().unwrap();
|
||||
*guard = None;
|
||||
@@ -227,8 +239,32 @@ impl PostgresRedoManager {
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
let res = self.apply_batch_postgres0(
|
||||
key,
|
||||
lsn,
|
||||
base_img,
|
||||
base_img_lsn,
|
||||
records,
|
||||
wal_redo_timeout,
|
||||
pg_version,
|
||||
);
|
||||
if res.is_ok() {
|
||||
*self.last_successful_redo_at.lock().unwrap() = Some(Instant::now());
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn apply_batch_postgres0(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
base_img_lsn: Lsn,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
|
||||
56
patches/pgvector.patch
Normal file
56
patches/pgvector.patch
Normal file
@@ -0,0 +1,56 @@
|
||||
From 5518a806a70e7f40d5054a762ccda7d5e6b0d31c Mon Sep 17 00:00:00 2001
|
||||
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
|
||||
Date: Tue, 30 Jan 2024 14:33:00 +0200
|
||||
Subject: [PATCH] Make v0.6.0 work with Neon
|
||||
|
||||
Now that the WAL-logging happens as a separate step at the end of the
|
||||
build, we need a few neon-specific hints to make it work.
|
||||
---
|
||||
src/hnswbuild.c | 28 ++++++++++++++++++++++++++++
|
||||
1 file changed, 28 insertions(+)
|
||||
|
||||
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
|
||||
index 680789ba9044900eac9321844ee2a808a4a2ed12..41c5b709bcb2367ac8b8c498788ecac4c1148b74 100644
|
||||
--- a/src/hnswbuild.c
|
||||
+++ b/src/hnswbuild.c
|
||||
@@ -1089,13 +1089,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
|
||||
SeedRandom(42);
|
||||
#endif
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_start_unlogged_build(index->rd_smgr);
|
||||
+#endif
|
||||
+
|
||||
InitBuildState(buildstate, heap, index, indexInfo, forkNum);
|
||||
|
||||
BuildGraph(buildstate, forkNum);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_finish_unlogged_build_phase_1(index->rd_smgr);
|
||||
+#endif
|
||||
+
|
||||
if (RelationNeedsWAL(index))
|
||||
+ {
|
||||
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
|
||||
|
||||
+#ifdef NEON_SMGR
|
||||
+ {
|
||||
+#if PG_VERSION_NUM >= 160000
|
||||
+ RelFileLocator rlocator = index->rd_smgr->smgr_rlocator.locator;
|
||||
+#else
|
||||
+ RelFileNode rlocator = index->rd_smgr->smgr_rnode.node;
|
||||
+#endif
|
||||
+
|
||||
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
|
||||
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
|
||||
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
|
||||
+ }
|
||||
+#endif
|
||||
+ }
|
||||
+
|
||||
+#ifdef NEON_SMGR
|
||||
+ smgr_end_unlogged_build(index->rd_smgr);
|
||||
+#endif
|
||||
+
|
||||
FreeBuildState(buildstate);
|
||||
}
|
||||
@@ -328,18 +328,14 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
|
||||
now = GetCurrentTimestamp();
|
||||
us_since_last_connect = now - last_connect_time;
|
||||
if (us_since_last_connect < delay_us)
|
||||
if (us_since_last_connect < MAX_RECONNECT_INTERVAL_USEC)
|
||||
{
|
||||
pg_usleep(delay_us - us_since_last_connect);
|
||||
pg_usleep(delay_us);
|
||||
delay_us *= 2;
|
||||
if (delay_us > MAX_RECONNECT_INTERVAL_USEC)
|
||||
delay_us = MAX_RECONNECT_INTERVAL_USEC;
|
||||
last_connect_time = GetCurrentTimestamp();
|
||||
}
|
||||
else
|
||||
{
|
||||
delay_us = MIN_RECONNECT_INTERVAL_USEC;
|
||||
last_connect_time = now;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -366,6 +362,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
|
||||
values[n] = NULL;
|
||||
n++;
|
||||
conn = PQconnectdbParams(keywords, values, 1);
|
||||
last_connect_time = GetCurrentTimestamp();
|
||||
|
||||
if (PQstatus(conn) == CONNECTION_BAD)
|
||||
{
|
||||
|
||||
157
poetry.lock
generated
157
poetry.lock
generated
@@ -2,87 +2,87 @@
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
version = "3.9.0"
|
||||
version = "3.9.2"
|
||||
description = "Async http client/server framework (asyncio)"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"},
|
||||
{file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"},
|
||||
{file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"},
|
||||
{file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"},
|
||||
{file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"},
|
||||
{file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"},
|
||||
{file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"},
|
||||
{file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2043,6 +2043,7 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@@ -2668,4 +2669,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "9cf2734cafd5b6963165d398f1b24621193d5284d0bc7cc26a720a014f523860"
|
||||
content-hash = "e99954cbbfef8dcc5e13cea7103c87657639a192f2372983bdb8c5d624c2e447"
|
||||
|
||||
@@ -62,6 +62,8 @@ socket2.workspace = true
|
||||
sync_wrapper.workspace = true
|
||||
task-local-extensions.workspace = true
|
||||
thiserror.workspace = true
|
||||
tikv-jemallocator.workspace = true
|
||||
tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] }
|
||||
tls-listener.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
|
||||
@@ -190,7 +190,10 @@ async fn auth_quirks(
|
||||
Err(info) => {
|
||||
let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer)
|
||||
.await?;
|
||||
ctx.set_endpoint_id(Some(res.info.endpoint.clone()));
|
||||
|
||||
ctx.set_endpoint_id(res.info.endpoint.clone());
|
||||
tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint));
|
||||
|
||||
(res.info, Some(res.keys))
|
||||
}
|
||||
Ok(info) => (info, None),
|
||||
@@ -271,19 +274,12 @@ async fn authenticate_with_secret(
|
||||
classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await
|
||||
}
|
||||
|
||||
/// Authenticate the user and then wake a compute (or retrieve an existing compute session from cache)
|
||||
/// only if authentication was successfuly.
|
||||
async fn auth_and_wake_compute(
|
||||
/// wake a compute (or retrieve an existing compute session from cache)
|
||||
async fn wake_compute(
|
||||
ctx: &mut RequestMonitoring,
|
||||
api: &impl console::Api,
|
||||
user_info: ComputeUserInfoMaybeEndpoint,
|
||||
client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
|
||||
allow_cleartext: bool,
|
||||
config: &'static AuthenticationConfig,
|
||||
compute_credentials: ComputeCredentials<ComputeCredentialKeys>,
|
||||
) -> auth::Result<(CachedNodeInfo, ComputeUserInfo)> {
|
||||
let compute_credentials =
|
||||
auth_quirks(ctx, api, user_info, client, allow_cleartext, config).await?;
|
||||
|
||||
let mut num_retries = 0;
|
||||
let mut node = loop {
|
||||
let wake_res = api.wake_compute(ctx, &compute_credentials.info).await;
|
||||
@@ -358,16 +354,16 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> {
|
||||
"performing authentication using the console"
|
||||
);
|
||||
|
||||
let (cache_info, user_info) =
|
||||
auth_and_wake_compute(ctx, &*api, user_info, client, allow_cleartext, config)
|
||||
.await?;
|
||||
let compute_credentials =
|
||||
auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?;
|
||||
let (cache_info, user_info) = wake_compute(ctx, &*api, compute_credentials).await?;
|
||||
(cache_info, BackendType::Console(api, user_info))
|
||||
}
|
||||
// NOTE: this auth backend doesn't use client credentials.
|
||||
Link(url) => {
|
||||
info!("performing link authentication");
|
||||
|
||||
let node_info = link::authenticate(&url, client).await?;
|
||||
let node_info = link::authenticate(ctx, &url, client).await?;
|
||||
|
||||
(
|
||||
CachedNodeInfo::new_uncached(node_info),
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use crate::{
|
||||
auth, compute,
|
||||
console::{self, provider::NodeInfo},
|
||||
context::RequestMonitoring,
|
||||
error::UserFacingError,
|
||||
stream::PqStream,
|
||||
waiters,
|
||||
@@ -54,6 +55,7 @@ pub fn new_psql_session_id() -> String {
|
||||
}
|
||||
|
||||
pub(super) async fn authenticate(
|
||||
ctx: &mut RequestMonitoring,
|
||||
link_uri: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> auth::Result<NodeInfo> {
|
||||
@@ -94,6 +96,10 @@ pub(super) async fn authenticate(
|
||||
.dbname(&db_info.dbname)
|
||||
.user(&db_info.user);
|
||||
|
||||
ctx.set_user(db_info.user.into());
|
||||
ctx.set_project(db_info.aux.clone());
|
||||
tracing::Span::current().record("ep", &tracing::field::display(&db_info.aux.endpoint_id));
|
||||
|
||||
// Backwards compatibility. pg_sni_proxy uses "--" in domain names
|
||||
// while direct connections do not. Once we migrate to pg_sni_proxy
|
||||
// everywhere, we can remove this.
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
|
||||
use crate::{
|
||||
auth::password_hack::parse_endpoint_param, context::RequestMonitoring, error::UserFacingError,
|
||||
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, EndpointId, RoleName,
|
||||
metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::NeonOptions, serverless::SERVERLESS_DRIVER_SNI,
|
||||
EndpointId, RoleName,
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use pq_proto::StartupMessageParams;
|
||||
@@ -54,10 +55,10 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn endpoint_sni<'a>(
|
||||
sni: &'a str,
|
||||
pub fn endpoint_sni(
|
||||
sni: &str,
|
||||
common_names: &HashSet<String>,
|
||||
) -> Result<&'a str, ComputeUserInfoParseError> {
|
||||
) -> Result<Option<EndpointId>, ComputeUserInfoParseError> {
|
||||
let Some((subdomain, common_name)) = sni.split_once('.') else {
|
||||
return Err(ComputeUserInfoParseError::UnknownCommonName { cn: sni.into() });
|
||||
};
|
||||
@@ -66,7 +67,10 @@ pub fn endpoint_sni<'a>(
|
||||
cn: common_name.into(),
|
||||
});
|
||||
}
|
||||
Ok(subdomain)
|
||||
if subdomain == SERVERLESS_DRIVER_SNI {
|
||||
return Ok(None);
|
||||
}
|
||||
Ok(Some(EndpointId::from(subdomain)))
|
||||
}
|
||||
|
||||
impl ComputeUserInfoMaybeEndpoint {
|
||||
@@ -85,7 +89,6 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
// record the values if we have them
|
||||
ctx.set_application(params.get("application_name").map(SmolStr::from));
|
||||
ctx.set_user(user.clone());
|
||||
ctx.set_endpoint_id(sni.map(EndpointId::from));
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
let endpoint_option = params
|
||||
@@ -103,7 +106,7 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
|
||||
let endpoint_from_domain = if let Some(sni_str) = sni {
|
||||
if let Some(cn) = common_names {
|
||||
Some(EndpointId::from(endpoint_sni(sni_str, cn)?))
|
||||
endpoint_sni(sni_str, cn)?
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -117,13 +120,18 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
Some(Err(InconsistentProjectNames { domain, option }))
|
||||
}
|
||||
// Invariant: project name may not contain certain characters.
|
||||
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
|
||||
(a, b) => a.or(b).map(|name| match project_name_valid(name.as_ref()) {
|
||||
false => Err(MalformedProjectName(name)),
|
||||
true => Ok(name),
|
||||
}),
|
||||
}
|
||||
.transpose()?;
|
||||
|
||||
if let Some(ep) = &endpoint {
|
||||
ctx.set_endpoint_id(ep.clone());
|
||||
tracing::Span::current().record("ep", &tracing::field::display(ep));
|
||||
}
|
||||
|
||||
info!(%user, project = endpoint.as_deref(), "credentials");
|
||||
if sni.is_some() {
|
||||
info!("Connection with sni");
|
||||
@@ -146,7 +154,7 @@ impl ComputeUserInfoMaybeEndpoint {
|
||||
|
||||
Ok(Self {
|
||||
user,
|
||||
endpoint_id: endpoint.map(EndpointId::from),
|
||||
endpoint_id: endpoint,
|
||||
options,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -272,5 +272,5 @@ async fn handle_client(
|
||||
let client = tokio::net::TcpStream::connect(destination).await?;
|
||||
|
||||
let metrics_aux: MetricsAuxInfo = Default::default();
|
||||
proxy::proxy::proxy_pass(ctx, tls_stream, client, metrics_aux).await
|
||||
proxy::proxy::passthrough::proxy_pass(ctx, tls_stream, client, metrics_aux).await
|
||||
}
|
||||
|
||||
@@ -32,6 +32,9 @@ project_build_tag!(BUILD_TAG);
|
||||
|
||||
use clap::{Parser, ValueEnum};
|
||||
|
||||
#[global_allocator]
|
||||
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
|
||||
|
||||
#[derive(Clone, Debug, ValueEnum)]
|
||||
enum AuthBackend {
|
||||
Console,
|
||||
@@ -187,6 +190,13 @@ async fn main() -> anyhow::Result<()> {
|
||||
info!("Build_tag: {BUILD_TAG}");
|
||||
::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
|
||||
match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
|
||||
Ok(t) => {
|
||||
t.start();
|
||||
}
|
||||
Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
|
||||
}
|
||||
|
||||
let args = ProxyCliArgs::parse();
|
||||
let config = build_config(&args)?;
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::Context;
|
||||
use dashmap::DashMap;
|
||||
use pq_proto::CancelKeyData;
|
||||
use std::net::SocketAddr;
|
||||
use std::{net::SocketAddr, sync::Arc};
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_postgres::{CancelToken, NoTls};
|
||||
use tracing::info;
|
||||
@@ -25,39 +25,31 @@ impl CancelMap {
|
||||
}
|
||||
|
||||
/// Run async action within an ephemeral session identified by [`CancelKeyData`].
|
||||
pub async fn with_session<'a, F, R, V>(&'a self, f: F) -> anyhow::Result<V>
|
||||
where
|
||||
F: FnOnce(Session<'a>) -> R,
|
||||
R: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
pub fn get_session(self: Arc<Self>) -> Session {
|
||||
// HACK: We'd rather get the real backend_pid but tokio_postgres doesn't
|
||||
// expose it and we don't want to do another roundtrip to query
|
||||
// for it. The client will be able to notice that this is not the
|
||||
// actual backend_pid, but backend_pid is not used for anything
|
||||
// so it doesn't matter.
|
||||
let key = rand::random();
|
||||
let key = loop {
|
||||
let key = rand::random();
|
||||
|
||||
// Random key collisions are unlikely to happen here, but they're still possible,
|
||||
// which is why we have to take care not to rewrite an existing key.
|
||||
match self.0.entry(key) {
|
||||
dashmap::mapref::entry::Entry::Occupied(_) => {
|
||||
bail!("query cancellation key already exists: {key}")
|
||||
// Random key collisions are unlikely to happen here, but they're still possible,
|
||||
// which is why we have to take care not to rewrite an existing key.
|
||||
match self.0.entry(key) {
|
||||
dashmap::mapref::entry::Entry::Occupied(_) => continue,
|
||||
dashmap::mapref::entry::Entry::Vacant(e) => {
|
||||
e.insert(None);
|
||||
}
|
||||
}
|
||||
dashmap::mapref::entry::Entry::Vacant(e) => {
|
||||
e.insert(None);
|
||||
}
|
||||
}
|
||||
|
||||
// This will guarantee that the session gets dropped
|
||||
// as soon as the future is finished.
|
||||
scopeguard::defer! {
|
||||
self.0.remove(&key);
|
||||
info!("dropped query cancellation key {key}");
|
||||
}
|
||||
break key;
|
||||
};
|
||||
|
||||
info!("registered new query cancellation key {key}");
|
||||
let session = Session::new(key, self);
|
||||
f(session).await
|
||||
Session {
|
||||
key,
|
||||
cancel_map: self,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -98,23 +90,17 @@ impl CancelClosure {
|
||||
}
|
||||
|
||||
/// Helper for registering query cancellation tokens.
|
||||
pub struct Session<'a> {
|
||||
pub struct Session {
|
||||
/// The user-facing key identifying this session.
|
||||
key: CancelKeyData,
|
||||
/// The [`CancelMap`] this session belongs to.
|
||||
cancel_map: &'a CancelMap,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
}
|
||||
|
||||
impl<'a> Session<'a> {
|
||||
fn new(key: CancelKeyData, cancel_map: &'a CancelMap) -> Self {
|
||||
Self { key, cancel_map }
|
||||
}
|
||||
}
|
||||
|
||||
impl Session<'_> {
|
||||
impl Session {
|
||||
/// Store the cancel token for the given session.
|
||||
/// This enables query cancellation in `crate::proxy::prepare_client_connection`.
|
||||
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
info!("enabling query cancellation for this session");
|
||||
self.cancel_map.0.insert(self.key, Some(cancel_closure));
|
||||
|
||||
@@ -122,37 +108,26 @@ impl Session<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Session {
|
||||
fn drop(&mut self) {
|
||||
self.cancel_map.0.remove(&self.key);
|
||||
info!("dropped query cancellation key {}", &self.key);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
#[tokio::test]
|
||||
async fn check_session_drop() -> anyhow::Result<()> {
|
||||
static CANCEL_MAP: Lazy<CancelMap> = Lazy::new(Default::default);
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
let task = tokio::spawn(CANCEL_MAP.with_session(|session| async move {
|
||||
assert!(CANCEL_MAP.contains(&session));
|
||||
|
||||
tx.send(()).expect("failed to send");
|
||||
futures::future::pending::<()>().await; // sleep forever
|
||||
|
||||
Ok(())
|
||||
}));
|
||||
|
||||
// Wait until the task has been spawned.
|
||||
rx.await.context("failed to hear from the task")?;
|
||||
|
||||
// Drop the session's entry by cancelling the task.
|
||||
task.abort();
|
||||
let error = task.await.expect_err("task should have failed");
|
||||
if !error.is_cancelled() {
|
||||
anyhow::bail!(error);
|
||||
}
|
||||
let cancel_map: Arc<CancelMap> = Default::default();
|
||||
|
||||
let session = cancel_map.clone().get_session();
|
||||
assert!(cancel_map.contains(&session));
|
||||
drop(session);
|
||||
// Check that the session has been dropped.
|
||||
assert!(CANCEL_MAP.is_empty());
|
||||
assert!(cancel_map.is_empty());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -89,8 +89,11 @@ impl RequestMonitoring {
|
||||
self.project = Some(x.project_id);
|
||||
}
|
||||
|
||||
pub fn set_endpoint_id(&mut self, endpoint_id: Option<EndpointId>) {
|
||||
self.endpoint_id = endpoint_id.or_else(|| self.endpoint_id.clone());
|
||||
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
|
||||
crate::metrics::CONNECTING_ENDPOINTS
|
||||
.with_label_values(&[self.protocol])
|
||||
.measure(&endpoint_id);
|
||||
self.endpoint_id = Some(endpoint_id);
|
||||
}
|
||||
|
||||
pub fn set_application(&mut self, app: Option<SmolStr>) {
|
||||
|
||||
100
proxy/src/jemalloc.rs
Normal file
100
proxy/src/jemalloc.rs
Normal file
@@ -0,0 +1,100 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use metrics::IntGauge;
|
||||
use prometheus::{register_int_gauge_with_registry, Registry};
|
||||
use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};
|
||||
|
||||
/// Reads jemalloc statistics and mirrors each one into a Prometheus gauge.
///
/// Each `*_mib` handle is paired with the `IntGauge` that receives its value;
/// the gauges are registered under `jemalloc_<stat>_bytes` names when the
/// recorder is constructed.
pub struct MetricRecorder {
|
||||
// Epoch handle; advanced at the start of every poll, before any statistic is read.
epoch: epoch_mib,
|
||||
// `stats.active` handle and the gauge registered as "jemalloc_active_bytes".
active: stats::active_mib,
|
||||
active_gauge: IntGauge,
|
||||
// `stats.allocated` handle and the gauge registered as "jemalloc_allocated_bytes".
allocated: stats::allocated_mib,
|
||||
allocated_gauge: IntGauge,
|
||||
// `stats.mapped` handle and the gauge registered as "jemalloc_mapped_bytes".
mapped: stats::mapped_mib,
|
||||
mapped_gauge: IntGauge,
|
||||
// `stats.metadata` handle and the gauge registered as "jemalloc_metadata_bytes".
metadata: stats::metadata_mib,
|
||||
metadata_gauge: IntGauge,
|
||||
// `stats.resident` handle and the gauge registered as "jemalloc_resident_bytes".
resident: stats::resident_mib,
|
||||
resident_gauge: IntGauge,
|
||||
// `stats.retained` handle and the gauge registered as "jemalloc_retained_bytes".
retained: stats::retained_mib,
|
||||
retained_gauge: IntGauge,
|
||||
}
|
||||
|
||||
impl MetricRecorder {
|
||||
pub fn new(registry: &Registry) -> Result<Self, anyhow::Error> {
|
||||
tracing::info!(
|
||||
config = config::malloc_conf::read()?,
|
||||
version = version::read()?,
|
||||
"starting jemalloc recorder"
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
epoch: epoch::mib()?,
|
||||
active: stats::active::mib()?,
|
||||
active_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_active_bytes",
|
||||
"Total number of bytes in active pages allocated by the process",
|
||||
registry
|
||||
)?,
|
||||
allocated: stats::allocated::mib()?,
|
||||
allocated_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_allocated_bytes",
|
||||
"Total number of bytes allocated by the process",
|
||||
registry
|
||||
)?,
|
||||
mapped: stats::mapped::mib()?,
|
||||
mapped_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_mapped_bytes",
|
||||
"Total number of bytes in active extents mapped by the allocator",
|
||||
registry
|
||||
)?,
|
||||
metadata: stats::metadata::mib()?,
|
||||
metadata_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_metadata_bytes",
|
||||
"Total number of bytes dedicated to jemalloc metadata",
|
||||
registry
|
||||
)?,
|
||||
resident: stats::resident::mib()?,
|
||||
resident_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_resident_bytes",
|
||||
"Total number of bytes in physically resident data pages mapped by the allocator",
|
||||
registry
|
||||
)?,
|
||||
retained: stats::retained::mib()?,
|
||||
retained_gauge: register_int_gauge_with_registry!(
|
||||
"jemalloc_retained_bytes",
|
||||
"Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system",
|
||||
registry
|
||||
)?,
|
||||
})
|
||||
}
|
||||
|
||||
fn _poll(&self) -> Result<(), anyhow::Error> {
|
||||
self.epoch.advance()?;
|
||||
self.active_gauge.set(self.active.read()? as i64);
|
||||
self.allocated_gauge.set(self.allocated.read()? as i64);
|
||||
self.mapped_gauge.set(self.mapped.read()? as i64);
|
||||
self.metadata_gauge.set(self.metadata.read()? as i64);
|
||||
self.resident_gauge.set(self.resident.read()? as i64);
|
||||
self.retained_gauge.set(self.retained.read()? as i64);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn poll(&self) {
|
||||
if let Err(error) = self._poll() {
|
||||
tracing::warn!(%error, "Failed to poll jemalloc stats");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(self) -> tokio::task::JoinHandle<()> {
|
||||
tokio::task::spawn(async move {
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(15));
|
||||
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||||
loop {
|
||||
self.poll();
|
||||
interval.tick().await;
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -16,6 +16,7 @@ pub mod console;
|
||||
pub mod context;
|
||||
pub mod error;
|
||||
pub mod http;
|
||||
pub mod jemalloc;
|
||||
pub mod logging;
|
||||
pub mod metrics;
|
||||
pub mod parse;
|
||||
|
||||
@@ -1,10 +1,7 @@
|
||||
use ::metrics::{
|
||||
exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
|
||||
IntCounterPairVec, IntCounterVec,
|
||||
};
|
||||
use prometheus::{
|
||||
register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
|
||||
IntGaugeVec,
|
||||
exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
|
||||
register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge_vec, Histogram,
|
||||
HistogramVec, HyperLogLogVec, IntCounterPairVec, IntCounterVec, IntGaugeVec,
|
||||
};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -236,3 +233,13 @@ pub const fn bool_to_str(x: bool) -> &'static str {
|
||||
"false"
|
||||
}
|
||||
}
|
||||
|
||||
pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
|
||||
register_hll_vec!(
|
||||
32,
|
||||
"proxy_connecting_endpoints",
|
||||
"HLL approximate cardinality of endpoints that are connecting",
|
||||
&["protocol"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
@@ -2,37 +2,34 @@
|
||||
mod tests;
|
||||
|
||||
pub mod connect_compute;
|
||||
pub mod handshake;
|
||||
pub mod passthrough;
|
||||
pub mod retry;
|
||||
|
||||
use crate::{
|
||||
auth,
|
||||
cancellation::{self, CancelMap},
|
||||
compute,
|
||||
config::{AuthenticationConfig, ProxyConfig, TlsConfig},
|
||||
console::messages::MetricsAuxInfo,
|
||||
config::{ProxyConfig, TlsConfig},
|
||||
context::RequestMonitoring,
|
||||
metrics::{
|
||||
NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
|
||||
NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
|
||||
},
|
||||
metrics::{NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE},
|
||||
protocol2::WithClientIp,
|
||||
proxy::{handshake::handshake, passthrough::proxy_pass},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
stream::{PqStream, Stream},
|
||||
usage_metrics::{Ids, USAGE_METRICS},
|
||||
EndpointCacheKey,
|
||||
};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||
use pq_proto::{BeMessage as Be, StartupMessageParams};
|
||||
use regex::Regex;
|
||||
use smol_str::{format_smolstr, SmolStr};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, info_span, Instrument};
|
||||
use utils::measured_stream::MeasuredStream;
|
||||
|
||||
use self::connect_compute::{connect_to_compute, TcpMechanism};
|
||||
|
||||
@@ -80,6 +77,13 @@ pub async fn task_main(
|
||||
let cancel_map = Arc::clone(&cancel_map);
|
||||
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
|
||||
|
||||
let session_span = info_span!(
|
||||
"handle_client",
|
||||
?session_id,
|
||||
peer_addr = tracing::field::Empty,
|
||||
ep = tracing::field::Empty,
|
||||
);
|
||||
|
||||
connections.spawn(
|
||||
async move {
|
||||
info!("accepted postgres client connection");
|
||||
@@ -103,22 +107,18 @@ pub async fn task_main(
|
||||
handle_client(
|
||||
config,
|
||||
&mut ctx,
|
||||
&cancel_map,
|
||||
cancel_map,
|
||||
socket,
|
||||
ClientMode::Tcp,
|
||||
endpoint_rate_limiter,
|
||||
)
|
||||
.await
|
||||
}
|
||||
.instrument(info_span!(
|
||||
"handle_client",
|
||||
?session_id,
|
||||
peer_addr = tracing::field::Empty
|
||||
))
|
||||
.unwrap_or_else(move |e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
error!(?session_id, "per-client task finished with an error: {e:#}");
|
||||
}),
|
||||
error!("per-client task finished with an error: {e:#}");
|
||||
})
|
||||
.instrument(session_span),
|
||||
);
|
||||
}
|
||||
|
||||
@@ -171,7 +171,7 @@ impl ClientMode {
|
||||
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
config: &'static ProxyConfig,
|
||||
ctx: &mut RequestMonitoring,
|
||||
cancel_map: &CancelMap,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
stream: S,
|
||||
mode: ClientMode,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
@@ -192,138 +192,88 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
let tls = config.tls_config.as_ref();
|
||||
|
||||
let pause = ctx.latency_timer.pause();
|
||||
let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
|
||||
let do_handshake = handshake(stream, mode.handshake_tls(tls), &cancel_map);
|
||||
let (mut stream, params) = match do_handshake.await? {
|
||||
Some(x) => x,
|
||||
None => return Ok(()), // it's a cancellation request
|
||||
};
|
||||
drop(pause);
|
||||
|
||||
let hostname = mode.hostname(stream.get_ref());
|
||||
|
||||
let common_names = tls.map(|tls| &tls.common_names);
|
||||
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let user_info = {
|
||||
let hostname = mode.hostname(stream.get_ref());
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names))
|
||||
.transpose();
|
||||
|
||||
let common_names = tls.map(|tls| &tls.common_names);
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| {
|
||||
auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)
|
||||
})
|
||||
.transpose();
|
||||
let user_info = match result {
|
||||
Ok(user_info) => user_info,
|
||||
Err(e) => stream.throw_error(e).await?,
|
||||
};
|
||||
|
||||
match result {
|
||||
Ok(user_info) => user_info,
|
||||
Err(e) => stream.throw_error(e).await?,
|
||||
// check rate limit
|
||||
if let Some(ep) = user_info.get_endpoint() {
|
||||
if !endpoint_rate_limiter.check(ep) {
|
||||
return stream
|
||||
.throw_error(auth::AuthError::too_many_connections())
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
let user = user_info.get_user().to_owned();
|
||||
let (mut node_info, user_info) = match user_info
|
||||
.authenticate(
|
||||
ctx,
|
||||
&mut stream,
|
||||
mode.allow_cleartext(),
|
||||
&config.authentication_config,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(auth_result) => auth_result,
|
||||
Err(e) => {
|
||||
let db = params.get("database");
|
||||
let app = params.get("application_name");
|
||||
let params_span = tracing::info_span!("", ?user, ?db, ?app);
|
||||
|
||||
return stream.throw_error(e).instrument(params_span).await;
|
||||
}
|
||||
};
|
||||
|
||||
ctx.set_endpoint_id(user_info.get_endpoint());
|
||||
node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config);
|
||||
|
||||
let client = Client::new(
|
||||
stream,
|
||||
user_info,
|
||||
¶ms,
|
||||
mode.allow_self_signed_compute(config),
|
||||
endpoint_rate_limiter,
|
||||
);
|
||||
cancel_map
|
||||
.with_session(|session| {
|
||||
client.connect_to_db(ctx, session, mode, &config.authentication_config)
|
||||
})
|
||||
.await
|
||||
}
|
||||
let aux = node_info.aux.clone();
|
||||
let mut node = connect_to_compute(
|
||||
ctx,
|
||||
&TcpMechanism { params: ¶ms },
|
||||
node_info,
|
||||
&user_info,
|
||||
)
|
||||
.or_else(|e| stream.throw_error(e))
|
||||
.await?;
|
||||
|
||||
/// Establish a (most probably, secure) connection with the client.
|
||||
/// For better testing experience, `stream` can be any object satisfying the traits.
|
||||
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
|
||||
/// we also take an extra care of propagating only the select handshake errors to client.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
stream: S,
|
||||
mut tls: Option<&TlsConfig>,
|
||||
cancel_map: &CancelMap,
|
||||
) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
|
||||
// Client may try upgrading to each protocol only once
|
||||
let (mut tried_ssl, mut tried_gss) = (false, false);
|
||||
let session = cancel_map.get_session();
|
||||
prepare_client_connection(&node, &session, &mut stream).await?;
|
||||
|
||||
let mut stream = PqStream::new(Stream::from_raw(stream));
|
||||
loop {
|
||||
let msg = stream.read_startup_packet().await?;
|
||||
info!("received {msg:?}");
|
||||
// Before proxy passing, forward to compute whatever data is left in the
|
||||
// PqStream input buffer. Normally there is none, but our serverless npm
|
||||
// driver in pipeline mode sends startup, password and first query
|
||||
// immediately after opening the connection.
|
||||
let (stream, read_buf) = stream.into_inner();
|
||||
node.stream.write_all(&read_buf).await?;
|
||||
|
||||
use FeStartupPacket::*;
|
||||
match msg {
|
||||
SslRequest => match stream.get_ref() {
|
||||
Stream::Raw { .. } if !tried_ssl => {
|
||||
tried_ssl = true;
|
||||
|
||||
// We can't perform TLS handshake without a config
|
||||
let enc = tls.is_some();
|
||||
stream.write_message(&Be::EncryptionResponse(enc)).await?;
|
||||
if let Some(tls) = tls.take() {
|
||||
// Upgrade raw stream into a secure TLS-backed stream.
|
||||
// NOTE: We've consumed `tls`; this fact will be used later.
|
||||
|
||||
let (raw, read_buf) = stream.into_inner();
|
||||
// TODO: Normally, client doesn't send any data before
|
||||
// server says TLS handshake is ok and read_buf is empty.
|
||||
// However, you could imagine pipelining of postgres
|
||||
// SSLRequest + TLS ClientHello in one hunk similar to
|
||||
// pipelining in our node js driver. We should probably
|
||||
// support that by chaining read_buf with the stream.
|
||||
if !read_buf.is_empty() {
|
||||
bail!("data is sent before server replied with EncryptionResponse");
|
||||
}
|
||||
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
|
||||
|
||||
let (_, tls_server_end_point) = tls
|
||||
.cert_resolver
|
||||
.resolve(tls_stream.get_ref().1.server_name())
|
||||
.context("missing certificate")?;
|
||||
|
||||
stream = PqStream::new(Stream::Tls {
|
||||
tls: Box::new(tls_stream),
|
||||
tls_server_end_point,
|
||||
});
|
||||
}
|
||||
}
|
||||
_ => bail!(ERR_PROTO_VIOLATION),
|
||||
},
|
||||
GssEncRequest => match stream.get_ref() {
|
||||
Stream::Raw { .. } if !tried_gss => {
|
||||
tried_gss = true;
|
||||
|
||||
// Currently, we don't support GSSAPI
|
||||
stream.write_message(&Be::EncryptionResponse(false)).await?;
|
||||
}
|
||||
_ => bail!(ERR_PROTO_VIOLATION),
|
||||
},
|
||||
StartupMessage { params, .. } => {
|
||||
// Check that the config has been consumed during upgrade
|
||||
// OR we didn't provide it at all (for dev purposes).
|
||||
if tls.is_some() {
|
||||
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
|
||||
}
|
||||
|
||||
info!(session_type = "normal", "successful handshake");
|
||||
break Ok(Some((stream, params)));
|
||||
}
|
||||
CancelRequest(cancel_key_data) => {
|
||||
cancel_map.cancel_session(cancel_key_data).await?;
|
||||
|
||||
info!(session_type = "cancellation", "successful handshake");
|
||||
break Ok(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
proxy_pass(ctx, stream, node.stream, aux).await
|
||||
}
|
||||
|
||||
/// Finish client connection initialization: confirm auth success, send params, etc.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn prepare_client_connection(
|
||||
node: &compute::PostgresConnection,
|
||||
session: cancellation::Session<'_>,
|
||||
session: &cancellation::Session,
|
||||
stream: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Register compute's query cancellation token and produce a new, unique one.
|
||||
@@ -349,151 +299,6 @@ async fn prepare_client_connection(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Forward bytes in both directions (client <-> compute).
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn proxy_pass(
|
||||
ctx: &mut RequestMonitoring,
|
||||
client: impl AsyncRead + AsyncWrite + Unpin,
|
||||
compute: impl AsyncRead + AsyncWrite + Unpin,
|
||||
aux: MetricsAuxInfo,
|
||||
) -> anyhow::Result<()> {
|
||||
ctx.set_success();
|
||||
ctx.log();
|
||||
|
||||
let usage = USAGE_METRICS.register(Ids {
|
||||
endpoint_id: aux.endpoint_id.clone(),
|
||||
branch_id: aux.branch_id.clone(),
|
||||
});
|
||||
|
||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
|
||||
let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
|
||||
let mut client = MeasuredStream::new(
|
||||
client,
|
||||
|_| {},
|
||||
|cnt| {
|
||||
// Number of bytes we sent to the client (outbound).
|
||||
m_sent.inc_by(cnt as u64);
|
||||
m_sent2.inc_by(cnt as u64);
|
||||
usage.record_egress(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
|
||||
let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
|
||||
let mut compute = MeasuredStream::new(
|
||||
compute,
|
||||
|_| {},
|
||||
|cnt| {
|
||||
// Number of bytes the client sent to the compute node (inbound).
|
||||
m_recv.inc_by(cnt as u64);
|
||||
m_recv2.inc_by(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
// Starting from here we only proxy the client's traffic.
|
||||
info!("performing the proxy pass...");
|
||||
let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Thin connection context.
|
||||
struct Client<'a, S> {
|
||||
/// The underlying libpq protocol stream.
|
||||
stream: PqStream<Stream<S>>,
|
||||
/// Client credentials that we care about.
|
||||
user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
|
||||
/// KV-dictionary with PostgreSQL connection params.
|
||||
params: &'a StartupMessageParams,
|
||||
/// Allow self-signed certificates (for testing).
|
||||
allow_self_signed_compute: bool,
|
||||
/// Rate limiter for endpoints
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
}
|
||||
|
||||
impl<'a, S> Client<'a, S> {
|
||||
/// Construct a new connection context.
|
||||
fn new(
|
||||
stream: PqStream<Stream<S>>,
|
||||
user_info: auth::BackendType<'a, auth::ComputeUserInfoMaybeEndpoint>,
|
||||
params: &'a StartupMessageParams,
|
||||
allow_self_signed_compute: bool,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
user_info,
|
||||
params,
|
||||
allow_self_signed_compute,
|
||||
endpoint_rate_limiter,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
|
||||
/// Let the client authenticate and connect to the designated compute node.
|
||||
// Instrumentation logs endpoint name everywhere. Doesn't work for link
|
||||
// auth; strictly speaking we don't know endpoint name in its case.
|
||||
#[tracing::instrument(name = "", fields(ep = %self.user_info.get_endpoint().unwrap_or_default()), skip_all)]
|
||||
async fn connect_to_db(
|
||||
self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
session: cancellation::Session<'_>,
|
||||
mode: ClientMode,
|
||||
config: &'static AuthenticationConfig,
|
||||
) -> anyhow::Result<()> {
|
||||
let Self {
|
||||
mut stream,
|
||||
user_info,
|
||||
params,
|
||||
allow_self_signed_compute,
|
||||
endpoint_rate_limiter,
|
||||
} = self;
|
||||
|
||||
// check rate limit
|
||||
if let Some(ep) = user_info.get_endpoint() {
|
||||
if !endpoint_rate_limiter.check(ep) {
|
||||
return stream
|
||||
.throw_error(auth::AuthError::too_many_connections())
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
let user = user_info.get_user().to_owned();
|
||||
let auth_result = match user_info
|
||||
.authenticate(ctx, &mut stream, mode.allow_cleartext(), config)
|
||||
.await
|
||||
{
|
||||
Ok(auth_result) => auth_result,
|
||||
Err(e) => {
|
||||
let db = params.get("database");
|
||||
let app = params.get("application_name");
|
||||
let params_span = tracing::info_span!("", ?user, ?db, ?app);
|
||||
|
||||
return stream.throw_error(e).instrument(params_span).await;
|
||||
}
|
||||
};
|
||||
|
||||
let (mut node_info, user_info) = auth_result;
|
||||
|
||||
node_info.allow_self_signed_compute = allow_self_signed_compute;
|
||||
|
||||
let aux = node_info.aux.clone();
|
||||
let mut node = connect_to_compute(ctx, &TcpMechanism { params }, node_info, &user_info)
|
||||
.or_else(|e| stream.throw_error(e))
|
||||
.await?;
|
||||
|
||||
prepare_client_connection(&node, session, &mut stream).await?;
|
||||
// Before proxy passing, forward to compute whatever data is left in the
|
||||
// PqStream input buffer. Normally there is none, but our serverless npm
|
||||
// driver in pipeline mode sends startup, password and first query
|
||||
// immediately after opening the connection.
|
||||
let (stream, read_buf) = stream.into_inner();
|
||||
node.stream.write_all(&read_buf).await?;
|
||||
proxy_pass(ctx, stream, node.stream, aux).await
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Default)]
|
||||
pub struct NeonOptions(Vec<(SmolStr, SmolStr)>);
|
||||
|
||||
|
||||
96
proxy/src/proxy/handshake.rs
Normal file
96
proxy/src/proxy/handshake.rs
Normal file
@@ -0,0 +1,96 @@
|
||||
use anyhow::{bail, Context};
|
||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
cancellation::CancelMap,
|
||||
config::TlsConfig,
|
||||
proxy::{ERR_INSECURE_CONNECTION, ERR_PROTO_VIOLATION},
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
|
||||
/// Establish a (most probably, secure) connection with the client.
|
||||
/// For better testing experience, `stream` can be any object satisfying the traits.
|
||||
/// It's easier to work with owned `stream` here as we need to upgrade it to TLS;
|
||||
/// we also take an extra care of propagating only the select handshake errors to client.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
stream: S,
|
||||
mut tls: Option<&TlsConfig>,
|
||||
cancel_map: &CancelMap,
|
||||
) -> anyhow::Result<Option<(PqStream<Stream<S>>, StartupMessageParams)>> {
|
||||
// Client may try upgrading to each protocol only once
|
||||
let (mut tried_ssl, mut tried_gss) = (false, false);
|
||||
|
||||
let mut stream = PqStream::new(Stream::from_raw(stream));
|
||||
loop {
|
||||
let msg = stream.read_startup_packet().await?;
|
||||
info!("received {msg:?}");
|
||||
|
||||
use FeStartupPacket::*;
|
||||
match msg {
|
||||
SslRequest => match stream.get_ref() {
|
||||
Stream::Raw { .. } if !tried_ssl => {
|
||||
tried_ssl = true;
|
||||
|
||||
// We can't perform TLS handshake without a config
|
||||
let enc = tls.is_some();
|
||||
stream.write_message(&Be::EncryptionResponse(enc)).await?;
|
||||
if let Some(tls) = tls.take() {
|
||||
// Upgrade raw stream into a secure TLS-backed stream.
|
||||
// NOTE: We've consumed `tls`; this fact will be used later.
|
||||
|
||||
let (raw, read_buf) = stream.into_inner();
|
||||
// TODO: Normally, client doesn't send any data before
|
||||
// server says TLS handshake is ok and read_buf is empy.
|
||||
// However, you could imagine pipelining of postgres
|
||||
// SSLRequest + TLS ClientHello in one hunk similar to
|
||||
// pipelining in our node js driver. We should probably
|
||||
// support that by chaining read_buf with the stream.
|
||||
if !read_buf.is_empty() {
|
||||
bail!("data is sent before server replied with EncryptionResponse");
|
||||
}
|
||||
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
|
||||
|
||||
let (_, tls_server_end_point) = tls
|
||||
.cert_resolver
|
||||
.resolve(tls_stream.get_ref().1.server_name())
|
||||
.context("missing certificate")?;
|
||||
|
||||
stream = PqStream::new(Stream::Tls {
|
||||
tls: Box::new(tls_stream),
|
||||
tls_server_end_point,
|
||||
});
|
||||
}
|
||||
}
|
||||
_ => bail!(ERR_PROTO_VIOLATION),
|
||||
},
|
||||
GssEncRequest => match stream.get_ref() {
|
||||
Stream::Raw { .. } if !tried_gss => {
|
||||
tried_gss = true;
|
||||
|
||||
// Currently, we don't support GSSAPI
|
||||
stream.write_message(&Be::EncryptionResponse(false)).await?;
|
||||
}
|
||||
_ => bail!(ERR_PROTO_VIOLATION),
|
||||
},
|
||||
StartupMessage { params, .. } => {
|
||||
// Check that the config has been consumed during upgrade
|
||||
// OR we didn't provide it at all (for dev purposes).
|
||||
if tls.is_some() {
|
||||
stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
|
||||
}
|
||||
|
||||
info!(session_type = "normal", "successful handshake");
|
||||
break Ok(Some((stream, params)));
|
||||
}
|
||||
CancelRequest(cancel_key_data) => {
|
||||
cancel_map.cancel_session(cancel_key_data).await?;
|
||||
|
||||
info!(session_type = "cancellation", "successful handshake");
|
||||
break Ok(None);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
57
proxy/src/proxy/passthrough.rs
Normal file
57
proxy/src/proxy/passthrough.rs
Normal file
@@ -0,0 +1,57 @@
|
||||
use crate::{
|
||||
console::messages::MetricsAuxInfo,
|
||||
context::RequestMonitoring,
|
||||
metrics::{NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER},
|
||||
usage_metrics::{Ids, USAGE_METRICS},
|
||||
};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
use utils::measured_stream::MeasuredStream;
|
||||
|
||||
/// Forward bytes in both directions (client <-> compute).
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn proxy_pass(
|
||||
ctx: &mut RequestMonitoring,
|
||||
client: impl AsyncRead + AsyncWrite + Unpin,
|
||||
compute: impl AsyncRead + AsyncWrite + Unpin,
|
||||
aux: MetricsAuxInfo,
|
||||
) -> anyhow::Result<()> {
|
||||
ctx.set_success();
|
||||
ctx.log();
|
||||
|
||||
let usage = USAGE_METRICS.register(Ids {
|
||||
endpoint_id: aux.endpoint_id.clone(),
|
||||
branch_id: aux.branch_id.clone(),
|
||||
});
|
||||
|
||||
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
|
||||
let m_sent2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("tx"));
|
||||
let mut client = MeasuredStream::new(
|
||||
client,
|
||||
|_| {},
|
||||
|cnt| {
|
||||
// Number of bytes we sent to the client (outbound).
|
||||
m_sent.inc_by(cnt as u64);
|
||||
m_sent2.inc_by(cnt as u64);
|
||||
usage.record_egress(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx"]);
|
||||
let m_recv2 = NUM_BYTES_PROXIED_PER_CLIENT_COUNTER.with_label_values(&aux.traffic_labels("rx"));
|
||||
let mut compute = MeasuredStream::new(
|
||||
compute,
|
||||
|_| {},
|
||||
|cnt| {
|
||||
// Number of bytes the client sent to the compute node (inbound).
|
||||
m_recv.inc_by(cnt as u64);
|
||||
m_recv2.inc_by(cnt as u64);
|
||||
},
|
||||
);
|
||||
|
||||
// Starting from here we only proxy the client's traffic.
|
||||
info!("performing the proxy pass...");
|
||||
let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -41,6 +41,8 @@ use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
use utils::http::{error::ApiError, json::json_response};
|
||||
|
||||
pub const SERVERLESS_DRIVER_SNI: &str = "api";
|
||||
|
||||
pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
ws_listener: TcpListener,
|
||||
@@ -228,7 +230,7 @@ async fn request_handler(
|
||||
config,
|
||||
&mut ctx,
|
||||
websocket,
|
||||
&cancel_map,
|
||||
cancel_map,
|
||||
host,
|
||||
endpoint_rate_limiter,
|
||||
)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::bail;
|
||||
use anyhow::Context;
|
||||
use futures::pin_mut;
|
||||
use futures::StreamExt;
|
||||
use hyper::body::HttpBody;
|
||||
@@ -35,11 +36,11 @@ use crate::config::TlsConfig;
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::EndpointId;
|
||||
use crate::RoleName;
|
||||
|
||||
use super::conn_pool::ConnInfo;
|
||||
use super::conn_pool::GlobalConnPool;
|
||||
use super::SERVERLESS_DRIVER_SNI;
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct QueryData {
|
||||
@@ -61,7 +62,6 @@ enum Payload {
|
||||
|
||||
const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
|
||||
const MAX_REQUEST_SIZE: u64 = 10 * 1024 * 1024; // 10 MiB
|
||||
const SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART: &str = "api";
|
||||
|
||||
static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
|
||||
static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
|
||||
@@ -188,10 +188,8 @@ fn get_conn_info(
|
||||
}
|
||||
}
|
||||
|
||||
let endpoint = endpoint_sni(hostname, &tls.common_names)?;
|
||||
|
||||
let endpoint: EndpointId = endpoint.into();
|
||||
ctx.set_endpoint_id(Some(endpoint.clone()));
|
||||
let endpoint = endpoint_sni(hostname, &tls.common_names)?.context("malformed endpoint")?;
|
||||
ctx.set_endpoint_id(endpoint.clone());
|
||||
|
||||
let pairs = connection_url.query_pairs();
|
||||
|
||||
@@ -227,8 +225,7 @@ fn check_matches(sni_hostname: &str, hostname: &str) -> Result<bool, anyhow::Err
|
||||
let (_, hostname_rest) = hostname
|
||||
.split_once('.')
|
||||
.ok_or_else(|| anyhow::anyhow!("Unexpected hostname format."))?;
|
||||
Ok(sni_hostname_rest == hostname_rest
|
||||
&& sni_hostname_first == SERVERLESS_DRIVER_SNI_HOSTNAME_FIRST_PART)
|
||||
Ok(sni_hostname_rest == hostname_rest && sni_hostname_first == SERVERLESS_DRIVER_SNI)
|
||||
}
|
||||
|
||||
// TODO: return different http error codes
|
||||
|
||||
@@ -133,7 +133,7 @@ pub async fn serve_websocket(
|
||||
config: &'static ProxyConfig,
|
||||
ctx: &mut RequestMonitoring,
|
||||
websocket: HyperWebsocket,
|
||||
cancel_map: &CancelMap,
|
||||
cancel_map: Arc<CancelMap>,
|
||||
hostname: Option<String>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> anyhow::Result<()> {
|
||||
|
||||
@@ -33,7 +33,7 @@ psutil = "^5.9.4"
|
||||
types-psutil = "^5.9.5.12"
|
||||
types-toml = "^0.10.8.6"
|
||||
pytest-httpserver = "^1.0.8"
|
||||
aiohttp = "3.9.0"
|
||||
aiohttp = "3.9.2"
|
||||
pytest-rerunfailures = "^13.0"
|
||||
types-pytest-lazy-fixture = "^0.6.3.3"
|
||||
pytest-split = "^0.8.1"
|
||||
|
||||
@@ -3,8 +3,9 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use camino::Utf8PathBuf;
|
||||
use tokio::fs::{self, File};
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use utils::crashsafe::durable_rename;
|
||||
|
||||
use std::io::Read;
|
||||
use std::ops::Deref;
|
||||
@@ -203,35 +204,8 @@ impl Storage for FileStorage {
|
||||
)
|
||||
})?;
|
||||
|
||||
// fsync the file
|
||||
if !self.conf.no_sync {
|
||||
control_partial.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to sync partial control file at {}",
|
||||
control_partial_path
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
|
||||
|
||||
// rename should be atomic
|
||||
fs::rename(&control_partial_path, &control_path).await?;
|
||||
// this sync is not required by any standard but postgres does this (see durable_rename)
|
||||
if !self.conf.no_sync {
|
||||
let new_f = File::open(&control_path).await?;
|
||||
new_f
|
||||
.sync_all()
|
||||
.await
|
||||
.with_context(|| format!("failed to sync control file at: {}", &control_path))?;
|
||||
|
||||
// fsync the directory (linux specific)
|
||||
let tli_dir = File::open(&self.timeline_dir).await?;
|
||||
tli_dir
|
||||
.sync_all()
|
||||
.await
|
||||
.context("failed to sync control file directory")?;
|
||||
}
|
||||
durable_rename(&control_partial_path, &control_path, !self.conf.no_sync).await?;
|
||||
|
||||
// update internal state
|
||||
self.state = s.clone();
|
||||
@@ -249,6 +223,7 @@ mod test {
|
||||
use super::*;
|
||||
use crate::SafeKeeperConf;
|
||||
use anyhow::Result;
|
||||
use tokio::fs;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
fn stub_conf() -> SafeKeeperConf {
|
||||
|
||||
@@ -28,7 +28,7 @@ use crate::safekeeper::Term;
|
||||
use crate::safekeeper::{ServerInfo, TermLsn};
|
||||
use crate::send_wal::WalSenderState;
|
||||
use crate::timeline::PeerInfo;
|
||||
use crate::{copy_timeline, debug_dump, pull_timeline};
|
||||
use crate::{copy_timeline, debug_dump, patch_control_file, pull_timeline};
|
||||
|
||||
use crate::timelines_global_map::TimelineDeleteForceResult;
|
||||
use crate::GlobalTimelines;
|
||||
@@ -465,6 +465,26 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
async fn patch_control_file_handler(
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let ttid = TenantTimelineId::new(
|
||||
parse_request_param(&request, "tenant_id")?,
|
||||
parse_request_param(&request, "timeline_id")?,
|
||||
);
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
let patch_request: patch_control_file::Request = json_request(&mut request).await?;
|
||||
let response = patch_control_file::handle_request(tli, patch_request)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, response)
|
||||
}
|
||||
|
||||
/// Safekeeper http router.
|
||||
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router();
|
||||
@@ -526,6 +546,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
"/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy",
|
||||
|r| request_span(r, timeline_copy_handler),
|
||||
)
|
||||
.patch(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/control_file",
|
||||
|r| request_span(r, patch_control_file_handler),
|
||||
)
|
||||
// for tests
|
||||
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
|
||||
request_span(r, record_safekeeper_info)
|
||||
|
||||
@@ -22,6 +22,7 @@ pub mod handler;
|
||||
pub mod http;
|
||||
pub mod json_ctrl;
|
||||
pub mod metrics;
|
||||
pub mod patch_control_file;
|
||||
pub mod pull_timeline;
|
||||
pub mod receive_wal;
|
||||
pub mod recovery;
|
||||
|
||||
85
safekeeper/src/patch_control_file.rs
Normal file
85
safekeeper/src/patch_control_file.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{state::TimelinePersistentState, timeline::Timeline};
|
||||
|
||||
#[derive(Deserialize, Debug, Clone)]
|
||||
pub struct Request {
|
||||
/// JSON object with fields to update
|
||||
pub updates: serde_json::Value,
|
||||
/// List of fields to apply
|
||||
pub apply_fields: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct Response {
|
||||
pub old_control_file: TimelinePersistentState,
|
||||
pub new_control_file: TimelinePersistentState,
|
||||
}
|
||||
|
||||
/// Patch control file with given request. Will update the persistent state using
|
||||
/// fields from the request and persist the new state on disk.
|
||||
pub async fn handle_request(tli: Arc<Timeline>, request: Request) -> anyhow::Result<Response> {
|
||||
let response = tli
|
||||
.map_control_file(|state| {
|
||||
let old_control_file = state.clone();
|
||||
let new_control_file = state_apply_diff(&old_control_file, &request)?;
|
||||
|
||||
info!(
|
||||
"patching control file, old: {:?}, new: {:?}, patch: {:?}",
|
||||
old_control_file, new_control_file, request
|
||||
);
|
||||
*state = new_control_file.clone();
|
||||
|
||||
Ok(Response {
|
||||
old_control_file,
|
||||
new_control_file,
|
||||
})
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
fn state_apply_diff(
|
||||
state: &TimelinePersistentState,
|
||||
request: &Request,
|
||||
) -> anyhow::Result<TimelinePersistentState> {
|
||||
let mut json_value = serde_json::to_value(state)?;
|
||||
|
||||
if let Value::Object(a) = &mut json_value {
|
||||
if let Value::Object(b) = &request.updates {
|
||||
json_apply_diff(a, b, &request.apply_fields)?;
|
||||
} else {
|
||||
anyhow::bail!("request.updates is not a json object")
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("TimelinePersistentState is not a json object")
|
||||
}
|
||||
|
||||
let new_state: TimelinePersistentState = serde_json::from_value(json_value)?;
|
||||
Ok(new_state)
|
||||
}
|
||||
|
||||
fn json_apply_diff(
|
||||
object: &mut serde_json::Map<String, Value>,
|
||||
updates: &serde_json::Map<String, Value>,
|
||||
apply_keys: &Vec<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
for key in apply_keys {
|
||||
if let Some(new_value) = updates.get(key) {
|
||||
if let Some(existing_value) = object.get_mut(key) {
|
||||
*existing_value = new_value.clone();
|
||||
} else {
|
||||
anyhow::bail!("key not found in original object: {}", key);
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("key not found in request.updates: {}", key);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -901,6 +901,20 @@ impl Timeline {
|
||||
file_open,
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply a function to the control file state and persist it.
|
||||
pub async fn map_control_file<T>(
|
||||
&self,
|
||||
f: impl FnOnce(&mut TimelinePersistentState) -> Result<T>,
|
||||
) -> Result<T> {
|
||||
let mut state = self.write_shared_state().await;
|
||||
let mut persistent_state = state.sk.state.start_change();
|
||||
// If f returns error, we abort the change and don't persist anything.
|
||||
let res = f(&mut persistent_state)?;
|
||||
// If persisting fails, we abort the change and return error.
|
||||
state.sk.state.finish_change(&persistent_state).await?;
|
||||
Ok(res)
|
||||
}
|
||||
}
|
||||
|
||||
/// Deletes directory and its contents. Returns false if directory does not exist.
|
||||
|
||||
@@ -21,6 +21,7 @@ use tokio::fs::{self, remove_file, File, OpenOptions};
|
||||
use tokio::io::{AsyncRead, AsyncWriteExt};
|
||||
use tokio::io::{AsyncReadExt, AsyncSeekExt};
|
||||
use tracing::*;
|
||||
use utils::crashsafe::durable_rename;
|
||||
|
||||
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
|
||||
use crate::state::TimelinePersistentState;
|
||||
@@ -196,15 +197,6 @@ impl PhysicalStorage {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Call fsync if config requires so.
|
||||
async fn fsync_file(&mut self, file: &File) -> Result<()> {
|
||||
if !self.conf.no_sync {
|
||||
self.metrics
|
||||
.observe_flush_seconds(time_io_closure(file.sync_all()).await?);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Open or create WAL segment file. Caller must call seek to the wanted position.
|
||||
/// Returns `file` and `is_partial`.
|
||||
async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
|
||||
@@ -223,15 +215,33 @@ impl PhysicalStorage {
|
||||
Ok((file, true))
|
||||
} else {
|
||||
// Create and fill new partial file
|
||||
//
|
||||
// We're using fdatasync during WAL writing, so file size must not
|
||||
// change; to this end it is filled with zeros here. To avoid using
|
||||
// half initialized segment, first bake it under tmp filename and
|
||||
// then rename.
|
||||
let tmp_path = self.timeline_dir.join("waltmp");
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
.open(&tmp_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;
|
||||
.with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?;
|
||||
|
||||
write_zeroes(&mut file, self.wal_seg_size).await?;
|
||||
self.fsync_file(&file).await?;
|
||||
|
||||
// Note: this doesn't get into observe_flush_seconds metric. But
|
||||
// segment init should be separate metric, if any.
|
||||
if let Err(e) =
|
||||
durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await
|
||||
{
|
||||
// Probably rename succeeded, but fsync of it failed. Remove
|
||||
// the file then to avoid using it.
|
||||
remove_file(wal_file_partial_path)
|
||||
.await
|
||||
.or_else(utils::fs_ext::ignore_not_found)?;
|
||||
return Err(e.into());
|
||||
}
|
||||
Ok((file, true))
|
||||
}
|
||||
}
|
||||
@@ -718,6 +728,11 @@ const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
/// Helper for filling file with zeroes.
|
||||
async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
|
||||
fail::fail_point!("sk-write-zeroes", |_| {
|
||||
info!("write_zeroes hit failpoint");
|
||||
Err(anyhow::anyhow!("failpoint: sk-write-zeroes"))
|
||||
});
|
||||
|
||||
while count >= XLOG_BLCKSZ {
|
||||
file.write_all(ZERO_BLOCK).await?;
|
||||
count -= XLOG_BLCKSZ;
|
||||
|
||||
@@ -993,13 +993,20 @@ class NeonEnv:
|
||||
self.initial_tenant = config.initial_tenant
|
||||
self.initial_timeline = config.initial_timeline
|
||||
|
||||
attachment_service_port = self.port_distributor.get_port()
|
||||
# Reserve the next port after attachment service for use by its postgres: this
|
||||
# will assert out if the next port wasn't free.
|
||||
attachment_service_pg_port = self.port_distributor.get_port()
|
||||
assert attachment_service_pg_port == attachment_service_port + 1
|
||||
# Find two adjacent ports for attachment service and its postgres DB. This
|
||||
# loop would eventually throw from get_port() if we run out of ports (extremely
|
||||
# unlikely): usually we find two adjacent free ports on the first iteration.
|
||||
while True:
|
||||
self.attachment_service_port = self.port_distributor.get_port()
|
||||
attachment_service_pg_port = self.port_distributor.get_port()
|
||||
if attachment_service_pg_port == self.attachment_service_port + 1:
|
||||
break
|
||||
|
||||
# The URL for the pageserver to use as its control_plane_api config
|
||||
self.control_plane_api: str = f"http://127.0.0.1:{self.attachment_service_port}/upcall/v1"
|
||||
# The base URL of the attachment service
|
||||
self.attachment_service_api: str = f"http://127.0.0.1:{self.attachment_service_port}"
|
||||
|
||||
self.control_plane_api: str = f"http://127.0.0.1:{attachment_service_port}"
|
||||
self.attachment_service: NeonAttachmentService = NeonAttachmentService(
|
||||
self, config.auth_enabled
|
||||
)
|
||||
@@ -1914,6 +1921,14 @@ class NeonAttachmentService:
|
||||
self.running = False
|
||||
return self
|
||||
|
||||
def pageserver_api(self) -> PageserverHttpClient:
    """
    Return a pageserver HTTP client that talks to the attachment service port.

    The attachment service implements a subset of the pageserver REST API, for mapping
    per-tenant actions into per-shard actions (e.g. timeline creation). Tests should invoke
    those functions via the HttpClient, as an implicit check that these APIs remain compatible.
    """
    port = self.env.attachment_service_port
    return PageserverHttpClient(port, lambda: True)
|
||||
|
||||
def request(self, method, *args, **kwargs) -> requests.Response:
|
||||
kwargs["headers"] = self.headers()
|
||||
return requests.request(method, *args, **kwargs)
|
||||
@@ -1931,7 +1946,7 @@ class NeonAttachmentService:
|
||||
) -> int:
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.env.control_plane_api}/attach-hook",
|
||||
f"{self.env.attachment_service_api}/debug/v1/attach-hook",
|
||||
json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
|
||||
headers=self.headers(),
|
||||
)
|
||||
@@ -1943,7 +1958,7 @@ class NeonAttachmentService:
|
||||
def attach_hook_drop(self, tenant_shard_id: Union[TenantId, TenantShardId]):
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.env.control_plane_api}/attach-hook",
|
||||
f"{self.env.attachment_service_api}/debug/v1/attach-hook",
|
||||
json={"tenant_shard_id": str(tenant_shard_id), "node_id": None},
|
||||
headers=self.headers(),
|
||||
)
|
||||
@@ -1955,7 +1970,7 @@ class NeonAttachmentService:
|
||||
"""
|
||||
response = self.request(
|
||||
"POST",
|
||||
f"{self.env.control_plane_api}/inspect",
|
||||
f"{self.env.attachment_service_api}/debug/v1/inspect",
|
||||
json={"tenant_shard_id": str(tenant_shard_id)},
|
||||
headers=self.headers(),
|
||||
)
|
||||
@@ -1976,7 +1991,27 @@ class NeonAttachmentService:
|
||||
}
|
||||
log.info(f"node_register({body})")
|
||||
self.request(
|
||||
"POST", f"{self.env.control_plane_api}/node", json=body, headers=self.headers()
|
||||
"POST",
|
||||
f"{self.env.attachment_service_api}/control/v1/node",
|
||||
json=body,
|
||||
headers=self.headers(),
|
||||
).raise_for_status()
|
||||
|
||||
def node_list(self):
    """Fetch the node list from the attachment service and return the decoded JSON body."""
    url = f"{self.env.attachment_service_api}/control/v1/node"
    response = self.request("GET", url, headers=self.headers())
    response.raise_for_status()
    return response.json()
|
||||
|
||||
def node_configure(self, node_id, body: dict[str, Any]):
    """
    Send a configuration update for one node to the attachment service.

    `body` is modified in place: its "node_id" key is set to `node_id` before the
    request is sent. Raises if the service answers with an error status.
    """
    log.info(f"node_configure({node_id}, {body})")
    body["node_id"] = node_id
    url = f"{self.env.attachment_service_api}/control/v1/node/{node_id}/config"
    response = self.request("PUT", url, json=body, headers=self.headers())
    response.raise_for_status()
|
||||
|
||||
def tenant_create(
|
||||
@@ -1986,6 +2021,9 @@ class NeonAttachmentService:
|
||||
shard_stripe_size: Optional[int] = None,
|
||||
tenant_config: Optional[Dict[Any, Any]] = None,
|
||||
):
|
||||
"""
|
||||
Use this rather than pageserver_api() when you need to include shard parameters
|
||||
"""
|
||||
body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
|
||||
|
||||
if shard_count is not None:
|
||||
@@ -1999,21 +2037,17 @@ class NeonAttachmentService:
|
||||
for k, v in tenant_config.items():
|
||||
body[k] = v
|
||||
|
||||
response = self.request("POST", f"{self.env.control_plane_api}/tenant", json=body)
|
||||
response = self.request("POST", f"{self.env.attachment_service_api}/v1/tenant", json=body)
|
||||
response.raise_for_status()
|
||||
log.info(f"tenant_create success: {response.json()}")
|
||||
|
||||
def tenant_timeline_create(self, tenant_id: TenantId, timeline_id: TimelineId):
|
||||
body: Dict[str, Any] = {"new_timeline_id": str(timeline_id)}
|
||||
|
||||
response = self.request(
|
||||
"POST", f"{self.env.control_plane_api}/tenant/{tenant_id}/timeline", json=body
|
||||
)
|
||||
response.raise_for_status()
|
||||
log.info(f"tenant_timeline_create success: {response.json()}")
|
||||
|
||||
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
|
||||
response = self.request("GET", f"{self.env.control_plane_api}/tenant/{tenant_id}/locate")
|
||||
"""
|
||||
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
|
||||
"""
|
||||
response = self.request(
|
||||
"GET", f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/locate"
|
||||
)
|
||||
response.raise_for_status()
|
||||
body = response.json()
|
||||
shards: list[dict[str, Any]] = body["shards"]
|
||||
@@ -2022,7 +2056,7 @@ class NeonAttachmentService:
|
||||
def tenant_shard_split(self, tenant_id: TenantId, shard_count: int) -> list[TenantShardId]:
|
||||
response = self.request(
|
||||
"PUT",
|
||||
f"{self.env.control_plane_api}/tenant/{tenant_id}/shard_split",
|
||||
f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_id}/shard_split",
|
||||
json={"new_shard_count": shard_count},
|
||||
)
|
||||
response.raise_for_status()
|
||||
@@ -2034,7 +2068,7 @@ class NeonAttachmentService:
|
||||
def tenant_shard_migrate(self, tenant_shard_id: TenantShardId, dest_ps_id: int):
|
||||
response = self.request(
|
||||
"PUT",
|
||||
f"{self.env.control_plane_api}/tenant/{tenant_shard_id}/migrate",
|
||||
f"{self.env.attachment_service_api}/control/v1/tenant/{tenant_shard_id}/migrate",
|
||||
json={"tenant_shard_id": str(tenant_shard_id), "node_id": dest_ps_id},
|
||||
)
|
||||
response.raise_for_status()
|
||||
@@ -3062,6 +3096,17 @@ class Endpoint(PgProtocol):
|
||||
|
||||
return self
|
||||
|
||||
def edit_hba(self, hba: List[str]):
    """
    Prepend hba lines into pg_hba.conf file.

    The given lines are written at the top of the file, one per line, followed by the
    previous contents. If the endpoint is running, postgres is asked to reload its config.
    """
    hba_path = os.path.join(self.pg_data_dir_path(), "pg_hba.conf")
    with open(hba_path, "r+") as conf_file:
        previous = conf_file.read()
        conf_file.seek(0)
        conf_file.write("\n".join(hba) + "\n")
        conf_file.write(previous)

    if self.running:
        self.safe_psql("SELECT pg_reload_conf()")
|
||||
|
||||
def reconfigure(self, pageserver_id: Optional[int] = None):
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
|
||||
@@ -3443,6 +3488,24 @@ class SafekeeperHttpClient(requests.Session):
|
||||
assert isinstance(res_json, dict)
|
||||
return res_json
|
||||
|
||||
def patch_control_file(
    self,
    tenant_id: TenantId,
    timeline_id: TimelineId,
    patch: Dict[str, Any],
) -> Dict[str, Any]:
    """
    PATCH the timeline's control file on the safekeeper.

    Every key of `patch` is sent both as an update and as a field to apply.
    Returns the decoded JSON response, which must be a dict.
    """
    url = f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/control_file"
    payload = {
        "updates": patch,
        "apply_fields": list(patch.keys()),
    }
    res = self.patch(url, json=payload)
    res.raise_for_status()
    res_json = res.json()
    assert isinstance(res_json, dict)
    return res_json
|
||||
|
||||
def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]:
|
||||
res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body)
|
||||
res.raise_for_status()
|
||||
|
||||
@@ -549,17 +549,12 @@ class PageserverHttpClient(requests.Session):
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
timeline_id: TimelineId,
|
||||
timestamp,
|
||||
version: Optional[int] = None,
|
||||
):
|
||||
log.info(
|
||||
f"Requesting lsn by timestamp {timestamp}, tenant {tenant_id}, timeline {timeline_id}"
|
||||
)
|
||||
if version is None:
|
||||
version_str = ""
|
||||
else:
|
||||
version_str = f"&version={version}"
|
||||
res = self.get(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}{version_str}",
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp?timestamp={timestamp}",
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
|
||||
@@ -52,7 +52,7 @@ class PgVersion(str, enum.Enum):
|
||||
return None
|
||||
|
||||
|
||||
DEFAULT_VERSION: PgVersion = PgVersion.V14
|
||||
DEFAULT_VERSION: PgVersion = PgVersion.V15
|
||||
|
||||
|
||||
def skip_on_postgres(version: PgVersion, reason: str):
|
||||
@@ -78,6 +78,13 @@ def pytest_addoption(parser: Parser):
|
||||
)
|
||||
|
||||
|
||||
def run_only_on_default_postgres(reason: str):
    """
    Build a pytest skip marker for tests that should run on one postgres version only.

    The test is skipped, with `reason`, whenever the version selected through the
    DEFAULT_PG_VERSION environment variable is not DEFAULT_VERSION.
    """
    selected = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION))
    return pytest.mark.skipif(selected is not DEFAULT_VERSION, reason=reason)
|
||||
|
||||
|
||||
def pytest_configure(config: Config):
|
||||
if config.getoption("--pg-version"):
|
||||
raise Exception("--pg-version is deprecated, use DEFAULT_PG_VERSION env var instead")
|
||||
|
||||
@@ -3,10 +3,12 @@ import uuid
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.pg_version import run_only_on_default_postgres
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
@pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"])
|
||||
@run_only_on_default_postgres("it does not use any postgres functionality")
|
||||
def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
|
||||
# self-test: make sure the event is logged (i.e., our testing endpoint works)
|
||||
log_expected = {
|
||||
|
||||
@@ -109,7 +109,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
|
||||
# Timestamp is in the unreachable past
|
||||
probe_timestamp = tbl[0][1] - timedelta(hours=10)
|
||||
result = client.timeline_get_lsn_by_timestamp(
|
||||
tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2
|
||||
tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z"
|
||||
)
|
||||
assert result["kind"] == "past"
|
||||
# make sure that we return the minimum lsn here at the start of the range
|
||||
|
||||
@@ -18,11 +18,11 @@ def test_migrations(neon_simple_env: NeonEnv):
|
||||
with endpoint.cursor() as cur:
|
||||
cur.execute("SELECT id FROM neon_migration.migration_id")
|
||||
migration_id = cur.fetchall()
|
||||
assert migration_id[0][0] == 2
|
||||
assert migration_id[0][0] == 3
|
||||
|
||||
with open(log_path, "r") as log_file:
|
||||
logs = log_file.read()
|
||||
assert "INFO handle_migrations: Ran 2 migrations" in logs
|
||||
assert "INFO handle_migrations: Ran 3 migrations" in logs
|
||||
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
@@ -30,7 +30,7 @@ def test_migrations(neon_simple_env: NeonEnv):
|
||||
with endpoint.cursor() as cur:
|
||||
cur.execute("SELECT id FROM neon_migration.migration_id")
|
||||
migration_id = cur.fetchall()
|
||||
assert migration_id[0][0] == 2
|
||||
assert migration_id[0][0] == 3
|
||||
|
||||
with open(log_path, "r") as log_file:
|
||||
logs = log_file.read()
|
||||
|
||||
@@ -1,26 +1,44 @@
|
||||
import time
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
|
||||
def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
|
||||
env = neon_simple_env
|
||||
env.neon_cli.create_branch("test_neon_superuser", "empty")
|
||||
endpoint = env.endpoints.create("test_neon_superuser")
|
||||
endpoint.respec(skip_pg_catalog_updates=False, features=["migrations"])
|
||||
endpoint.start()
|
||||
env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")
|
||||
pub = env.endpoints.create("test_neon_superuser_publisher")
|
||||
|
||||
env.neon_cli.create_branch("test_neon_superuser_subscriber")
|
||||
sub = env.endpoints.create("test_neon_superuser_subscriber")
|
||||
|
||||
pub.respec(skip_pg_catalog_updates=False, features=["migrations"])
|
||||
pub.start()
|
||||
|
||||
sub.respec(skip_pg_catalog_updates=False, features=["migrations"])
|
||||
sub.start()
|
||||
|
||||
time.sleep(1) # Sleep to let migrations run
|
||||
|
||||
with endpoint.cursor() as cur:
|
||||
with pub.cursor() as cur:
|
||||
cur.execute(
|
||||
"CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
|
||||
)
|
||||
cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
|
||||
cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
|
||||
|
||||
with endpoint.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
|
||||
# If we don't do this, creating the subscription will fail later on PG16
|
||||
pub.edit_hba(["host all mr_whiskers 0.0.0.0/0 md5"])
|
||||
|
||||
with sub.cursor() as cur:
|
||||
cur.execute(
|
||||
"CREATE ROLE mr_whiskers WITH PASSWORD 'cat' LOGIN INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser"
|
||||
)
|
||||
cur.execute("CREATE DATABASE neondb WITH OWNER mr_whiskers")
|
||||
cur.execute("GRANT ALL PRIVILEGES ON DATABASE neondb TO neon_superuser")
|
||||
|
||||
with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
|
||||
cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'member')")
|
||||
assert cur.fetchall()[0][0]
|
||||
cur.execute("SELECT pg_has_role('mr_whiskers', 'neon_superuser', 'usage')")
|
||||
@@ -32,3 +50,28 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
|
||||
|
||||
cur.execute("CREATE PUBLICATION pub FOR ALL TABLES")
|
||||
cur.execute("CREATE ROLE definitely_not_a_superuser WITH PASSWORD 'nope'")
|
||||
cur.execute("CREATE DATABASE definitely_a_database")
|
||||
cur.execute("CREATE TABLE t (a int)")
|
||||
cur.execute("INSERT INTO t VALUES (10), (20)")
|
||||
cur.execute("SELECT * from t")
|
||||
res = cur.fetchall()
|
||||
assert [r[0] for r in res] == [10, 20]
|
||||
|
||||
with sub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as cur:
|
||||
cur.execute("CREATE TABLE t (a int)")
|
||||
|
||||
pub_conn = f"host=localhost port={pub.pg_port} dbname=neondb user=mr_whiskers password=cat"
|
||||
query = f"CREATE SUBSCRIPTION sub CONNECTION '{pub_conn}' PUBLICATION pub"
|
||||
log.info(f"Creating subscription: {query}")
|
||||
cur.execute(query)
|
||||
|
||||
with pub.cursor(dbname="neondb", user="mr_whiskers", password="cat") as pcur:
|
||||
pcur.execute("INSERT INTO t VALUES (30), (40)")
|
||||
|
||||
time.sleep(1) # Give the change time to propagate
|
||||
|
||||
cur.execute("SELECT * FROM t")
|
||||
res = cur.fetchall()
|
||||
log.info(res)
|
||||
assert len(res) == 4
|
||||
assert [r[0] for r in res] == [10, 20, 30, 40]
|
||||
|
||||
@@ -203,6 +203,16 @@ def test_import_at_2bil(
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
|
||||
# Also create a multi-XID with members past the 2 billion mark
|
||||
conn2 = endpoint.connect()
|
||||
cur2 = conn2.cursor()
|
||||
cur.execute("INSERT INTO t VALUES ('x')")
|
||||
cur.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
|
||||
cur2.execute("BEGIN; select * from t WHERE t = 'x' FOR SHARE;")
|
||||
cur.execute("COMMIT")
|
||||
cur2.execute("COMMIT")
|
||||
|
||||
# A checkpoint writes a WAL record with xl_xid=0. Many other WAL
|
||||
# records would have the same effect.
|
||||
cur.execute("checkpoint")
|
||||
@@ -217,4 +227,4 @@ def test_import_at_2bil(
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT count(*) from t")
|
||||
assert cur.fetchone() == (10000 + 1,)
|
||||
assert cur.fetchone() == (10000 + 1 + 1,)
|
||||
|
||||
272
test_runner/regress/test_sharding_service.py
Normal file
272
test_runner/regress/test_sharding_service.py
Normal file
@@ -0,0 +1,272 @@
|
||||
import time
|
||||
from collections import defaultdict
|
||||
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverHttpClient
|
||||
from fixtures.pageserver.utils import tenant_delete_wait_completed, timeline_delete_wait_completed
|
||||
from fixtures.pg_version import PgVersion
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
|
||||
def test_sharding_service_smoke(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
Test the basic lifecycle of a sharding service:
|
||||
- Restarting
|
||||
- Restarting a pageserver
|
||||
- Creating and deleting tenants and timelines
|
||||
- Marking a pageserver offline
|
||||
"""
|
||||
|
||||
neon_env_builder.num_pageservers = 3
|
||||
env = neon_env_builder.init_configs()
|
||||
|
||||
# Start services by hand so that we can skip a pageserver (this will start + register later)
|
||||
env.broker.try_start()
|
||||
env.attachment_service.start()
|
||||
env.pageservers[0].start()
|
||||
env.pageservers[1].start()
|
||||
for sk in env.safekeepers:
|
||||
sk.start()
|
||||
|
||||
# The pageservers we started should have registered with the sharding service on startup
|
||||
nodes = env.attachment_service.node_list()
|
||||
assert len(nodes) == 2
|
||||
assert set(n["node_id"] for n in nodes) == {env.pageservers[0].id, env.pageservers[1].id}
|
||||
|
||||
# Starting an additional pageserver should register successfully
|
||||
env.pageservers[2].start()
|
||||
nodes = env.attachment_service.node_list()
|
||||
assert len(nodes) == 3
|
||||
assert set(n["node_id"] for n in nodes) == {ps.id for ps in env.pageservers}
|
||||
|
||||
# Use a multiple of pageservers to get nice even number of shards on each one
|
||||
tenant_shard_count = len(env.pageservers) * 4
|
||||
tenant_count = len(env.pageservers) * 2
|
||||
shards_per_tenant = tenant_shard_count // tenant_count
|
||||
tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
|
||||
|
||||
# Creating several tenants should spread out across the pageservers
|
||||
for tid in tenant_ids:
|
||||
env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
|
||||
|
||||
def get_node_shard_counts():
|
||||
counts: defaultdict[str, int] = defaultdict(int)
|
||||
for tid in tenant_ids:
|
||||
for shard in env.attachment_service.locate(tid):
|
||||
counts[shard["node_id"]] += 1
|
||||
return counts
|
||||
|
||||
for node_id, count in get_node_shard_counts().items():
|
||||
# we used a multiple of pageservers for the total shard count,
|
||||
# so expect equal number on all pageservers
|
||||
assert count == tenant_shard_count / len(
|
||||
env.pageservers
|
||||
), f"Node {node_id} has bad count {count}"
|
||||
|
||||
# Creating and deleting timelines should work, using identical API to pageserver
|
||||
timeline_crud_tenant = next(iter(tenant_ids))
|
||||
timeline_id = TimelineId.generate()
|
||||
env.attachment_service.pageserver_api().timeline_create(
|
||||
pg_version=PgVersion.NOT_SET, tenant_id=timeline_crud_tenant, new_timeline_id=timeline_id
|
||||
)
|
||||
timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant)
|
||||
assert len(timelines) == 2
|
||||
assert timeline_id in set(TimelineId(t["timeline_id"]) for t in timelines)
|
||||
# virtual_ps_http.timeline_delete(tenant_id=timeline_crud_tenant, timeline_id=timeline_id)
|
||||
timeline_delete_wait_completed(
|
||||
env.attachment_service.pageserver_api(), timeline_crud_tenant, timeline_id
|
||||
)
|
||||
timelines = env.attachment_service.pageserver_api().timeline_list(timeline_crud_tenant)
|
||||
assert len(timelines) == 1
|
||||
assert timeline_id not in set(TimelineId(t["timeline_id"]) for t in timelines)
|
||||
|
||||
# Marking a pageserver offline should migrate tenants away from it.
|
||||
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
|
||||
|
||||
def node_evacuated(node_id: int):
|
||||
counts = get_node_shard_counts()
|
||||
assert counts[node_id] == 0
|
||||
|
||||
wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id))
|
||||
|
||||
# Marking pageserver active should not migrate anything to it
|
||||
# immediately
|
||||
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Active"})
|
||||
time.sleep(1)
|
||||
assert get_node_shard_counts()[env.pageservers[0].id] == 0
|
||||
|
||||
# Delete all the tenants
|
||||
for tid in tenant_ids:
|
||||
tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
|
||||
|
||||
# Set a scheduling policy on one node, create all the tenants, observe
|
||||
# that the scheduling policy is respected.
|
||||
env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
|
||||
|
||||
# Create some fresh tenants
|
||||
tenant_ids = set(TenantId.generate() for i in range(0, tenant_count))
|
||||
for tid in tenant_ids:
|
||||
env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant)
|
||||
|
||||
counts = get_node_shard_counts()
|
||||
# Nothing should have been scheduled on the node in Draining
|
||||
assert counts[env.pageservers[1].id] == 0
|
||||
assert counts[env.pageservers[0].id] == tenant_shard_count // 2
|
||||
assert counts[env.pageservers[2].id] == tenant_shard_count // 2
|
||||
|
||||
|
||||
def test_sharding_service_passthrough(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
For simple timeline/tenant GET APIs that don't require coordination across
|
||||
shards, the sharding service implements a proxy to shard zero. This test
|
||||
calls those APIs.
|
||||
"""
|
||||
neon_env_builder.num_pageservers = 2
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
# We will talk to attachment service as if it was a pageserver, using the pageserver
|
||||
# HTTP client
|
||||
client = PageserverHttpClient(env.attachment_service_port, lambda: True)
|
||||
timelines = client.timeline_list(tenant_id=env.initial_tenant)
|
||||
assert len(timelines) == 1
|
||||
|
||||
|
||||
def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
|
||||
env = neon_env_builder.init_start()
|
||||
tenant_a = env.initial_tenant
|
||||
tenant_b = TenantId.generate()
|
||||
env.attachment_service.tenant_create(tenant_b)
|
||||
env.pageserver.tenant_detach(tenant_a)
|
||||
|
||||
# TODO: extend this test to use multiple pageservers, and check that locations don't move around
|
||||
# on restart.
|
||||
|
||||
# Attachment service restart
|
||||
env.attachment_service.stop()
|
||||
env.attachment_service.start()
|
||||
|
||||
observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
|
||||
|
||||
# Tenant A was detached before the restart, so it should remain detached
|
||||
assert tenant_a not in observed
|
||||
|
||||
# Tenant B was created via the attachment service, so it should still be attached
|
||||
assert tenant_b in observed
|
||||
|
||||
# Pageserver restart
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
|
||||
# Same assertions as above: restarting either service should not perturb things
|
||||
observed = set(TenantId(tenant["id"]) for tenant in env.pageserver.http_client().tenant_list())
|
||||
assert tenant_a not in observed
|
||||
assert tenant_b in observed
|
||||
|
||||
|
||||
def test_sharding_service_onboarding(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
"""
|
||||
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
|
||||
which provides the /location_config API. This is similar to creating a tenant,
|
||||
but imports the generation number.
|
||||
"""
|
||||
|
||||
neon_env_builder.num_pageservers = 2
|
||||
|
||||
# Start services by hand so that we can skip registration on one of the pageservers
|
||||
env = neon_env_builder.init_configs()
|
||||
env.broker.try_start()
|
||||
env.attachment_service.start()
|
||||
|
||||
# This is the pageserver where we'll initially create the tenant
|
||||
env.pageservers[0].start(register=False)
|
||||
origin_ps = env.pageservers[0]
|
||||
|
||||
# This is the pageserver managed by the sharding service, where the tenant
|
||||
# will be attached after onboarding
|
||||
env.pageservers[1].start(register=True)
|
||||
dest_ps = env.pageservers[1]
|
||||
virtual_ps_http = PageserverHttpClient(env.attachment_service_port, lambda: True)
|
||||
|
||||
for sk in env.safekeepers:
|
||||
sk.start()
|
||||
|
||||
# Create a tenant directly via pageserver HTTP API, skipping the attachment service
|
||||
tenant_id = TenantId.generate()
|
||||
generation = 123
|
||||
origin_ps.http_client().tenant_create(tenant_id, generation=generation)
|
||||
|
||||
# As if doing a live migration, first configure origin into stale mode
|
||||
origin_ps.http_client().tenant_location_conf(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "AttachedStale",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": generation,
|
||||
},
|
||||
)
|
||||
|
||||
# Call into attachment service to onboard the tenant
|
||||
generation += 1
|
||||
virtual_ps_http.tenant_location_conf(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "AttachedMulti",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": generation,
|
||||
},
|
||||
)
|
||||
|
||||
# As if doing a live migration, detach the original pageserver
|
||||
origin_ps.http_client().tenant_location_conf(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "Detached",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": None,
|
||||
},
|
||||
)
|
||||
|
||||
# As if doing a live migration, call into the attachment service to
|
||||
# set it to AttachedSingle: this is a no-op, but we test it because the
|
||||
# cloud control plane may call this for symmetry with live migration to
|
||||
# an individual pageserver
|
||||
virtual_ps_http.tenant_location_conf(
|
||||
tenant_id,
|
||||
{
|
||||
"mode": "AttachedSingle",
|
||||
"secondary_conf": None,
|
||||
"tenant_conf": {},
|
||||
"generation": generation,
|
||||
},
|
||||
)
|
||||
|
||||
# We should see the tenant is now attached to the pageserver managed
|
||||
# by the sharding service
|
||||
origin_tenants = origin_ps.http_client().tenant_list()
|
||||
assert len(origin_tenants) == 0
|
||||
dest_tenants = dest_ps.http_client().tenant_list()
|
||||
assert len(dest_tenants) == 1
|
||||
assert TenantId(dest_tenants[0]["id"]) == tenant_id
|
||||
|
||||
# sharding service advances generation by 1 when it first attaches
|
||||
assert dest_tenants[0]["generation"] == generation + 1
|
||||
|
||||
# The onboarded tenant should survive a restart of sharding service
|
||||
env.attachment_service.stop()
|
||||
env.attachment_service.start()
|
||||
|
||||
# The onboarded tenant should survive a restart of pageserver
|
||||
dest_ps.stop()
|
||||
dest_ps.start()
|
||||
@@ -1946,3 +1946,51 @@ def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int):
|
||||
assert orig_digest == new_digest
|
||||
|
||||
# TODO: test timelines can start after copy
|
||||
|
||||
|
||||
def test_patch_control_file(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 1
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
# initialize safekeeper
|
||||
endpoint.safe_psql("create table t(key int, value text)")
|
||||
|
||||
# update control file
|
||||
res = (
|
||||
env.safekeepers[0]
|
||||
.http_client()
|
||||
.patch_control_file(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
{
|
||||
"timeline_start_lsn": "0/1",
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
timeline_start_lsn_before = res["old_control_file"]["timeline_start_lsn"]
|
||||
timeline_start_lsn_after = res["new_control_file"]["timeline_start_lsn"]
|
||||
|
||||
log.info(f"patch_control_file response: {res}")
|
||||
log.info(
|
||||
f"updated control file timeline_start_lsn, before {timeline_start_lsn_before}, after {timeline_start_lsn_after}"
|
||||
)
|
||||
|
||||
assert timeline_start_lsn_after == "0/1"
|
||||
env.safekeepers[0].stop().start()
|
||||
|
||||
# wait/check that safekeeper is alive
|
||||
endpoint.safe_psql("insert into t values (1, 'payload')")
|
||||
|
||||
# check that timeline_start_lsn is updated
|
||||
res = (
|
||||
env.safekeepers[0]
|
||||
.http_client()
|
||||
.debug_dump({"dump_control_file": "true", "timeline_id": str(timeline_id)})
|
||||
)
|
||||
log.info(f"dump_control_file response: {res}")
|
||||
assert res["timelines"][0]["control_file"]["timeline_start_lsn"] == "0/1"
|
||||
|
||||
@@ -515,6 +515,42 @@ def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder):
|
||||
asyncio.run(run_recovery_uncommitted(env))
|
||||
|
||||
|
||||
async def run_segment_init_failure(env: NeonEnv):
|
||||
env.neon_cli.create_branch("test_segment_init_failure")
|
||||
ep = env.endpoints.create_start("test_segment_init_failure")
|
||||
ep.safe_psql("create table t(key int, value text)")
|
||||
ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
|
||||
|
||||
sk = env.safekeepers[0]
|
||||
sk_http = sk.http_client()
|
||||
sk_http.configure_failpoints([("sk-write-zeroes", "return")])
|
||||
conn = await ep.connect_async()
|
||||
ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary
|
||||
# next insertion should hang until failpoint is disabled.
|
||||
asyncio.create_task(conn.execute("insert into t select generate_series(1,1), 'payload'"))
|
||||
sleep_sec = 2
|
||||
await asyncio.sleep(sleep_sec)
|
||||
# also restart ep at segment boundary to make test more interesting
|
||||
ep.stop()
|
||||
# it must still be not finished
|
||||
# assert not bg_query.done()
|
||||
# Without segment rename during init (#6402) previous statement created
|
||||
# partially initialized 16MB segment, so sk restart also triggers #6401.
|
||||
sk.stop().start()
|
||||
ep = env.endpoints.create_start("test_segment_init_failure")
|
||||
ep.safe_psql("insert into t select generate_series(1,1), 'payload'") # should be ok now
|
||||
|
||||
|
||||
# Test (injected) failure during WAL segment init.
|
||||
# https://github.com/neondatabase/neon/issues/6401
|
||||
# https://github.com/neondatabase/neon/issues/6402
|
||||
def test_segment_init_failure(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.num_safekeepers = 1
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
asyncio.run(run_segment_init_failure(env))
|
||||
|
||||
|
||||
@dataclass
|
||||
class RaceConditionTest:
|
||||
iteration: int
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 11e970fe2b...3de48ce3d9
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 731b4d1609...b089a8a02c
4
vendor/revisions.json
vendored
4
vendor/revisions.json
vendored
@@ -1,5 +1,5 @@
|
||||
{
|
||||
"postgres-v16": "cf302768b2890569956641e0e5ba112ae1445351",
|
||||
"postgres-v15": "731b4d1609d6db1c953755810a41e0e67ea3db7b",
|
||||
"postgres-v14": "11e970fe2be56804f0a786ec5fc8141ffefa4ca7"
|
||||
"postgres-v15": "b089a8a02c9f6f4379883fddb33cf10a3aa0b14f",
|
||||
"postgres-v14": "3de48ce3d9c1f4fac1cdc7029487f8db9e537eac"
|
||||
}
|
||||
|
||||
@@ -45,13 +45,13 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] }
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
indexmap = { version = "1", default-features = false, features = ["std"] }
|
||||
itertools = { version = "0.10" }
|
||||
libc = { version = "0.2", features = ["extra_traits"] }
|
||||
libc = { version = "0.2", features = ["extra_traits", "use_std"] }
|
||||
log = { version = "0.4", default-features = false, features = ["std"] }
|
||||
memchr = { version = "2" }
|
||||
nom = { version = "7" }
|
||||
num-bigint = { version = "0.4" }
|
||||
num-integer = { version = "0.1", features = ["i128"] }
|
||||
num-traits = { version = "0.2", features = ["i128"] }
|
||||
num-traits = { version = "0.2", features = ["i128", "libm"] }
|
||||
once_cell = { version = "1" }
|
||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
|
||||
prost = { version = "0.11" }
|
||||
@@ -94,13 +94,13 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] }
|
||||
hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
|
||||
indexmap = { version = "1", default-features = false, features = ["std"] }
|
||||
itertools = { version = "0.10" }
|
||||
libc = { version = "0.2", features = ["extra_traits"] }
|
||||
libc = { version = "0.2", features = ["extra_traits", "use_std"] }
|
||||
log = { version = "0.4", default-features = false, features = ["std"] }
|
||||
memchr = { version = "2" }
|
||||
nom = { version = "7" }
|
||||
num-bigint = { version = "0.4" }
|
||||
num-integer = { version = "0.1", features = ["i128"] }
|
||||
num-traits = { version = "0.2", features = ["i128"] }
|
||||
num-traits = { version = "0.2", features = ["i128", "libm"] }
|
||||
once_cell = { version = "1" }
|
||||
parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs", default-features = false, features = ["zstd"] }
|
||||
prost = { version = "0.11" }
|
||||
|
||||
Reference in New Issue
Block a user