Compare commits

..

2 Commits

Author SHA1 Message Date
Conrad Ludgate
ef7e96fb4e tweak comments 2024-07-29 11:41:44 +01:00
Conrad Ludgate
54c5196f75 proxy: improve performance of leaky-bucket 2024-07-28 23:00:21 +01:00
54 changed files with 446 additions and 2012 deletions

View File

@@ -14,8 +14,11 @@ inputs:
api_host:
description: 'Neon API host'
default: console-stage.neon.build
provisioner:
description: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'
compute_units:
description: '[Min, Max] compute units'
description: '[Min, Max] compute units; Min and Max are used for k8s-neonvm with autoscaling, for k8s-pod values Min and Max should be equal'
default: '[1, 1]'
outputs:
@@ -34,6 +37,10 @@ runs:
# A shell without `set -x` to not to expose password/dsn in logs
shell: bash -euo pipefail {0}
run: |
if [ "${PROVISIONER}" == "k8s-pod" ] && [ "${MIN_CU}" != "${MAX_CU}" ]; then
echo >&2 "For k8s-pod provisioner MIN_CU should be equal to MAX_CU"
fi
project=$(curl \
"https://${API_HOST}/api/v2/projects" \
--fail \
@@ -45,7 +52,7 @@ runs:
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"pg_version\": ${POSTGRES_VERSION},
\"region_id\": \"${REGION_ID}\",
\"provisioner\": \"k8s-neonvm\",
\"provisioner\": \"${PROVISIONER}\",
\"autoscaling_limit_min_cu\": ${MIN_CU},
\"autoscaling_limit_max_cu\": ${MAX_CU},
\"settings\": { }
@@ -68,5 +75,6 @@ runs:
API_KEY: ${{ inputs.api_key }}
REGION_ID: ${{ inputs.region_id }}
POSTGRES_VERSION: ${{ inputs.postgres_version }}
PROVISIONER: ${{ inputs.provisioner }}
MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}

View File

@@ -63,9 +63,11 @@ jobs:
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
provisioner: 'k8s-pod'
- DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging"
region_id: 'azure-eastus2'
provisioner: 'k8s-neonvm'
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
@@ -98,6 +100,7 @@ jobs:
region_id: ${{ matrix.region_id }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
provisioner: ${{ matrix.provisioner }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
@@ -213,11 +216,11 @@ jobs:
# Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
#
# Available platforms:
# - neonvm-captest-new: Freshly created project (1 CU)
# - neonvm-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
# - neonvm-captest-reuse: Reusing existing project
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
@@ -242,16 +245,18 @@ jobs:
"'"$region_id_default"'"
],
"platform": [
"neonvm-captest-new",
"neonvm-captest-reuse",
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}'
@@ -266,7 +271,7 @@ jobs:
run: |
matrix='{
"platform": [
"neonvm-captest-reuse"
"neon-captest-reuse"
]
}'
@@ -282,7 +287,7 @@ jobs:
run: |
matrix='{
"platform": [
"neonvm-captest-reuse"
"neon-captest-reuse"
],
"scale": [
"10"
@@ -333,7 +338,7 @@ jobs:
prefix: latest
- name: Create Neon Project
if: contains(fromJson('["neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
@@ -341,18 +346,19 @@ jobs:
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String
id: set-up-connstr
run: |
case "${PLATFORM}" in
neonvm-captest-reuse)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -436,9 +442,9 @@ jobs:
fail-fast: false
matrix:
include:
- PLATFORM: "neonvm-captest-pgvector"
- PLATFORM: "neon-captest-pgvector"
- PLATFORM: "azure-captest-pgvector"
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -480,7 +486,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neonvm-captest-pgvector)
neon-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
;;
azure-captest-pgvector)
@@ -579,7 +585,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neonvm-captest-reuse)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CLICKBENCH_10M_CONNSTR }}
;;
rds-aurora)
@@ -589,7 +595,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CLICKBENCH_10M_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -666,7 +672,7 @@ jobs:
- name: Get Connstring Secret Name
run: |
case "${PLATFORM}" in
neonvm-captest-reuse)
neon-captest-reuse)
ENV_PLATFORM=CAPTEST_TPCH
;;
rds-aurora)
@@ -676,7 +682,7 @@ jobs:
ENV_PLATFORM=RDS_AURORA_TPCH
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac
@@ -753,7 +759,7 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neonvm-captest-reuse)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
;;
rds-aurora)
@@ -763,7 +769,7 @@ jobs:
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neonvm-captest-reuse', 'rds-aurora', or 'rds-postgres'"
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'rds-aurora', or 'rds-postgres'"
exit 1
;;
esac

View File

@@ -833,9 +833,6 @@ jobs:
rm -rf .docker-custom
promote-images:
permissions:
contents: read # This is required for actions/checkout
id-token: write # This is required for Azure Login to work.
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-22.04
@@ -862,28 +859,6 @@ jobs:
neondatabase/vm-compute-node-${version}:${{ needs.tag.outputs.build-tag }}
done
- name: Azure login
if: github.ref_name == 'main'
uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a # @v2.1.1
with:
client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
- name: Login to ACR
if: github.ref_name == 'main'
run: |
az acr login --name=neoneastus2
- name: Copy docker images to ACR-dev
if: github.ref_name == 'main'
run: |
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
docker buildx imagetools create \
-t neoneastus2.azurecr.io/neondatabase/${image}:${{ needs.tag.outputs.build-tag }} \
neondatabase/${image}:${{ needs.tag.outputs.build-tag }}
done
- name: Add latest tag to images
if: github.ref_name == 'main'
run: |

View File

@@ -13,7 +13,6 @@ on:
paths:
- '.github/workflows/pg-clients.yml'
- 'test_runner/pg_clients/**'
- 'test_runner/logical_repl/**'
- 'poetry.lock'
workflow_dispatch:
@@ -50,77 +49,6 @@ jobs:
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
secrets: inherit
test-logical-replication:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init --user root
services:
clickhouse:
image: clickhouse/clickhouse-server:24.6.3.64
ports:
- 9000:9000
- 8123:8123
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Create Neon Project
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
- name: Run tests
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: logical_repl
run_in_parallel: false
extra_params: -m remote_cluster
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
- name: Delete Neon Project
if: always()
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
- name: Post to a Slack channel
if: github.event.schedule && failure()
uses: slackapi/slack-github-action@v1
with:
channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
slack-message: |
Testing the logical replication: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>)
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
test-postgres-client-libs:
needs: [ build-build-tools-image ]
runs-on: ubuntu-22.04

15
Cargo.lock generated
View File

@@ -1672,7 +1672,6 @@ checksum = "62d6dcd069e7b5fe49a302411f759d4cf1cf2c27fe798ef46fb8baefc053dd2b"
dependencies = [
"bitflags 2.4.1",
"byteorder",
"chrono",
"diesel_derives",
"itoa",
"pq-sys",
@@ -1744,18 +1743,6 @@ dependencies = [
"const-random",
]
[[package]]
name = "dns-lookup"
version = "2.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5766087c2235fec47fafa4cfecc81e494ee679d0fd4a59887ea0919bfb0e4fc"
dependencies = [
"cfg-if",
"libc",
"socket2 0.5.5",
"windows-sys 0.48.0",
]
[[package]]
name = "dsl_auto_type"
version = "0.1.1"
@@ -5731,12 +5718,10 @@ dependencies = [
"aws-config",
"bytes",
"camino",
"chrono",
"clap",
"control_plane",
"diesel",
"diesel_migrations",
"dns-lookup",
"fail",
"futures",
"git-version",

View File

@@ -514,6 +514,7 @@ impl LocalEnv {
#[derive(serde::Serialize, serde::Deserialize)]
// (allow unknown fields, unlike PageServerConf)
struct PageserverConfigTomlSubset {
id: NodeId,
listen_pg_addr: String,
listen_http_addr: String,
pg_auth_type: AuthType,
@@ -525,30 +526,18 @@ impl LocalEnv {
.with_context(|| format!("read {:?}", config_toml_path))?,
)
.context("parse pageserver.toml")?;
let identity_toml_path = dentry.path().join("identity.toml");
#[derive(serde::Serialize, serde::Deserialize)]
struct IdentityTomlSubset {
id: NodeId,
}
let identity_toml: IdentityTomlSubset = toml_edit::de::from_str(
&std::fs::read_to_string(&identity_toml_path)
.with_context(|| format!("read {:?}", identity_toml_path))?,
)
.context("parse identity.toml")?;
let PageserverConfigTomlSubset {
id: config_toml_id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
} = config_toml;
let IdentityTomlSubset {
id: identity_toml_id,
} = identity_toml;
let conf = PageServerConf {
id: {
anyhow::ensure!(
identity_toml_id == id,
"id mismatch: identity.toml:id={identity_toml_id} pageserver_(.*) id={id}",
config_toml_id == id,
"id mismatch: config_toml.id={config_toml_id} id={id}",
);
id
},

View File

@@ -127,13 +127,10 @@ impl PageServerNode {
}
// Apply the user-provided overrides
overrides.push({
let mut doc =
toml_edit::ser::to_document(&conf).expect("we deserialized this from toml earlier");
// `id` is written out to `identity.toml` instead of `pageserver.toml`
doc.remove("id").expect("it's part of the struct");
doc.to_string()
});
overrides.push(
toml_edit::ser::to_string_pretty(&conf)
.expect("we deserialized this from toml earlier"),
);
// Turn `overrides` into a toml document.
// TODO: above code is legacy code, it should be refactored to use toml_edit directly.

View File

@@ -1,6 +1,5 @@
use std::collections::HashSet;
use std::str::FromStr;
use std::time::{Duration, Instant};
use std::time::Instant;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
@@ -295,42 +294,6 @@ pub enum PlacementPolicy {
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantShardMigrateResponse {}
/// Metadata health record posted from scrubber.
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthRecord {
pub tenant_shard_id: TenantShardId,
pub healthy: bool,
pub last_scrubbed_at: chrono::DateTime<chrono::Utc>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateRequest {
pub healthy_tenant_shards: HashSet<TenantShardId>,
pub unhealthy_tenant_shards: HashSet<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthUpdateResponse {}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListUnhealthyResponse {
pub unhealthy_tenant_shards: Vec<TenantShardId>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedRequest {
#[serde(with = "humantime_serde")]
pub not_scrubbed_for: Duration,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct MetadataHealthListOutdatedResponse {
pub health_records: Vec<MetadataHealthRecord>,
}
#[cfg(test)]
mod test {
use super::*;

View File

@@ -355,8 +355,7 @@ impl RemoteStorage for AzureBlobStorage {
.blobs()
.map(|k| ListingObject{
key: self.name_to_relative_path(&k.name),
last_modified: k.properties.last_modified.into(),
size: k.properties.content_length,
last_modified: k.properties.last_modified.into()
}
);

View File

@@ -153,7 +153,6 @@ pub enum ListingMode {
pub struct ListingObject {
pub key: RemotePath,
pub last_modified: SystemTime,
pub size: u64,
}
#[derive(Default)]
@@ -195,7 +194,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send;
) -> impl Stream<Item = Result<Listing, DownloadError>>;
async fn list(
&self,
@@ -352,10 +351,10 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a + Send {
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
match self {
Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>> + Send>>,
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),

View File

@@ -368,7 +368,6 @@ impl RemoteStorage for LocalFs {
key: k.clone(),
// LocalFs is just for testing, so just specify a dummy time
last_modified: SystemTime::now(),
size: 0,
})
}
})
@@ -412,7 +411,6 @@ impl RemoteStorage for LocalFs {
key: RemotePath::from_string(&relative_key).unwrap(),
// LocalFs is just for testing
last_modified: SystemTime::now(),
size: 0,
});
}
}

View File

@@ -565,12 +565,9 @@ impl RemoteStorage for S3Bucket {
}
};
let size = object.size.unwrap_or(0) as u64;
result.keys.push(ListingObject{
key,
last_modified,
size,
last_modified
});
if let Some(mut mk) = max_keys {
assert!(mk > 0);

View File

@@ -114,7 +114,7 @@ impl RemoteStorage for UnreliableWrapper {
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> + Send {
) -> impl Stream<Item = Result<Listing, DownloadError>> {
async_stream::stream! {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;

View File

@@ -18,20 +18,20 @@ const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum Scope {
/// Provides access to all data for a specific tenant (specified in `struct Claims` below)
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
// TODO: join these two?
Tenant,
/// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
/// Should only be used e.g. for status check/tenant creation/list.
// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
// Should only be used e.g. for status check/tenant creation/list.
PageServerApi,
/// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
/// Should only be used e.g. for status check.
/// Currently also used for connection from any pageserver to any safekeeper.
// Provides blanket access to all data on the safekeeper plus safekeeper-wide APIs.
// Should only be used e.g. for status check.
// Currently also used for connection from any pageserver to any safekeeper.
SafekeeperData,
/// The scope used by pageservers in upcalls to storage controller and cloud control plane
// The scope used by pageservers in upcalls to storage controller and cloud control plane
#[serde(rename = "generations_api")]
GenerationsApi,
/// Allows access to control plane managment API and some storage controller endpoints.
// Allows access to control plane managment API and some storage controller endpoints.
Admin,
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state

View File

@@ -356,6 +356,8 @@ struct PageServerConfigBuilder {
auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
id: BuilderValue<NodeId>,
broker_endpoint: BuilderValue<Uri>,
broker_keepalive_interval: BuilderValue<Duration>,
@@ -404,8 +406,11 @@ struct PageServerConfigBuilder {
}
impl PageServerConfigBuilder {
fn new() -> Self {
Self::default()
fn new(node_id: NodeId) -> Self {
let mut this = Self::default();
this.id(node_id);
this
}
#[inline(always)]
@@ -433,6 +438,7 @@ impl PageServerConfigBuilder {
pg_auth_type: Set(AuthType::Trust),
auth_validation_public_key_path: Set(None),
remote_storage_config: Set(None),
id: NotSet,
broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT
.parse()
.expect("failed to parse default broker endpoint")),
@@ -562,6 +568,10 @@ impl PageServerConfigBuilder {
self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval)
}
pub fn id(&mut self, node_id: NodeId) {
self.id = BuilderValue::Set(node_id)
}
pub fn log_format(&mut self, log_format: LogFormat) {
self.log_format = BuilderValue::Set(log_format)
}
@@ -673,7 +683,7 @@ impl PageServerConfigBuilder {
self.l0_flush = BuilderValue::Set(value);
}
pub fn build(self, id: NodeId) -> anyhow::Result<PageServerConf> {
pub fn build(self) -> anyhow::Result<PageServerConf> {
let default = Self::default_values();
macro_rules! conf {
@@ -706,6 +716,7 @@ impl PageServerConfigBuilder {
pg_auth_type,
auth_validation_public_key_path,
remote_storage_config,
id,
broker_endpoint,
broker_keepalive_interval,
log_format,
@@ -733,7 +744,6 @@ impl PageServerConfigBuilder {
}
CUSTOM LOGIC
{
id: id,
// TenantConf is handled separately
default_tenant_conf: TenantConf::default(),
concurrent_tenant_warmup: ConfigurableSemaphore::new({
@@ -883,7 +893,7 @@ impl PageServerConf {
toml: &Document,
workdir: &Utf8Path,
) -> anyhow::Result<Self> {
let mut builder = PageServerConfigBuilder::new();
let mut builder = PageServerConfigBuilder::new(node_id);
builder.workdir(workdir.to_owned());
let mut t_conf = TenantConfOpt::default();
@@ -914,6 +924,8 @@ impl PageServerConf {
"tenant_config" => {
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
}
"id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
// Logging is not set up yet, so we can't do it.
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
"log_format" => builder.log_format(
@@ -1006,7 +1018,7 @@ impl PageServerConf {
}
}
let mut conf = builder.build(node_id).context("invalid config")?;
let mut conf = builder.build().context("invalid config")?;
if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
let auth_validation_public_key_path = conf
@@ -1243,6 +1255,7 @@ max_file_descriptors = 333
# initial superuser role name to use when creating a new tenant
initial_superuser_name = 'zzzz'
id = 10
metric_collection_interval = '222 s'
metric_collection_endpoint = 'http://localhost:80/metrics'
@@ -1259,8 +1272,9 @@ background_task_maximum_delay = '334 s'
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
// we have to create dummy values to overcome the validation errors
let config_string =
format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",);
let config_string = format!(
"pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
);
let toml = config_string.parse()?;
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
@@ -1565,6 +1579,7 @@ broker_endpoint = '{broker_endpoint}'
r#"pg_distrib_dir = "{pg_distrib_dir}"
metric_collection_endpoint = "http://sample.url"
metric_collection_interval = "10min"
id = 222
[disk_usage_based_eviction]
max_usage_pct = 80
@@ -1634,6 +1649,7 @@ threshold = "20m"
r#"pg_distrib_dir = "{pg_distrib_dir}"
metric_collection_endpoint = "http://sample.url"
metric_collection_interval = "10min"
id = 222
[tenant_config]
evictions_low_residence_duration_metric_threshold = "20m"

View File

@@ -2129,24 +2129,14 @@ async fn secondary_download_handler(
let timeout = wait.unwrap_or(Duration::MAX);
let result = tokio::time::timeout(
let status = match tokio::time::timeout(
timeout,
state.secondary_controller.download_tenant(tenant_shard_id),
)
.await;
let progress = secondary_tenant.progress.lock().unwrap().clone();
let status = match result {
Ok(Ok(())) => {
if progress.layers_downloaded >= progress.layers_total {
// Download job ran to completion
StatusCode::OK
} else {
// Download dropped out without errors because it ran out of time budget
StatusCode::ACCEPTED
}
}
.await
{
// Download job ran to completion.
Ok(Ok(())) => StatusCode::OK,
// Edge case: downloads aren't usually fallible: things like a missing heatmap are considered
// okay. We could get an error here in the unlikely edge case that the tenant
// was detached between our check above and executing the download job.
@@ -2156,6 +2146,8 @@ async fn secondary_download_handler(
Err(_) => StatusCode::ACCEPTED,
};
let progress = secondary_tenant.progress.lock().unwrap().clone();
json_response(status, progress)
}

View File

@@ -2,23 +2,13 @@ use std::{num::NonZeroUsize, sync::Arc};
use crate::tenant::ephemeral_file;
#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)]
#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
pub enum L0FlushConfig {
#[default]
PageCached,
#[serde(rename_all = "snake_case")]
Direct {
max_concurrency: NonZeroUsize,
},
}
impl Default for L0FlushConfig {
fn default() -> Self {
Self::Direct {
// TODO: using num_cpus results in different peak memory usage on different instance types.
max_concurrency: NonZeroUsize::new(usize::max(1, num_cpus::get())).unwrap(),
}
}
Direct { max_concurrency: NonZeroUsize },
}
#[derive(Clone)]

View File

@@ -613,23 +613,7 @@ pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_total",
"Size of data written into image layers before compression"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_considered",
"Size of potentially compressible data written into image layers before compression"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_chosen",
"Size of data whose compressed form was written into image layers"
"Size of uncompressed data written into image layers"
)
.expect("failed to define a metric")
});

View File

@@ -28,12 +28,6 @@ use crate::virtual_file::VirtualFile;
use std::cmp::min;
use std::io::{Error, ErrorKind};
#[derive(Copy, Clone, Debug)]
pub struct CompressionInfo {
pub written_compressed: bool,
pub compressed_size: Option<usize>,
}
impl<'a> BlockCursor<'a> {
/// Read a blob into a new buffer.
pub async fn read_blob(
@@ -279,10 +273,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
srcbuf: B,
ctx: &RequestContext,
) -> (B::Buf, Result<u64, Error>) {
let (buf, res) = self
.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
.await;
(buf, res.map(|(off, _compression_info)| off))
self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
.await
}
/// Write a blob of data. Returns the offset that it was written to,
@@ -292,12 +284,8 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
srcbuf: B,
ctx: &RequestContext,
algorithm: ImageCompressionAlgorithm,
) -> (B::Buf, Result<(u64, CompressionInfo), Error>) {
) -> (B::Buf, Result<u64, Error>) {
let offset = self.offset;
let mut compression_info = CompressionInfo {
written_compressed: false,
compressed_size: None,
};
let len = srcbuf.bytes_init();
@@ -340,9 +328,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
encoder.write_all(&slice[..]).await.unwrap();
encoder.shutdown().await.unwrap();
let compressed = encoder.into_inner();
compression_info.compressed_size = Some(compressed.len());
if compressed.len() < len {
compression_info.written_compressed = true;
let compressed_len = compressed.len();
compressed_buf = Some(compressed);
(BYTE_ZSTD, compressed_len, slice.into_inner())
@@ -373,7 +359,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
} else {
self.write_all(srcbuf, ctx).await
};
(srcbuf, res.map(|_| (offset, compression_info)))
(srcbuf, res.map(|_| offset))
}
}
@@ -430,14 +416,12 @@ pub(crate) mod tests {
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = if compression {
let res = wtr
.write_blob_maybe_compressed(
blob.clone(),
ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await;
(res.0, res.1.map(|(off, _)| off))
wtr.write_blob_maybe_compressed(
blob.clone(),
ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await
} else {
wtr.write_blob(blob.clone(), ctx).await
};

View File

@@ -1384,32 +1384,34 @@ impl TenantManager {
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
let remote_path = remote_tenant_path(&tenant_shard_id);
let mut keys_stream = self.resources.remote_storage.list_streaming(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
);
while let Some(chunk) = keys_stream.next().await {
let keys = match chunk {
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
let keys = match self
.resources
.remote_storage
.list(
Some(&remote_path),
remote_storage::ListingMode::NoDelimiter,
None,
&self.cancel,
)
.await
{
Ok(listing) => listing.keys,
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
if keys.is_empty() {
tracing::info!("Remote storage already deleted");
} else {
tracing::info!("Deleting {} keys from remote storage", keys.len());
let keys = keys.into_iter().map(|o| o.key).collect::<Vec<_>>();
self.resources
.remote_storage
.delete_objects(&keys, &self.cancel)
.await?;
}
Ok(())

View File

@@ -467,7 +467,7 @@ impl DeltaLayerWriterInner {
.write_blob_maybe_compressed(val, ctx, compression)
.await;
let off = match res {
Ok((off, _)) => off,
Ok(off) => off,
Err(e) => return (val, Err(anyhow::anyhow!(e))),
};

View File

@@ -734,14 +734,6 @@ struct ImageLayerWriterInner {
// Total uncompressed bytes passed into put_image
uncompressed_bytes: u64,
// Like `uncompressed_bytes`,
// but only of images we might consider for compression
uncompressed_bytes_eligible: u64,
// Like `uncompressed_bytes`, but only of images
// where we have chosen their compressed form
uncompressed_bytes_chosen: u64,
blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
@@ -798,8 +790,6 @@ impl ImageLayerWriterInner {
tree: tree_builder,
blob_writer,
uncompressed_bytes: 0,
uncompressed_bytes_eligible: 0,
uncompressed_bytes_chosen: 0,
};
Ok(writer)
@@ -818,22 +808,13 @@ impl ImageLayerWriterInner {
) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let compression = self.conf.image_compression;
let uncompressed_len = img.len() as u64;
self.uncompressed_bytes += uncompressed_len;
self.uncompressed_bytes += img.len() as u64;
let (_img, res) = self
.blob_writer
.write_blob_maybe_compressed(img, ctx, compression)
.await;
// TODO: re-use the buffer for `img` further upstack
let (off, compression_info) = res?;
if compression_info.compressed_size.is_some() {
// The image has been considered for compression at least
self.uncompressed_bytes_eligible += uncompressed_len;
}
if compression_info.written_compressed {
// The image has been compressed
self.uncompressed_bytes_chosen += uncompressed_len;
}
let off = res?;
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
key.write_to_byte_slice(&mut keybuf);
@@ -856,9 +837,6 @@ impl ImageLayerWriterInner {
// Calculate compression ratio
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
.inc_by(self.uncompressed_bytes_eligible);
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
let mut file = self.blob_writer.into_inner();

View File

@@ -58,7 +58,7 @@ use std::{
sync::atomic::AtomicU64,
};
use std::{
cmp::{max, min},
cmp::{max, min, Ordering},
ops::ControlFlow,
};
use std::{
@@ -177,6 +177,25 @@ impl std::fmt::Display for ImageLayerCreationMode {
}
}
/// Wrapper for key range to provide reverse ordering by range length for BinaryHeap
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Hole {
key_range: Range<Key>,
coverage_size: usize,
}
impl Ord for Hole {
fn cmp(&self, other: &Self) -> Ordering {
other.coverage_size.cmp(&self.coverage_size) // inverse order
}
}
impl PartialOrd for Hole {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {

View File

@@ -30,8 +30,8 @@ use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPA
use crate::tenant::remote_timeline_client::WaitCompletionError;
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
use crate::tenant::timeline::ImageLayerCreationOutcome;
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -608,93 +608,62 @@ impl Timeline {
.read_lock_held_spawn_blocking_startup_micros
.till_now();
// TODO: replace with streaming k-merge
let all_keys = {
let mut all_keys = Vec::new();
for l in deltas_to_compact.iter() {
all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
}
// The current stdlib sorting implementation is designed in a way where it is
// particularly fast where the slice is made up of sorted sub-ranges.
all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
all_keys
};
// Determine N largest holes where N is number of compacted layers.
let max_holes = deltas_to_compact.len();
let last_record_lsn = self.get_last_record_lsn();
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
let min_hole_coverage_size = 3; // TODO: something more flexible?
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
let mut prev: Option<Key> = None;
let mut all_keys = Vec::new();
for l in deltas_to_compact.iter() {
all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?);
}
// FIXME: should spawn_blocking the rest of this function
// The current stdlib sorting implementation is designed in a way where it is
// particularly fast where the slice is made up of sorted sub-ranges.
all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
// Determine N largest holes where N is number of compacted layers. The vec is sorted by key range start.
//
// A hole is a key range for which this compaction doesn't have any WAL records.
// Our goal in this compaction iteration is to avoid creating L1s that, in terms of their key range,
// cover the hole, but actually don't contain any WAL records for that key range.
// The reason is that the mere stack of L1s (`count_deltas`) triggers image layer creation (`create_image_layers`).
// That image layer creation would be useless for a hole range covered by L1s that don't contain any WAL records.
//
// The algorithm chooses holes as follows.
// - Slide a 2-window over the keys in key orde to get the hole range (=distance between two keys).
// - Filter: min threshold on range length
// - Rank: by coverage size (=number of image layers required to reconstruct each key in the range for which we have any data)
//
// For more details, intuition, and some ASCII art see https://github.com/neondatabase/neon/pull/3597#discussion_r1112704451
#[derive(PartialEq, Eq)]
struct Hole {
key_range: Range<Key>,
coverage_size: usize,
}
let holes: Vec<Hole> = {
use std::cmp::Ordering;
impl Ord for Hole {
fn cmp(&self, other: &Self) -> Ordering {
self.coverage_size.cmp(&other.coverage_size).reverse()
}
}
impl PartialOrd for Hole {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
let max_holes = deltas_to_compact.len();
let last_record_lsn = self.get_last_record_lsn();
let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128;
let min_hole_coverage_size = 3; // TODO: something more flexible?
// min-heap (reserve space for one more element added before eviction)
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
let mut prev: Option<Key> = None;
for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
if let Some(prev_key) = prev {
// just first fast filter, do not create hole entries for metadata keys. The last hole in the
// compaction is the gap between data key and metadata keys.
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
&& !Key::is_metadata_key(&prev_key)
{
let key_range = prev_key..next_key;
// Measuring hole by just subtraction of i128 representation of key range boundaries
// has not so much sense, because largest holes will corresponds field1/field2 changes.
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
// That is why it is better to measure size of hole as number of covering image layers.
let coverage_size =
layers.image_coverage(&key_range, last_record_lsn).len();
if coverage_size >= min_hole_coverage_size {
heap.push(Hole {
key_range,
coverage_size,
});
if heap.len() > max_holes {
heap.pop(); // remove smallest hole
}
for &DeltaEntry { key: next_key, .. } in all_keys.iter() {
if let Some(prev_key) = prev {
// just first fast filter, do not create hole entries for metadata keys. The last hole in the
// compaction is the gap between data key and metadata keys.
if next_key.to_i128() - prev_key.to_i128() >= min_hole_range
&& !Key::is_metadata_key(&prev_key)
{
let key_range = prev_key..next_key;
// Measuring hole by just subtraction of i128 representation of key range boundaries
// has not so much sense, because largest holes will corresponds field1/field2 changes.
// But we are mostly interested to eliminate holes which cause generation of excessive image layers.
// That is why it is better to measure size of hole as number of covering image layers.
let coverage_size = layers.image_coverage(&key_range, last_record_lsn).len();
if coverage_size >= min_hole_coverage_size {
heap.push(Hole {
key_range,
coverage_size,
});
if heap.len() > max_holes {
heap.pop(); // remove smallest hole
}
}
}
prev = Some(next_key.next());
}
let mut holes = heap.into_vec();
holes.sort_unstable_by_key(|hole| hole.key_range.start);
holes
};
prev = Some(next_key.next());
}
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
drop_rlock(guard);
stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
let mut holes = heap.into_vec();
holes.sort_unstable_by_key(|hole| hole.key_range.start);
let mut next_hole = 0; // index of next hole in holes vector
// This iterator walks through all key-value pairs from all the layers
// we're compacting, in key, LSN order.
@@ -769,7 +738,6 @@ impl Timeline {
let mut key_values_total_size = 0u64;
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
let mut next_hole = 0; // index of next hole in holes vector
for &DeltaEntry {
key, lsn, ref val, ..

153
poetry.lock generated
View File

@@ -870,96 +870,6 @@ files = [
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "clickhouse-connect"
version = "0.7.17"
description = "ClickHouse Database Core Driver for Python, Pandas, and Superset"
optional = false
python-versions = "~=3.8"
files = [
{file = "clickhouse-connect-0.7.17.tar.gz", hash = "sha256:854f1f9f3e024e7f89ae5d57cd3289d7a4c3dc91a9f24c4d233014f0ea19cb2d"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:aca36f5f28be1ada2981fce87724bbf451f267c918015baec59e527de3c9c882"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66209e4634f457604c263bea176336079d26c284e251e68a8435b0b80c1a25ff"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4d86c5a561a2a99321c8b4af22257461b8e67142f34cfea6e70f39b45b1f406"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d200c9afa2725a96f9f3718221f641276b80c11bf504d8a2fbaafb5a05b2f0d3"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004d867b1005445a46e6742db1054bf2a717a451372663b46e09b5e9e90a31e3"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4ef94a4a8e008882259151833c3c47cfbb9c8f08de0f100aaf3b95c366dcfb24"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ee732c3df50c8b07d16b5836ff85e6b84569922455c03837c3add5cf1388fe1f"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d9dbe1235465bb946e24b90b0ca5b8800b5d645acb2d7d6ee819448c3e2fd959"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-win32.whl", hash = "sha256:e5db0d68dfb63db0297d44dc91406bcfd7d333708d7cd55086c8550fbf870b78"},
{file = "clickhouse_connect-0.7.17-cp310-cp310-win_amd64.whl", hash = "sha256:800750f568c097ea312887785025006d6098bffd8ed2dd6a57048fb3ced6d778"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4eb390623b3d15dc9cda78f5c68f83ef9ad11743797e70af8fabc384b015a73c"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:35f172ca950f218f63072024c81d5b4ff6e5399620c255506c321ccc7b17c9a5"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae7918f060f7576fc931c692e0122b1b07576fabd81444af22e1f8582300d200"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff2881b93c7a1afb9c99fb59ad5fd666850421325d0931e2b77f3f4ba872303d"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a4d9b4f97271addf66aadbaf7f154f19a0ad6c22026d575a995c55ebd8576db"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e431469b1ff2d5c3e4c406d55c6afdf7102f5d2524c2ceb5481b94ac24412aa3"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b6f80115176559f181a6b3ecad11aa3d70ef6014c3d2905b90fcef3f27d25c2"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d8ac694f40dfafc8a3cc877116b4bc73e8877ebf66d4d96ee092484ee4c0b481"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-win32.whl", hash = "sha256:78b7a3f6b0fad4eaf8afb5f9a2e855bde53e82ea5804960e9cf779538f4606a1"},
{file = "clickhouse_connect-0.7.17-cp311-cp311-win_amd64.whl", hash = "sha256:efd390cc045334ecc3f2a9c18cc07c041d0288b145967805fdcab65abeefa75f"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9228334a17dc0a7842222f54ba5b89fc563532424aad4f66be799df70ab37e9f"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e432a42bb788bda77e88eda2774392a60fbbb5ee2a79cb2881d182d26c45fe49"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c85152ed2879965ee1fa2bd5e31fb27d281fd5f50d6e86a401efd95cd85b29ef"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29a126104aa5e11df570cbd89fca4988784084602ba77d17b2396b334c54fd75"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:882d8f9570549258e6eb6a97915fbf64ed29fe395d5e360866ea8d42c8283a35"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:06ebf99111171442f462fb8b357364c3e276da3e8f8557b2e8fee9eb55ab37d1"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e0cf6f99b2777b0d164bf8b65ec39104cdc0789a56bcb52d98289bbd6f5cc70e"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ee46c508fddfff3b7ac52326788e0c6dd8dfb416b6d7e02e5d30e8110749dac2"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-win32.whl", hash = "sha256:eb708b590a37d56b069a6088254ffa55d73b8cb65527339df81ef03fe67ffdec"},
{file = "clickhouse_connect-0.7.17-cp312-cp312-win_amd64.whl", hash = "sha256:17f00dccddaeaf43733faa1fa21f7d24641454a73669fda862545ba7c88627f5"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ab5d4b37a6dcc39e94c63beac0f22d9dda914f5eb865d166c64cf04dfadb7d16"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32aa90387f45f34cbc5a984789ed4c12760a3c0056c190ab0123ceafc36b1002"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21277b6bdd6c8ff14170bfcd52125c5c39f442ec4bafbb643ad7d0ca915f0029"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ca68d8b7dee3fb4e7229e06152f5b0faaccafb4c87d9c2d48fa5bd117a3cc1c0"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:841c56282102b2fba1e0b332bb1c7a0c50992fbc321746af8d3e0e6ca2450e8b"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:8d7ffde5a4b95d8fe9ed38e08e504e497310e3d7a17691bd40bf65734648fdfc"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:055960086b6b92b6e44f5ba04c81c40c10b038588e4b3908b033c99f66125332"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:36491fec63ceb8503b6344c23477647030139f346b749dc5ee672c505939dbbe"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-win32.whl", hash = "sha256:8779a907e026db32e6bc0bc0c8d5de0e2e3afd166afc2d4adcc0603399af5539"},
{file = "clickhouse_connect-0.7.17-cp38-cp38-win_amd64.whl", hash = "sha256:309854fa197885c6278438ddd032ab52e6fec56f162074e343c3635ca7266078"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8009f94550178dc971aeb4f8787ba7a5b473c22647490428b7229f540a51d2b"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:70f8422f407b13a404b3670fd097855abd5adaf890c710d6678d2b46ab61ac48"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:082783eb1e8baf7b3465dd045132dc5cb5a91432c899dc4e19891c5f782d8d23"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c30aad2a9c7584c4ee19e646a087b3bbd2d4daab3d88a2afeeae1a7f6febf9"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fc8e245a9f4f0dce39f155e626405f60f1d3cf4d1e52dd2c793ea6b603ca111b"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:802372cb8a69c9ffdf4260e9f01616c8601ba531825ed6f08834827e0b880cd1"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:193a60271a3b105cdbde96fb20b40eab8a50fca3bb1f397546f7a18b53d9aa9c"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:59d58932916792fdbd09cb961a245a0c2d87b07b8296f9138915b998f4522941"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-win32.whl", hash = "sha256:3cfd0edabb589f640636a97ffc38d1b3d760faef208d44e50829cc1ad3f0d3e5"},
{file = "clickhouse_connect-0.7.17-cp39-cp39-win_amd64.whl", hash = "sha256:5661b4629aac228481219abf2e149119af1a71d897f191665e182d9d192d7033"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7429d309109e7e4a70fd867d69fcfea9ddcb1a1e910caa6b0e2c3776b71f4613"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5ae619151006da84a0b1585a9bcc81be32459d8061aeb2e116bad5bbaa7d108"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec0c84a0880621cb2389656a89886ef3133f0b3f8dc016eee6f25bbb49ff6f70"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705464c23f821666b76f8f619cf2870225156276562756b3933aaa24708e0ff8"},
{file = "clickhouse_connect-0.7.17-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1822016f4b769e89264fe26cefe0bc5e50e4c3ca0747d89bb52d57dc4f1e5ffb"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:6c92b0c342c1fbfa666010e8175e05026dc570a7ef91d8fa81ce503180f318aa"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2e106536540e906c3c866f8615fcf870a9a77c1bfab9ef4b042febfd2fdb953"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bac9a32e62384b4341ba51a451084eb3b00c6e59aaac1499145dd8b897cb585c"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0feed93b9912b7862a8c41be1febcd44b68a824a5c1059b19d5c567afdaa6273"},
{file = "clickhouse_connect-0.7.17-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:2e2dd6db52e799f065fd565143fde5a872cfe903de1bee7775bc3a349856a790"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ed13add5d579a5960155f3000420544368501c9703d2fb94f103b4a6126081f6"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c257a23ed3bf1858593fb03927d9d073fbbdfa24dc2afee537c3314bd66b4e24"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d47866f64cbdc2d5cc4f8a7a8c49e3ee90c9e487091b9eda7c3a3576418e1cbe"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b850e2f17e0a0b5a37d996d3fb728050227489d64d271d678d166abea94f26e"},
{file = "clickhouse_connect-0.7.17-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:349682288987dc84ac7695f7cd6b510be8d0ec0eee7c1b72dbf2146b4e9efdb8"},
]
[package.dependencies]
certifi = "*"
lz4 = "*"
pytz = "*"
urllib3 = ">=1.26"
zstandard = "*"
[package.extras]
arrow = ["pyarrow"]
numpy = ["numpy"]
orjson = ["orjson"]
pandas = ["pandas"]
sqlalchemy = ["sqlalchemy (>1.3.21,<2.0)"]
tzlocal = ["tzlocal (>=4.0)"]
[[package]]
name = "colorama"
version = "0.4.5"
@@ -1560,56 +1470,6 @@ files = [
{file = "lazy_object_proxy-1.10.0-pp310.pp311.pp312.pp38.pp39-none-any.whl", hash = "sha256:80fa48bd89c8f2f456fc0765c11c23bf5af827febacd2f523ca5bc1893fcc09d"},
]
[[package]]
name = "lz4"
version = "4.3.3"
description = "LZ4 Bindings for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"},
{file = "lz4-4.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:222a7e35137d7539c9c33bb53fcbb26510c5748779364014235afc62b0ec797f"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f76176492ff082657ada0d0f10c794b6da5800249ef1692b35cf49b1e93e8ef7"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1d18718f9d78182c6b60f568c9a9cec8a7204d7cb6fad4e511a2ef279e4cb05"},
{file = "lz4-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6cdc60e21ec70266947a48839b437d46025076eb4b12c76bd47f8e5eb8a75dcc"},
{file = "lz4-4.3.3-cp310-cp310-win32.whl", hash = "sha256:c81703b12475da73a5d66618856d04b1307e43428a7e59d98cfe5a5d608a74c6"},
{file = "lz4-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:43cf03059c0f941b772c8aeb42a0813d68d7081c009542301637e5782f8a33e2"},
{file = "lz4-4.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:30e8c20b8857adef7be045c65f47ab1e2c4fabba86a9fa9a997d7674a31ea6b6"},
{file = "lz4-4.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2f7b1839f795315e480fb87d9bc60b186a98e3e5d17203c6e757611ef7dcef61"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edfd858985c23523f4e5a7526ca6ee65ff930207a7ec8a8f57a01eae506aaee7"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e9c410b11a31dbdc94c05ac3c480cb4b222460faf9231f12538d0074e56c563"},
{file = "lz4-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2507ee9c99dbddd191c86f0e0c8b724c76d26b0602db9ea23232304382e1f21"},
{file = "lz4-4.3.3-cp311-cp311-win32.whl", hash = "sha256:f180904f33bdd1e92967923a43c22899e303906d19b2cf8bb547db6653ea6e7d"},
{file = "lz4-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b14d948e6dce389f9a7afc666d60dd1e35fa2138a8ec5306d30cd2e30d36b40c"},
{file = "lz4-4.3.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:e36cd7b9d4d920d3bfc2369840da506fa68258f7bb176b8743189793c055e43d"},
{file = "lz4-4.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31ea4be9d0059c00b2572d700bf2c1bc82f241f2c3282034a759c9a4d6ca4dc2"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c9a6fd20767ccaf70649982f8f3eeb0884035c150c0b818ea660152cf3c809"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca8fccc15e3add173da91be8f34121578dc777711ffd98d399be35487c934bf"},
{file = "lz4-4.3.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7d84b479ddf39fe3ea05387f10b779155fc0990125f4fb35d636114e1c63a2e"},
{file = "lz4-4.3.3-cp312-cp312-win32.whl", hash = "sha256:337cb94488a1b060ef1685187d6ad4ba8bc61d26d631d7ba909ee984ea736be1"},
{file = "lz4-4.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:5d35533bf2cee56f38ced91f766cd0038b6abf46f438a80d50c52750088be93f"},
{file = "lz4-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:363ab65bf31338eb364062a15f302fc0fab0a49426051429866d71c793c23394"},
{file = "lz4-4.3.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0a136e44a16fc98b1abc404fbabf7f1fada2bdab6a7e970974fb81cf55b636d0"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:abc197e4aca8b63f5ae200af03eb95fb4b5055a8f990079b5bdf042f568469dd"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56f4fe9c6327adb97406f27a66420b22ce02d71a5c365c48d6b656b4aaeb7775"},
{file = "lz4-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f0e822cd7644995d9ba248cb4b67859701748a93e2ab7fc9bc18c599a52e4604"},
{file = "lz4-4.3.3-cp38-cp38-win32.whl", hash = "sha256:24b3206de56b7a537eda3a8123c644a2b7bf111f0af53bc14bed90ce5562d1aa"},
{file = "lz4-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:b47839b53956e2737229d70714f1d75f33e8ac26e52c267f0197b3189ca6de24"},
{file = "lz4-4.3.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6756212507405f270b66b3ff7f564618de0606395c0fe10a7ae2ffcbbe0b1fba"},
{file = "lz4-4.3.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ee9ff50557a942d187ec85462bb0960207e7ec5b19b3b48949263993771c6205"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b901c7784caac9a1ded4555258207d9e9697e746cc8532129f150ffe1f6ba0d"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6d9ec061b9eca86e4dcc003d93334b95d53909afd5a32c6e4f222157b50c071"},
{file = "lz4-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f4c7bf687303ca47d69f9f0133274958fd672efaa33fb5bcde467862d6c621f0"},
{file = "lz4-4.3.3-cp39-cp39-win32.whl", hash = "sha256:054b4631a355606e99a42396f5db4d22046a3397ffc3269a348ec41eaebd69d2"},
{file = "lz4-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:eac9af361e0d98335a02ff12fb56caeb7ea1196cf1a49dbf6f17828a131da807"},
{file = "lz4-4.3.3.tar.gz", hash = "sha256:01fe674ef2889dbb9899d8a67361e0c4a2c833af5aeb37dd505727cf5d2a131e"},
]
[package.extras]
docs = ["sphinx (>=1.6.0)", "sphinx-bootstrap-theme"]
flake8 = ["flake8"]
tests = ["psutil", "pytest (!=3.3.0)", "pytest-cov"]
[[package]]
name = "markupsafe"
version = "2.1.1"
@@ -2501,17 +2361,6 @@ files = [
[package.dependencies]
six = ">=1.5"
[[package]]
name = "pytz"
version = "2024.1"
description = "World timezone definitions, modern and historical"
optional = false
python-versions = "*"
files = [
{file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"},
{file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"},
]
[[package]]
name = "pywin32"
version = "301"
@@ -3357,4 +3206,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "7cee6a8c30bc7f4bfb0a87c6bad3952dfb4da127fad853d2710a93ac3eab8a00"
content-hash = "16ebd6a46768be7f67dbdb4ee5903b167d94edc9965f29252f038c67e9e907b0"

View File

@@ -5,6 +5,4 @@ pub use limit_algorithm::{
};
pub use limiter::{BucketRateLimiter, GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
mod leaky_bucket;
pub use leaky_bucket::{
EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState,
};
pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter};

View File

@@ -1,6 +1,7 @@
use std::{
hash::Hash,
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};
use ahash::RandomState;
@@ -16,7 +17,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter<EndpointIdInt>;
pub struct LeakyBucketRateLimiter<Key> {
map: DashMap<Key, LeakyBucketState, RandomState>,
config: LeakyBucketConfig,
config: LeakyBucketConfigInner,
access_count: AtomicUsize,
}
@@ -29,7 +30,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self {
Self {
map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards),
config,
config: config.into(),
access_count: AtomicUsize::new(0),
}
}
@@ -42,10 +43,10 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
self.do_gc(now);
}
let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState {
time: now,
filled: 0.0,
});
let mut entry = self
.map
.entry(key)
.or_insert_with(|| LeakyBucketState::new(now));
entry.check(&self.config, now, n as f64)
}
@@ -59,7 +60,7 @@ impl<K: Hash + Eq> LeakyBucketRateLimiter<K> {
let shard = thread_rng().gen_range(0..n);
self.map.shards()[shard]
.write()
.retain(|_, value| !value.get_mut().update(&self.config, now));
.retain(|_, value| value.get().should_retain(now));
}
}
@@ -68,11 +69,6 @@ pub struct LeakyBucketConfig {
pub max: f64,
}
pub struct LeakyBucketState {
filled: f64,
time: Instant,
}
impl LeakyBucketConfig {
pub fn new(rps: f64, max: f64) -> Self {
assert!(rps > 0.0, "rps must be positive");
@@ -81,40 +77,76 @@ impl LeakyBucketConfig {
}
}
impl LeakyBucketState {
pub fn new() -> Self {
struct LeakyBucketConfigInner {
/// "time cost" of a single request unit.
/// loosely represents how long it takes to handle a request unit in active CPU time.
time_cost: Duration,
bucket_width: Duration,
}
impl From<LeakyBucketConfig> for LeakyBucketConfigInner {
fn from(config: LeakyBucketConfig) -> Self {
// seconds_per_request = 1/(request_per_second)
let spr = config.rps.recip();
Self {
filled: 0.0,
time: Instant::now(),
time_cost: Duration::from_secs_f64(spr),
bucket_width: Duration::from_secs_f64(config.max * spr),
}
}
/// updates the timer and returns true if the bucket is empty
fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool {
let drain = now.duration_since(self.time);
let drain = drain.as_secs_f64() * info.rps;
self.filled = (self.filled - drain).clamp(0.0, info.max);
self.time = now;
self.filled == 0.0
}
pub fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool {
self.update(info, now);
if self.filled + n > info.max {
return false;
}
self.filled += n;
true
}
}
impl Default for LeakyBucketState {
fn default() -> Self {
Self::new()
struct LeakyBucketState {
/// Bucket is represented by `start..end` where `start = end - config.bucket_width`.
///
/// At any given time, `end - now` represents the number of tokens in the bucket, multiplied by the "time_cost".
/// Adding `n` tokens to the bucket is done by moving `end` forward by `n * config.time_cost`.
/// If `now < start`, the bucket is considered filled and cannot accept any more tokens.
/// Draining the bucket will happen naturally as `now` moves forward.
///
/// Let `n` be some "time cost" for the request,
/// If now is after end, the bucket is empty and the end is reset to now,
/// If now is within the `bucket window + n`, we are within time budget.
/// If now is before the `bucket window + n`, we have run out of budget.
///
/// This is inspired by the generic cell rate algorithm (GCRA) and works
/// exactly the same as a leaky-bucket.
end: Instant,
}
impl LeakyBucketState {
fn new(now: Instant) -> Self {
Self { end: now }
}
fn should_retain(&self, now: Instant) -> bool {
// if self.end is after now, the bucket is not empty
now < self.end
}
fn check(&mut self, config: &LeakyBucketConfigInner, now: Instant, n: f64) -> bool {
let start = self.end - config.bucket_width;
let n = config.time_cost.mul_f64(n);
// start end
// | start+n | end+n
// | / | /
// ------{o-[---------o-}--]----o----
// now1 ^ now2 ^ ^ now3
//
// at now1, the bucket would be completely filled if we add n tokens.
// at now2, the bucket would be partially filled if we add n tokens.
// at now3, the bucket would start completely empty before we add n tokens.
if self.end + n <= now {
self.end = now + n;
true
} else if start + n <= now {
self.end += n;
true
} else {
false
}
}
}
@@ -124,47 +156,50 @@ mod tests {
use tokio::time::Instant;
use super::{LeakyBucketConfig, LeakyBucketState};
use super::{LeakyBucketConfig, LeakyBucketConfigInner, LeakyBucketState};
#[tokio::test(start_paused = true)]
async fn check() {
let info = LeakyBucketConfig::new(500.0, 2000.0);
let mut bucket = LeakyBucketState::new();
let config: LeakyBucketConfigInner = LeakyBucketConfig::new(500.0, 2000.0).into();
assert_eq!(config.time_cost, Duration::from_millis(2));
assert_eq!(config.bucket_width, Duration::from_secs(4));
let mut bucket = LeakyBucketState::new(Instant::now());
// should work for 2000 requests this second
for _ in 0..2000 {
assert!(bucket.check(&info, Instant::now(), 1.0));
assert!(bucket.check(&config, Instant::now(), 1.0));
}
assert!(!bucket.check(&info, Instant::now(), 1.0));
assert_eq!(bucket.filled, 2000.0);
assert!(!bucket.check(&config, Instant::now(), 1.0));
assert_eq!(bucket.end - Instant::now(), config.bucket_width);
// in 1ms we should drain 0.5 tokens.
// make sure we don't lose any tokens
tokio::time::advance(Duration::from_millis(1)).await;
assert!(!bucket.check(&info, Instant::now(), 1.0));
assert!(!bucket.check(&config, Instant::now(), 1.0));
tokio::time::advance(Duration::from_millis(1)).await;
assert!(bucket.check(&info, Instant::now(), 1.0));
assert!(bucket.check(&config, Instant::now(), 1.0));
// in 10ms we should drain 5 tokens
tokio::time::advance(Duration::from_millis(10)).await;
for _ in 0..5 {
assert!(bucket.check(&info, Instant::now(), 1.0));
assert!(bucket.check(&config, Instant::now(), 1.0));
}
assert!(!bucket.check(&info, Instant::now(), 1.0));
assert!(!bucket.check(&config, Instant::now(), 1.0));
// in 10s we should drain 5000 tokens
// but cap is only 2000
tokio::time::advance(Duration::from_secs(10)).await;
for _ in 0..2000 {
assert!(bucket.check(&info, Instant::now(), 1.0));
assert!(bucket.check(&config, Instant::now(), 1.0));
}
assert!(!bucket.check(&info, Instant::now(), 1.0));
assert!(!bucket.check(&config, Instant::now(), 1.0));
// should sustain 500rps
for _ in 0..2000 {
tokio::time::advance(Duration::from_millis(10)).await;
for _ in 0..5 {
assert!(bucket.check(&info, Instant::now(), 1.0));
assert!(bucket.check(&config, Instant::now(), 1.0));
}
}
}

View File

@@ -41,7 +41,6 @@ zstandard = "^0.21.0"
httpx = {extras = ["http2"], version = "^0.26.0"}
pytest-repeat = "^0.9.3"
websockets = "^12.0"
clickhouse-connect = "^0.7.16"
[tool.poetry.group.dev.dependencies]
mypy = "==1.3.0"

View File

@@ -18,7 +18,6 @@ anyhow.workspace = true
aws-config.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
clap.workspace = true
fail.workspace = true
futures.workspace = true
@@ -45,17 +44,12 @@ scopeguard.workspace = true
strum.workspace = true
strum_macros.workspace = true
diesel = { version = "2.1.4", features = [
"serde_json",
"postgres",
"r2d2",
"chrono",
] }
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
dns-lookup = { version = "2.0.4" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -1 +0,0 @@
DROP TABLE metadata_health;

View File

@@ -1,14 +0,0 @@
CREATE TABLE metadata_health (
tenant_id VARCHAR NOT NULL,
shard_number INTEGER NOT NULL,
shard_count INTEGER NOT NULL,
PRIMARY KEY(tenant_id, shard_number, shard_count),
-- Rely on cascade behavior for delete
FOREIGN KEY(tenant_id, shard_number, shard_count) REFERENCES tenant_shards ON DELETE CASCADE,
healthy BOOLEAN NOT NULL DEFAULT TRUE,
last_scrubbed_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
INSERT INTO metadata_health(tenant_id, shard_number, shard_count)
SELECT tenant_id, shard_number, shard_count FROM tenant_shards;

View File

@@ -1 +0,0 @@
DROP TABLE leader;

View File

@@ -1,6 +0,0 @@
CREATE TABLE leader (
hostname VARCHAR NOT NULL,
port INTEGER NOT NULL,
started_at TIMESTAMPTZ NOT NULL,
PRIMARY KEY(hostname, port, started_at)
);

View File

@@ -10,11 +10,7 @@ use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::controller_api::{
MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse,
MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
TenantCreateRequest,
};
use pageserver_api::controller_api::TenantCreateRequest;
use pageserver_api::models::{
TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
@@ -564,51 +560,6 @@ async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, A
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_metadata_health_update(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Scrubber)?;
let update_req = json_request::<MetadataHealthUpdateRequest>(&mut req).await?;
let state = get_state(&req);
state.service.metadata_health_update(update_req).await?;
json_response(StatusCode::OK, MetadataHealthUpdateResponse {})
}
async fn handle_metadata_health_list_unhealthy(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let unhealthy_tenant_shards = state.service.metadata_health_list_unhealthy().await?;
json_response(
StatusCode::OK,
MetadataHealthListUnhealthyResponse {
unhealthy_tenant_shards,
},
)
}
async fn handle_metadata_health_list_outdated(
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let list_outdated_req = json_request::<MetadataHealthListOutdatedRequest>(&mut req).await?;
let state = get_state(&req);
let health_records = state
.service
.metadata_health_list_outdated(list_outdated_req.not_scrubbed_for)
.await?;
json_response(
StatusCode::OK,
MetadataHealthListOutdatedResponse { health_records },
)
}
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
@@ -1036,28 +987,6 @@ pub fn make_router(
RequestName("control_v1_cancel_node_fill"),
)
})
// Metadata health operations
.post("/control/v1/metadata_health/update", |r| {
named_request_span(
r,
handle_metadata_health_update,
RequestName("control_v1_metadata_health_update"),
)
})
.get("/control/v1/metadata_health/unhealthy", |r| {
named_request_span(
r,
handle_metadata_health_list_unhealthy,
RequestName("control_v1_metadata_health_list_unhealthy"),
)
})
.post("/control/v1/metadata_health/outdated", |r| {
named_request_span(
r,
handle_metadata_health_list_outdated,
RequestName("control_v1_metadata_health_list_outdated"),
)
})
// TODO(vlad): endpoint for cancelling drain and fill
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {

View File

@@ -10,7 +10,6 @@ mod id_lock_map;
pub mod metrics;
mod node;
mod pageserver_client;
mod peer_client;
pub mod persistence;
mod reconciler;
mod scheduler;

View File

@@ -81,9 +81,6 @@ struct Cli {
#[arg(long, default_value = "5s")]
db_connect_timeout: humantime::Duration,
#[arg(long, default_value = "false")]
start_as_candidate: bool,
/// `neon_local` sets this to the path of the neon_local repo dir.
/// Only relevant for testing.
// TODO: make `cfg(feature = "testing")`
@@ -276,8 +273,6 @@ async fn async_main() -> anyhow::Result<()> {
.unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
split_threshold: args.split_threshold,
neon_local_repo_dir: args.neon_local_repo_dir,
start_as_candidate: args.start_as_candidate,
http_service_port: args.listen.port() as i32,
};
// After loading secrets & config, but before starting anything else, apply database migrations

View File

@@ -1,104 +0,0 @@
use crate::tenant_shard::ObservedState;
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use tokio_util::sync::CancellationToken;
use reqwest::{StatusCode, Url};
use utils::{backoff, http::error::HttpErrorBody};
#[derive(Debug, Clone)]
pub(crate) struct PeerClient {
hostname: String,
port: i32,
jwt: Option<String>,
client: reqwest::Client,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum StorageControllerPeerError {
#[error("failed to deserialize error response with status code {0} at {1}: {2}")]
DeserializationError(StatusCode, Url, reqwest::Error),
#[error("storage controller peer API error ({0}): {1}")]
ApiError(StatusCode, String),
#[error("failed to send HTTP request: {0}")]
SendError(reqwest::Error),
#[error("Cancelled")]
Cancelled,
}
pub(crate) type Result<T> = std::result::Result<T, StorageControllerPeerError>;
pub(crate) trait ResponseErrorMessageExt: Sized {
fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
}
impl ResponseErrorMessageExt for reqwest::Response {
async fn error_from_body(self) -> Result<Self> {
let status = self.status();
if !(status.is_client_error() || status.is_server_error()) {
return Ok(self);
}
let url = self.url().to_owned();
Err(match self.json::<HttpErrorBody>().await {
Ok(HttpErrorBody { msg }) => StorageControllerPeerError::ApiError(status, msg),
Err(err) => StorageControllerPeerError::DeserializationError(status, url, err),
})
}
}
#[derive(Serialize, Deserialize, Debug, Default)]
pub(crate) struct GlobalObservedState(pub(crate) HashMap<TenantShardId, ObservedState>);
impl PeerClient {
pub(crate) fn new(hostname: String, port: i32, jwt: Option<String>) -> Self {
Self {
hostname,
port,
jwt,
client: reqwest::Client::new(),
}
}
async fn request_step_down(&self) -> Result<GlobalObservedState> {
let uri = format!("{}:{}/control/v1/step_down", self.hostname, self.port);
let req = self.client.put(uri);
let req = if let Some(jwt) = &self.jwt {
req.header(reqwest::header::AUTHORIZATION, format!("Bearer {jwt}"))
} else {
req
};
let res = req
.send()
.await
.map_err(StorageControllerPeerError::SendError)?;
let response = res.error_from_body().await?;
let status = response.status();
let url = response.url().to_owned();
response
.json()
.await
.map_err(|err| StorageControllerPeerError::DeserializationError(status, url, err))
}
pub(crate) async fn step_down(
&self,
cancel: &CancellationToken,
) -> Result<GlobalObservedState> {
backoff::retry(
|| self.request_step_down(),
|_e| false,
4,
8,
"Send step down request",
cancel,
)
.await
.ok_or_else(|| StorageControllerPeerError::Cancelled)
.and_then(|x| x)
}
}

View File

@@ -8,7 +8,6 @@ use self::split_state::SplitState;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::MetadataHealthRecord;
use pageserver_api::controller_api::ShardSchedulingPolicy;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
@@ -91,12 +90,6 @@ pub(crate) enum DatabaseOperation {
UpdateTenantShard,
DeleteTenant,
UpdateTenantConfig,
UpdateMetadataHealth,
ListMetadataHealth,
ListMetadataHealthUnhealthy,
ListMetadataHealthOutdated,
GetLeader,
UpdateLeader,
}
#[must_use]
@@ -314,32 +307,15 @@ impl Persistence {
&self,
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::metadata_health;
use crate::schema::tenant_shards;
let now = chrono::Utc::now();
let metadata_health_records = shards
.iter()
.map(|t| MetadataHealthPersistence {
tenant_id: t.tenant_id.clone(),
shard_number: t.shard_number,
shard_count: t.shard_count,
healthy: true,
last_scrubbed_at: now,
})
.collect::<Vec<_>>();
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::InsertTenantShards,
move |conn| -> DatabaseResult<()> {
diesel::insert_into(tenant_shards::table)
.values(&shards)
.execute(conn)?;
diesel::insert_into(metadata_health::table)
.values(&metadata_health_records)
.execute(conn)?;
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
},
)
@@ -353,10 +329,10 @@ impl Persistence {
self.with_measured_conn(
DatabaseOperation::DeleteTenant,
move |conn| -> DatabaseResult<()> {
// `metadata_health` status (if exists) is also deleted based on the cascade behavior.
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
Ok(())
},
)
@@ -699,159 +675,6 @@ impl Persistence {
)
.await
}
/// Stores all the latest metadata health updates durably. Updates existing entry on conflict.
///
/// **Correctness:** `metadata_health_updates` should all belong the tenant shards managed by the storage controller.
#[allow(dead_code)]
pub(crate) async fn update_metadata_health_records(
&self,
healthy_records: Vec<MetadataHealthPersistence>,
unhealthy_records: Vec<MetadataHealthPersistence>,
now: chrono::DateTime<chrono::Utc>,
) -> DatabaseResult<()> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::UpdateMetadataHealth,
move |conn| -> DatabaseResult<_> {
diesel::insert_into(metadata_health)
.values(&healthy_records)
.on_conflict((tenant_id, shard_number, shard_count))
.do_update()
.set((healthy.eq(true), last_scrubbed_at.eq(now)))
.execute(conn)?;
diesel::insert_into(metadata_health)
.values(&unhealthy_records)
.on_conflict((tenant_id, shard_number, shard_count))
.do_update()
.set((healthy.eq(false), last_scrubbed_at.eq(now)))
.execute(conn)?;
Ok(())
},
)
.await
}
/// Lists all the metadata health records.
#[allow(dead_code)]
pub(crate) async fn list_metadata_health_records(
&self,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
self.with_measured_conn(
DatabaseOperation::ListMetadataHealth,
move |conn| -> DatabaseResult<_> {
Ok(
crate::schema::metadata_health::table
.load::<MetadataHealthPersistence>(conn)?,
)
},
)
.await
}
/// Lists all the metadata health records that is unhealthy.
#[allow(dead_code)]
pub(crate) async fn list_unhealthy_metadata_health_records(
&self,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::ListMetadataHealthUnhealthy,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::metadata_health::table
.filter(healthy.eq(false))
.load::<MetadataHealthPersistence>(conn)?)
},
)
.await
}
/// Lists all the metadata health records that have not been updated since an `earlier` time.
#[allow(dead_code)]
pub(crate) async fn list_outdated_metadata_health_records(
&self,
earlier: chrono::DateTime<chrono::Utc>,
) -> DatabaseResult<Vec<MetadataHealthPersistence>> {
use crate::schema::metadata_health::dsl::*;
self.with_measured_conn(
DatabaseOperation::ListMetadataHealthOutdated,
move |conn| -> DatabaseResult<_> {
let query = metadata_health.filter(last_scrubbed_at.lt(earlier));
let res = query.load::<MetadataHealthPersistence>(conn)?;
Ok(res)
},
)
.await
}
/// Get the current entry from the `leader` table if one exists.
/// It is an error for the table to contain more than one entry.
pub(crate) async fn get_leader(&self) -> DatabaseResult<Option<LeaderPersistence>> {
let mut leader: Vec<LeaderPersistence> = self
.with_measured_conn(
DatabaseOperation::GetLeader,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::leader::table.load::<LeaderPersistence>(conn)?)
},
)
.await?;
if leader.len() > 1 {
return Err(DatabaseError::Logical(format!(
"More than one entry present in the leader table: {leader:?}"
)));
}
Ok(leader.pop())
}
/// Update the new leader with compare-exchange semantics. If `prev` does not
/// match the current leader entry, then the update is treated as a failure.
/// When `prev` is not specified, the update is forced.
pub(crate) async fn update_leader(
&self,
prev: Option<LeaderPersistence>,
new: LeaderPersistence,
) -> DatabaseResult<()> {
use crate::schema::leader::dsl::*;
let updated = self
.with_measured_conn(
DatabaseOperation::UpdateLeader,
move |conn| -> DatabaseResult<usize> {
let updated = match &prev {
Some(prev) => diesel::update(leader)
.filter(hostname.eq(prev.hostname.clone()))
.filter(port.eq(prev.port))
.filter(started_at.eq(prev.started_at))
.set((
hostname.eq(new.hostname.clone()),
port.eq(new.port),
started_at.eq(new.started_at),
))
.execute(conn)?,
None => diesel::insert_into(leader)
.values(new.clone())
.execute(conn)?,
};
Ok(updated)
},
)
.await?;
if updated == 0 {
return Err(DatabaseError::Logical(
"Leader table update failed".to_string(),
));
}
Ok(())
}
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
@@ -921,69 +744,3 @@ pub(crate) struct NodePersistence {
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: i32,
}
/// Tenant metadata health status that are stored durably.
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::metadata_health)]
pub(crate) struct MetadataHealthPersistence {
#[serde(default)]
pub(crate) tenant_id: String,
#[serde(default)]
pub(crate) shard_number: i32,
#[serde(default)]
pub(crate) shard_count: i32,
pub(crate) healthy: bool,
pub(crate) last_scrubbed_at: chrono::DateTime<chrono::Utc>,
}
impl MetadataHealthPersistence {
pub fn new(
tenant_shard_id: TenantShardId,
healthy: bool,
last_scrubbed_at: chrono::DateTime<chrono::Utc>,
) -> Self {
let tenant_id = tenant_shard_id.tenant_id.to_string();
let shard_number = tenant_shard_id.shard_number.0 as i32;
let shard_count = tenant_shard_id.shard_count.literal() as i32;
MetadataHealthPersistence {
tenant_id,
shard_number,
shard_count,
healthy,
last_scrubbed_at,
}
}
#[allow(dead_code)]
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
Ok(TenantShardId {
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
shard_number: ShardNumber(self.shard_number as u8),
shard_count: ShardCount::new(self.shard_count as u8),
})
}
}
impl From<MetadataHealthPersistence> for MetadataHealthRecord {
fn from(value: MetadataHealthPersistence) -> Self {
MetadataHealthRecord {
tenant_shard_id: value
.get_tenant_shard_id()
.expect("stored tenant id should be valid"),
healthy: value.healthy,
last_scrubbed_at: value.last_scrubbed_at,
}
}
}
#[derive(
Serialize, Deserialize, Queryable, Selectable, Insertable, Eq, PartialEq, Debug, Clone,
)]
#[diesel(table_name = crate::schema::leader)]
pub(crate) struct LeaderPersistence {
pub(crate) hostname: String,
pub(crate) port: i32,
pub(crate) started_at: chrono::DateTime<chrono::Utc>,
}

View File

@@ -1,23 +1,5 @@
// @generated automatically by Diesel CLI.
diesel::table! {
leader (hostname, port, started_at) {
hostname -> Varchar,
port -> Int4,
started_at -> Timestamptz,
}
}
diesel::table! {
metadata_health (tenant_id, shard_number, shard_count) {
tenant_id -> Varchar,
shard_number -> Int4,
shard_count -> Int4,
healthy -> Bool,
last_scrubbed_at -> Timestamptz,
}
}
diesel::table! {
nodes (node_id) {
node_id -> Int8,
@@ -44,4 +26,4 @@ diesel::table! {
}
}
diesel::allow_tables_to_appear_in_same_query!(leader, metadata_health, nodes, tenant_shards,);
diesel::allow_tables_to_appear_in_same_query!(nodes, tenant_shards,);

View File

@@ -16,10 +16,7 @@ use crate::{
compute_hook::NotifyError,
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
metrics::LeadershipStatusGroup,
peer_client::{GlobalObservedState, PeerClient},
persistence::{
AbortShardSplitStatus, LeaderPersistence, MetadataHealthPersistence, TenantFilter,
},
persistence::{AbortShardSplitStatus, TenantFilter},
reconciler::{ReconcileError, ReconcileUnits},
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
tenant_shard::{
@@ -36,11 +33,11 @@ use futures::{stream::FuturesUnordered, StreamExt};
use itertools::Itertools;
use pageserver_api::{
controller_api::{
MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest,
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, TenantCreateRequest,
TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse,
TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
TenantShardMigrateRequest, TenantShardMigrateResponse, UtilizationScore,
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse,
TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard,
TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest,
TenantShardMigrateResponse, UtilizationScore,
},
models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest},
};
@@ -85,6 +82,7 @@ use crate::{
ReconcilerWaiter, TenantShard,
},
};
use serde::{Deserialize, Serialize};
// For operations that should be quick, like attaching a new tenant
const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
@@ -225,7 +223,6 @@ impl ServiceState {
tenants: BTreeMap<TenantShardId, TenantShard>,
scheduler: Scheduler,
delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
initial_leadership_status: LeadershipStatus,
) -> Self {
let status = &crate::metrics::METRICS_REGISTRY
.metrics_group
@@ -233,13 +230,15 @@ impl ServiceState {
status.set(
LeadershipStatusGroup {
status: initial_leadership_status,
status: LeadershipStatus::Leader,
},
1,
);
Self {
leadership_status: initial_leadership_status,
// TODO: Starting up as Leader is a transient state. Once we enable rolling
// upgrades on the k8s side, we should start up as Candidate.
leadership_status: LeadershipStatus::Leader,
tenants,
nodes: Arc::new(nodes),
scheduler,
@@ -288,33 +287,6 @@ impl ServiceState {
0,
);
}
fn become_leader(&mut self) {
self.leadership_status = LeadershipStatus::Leader;
let status = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_leadership_status;
status.set(
LeadershipStatusGroup {
status: LeadershipStatus::Leader,
},
1,
);
status.set(
LeadershipStatusGroup {
status: LeadershipStatus::SteppedDown,
},
0,
);
status.set(
LeadershipStatusGroup {
status: LeadershipStatus::Candidate,
},
0,
);
}
}
#[derive(Clone)]
@@ -351,10 +323,6 @@ pub struct Config {
// TODO: make this cfg(feature = "testing")
pub neon_local_repo_dir: Option<PathBuf>,
pub start_as_candidate: bool,
pub http_service_port: i32,
}
impl From<DatabaseError> for ApiError {
@@ -522,10 +490,9 @@ pub(crate) enum ReconcileResultRequest {
Stop,
}
struct LeaderStepDownState {
observed: GlobalObservedState,
leader: LeaderPersistence,
}
// TODO: move this into the storcon peer client when that gets added
#[derive(Serialize, Deserialize, Debug, Default)]
pub(crate) struct GlobalObservedState(HashMap<TenantShardId, ObservedState>);
impl Service {
pub fn get_config(&self) -> &Config {
@@ -537,11 +504,15 @@ impl Service {
#[instrument(skip_all)]
async fn startup_reconcile(
self: &Arc<Service>,
leader_step_down_state: Option<LeaderStepDownState>,
bg_compute_notify_result_tx: tokio::sync::mpsc::Sender<
Result<(), (TenantShardId, NotifyError)>,
>,
) {
// For all tenant shards, a vector of observed states on nodes (where None means
// indeterminate, same as in [`ObservedStateLocation`])
let mut observed: HashMap<TenantShardId, Vec<(NodeId, Option<LocationConfig>)>> =
HashMap::new();
// Startup reconciliation does I/O to other services: whether they
// are responsive or not, we should aim to finish within our deadline, because:
// - If we don't, a k8s readiness hook watching /ready will kill us.
@@ -555,29 +526,26 @@ impl Service {
.checked_add(STARTUP_RECONCILE_TIMEOUT / 2)
.expect("Reconcile timeout is a modest constant");
let (observed, current_leader) = if let Some(state) = leader_step_down_state {
tracing::info!(
"Using observed received from leader at {}:{}",
state.leader.hostname,
state.leader.port
);
(state.observed, Some(state.leader))
} else {
(
self.build_global_observed_state(node_scan_deadline).await,
None,
)
};
// Accumulate a list of any tenant locations that ought to be detached
let mut cleanup = Vec::new();
// Send initial heartbeat requests to all nodes loaded from the database
let all_nodes = {
let locked = self.inner.read().unwrap();
locked.nodes.clone()
};
let nodes_online = self.initial_heartbeat_round(all_nodes.keys()).await;
let node_listings = self.scan_node_locations(node_scan_deadline).await;
// Send initial heartbeat requests to nodes that replied to the location listing above.
let nodes_online = self.initial_heartbeat_round(node_listings.keys()).await;
for (node_id, list_response) in node_listings {
let tenant_shards = list_response.tenant_shards;
tracing::info!(
"Received {} shard statuses from pageserver {}, setting it to Active",
tenant_shards.len(),
node_id
);
for (tenant_shard_id, conf_opt) in tenant_shards {
let shard_observations = observed.entry(tenant_shard_id).or_default();
shard_observations.push((node_id, conf_opt));
}
}
// List of tenants for which we will attempt to notify compute of their location at startup
let mut compute_notifications = Vec::new();
@@ -600,16 +568,17 @@ impl Service {
}
*nodes = Arc::new(new_nodes);
for (tenant_shard_id, observed_state) in observed.0 {
let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
for node_id in observed_state.locations.keys() {
cleanup.push((tenant_shard_id, *node_id));
}
continue;
};
tenant_shard.observed = observed_state;
for (tenant_shard_id, shard_observations) in observed {
for (node_id, observed_loc) in shard_observations {
let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else {
cleanup.push((tenant_shard_id, node_id));
continue;
};
tenant_shard
.observed
.locations
.insert(node_id, ObservedStateLocation { conf: observed_loc });
}
}
// Populate each tenant's intent state
@@ -643,22 +612,6 @@ impl Service {
tenants.len()
};
// Before making any obeservable changes to the cluster, persist self
// as leader in database and memory.
let proposed_leader = self.get_proposed_leader_info();
if let Err(err) = self
.persistence
.update_leader(current_leader, proposed_leader)
.await
{
tracing::error!("Failed to persist self as leader: {err}. Aborting start-up ...");
std::process::exit(1);
}
self.inner.write().unwrap().become_leader();
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that
// generation_pageserver in the database.
@@ -824,31 +777,6 @@ impl Service {
node_results
}
async fn build_global_observed_state(&self, deadline: Instant) -> GlobalObservedState {
let node_listings = self.scan_node_locations(deadline).await;
let mut observed = GlobalObservedState::default();
for (node_id, location_confs) in node_listings {
tracing::info!(
"Received {} shard statuses from pageserver {}",
location_confs.tenant_shards.len(),
node_id
);
for (tid, location_conf) in location_confs.tenant_shards {
let entry = observed.0.entry(tid).or_default();
entry.locations.insert(
node_id,
ObservedStateLocation {
conf: location_conf,
},
);
}
}
observed
}
/// Used during [`Self::startup_reconcile`]: detach a list of unknown-to-us tenants from pageservers.
///
/// This is safe to run in the background, because if we don't have this TenantShardId in our map of
@@ -1327,20 +1255,12 @@ impl Service {
config.max_warming_up_interval,
cancel.clone(),
);
let initial_leadership_status = if config.start_as_candidate {
LeadershipStatus::Candidate
} else {
LeadershipStatus::Leader
};
let this = Arc::new(Self {
inner: Arc::new(std::sync::RwLock::new(ServiceState::new(
nodes,
tenants,
scheduler,
delayed_reconcile_rx,
initial_leadership_status,
))),
config: config.clone(),
persistence,
@@ -1409,16 +1329,7 @@ impl Service {
return;
};
let leadership_status = this.inner.read().unwrap().get_leadership_status();
let peer_observed_state = match leadership_status {
LeadershipStatus::Candidate => this.request_step_down().await,
LeadershipStatus::Leader => None,
LeadershipStatus::SteppedDown => unreachable!(),
};
this.startup_reconcile(peer_observed_state, bg_compute_notify_result_tx)
.await;
this.startup_reconcile(bg_compute_notify_result_tx).await;
drop(startup_completion);
}
});
@@ -6184,68 +6095,6 @@ impl Service {
Ok(())
}
/// Updates scrubber metadata health check results.
pub(crate) async fn metadata_health_update(
&self,
update_req: MetadataHealthUpdateRequest,
) -> Result<(), ApiError> {
let now = chrono::offset::Utc::now();
let (healthy_records, unhealthy_records) = {
let locked = self.inner.read().unwrap();
let healthy_records = update_req
.healthy_tenant_shards
.into_iter()
// Retain only health records associated with tenant shards managed by storage controller.
.filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
.map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, true, now))
.collect();
let unhealthy_records = update_req
.unhealthy_tenant_shards
.into_iter()
.filter(|tenant_shard_id| locked.tenants.contains_key(tenant_shard_id))
.map(|tenant_shard_id| MetadataHealthPersistence::new(tenant_shard_id, false, now))
.collect();
(healthy_records, unhealthy_records)
};
self.persistence
.update_metadata_health_records(healthy_records, unhealthy_records, now)
.await?;
Ok(())
}
/// Lists the tenant shards that has unhealthy metadata status.
pub(crate) async fn metadata_health_list_unhealthy(
&self,
) -> Result<Vec<TenantShardId>, ApiError> {
let result = self
.persistence
.list_unhealthy_metadata_health_records()
.await?
.iter()
.map(|p| p.get_tenant_shard_id().unwrap())
.collect();
Ok(result)
}
/// Lists the tenant shards that have not been scrubbed for some duration.
pub(crate) async fn metadata_health_list_outdated(
&self,
not_scrubbed_for: Duration,
) -> Result<Vec<MetadataHealthRecord>, ApiError> {
let earlier = chrono::offset::Utc::now() - not_scrubbed_for;
let result = self
.persistence
.list_outdated_metadata_health_records(earlier)
.await?
.into_iter()
.map(|record| record.into())
.collect();
Ok(result)
}
pub(crate) fn get_leadership_status(&self) -> LeadershipStatus {
self.inner.read().unwrap().get_leadership_status()
}
@@ -6268,88 +6117,4 @@ impl Service {
global_observed
}
/// Collect the details for the current proccess wishing to become the storage controller
/// leader.
///
/// On failures to discover and resolve the hostname the process is killed and we rely on k8s to retry.
fn get_proposed_leader_info(&self) -> LeaderPersistence {
let hostname = match dns_lookup::get_hostname() {
Ok(name) => name,
Err(err) => {
tracing::error!("Failed to discover hostname: {err}. Aborting start-up ...");
std::process::exit(1);
}
};
let mut addrs = match dns_lookup::lookup_host(&hostname) {
Ok(addrs) => addrs,
Err(err) => {
tracing::error!("Failed to resolve hostname: {err}. Aborting start-up ...");
std::process::exit(1);
}
};
let addr = addrs
.pop()
.expect("k8s configured hostname always resolves");
let proposed = LeaderPersistence {
hostname: addr.to_string(),
port: self.get_config().http_service_port,
started_at: chrono::Utc::now(),
};
tracing::info!("Proposed leader details are: {proposed:?}");
proposed
}
/// Request step down from the currently registered leader in the database
///
/// If such an entry is persisted, the success path returns the observed
/// state and details of the leader. Otherwise, None is returned indicating
/// there is no leader currently.
///
/// On failures to query the database or step down error responses the process is killed
/// and we rely on k8s to retry.
async fn request_step_down(&self) -> Option<LeaderStepDownState> {
let leader = match self.persistence.get_leader().await {
Ok(leader) => leader,
Err(err) => {
tracing::error!(
"Failed to query database for current leader: {err}. Aborting start-up ..."
);
std::process::exit(1);
}
};
match leader {
Some(leader) => {
// TODO: jwt token
let client = PeerClient::new(
leader.hostname.to_owned(),
leader.port,
self.config.jwt_token.clone(),
);
let state = client.step_down(&self.cancel).await;
match state {
Ok(state) => Some(LeaderStepDownState {
observed: state,
leader: leader.clone(),
}),
Err(err) => {
tracing::error!(
"Leader ({}:{}) did not respond to step-down request: {}",
leader.hostname,
leader.port,
err
);
None
}
}
}
None => None,
}
}
}

View File

@@ -40,11 +40,6 @@ impl TimelineAnalysis {
garbage_keys: Vec::new(),
}
}
/// Whether a timeline is healthy.
pub(crate) fn is_healthy(&self) -> bool {
self.errors.is_empty() && self.warnings.is_empty()
}
}
pub(crate) async fn branch_cleanup_and_check_errors(

View File

@@ -1,13 +1,10 @@
use std::pin::pin;
use futures::{StreamExt, TryStreamExt};
use pageserver::tenant::storage_layer::LayerName;
use remote_storage::ListingMode;
use serde::{Deserialize, Serialize};
use crate::{
checks::parse_layer_object_name, init_remote_generic, metadata_stream::stream_tenants_generic,
stream_objects_with_retries, BucketConfig, NodeKind,
checks::parse_layer_object_name, init_remote, list_objects_with_retries,
metadata_stream::stream_tenants, BucketConfig, NodeKind,
};
#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -50,38 +47,45 @@ pub async fn find_large_objects(
ignore_deltas: bool,
concurrency: usize,
) -> anyhow::Result<LargeObjectListing> {
let (remote_client, target) =
init_remote_generic(bucket_config.clone(), NodeKind::Pageserver).await?;
let tenants = pin!(stream_tenants_generic(&remote_client, &target));
let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?;
let tenants = std::pin::pin!(stream_tenants(&s3_client, &target));
let objects_stream = tenants.map_ok(|tenant_shard_id| {
let mut tenant_root = target.tenant_root(&tenant_shard_id);
let remote_client = remote_client.clone();
let s3_client = s3_client.clone();
async move {
let mut objects = Vec::new();
let mut total_objects_ctr = 0u64;
// We want the objects and not just common prefixes
tenant_root.delimiter.clear();
let mut objects_stream = pin!(stream_objects_with_retries(
&remote_client,
ListingMode::NoDelimiter,
&tenant_root
));
while let Some(listing) = objects_stream.next().await {
let listing = listing?;
for obj in listing.keys.iter().filter(|obj| min_size <= obj.size) {
let key = obj.key.to_string();
let mut continuation_token = None;
loop {
let fetch_response =
list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone())
.await?;
for obj in fetch_response.contents().iter().filter(|o| {
if let Some(obj_size) = o.size {
min_size as i64 <= obj_size
} else {
false
}
}) {
let key = obj.key().expect("couldn't get key").to_owned();
let kind = LargeObjectKind::from_key(&key);
if ignore_deltas && kind == LargeObjectKind::DeltaLayer {
continue;
}
objects.push(LargeObject {
key,
size: obj.size,
size: obj.size.unwrap() as u64,
kind,
})
}
total_objects_ctr += listing.keys.len() as u64;
total_objects_ctr += fetch_response.contents().len() as u64;
match fetch_response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok((tenant_shard_id, objects, total_objects_ctr))

View File

@@ -5,7 +5,6 @@
use std::{
collections::{HashMap, HashSet},
sync::Arc,
time::Duration,
};
use anyhow::Context;
@@ -19,7 +18,7 @@ use utils::id::TenantId;
use crate::{
cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData},
init_remote, init_remote_generic, list_objects_with_retries,
init_remote, init_remote_generic,
metadata_stream::{stream_tenant_timelines, stream_tenants},
BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth,
};
@@ -28,11 +27,6 @@ use crate::{
enum GarbageReason {
DeletedInConsole,
MissingInConsole,
// The remaining data relates to a known deletion issue, and we're sure that purging this
// will not delete any real data, for example https://github.com/neondatabase/neon/pull/7928 where
// there is nothing in a tenant path apart from a heatmap file.
KnownBug,
}
#[derive(Serialize, Deserialize, Debug)]
@@ -78,15 +72,6 @@ impl GarbageList {
}
}
/// If an entity has been identified as requiring purge due to a known bug, e.g.
/// a particular type of object left behind after an incomplete deletion.
fn append_buggy(&mut self, entity: GarbageEntity) {
self.items.push(GarbageItem {
entity,
reason: GarbageReason::KnownBug,
});
}
/// Return true if appended, false if not. False means the result was not garbage.
fn maybe_append<T>(&mut self, entity: GarbageEntity, result: Option<T>) -> bool
where
@@ -234,71 +219,6 @@ async fn find_garbage_inner(
assert!(project.tenant == tenant_shard_id.tenant_id);
}
// Special case: If it's missing in console, check for known bugs that would enable us to conclusively
// identify it as purge-able anyway
if console_result.is_none() {
let timelines = stream_tenant_timelines(&s3_client, &target, tenant_shard_id)
.await?
.collect::<Vec<_>>()
.await;
if timelines.is_empty() {
// No timelines, but a heatmap: the deletion bug where we deleted everything but heatmaps
let tenant_objects = list_objects_with_retries(
&s3_client,
&target.tenant_root(&tenant_shard_id),
None,
)
.await?;
let object = tenant_objects.contents.as_ref().unwrap().first().unwrap();
if object.key.as_ref().unwrap().ends_with("heatmap-v1.json") {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)");
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
continue;
} else {
tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key.as_ref().unwrap());
}
} else {
// A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial
// rollout of WAL DR in which we never deleted these.
let mut any_non_initdb = false;
for timeline_r in timelines {
let timeline = timeline_r?;
let timeline_objects = list_objects_with_retries(
&s3_client,
&target.timeline_root(&timeline),
None,
)
.await?;
if timeline_objects
.common_prefixes
.as_ref()
.map(|v| v.len())
.unwrap_or(0)
> 0
{
// Sub-paths? Unexpected
any_non_initdb = true;
} else {
let object = timeline_objects.contents.as_ref().unwrap().first().unwrap();
if object.key.as_ref().unwrap().ends_with("initdb.tar.zst") {
tracing::info!("Timeline {timeline} contains only initdb.tar.zst");
} else {
any_non_initdb = true;
}
}
}
if any_non_initdb {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains timelines, one or more of which are more than just initdb");
} else {
tracing::info!("Tenant {tenant_shard_id}: is missing in console and contains only timelines that only contain initdb");
garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id));
continue;
}
}
}
if garbage.maybe_append(GarbageEntity::Tenant(tenant_shard_id), console_result) {
tracing::debug!("Tenant {tenant_shard_id} is garbage");
} else {
@@ -429,6 +349,9 @@ pub async fn get_timeline_objects(
tracing::debug!("Listing objects in timeline {ttid}");
let timeline_root = super::remote_timeline_path_id(&ttid);
// TODO: apply extra validation based on object modification time. Don't purge
// timelines whose index_part.json has been touched recently.
let list = s3_client
.list(
Some(&timeline_root),
@@ -499,7 +422,6 @@ impl DeletionProgressTracker {
pub async fn purge_garbage(
input_path: String,
mode: PurgeMode,
min_age: Duration,
dry_run: bool,
) -> anyhow::Result<()> {
let list_bytes = tokio::fs::read(&input_path).await?;
@@ -510,7 +432,7 @@ pub async fn purge_garbage(
input_path
);
let (remote_client, _target) =
let remote_client =
init_remote_generic(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?;
assert_eq!(
@@ -537,7 +459,6 @@ pub async fn purge_garbage(
.filter(|i| match (&mode, &i.reason) {
(PurgeMode::DeletedAndMissing, _) => true,
(PurgeMode::DeletedOnly, GarbageReason::DeletedInConsole) => true,
(PurgeMode::DeletedOnly, GarbageReason::KnownBug) => true,
(PurgeMode::DeletedOnly, GarbageReason::MissingInConsole) => false,
});
@@ -566,37 +487,6 @@ pub async fn purge_garbage(
let mut progress_tracker = DeletionProgressTracker::default();
while let Some(result) = get_objects_results.next().await {
let mut object_list = result?;
// Extra safety check: even if a collection of objects is garbage, check max() of modification
// times before purging, so that if we incorrectly marked a live tenant as garbage then we would
// notice that its index has been written recently and would omit deleting it.
if object_list.is_empty() {
// Simplify subsequent code by ensuring list always has at least one item
// Usually, this only occurs if there is parallel deletions racing us, as there is no empty prefixes
continue;
}
let max_mtime = object_list.iter().map(|o| o.last_modified).max().unwrap();
let age = max_mtime.elapsed();
match age {
Err(_) => {
tracing::warn!("Bad last_modified time");
continue;
}
Ok(a) if a < min_age => {
// Failed age check. This doesn't mean we did something wrong: a tenant might really be garbage and recently
// written, but out of an abundance of caution we still don't purge it.
tracing::info!(
"Skipping tenant with young objects {}..{}",
object_list.first().as_ref().unwrap().key,
object_list.last().as_ref().unwrap().key
);
continue;
}
Ok(_) => {
// Passed age check
}
}
objects_to_delete.append(&mut object_list);
if objects_to_delete.len() >= MAX_KEYS_PER_DELETE {
do_delete(

View File

@@ -22,19 +22,16 @@ use aws_sdk_s3::Client;
use camino::{Utf8Path, Utf8PathBuf};
use clap::ValueEnum;
use futures::{Stream, StreamExt};
use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_timeline_path};
use pageserver::tenant::TENANTS_SEGMENT_NAME;
use pageserver_api::shard::TenantShardId;
use remote_storage::{
GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind,
S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT,
};
use reqwest::Url;
use serde::{Deserialize, Serialize};
use storage_controller_client::control_api;
use tokio::io::AsyncReadExt;
use tokio_util::sync::CancellationToken;
use tracing::error;
use tracing_appender::non_blocking::WorkerGuard;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
@@ -256,12 +253,6 @@ pub struct ControllerClientConfig {
pub controller_jwt: String,
}
impl ControllerClientConfig {
pub fn build_client(self) -> control_api::Client {
control_api::Client::new(self.controller_api, Some(self.controller_jwt))
}
}
pub struct ConsoleConfig {
pub token: String,
pub base_url: Url,
@@ -328,35 +319,27 @@ fn default_prefix_in_bucket(node_kind: NodeKind) -> &'static str {
}
}
fn make_root_target(
bucket_name: String,
prefix_in_bucket: String,
node_kind: NodeKind,
) -> RootTarget {
let s3_target = S3Target {
bucket_name,
prefix_in_bucket,
delimiter: "/".to_string(),
};
match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(s3_target),
NodeKind::Safekeeper => RootTarget::Safekeeper(s3_target),
}
}
async fn init_remote(
bucket_config: BucketConfig,
node_kind: NodeKind,
) -> anyhow::Result<(Arc<Client>, RootTarget)> {
let bucket_region = Region::new(bucket_config.region);
let delimiter = "/".to_string();
let s3_client = Arc::new(init_s3_client(bucket_region).await);
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
let s3_root = make_root_target(
bucket_config.bucket,
bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
node_kind,
);
let s3_root = match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
delimiter,
}),
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: bucket_config.prefix_in_bucket.unwrap_or(default_prefix),
delimiter,
}),
};
Ok((s3_client, s3_root))
}
@@ -364,12 +347,12 @@ async fn init_remote(
async fn init_remote_generic(
bucket_config: BucketConfig,
node_kind: NodeKind,
) -> anyhow::Result<(GenericRemoteStorage, RootTarget)> {
) -> anyhow::Result<GenericRemoteStorage> {
let endpoint = env::var("AWS_ENDPOINT_URL").ok();
let default_prefix = default_prefix_in_bucket(node_kind).to_string();
let prefix_in_bucket = Some(bucket_config.prefix_in_bucket.unwrap_or(default_prefix));
let storage = S3Config {
bucket_name: bucket_config.bucket.clone(),
bucket_name: bucket_config.bucket,
bucket_region: bucket_config.region,
prefix_in_bucket,
endpoint,
@@ -383,13 +366,7 @@ async fn init_remote_generic(
storage: RemoteStorageKind::AwsS3(storage),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
// We already pass the prefix to the remote client above
let prefix_in_root_target = String::new();
let s3_root = make_root_target(bucket_config.bucket, prefix_in_root_target, node_kind);
let client = GenericRemoteStorage::from_config(&storage_config).await?;
Ok((client, s3_root))
GenericRemoteStorage::from_config(&storage_config).await
}
async fn list_objects_with_retries(
@@ -427,44 +404,6 @@ async fn list_objects_with_retries(
Err(anyhow!("unreachable unless MAX_RETRIES==0"))
}
fn stream_objects_with_retries<'a>(
storage_client: &'a GenericRemoteStorage,
listing_mode: ListingMode,
s3_target: &'a S3Target,
) -> impl Stream<Item = Result<Listing, anyhow::Error>> + 'a {
async_stream::stream! {
let mut trial = 0;
let cancel = CancellationToken::new();
let prefix_str = &s3_target
.prefix_in_bucket
.strip_prefix("/")
.unwrap_or(&s3_target.prefix_in_bucket);
let prefix = RemotePath::from_string(prefix_str)?;
let mut list_stream =
storage_client.list_streaming(Some(&prefix), listing_mode, None, &cancel);
while let Some(res) = list_stream.next().await {
if let Err(err) = res {
let yield_err = if err.is_permanent() {
true
} else {
let backoff_time = 1 << trial.max(5);
tokio::time::sleep(Duration::from_secs(backoff_time)).await;
trial += 1;
trial == MAX_RETRIES - 1
};
if yield_err {
yield Err(err)
.with_context(|| format!("Failed to list objects {MAX_RETRIES} times"));
break;
}
} else {
trial = 0;
yield res.map_err(anyhow::Error::from);
}
}
}
}
async fn download_object_with_retries(
s3_client: &Client,
bucket_name: &str,

View File

@@ -1,8 +1,7 @@
use anyhow::{anyhow, bail};
use camino::Utf8PathBuf;
use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse};
use pageserver_api::shard::TenantShardId;
use reqwest::{Method, Url};
use reqwest::Url;
use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode};
use storage_scrubber::pageserver_physical_gc::GcMode;
use storage_scrubber::scan_pageserver_metadata::scan_metadata;
@@ -51,8 +50,6 @@ enum Command {
input_path: String,
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
mode: PurgeMode,
#[arg(long = "min-age")]
min_age: humantime::Duration,
},
#[command(verbatim_doc_comment)]
ScanMetadata {
@@ -62,8 +59,6 @@ enum Command {
json: bool,
#[arg(long = "tenant-id", num_args = 0..)]
tenant_ids: Vec<TenantShardId>,
#[arg(long = "post", default_value_t = false)]
post_to_storage_controller: bool,
#[arg(long, default_value = None)]
/// For safekeeper node_kind only, points to db with debug dump
dump_db_connstr: Option<String>,
@@ -119,20 +114,11 @@ async fn main() -> anyhow::Result<()> {
chrono::Utc::now().format("%Y_%m_%d__%H_%M_%S")
));
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
});
match cli.command {
Command::ScanMetadata {
json,
tenant_ids,
node_kind,
post_to_storage_controller,
dump_db_connstr,
dump_db_table,
} => {
@@ -171,9 +157,6 @@ async fn main() -> anyhow::Result<()> {
}
Ok(())
} else {
if controller_client_conf.is_none() && post_to_storage_controller {
return Err(anyhow!("Posting pageserver scan health status to storage controller requires `--controller-api` and `--controller-jwt` to run"));
}
match scan_metadata(bucket_config.clone(), tenant_ids).await {
Err(e) => {
tracing::error!("Failed: {e}");
@@ -185,21 +168,6 @@ async fn main() -> anyhow::Result<()> {
} else {
println!("{}", summary.summary_string());
}
if post_to_storage_controller {
if let Some(conf) = controller_client_conf {
let controller_client = conf.build_client();
let body = summary.build_health_update_request();
controller_client
.dispatch::<MetadataHealthUpdateRequest, MetadataHealthUpdateResponse>(
Method::POST,
"control/v1/metadata_health/update".to_string(),
Some(body),
)
.await?;
}
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else if summary.is_empty() {
@@ -228,11 +196,9 @@ async fn main() -> anyhow::Result<()> {
let console_config = ConsoleConfig::from_env()?;
find_garbage(bucket_config, console_config, depth, node_kind, output_path).await
}
Command::PurgeGarbage {
input_path,
mode,
min_age,
} => purge_garbage(input_path, mode, min_age.into(), !cli.delete).await,
Command::PurgeGarbage { input_path, mode } => {
purge_garbage(input_path, mode, !cli.delete).await
}
Command::TenantSnapshot {
tenant_id,
output_path,
@@ -247,6 +213,14 @@ async fn main() -> anyhow::Result<()> {
min_age,
mode,
} => {
let controller_client_conf = cli.controller_api.map(|controller_api| {
ControllerClientConfig {
controller_api,
// Default to no key: this is a convenience when working in a development environment
controller_jwt: cli.controller_jwt.unwrap_or("".to_owned()),
}
});
match (&controller_client_conf, mode) {
(Some(_), _) => {
// Any mode may run when controller API is set

View File

@@ -1,41 +1,12 @@
use std::str::FromStr;
use anyhow::{anyhow, Context};
use anyhow::Context;
use async_stream::{stream, try_stream};
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use futures::StreamExt;
use remote_storage::{GenericRemoteStorage, ListingMode};
use tokio_stream::Stream;
use crate::{
list_objects_with_retries, stream_objects_with_retries, RootTarget, S3Target,
TenantShardTimelineId,
};
use crate::{list_objects_with_retries, RootTarget, S3Target, TenantShardTimelineId};
use pageserver_api::shard::TenantShardId;
use utils::id::{TenantId, TimelineId};
/// Given a remote storage and a target, output a stream of TenantIds discovered via listing prefixes
pub fn stream_tenants_generic<'a>(
remote_client: &'a GenericRemoteStorage,
target: &'a RootTarget,
) -> impl Stream<Item = anyhow::Result<TenantShardId>> + 'a {
try_stream! {
let tenants_target = target.tenants_root();
let mut tenants_stream =
std::pin::pin!(stream_objects_with_retries(remote_client, ListingMode::WithDelimiter, &tenants_target));
while let Some(chunk) = tenants_stream.next().await {
let chunk = chunk?;
let entry_ids = chunk.prefixes.iter()
.map(|prefix| prefix.get_path().file_name().ok_or_else(|| anyhow!("no final component in path '{prefix}'")));
for dir_name_res in entry_ids {
let dir_name = dir_name_res?;
let id = TenantShardId::from_str(dir_name)?;
yield id;
}
}
}
}
/// Given an S3 bucket, output a stream of TenantIds discovered via ListObjectsv2
pub fn stream_tenants<'a>(
s3_client: &'a Client,

View File

@@ -567,7 +567,13 @@ pub async fn pageserver_physical_gc(
}
// Execute cross-shard GC, using the accumulator's full view of all the shards built in the per-shard GC
let Some(controller_client) = controller_client_conf.map(|c| c.build_client()) else {
let Some(controller_client) = controller_client_conf.as_ref().map(|c| {
let ControllerClientConfig {
controller_api,
controller_jwt,
} = c;
control_api::Client::new(controller_api.clone(), Some(controller_jwt.clone()))
}) else {
tracing::info!("Skipping ancestor layer GC, because no `--controller-api` was specified");
return Ok(summary);
};

View File

@@ -9,13 +9,12 @@ use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimeline
use aws_sdk_s3::Client;
use futures_util::{StreamExt, TryStreamExt};
use pageserver::tenant::remote_timeline_client::remote_layer_path;
use pageserver_api::controller_api::MetadataHealthUpdateRequest;
use pageserver_api::shard::TenantShardId;
use serde::Serialize;
use utils::id::TenantId;
use utils::shard::ShardCount;
#[derive(Serialize, Default)]
#[derive(Serialize)]
pub struct MetadataSummary {
tenant_count: usize,
timeline_count: usize,
@@ -24,16 +23,19 @@ pub struct MetadataSummary {
with_warnings: HashSet<TenantShardTimelineId>,
with_orphans: HashSet<TenantShardTimelineId>,
indices_by_version: HashMap<usize, usize>,
#[serde(skip)]
pub(crate) healthy_tenant_shards: HashSet<TenantShardId>,
#[serde(skip)]
pub(crate) unhealthy_tenant_shards: HashSet<TenantShardId>,
}
impl MetadataSummary {
fn new() -> Self {
Self::default()
Self {
tenant_count: 0,
timeline_count: 0,
timeline_shard_count: 0,
with_errors: HashSet::new(),
with_warnings: HashSet::new(),
with_orphans: HashSet::new(),
indices_by_version: HashMap::new(),
}
}
fn update_data(&mut self, data: &S3TimelineBlobData) {
@@ -52,13 +54,6 @@ impl MetadataSummary {
}
fn update_analysis(&mut self, id: &TenantShardTimelineId, analysis: &TimelineAnalysis) {
if analysis.is_healthy() {
self.healthy_tenant_shards.insert(id.tenant_shard_id);
} else {
self.healthy_tenant_shards.remove(&id.tenant_shard_id);
self.unhealthy_tenant_shards.insert(id.tenant_shard_id);
}
if !analysis.errors.is_empty() {
self.with_errors.insert(*id);
}
@@ -106,13 +101,6 @@ Index versions: {version_summary}
pub fn is_empty(&self) -> bool {
self.timeline_shard_count == 0
}
pub fn build_health_update_request(&self) -> MetadataHealthUpdateRequest {
MetadataHealthUpdateRequest {
healthy_tenant_shards: self.healthy_tenant_shards.clone(),
unhealthy_tenant_shards: self.unhealthy_tenant_shards.clone(),
}
}
}
/// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics.

View File

@@ -222,8 +222,6 @@ class NeonBenchmarker:
function by the zenbenchmark fixture
"""
PROPERTY_PREFIX = "neon_benchmarker_"
def __init__(self, property_recorder: Callable[[str, object], None]):
# property recorder here is a pytest fixture provided by junitxml module
# https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property
@@ -240,7 +238,7 @@ class NeonBenchmarker:
Record a benchmark result.
"""
# just to namespace the value
name = f"{self.PROPERTY_PREFIX}_{metric_name}"
name = f"neon_benchmarker_{metric_name}"
self.property_recorder(
name,
{
@@ -251,18 +249,6 @@ class NeonBenchmarker:
},
)
@classmethod
def records(
cls, user_properties: list[tuple[str, object]]
) -> Iterator[tuple[str, dict[str, object]]]:
"""
Yield all records related to benchmarks
"""
for property_name, recorded_property in user_properties:
if property_name.startswith(cls.PROPERTY_PREFIX):
assert isinstance(recorded_property, dict)
yield recorded_property["name"], recorded_property
@contextmanager
def record_duration(self, metric_name: str) -> Iterator[None]:
"""
@@ -439,11 +425,10 @@ def zenbenchmark(
yield benchmarker
results = {}
for _, recorded_property in NeonBenchmarker.records(request.node.user_properties):
for _, recorded_property in request.node.user_properties:
name = recorded_property["name"]
value = str(recorded_property["value"])
unit = str(recorded_property["unit"]).strip()
if unit != "":
if (unit := recorded_property["unit"].strip()) != "":
value += f" {unit}"
results[name] = value
@@ -492,7 +477,7 @@ def pytest_terminal_summary(
for test_report in terminalreporter.stats.get("passed", []):
result_entry = []
for _, recorded_property in NeonBenchmarker.records(test_report.user_properties):
for _, recorded_property in test_report.user_properties:
if not is_header_printed:
terminalreporter.section("Benchmark results", "-")
is_header_printed = True

View File

@@ -449,7 +449,6 @@ class TokenScope(str, Enum):
GENERATIONS_API = "generations_api"
SAFEKEEPER_DATA = "safekeeperdata"
TENANT = "tenant"
SCRUBBER = "scrubber"
class NeonEnvBuilder:
@@ -1442,7 +1441,6 @@ def neon_env_builder(
pageserver_virtual_file_io_engine: str,
pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]],
pageserver_aux_file_policy: Optional[AuxFileStore],
record_property: Callable[[str, object], None],
) -> Iterator[NeonEnvBuilder]:
"""
Fixture to create a Neon environment for test.
@@ -1482,7 +1480,9 @@ def neon_env_builder(
yield builder
# Propogate `preserve_database_files` to make it possible to use in other fixtures,
# like `test_output_dir` fixture for attaching all database files to Allure report.
record_property("preserve_database_files", builder.preserve_database_files)
request.node.user_properties.append(
("preserve_database_files", builder.preserve_database_files)
)
@dataclass
@@ -2587,51 +2587,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
time.sleep(backoff)
def metadata_health_update(self, healthy: List[TenantShardId], unhealthy: List[TenantShardId]):
body: Dict[str, Any] = {
"healthy_tenant_shards": [str(t) for t in healthy],
"unhealthy_tenant_shards": [str(t) for t in unhealthy],
}
self.request(
"POST",
f"{self.env.storage_controller_api}/control/v1/metadata_health/update",
json=body,
headers=self.headers(TokenScope.SCRUBBER),
)
def metadata_health_list_unhealthy(self):
response = self.request(
"GET",
f"{self.env.storage_controller_api}/control/v1/metadata_health/unhealthy",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def metadata_health_list_outdated(self, duration: str):
body: Dict[str, Any] = {"not_scrubbed_for": duration}
response = self.request(
"POST",
f"{self.env.storage_controller_api}/control/v1/metadata_health/outdated",
json=body,
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def metadata_health_is_healthy(self, outdated_duration: str = "1h") -> bool:
"""Metadata is healthy if there is no unhealthy or outdated health records."""
unhealthy = self.metadata_health_list_unhealthy()
outdated = self.metadata_health_list_outdated(outdated_duration)
healthy = (
len(unhealthy["unhealthy_tenant_shards"]) == 0 and len(outdated["health_records"]) == 0
)
if not healthy:
log.info(f"{unhealthy=}, {outdated=}")
return healthy
def step_down(self):
log.info("Asking storage controller to step down")
response = self.request(
@@ -4401,11 +4356,10 @@ class StorageScrubber:
assert stdout is not None
return stdout
def scan_metadata(self, post_to_storage_controller: bool = False) -> Any:
args = ["scan-metadata", "--node-kind", "pageserver", "--json"]
if post_to_storage_controller:
args.append("--post")
stdout = self.scrubber_cli(args, timeout=30)
def scan_metadata(self) -> Any:
stdout = self.scrubber_cli(
["scan-metadata", "--node-kind", "pageserver", "--json"], timeout=30
)
try:
return json.loads(stdout)

View File

@@ -1,88 +0,0 @@
"""
Test the logical replication in Neon with the different consumers
"""
import hashlib
import time
import clickhouse_connect
import psycopg2
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import RemotePostgres
from fixtures.utils import wait_until
def query_clickhouse(
client,
query: str,
digest: str,
) -> None:
"""
Run the query on the client
return answer if successful, raise an exception otherwise
"""
log.debug("Query: %s", query)
res = client.query(query)
log.debug(res.result_rows)
m = hashlib.sha1()
m.update(repr(tuple(res.result_rows)).encode())
hash_res = m.hexdigest()
log.debug("Hash: %s", hash_res)
if hash_res == digest:
return
raise ValueError("Hash mismatch")
@pytest.mark.remote_cluster
def test_clickhouse(remote_pg: RemotePostgres):
"""
Test the logical replication having ClickHouse as a client
"""
conn_options = remote_pg.conn_options()
for _ in range(5):
try:
conn = psycopg2.connect(remote_pg.connstr())
except psycopg2.OperationalError as perr:
log.debug(perr)
time.sleep(1)
else:
break
raise TimeoutError
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS table1")
cur.execute("CREATE TABLE table1 (id integer primary key, column1 varchar(10));")
cur.execute("INSERT INTO table1 (id, column1) VALUES (1, 'abc'), (2, 'def');")
conn.commit()
client = clickhouse_connect.get_client(host="clickhouse")
client.command("SET allow_experimental_database_materialized_postgresql=1")
client.command(
"CREATE DATABASE db1_postgres ENGINE = "
f"MaterializedPostgreSQL('{conn_options['host']}', "
f"'{conn_options['dbname']}', "
f"'{conn_options['user']}', '{conn_options['password']}') "
"SETTINGS materialized_postgresql_tables_list = 'table1';"
)
wait_until(
120,
0.5,
lambda: query_clickhouse(
client,
"select * from db1_postgres.table1 order by 1",
"ee600d8f7cd05bd0b169fa81f44300a9dd10085a",
),
)
cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');")
conn.commit()
wait_until(
120,
0.5,
lambda: query_clickhouse(
client,
"select * from db1_postgres.table1 order by 1",
"9eba2daaf7e4d7d27ac849525f68b562ab53947d",
),
)
log.debug("Sleeping before final checking if Neon is still alive")
time.sleep(3)
cur.execute("SELECT 1")

View File

@@ -389,11 +389,6 @@ def test_duplicate_creation(neon_env_builder: NeonEnvBuilder):
repeat_result = ps_http.timeline_create(
env.pg_version, env.initial_tenant, success_timeline, timeout=60
)
# remote_consistent_lsn_visible will be published only after we've
# confirmed the generation, which is not part of what we await during
# timeline creation (uploads). mask it out here to avoid flakyness.
del success_result["remote_consistent_lsn_visible"]
del repeat_result["remote_consistent_lsn_visible"]
assert repeat_result == success_result
finally:
env.pageserver.stop(immediate=True)

View File

@@ -3,7 +3,7 @@ import threading
import time
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from typing import Any, Dict, List, Union
import pytest
from fixtures.common_types import TenantId, TenantShardId, TimelineId
@@ -1785,126 +1785,6 @@ def test_storage_controller_node_deletion(
env.storage_controller.consistency_check()
@pytest.mark.parametrize("shard_count", [None, 2])
def test_storage_controller_metadata_health(
neon_env_builder: NeonEnvBuilder,
shard_count: Optional[int],
):
"""
Create three tenants A, B, C.
Phase 1:
- A: Post healthy status.
- B: Post unhealthy status.
- C: No updates.
Phase 2:
- B: Post healthy status.
- C: Post healthy status.
Phase 3:
- A: Post unhealthy status.
Phase 4:
- Delete tenant A, metadata health status should be deleted as well.
"""
def update_and_query_metadata_health(
env: NeonEnv,
healthy: List[TenantShardId],
unhealthy: List[TenantShardId],
outdated_duration: str = "1h",
) -> Tuple[Set[str], Set[str]]:
"""
Update metadata health. Then list tenant shards with unhealthy and
outdated metadata health status.
"""
if healthy or unhealthy:
env.storage_controller.metadata_health_update(healthy, unhealthy)
result = env.storage_controller.metadata_health_list_unhealthy()
unhealthy_res = set(result["unhealthy_tenant_shards"])
result = env.storage_controller.metadata_health_list_outdated(outdated_duration)
outdated_res = set(record["tenant_shard_id"] for record in result["health_records"])
return unhealthy_res, outdated_res
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_start()
# Mock tenant (`initial_tenant``) with healthy scrubber scan result
tenant_a_shard_ids = (
env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=shard_count)
if shard_count is not None
else [TenantShardId(env.initial_tenant, 0, 0)]
)
# Mock tenant with unhealthy scrubber scan result
tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count)
tenant_b_shard_ids = (
env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count)
if shard_count is not None
else [TenantShardId(tenant_b, 0, 0)]
)
# Mock tenant that never gets a health update from scrubber
tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count)
tenant_c_shard_ids = (
env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count)
if shard_count is not None
else [TenantShardId(tenant_c, 0, 0)]
)
# Metadata health table also updated as tenant shards are created.
assert env.storage_controller.metadata_health_is_healthy()
# post "fake" updates to storage controller db
unhealthy, outdated = update_and_query_metadata_health(
env, healthy=tenant_a_shard_ids, unhealthy=tenant_b_shard_ids
)
log.info(f"After Phase 1: {unhealthy=}, {outdated=}")
assert len(unhealthy) == len(tenant_b_shard_ids)
for t in tenant_b_shard_ids:
assert str(t) in unhealthy
assert len(outdated) == 0
unhealthy, outdated = update_and_query_metadata_health(
env, healthy=tenant_b_shard_ids + tenant_c_shard_ids, unhealthy=[]
)
log.info(f"After Phase 2: {unhealthy=}, {outdated=}")
assert len(unhealthy) == 0
assert len(outdated) == 0
unhealthy, outdated = update_and_query_metadata_health(
env, healthy=[], unhealthy=tenant_a_shard_ids
)
log.info(f"After Phase 3: {unhealthy=}, {outdated=}")
assert len(unhealthy) == len(tenant_a_shard_ids)
for t in tenant_a_shard_ids:
assert str(t) in unhealthy
assert len(outdated) == 0
# Phase 4: Delete A
env.storage_controller.pageserver_api().tenant_delete(env.initial_tenant)
# A's unhealthy metadata health status should be deleted as well.
assert env.storage_controller.metadata_health_is_healthy()
# All shards from B and C are not fresh if set outdated duration to 0 seconds.
unhealthy, outdated = update_and_query_metadata_health(
env, healthy=[], unhealthy=tenant_a_shard_ids, outdated_duration="0s"
)
assert len(unhealthy) == 0
for t in tenant_b_shard_ids + tenant_c_shard_ids:
assert str(t) in outdated
def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder):
"""
Test the `/control/v1/step_down` storage controller API. Upon receiving such

View File

@@ -440,12 +440,10 @@ def test_scrubber_scan_pageserver_metadata(
assert len(index.layer_metadata) > 0
it = iter(index.layer_metadata.items())
scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
scan_summary = env.storage_scrubber.scan_metadata()
assert not scan_summary["with_warnings"]
assert not scan_summary["with_errors"]
assert env.storage_controller.metadata_health_is_healthy()
# Delete a layer file that is listed in the index.
layer, metadata = next(it)
log.info(f"Deleting {timeline_path}/{layer.to_str()}")
@@ -455,17 +453,7 @@ def test_scrubber_scan_pageserver_metadata(
)
log.info(f"delete response: {delete_response}")
# Check scan summary without posting to storage controller. Expect it to be a L0 layer so only emit warnings.
# Check scan summary. Expect it to be a L0 layer so only emit warnings.
scan_summary = env.storage_scrubber.scan_metadata()
log.info(f"{pprint.pformat(scan_summary)}")
assert len(scan_summary["with_warnings"]) > 0
assert env.storage_controller.metadata_health_is_healthy()
# Now post to storage controller, expect seeing one unhealthy health record
scan_summary = env.storage_scrubber.scan_metadata(post_to_storage_controller=True)
log.info(f"{pprint.pformat(scan_summary)}")
assert len(scan_summary["with_warnings"]) > 0
unhealthy = env.storage_controller.metadata_health_list_unhealthy()["unhealthy_tenant_shards"]
assert len(unhealthy) == 1 and unhealthy[0] == str(tenant_shard_id)