mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-21 23:20:40 +00:00
Compare commits
35 Commits
problame/r
...
dkr/plumbe
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
daac088c5e | ||
|
|
0b90411380 | ||
|
|
f4da010aee | ||
|
|
ec10838aa4 | ||
|
|
67af24191e | ||
|
|
6af5f9bfe0 | ||
|
|
64fc7eafcd | ||
|
|
3e4710c59e | ||
|
|
d8b0a298b7 | ||
|
|
c8094ee51e | ||
|
|
957af049c2 | ||
|
|
786c7b3708 | ||
|
|
d3612ce266 | ||
|
|
994411f5c2 | ||
|
|
25934ec1ba | ||
|
|
0bdbc39cb1 | ||
|
|
96b84ace89 | ||
|
|
368b783ada | ||
|
|
0f47bc03eb | ||
|
|
fdbe8dc8e0 | ||
|
|
1b97a3074c | ||
|
|
5c836ee5b4 | ||
|
|
4687b2e597 | ||
|
|
13adc83fc3 | ||
|
|
52c2c69351 | ||
|
|
207919f5eb | ||
|
|
218be9eb32 | ||
|
|
8198b865c3 | ||
|
|
baf395983f | ||
|
|
ce7efbe48a | ||
|
|
ef4a76c01e | ||
|
|
1ca08cc523 | ||
|
|
4626d89eda | ||
|
|
49c57c0b13 | ||
|
|
d3a97fdf88 |
@@ -1,6 +1,13 @@
|
||||
name: 'Create Allure report'
|
||||
description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'
|
||||
|
||||
inputs:
|
||||
store-test-results-into-db:
|
||||
description: 'Whether to store test results into the database. TEST_RESULT_CONNSTR/TEST_RESULT_CONNSTR_NEW should be set'
|
||||
type: boolean
|
||||
required: false
|
||||
default: false
|
||||
|
||||
outputs:
|
||||
base-url:
|
||||
description: 'Base URL for Allure report'
|
||||
@@ -139,9 +146,11 @@ runs:
|
||||
sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js
|
||||
|
||||
# Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
|
||||
# Use sync for the final report to delete files from previous runs
|
||||
time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
|
||||
time aws s3 sync --delete --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||
|
||||
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
|
||||
# and to keep files on the host to upload them to the database
|
||||
time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||
|
||||
# Generate redirect
|
||||
cat <<EOF > ${WORKDIR}/index.html
|
||||
@@ -170,6 +179,41 @@ runs:
|
||||
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
|
||||
fi
|
||||
|
||||
- name: Store Allure test stat in the DB
|
||||
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
REPORT_JSON_URL: ${{ steps.generate-report.outputs.report-json-url }}
|
||||
run: |
|
||||
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR}
|
||||
|
||||
./scripts/pysync
|
||||
|
||||
poetry run python3 scripts/ingest_regress_test_result.py \
|
||||
--revision ${COMMIT_SHA} \
|
||||
--reference ${GITHUB_REF} \
|
||||
--build-type unified \
|
||||
--ingest ${WORKDIR}/report/data/suites.json
|
||||
|
||||
- name: Store Allure test stat in the DB (new)
|
||||
if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
|
||||
run: |
|
||||
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
|
||||
|
||||
./scripts/pysync
|
||||
|
||||
poetry run python3 scripts/ingest_regress_test_result-new-format.py \
|
||||
--reference ${GITHUB_REF} \
|
||||
--revision ${COMMIT_SHA} \
|
||||
--run-id ${GITHUB_RUN_ID} \
|
||||
--run-attempt ${GITHUB_RUN_ATTEMPT} \
|
||||
--test-cases-dir ${WORKDIR}/report/data/test-cases
|
||||
|
||||
- name: Cleanup
|
||||
if: always()
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
44
.github/workflows/build_and_test.yml
vendored
44
.github/workflows/build_and_test.yml
vendored
@@ -432,6 +432,11 @@ jobs:
|
||||
if: ${{ !cancelled() }}
|
||||
id: create-allure-report
|
||||
uses: ./.github/actions/allure-report-generate
|
||||
with:
|
||||
store-test-results-into-db: true
|
||||
env:
|
||||
REGRESS_TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
|
||||
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
if: ${{ !cancelled() }}
|
||||
@@ -452,45 +457,6 @@ jobs:
|
||||
report,
|
||||
})
|
||||
|
||||
- name: Store Allure test stat in the DB
|
||||
if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
|
||||
run: |
|
||||
./scripts/pysync
|
||||
|
||||
curl --fail --output suites.json "${REPORT_JSON_URL}"
|
||||
export BUILD_TYPE=unified
|
||||
export DATABASE_URL="$TEST_RESULT_CONNSTR"
|
||||
|
||||
poetry run python3 scripts/ingest_regress_test_result.py \
|
||||
--revision ${COMMIT_SHA} \
|
||||
--reference ${GITHUB_REF} \
|
||||
--build-type ${BUILD_TYPE} \
|
||||
--ingest suites.json
|
||||
|
||||
- name: Store Allure test stat in the DB (new)
|
||||
if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
BASE_S3_URL: ${{ steps.create-allure-report.outputs.base-s3-url }}
|
||||
run: |
|
||||
aws s3 cp --only-show-errors --recursive ${BASE_S3_URL}/data/test-cases ./test-cases
|
||||
|
||||
./scripts/pysync
|
||||
|
||||
export DATABASE_URL="$TEST_RESULT_CONNSTR"
|
||||
poetry run python3 scripts/ingest_regress_test_result-new-format.py \
|
||||
--reference ${GITHUB_REF} \
|
||||
--revision ${COMMIT_SHA} \
|
||||
--run-id ${GITHUB_RUN_ID} \
|
||||
--run-attempt ${GITHUB_RUN_ATTEMPT} \
|
||||
--test-cases-dir ./test-cases
|
||||
|
||||
coverage-report:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
|
||||
58
Cargo.lock
generated
58
Cargo.lock
generated
@@ -639,6 +639,12 @@ dependencies = [
|
||||
"vsimd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "base64ct"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
|
||||
|
||||
[[package]]
|
||||
name = "bincode"
|
||||
version = "1.3.3"
|
||||
@@ -886,6 +892,8 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
@@ -1010,9 +1018,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.2.7"
|
||||
version = "0.2.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58"
|
||||
checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
@@ -1192,15 +1200,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "dashmap"
|
||||
version = "5.4.0"
|
||||
version = "5.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
|
||||
checksum = "6943ae99c34386c84a470c499d3414f66502a41340aa895406e0d2e4a207b91d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"hashbrown 0.12.3",
|
||||
"hashbrown 0.14.0",
|
||||
"lock_api",
|
||||
"once_cell",
|
||||
"parking_lot_core 0.9.7",
|
||||
"parking_lot_core 0.9.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1649,6 +1657,12 @@ dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a"
|
||||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.8.2"
|
||||
@@ -2073,9 +2087,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.9"
|
||||
version = "0.4.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df"
|
||||
checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"scopeguard",
|
||||
@@ -2339,9 +2353,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.17.1"
|
||||
version = "1.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
||||
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
|
||||
|
||||
[[package]]
|
||||
name = "oorandom"
|
||||
@@ -2640,7 +2654,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core 0.9.7",
|
||||
"parking_lot_core 0.9.8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2659,15 +2673,26 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.9.7"
|
||||
version = "0.9.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521"
|
||||
checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall 0.2.16",
|
||||
"redox_syscall 0.3.5",
|
||||
"smallvec",
|
||||
"windows-sys 0.45.0",
|
||||
"windows-targets 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "password-hash"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166"
|
||||
dependencies = [
|
||||
"base64ct",
|
||||
"rand_core",
|
||||
"subtle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2678,6 +2703,8 @@ checksum = "f0ca0b5a68607598bf3bad68f32227a8164f6254833f84eafaac409cd6746c31"
|
||||
dependencies = [
|
||||
"digest",
|
||||
"hmac",
|
||||
"password-hash",
|
||||
"sha2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3056,6 +3083,7 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"consumption_metrics",
|
||||
"dashmap",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.13.2",
|
||||
|
||||
@@ -54,6 +54,7 @@ comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-utils = "0.8.5"
|
||||
dashmap = "5.5.0"
|
||||
either = "1.8"
|
||||
enum-map = "2.4.2"
|
||||
enumset = "1.0.12"
|
||||
@@ -88,7 +89,7 @@ opentelemetry = "0.19.0"
|
||||
opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
|
||||
opentelemetry-semantic-conventions = "0.11.0"
|
||||
parking_lot = "0.12"
|
||||
pbkdf2 = "0.12.1"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
|
||||
@@ -51,6 +51,7 @@ RUN set -e \
|
||||
--bin safekeeper \
|
||||
--bin storage_broker \
|
||||
--bin proxy \
|
||||
--bin neon_local \
|
||||
--locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
@@ -76,6 +77,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin
|
||||
COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin
|
||||
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
|
||||
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
|
||||
|
||||
@@ -38,7 +38,7 @@ use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock};
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -147,6 +147,7 @@ fn main() -> Result<()> {
|
||||
match spec_json {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => {
|
||||
info!("got spec from cli argument {}", json);
|
||||
spec = Some(serde_json::from_str(json)?);
|
||||
}
|
||||
None => {
|
||||
@@ -182,6 +183,7 @@ fn main() -> Result<()> {
|
||||
|
||||
if let Some(spec) = spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
info!("new pspec.spec: {:?}", pspec.spec);
|
||||
new_state.pspec = Some(pspec);
|
||||
spec_set = true;
|
||||
} else {
|
||||
@@ -196,9 +198,7 @@ fn main() -> Result<()> {
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage,
|
||||
ext_remote_paths: OnceLock::new(),
|
||||
ext_download_progress: RwLock::new(HashMap::new()),
|
||||
library_index: OnceLock::new(),
|
||||
build_tag,
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Condvar, Mutex, OnceLock, RwLock};
|
||||
use std::sync::{Condvar, Mutex, RwLock};
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -14,7 +14,6 @@ use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use postgres::{Client, NoTls};
|
||||
use regex::Regex;
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{error, info, instrument, warn};
|
||||
@@ -60,10 +59,6 @@ pub struct ComputeNode {
|
||||
pub state_changed: Condvar,
|
||||
/// the S3 bucket that we search for extensions in
|
||||
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||
// (key: extension name, value: path to extension archive in remote storage)
|
||||
pub ext_remote_paths: OnceLock<HashMap<String, RemotePath>>,
|
||||
// (key: library name, value: name of extension containing this library)
|
||||
pub library_index: OnceLock<HashMap<String, String>>,
|
||||
// key: ext_archive_name, value: started download time, download_completed?
|
||||
pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
|
||||
pub build_tag: String,
|
||||
@@ -75,7 +70,6 @@ pub struct RemoteExtensionMetrics {
|
||||
num_ext_downloaded: u64,
|
||||
largest_ext_size: u64,
|
||||
total_ext_download_size: u64,
|
||||
prep_extensions_ms: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -745,11 +739,19 @@ impl ComputeNode {
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
info!(
|
||||
"start_compute spec.remote_extensions {:?}",
|
||||
pspec.spec.remote_extensions
|
||||
);
|
||||
|
||||
// This part is sync, because we need to download
|
||||
// remote shared_preload_libraries before postgres start (if any)
|
||||
{
|
||||
if let Some(remote_extensions) = &pspec.spec.remote_extensions {
|
||||
// First, create control files for all availale extensions
|
||||
extension_server::create_control_files(remote_extensions, &self.pgbin);
|
||||
|
||||
let library_load_start_time = Utc::now();
|
||||
let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?;
|
||||
let remote_ext_metrics = self.prepare_preload_libraries(&pspec.spec)?;
|
||||
|
||||
let library_load_time = Utc::now()
|
||||
.signed_duration_since(library_load_start_time)
|
||||
@@ -761,7 +763,6 @@ impl ComputeNode {
|
||||
state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
|
||||
state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
|
||||
state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
|
||||
state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms;
|
||||
info!(
|
||||
"Loading shared_preload_libraries took {:?}ms",
|
||||
library_load_time
|
||||
@@ -918,38 +919,11 @@ LIMIT 100",
|
||||
}
|
||||
}
|
||||
|
||||
// If remote extension storage is configured,
|
||||
// download extension control files
|
||||
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new());
|
||||
info!("custom extensions: {:?}", &custom_ext);
|
||||
|
||||
let (ext_remote_paths, library_index) = extension_server::get_available_extensions(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&custom_ext,
|
||||
&self.build_tag,
|
||||
)
|
||||
.await?;
|
||||
self.ext_remote_paths
|
||||
.set(ext_remote_paths)
|
||||
.expect("this is the only time we set ext_remote_paths");
|
||||
self.library_index
|
||||
.set(library_index)
|
||||
.expect("this is the only time we set library_index");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// download an archive, unzip and place files in correct locations
|
||||
pub async fn download_extension(
|
||||
&self,
|
||||
ext_name: &str,
|
||||
is_library: bool,
|
||||
real_ext_name: String,
|
||||
ext_path: RemotePath,
|
||||
) -> Result<u64, DownloadError> {
|
||||
let remote_storage = self
|
||||
.ext_remote_storage
|
||||
@@ -958,35 +932,6 @@ LIMIT 100",
|
||||
"Remote extensions storage is not configured",
|
||||
)))?;
|
||||
|
||||
let mut real_ext_name = ext_name;
|
||||
if is_library {
|
||||
// sometimes library names might have a suffix like
|
||||
// library.so or library.so.3. We strip this off
|
||||
// because library_index is based on the name without the file extension
|
||||
let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
|
||||
let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
|
||||
|
||||
real_ext_name = self
|
||||
.library_index
|
||||
.get()
|
||||
.expect("must have already downloaded the library_index")
|
||||
.get(&lib_raw_name)
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"library {} is not found",
|
||||
lib_raw_name
|
||||
)))?;
|
||||
}
|
||||
|
||||
let ext_path = &self
|
||||
.ext_remote_paths
|
||||
.get()
|
||||
.expect("error accessing ext_remote_paths")
|
||||
.get(real_ext_name)
|
||||
.ok_or(DownloadError::BadInput(anyhow::anyhow!(
|
||||
"real_ext_name {} is not found",
|
||||
real_ext_name
|
||||
)))?;
|
||||
|
||||
let ext_archive_name = ext_path.object_name().expect("bad path");
|
||||
|
||||
let mut first_try = false;
|
||||
@@ -1039,8 +984,8 @@ LIMIT 100",
|
||||
info!("downloading new extension {ext_archive_name}");
|
||||
|
||||
let download_size = extension_server::download_extension(
|
||||
real_ext_name,
|
||||
ext_path,
|
||||
&real_ext_name,
|
||||
&ext_path,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
)
|
||||
@@ -1058,18 +1003,19 @@ LIMIT 100",
|
||||
#[tokio::main]
|
||||
pub async fn prepare_preload_libraries(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
spec: &ComputeSpec,
|
||||
) -> Result<RemoteExtensionMetrics> {
|
||||
if self.ext_remote_storage.is_none() {
|
||||
return Ok(RemoteExtensionMetrics {
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
total_ext_download_size: 0,
|
||||
prep_extensions_ms: 0,
|
||||
});
|
||||
}
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let remote_extensions = spec
|
||||
.remote_extensions
|
||||
.as_ref()
|
||||
.ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;
|
||||
|
||||
info!("parse shared_preload_libraries from spec.cluster.settings");
|
||||
let mut libs_vec = Vec::new();
|
||||
@@ -1081,6 +1027,7 @@ LIMIT 100",
|
||||
.collect();
|
||||
}
|
||||
info!("parse shared_preload_libraries from provided postgresql.conf");
|
||||
|
||||
// that is used in neon_local and python tests
|
||||
if let Some(conf) = &spec.cluster.postgresql_conf {
|
||||
let conf_lines = conf.split('\n').collect::<Vec<&str>>();
|
||||
@@ -1101,30 +1048,16 @@ LIMIT 100",
|
||||
libs_vec.extend(preload_libs_vec);
|
||||
}
|
||||
|
||||
info!("Download ext_index.json, find the extension paths");
|
||||
let prep_ext_start_time = Utc::now();
|
||||
self.prepare_external_extensions(compute_state).await?;
|
||||
let prep_ext_time_delta = Utc::now()
|
||||
.signed_duration_since(prep_ext_start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64;
|
||||
info!("Prepare extensions took {prep_ext_time_delta}ms");
|
||||
|
||||
// Don't try to download libraries that are not in the index.
|
||||
// Assume that they are already present locally.
|
||||
libs_vec.retain(|lib| {
|
||||
self.library_index
|
||||
.get()
|
||||
.expect("error accessing ext_remote_paths")
|
||||
.contains_key(lib)
|
||||
});
|
||||
libs_vec.retain(|lib| remote_extensions.library_index.contains_key(lib));
|
||||
|
||||
info!("Downloading to shared preload libraries: {:?}", &libs_vec);
|
||||
|
||||
let mut download_tasks = Vec::new();
|
||||
for library in &libs_vec {
|
||||
download_tasks.push(self.download_extension(library, true));
|
||||
let (ext_name, ext_path) = remote_extensions.get_ext(library, true)?;
|
||||
download_tasks.push(self.download_extension(ext_name, ext_path));
|
||||
}
|
||||
let results = join_all(download_tasks).await;
|
||||
|
||||
@@ -1132,7 +1065,6 @@ LIMIT 100",
|
||||
num_ext_downloaded: 0,
|
||||
largest_ext_size: 0,
|
||||
total_ext_download_size: 0,
|
||||
prep_extensions_ms: prep_ext_time_delta,
|
||||
};
|
||||
for result in results {
|
||||
let download_size = match result {
|
||||
|
||||
@@ -73,10 +73,9 @@ More specifically, here is an example ext_index.json
|
||||
*/
|
||||
use anyhow::Context;
|
||||
use anyhow::{self, Result};
|
||||
use futures::future::join_all;
|
||||
use compute_api::spec::RemoteExtSpec;
|
||||
use remote_storage::*;
|
||||
use serde_json;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Read;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::Path;
|
||||
@@ -117,81 +116,6 @@ pub fn get_pg_version(pgbin: &str) -> String {
|
||||
panic!("Unsuported postgres version {human_version}");
|
||||
}
|
||||
|
||||
// download control files for enabled_extensions
|
||||
// return Hashmaps converting library names to extension names (library_index)
|
||||
// and specifying the remote path to the archive for each extension name
|
||||
pub async fn get_available_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
custom_extensions: &[String],
|
||||
build_tag: &str,
|
||||
) -> Result<(HashMap<String, RemotePath>, HashMap<String, String>)> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let index_path = format!("{build_tag}/{pg_version}/ext_index.json");
|
||||
let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?;
|
||||
info!("download ext_index.json from: {:?}", &index_path);
|
||||
|
||||
let mut download = remote_storage.download(&index_path).await?;
|
||||
let mut ext_idx_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut ext_idx_buffer)
|
||||
.await?;
|
||||
info!("ext_index downloaded");
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct Index {
|
||||
public_extensions: Vec<String>,
|
||||
library_index: HashMap<String, String>,
|
||||
extension_data: HashMap<String, ExtensionData>,
|
||||
}
|
||||
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
struct ExtensionData {
|
||||
control_data: HashMap<String, String>,
|
||||
archive_path: String,
|
||||
}
|
||||
|
||||
let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
|
||||
let mut enabled_extensions = ext_index_full.public_extensions;
|
||||
enabled_extensions.extend_from_slice(custom_extensions);
|
||||
let mut library_index = ext_index_full.library_index;
|
||||
let all_extension_data = ext_index_full.extension_data;
|
||||
info!("library_index: {:?}", library_index);
|
||||
|
||||
info!("enabled_extensions: {:?}", enabled_extensions);
|
||||
let mut ext_remote_paths = HashMap::new();
|
||||
let mut file_create_tasks = Vec::new();
|
||||
for extension in enabled_extensions {
|
||||
let ext_data = &all_extension_data[&extension];
|
||||
for (control_file, control_contents) in &ext_data.control_data {
|
||||
let extension_name = control_file
|
||||
.strip_suffix(".control")
|
||||
.expect("control files must end in .control");
|
||||
let control_path = local_sharedir.join(control_file);
|
||||
if !control_path.exists() {
|
||||
ext_remote_paths.insert(
|
||||
extension_name.to_string(),
|
||||
RemotePath::from_string(&ext_data.archive_path)?,
|
||||
);
|
||||
info!("writing file {:?}{:?}", control_path, control_contents);
|
||||
file_create_tasks.push(tokio::fs::write(control_path, control_contents));
|
||||
} else {
|
||||
warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
|
||||
// also delete this from library index
|
||||
library_index.retain(|_, value| value != extension_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
let results = join_all(file_create_tasks).await;
|
||||
for result in results {
|
||||
result?;
|
||||
}
|
||||
info!("ext_remote_paths {:?}", ext_remote_paths);
|
||||
Ok((ext_remote_paths, library_index))
|
||||
}
|
||||
|
||||
// download the archive for a given extension,
|
||||
// unzip it, and place files in the appropriate locations (share/lib)
|
||||
pub async fn download_extension(
|
||||
@@ -253,6 +177,22 @@ pub async fn download_extension(
|
||||
Ok(download_size)
|
||||
}
|
||||
|
||||
// Create extension control files from spec
|
||||
pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
for ext_data in remote_extensions.extension_data.values() {
|
||||
for (control_name, control_content) in &ext_data.control_data {
|
||||
let control_path = local_sharedir.join(control_name);
|
||||
if !control_path.exists() {
|
||||
info!("writing file {:?}{:?}", control_path, control_content);
|
||||
std::fs::write(control_path, control_content).unwrap();
|
||||
} else {
|
||||
warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This function initializes the necessary structs to use remote storage
|
||||
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
|
||||
#[derive(Debug, serde::Deserialize)]
|
||||
|
||||
@@ -13,7 +13,7 @@ use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use num_cpus;
|
||||
use serde_json;
|
||||
use tokio::task;
|
||||
use tracing::{error, info};
|
||||
use tracing::{error, info, warn};
|
||||
use tracing_utils::http::OtelName;
|
||||
|
||||
fn status_response_from_state(state: &ComputeState) -> ComputeStatusResponse {
|
||||
@@ -126,6 +126,15 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
|
||||
// don't even try to download extensions
|
||||
// if no remote storage is configured
|
||||
if compute.ext_remote_storage.is_none() {
|
||||
info!("no extensions remote storage configured");
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
|
||||
let mut is_library = false;
|
||||
if let Some(params) = req.uri().query() {
|
||||
info!("serving {:?} POST request with params: {}", route, params);
|
||||
@@ -137,24 +146,47 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
return resp;
|
||||
}
|
||||
}
|
||||
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}");
|
||||
|
||||
// don't even try to download extensions
|
||||
// if no remote storage is configured
|
||||
if compute.ext_remote_storage.is_none() {
|
||||
info!("no extensions remote storage configured");
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
// get ext_name and path from spec
|
||||
// don't lock compute_state for too long
|
||||
let ext = {
|
||||
let compute_state = compute.state.lock().unwrap();
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
|
||||
match compute.download_extension(&filename, is_library).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
// debug only
|
||||
info!("spec: {:?}", spec);
|
||||
|
||||
let remote_extensions = match spec.remote_extensions.as_ref() {
|
||||
Some(r) => r,
|
||||
None => {
|
||||
info!("no remote extensions spec was provided");
|
||||
let mut resp = Response::new(Body::from("no remote storage configured"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
return resp;
|
||||
}
|
||||
};
|
||||
|
||||
remote_extensions.get_ext(&filename, is_library)
|
||||
};
|
||||
|
||||
match ext {
|
||||
Ok((ext_name, ext_path)) => {
|
||||
match compute.download_extension(ext_name, ext_path).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
warn!("extension download failed to find extension: {}", e);
|
||||
let mut resp = Response::new(Body::from("failed to find file"));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
|
||||
@@ -825,6 +825,16 @@ fn get_safekeeper(env: &local_env::LocalEnv, id: NodeId) -> Result<SafekeeperNod
|
||||
}
|
||||
}
|
||||
|
||||
// Get list of options to append to safekeeper command invocation.
|
||||
fn safekeeper_extra_opts(init_match: &ArgMatches) -> Vec<String> {
|
||||
init_match
|
||||
.get_many::<String>("safekeeper-extra-opt")
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|s| s.to_owned())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match sub_match.subcommand() {
|
||||
Some(safekeeper_command_data) => safekeeper_command_data,
|
||||
@@ -841,7 +851,9 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
|
||||
match sub_name {
|
||||
"start" => {
|
||||
if let Err(e) = safekeeper.start() {
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
|
||||
if let Err(e) = safekeeper.start(extra_opts) {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -866,7 +878,8 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = safekeeper.start() {
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
if let Err(e) = safekeeper.start(extra_opts) {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -893,7 +906,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow
|
||||
|
||||
for node in env.safekeepers.iter() {
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start() {
|
||||
if let Err(e) = safekeeper.start(vec![]) {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false);
|
||||
exit(1);
|
||||
@@ -956,6 +969,14 @@ fn cli() -> Command {
|
||||
|
||||
let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false);
|
||||
|
||||
let safekeeper_extra_opt_arg = Arg::new("safekeeper-extra-opt")
|
||||
.short('e')
|
||||
.long("safekeeper-extra-opt")
|
||||
.num_args(1)
|
||||
.action(ArgAction::Append)
|
||||
.help("Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo")
|
||||
.required(false);
|
||||
|
||||
let tenant_id_arg = Arg::new("tenant-id")
|
||||
.long("tenant-id")
|
||||
.help("Tenant id. Represented as a hexadecimal string 32 symbols length")
|
||||
@@ -1124,6 +1145,7 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
.arg(safekeeper_extra_opt_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local safekeeper")
|
||||
@@ -1134,6 +1156,7 @@ fn cli() -> Command {
|
||||
.about("Restart local safekeeper")
|
||||
.arg(safekeeper_id_arg)
|
||||
.arg(stop_mode_arg.clone())
|
||||
.arg(safekeeper_extra_opt_arg)
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
|
||||
@@ -493,7 +493,7 @@ impl Endpoint {
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
custom_extensions: Some(vec![]),
|
||||
remote_extensions: None,
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
|
||||
@@ -101,7 +101,7 @@ impl SafekeeperNode {
|
||||
self.datadir_path().join("safekeeper.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<Child> {
|
||||
pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
@@ -161,17 +161,28 @@ impl SafekeeperNode {
|
||||
|
||||
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
|
||||
if self.conf.auth_enabled {
|
||||
let key_path_string = key_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned();
|
||||
args.extend([
|
||||
"--auth-validation-public-key-path".to_owned(),
|
||||
key_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned(),
|
||||
"--pg-auth-public-key-path".to_owned(),
|
||||
key_path_string.clone(),
|
||||
]);
|
||||
args.extend([
|
||||
"--pg-tenant-only-auth-public-key-path".to_owned(),
|
||||
key_path_string.clone(),
|
||||
]);
|
||||
args.extend([
|
||||
"--http-auth-public-key-path".to_owned(),
|
||||
key_path_string.clone(),
|
||||
]);
|
||||
}
|
||||
|
||||
args.extend(extra_opts);
|
||||
|
||||
background_process::start_process(
|
||||
&format!("safekeeper-{id}"),
|
||||
&datadir,
|
||||
|
||||
@@ -10,6 +10,9 @@ chrono.workspace = true
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
serde_json.workspace = true
|
||||
regex.workspace = true
|
||||
|
||||
utils = { path = "../utils" }
|
||||
remote_storage = { version = "0.1", path = "../remote_storage/" }
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -107,7 +107,6 @@ pub struct ComputeMetrics {
|
||||
pub num_ext_downloaded: u64,
|
||||
pub largest_ext_size: u64, // these are measured in bytes
|
||||
pub total_ext_download_size: u64,
|
||||
pub prep_extensions_ms: u64,
|
||||
}
|
||||
|
||||
/// Response of the `/computes/{compute_id}/spec` control-plane API.
|
||||
|
||||
@@ -3,11 +3,16 @@
|
||||
//! The spec.json file is used to pass information to 'compute_ctl'. It contains
|
||||
//! all the information needed to start up the right version of PostgreSQL,
|
||||
//! and connect it to the storage nodes.
|
||||
use std::collections::HashMap;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use regex::Regex;
|
||||
use remote_storage::RemotePath;
|
||||
|
||||
/// String type alias representing Postgres identifier and
|
||||
/// intended to be used for DB / role names.
|
||||
pub type PgIdent = String;
|
||||
@@ -61,8 +66,55 @@ pub struct ComputeSpec {
|
||||
/// the pageserver and safekeepers.
|
||||
pub storage_auth_token: Option<String>,
|
||||
|
||||
// list of prefixes to search for custom extensions in remote extension storage
|
||||
// information about available remote extensions
|
||||
pub remote_extensions: Option<RemoteExtSpec>,
|
||||
}
|
||||
|
||||
/// Description of the extensions that are available in remote extension storage.
///
/// `get_ext` resolves an extension or library name against `library_index`
/// and `extension_data`.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct RemoteExtSpec {
|
||||
// Presumably the names of publicly available extensions; not read by `get_ext`.
// TODO confirm against the producer of the spec.
pub public_extensions: Option<Vec<String>>,
|
||||
// Presumably the names of extensions enabled specifically for this compute;
// not read by `get_ext`. TODO confirm against the producer of the spec.
pub custom_extensions: Option<Vec<String>>,
|
||||
// Maps a library name (without the `.so*` suffix, see `get_ext`) to the name
// of the extension that provides it, i.e. to a key of `extension_data`.
pub library_index: HashMap<String, String>,
|
||||
// Per-extension data, keyed by extension name.
pub extension_data: HashMap<String, ExtensionData>,
|
||||
}
|
||||
|
||||
/// Data describing a single remote extension (one value of
/// `RemoteExtSpec::extension_data`).
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct ExtensionData {
|
||||
// Presumably control file name -> control file contents (that is the shape
// used in the sample spec); TODO confirm.
pub control_data: HashMap<String, String>,
|
||||
// Path of the extension archive; `get_ext` parses it with `RemotePath::from_string`.
pub archive_path: String,
|
||||
}
|
||||
|
||||
impl RemoteExtSpec {
|
||||
pub fn get_ext(
|
||||
&self,
|
||||
ext_name: &str,
|
||||
is_library: bool,
|
||||
) -> anyhow::Result<(String, RemotePath)> {
|
||||
let mut real_ext_name = ext_name;
|
||||
if is_library {
|
||||
// sometimes library names might have a suffix like
|
||||
// library.so or library.so.3. We strip this off
|
||||
// because library_index is based on the name without the file extension
|
||||
let strip_lib_suffix = Regex::new(r"\.so.*").unwrap();
|
||||
let lib_raw_name = strip_lib_suffix.replace(real_ext_name, "").to_string();
|
||||
|
||||
real_ext_name = self
|
||||
.library_index
|
||||
.get(&lib_raw_name)
|
||||
.ok_or(anyhow::anyhow!("library {} is not found", lib_raw_name))?;
|
||||
}
|
||||
|
||||
match self.extension_data.get(real_ext_name) {
|
||||
Some(ext_data) => Ok((
|
||||
real_ext_name.to_string(),
|
||||
RemotePath::from_string(&ext_data.archive_path)?,
|
||||
)),
|
||||
None => Err(anyhow::anyhow!(
|
||||
"real_ext_name {} is not found",
|
||||
real_ext_name
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
|
||||
@@ -205,5 +205,43 @@
|
||||
"name": "zenith new",
|
||||
"new_name": "zenith \"new\""
|
||||
}
|
||||
]
|
||||
],
|
||||
"remote_extensions": {
|
||||
"library_index": {
|
||||
"anon": "anon",
|
||||
"postgis-3": "postgis",
|
||||
"libpgrouting-3.4": "postgis",
|
||||
"postgis_raster-3": "postgis",
|
||||
"postgis_sfcgal-3": "postgis",
|
||||
"postgis_topology-3": "postgis",
|
||||
"address_standardizer-3": "postgis"
|
||||
},
|
||||
"extension_data": {
|
||||
"anon": {
|
||||
"archive_path": "5834329303/v15/extensions/anon.tar.zst",
|
||||
"control_data": {
|
||||
"anon.control": "# PostgreSQL Anonymizer (anon) extension\ncomment = ''Data anonymization tools''\ndefault_version = ''1.1.0''\ndirectory=''extension/anon''\nrelocatable = false\nrequires = ''pgcrypto''\nsuperuser = false\nmodule_pathname = ''$libdir/anon''\ntrusted = true\n"
|
||||
}
|
||||
},
|
||||
"postgis": {
|
||||
"archive_path": "5834329303/v15/extensions/postgis.tar.zst",
|
||||
"control_data": {
|
||||
"postgis.control": "# postgis extension\ncomment = ''PostGIS geometry and geography spatial types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis-3''\nrelocatable = false\ntrusted = true\n",
|
||||
"pgrouting.control": "# pgRouting Extension\ncomment = ''pgRouting Extension''\ndefault_version = ''3.4.2''\nmodule_pathname = ''$libdir/libpgrouting-3.4''\nrelocatable = true\nrequires = ''plpgsql''\nrequires = ''postgis''\ntrusted = true\n",
|
||||
"postgis_raster.control": "# postgis_raster extension\ncomment = ''PostGIS raster types and functions''\ndefault_version = ''3.3.2''\nmodule_pathname = ''$libdir/postgis_raster-3''\nrelocatable = false\nrequires = postgis\ntrusted = true\n",
|
||||
"postgis_sfcgal.control": "# postgis topology extension\ncomment = ''PostGIS SFCGAL functions''\ndefault_version = ''3.3.2''\nrelocatable = true\nrequires = postgis\ntrusted = true\n",
|
||||
"postgis_topology.control": "# postgis topology extension\ncomment = ''PostGIS topology spatial types and functions''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = topology\nrequires = postgis\ntrusted = true\n",
|
||||
"address_standardizer.control": "# address_standardizer extension\ncomment = ''Used to parse an address into constituent elements. Generally used to support geocoding address normalization step.''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n",
|
||||
"postgis_tiger_geocoder.control": "# postgis tiger geocoder extension\ncomment = ''PostGIS tiger geocoder and reverse geocoder''\ndefault_version = ''3.3.2''\nrelocatable = false\nschema = tiger\nrequires = ''postgis,fuzzystrmatch''\nsuperuser= false\ntrusted = true\n",
|
||||
"address_standardizer_data_us.control": "# address standardizer us dataset\ncomment = ''Address Standardizer US dataset example''\ndefault_version = ''3.3.2''\nrelocatable = true\ntrusted = true\n"
|
||||
}
|
||||
}
|
||||
},
|
||||
"custom_extensions": [
|
||||
"anon"
|
||||
],
|
||||
"public_extensions": [
|
||||
"postgis"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -189,8 +189,6 @@ impl S3Bucket {
|
||||
let kind = RequestKind::Get;
|
||||
let permit = self.owned_permit(kind).await;
|
||||
|
||||
metrics::inc_get_object();
|
||||
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let get_object = self
|
||||
@@ -205,7 +203,6 @@ impl S3Bucket {
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
|
||||
if get_object.is_err() {
|
||||
metrics::inc_get_object_fail();
|
||||
metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
|
||||
kind,
|
||||
AttemptOutcome::Err,
|
||||
@@ -337,7 +334,6 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
metrics::inc_list_objects();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let fetch_response = self
|
||||
@@ -350,10 +346,6 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})
|
||||
.context("Failed to list S3 prefixes")
|
||||
.map_err(DownloadError::Other);
|
||||
|
||||
@@ -395,7 +387,6 @@ impl RemoteStorage for S3Bucket {
|
||||
let mut all_files = vec![];
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
metrics::inc_list_objects();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let response = self
|
||||
@@ -407,10 +398,6 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})
|
||||
.context("Failed to list files in S3 bucket");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
@@ -443,7 +430,6 @@ impl RemoteStorage for S3Bucket {
|
||||
let kind = RequestKind::Put;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
metrics::inc_put_object();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let body = Body::wrap_stream(ReaderStream::new(from));
|
||||
@@ -458,11 +444,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.content_length(from_size_bytes.try_into()?)
|
||||
.body(bytes_stream)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_put_object_fail();
|
||||
e
|
||||
});
|
||||
.await;
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
@@ -519,7 +501,6 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
|
||||
metrics::inc_delete_objects(chunk.len() as u64);
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let resp = self
|
||||
@@ -537,8 +518,10 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
match resp {
|
||||
Ok(resp) => {
|
||||
metrics::BUCKET_METRICS
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
metrics::inc_delete_objects_fail(errors.len() as u64);
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
@@ -546,7 +529,6 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::inc_delete_objects_fail(chunk.len() as u64);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
@@ -555,32 +537,8 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let kind = RequestKind::Delete;
|
||||
let _guard = self.permit(kind).await;
|
||||
|
||||
metrics::inc_delete_object();
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let res = self
|
||||
.client
|
||||
.delete_object()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.key(self.relative_path_to_s3_object(path))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_delete_object_fail();
|
||||
e
|
||||
});
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &res, started_at);
|
||||
|
||||
res?;
|
||||
|
||||
Ok(())
|
||||
let paths = std::array::from_ref(path);
|
||||
self.delete_objects(paths).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
use metrics::{register_histogram_vec, register_int_counter_vec, Histogram, IntCounter};
|
||||
use metrics::{
|
||||
register_histogram_vec, register_int_counter, register_int_counter_vec, Histogram, IntCounter,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
|
||||
@@ -125,41 +127,22 @@ impl PassFailCancelledRequestTyped<Histogram> {
|
||||
}
|
||||
|
||||
pub(super) struct BucketMetrics {
|
||||
/// Total requests attempted
|
||||
// TODO: remove after next release and migrate dashboards to `sum by (result) (remote_storage_s3_requests_count)`
|
||||
requests: RequestTyped<IntCounter>,
|
||||
/// Subset of attempted requests failed
|
||||
// TODO: remove after next release and migrate dashboards to `remote_storage_s3_requests_count{result="err"}`
|
||||
failed: RequestTyped<IntCounter>,
|
||||
|
||||
/// Full request duration until successful completion, error or cancellation.
|
||||
pub(super) req_seconds: PassFailCancelledRequestTyped<Histogram>,
|
||||
/// Total amount of seconds waited on queue.
|
||||
pub(super) wait_seconds: RequestTyped<Histogram>,
|
||||
|
||||
/// Track how many semaphore awaits were cancelled per request type.
|
||||
///
|
||||
/// This is in case cancellations are happening more than expected.
|
||||
pub(super) cancelled_waits: RequestTyped<IntCounter>,
|
||||
|
||||
/// Total amount of deleted objects in batches or single requests.
|
||||
pub(super) deleted_objects_total: IntCounter,
|
||||
}
|
||||
|
||||
impl Default for BucketMetrics {
|
||||
fn default() -> Self {
|
||||
let requests = register_int_counter_vec!(
|
||||
"remote_storage_s3_requests_count",
|
||||
"Number of s3 requests of particular type",
|
||||
&["request_type"],
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
let requests =
|
||||
RequestTyped::build_with(|kind| requests.with_label_values(&[kind.as_str()]));
|
||||
|
||||
let failed = register_int_counter_vec!(
|
||||
"remote_storage_s3_failures_count",
|
||||
"Number of failed s3 requests of particular type",
|
||||
&["request_type"],
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
let failed = RequestTyped::build_with(|kind| failed.with_label_values(&[kind.as_str()]));
|
||||
|
||||
let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
|
||||
|
||||
let req_seconds = register_histogram_vec!(
|
||||
@@ -192,52 +175,17 @@ impl Default for BucketMetrics {
|
||||
let cancelled_waits =
|
||||
RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
|
||||
|
||||
let deleted_objects_total = register_int_counter!(
|
||||
"remote_storage_s3_deleted_objects_total",
|
||||
"Amount of deleted objects in total",
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
Self {
|
||||
requests,
|
||||
failed,
|
||||
req_seconds,
|
||||
wait_seconds,
|
||||
cancelled_waits,
|
||||
deleted_objects_total,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn inc_get_object() {
|
||||
BUCKET_METRICS.requests.get(Get).inc()
|
||||
}
|
||||
|
||||
pub fn inc_get_object_fail() {
|
||||
BUCKET_METRICS.failed.get(Get).inc()
|
||||
}
|
||||
|
||||
pub fn inc_put_object() {
|
||||
BUCKET_METRICS.requests.get(Put).inc()
|
||||
}
|
||||
|
||||
pub fn inc_put_object_fail() {
|
||||
BUCKET_METRICS.failed.get(Put).inc()
|
||||
}
|
||||
|
||||
pub fn inc_delete_object() {
|
||||
BUCKET_METRICS.requests.get(Delete).inc()
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects(count: u64) {
|
||||
BUCKET_METRICS.requests.get(Delete).inc_by(count)
|
||||
}
|
||||
|
||||
pub fn inc_delete_object_fail() {
|
||||
BUCKET_METRICS.failed.get(Delete).inc()
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects_fail(count: u64) {
|
||||
BUCKET_METRICS.failed.get(Delete).inc_by(count)
|
||||
}
|
||||
|
||||
pub fn inc_list_objects() {
|
||||
BUCKET_METRICS.requests.get(List).inc()
|
||||
}
|
||||
|
||||
pub fn inc_list_objects_fail() {
|
||||
BUCKET_METRICS.failed.get(List).inc()
|
||||
}
|
||||
|
||||
@@ -71,6 +71,13 @@ impl UnreliableWrapper {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Delete `path` through the wrapped storage (`self.inner`).
///
/// When `attempt` is true the delete is first passed to `self.attempt`, and an
/// error returned from there is propagated without calling the inner storage.
/// Callers that have already recorded the attempt themselves (the batch
/// `delete_objects` path) pass `false` to avoid recording it a second time.
async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
|
||||
if attempt {
|
||||
self.attempt(RemoteOp::Delete(path.clone()))?;
|
||||
}
|
||||
self.inner.delete(path).await
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -122,15 +129,15 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::Delete(path.clone()))?;
|
||||
self.inner.delete(path).await
|
||||
self.delete_inner(path, true).await
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
|
||||
let mut error_counter = 0;
|
||||
for path in paths {
|
||||
if (self.delete(path).await).is_err() {
|
||||
// Dont record attempt because it was already recorded above
|
||||
if (self.delete_inner(path, false).await).is_err() {
|
||||
error_counter += 1;
|
||||
}
|
||||
}
|
||||
|
||||
188
libs/utils/src/backoff.rs
Normal file
188
libs/utils/src/backoff.rs
Normal file
@@ -0,0 +1,188 @@
|
||||
use std::fmt::{Debug, Display};
|
||||
|
||||
use futures::Future;
|
||||
|
||||
/// Default `base_increment` for the backoff functions below: the delay for
/// attempt `n > 0` is `(1.0 + base_increment)^n` seconds, before capping.
pub const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||
/// Default upper bound, in seconds, for a single backoff delay.
pub const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||
|
||||
pub async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
||||
let backoff_duration_seconds =
|
||||
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
||||
if backoff_duration_seconds > 0.0 {
|
||||
tracing::info!(
|
||||
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the backoff delay, in seconds, for attempt number `n`.
///
/// Attempt `0` is never delayed. For every later attempt the delay is
/// `(1.0 + base_increment)^n`, capped at `max_seconds`.
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
    // The very first attempt goes through without any waiting.
    if n == 0 {
        return 0.0;
    }

    let growth_factor = 1.0 + base_increment;
    let uncapped_seconds = growth_factor.powf(f64::from(n));
    uncapped_seconds.min(max_seconds)
}
|
||||
|
||||
/// retries passed operation until one of the following conditions are met:
|
||||
/// Encountered error is considered as permanent (non-retryable)
|
||||
/// Retries have been exhausted.
|
||||
/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
|
||||
/// When attempts cross `warn_threshold` function starts to emit log warnings.
|
||||
/// `description` argument is added to log messages. Its value should identify the `op` is doing
|
||||
pub async fn retry<T, O, F, E>(
|
||||
mut op: O,
|
||||
is_permanent: impl Fn(&E) -> bool,
|
||||
warn_threshold: u32,
|
||||
max_retries: u32,
|
||||
description: &str,
|
||||
) -> Result<T, E>
|
||||
where
|
||||
// Not std::error::Error because anyhow::Error doesnt implement it.
|
||||
// For context see https://github.com/dtolnay/anyhow/issues/63
|
||||
E: Display + Debug,
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
let result = op().await;
|
||||
match result {
|
||||
Ok(_) => {
|
||||
if attempts > 0 {
|
||||
tracing::info!("{description} succeeded after {attempts} retries");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// These are "permanent" errors that should not be retried.
|
||||
Err(ref e) if is_permanent(e) => {
|
||||
return result;
|
||||
}
|
||||
// Assume that any other failure might be transient, and the operation might
|
||||
// succeed if we just keep trying.
|
||||
Err(err) if attempts < warn_threshold => {
|
||||
tracing::info!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(err) if attempts < max_retries => {
|
||||
tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(ref err) => {
|
||||
// Operation failed `max_attempts` times. Time to give up.
|
||||
tracing::warn!(
|
||||
"{description} still failed after {attempts} retries, giving up: {err:?}"
|
||||
);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
// sleep and retry
|
||||
exponential_backoff(
|
||||
attempts,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
)
|
||||
.await;
|
||||
attempts += 1;
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::io;

    use tokio::sync::Mutex;

    use super::*;

    /// With the default parameters the delay never shrinks from one attempt to
    /// the next and ends up at the configured maximum.
    #[test]
    fn backoff_defaults_produce_growing_backoff_sequence() {
        let mut previous: Option<f64> = None;

        for i in 0..10_000 {
            let new_backoff_value = exponential_backoff_duration_seconds(
                i,
                DEFAULT_BASE_BACKOFF_SECONDS,
                DEFAULT_MAX_BACKOFF_SECONDS,
            );

            if let Some(old_backoff_value) = previous {
                assert!(
                    old_backoff_value <= new_backoff_value,
                    "{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
                )
            }
            previous = Some(new_backoff_value);
        }

        assert_eq!(
            previous.expect("Should have produced backoff values to compare"),
            DEFAULT_MAX_BACKOFF_SECONDS,
            "Given big enough of retries, backoff should reach its allowed max value"
        );
    }

    /// An operation that always fails is run once plus `max_retries` more times.
    #[tokio::test(start_paused = true)]
    async fn retry_always_error() {
        let count = Mutex::new(0);
        let always_failing = || async {
            *count.lock().await += 1;
            Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
        };

        let err_result = retry(always_failing, |_e| false, 1, 1, "work").await;

        assert!(err_result.is_err());

        assert_eq!(*count.lock().await, 2);
    }

    /// Two transient failures followed by a success fit into two retries.
    #[tokio::test(start_paused = true)]
    async fn retry_ok_after_err() {
        let count = Mutex::new(0);
        let fails_twice = || async {
            let mut failures = count.lock().await;
            if *failures <= 1 {
                *failures += 1;
                Err(io::Error::from(io::ErrorKind::Other))
            } else {
                Ok(())
            }
        };

        retry(fails_twice, |_e| false, 2, 2, "work")
            .await
            .unwrap();
    }

    /// An error classified as permanent ends the loop after the first attempt.
    #[tokio::test(start_paused = true)]
    async fn dont_retry_permanent_errors() {
        let count = Mutex::new(0);
        let fails_twice = || async {
            let mut failures = count.lock().await;
            if *failures <= 1 {
                *failures += 1;
                Err(io::Error::from(io::ErrorKind::Other))
            } else {
                Ok(())
            }
        };

        let _ = retry(fails_twice, |_e| true, 2, 2, "work")
            .await
            .unwrap_err();

        assert_eq!(*count.lock().await, 1);
    }
}
|
||||
@@ -111,6 +111,10 @@ pub fn fsync(path: &Path) -> io::Result<()> {
|
||||
.map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
|
||||
}
|
||||
|
||||
pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
tokio::fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
//! `utils` is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
pub mod backoff;
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
|
||||
@@ -72,7 +72,7 @@ async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
.await?;
|
||||
let cursor = BlockCursor::new(&file);
|
||||
for (k, v) in all {
|
||||
let value = cursor.read_blob(v.pos())?;
|
||||
let value = cursor.read_blob(v.pos()).await?;
|
||||
println!("key:{} value_len:{}", k, value.len());
|
||||
}
|
||||
// TODO(chi): special handling for last key?
|
||||
|
||||
@@ -85,6 +85,7 @@
|
||||
//! The solution is that all code paths are infected with precisely one
|
||||
//! [`RequestContext`] argument. Functions in the middle of the call chain
|
||||
//! only need to pass it on.
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
@@ -92,6 +93,7 @@ use crate::task_mgr::TaskKind;
|
||||
pub struct RequestContext {
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
access_stats_behavior: AccessStatsBehavior,
|
||||
}
|
||||
|
||||
/// Desired behavior if the operation requires an on-demand download
|
||||
@@ -109,6 +111,67 @@ pub enum DownloadBehavior {
|
||||
Error,
|
||||
}
|
||||
|
||||
/// Whether this request should update access times used in LRU eviction
///
/// The value is carried in the `access_stats_behavior` field of `RequestContext`;
/// `RequestContextBuilder::new` defaults it to `Update`.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
|
||||
pub(crate) enum AccessStatsBehavior {
|
||||
/// Update access times: this request's access to data should be taken
|
||||
/// as a hint that the accessed layer is likely to be accessed again
|
||||
Update,
|
||||
|
||||
/// Do not update access times: this request is accessing the layer
|
||||
/// but does not want to indicate that the layer should be retained in cache,
|
||||
/// perhaps because the requestor is a compaction routine that will soon cover
|
||||
/// this layer with another.
|
||||
Skip,
|
||||
}
|
||||
|
||||
pub struct RequestContextBuilder {
|
||||
inner: RequestContext,
|
||||
}
|
||||
|
||||
impl RequestContextBuilder {
|
||||
/// A new builder with default settings
|
||||
pub fn new(task_kind: TaskKind) -> Self {
|
||||
Self {
|
||||
inner: RequestContext {
|
||||
task_kind,
|
||||
download_behavior: DownloadBehavior::Download,
|
||||
access_stats_behavior: AccessStatsBehavior::Update,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn extend(original: &RequestContext) -> Self {
|
||||
Self {
|
||||
// This is like a Copy, but avoid implementing Copy because ordinary users of
|
||||
// RequestContext should always move or ref it.
|
||||
inner: RequestContext {
|
||||
task_kind: original.task_kind,
|
||||
download_behavior: original.download_behavior,
|
||||
access_stats_behavior: original.access_stats_behavior,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Configure the DownloadBehavior of the context: whether to
|
||||
/// download missing layers, and/or warn on the download.
|
||||
pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
|
||||
self.inner.download_behavior = b;
|
||||
self
|
||||
}
|
||||
|
||||
/// Configure the AccessStatsBehavior of the context: whether layer
|
||||
/// accesses should update the access time of the layer.
|
||||
pub(crate) fn access_stats_behavior(mut self, b: AccessStatsBehavior) -> Self {
|
||||
self.inner.access_stats_behavior = b;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> RequestContext {
|
||||
self.inner
|
||||
}
|
||||
}
|
||||
|
||||
impl RequestContext {
|
||||
/// Create a new RequestContext that has no parent.
|
||||
///
|
||||
@@ -123,10 +186,9 @@ impl RequestContext {
|
||||
/// because someone explicitly canceled it.
|
||||
/// It has no parent, so it cannot inherit cancellation from there.
|
||||
pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
RequestContext {
|
||||
task_kind,
|
||||
download_behavior,
|
||||
}
|
||||
RequestContextBuilder::new(task_kind)
|
||||
.download_behavior(download_behavior)
|
||||
.build()
|
||||
}
|
||||
|
||||
/// Create a detached child context for a task that may outlive `self`.
|
||||
@@ -187,10 +249,7 @@ impl RequestContext {
|
||||
}
|
||||
|
||||
fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
RequestContext {
|
||||
task_kind,
|
||||
download_behavior,
|
||||
}
|
||||
Self::new(task_kind, download_behavior)
|
||||
}
|
||||
|
||||
pub fn task_kind(&self) -> TaskKind {
|
||||
@@ -200,4 +259,8 @@ impl RequestContext {
|
||||
pub fn download_behavior(&self) -> DownloadBehavior {
|
||||
self.download_behavior
|
||||
}
|
||||
|
||||
/// Returns the [`AccessStatsBehavior`] stored in this context.
pub(crate) fn access_stats_behavior(&self) -> AccessStatsBehavior {
|
||||
self.access_stats_behavior
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,28 +95,6 @@ pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
std::process::exit(exit_code);
|
||||
}
|
||||
|
||||
const DEFAULT_BASE_BACKOFF_SECONDS: f64 = 0.1;
|
||||
const DEFAULT_MAX_BACKOFF_SECONDS: f64 = 3.0;
|
||||
|
||||
async fn exponential_backoff(n: u32, base_increment: f64, max_seconds: f64) {
|
||||
let backoff_duration_seconds =
|
||||
exponential_backoff_duration_seconds(n, base_increment, max_seconds);
|
||||
if backoff_duration_seconds > 0.0 {
|
||||
info!(
|
||||
"Backoff: waiting {backoff_duration_seconds} seconds before processing with the task",
|
||||
);
|
||||
tokio::time::sleep(std::time::Duration::from_secs_f64(backoff_duration_seconds)).await;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds: f64) -> f64 {
|
||||
if n == 0 {
|
||||
0.0
|
||||
} else {
|
||||
(1.0 + base_increment).powf(f64::from(n)).min(max_seconds)
|
||||
}
|
||||
}
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>/metadata`.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
@@ -238,37 +216,6 @@ async fn timed<Fut: std::future::Future>(
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod backoff_defaults_tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn backoff_defaults_produce_growing_backoff_sequence() {
|
||||
let mut current_backoff_value = None;
|
||||
|
||||
for i in 0..10_000 {
|
||||
let new_backoff_value = exponential_backoff_duration_seconds(
|
||||
i,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
);
|
||||
|
||||
if let Some(old_backoff_value) = current_backoff_value.replace(new_backoff_value) {
|
||||
assert!(
|
||||
old_backoff_value <= new_backoff_value,
|
||||
"{i}th backoff value {new_backoff_value} is smaller than the previous one {old_backoff_value}"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
current_backoff_value.expect("Should have produced backoff values to compare"),
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
"Given big enough of retries, backoff should reach its allowed max value"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod timed_tests {
|
||||
use super::timed;
|
||||
|
||||
@@ -47,11 +47,13 @@ use std::{
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::tenant::{block_io, ephemeral_file, writeback_ephemeral_file};
|
||||
use crate::{metrics::PageCacheSizeMetrics, repository::Key};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
@@ -95,8 +97,12 @@ enum CacheKey {
|
||||
hash_key: MaterializedPageHashKey,
|
||||
lsn: Lsn,
|
||||
},
|
||||
EphemeralPage {
|
||||
file_id: ephemeral_file::FileId,
|
||||
blkno: u32,
|
||||
},
|
||||
ImmutableFilePage {
|
||||
file_id: u64,
|
||||
file_id: block_io::FileId,
|
||||
blkno: u32,
|
||||
},
|
||||
}
|
||||
@@ -122,6 +128,7 @@ struct Slot {
|
||||
struct SlotInner {
|
||||
key: Option<CacheKey>,
|
||||
buf: &'static mut [u8; PAGE_SZ],
|
||||
dirty: bool,
|
||||
}
|
||||
|
||||
impl Slot {
|
||||
@@ -170,7 +177,9 @@ pub struct PageCache {
|
||||
/// can have a separate mapping map, next to this field.
|
||||
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
|
||||
immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,
|
||||
ephemeral_page_map: RwLock<HashMap<(ephemeral_file::FileId, u32), usize>>,
|
||||
|
||||
immutable_page_map: RwLock<HashMap<(block_io::FileId, u32), usize>>,
|
||||
|
||||
/// The actual buffers with their metadata.
|
||||
slots: Box<[Slot]>,
|
||||
@@ -249,6 +258,14 @@ impl PageWriteGuard<'_> {
|
||||
);
|
||||
self.valid = true;
|
||||
}
|
||||
pub fn mark_dirty(&mut self) {
|
||||
// only ephemeral pages can be dirty ATM.
|
||||
assert!(matches!(
|
||||
self.inner.key,
|
||||
Some(CacheKey::EphemeralPage { .. })
|
||||
));
|
||||
self.inner.dirty = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageWriteGuard<'_> {
|
||||
@@ -263,6 +280,7 @@ impl Drop for PageWriteGuard<'_> {
|
||||
let self_key = self.inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
self.inner.key = None;
|
||||
self.inner.dirty = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -370,16 +388,62 @@ impl PageCache {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with immutable file pages.
|
||||
// Section 1.2: Public interface functions for working with Ephemeral pages.
|
||||
|
||||
pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
|
||||
pub fn read_ephemeral_buf(
|
||||
&self,
|
||||
file_id: ephemeral_file::FileId,
|
||||
blkno: u32,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
pub fn write_ephemeral_buf(
|
||||
&self,
|
||||
file_id: ephemeral_file::FileId,
|
||||
blkno: u32,
|
||||
) -> anyhow::Result<WriteBufResult> {
|
||||
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_write(&cache_key)
|
||||
}
|
||||
|
||||
/// Immediately drop all buffers belonging to given file, without writeback
|
||||
pub fn drop_buffers_for_ephemeral(&self, drop_file_id: ephemeral_file::FileId) {
|
||||
for slot_idx in 0..self.slots.len() {
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
if let Some(key) = &inner.key {
|
||||
match key {
|
||||
CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
inner.dirty = false;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Section 1.3: Public interface functions for working with immutable file pages.
|
||||
|
||||
pub fn read_immutable_buf(
|
||||
&self,
|
||||
file_id: block_io::FileId,
|
||||
blkno: u32,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
/// Immediately drop all buffers belonging to given file, without writeback
|
||||
pub fn drop_buffers_for_immutable(&self, drop_file_id: u64) {
|
||||
pub fn drop_buffers_for_immutable(&self, drop_file_id: block_io::FileId) {
|
||||
for slot_idx in 0..self.slots.len() {
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
@@ -392,6 +456,7 @@ impl PageCache {
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
inner.dirty = false;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
@@ -469,6 +534,10 @@ impl PageCache {
|
||||
CacheKey::MaterializedPage { .. } => {
|
||||
unreachable!("Materialized pages use lookup_materialized_page")
|
||||
}
|
||||
CacheKey::EphemeralPage { .. } => (
|
||||
&crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
|
||||
&crate::metrics::PAGE_CACHE.read_hits_ephemeral,
|
||||
),
|
||||
CacheKey::ImmutableFilePage { .. } => (
|
||||
&crate::metrics::PAGE_CACHE.read_accesses_immutable,
|
||||
&crate::metrics::PAGE_CACHE.read_hits_immutable,
|
||||
@@ -509,6 +578,7 @@ impl PageCache {
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return Ok(ReadBufResult::NotFound(PageWriteGuard {
|
||||
@@ -570,6 +640,7 @@ impl PageCache {
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return Ok(WriteBufResult::NotFound(PageWriteGuard {
|
||||
@@ -608,6 +679,10 @@ impl PageCache {
|
||||
*lsn = version.lsn;
|
||||
Some(version.slot_idx)
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let map = self.immutable_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
@@ -631,6 +706,10 @@ impl PageCache {
|
||||
None
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let map = self.immutable_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
@@ -664,6 +743,12 @@ impl PageCache {
|
||||
panic!("could not find old key in mapping")
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
@@ -703,7 +788,17 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(slot_idx);
|
||||
self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
@@ -754,8 +849,25 @@ impl PageCache {
|
||||
}
|
||||
};
|
||||
if let Some(old_key) = &inner.key {
|
||||
if inner.dirty {
|
||||
if let Err(err) = Self::writeback(old_key, inner.buf) {
|
||||
// Writing the page to disk failed.
|
||||
//
|
||||
// FIXME: What to do here, when? We could propagate the error to the
|
||||
// caller, but victim buffer is generally unrelated to the original
|
||||
// call. It can even belong to a different tenant. Currently, we
|
||||
// report the error to the log and continue the clock sweep to find
|
||||
// a different victim. But if the problem persists, the page cache
|
||||
// could fill up with dirty pages that we cannot evict, and we will
|
||||
// loop retrying the writebacks indefinitely.
|
||||
error!("writeback of buffer {:?} failed: {}", old_key, err);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(old_key);
|
||||
inner.dirty = false;
|
||||
inner.key = None;
|
||||
}
|
||||
return Ok((slot_idx, inner));
|
||||
@@ -763,6 +875,28 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
|
||||
fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: _,
|
||||
lsn: _,
|
||||
} => Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"unexpected dirty materialized page",
|
||||
)),
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
writeback_ephemeral_file(*file_id, *blkno, buf)
|
||||
}
|
||||
CacheKey::ImmutableFilePage {
|
||||
file_id: _,
|
||||
blkno: _,
|
||||
} => Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"unexpected dirty immutable page",
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize a new page cache
|
||||
///
|
||||
/// This should be called only once at page server startup.
|
||||
@@ -773,6 +907,7 @@ impl PageCache {
|
||||
|
||||
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
|
||||
size_metrics.max_bytes.set_page_sz(num_pages);
|
||||
size_metrics.current_bytes_ephemeral.set_page_sz(0);
|
||||
size_metrics.current_bytes_immutable.set_page_sz(0);
|
||||
size_metrics.current_bytes_materialized_page.set_page_sz(0);
|
||||
|
||||
@@ -782,7 +917,11 @@ impl PageCache {
|
||||
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
|
||||
|
||||
Slot {
|
||||
inner: RwLock::new(SlotInner { key: None, buf }),
|
||||
inner: RwLock::new(SlotInner {
|
||||
key: None,
|
||||
buf,
|
||||
dirty: false,
|
||||
}),
|
||||
usage_count: AtomicU8::new(0),
|
||||
}
|
||||
})
|
||||
@@ -790,6 +929,7 @@ impl PageCache {
|
||||
|
||||
Self {
|
||||
materialized_page_map: Default::default(),
|
||||
ephemeral_page_map: Default::default(),
|
||||
immutable_page_map: Default::default(),
|
||||
slots,
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
|
||||
@@ -136,6 +136,9 @@ pub use timeline::{
|
||||
LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
|
||||
};
|
||||
|
||||
// re-export this function so that page_cache.rs can use it.
|
||||
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
|
||||
|
||||
// re-export for use in remote_timeline_client.rs
|
||||
pub use crate::tenant::metadata::save_metadata;
|
||||
|
||||
@@ -1101,8 +1104,9 @@ impl Tenant {
|
||||
{
|
||||
match e {
|
||||
LoadLocalTimelineError::Load(source) => {
|
||||
return Err(anyhow::anyhow!(source)
|
||||
.context("Failed to load local timeline: {timeline_id}"))
|
||||
return Err(anyhow::anyhow!(source)).with_context(|| {
|
||||
format!("Failed to load local timeline: {timeline_id}")
|
||||
})
|
||||
}
|
||||
LoadLocalTimelineError::ResumeDeletion(source) => {
|
||||
// Make sure resumed deletion won't fail loading for entire tenant.
|
||||
|
||||
@@ -21,14 +21,14 @@ where
|
||||
R: BlockReader,
|
||||
{
|
||||
/// Read a blob into a new buffer.
|
||||
pub fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
pub async fn read_blob(&self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_blob_into_buf(offset, &mut buf)?;
|
||||
self.read_blob_into_buf(offset, &mut buf).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
/// Read blob into the given buffer. Any previous contents in the buffer
|
||||
/// are overwritten.
|
||||
pub fn read_blob_into_buf(
|
||||
pub async fn read_blob_into_buf(
|
||||
&self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
//! Low-level Block-oriented I/O functions
|
||||
//!
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::{ReadBufResult, PAGE_SZ};
|
||||
use crate::page_cache::{self, PageReadGuard, ReadBufResult, PAGE_SZ};
|
||||
use bytes::Bytes;
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::os::unix::fs::FileExt;
|
||||
@@ -15,14 +14,12 @@ use std::sync::atomic::AtomicU64;
|
||||
/// There are currently two implementations: EphemeralFile, and FileBlockReader
|
||||
/// below.
|
||||
pub trait BlockReader {
|
||||
type BlockLease: Deref<Target = [u8; PAGE_SZ]> + 'static;
|
||||
|
||||
///
|
||||
/// Read a block. Returns a "lease" object that can be used to
|
||||
/// access the contents of the page. (For the page cache, the
|
||||
/// lease object represents a lock on the buffer.)
|
||||
///
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error>;
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error>;
|
||||
|
||||
///
|
||||
/// Create a new "cursor" for reading from this reader.
|
||||
@@ -41,13 +38,48 @@ impl<B> BlockReader for &B
|
||||
where
|
||||
B: BlockReader,
|
||||
{
|
||||
type BlockLease = B::BlockLease;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
(*self).read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
/// A block accessible for reading
|
||||
///
|
||||
/// During builds with `#[cfg(test)]`, this is a proper enum
|
||||
/// with two variants to support testing code. During normal
|
||||
/// builds, it just has one variant and is thus a cheap newtype
|
||||
/// wrapper of [`PageReadGuard`]
|
||||
pub enum BlockLease {
|
||||
PageReadGuard(PageReadGuard<'static>),
|
||||
#[cfg(test)]
|
||||
Rc(std::rc::Rc<[u8; PAGE_SZ]>),
|
||||
}
|
||||
|
||||
impl From<PageReadGuard<'static>> for BlockLease {
|
||||
fn from(value: PageReadGuard<'static>) -> Self {
|
||||
BlockLease::PageReadGuard(value)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl From<std::rc::Rc<[u8; PAGE_SZ]>> for BlockLease {
|
||||
fn from(value: std::rc::Rc<[u8; PAGE_SZ]>) -> Self {
|
||||
BlockLease::Rc(value)
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for BlockLease {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
BlockLease::PageReadGuard(v) => v.deref(),
|
||||
#[cfg(test)]
|
||||
BlockLease::Rc(v) => v.deref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A "cursor" for efficiently reading multiple pages from a BlockReader
|
||||
///
|
||||
@@ -80,11 +112,17 @@ where
|
||||
BlockCursor { reader }
|
||||
}
|
||||
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<R::BlockLease, std::io::Error> {
|
||||
pub fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.reader.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
static NEXT_ID: AtomicU64 = AtomicU64::new(1);
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct FileId(u64);
|
||||
|
||||
fn next_file_id() -> FileId {
|
||||
FileId(NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed))
|
||||
}
|
||||
|
||||
/// An adapter for reading a (virtual) file using the page cache.
|
||||
///
|
||||
@@ -94,7 +132,7 @@ pub struct FileBlockReader<F> {
|
||||
pub file: F,
|
||||
|
||||
/// Unique ID of this file, used as key in the page cache.
|
||||
file_id: u64,
|
||||
file_id: FileId,
|
||||
}
|
||||
|
||||
impl<F> FileBlockReader<F>
|
||||
@@ -102,7 +140,7 @@ where
|
||||
F: FileExt,
|
||||
{
|
||||
pub fn new(file: F) -> Self {
|
||||
let file_id = NEXT_ID.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
let file_id = next_file_id();
|
||||
|
||||
FileBlockReader { file_id, file }
|
||||
}
|
||||
@@ -118,9 +156,7 @@ impl<F> BlockReader for FileBlockReader<F>
|
||||
where
|
||||
F: FileExt,
|
||||
{
|
||||
type BlockLease = page_cache::PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
@@ -132,7 +168,7 @@ where
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => break Ok(guard),
|
||||
ReadBufResult::Found(guard) => break Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum)?;
|
||||
|
||||
@@ -10,7 +10,7 @@ use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, instrument, warn, Instrument, Span};
|
||||
|
||||
use utils::{
|
||||
completion, crashsafe, fs_ext,
|
||||
backoff, completion, crashsafe, fs_ext,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
@@ -23,12 +23,13 @@ use crate::{
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantsMap},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant,
|
||||
};
|
||||
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3;
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTenantError {
|
||||
@@ -71,10 +72,19 @@ async fn create_remote_delete_mark(
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||
|
||||
let data: &[u8] = &[];
|
||||
remote_storage
|
||||
.upload(data, 0, &remote_mark_path, None)
|
||||
.await
|
||||
.context("mark upload")?;
|
||||
backoff::retry(
|
||||
|| async {
|
||||
remote_storage
|
||||
.upload(data, 0, &remote_mark_path, None)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
)
|
||||
.await
|
||||
.context("mark_upload")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -154,9 +164,16 @@ async fn remove_tenant_remote_delete_mark(
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
remote_storage
|
||||
.delete(&remote_tenant_delete_mark_path(conf, tenant_id)?)
|
||||
.await?;
|
||||
let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
|
||||
backoff::retry(
|
||||
|| async { remote_storage.delete(&path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"remove_tenant_remote_delete_mark",
|
||||
)
|
||||
.await
|
||||
.context("remove_tenant_remote_delete_mark")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -195,6 +212,19 @@ async fn cleanup_remaining_fs_traces(
|
||||
))?
|
||||
});
|
||||
|
||||
// Make sure previous deletions are ordered before mark removal.
|
||||
// Otherwise there is no guarantee that they reach the disk before mark deletion.
|
||||
// So it's possible for mark to reach disk first and for other deletions
|
||||
// to be reordered later and thus missed if a crash occurs.
|
||||
// Note that we don't need to sync after mark file is removed
|
||||
// because we can tolerate the case when mark file reappears on startup.
|
||||
let tenant_path = &conf.tenant_path(tenant_id);
|
||||
if tenant_path.exists() {
|
||||
crashsafe::fsync_async(&conf.tenant_path(tenant_id))
|
||||
.await
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
}
|
||||
|
||||
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
@@ -208,6 +238,30 @@ async fn cleanup_remaining_fs_traces(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn remote_delete_mark_exists(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<bool> {
|
||||
// If remote storage is there we rely on it
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
|
||||
|
||||
let result = backoff::retry(
|
||||
|| async { remote_storage.download(&remote_mark_path).await },
|
||||
|e| matches!(e, DownloadError::NotFound),
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
"fetch_tenant_deletion_mark",
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(_) => Ok(true),
|
||||
Err(DownloadError::NotFound) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
|
||||
}
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
@@ -337,32 +391,16 @@ impl DeleteTenantFlow {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
// If remote storage is there we rely on it
|
||||
if let Some(remote_storage) = remote_storage {
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
|
||||
let remote_storage = match remote_storage {
|
||||
Some(remote_storage) => remote_storage,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
let attempt = 1;
|
||||
loop {
|
||||
match remote_storage.download(&remote_mark_path).await {
|
||||
Ok(_) => return Ok(acquire(tenant)),
|
||||
Err(e) => {
|
||||
if matches!(e, DownloadError::NotFound) {
|
||||
return Ok(None);
|
||||
}
|
||||
if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS {
|
||||
return Err(anyhow::anyhow!(e))?;
|
||||
}
|
||||
|
||||
warn!(
|
||||
"failed to fetch tenant deletion mark at {} attempt {}",
|
||||
&remote_mark_path, attempt
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
|
||||
Ok(acquire(tenant))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
pub(crate) async fn resume(
|
||||
|
||||
@@ -685,6 +685,7 @@ impl<const L: usize> BuildNode<L> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use rand::Rng;
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
@@ -699,12 +700,10 @@ mod tests {
|
||||
}
|
||||
}
|
||||
impl BlockReader for TestDisk {
|
||||
type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<BlockLease> {
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
buf.copy_from_slice(&self.blocks[blknum as usize]);
|
||||
Ok(std::rc::Rc::new(buf))
|
||||
Ok(std::rc::Rc::new(buf).into())
|
||||
}
|
||||
}
|
||||
impl BlockWriter for &mut TestDisk {
|
||||
|
||||
@@ -2,44 +2,49 @@
|
||||
//! used to keep in-memory layers spilled on disk.
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::block_io::{BlockLease, BlockReader};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{self};
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use tracing::*;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
mod buffer_pool;
|
||||
mod dirty_buffer;
|
||||
|
||||
///
|
||||
/// This is the global cache of file descriptors (File objects).
|
||||
///
|
||||
static EPHEMERAL_FILES: Lazy<RwLock<EphemeralFiles>> = Lazy::new(|| {
|
||||
RwLock::new(EphemeralFiles {
|
||||
next_file_id: 1,
|
||||
next_file_id: FileId(1),
|
||||
files: HashMap::new(),
|
||||
})
|
||||
});
|
||||
|
||||
pub struct EphemeralFiles {
|
||||
next_file_id: u64,
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct FileId(u64);
|
||||
|
||||
files: HashMap<u64, Arc<VirtualFile>>,
|
||||
impl std::fmt::Display for FileId {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EphemeralFiles {
|
||||
next_file_id: FileId,
|
||||
|
||||
files: HashMap<FileId, Arc<VirtualFile>>,
|
||||
}
|
||||
|
||||
pub struct EphemeralFile {
|
||||
file_id: u64,
|
||||
file_id: FileId,
|
||||
_tenant_id: TenantId,
|
||||
_timeline_id: TimelineId,
|
||||
file: Arc<VirtualFile>,
|
||||
@@ -55,7 +60,7 @@ impl EphemeralFile {
|
||||
) -> Result<EphemeralFile, io::Error> {
|
||||
let mut l = EPHEMERAL_FILES.write().unwrap();
|
||||
let file_id = l.next_file_id;
|
||||
l.next_file_id += 1;
|
||||
l.next_file_id = FileId(l.next_file_id.0 + 1);
|
||||
|
||||
let filename = conf
|
||||
.timeline_path(&tenant_id, &timeline_id)
|
||||
@@ -97,13 +102,30 @@ impl EphemeralFile {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_buf_for_write(&self, blkno: u32) -> Result<dirty_buffer::Buffer, io::Error> {
|
||||
let pool = buffer_pool::get();
|
||||
let mut buf = pool.get_buffer();
|
||||
// Read the page from disk into the buffer
|
||||
// TODO: if we're overwriting the whole page, no need to read it in first
|
||||
self.fill_buffer(buf.deref_mut(), blkno)?;
|
||||
Ok(dirty_buffer::Buffer::new(self, buf, blkno))
|
||||
fn get_buf_for_write(
|
||||
&self,
|
||||
blkno: u32,
|
||||
) -> Result<page_cache::PageWriteGuard<'static>, io::Error> {
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
let mut write_guard = match cache
|
||||
.write_ephemeral_buf(self.file_id, blkno)
|
||||
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
|
||||
{
|
||||
WriteBufResult::Found(guard) => guard,
|
||||
WriteBufResult::NotFound(mut guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
// TODO: if we're overwriting the whole page, no need to read it in first
|
||||
self.fill_buffer(guard.deref_mut(), blkno)?;
|
||||
guard.mark_valid();
|
||||
|
||||
// And then fall through to modify it.
|
||||
guard
|
||||
}
|
||||
};
|
||||
write_guard.mark_dirty();
|
||||
|
||||
Ok(write_guard)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,53 +140,77 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
|
||||
impl BlobWriter for EphemeralFile {
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
|
||||
struct Writer<'a> {
|
||||
ephemeral_file: &'a mut EphemeralFile,
|
||||
/// The block to which the next [`push_bytes`] will write.
|
||||
blknum: u32,
|
||||
/// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
|
||||
off: usize,
|
||||
/// Used by [`push_bytes`] to memoize the page cache write guard across calls to it.
|
||||
memo_page_guard: MemoizedPageWriteGuard,
|
||||
}
|
||||
struct MemoizedPageWriteGuard {
|
||||
guard: page_cache::PageWriteGuard<'static>,
|
||||
/// The block number of the page in `guard`.
|
||||
blknum: u32,
|
||||
}
|
||||
impl<'a> Writer<'a> {
|
||||
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
|
||||
let blknum = (ephemeral_file.size / PAGE_SZ as u64) as u32;
|
||||
Ok(Writer {
|
||||
blknum,
|
||||
off: (ephemeral_file.size % PAGE_SZ as u64) as usize,
|
||||
memo_page_guard: MemoizedPageWriteGuard {
|
||||
guard: ephemeral_file.get_buf_for_write(blknum)?,
|
||||
blknum,
|
||||
},
|
||||
ephemeral_file,
|
||||
})
|
||||
}
|
||||
#[inline(always)]
|
||||
fn push_bytes(&mut self, src: &[u8]) -> Result<(), io::Error> {
|
||||
// `src_remaining` is the remaining bytes to be written
|
||||
let mut src_remaining = src;
|
||||
while !src_remaining.is_empty() {
|
||||
let page = if self.memo_page_guard.blknum == self.blknum {
|
||||
&mut self.memo_page_guard.guard
|
||||
} else {
|
||||
self.memo_page_guard.guard =
|
||||
self.ephemeral_file.get_buf_for_write(self.blknum)?;
|
||||
self.memo_page_guard.blknum = self.blknum;
|
||||
&mut self.memo_page_guard.guard
|
||||
};
|
||||
let dst_remaining = &mut page[self.off..];
|
||||
let n = min(dst_remaining.len(), src_remaining.len());
|
||||
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
|
||||
self.off += n;
|
||||
src_remaining = &src_remaining[n..];
|
||||
if self.off == PAGE_SZ {
|
||||
// This block is done, move to next one.
|
||||
self.blknum += 1;
|
||||
self.off = 0;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
let pos = self.size;
|
||||
|
||||
let mut blknum = (self.size / PAGE_SZ as u64) as u32;
|
||||
let mut off = (pos % PAGE_SZ as u64) as usize;
|
||||
|
||||
let mut buf = self.get_buf_for_write(blknum)?;
|
||||
let mut writer = Writer::new(self)?;
|
||||
|
||||
// Write the length field
|
||||
if srcbuf.len() < 0x80 {
|
||||
buf[off] = srcbuf.len() as u8;
|
||||
off += 1;
|
||||
// short one-byte length header
|
||||
let len_buf = [srcbuf.len() as u8];
|
||||
writer.push_bytes(&len_buf)?;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it needs to be split across pages
|
||||
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
|
||||
blknum += 1;
|
||||
buf.writeback()?;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
buf[off..off + 4].copy_from_slice(&len_buf);
|
||||
off += 4;
|
||||
}
|
||||
writer.push_bytes(&len_buf)?;
|
||||
}
|
||||
|
||||
// Write the payload
|
||||
let mut buf_remain = srcbuf;
|
||||
while !buf_remain.is_empty() {
|
||||
let mut page_remain = PAGE_SZ - off;
|
||||
if page_remain == 0 {
|
||||
blknum += 1;
|
||||
buf.writeback()?;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
off = 0;
|
||||
page_remain = PAGE_SZ;
|
||||
}
|
||||
let this_blk_len = min(page_remain, buf_remain.len());
|
||||
buf[off..(off + this_blk_len)].copy_from_slice(&buf_remain[..this_blk_len]);
|
||||
off += this_blk_len;
|
||||
buf_remain = &buf_remain[this_blk_len..];
|
||||
}
|
||||
|
||||
buf.writeback()?;
|
||||
writer.push_bytes(srcbuf)?;
|
||||
|
||||
if srcbuf.len() < 0x80 {
|
||||
self.size += 1;
|
||||
@@ -179,6 +225,10 @@ impl BlobWriter for EphemeralFile {
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// drop all pages from page cache
|
||||
let cache = page_cache::get();
|
||||
cache.drop_buffers_for_ephemeral(self.file_id);
|
||||
|
||||
// remove entry from the hash map
|
||||
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
|
||||
|
||||
@@ -200,18 +250,54 @@ impl Drop for EphemeralFile {
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
type BlockLease = buffer_pool::Handle;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
|
||||
// Read the page from disk into the buffer
|
||||
let pool = buffer_pool::get();
|
||||
let mut buf = pool.get_buffer();
|
||||
self.fill_buffer(buf.deref_mut(), blknum)?;
|
||||
Ok(buf)
|
||||
pub fn writeback(file_id: FileId, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(io::Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"failed to write back to ephemeral file at {} error: {}",
|
||||
file.path.display(),
|
||||
e
|
||||
),
|
||||
)),
|
||||
}
|
||||
} else {
|
||||
Err(io::Error::new(
|
||||
ErrorKind::Other,
|
||||
"could not write back page, not found in ephemeral files hash",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, io::Error> {
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_ephemeral_buf(self.file_id, blknum)
|
||||
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
|
||||
{
|
||||
ReadBufResult::Found(guard) => return Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
|
||||
io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -238,17 +324,26 @@ mod tests {
|
||||
Ok((conf, tenant_id, timeline_id))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
#[tokio::test]
|
||||
async fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?;
|
||||
|
||||
let pos_foo = file.write_blob(b"foo")?;
|
||||
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor().read_blob(pos_foo).await?.as_slice()
|
||||
);
|
||||
let pos_bar = file.write_blob(b"bar")?;
|
||||
assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice());
|
||||
assert_eq!(b"bar", file.block_cursor().read_blob(pos_bar)?.as_slice());
|
||||
assert_eq!(
|
||||
b"foo",
|
||||
file.block_cursor().read_blob(pos_foo).await?.as_slice()
|
||||
);
|
||||
assert_eq!(
|
||||
b"bar",
|
||||
file.block_cursor().read_blob(pos_bar).await?.as_slice()
|
||||
);
|
||||
|
||||
let mut blobs = Vec::new();
|
||||
for i in 0..10000 {
|
||||
@@ -265,7 +360,7 @@ mod tests {
|
||||
|
||||
let cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
let actual = cursor.read_blob(pos)?;
|
||||
let actual = cursor.read_blob(pos).await?;
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
|
||||
@@ -274,7 +369,7 @@ mod tests {
|
||||
large_data.resize(20000, 0);
|
||||
thread_rng().fill_bytes(&mut large_data);
|
||||
let pos_large = file.write_blob(&large_data)?;
|
||||
let result = file.block_cursor().read_blob(pos_large)?;
|
||||
let result = file.block_cursor().read_blob(pos_large).await?;
|
||||
assert_eq!(result, large_data);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
//! Buffer pool for ephemeral file buffers.
|
||||
//!
|
||||
//! Currently this is a very simple implementation that just uses `malloc`.
|
||||
//! But the interface is such that we can switch to a more sophisticated
|
||||
//! implementation later, e.g., one that caps the amount of memory used.
|
||||
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
|
||||
pub struct BufferPool;
|
||||
|
||||
const POOL: BufferPool = BufferPool;
|
||||
|
||||
pub(super) fn get() -> &'static BufferPool {
|
||||
&POOL
|
||||
}
|
||||
|
||||
impl BufferPool {
|
||||
/// Get a [`Handle`] to a buffer in the pool.
|
||||
///
|
||||
/// The buffer is guaranteed to be zeroed out.
|
||||
///
|
||||
/// The implementation may block to wait for buffers to become available,
|
||||
/// and a future async version of this method may `.await` internally to
|
||||
/// wait for buffers to become available.
|
||||
///
|
||||
/// To avoid deadlocks, a thread/task must get all the buffers it needs
|
||||
/// with a single call to `get_buffer`. Without this rule, a deadlock
|
||||
/// can happen. Take for example a buffer pool with 2 buffers X, Y
|
||||
/// and a program with two threads A and B, each requiring 2 buffers.
|
||||
/// If A gets X and B gets Y, then both threads will block forever trying
|
||||
/// to get their second buffer.
|
||||
pub fn get_buffer(&self) -> Handle {
|
||||
Handle {
|
||||
data: vec![0; PAGE_SZ],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Handle {
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Handle {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Handle")
|
||||
.field("data", &self.data.as_ptr())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Handle {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
fn deref(&self) -> &Self::Target {
|
||||
let slice: &[u8] = &self.data[..];
|
||||
slice.try_into().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for Handle {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
let slice: &mut [u8] = &mut self.data[..];
|
||||
slice.try_into().unwrap()
|
||||
}
|
||||
}
|
||||
@@ -1,111 +0,0 @@
|
||||
//! Newtypes to ensure that dirty buffers are written back to the filesystem before they are dropped.
|
||||
|
||||
use std::io::ErrorKind;
|
||||
use std::ops::Deref;
|
||||
use std::ops::DerefMut;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
|
||||
use super::buffer_pool;
|
||||
use super::EphemeralFile;
|
||||
|
||||
pub(super) struct Buffer<'f> {
|
||||
inner: Inner<'f>,
|
||||
}
|
||||
|
||||
enum Inner<'f> {
|
||||
Dirty {
|
||||
ephemeral_file: &'f EphemeralFile,
|
||||
buf: buffer_pool::Handle,
|
||||
blkno: u32,
|
||||
},
|
||||
WritebackOngoing,
|
||||
WrittenBack,
|
||||
WritebackError,
|
||||
Dropped,
|
||||
}
|
||||
|
||||
impl<'f> Buffer<'f> {
|
||||
pub(super) fn new(
|
||||
ephemeral_file: &'f EphemeralFile,
|
||||
buf: buffer_pool::Handle,
|
||||
blkno: u32,
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: Inner::Dirty {
|
||||
ephemeral_file,
|
||||
buf,
|
||||
blkno,
|
||||
},
|
||||
}
|
||||
}
|
||||
pub(super) fn writeback(mut self) -> Result<(), std::io::Error> {
|
||||
let Inner::Dirty {
|
||||
ephemeral_file,
|
||||
buf,
|
||||
blkno,
|
||||
} = std::mem::replace(&mut self.inner, Inner::WritebackOngoing) else {
|
||||
unreachable!("writeback consumes");
|
||||
};
|
||||
match ephemeral_file
|
||||
.file
|
||||
.write_all_at(buf.deref(), blkno as u64 * PAGE_SZ as u64)
|
||||
{
|
||||
Ok(_) => {
|
||||
self.inner = Inner::WrittenBack;
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
self.inner = Inner::WritebackError;
|
||||
Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"failed to write back to ephemeral file at {} error: {}",
|
||||
ephemeral_file.file.path.display(),
|
||||
e
|
||||
),
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'f> Deref for Buffer<'f> {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &[u8] {
|
||||
match &self.inner {
|
||||
Inner::Dirty { buf, .. } => &**buf,
|
||||
Inner::WritebackOngoing => unreachable!("writeback consumes"),
|
||||
Inner::WrittenBack => unreachable!("writeback consumes"),
|
||||
Inner::WritebackError => unreachable!("writeback consumes"),
|
||||
Inner::Dropped => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'f> DerefMut for Buffer<'f> {
|
||||
fn deref_mut(&mut self) -> &mut [u8] {
|
||||
match &mut self.inner {
|
||||
Inner::Dirty { buf, .. } => &mut **buf,
|
||||
Inner::WritebackOngoing => unreachable!("writeback consumes"),
|
||||
Inner::WrittenBack => unreachable!("writeback consumes"),
|
||||
Inner::WritebackError => unreachable!("writeback consumes"),
|
||||
Inner::Dropped => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Buffer<'_> {
|
||||
fn drop(&mut self) {
|
||||
let prev = std::mem::replace(&mut self.inner, Inner::Dropped);
|
||||
match prev {
|
||||
// TODO: check this at compile time
|
||||
Inner::Dirty { .. } => panic!("dropped dirty buffer, need to writeback() first"),
|
||||
Inner::WritebackOngoing => unreachable!("transitory state"),
|
||||
Inner::WrittenBack | Inner::WritebackError => {}
|
||||
Inner::Dropped => unreachable!("drop only happens once"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -27,7 +27,7 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
use super::delete::{remote_delete_mark_exists, DeleteTenantError};
|
||||
use super::timeline::delete::DeleteTimelineFlow;
|
||||
|
||||
/// The tenants known to the pageserver.
|
||||
@@ -591,6 +591,12 @@ pub async fn attach_tenant(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
// Temporary solution, proper one would be to resume deletion, but that needs more plumbing around Tenant::load/Tenant::attach
|
||||
// Corresponding issue https://github.com/neondatabase/neon/issues/5006
|
||||
if remote_delete_mark_exists(conf, &tenant_id, &remote_storage).await? {
|
||||
return Err(anyhow::anyhow!("Tenant is marked as deleted on remote storage").into());
|
||||
}
|
||||
|
||||
tenant_map_insert(tenant_id, || {
|
||||
let tenant_dir = create_tenant_files(conf, tenant_conf, &tenant_id, CreateTenantFilesMode::Attach)?;
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
|
||||
@@ -211,6 +211,9 @@ use chrono::{NaiveDateTime, Utc};
|
||||
// re-export these
|
||||
pub use download::{is_temp_download_file, list_remote_timelines};
|
||||
use scopeguard::ScopeGuard;
|
||||
use utils::backoff::{
|
||||
self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::path::Path;
|
||||
@@ -219,7 +222,6 @@ use std::sync::{Arc, Mutex};
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use std::ops::DerefMut;
|
||||
use tokio::runtime::Runtime;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -241,7 +243,6 @@ use crate::{
|
||||
tenant::upload_queue::{
|
||||
UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
|
||||
},
|
||||
{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS},
|
||||
};
|
||||
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -256,12 +257,12 @@ use super::upload_queue::SetDeletedFlagProgress;
|
||||
// But after FAILED_DOWNLOAD_WARN_THRESHOLD retries, we start to log it at WARN
|
||||
// level instead, as repeated failures can mean a more serious problem. If it
|
||||
// fails more than FAILED_DOWNLOAD_RETRIES times, we give up
|
||||
const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
const FAILED_DOWNLOAD_RETRIES: u32 = 10;
|
||||
pub(crate) const FAILED_DOWNLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
|
||||
|
||||
// Similarly log failed uploads and deletions at WARN level, after this many
|
||||
// retries. Uploads and deletions are retried forever, though.
|
||||
const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
@@ -309,7 +310,7 @@ pub enum PersistIndexPartWithDeletedFlagError {
|
||||
pub struct RemoteTimelineClient {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
runtime: &'static Runtime,
|
||||
runtime: tokio::runtime::Handle,
|
||||
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
@@ -336,7 +337,7 @@ impl RemoteTimelineClient {
|
||||
) -> RemoteTimelineClient {
|
||||
RemoteTimelineClient {
|
||||
conf,
|
||||
runtime: &BACKGROUND_RUNTIME,
|
||||
runtime: BACKGROUND_RUNTIME.handle().to_owned(),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
storage_impl: remote_storage,
|
||||
@@ -752,12 +753,24 @@ impl RemoteTimelineClient {
|
||||
|
||||
pausable_failpoint!("persist_deleted_index_part");
|
||||
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&index_part_with_deleted_at,
|
||||
backoff::retry(
|
||||
|| async {
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
&self.tenant_id,
|
||||
&self.timeline_id,
|
||||
&index_part_with_deleted_at,
|
||||
)
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
1,
|
||||
// have just a couple of attempts
|
||||
// when executed as part of timeline deletion this happens in context of api call
|
||||
// when executed as part of tenant deletion this happens in the background
|
||||
2,
|
||||
"persist_index_part_with_deleted_flag",
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -834,10 +847,19 @@ impl RemoteTimelineClient {
|
||||
let timeline_path = self.conf.timeline_path(&self.tenant_id, &self.timeline_id);
|
||||
let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
|
||||
|
||||
let remaining = self
|
||||
.storage_impl
|
||||
.list_prefixes(Some(&timeline_storage_path))
|
||||
.await?;
|
||||
let remaining = backoff::retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
.list_files(Some(&timeline_storage_path))
|
||||
.await
|
||||
},
|
||||
|_e| false,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"list_prefixes",
|
||||
)
|
||||
.await
|
||||
.context("list prefixes")?;
|
||||
|
||||
let remaining: Vec<RemotePath> = remaining
|
||||
.into_iter()
|
||||
@@ -852,7 +874,15 @@ impl RemoteTimelineClient {
|
||||
.collect();
|
||||
|
||||
if !remaining.is_empty() {
|
||||
self.storage_impl.delete_objects(&remaining).await?;
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete_objects(&remaining).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_objects",
|
||||
)
|
||||
.await
|
||||
.context("delete_objects")?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
@@ -864,7 +894,16 @@ impl RemoteTimelineClient {
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
backoff::retry(
|
||||
|| async { self.storage_impl.delete(&index_file_path).await },
|
||||
|_e| false,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"delete_index",
|
||||
)
|
||||
.await
|
||||
.context("delete_index")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
@@ -954,7 +993,7 @@ impl RemoteTimelineClient {
|
||||
let tenant_id = self.tenant_id;
|
||||
let timeline_id = self.timeline_id;
|
||||
task_mgr::spawn(
|
||||
self.runtime.handle(),
|
||||
&self.runtime,
|
||||
TaskKind::RemoteUploadTask,
|
||||
Some(self.tenant_id),
|
||||
Some(self.timeline_id),
|
||||
@@ -1307,7 +1346,7 @@ mod tests {
|
||||
context::RequestContext,
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
Tenant,
|
||||
Tenant, Timeline,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
@@ -1316,7 +1355,6 @@ mod tests {
|
||||
collections::HashSet,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
use tokio::runtime::EnterGuard;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
|
||||
@@ -1366,35 +1404,25 @@ mod tests {
|
||||
}
|
||||
|
||||
struct TestSetup {
|
||||
runtime: &'static tokio::runtime::Runtime,
|
||||
entered_runtime: EnterGuard<'static>,
|
||||
harness: TenantHarness,
|
||||
tenant: Arc<Tenant>,
|
||||
timeline: Arc<Timeline>,
|
||||
tenant_ctx: RequestContext,
|
||||
remote_fs_dir: PathBuf,
|
||||
client: Arc<RemoteTimelineClient>,
|
||||
}
|
||||
|
||||
impl TestSetup {
|
||||
fn new(test_name: &str) -> anyhow::Result<Self> {
|
||||
async fn new(test_name: &str) -> anyhow::Result<Self> {
|
||||
// Use a current-thread runtime in the test
|
||||
let runtime = Box::leak(Box::new(
|
||||
tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?,
|
||||
));
|
||||
let entered_runtime = runtime.enter();
|
||||
|
||||
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let (tenant, ctx) = runtime.block_on(harness.load());
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
// create an empty timeline directory
|
||||
let _ = runtime.block_on(tenant.create_test_timeline(
|
||||
TIMELINE_ID,
|
||||
Lsn(8),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
))?;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
@@ -1416,7 +1444,7 @@ mod tests {
|
||||
|
||||
let client = Arc::new(RemoteTimelineClient {
|
||||
conf: harness.conf,
|
||||
runtime,
|
||||
runtime: tokio::runtime::Handle::current(),
|
||||
tenant_id: harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
storage_impl: storage,
|
||||
@@ -1428,10 +1456,9 @@ mod tests {
|
||||
});
|
||||
|
||||
Ok(Self {
|
||||
runtime,
|
||||
entered_runtime,
|
||||
harness,
|
||||
tenant,
|
||||
timeline,
|
||||
tenant_ctx: ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
@@ -1440,8 +1467,8 @@ mod tests {
|
||||
}
|
||||
|
||||
// Test scheduling
|
||||
#[test]
|
||||
fn upload_scheduling() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn upload_scheduling() {
|
||||
// Test outline:
|
||||
//
|
||||
// Schedule upload of a bunch of layers. Check that they are started immediately, not queued
|
||||
@@ -1457,25 +1484,26 @@ mod tests {
|
||||
// Schedule index upload. Check that it's queued
|
||||
|
||||
let TestSetup {
|
||||
runtime,
|
||||
entered_runtime: _entered_runtime,
|
||||
harness,
|
||||
tenant: _tenant,
|
||||
timeline: _timeline,
|
||||
tenant_ctx: _tenant_ctx,
|
||||
remote_fs_dir,
|
||||
client,
|
||||
} = TestSetup::new("upload_scheduling").unwrap();
|
||||
} = TestSetup::new("upload_scheduling").await.unwrap();
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
println!("workdir: {}", harness.conf.workdir.display());
|
||||
|
||||
let remote_timeline_dir =
|
||||
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir)?);
|
||||
remote_fs_dir.join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
|
||||
println!("remote_timeline_dir: {}", remote_timeline_dir.display());
|
||||
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client.init_upload_queue_for_empty_remote(&metadata)?;
|
||||
client
|
||||
.init_upload_queue_for_empty_remote(&metadata)
|
||||
.unwrap();
|
||||
|
||||
// Create a couple of dummy files, schedule upload for them
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
@@ -1484,26 +1512,32 @@ mod tests {
|
||||
let content_1 = dummy_contents("foo");
|
||||
let content_2 = dummy_contents("bar");
|
||||
let content_3 = dummy_contents("baz");
|
||||
std::fs::write(
|
||||
timeline_path.join(layer_file_name_1.file_name()),
|
||||
&content_1,
|
||||
)?;
|
||||
std::fs::write(
|
||||
timeline_path.join(layer_file_name_2.file_name()),
|
||||
&content_2,
|
||||
)?;
|
||||
std::fs::write(timeline_path.join(layer_file_name_3.file_name()), content_3)?;
|
||||
|
||||
client.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64),
|
||||
)?;
|
||||
client.schedule_layer_file_upload(
|
||||
&layer_file_name_2,
|
||||
&LayerFileMetadata::new(content_2.len() as u64),
|
||||
)?;
|
||||
for (filename, content) in [
|
||||
(&layer_file_name_1, &content_1),
|
||||
(&layer_file_name_2, &content_2),
|
||||
(&layer_file_name_3, &content_3),
|
||||
] {
|
||||
std::fs::write(timeline_path.join(filename.file_name()), content).unwrap();
|
||||
}
|
||||
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64),
|
||||
)
|
||||
.unwrap();
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_2,
|
||||
&LayerFileMetadata::new(content_2.len() as u64),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Check that they are started immediately, not queued
|
||||
//
|
||||
// this works because we are running within block_on, so any futures are now queued up until
|
||||
// our next await point.
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
@@ -1517,7 +1551,9 @@ mod tests {
|
||||
|
||||
// Schedule upload of index. Check that it is queued
|
||||
let metadata = dummy_metadata(Lsn(0x20));
|
||||
client.schedule_index_upload_for_metadata_update(&metadata)?;
|
||||
client
|
||||
.schedule_index_upload_for_metadata_update(&metadata)
|
||||
.unwrap();
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
@@ -1526,7 +1562,7 @@ mod tests {
|
||||
}
|
||||
|
||||
// Wait for the uploads to finish
|
||||
runtime.block_on(client.wait_completion())?;
|
||||
client.wait_completion().await.unwrap();
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
@@ -1536,7 +1572,7 @@ mod tests {
|
||||
}
|
||||
|
||||
// Download back the index.json, and check that the list of files is correct
|
||||
let index_part = match runtime.block_on(client.download_index_file())? {
|
||||
let index_part = match client.download_index_file().await.unwrap() {
|
||||
MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
|
||||
MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
|
||||
};
|
||||
@@ -1548,17 +1584,19 @@ mod tests {
|
||||
&layer_file_name_2.file_name(),
|
||||
],
|
||||
);
|
||||
let downloaded_metadata = index_part.parse_metadata()?;
|
||||
let downloaded_metadata = index_part.parse_metadata().unwrap();
|
||||
assert_eq!(downloaded_metadata, metadata);
|
||||
|
||||
// Schedule upload and then a deletion. Check that the deletion is queued
|
||||
let content_baz = dummy_contents("baz");
|
||||
std::fs::write(timeline_path.join("baz"), &content_baz)?;
|
||||
client.schedule_layer_file_upload(
|
||||
&layer_file_name_3,
|
||||
&LayerFileMetadata::new(content_baz.len() as u64),
|
||||
)?;
|
||||
client.schedule_layer_file_deletion(&[layer_file_name_1.clone()])?;
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_3,
|
||||
&LayerFileMetadata::new(content_3.len() as u64),
|
||||
)
|
||||
.unwrap();
|
||||
client
|
||||
.schedule_layer_file_deletion(&[layer_file_name_1.clone()])
|
||||
.unwrap();
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut().unwrap();
|
||||
@@ -1580,7 +1618,7 @@ mod tests {
|
||||
);
|
||||
|
||||
// Finish them
|
||||
runtime.block_on(client.wait_completion())?;
|
||||
client.wait_completion().await.unwrap();
|
||||
|
||||
assert_remote_files(
|
||||
&[
|
||||
@@ -1590,23 +1628,24 @@ mod tests {
|
||||
],
|
||||
&remote_timeline_dir,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bytes_unfinished_gauge_for_layer_file_uploads() -> anyhow::Result<()> {
|
||||
#[tokio::test]
|
||||
async fn bytes_unfinished_gauge_for_layer_file_uploads() {
|
||||
// Setup
|
||||
|
||||
let TestSetup {
|
||||
runtime,
|
||||
harness,
|
||||
tenant: _tenant,
|
||||
timeline: _timeline,
|
||||
client,
|
||||
..
|
||||
} = TestSetup::new("metrics")?;
|
||||
} = TestSetup::new("metrics").await.unwrap();
|
||||
|
||||
let metadata = dummy_metadata(Lsn(0x10));
|
||||
client.init_upload_queue_for_empty_remote(&metadata)?;
|
||||
client
|
||||
.init_upload_queue_for_empty_remote(&metadata)
|
||||
.unwrap();
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
@@ -1615,7 +1654,8 @@ mod tests {
|
||||
std::fs::write(
|
||||
timeline_path.join(layer_file_name_1.file_name()),
|
||||
&content_1,
|
||||
)?;
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
struct BytesStartedFinished {
|
||||
@@ -1641,14 +1681,16 @@ mod tests {
|
||||
|
||||
let init = get_bytes_started_stopped();
|
||||
|
||||
client.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64),
|
||||
)?;
|
||||
client
|
||||
.schedule_layer_file_upload(
|
||||
&layer_file_name_1,
|
||||
&LayerFileMetadata::new(content_1.len() as u64),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let pre = get_bytes_started_stopped();
|
||||
|
||||
runtime.block_on(client.wait_completion())?;
|
||||
client.wait_completion().await.unwrap();
|
||||
|
||||
let post = get_bytes_started_stopped();
|
||||
|
||||
@@ -1676,7 +1718,5 @@ mod tests {
|
||||
finished: Some(content_1.len())
|
||||
}
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,23 +11,17 @@ use std::time::Duration;
|
||||
use anyhow::{anyhow, Context};
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
|
||||
use tracing::{info, warn};
|
||||
use utils::{backoff, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::{FAILED_DOWNLOAD_RETRIES, FAILED_DOWNLOAD_WARN_THRESHOLD};
|
||||
|
||||
async fn fsync_path(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
use super::{FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES};
|
||||
|
||||
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
|
||||
|
||||
@@ -152,7 +146,7 @@ pub async fn download_layer_file<'a>(
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
fsync_path(&local_path)
|
||||
crashsafe::fsync_async(&local_path)
|
||||
.await
|
||||
.with_context(|| format!("Could not fsync layer file {}", local_path.display(),))
|
||||
.map_err(DownloadError::Other)?;
|
||||
@@ -268,7 +262,6 @@ pub(super) async fn download_index_part(
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
///
|
||||
/// Helper function to handle retries for a download operation.
|
||||
///
|
||||
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
|
||||
@@ -276,47 +269,17 @@ pub(super) async fn download_index_part(
|
||||
/// with backoff.
|
||||
///
|
||||
/// (See similar logic for uploads in `perform_upload_task`)
|
||||
async fn download_retry<T, O, F>(mut op: O, description: &str) -> Result<T, DownloadError>
|
||||
async fn download_retry<T, O, F>(op: O, description: &str) -> Result<T, DownloadError>
|
||||
where
|
||||
O: FnMut() -> F,
|
||||
F: Future<Output = Result<T, DownloadError>>,
|
||||
{
|
||||
let mut attempts = 0;
|
||||
loop {
|
||||
let result = op().await;
|
||||
match result {
|
||||
Ok(_) => {
|
||||
if attempts > 0 {
|
||||
info!("{description} succeeded after {attempts} retries");
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// These are "permanent" errors that should not be retried.
|
||||
Err(DownloadError::BadInput(_)) | Err(DownloadError::NotFound) => {
|
||||
return result;
|
||||
}
|
||||
// Assume that any other failure might be transient, and the operation might
|
||||
// succeed if we just keep trying.
|
||||
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_WARN_THRESHOLD => {
|
||||
info!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(DownloadError::Other(err)) if attempts < FAILED_DOWNLOAD_RETRIES => {
|
||||
warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
|
||||
}
|
||||
Err(DownloadError::Other(ref err)) => {
|
||||
// Operation failed FAILED_DOWNLOAD_RETRIES times. Time to give up.
|
||||
warn!("{description} still failed after {attempts} retries, giving up: {err:?}");
|
||||
return result;
|
||||
}
|
||||
}
|
||||
// sleep and retry
|
||||
exponential_backoff(
|
||||
attempts,
|
||||
DEFAULT_BASE_BACKOFF_SECONDS,
|
||||
DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
)
|
||||
.await;
|
||||
attempts += 1;
|
||||
}
|
||||
backoff::retry(
|
||||
op,
|
||||
|e| matches!(e, DownloadError::BadInput(_) | DownloadError::NotFound),
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
description,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ mod layer_desc;
|
||||
mod remote_layer;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Key;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
@@ -241,10 +241,14 @@ impl LayerAccessStats {
|
||||
});
|
||||
}
|
||||
|
||||
fn record_access(&self, access_kind: LayerAccessKind, task_kind: TaskKind) {
|
||||
fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
|
||||
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
|
||||
return;
|
||||
}
|
||||
|
||||
let this_access = LayerAccessStatFullDetails {
|
||||
when: SystemTime::now(),
|
||||
task_kind,
|
||||
task_kind: ctx.task_kind(),
|
||||
access_kind,
|
||||
};
|
||||
|
||||
@@ -252,7 +256,7 @@ impl LayerAccessStats {
|
||||
locked.iter_mut().for_each(|inner| {
|
||||
inner.first_access.get_or_insert(this_access);
|
||||
inner.count_by_access_kind[access_kind] += 1;
|
||||
inner.task_kind_flag |= task_kind;
|
||||
inner.task_kind_flag |= ctx.task_kind();
|
||||
inner.last_accesses.write(this_access);
|
||||
})
|
||||
}
|
||||
|
||||
@@ -29,10 +29,10 @@
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
PersistentLayer, ValueReconstructResult, ValueReconstructState,
|
||||
@@ -51,6 +51,7 @@ use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::sync::OnceCell;
|
||||
use tracing::*;
|
||||
|
||||
@@ -280,7 +281,8 @@ impl Layer for DeltaLayer {
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
// TODO this is not ideal, but on the other hand we are in dumping code...
|
||||
let buf = Handle::current().block_on(cursor.read_blob(blob_ref.pos()))?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
@@ -335,7 +337,6 @@ impl Layer for DeltaLayer {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::GetValueReconstructData, ctx)
|
||||
.await?;
|
||||
|
||||
inner
|
||||
.get_value_reconstruct_data(key, lsn_range, reconstruct_state)
|
||||
.await
|
||||
@@ -452,8 +453,7 @@ impl DeltaLayer {
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&Arc<DeltaLayerInner>> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
// Quick exit if already loaded
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
@@ -549,30 +549,20 @@ impl DeltaLayer {
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Obtains all keys and value references stored in the layer
|
||||
/// Loads all keys stored in the layer. Returns key, lsn, value size and value reference.
|
||||
///
|
||||
/// The value can be obtained via the [`ValueRef::load`] function.
|
||||
pub async fn load_val_refs(
|
||||
pub(crate) async fn load_keys(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Vec<(Key, Lsn, ValueRef<Arc<DeltaLayerInner>>)>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::Iter, ctx)
|
||||
.await
|
||||
.context("load delta layer")?;
|
||||
DeltaLayerInner::load_val_refs(inner)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
|
||||
/// Loads all keys stored in the layer. Returns key, lsn and value size.
|
||||
pub async fn load_keys(&self, ctx: &RequestContext) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
) -> Result<Vec<DeltaEntry<Ref<&'_ DeltaLayerInner>>>> {
|
||||
let inner = self
|
||||
.load(LayerAccessKind::KeyIter, ctx)
|
||||
.await
|
||||
.context("load delta layer keys")?;
|
||||
DeltaLayerInner::load_keys(inner)
|
||||
|
||||
let inner = Ref(&**inner);
|
||||
DeltaLayerInner::load_keys(&inner)
|
||||
.await
|
||||
.context("Layer index is corrupted")
|
||||
}
|
||||
@@ -711,6 +701,17 @@ impl DeltaLayerWriterInner {
|
||||
.metadata()
|
||||
.context("get file metadata to determine size")?;
|
||||
|
||||
// 5GB limit for objects without multipart upload (which we don't want to use)
|
||||
// Make it a little bit below to account for differing GB units
|
||||
// https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html
|
||||
const S3_UPLOAD_LIMIT: u64 = 4_500_000_000;
|
||||
ensure!(
|
||||
metadata.len() <= S3_UPLOAD_LIMIT,
|
||||
"Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
|
||||
file.path.display(),
|
||||
metadata.len()
|
||||
);
|
||||
|
||||
// Note: Because we opened the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.file here. The first read will have to re-open it.
|
||||
@@ -913,12 +914,15 @@ impl DeltaLayerInner {
|
||||
let cursor = file.block_cursor();
|
||||
let mut buf = Vec::new();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
cursor
|
||||
.read_blob_into_buf(pos, &mut buf)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
@@ -952,15 +956,17 @@ impl DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) async fn load_val_refs<T: AsRef<DeltaLayerInner> + Clone>(
|
||||
pub(super) async fn load_keys<T: AsRef<DeltaLayerInner> + Clone>(
|
||||
this: &T,
|
||||
) -> Result<Vec<(Key, Lsn, ValueRef<T>)>> {
|
||||
) -> Result<Vec<DeltaEntry<T>>> {
|
||||
let dl = this.as_ref();
|
||||
let file = &dl.file;
|
||||
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(dl.index_start_blk, dl.index_root_blk, file);
|
||||
|
||||
let mut all_offsets = Vec::<(Key, Lsn, ValueRef<T>)>::new();
|
||||
let mut all_keys: Vec<DeltaEntry<T>> = Vec::new();
|
||||
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
@@ -971,54 +977,63 @@ impl DeltaLayerInner {
|
||||
blob_ref: BlobRef(value),
|
||||
reader: BlockCursor::new(Adapter(this.clone())),
|
||||
};
|
||||
all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref));
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(all_offsets)
|
||||
}
|
||||
|
||||
pub(super) async fn load_keys(&self) -> Result<Vec<(Key, Lsn, u64)>> {
|
||||
let file = &self.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
file,
|
||||
);
|
||||
|
||||
let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new();
|
||||
tree_reader
|
||||
.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
|key, value| {
|
||||
let delta_key = DeltaKey::from_slice(key);
|
||||
let pos = BlobRef(value).pos();
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
if last.0 == delta_key.key() {
|
||||
return true;
|
||||
} else {
|
||||
// subtract offset of new key BLOB and first blob of this key
|
||||
// to get total size if values associated with this key
|
||||
let first_pos = last.2;
|
||||
last.2 = pos - first_pos;
|
||||
}
|
||||
// subtract offset of the current and last entries to get the size
|
||||
// of the value associated with this (key, lsn) tuple
|
||||
let first_pos = last.size;
|
||||
last.size = pos - first_pos;
|
||||
}
|
||||
all_keys.push((delta_key.key(), delta_key.lsn(), pos));
|
||||
let entry = DeltaEntry {
|
||||
key: delta_key.key(),
|
||||
lsn: delta_key.lsn(),
|
||||
size: pos,
|
||||
val: val_ref,
|
||||
};
|
||||
all_keys.push(entry);
|
||||
true
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of layer
|
||||
last.2 = std::fs::metadata(&file.file.path)?.len() - last.2;
|
||||
// Last key occupies all space till end of value storage,
|
||||
// which corresponds to beginning of the index
|
||||
last.size = dl.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
|
||||
}
|
||||
Ok(all_keys)
|
||||
}
|
||||
}
|
||||
|
||||
/// Cloneable borrow wrapper to make borrows behave like smart pointers.
|
||||
///
|
||||
/// Shared references are trivially copyable. This wrapper avoids (confusion) to otherwise attempt
|
||||
/// cloning DeltaLayerInner.
|
||||
pub(crate) struct Ref<T>(T);
|
||||
|
||||
impl<'a, T> AsRef<T> for Ref<&'a T> {
|
||||
fn as_ref(&self) -> &T {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Clone for Ref<&'a T> {
|
||||
fn clone(&self) -> Self {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Copy for Ref<&'a T> {}
|
||||
|
||||
/// A set of data associated with a delta layer key and its value
|
||||
pub struct DeltaEntry<T: AsRef<DeltaLayerInner>> {
|
||||
pub key: Key,
|
||||
pub lsn: Lsn,
|
||||
/// Size of the stored value
|
||||
pub size: u64,
|
||||
/// Reference to the on-disk value
|
||||
pub val: ValueRef<T>,
|
||||
}
|
||||
|
||||
/// Reference to an on-disk value
|
||||
pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
|
||||
blob_ref: BlobRef,
|
||||
@@ -1027,9 +1042,9 @@ pub struct ValueRef<T: AsRef<DeltaLayerInner>> {
|
||||
|
||||
impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
|
||||
/// Loads the value from disk
|
||||
pub fn load(&self) -> Result<Value> {
|
||||
pub async fn load(&self) -> Result<Value> {
|
||||
// theoretically we *could* record an access time for each, but it does not really matter
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos())?;
|
||||
let buf = self.reader.read_blob(self.blob_ref.pos()).await?;
|
||||
let val = Value::des(&buf)?;
|
||||
Ok(val)
|
||||
}
|
||||
@@ -1038,9 +1053,7 @@ impl<T: AsRef<DeltaLayerInner>> ValueRef<T> {
|
||||
struct Adapter<T: AsRef<DeltaLayerInner>>(T);
|
||||
|
||||
impl<T: AsRef<DeltaLayerInner>> BlockReader for Adapter<T> {
|
||||
type BlockLease = PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<BlockLease, std::io::Error> {
|
||||
self.0.as_ref().file.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -323,8 +323,7 @@ impl ImageLayer {
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<&ImageLayerInner> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
self.access_stats.record_access(access_kind, ctx);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.await
|
||||
@@ -471,6 +470,7 @@ impl ImageLayerInner {
|
||||
let blob = file
|
||||
.block_cursor()
|
||||
.read_blob(offset)
|
||||
.await
|
||||
.with_context(|| format!("failed to read value from offset {}", offset))?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ use utils::{
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::sync::RwLock;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use super::{DeltaLayer, DeltaLayerWriter, Layer};
|
||||
|
||||
@@ -125,7 +125,7 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let end_str = self.end_lsn_or_max();
|
||||
|
||||
@@ -143,7 +143,7 @@ impl Layer for InMemoryLayer {
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
let mut desc = String::new();
|
||||
cursor.read_blob_into_buf(*pos, &mut buf)?;
|
||||
cursor.read_blob_into_buf(*pos, &mut buf).await?;
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
@@ -181,7 +181,7 @@ impl Layer for InMemoryLayer {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let reader = inner.file.block_cursor();
|
||||
|
||||
@@ -189,7 +189,7 @@ impl Layer for InMemoryLayer {
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let buf = reader.read_blob(*pos).await?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
@@ -232,8 +232,8 @@ impl InMemoryLayer {
|
||||
///
|
||||
/// Get layer size on the disk
|
||||
///
|
||||
pub fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
pub async fn size(&self) -> Result<u64> {
|
||||
let inner = self.inner.read().await;
|
||||
Ok(inner.file.size)
|
||||
}
|
||||
|
||||
@@ -267,9 +267,9 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
pub async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
let mut inner = self.inner.write().await;
|
||||
self.assert_writable();
|
||||
|
||||
let off = {
|
||||
@@ -301,8 +301,8 @@ impl InMemoryLayer {
|
||||
/// Make the layer non-writeable. Only call once.
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is exclusive
|
||||
pub fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().unwrap();
|
||||
pub async fn freeze(&self, end_lsn: Lsn) {
|
||||
let inner = self.inner.write().await;
|
||||
|
||||
assert!(self.start_lsn < end_lsn);
|
||||
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
|
||||
@@ -317,7 +317,7 @@ impl InMemoryLayer {
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
///
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
pub async fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -327,7 +327,7 @@ impl InMemoryLayer {
|
||||
// lock, it will see that it's not writeable anymore and retry, but it
|
||||
// would have to wait until we release it. That race condition is very
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
let end_lsn = *self.end_lsn.get().unwrap();
|
||||
|
||||
@@ -350,7 +350,7 @@ impl InMemoryLayer {
|
||||
let key = **key;
|
||||
// Write all page versions
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
cursor.read_blob_into_buf(*pos, &mut buf)?;
|
||||
cursor.read_blob_into_buf(*pos, &mut buf).await?;
|
||||
let will_init = Value::des(&buf)?.will_init();
|
||||
delta_layer_writer.put_value_bytes(key, *lsn, &buf, will_init)?;
|
||||
}
|
||||
|
||||
@@ -35,8 +35,11 @@ use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::context::{
|
||||
AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata};
|
||||
use crate::tenant::storage_layer::delta_layer::DeltaEntry;
|
||||
use crate::tenant::storage_layer::{
|
||||
DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
|
||||
LayerAccessStats, LayerFileName, RemoteLayer,
|
||||
@@ -799,10 +802,15 @@ impl Timeline {
|
||||
.await
|
||||
{
|
||||
Ok((partitioning, lsn)) => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::extend(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
.build();
|
||||
|
||||
// 2. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let layer_paths_to_upload = self
|
||||
.create_image_layers(&partitioning, lsn, false, ctx)
|
||||
.create_image_layers(&partitioning, lsn, false, &image_ctx)
|
||||
.await
|
||||
.map_err(anyhow::Error::from)?;
|
||||
if let Some(remote_client) = &self.remote_client {
|
||||
@@ -875,7 +883,7 @@ impl Timeline {
|
||||
let Some(open_layer) = layers.open_layer.as_ref() else {
|
||||
return Ok(());
|
||||
};
|
||||
open_layer.size()?
|
||||
open_layer.size().await?
|
||||
};
|
||||
let last_freeze_at = self.last_freeze_at.load();
|
||||
let last_freeze_ts = *(self.last_freeze_ts.read().unwrap());
|
||||
@@ -2647,7 +2655,7 @@ impl Timeline {
|
||||
async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> {
|
||||
//info!("PUT: key {} at {}", key, lsn);
|
||||
let layer = self.get_layer_for_write(lsn).await?;
|
||||
layer.put_value(key, lsn, val)?;
|
||||
layer.put_value(key, lsn, val).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2673,7 +2681,9 @@ impl Timeline {
|
||||
Some(self.write_lock.lock().await)
|
||||
};
|
||||
let mut guard = self.layers.write().await;
|
||||
guard.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at);
|
||||
guard
|
||||
.try_freeze_in_memory_layer(self.get_last_record_lsn(), &self.last_freeze_at)
|
||||
.await;
|
||||
}
|
||||
|
||||
/// Layer flusher task's main loop.
|
||||
@@ -2955,7 +2965,11 @@ impl Timeline {
|
||||
let frozen_layer = Arc::clone(frozen_layer);
|
||||
move || {
|
||||
// Write it out
|
||||
let new_delta = frozen_layer.write_to_disk()?;
|
||||
// Keep this inside `spawn_blocking` and `Handle::current`
|
||||
// as long as the write path is still sync and the read impl
|
||||
// is still not fully async. Otherwise executor threads would
|
||||
// be blocked.
|
||||
let new_delta = Handle::current().block_on(frozen_layer.write_to_disk())?;
|
||||
let new_delta_path = new_delta.path();
|
||||
|
||||
// Sync it to disk.
|
||||
@@ -3299,10 +3313,10 @@ struct CompactLevel0Phase1StatsBuilder {
|
||||
timeline_id: Option<TimelineId>,
|
||||
read_lock_acquisition_micros: DurationRecorder,
|
||||
read_lock_held_spawn_blocking_startup_micros: DurationRecorder,
|
||||
read_lock_held_key_sort_micros: DurationRecorder,
|
||||
read_lock_held_prerequisites_micros: DurationRecorder,
|
||||
read_lock_held_compute_holes_micros: DurationRecorder,
|
||||
read_lock_drop_micros: DurationRecorder,
|
||||
prepare_iterators_micros: DurationRecorder,
|
||||
write_layer_files_micros: DurationRecorder,
|
||||
level0_deltas_count: Option<usize>,
|
||||
new_deltas_count: Option<usize>,
|
||||
@@ -3319,10 +3333,10 @@ struct CompactLevel0Phase1Stats {
|
||||
timeline_id: TimelineId,
|
||||
read_lock_acquisition_micros: RecordedDuration,
|
||||
read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
|
||||
read_lock_held_key_sort_micros: RecordedDuration,
|
||||
read_lock_held_prerequisites_micros: RecordedDuration,
|
||||
read_lock_held_compute_holes_micros: RecordedDuration,
|
||||
read_lock_drop_micros: RecordedDuration,
|
||||
prepare_iterators_micros: RecordedDuration,
|
||||
write_layer_files_micros: RecordedDuration,
|
||||
level0_deltas_count: usize,
|
||||
new_deltas_count: usize,
|
||||
@@ -3349,6 +3363,10 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
|
||||
.read_lock_held_spawn_blocking_startup_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("read_lock_held_spawn_blocking_startup_micros not set"))?,
|
||||
read_lock_held_key_sort_micros: value
|
||||
.read_lock_held_key_sort_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("read_lock_held_key_sort_micros not set"))?,
|
||||
read_lock_held_prerequisites_micros: value
|
||||
.read_lock_held_prerequisites_micros
|
||||
.into_recorded()
|
||||
@@ -3361,10 +3379,6 @@ impl TryFrom<CompactLevel0Phase1StatsBuilder> for CompactLevel0Phase1Stats {
|
||||
.read_lock_drop_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("read_lock_drop_micros not set"))?,
|
||||
prepare_iterators_micros: value
|
||||
.prepare_iterators_micros
|
||||
.into_recorded()
|
||||
.ok_or_else(|| anyhow!("prepare_iterators_micros not set"))?,
|
||||
write_layer_files_micros: value
|
||||
.write_layer_files_micros
|
||||
.into_recorded()
|
||||
@@ -3534,28 +3548,24 @@ impl Timeline {
|
||||
let mut heap: BinaryHeap<Hole> = BinaryHeap::with_capacity(max_holes + 1);
|
||||
let mut prev: Option<Key> = None;
|
||||
|
||||
let mut all_value_refs = Vec::new();
|
||||
let mut all_keys = Vec::new();
|
||||
|
||||
for l in deltas_to_compact.iter() {
|
||||
let downcast_deltas: Vec<_> = deltas_to_compact
|
||||
.iter()
|
||||
.map(|l| l.clone().downcast_delta_layer().expect("delta layer"))
|
||||
.collect();
|
||||
for dl in downcast_deltas.iter() {
|
||||
// TODO: replace this with an await once we fully go async
|
||||
let delta = l.clone().downcast_delta_layer().expect("delta layer");
|
||||
Handle::current().block_on(async {
|
||||
all_value_refs.extend(delta.load_val_refs(ctx).await?);
|
||||
all_keys.extend(delta.load_keys(ctx).await?);
|
||||
anyhow::Ok(())
|
||||
})?;
|
||||
all_keys.extend(Handle::current().block_on(DeltaLayer::load_keys(dl, ctx))?);
|
||||
}
|
||||
|
||||
// The current stdlib sorting implementation is designed in a way where it is
|
||||
// particularly fast where the slice is made up of sorted sub-ranges.
|
||||
all_value_refs.sort_by_key(|(key, lsn, _value_ref)| (*key, *lsn));
|
||||
all_keys.sort_by_key(|DeltaEntry { key, lsn, .. }| (*key, *lsn));
|
||||
|
||||
// The current stdlib sorting implementation is designed in a way where it is
|
||||
// particularly fast where the slice is made up of sorted sub-ranges.
|
||||
all_keys.sort_by_key(|(key, lsn, _size)| (*key, *lsn));
|
||||
stats.read_lock_held_key_sort_micros = stats.read_lock_held_prerequisites_micros.till_now();
|
||||
|
||||
for (next_key, _next_lsn, _size) in all_keys.iter() {
|
||||
for DeltaEntry { key: next_key, .. } in all_keys.iter() {
|
||||
let next_key = *next_key;
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter
|
||||
@@ -3579,8 +3589,7 @@ impl Timeline {
|
||||
}
|
||||
prev = Some(next_key.next());
|
||||
}
|
||||
stats.read_lock_held_compute_holes_micros =
|
||||
stats.read_lock_held_prerequisites_micros.till_now();
|
||||
stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
|
||||
drop_rlock(guard);
|
||||
stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
|
||||
let mut holes = heap.into_vec();
|
||||
@@ -3589,12 +3598,26 @@ impl Timeline {
|
||||
|
||||
// This iterator walks through all key-value pairs from all the layers
|
||||
// we're compacting, in key, LSN order.
|
||||
let all_values_iter = all_value_refs.into_iter();
|
||||
let all_values_iter = all_keys.iter();
|
||||
|
||||
// This iterator walks through all keys and is needed to calculate size used by each key
|
||||
let mut all_keys_iter = all_keys.into_iter();
|
||||
|
||||
stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now();
|
||||
let mut all_keys_iter = all_keys
|
||||
.iter()
|
||||
.map(|DeltaEntry { key, lsn, size, .. }| (*key, *lsn, *size))
|
||||
.coalesce(|mut prev, cur| {
|
||||
// Coalesce keys that belong to the same key pair.
|
||||
// This ensures that compaction doesn't put them
|
||||
// into different layer files.
|
||||
// Still limit this by the target file size,
|
||||
// so that we keep the size of the files in
|
||||
// check.
|
||||
if prev.0 == cur.0 && prev.2 < target_file_size {
|
||||
prev.2 += cur.2;
|
||||
Ok(prev)
|
||||
} else {
|
||||
Err((prev, cur))
|
||||
}
|
||||
});
|
||||
|
||||
// Merge the contents of all the input delta layers into a new set
|
||||
// of delta layers, based on the current partitioning.
|
||||
@@ -3646,104 +3669,127 @@ impl Timeline {
|
||||
let mut key_values_total_size = 0u64;
|
||||
let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key
|
||||
let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key
|
||||
for (key, lsn, value_ref) in all_values_iter {
|
||||
let value = value_ref.load()?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
let mut next_key_size = 0u64;
|
||||
let is_dup_layer = dup_end_lsn.is_valid();
|
||||
dup_start_lsn = Lsn::INVALID;
|
||||
if !same_key {
|
||||
dup_end_lsn = Lsn::INVALID;
|
||||
|
||||
// TODO remove this block_on wrapper once we fully go async
|
||||
Handle::current().block_on(async {
|
||||
for &DeltaEntry {
|
||||
key, lsn, ref val, ..
|
||||
} in all_values_iter
|
||||
{
|
||||
let value = val.load().await?;
|
||||
let same_key = prev_key.map_or(false, |prev_key| prev_key == key);
|
||||
// We need to check key boundaries once we reach next key or end of layer with the same key
|
||||
if !same_key || lsn == dup_end_lsn {
|
||||
let mut next_key_size = 0u64;
|
||||
let is_dup_layer = dup_end_lsn.is_valid();
|
||||
dup_start_lsn = Lsn::INVALID;
|
||||
if !same_key {
|
||||
dup_end_lsn = Lsn::INVALID;
|
||||
}
|
||||
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
|
||||
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
|
||||
next_key_size = next_size;
|
||||
if key != next_key {
|
||||
if dup_end_lsn.is_valid() {
|
||||
// We are writting segment with duplicates:
|
||||
// place all remaining values of this key in separate segment
|
||||
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
|
||||
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
|
||||
}
|
||||
break;
|
||||
}
|
||||
key_values_total_size += next_size;
|
||||
// Check if it is time to split segment: if total keys size is larger than target file size.
|
||||
// We need to avoid generation of empty segments if next_size > target_file_size.
|
||||
if key_values_total_size > target_file_size && lsn != next_lsn {
|
||||
// Split key between multiple layers: such layer can contain only single key
|
||||
dup_start_lsn = if dup_end_lsn.is_valid() {
|
||||
dup_end_lsn // new segment with duplicates starts where old one stops
|
||||
} else {
|
||||
lsn // start with the first LSN for this key
|
||||
};
|
||||
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
|
||||
break;
|
||||
}
|
||||
}
|
||||
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
|
||||
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
|
||||
dup_start_lsn = dup_end_lsn;
|
||||
dup_end_lsn = lsn_range.end;
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if key cause layer overflow or contains hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember size of key value because at next iteration we will access next item
|
||||
key_values_total_size = next_key_size;
|
||||
}
|
||||
// Determine size occupied by this key. We stop at next key or when size becomes larger than target_file_size
|
||||
for (next_key, next_lsn, next_size) in all_keys_iter.by_ref() {
|
||||
next_key_size = next_size;
|
||||
if key != next_key {
|
||||
if writer.is_none() {
|
||||
// Create writer if not initiaized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// We are writting segment with duplicates:
|
||||
// place all remaining values of this key in separate segment
|
||||
dup_start_lsn = dup_end_lsn; // new segments starts where old stops
|
||||
dup_end_lsn = lsn_range.end; // there are no more values of this key till end of LSN range
|
||||
}
|
||||
break;
|
||||
}
|
||||
key_values_total_size += next_size;
|
||||
// Check if it is time to split segment: if total keys size is larger than target file size.
|
||||
// We need to avoid generation of empty segments if next_size > target_file_size.
|
||||
if key_values_total_size > target_file_size && lsn != next_lsn {
|
||||
// Split key between multiple layers: such layer can contain only single key
|
||||
dup_start_lsn = if dup_end_lsn.is_valid() {
|
||||
dup_end_lsn // new segment with duplicates starts where old one stops
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
lsn // start with the first LSN for this key
|
||||
};
|
||||
dup_end_lsn = next_lsn; // upper LSN boundary is exclusive
|
||||
break;
|
||||
}
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
}
|
||||
// handle case when loop reaches last key: in this case dup_end is non-zero but dup_start is not set.
|
||||
if dup_end_lsn.is_valid() && !dup_start_lsn.is_valid() {
|
||||
dup_start_lsn = dup_end_lsn;
|
||||
dup_end_lsn = lsn_range.end;
|
||||
}
|
||||
if writer.is_some() {
|
||||
let written_size = writer.as_mut().unwrap().size();
|
||||
let contains_hole =
|
||||
next_hole < holes.len() && key >= holes[next_hole].key_range.end;
|
||||
// check if key cause layer overflow or contains hole...
|
||||
if is_dup_layer
|
||||
|| dup_end_lsn.is_valid()
|
||||
|| written_size + key_values_total_size > target_file_size
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
// skip hole
|
||||
next_hole += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Remember size of key value because at next iteration we will access next item
|
||||
key_values_total_size = next_key_size;
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Result::<_>::Err(anyhow::anyhow!(
|
||||
"failpoint delta-layer-writer-fail-before-finish"
|
||||
))
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if writer.is_none() {
|
||||
// Create writer if not initiaized yet
|
||||
writer = Some(DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
key,
|
||||
if dup_end_lsn.is_valid() {
|
||||
// this is a layer containing slice of values of the same key
|
||||
debug!("Create new dup layer {}..{}", dup_start_lsn, dup_end_lsn);
|
||||
dup_start_lsn..dup_end_lsn
|
||||
} else {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
lsn_range.clone()
|
||||
},
|
||||
)?);
|
||||
}
|
||||
|
||||
fail_point!("delta-layer-writer-fail-before-finish", |_| {
|
||||
Err(anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into())
|
||||
});
|
||||
|
||||
writer.as_mut().unwrap().put_value(key, lsn, value)?;
|
||||
prev_key = Some(key);
|
||||
}
|
||||
Ok(())
|
||||
})?;
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
|
||||
}
|
||||
|
||||
// Sync layers
|
||||
if !new_layers.is_empty() {
|
||||
// Print a warning if the created layer is larger than double the target size
|
||||
// Add two pages for potential overhead. This should in theory be already
|
||||
// accounted for in the target calculation, but for very small targets,
|
||||
// we still might easily hit the limit otherwise.
|
||||
let warn_limit = target_file_size * 2 + page_cache::PAGE_SZ as u64 * 2;
|
||||
for layer in new_layers.iter() {
|
||||
if layer.desc.file_size > warn_limit {
|
||||
warn!(
|
||||
%layer,
|
||||
"created delta file of size {} larger than double of target of {target_file_size}", layer.desc.file_size
|
||||
);
|
||||
}
|
||||
}
|
||||
let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
|
||||
|
||||
// Fsync all the layer files and directory using multiple threads to
|
||||
@@ -3756,12 +3802,10 @@ impl Timeline {
|
||||
layer_paths.pop().unwrap();
|
||||
}
|
||||
|
||||
stats.write_layer_files_micros = stats.prepare_iterators_micros.till_now();
|
||||
stats.write_layer_files_micros = stats.read_lock_drop_micros.till_now();
|
||||
stats.new_deltas_count = Some(new_layers.len());
|
||||
stats.new_deltas_size = Some(new_layers.iter().map(|l| l.desc.file_size).sum());
|
||||
|
||||
drop(all_keys_iter); // So that deltas_to_compact is no longer borrowed
|
||||
|
||||
match TryInto::<CompactLevel0Phase1Stats>::try_into(stats)
|
||||
.and_then(|stats| serde_json::to_string(&stats).context("serde_json::to_string"))
|
||||
{
|
||||
|
||||
@@ -279,6 +279,17 @@ async fn cleanup_remaining_timeline_fs_traces(
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
|
||||
});
|
||||
|
||||
// Make sure previous deletions are ordered before mark removal.
|
||||
// Otherwise there is no guarantee that they reach the disk before mark deletion.
|
||||
// So its possible for mark to reach disk first and for other deletions
|
||||
// to be reordered later and thus missed if a crash occurs.
|
||||
// Note that we dont need to sync after mark file is removed
|
||||
// because we can tolerate the case when mark file reappears on startup.
|
||||
let timeline_path = conf.timelines_path(&tenant_id);
|
||||
crashsafe::fsync_async(timeline_path)
|
||||
.await
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
|
||||
// Remove delete mark
|
||||
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
|
||||
.await
|
||||
|
||||
@@ -163,7 +163,7 @@ impl LayerManager {
|
||||
}
|
||||
|
||||
/// Called from `freeze_inmem_layer`, returns true if successfully frozen.
|
||||
pub fn try_freeze_in_memory_layer(
|
||||
pub async fn try_freeze_in_memory_layer(
|
||||
&mut self,
|
||||
Lsn(last_record_lsn): Lsn,
|
||||
last_freeze_at: &AtomicLsn,
|
||||
@@ -173,7 +173,7 @@ impl LayerManager {
|
||||
if let Some(open_layer) = &self.layer_map.open_layer {
|
||||
let open_layer_rc = Arc::clone(open_layer);
|
||||
// Does this layer need freezing?
|
||||
open_layer.freeze(end_lsn);
|
||||
open_layer.freeze(end_lsn).await;
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
// We will replace it with on-disk historics below.
|
||||
|
||||
@@ -31,8 +31,10 @@ use storage_broker::Streaming;
|
||||
use tokio::select;
|
||||
use tracing::*;
|
||||
|
||||
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
|
||||
use postgres_connection::{parse_host_port, PgConnectionConfig};
|
||||
use utils::backoff::{
|
||||
exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
|
||||
};
|
||||
use utils::{
|
||||
id::{NodeId, TenantTimelineId},
|
||||
lsn::Lsn,
|
||||
|
||||
@@ -74,7 +74,7 @@ walprop_connect_start(char *conninfo, char *password)
|
||||
if (password)
|
||||
{
|
||||
keywords[n] = "password";
|
||||
values[n] = neon_auth_token;
|
||||
values[n] = password;
|
||||
n++;
|
||||
}
|
||||
keywords[n] = "dbname";
|
||||
|
||||
@@ -1393,8 +1393,22 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
|
||||
char *err;
|
||||
WalReceiverConn *wrconn;
|
||||
WalRcvStreamOptions options;
|
||||
char conninfo[MAXCONNINFO];
|
||||
|
||||
wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
|
||||
if (!neon_auth_token)
|
||||
{
|
||||
memcpy(conninfo, safekeeper[donor].conninfo, MAXCONNINFO);
|
||||
}
|
||||
else
|
||||
{
|
||||
int written = 0;
|
||||
|
||||
written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, safekeeper[donor].conninfo);
|
||||
if (written > MAXCONNINFO || written < 0)
|
||||
elog(FATAL, "could not append password to the safekeeper connection string");
|
||||
}
|
||||
|
||||
wrconn = walrcv_connect(conninfo, false, "wal_proposer_recovery", &err);
|
||||
if (!wrconn)
|
||||
{
|
||||
ereport(WARNING,
|
||||
|
||||
@@ -13,6 +13,7 @@ bytes = { workspace = true, features = ["serde"] }
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
dashmap.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hashbrown.workspace = true
|
||||
@@ -29,7 +30,7 @@ metrics.workspace = true
|
||||
once_cell.workspace = true
|
||||
opentelemetry.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pbkdf2.workspace = true
|
||||
pbkdf2 = { workspace = true, features = ["simple", "std"] }
|
||||
pin-project-lite.workspace = true
|
||||
postgres_backend.workspace = true
|
||||
pq_proto.workspace = true
|
||||
|
||||
@@ -36,7 +36,18 @@ pub(super) async fn authenticate(
|
||||
AuthInfo::Scram(secret) => {
|
||||
info!("auth endpoint chooses SCRAM");
|
||||
let scram = auth::Scram(&secret);
|
||||
let client_key = match flow.begin(scram).await?.authenticate().await? {
|
||||
|
||||
let auth_flow = flow.begin(scram).await.map_err(|error| {
|
||||
warn!(?error, "error sending scram acknowledgement");
|
||||
error
|
||||
})?;
|
||||
|
||||
let auth_outcome = auth_flow.authenticate().await.map_err(|error| {
|
||||
warn!(?error, "error processing scram messages");
|
||||
error
|
||||
})?;
|
||||
|
||||
let client_key = match auth_outcome {
|
||||
sasl::Outcome::Success(key) => key,
|
||||
sasl::Outcome::Failure(reason) => {
|
||||
info!("auth backend failed with an error: {reason}");
|
||||
@@ -51,7 +62,6 @@ pub(super) async fn authenticate(
|
||||
}
|
||||
};
|
||||
|
||||
info!("compute node's state has likely changed; requesting a wake-up");
|
||||
let mut num_retries = 0;
|
||||
let mut node = loop {
|
||||
let wake_res = api.wake_compute(extra, creds).await;
|
||||
|
||||
@@ -1,10 +1,21 @@
|
||||
use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use parking_lot::Mutex;
|
||||
use dashmap::DashMap;
|
||||
use futures::future::poll_fn;
|
||||
use parking_lot::RwLock;
|
||||
use pbkdf2::{
|
||||
password_hash::{PasswordHashString, PasswordHasher, PasswordVerifier, SaltString},
|
||||
Params, Pbkdf2,
|
||||
};
|
||||
use pq_proto::StartupMessageParams;
|
||||
use std::fmt;
|
||||
use std::sync::atomic::{self, AtomicUsize};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use std::{
|
||||
fmt,
|
||||
task::{ready, Poll},
|
||||
};
|
||||
use tokio::time;
|
||||
use tokio_postgres::AsyncMessage;
|
||||
|
||||
use crate::{auth, console};
|
||||
use crate::{compute, config};
|
||||
@@ -13,8 +24,8 @@ use super::sql_over_http::MAX_RESPONSE_SIZE;
|
||||
|
||||
use crate::proxy::ConnectMechanism;
|
||||
|
||||
use tracing::error;
|
||||
use tracing::info;
|
||||
use tracing::{error, warn};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
|
||||
pub const APP_NAME: &str = "sql_over_http";
|
||||
const MAX_CONNS_PER_ENDPOINT: usize = 20;
|
||||
@@ -42,23 +53,44 @@ impl fmt::Display for ConnInfo {
|
||||
}
|
||||
|
||||
struct ConnPoolEntry {
|
||||
conn: tokio_postgres::Client,
|
||||
conn: Client,
|
||||
_last_access: std::time::Instant,
|
||||
}
|
||||
|
||||
// Per-endpoint connection pool, (dbname, username) -> Vec<ConnPoolEntry>
|
||||
// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
|
||||
// Number of open connections is limited by the `max_conns_per_endpoint`.
|
||||
pub struct EndpointConnPool {
|
||||
pools: HashMap<(String, String), Vec<ConnPoolEntry>>,
|
||||
pools: HashMap<(String, String), DbUserConnPool>,
|
||||
total_conns: usize,
|
||||
}
|
||||
|
||||
/// This is cheap and not hugely secure.
|
||||
/// But probably good enough for in memory only hashes.
|
||||
///
|
||||
/// Still takes 3.5ms to hash on my hardware.
|
||||
/// We don't want to ruin the latency improvements of using the pool by making password verification take too long
|
||||
const PARAMS: Params = Params {
|
||||
rounds: 10_000,
|
||||
output_length: 32,
|
||||
};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct DbUserConnPool {
|
||||
conns: Vec<ConnPoolEntry>,
|
||||
password_hash: Option<PasswordHashString>,
|
||||
}
|
||||
|
||||
pub struct GlobalConnPool {
|
||||
// endpoint -> per-endpoint connection pool
|
||||
//
|
||||
// That should be a fairly conteded map, so return reference to the per-endpoint
|
||||
// pool as early as possible and release the lock.
|
||||
global_pool: Mutex<HashMap<String, Arc<Mutex<EndpointConnPool>>>>,
|
||||
global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,
|
||||
|
||||
/// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
|
||||
/// That seems like far too much effort, so we're using a relaxed increment counter instead.
|
||||
/// It's only used for diagnostics.
|
||||
global_pool_size: AtomicUsize,
|
||||
|
||||
// Maximum number of connections per one endpoint.
|
||||
// Can mix different (dbname, username) connections.
|
||||
@@ -72,7 +104,8 @@ pub struct GlobalConnPool {
|
||||
impl GlobalConnPool {
|
||||
pub fn new(config: &'static crate::config::ProxyConfig) -> Arc<Self> {
|
||||
Arc::new(Self {
|
||||
global_pool: Mutex::new(HashMap::new()),
|
||||
global_pool: DashMap::new(),
|
||||
global_pool_size: AtomicUsize::new(0),
|
||||
max_conns_per_endpoint: MAX_CONNS_PER_ENDPOINT,
|
||||
proxy_config: config,
|
||||
})
|
||||
@@ -82,70 +115,125 @@ impl GlobalConnPool {
|
||||
&self,
|
||||
conn_info: &ConnInfo,
|
||||
force_new: bool,
|
||||
) -> anyhow::Result<tokio_postgres::Client> {
|
||||
let mut client: Option<tokio_postgres::Client> = None;
|
||||
session_id: uuid::Uuid,
|
||||
) -> anyhow::Result<Client> {
|
||||
let mut client: Option<Client> = None;
|
||||
|
||||
let mut hash_valid = false;
|
||||
if !force_new {
|
||||
let pool = self.get_endpoint_pool(&conn_info.hostname).await;
|
||||
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
|
||||
let mut hash = None;
|
||||
|
||||
// find a pool entry by (dbname, username) if exists
|
||||
let mut pool = pool.lock();
|
||||
let pool_entries = pool.pools.get_mut(&conn_info.db_and_user());
|
||||
if let Some(pool_entries) = pool_entries {
|
||||
if let Some(entry) = pool_entries.pop() {
|
||||
client = Some(entry.conn);
|
||||
pool.total_conns -= 1;
|
||||
{
|
||||
let pool = pool.read();
|
||||
if let Some(pool_entries) = pool.pools.get(&conn_info.db_and_user()) {
|
||||
if !pool_entries.conns.is_empty() {
|
||||
hash = pool_entries.password_hash.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// a connection exists in the pool, verify the password hash
|
||||
if let Some(hash) = hash {
|
||||
let pw = conn_info.password.clone();
|
||||
let validate = tokio::task::spawn_blocking(move || {
|
||||
Pbkdf2.verify_password(pw.as_bytes(), &hash.password_hash())
|
||||
})
|
||||
.await?;
|
||||
|
||||
// if the hash is invalid, don't error
|
||||
// we will continue with the regular connection flow
|
||||
if validate.is_ok() {
|
||||
hash_valid = true;
|
||||
let mut pool = pool.write();
|
||||
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
|
||||
if let Some(entry) = pool_entries.conns.pop() {
|
||||
client = Some(entry.conn);
|
||||
pool.total_conns -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ok return cached connection if found and establish a new one otherwise
|
||||
if let Some(client) = client {
|
||||
if client.is_closed() {
|
||||
let new_client = if let Some(client) = client {
|
||||
if client.inner.is_closed() {
|
||||
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
|
||||
connect_to_compute(self.proxy_config, conn_info).await
|
||||
connect_to_compute(self.proxy_config, conn_info, session_id).await
|
||||
} else {
|
||||
info!("pool: reusing connection '{conn_info}'");
|
||||
Ok(client)
|
||||
client.session.send(session_id)?;
|
||||
return Ok(client);
|
||||
}
|
||||
} else {
|
||||
info!("pool: opening a new connection '{conn_info}'");
|
||||
connect_to_compute(self.proxy_config, conn_info).await
|
||||
connect_to_compute(self.proxy_config, conn_info, session_id).await
|
||||
};
|
||||
|
||||
match &new_client {
|
||||
// clear the hash. it's no longer valid
|
||||
// TODO: update tokio-postgres fork to allow access to this error kind directly
|
||||
Err(err)
|
||||
if hash_valid && err.to_string().contains("password authentication failed") =>
|
||||
{
|
||||
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
|
||||
let mut pool = pool.write();
|
||||
if let Some(entry) = pool.pools.get_mut(&conn_info.db_and_user()) {
|
||||
entry.password_hash = None;
|
||||
}
|
||||
}
|
||||
// new password is valid and we should insert/update it
|
||||
Ok(_) if !force_new && !hash_valid => {
|
||||
let pw = conn_info.password.clone();
|
||||
let new_hash = tokio::task::spawn_blocking(move || {
|
||||
let salt = SaltString::generate(rand::rngs::OsRng);
|
||||
Pbkdf2
|
||||
.hash_password_customized(pw.as_bytes(), None, None, PARAMS, &salt)
|
||||
.map(|s| s.serialize())
|
||||
})
|
||||
.await??;
|
||||
|
||||
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
|
||||
let mut pool = pool.write();
|
||||
pool.pools
|
||||
.entry(conn_info.db_and_user())
|
||||
.or_default()
|
||||
.password_hash = Some(new_hash);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
new_client
|
||||
}
|
||||
|
||||
pub async fn put(
|
||||
&self,
|
||||
conn_info: &ConnInfo,
|
||||
client: tokio_postgres::Client,
|
||||
) -> anyhow::Result<()> {
|
||||
let pool = self.get_endpoint_pool(&conn_info.hostname).await;
|
||||
pub async fn put(&self, conn_info: &ConnInfo, client: Client) -> anyhow::Result<()> {
|
||||
let pool = self.get_or_create_endpoint_pool(&conn_info.hostname);
|
||||
|
||||
// return connection to the pool
|
||||
let mut total_conns;
|
||||
let mut returned = false;
|
||||
let mut per_db_size = 0;
|
||||
{
|
||||
let mut pool = pool.lock();
|
||||
total_conns = pool.total_conns;
|
||||
let total_conns = {
|
||||
let mut pool = pool.write();
|
||||
|
||||
let pool_entries: &mut Vec<ConnPoolEntry> = pool
|
||||
.pools
|
||||
.entry(conn_info.db_and_user())
|
||||
.or_insert_with(|| Vec::with_capacity(1));
|
||||
if total_conns < self.max_conns_per_endpoint {
|
||||
pool_entries.push(ConnPoolEntry {
|
||||
conn: client,
|
||||
_last_access: std::time::Instant::now(),
|
||||
});
|
||||
if pool.total_conns < self.max_conns_per_endpoint {
|
||||
// we create this db-user entry in get, so it should not be None
|
||||
if let Some(pool_entries) = pool.pools.get_mut(&conn_info.db_and_user()) {
|
||||
pool_entries.conns.push(ConnPoolEntry {
|
||||
conn: client,
|
||||
_last_access: std::time::Instant::now(),
|
||||
});
|
||||
|
||||
total_conns += 1;
|
||||
returned = true;
|
||||
per_db_size = pool_entries.len();
|
||||
returned = true;
|
||||
per_db_size = pool_entries.conns.len();
|
||||
|
||||
pool.total_conns += 1;
|
||||
pool.total_conns += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pool.total_conns
|
||||
};
|
||||
|
||||
// do logging outside of the mutex
|
||||
if returned {
|
||||
@@ -157,25 +245,35 @@ impl GlobalConnPool {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_endpoint_pool(&self, endpoint: &String) -> Arc<Mutex<EndpointConnPool>> {
|
||||
fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
|
||||
// fast path
|
||||
if let Some(pool) = self.global_pool.get(endpoint) {
|
||||
return pool.clone();
|
||||
}
|
||||
|
||||
// slow path
|
||||
let new_pool = Arc::new(RwLock::new(EndpointConnPool {
|
||||
pools: HashMap::new(),
|
||||
total_conns: 0,
|
||||
}));
|
||||
|
||||
// find or create a pool for this endpoint
|
||||
let mut created = false;
|
||||
let mut global_pool = self.global_pool.lock();
|
||||
let pool = global_pool
|
||||
let pool = self
|
||||
.global_pool
|
||||
.entry(endpoint.clone())
|
||||
.or_insert_with(|| {
|
||||
created = true;
|
||||
Arc::new(Mutex::new(EndpointConnPool {
|
||||
pools: HashMap::new(),
|
||||
total_conns: 0,
|
||||
}))
|
||||
new_pool
|
||||
})
|
||||
.clone();
|
||||
let global_pool_size = global_pool.len();
|
||||
drop(global_pool);
|
||||
|
||||
// log new global pool size
|
||||
if created {
|
||||
let global_pool_size = self
|
||||
.global_pool_size
|
||||
.fetch_add(1, atomic::Ordering::Relaxed)
|
||||
+ 1;
|
||||
info!(
|
||||
"pool: created new pool for '{endpoint}', global pool size now {global_pool_size}"
|
||||
);
|
||||
@@ -187,11 +285,12 @@ impl GlobalConnPool {
|
||||
|
||||
struct TokioMechanism<'a> {
|
||||
conn_info: &'a ConnInfo,
|
||||
session_id: uuid::Uuid,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ConnectMechanism for TokioMechanism<'_> {
|
||||
type Connection = tokio_postgres::Client;
|
||||
type Connection = Client;
|
||||
type ConnectError = tokio_postgres::Error;
|
||||
type Error = anyhow::Error;
|
||||
|
||||
@@ -200,7 +299,7 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
node_info: &console::CachedNodeInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<Self::Connection, Self::ConnectError> {
|
||||
connect_to_compute_once(node_info, self.conn_info, timeout).await
|
||||
connect_to_compute_once(node_info, self.conn_info, timeout, self.session_id).await
|
||||
}
|
||||
|
||||
fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
|
||||
@@ -213,7 +312,8 @@ impl ConnectMechanism for TokioMechanism<'_> {
|
||||
async fn connect_to_compute(
|
||||
config: &config::ProxyConfig,
|
||||
conn_info: &ConnInfo,
|
||||
) -> anyhow::Result<tokio_postgres::Client> {
|
||||
session_id: uuid::Uuid,
|
||||
) -> anyhow::Result<Client> {
|
||||
let tls = config.tls_config.as_ref();
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
|
||||
@@ -244,17 +344,27 @@ async fn connect_to_compute(
|
||||
.await?
|
||||
.context("missing cache entry from wake_compute")?;
|
||||
|
||||
crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await
|
||||
crate::proxy::connect_to_compute(
|
||||
&TokioMechanism {
|
||||
conn_info,
|
||||
session_id,
|
||||
},
|
||||
node_info,
|
||||
&extra,
|
||||
&creds,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn connect_to_compute_once(
|
||||
node_info: &console::CachedNodeInfo,
|
||||
conn_info: &ConnInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
|
||||
mut session: uuid::Uuid,
|
||||
) -> Result<Client, tokio_postgres::Error> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
|
||||
let (client, connection) = config
|
||||
let (client, mut connection) = config
|
||||
.user(&conn_info.username)
|
||||
.password(&conn_info.password)
|
||||
.dbname(&conn_info.dbname)
|
||||
@@ -263,11 +373,53 @@ async fn connect_to_compute_once(
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.await?;
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
let (tx, mut rx) = tokio::sync::watch::channel(session);
|
||||
|
||||
let conn_id = uuid::Uuid::new_v4();
|
||||
let span = info_span!(parent: None, "connection", %conn_info, %conn_id);
|
||||
span.in_scope(|| {
|
||||
info!(%session, "new connection");
|
||||
});
|
||||
|
||||
Ok(client)
|
||||
tokio::spawn(
|
||||
poll_fn(move |cx| {
|
||||
if matches!(rx.has_changed(), Ok(true)) {
|
||||
session = *rx.borrow_and_update();
|
||||
info!(%session, "changed session");
|
||||
}
|
||||
|
||||
let message = ready!(connection.poll_message(cx));
|
||||
|
||||
match message {
|
||||
Some(Ok(AsyncMessage::Notice(notice))) => {
|
||||
info!(%session, "notice: {}", notice);
|
||||
Poll::Pending
|
||||
}
|
||||
Some(Ok(AsyncMessage::Notification(notif))) => {
|
||||
warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received");
|
||||
Poll::Pending
|
||||
}
|
||||
Some(Ok(_)) => {
|
||||
warn!(%session, "unknown message");
|
||||
Poll::Pending
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(%session, "connection error: {}", e);
|
||||
Poll::Ready(())
|
||||
}
|
||||
None => Poll::Ready(()),
|
||||
}
|
||||
})
|
||||
.instrument(span)
|
||||
);
|
||||
|
||||
Ok(Client {
|
||||
inner: client,
|
||||
session: tx,
|
||||
})
|
||||
}
|
||||
|
||||
pub struct Client {
|
||||
pub inner: tokio_postgres::Client,
|
||||
session: tokio::sync::watch::Sender<uuid::Uuid>,
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@ use tokio_postgres::types::Type;
|
||||
use tokio_postgres::GenericClient;
|
||||
use tokio_postgres::IsolationLevel;
|
||||
use tokio_postgres::Row;
|
||||
use tracing::Instrument;
|
||||
use url::Url;
|
||||
|
||||
use super::conn_pool::ConnInfo;
|
||||
@@ -27,11 +28,16 @@ struct QueryData {
|
||||
params: Vec<serde_json::Value>,
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
struct BatchQueryData {
|
||||
queries: Vec<QueryData>,
|
||||
}
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
#[serde(untagged)]
|
||||
enum Payload {
|
||||
Single(QueryData),
|
||||
Batch(Vec<QueryData>),
|
||||
Batch(BatchQueryData),
|
||||
}
|
||||
|
||||
pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
|
||||
@@ -42,6 +48,7 @@ static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
|
||||
static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in");
|
||||
static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level");
|
||||
static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only");
|
||||
static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable");
|
||||
|
||||
static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
|
||||
|
||||
@@ -175,6 +182,7 @@ pub async fn handle(
|
||||
request: Request<Body>,
|
||||
sni_hostname: Option<String>,
|
||||
conn_pool: Arc<GlobalConnPool>,
|
||||
session_id: uuid::Uuid,
|
||||
) -> anyhow::Result<(Value, HashMap<HeaderName, HeaderValue>)> {
|
||||
//
|
||||
// Determine the destination and connection params
|
||||
@@ -190,7 +198,7 @@ pub async fn handle(
|
||||
// Allow connection pooling only if explicitly requested
|
||||
let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
|
||||
|
||||
// isolation level and read only
|
||||
// isolation level, read only and deferrable
|
||||
|
||||
let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned();
|
||||
let txn_isolation_level = match txn_isolation_level_raw {
|
||||
@@ -204,8 +212,8 @@ pub async fn handle(
|
||||
None => None,
|
||||
};
|
||||
|
||||
let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned();
|
||||
let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE);
|
||||
let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE);
|
||||
let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE);
|
||||
|
||||
let request_content_length = match request.body().size_hint().upper() {
|
||||
Some(v) => v,
|
||||
@@ -224,26 +232,29 @@ pub async fn handle(
|
||||
let body = hyper::body::to_bytes(request.into_body()).await?;
|
||||
let payload: Payload = serde_json::from_slice(&body)?;
|
||||
|
||||
let mut client = conn_pool.get(&conn_info, !allow_pool).await?;
|
||||
let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;
|
||||
|
||||
//
|
||||
// Now execute the query and return the result
|
||||
//
|
||||
let result = match payload {
|
||||
Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode)
|
||||
Payload::Single(query) => query_to_json(&client.inner, query, raw_output, array_mode)
|
||||
.await
|
||||
.map(|x| (x, HashMap::default())),
|
||||
Payload::Batch(queries) => {
|
||||
Payload::Batch(batch_query) => {
|
||||
let mut results = Vec::new();
|
||||
let mut builder = client.build_transaction();
|
||||
let mut builder = client.inner.build_transaction();
|
||||
if let Some(isolation_level) = txn_isolation_level {
|
||||
builder = builder.isolation_level(isolation_level);
|
||||
}
|
||||
if txn_read_only {
|
||||
builder = builder.read_only(true);
|
||||
}
|
||||
if txn_deferrable {
|
||||
builder = builder.deferrable(true);
|
||||
}
|
||||
let transaction = builder.start().await?;
|
||||
for query in queries {
|
||||
for query in batch_query.queries {
|
||||
let result = query_to_json(&transaction, query, raw_output, array_mode).await;
|
||||
match result {
|
||||
Ok(r) => results.push(r),
|
||||
@@ -255,12 +266,20 @@ pub async fn handle(
|
||||
}
|
||||
transaction.commit().await?;
|
||||
let mut headers = HashMap::default();
|
||||
headers.insert(
|
||||
TXN_READ_ONLY.clone(),
|
||||
HeaderValue::try_from(txn_read_only.to_string())?,
|
||||
);
|
||||
if let Some(txn_isolation_level_raw) = txn_isolation_level_raw {
|
||||
headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw);
|
||||
if txn_read_only {
|
||||
headers.insert(
|
||||
TXN_READ_ONLY.clone(),
|
||||
HeaderValue::try_from(txn_read_only.to_string())?,
|
||||
);
|
||||
}
|
||||
if txn_deferrable {
|
||||
headers.insert(
|
||||
TXN_DEFERRABLE.clone(),
|
||||
HeaderValue::try_from(txn_deferrable.to_string())?,
|
||||
);
|
||||
}
|
||||
if let Some(txn_isolation_level) = txn_isolation_level_raw {
|
||||
headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level);
|
||||
}
|
||||
Ok((json!({ "results": results }), headers))
|
||||
}
|
||||
@@ -268,9 +287,12 @@ pub async fn handle(
|
||||
|
||||
if allow_pool {
|
||||
// return connection to the pool
|
||||
tokio::task::spawn(async move {
|
||||
let _ = conn_pool.put(&conn_info, client).await;
|
||||
});
|
||||
tokio::task::spawn(
|
||||
async move {
|
||||
let _ = conn_pool.put(&conn_info, client).await;
|
||||
}
|
||||
.in_current_span(),
|
||||
);
|
||||
}
|
||||
|
||||
result
|
||||
|
||||
@@ -203,7 +203,7 @@ async fn ws_handler(
|
||||
// TODO: that deserves a refactor as now this function also handles http json client besides websockets.
|
||||
// Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
|
||||
let result = sql_over_http::handle(request, sni_hostname, conn_pool)
|
||||
let result = sql_over_http::handle(request, sni_hostname, conn_pool, session_id)
|
||||
.instrument(info_span!("sql-over-http"))
|
||||
.await;
|
||||
let status_code = match result {
|
||||
@@ -307,7 +307,7 @@ pub async fn task_main(
|
||||
ws_handler(req, config, conn_pool, cancel_map, session_id, sni_name)
|
||||
.instrument(info_span!(
|
||||
"ws-client",
|
||||
session = format_args!("{session_id}")
|
||||
session = %session_id
|
||||
))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ use super::{messages::ServerMessage, Mechanism};
|
||||
use crate::stream::PqStream;
|
||||
use std::io;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::info;
|
||||
|
||||
/// Abstracts away all peculiarities of the libpq's protocol.
|
||||
pub struct SaslStream<'a, S> {
|
||||
@@ -68,7 +69,10 @@ impl<S: AsyncRead + AsyncWrite + Unpin> SaslStream<'_, S> {
|
||||
) -> super::Result<Outcome<M::Output>> {
|
||||
loop {
|
||||
let input = self.recv().await?;
|
||||
let step = mechanism.exchange(input)?;
|
||||
let step = mechanism.exchange(input).map_err(|error| {
|
||||
info!(?error, "error during SASL exchange");
|
||||
error
|
||||
})?;
|
||||
|
||||
use super::Step;
|
||||
return Ok(match step {
|
||||
|
||||
@@ -15,6 +15,7 @@ use toml_edit::Document;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use storage_broker::Uri;
|
||||
@@ -122,9 +123,24 @@ struct Args {
|
||||
/// WAL backup horizon.
|
||||
#[arg(long)]
|
||||
disable_wal_backup: bool,
|
||||
/// Path to a .pem public key which is used to check JWT tokens.
|
||||
#[arg(long)]
|
||||
auth_validation_public_key_path: Option<PathBuf>,
|
||||
/// If given, enables auth on incoming connections to WAL service endpoint
|
||||
/// (--listen-pg). Value specifies path to a .pem public key used for
|
||||
/// validations of JWT tokens. Empty string is allowed and means disabling
|
||||
/// auth.
|
||||
#[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
|
||||
pg_auth_public_key_path: Option<PathBuf>,
|
||||
/// If given, enables auth on incoming connections to tenant only WAL
|
||||
/// service endpoint (--listen-pg-tenant-only). Value specifies path to a
|
||||
/// .pem public key used for validations of JWT tokens. Empty string is
|
||||
/// allowed and means disabling auth.
|
||||
#[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
|
||||
pg_tenant_only_auth_public_key_path: Option<PathBuf>,
|
||||
/// If given, enables auth on incoming connections to http management
|
||||
/// service endpoint (--listen-http). Value specifies path to a .pem public
|
||||
/// key used for validations of JWT tokens. Empty string is allowed and
|
||||
/// means disabling auth.
|
||||
#[arg(long, verbatim_doc_comment, value_parser = opt_pathbuf_parser)]
|
||||
http_auth_public_key_path: Option<PathBuf>,
|
||||
/// Format for logging, either 'plain' or 'json'.
|
||||
#[arg(long, default_value = "plain")]
|
||||
log_format: String,
|
||||
@@ -134,9 +150,39 @@ struct Args {
|
||||
current_thread_runtime: bool,
|
||||
}
|
||||
|
||||
// Like PathBufValueParser, but allows empty string.
|
||||
fn opt_pathbuf_parser(s: &str) -> Result<PathBuf, String> {
|
||||
Ok(PathBuf::from_str(s).unwrap())
|
||||
}
|
||||
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
// We want to allow multiple occurences of the same arg (taking the last) so
|
||||
// that neon_local could generate command with defaults + overrides without
|
||||
// getting 'argument cannot be used multiple times' error. This seems to be
|
||||
// impossible with pure Derive API, so convert struct to Command, modify it,
|
||||
// parse arguments, and then fill the struct back.
|
||||
let cmd = <Args as clap::CommandFactory>::command().args_override_self(true);
|
||||
let mut matches = cmd.get_matches();
|
||||
let mut args = <Args as clap::FromArgMatches>::from_arg_matches_mut(&mut matches)?;
|
||||
|
||||
// I failed to modify opt_pathbuf_parser to return Option<PathBuf> in
|
||||
// reasonable time, so turn empty string into option post factum.
|
||||
if let Some(pb) = &args.pg_auth_public_key_path {
|
||||
if pb.as_os_str().is_empty() {
|
||||
args.pg_auth_public_key_path = None;
|
||||
}
|
||||
}
|
||||
if let Some(pb) = &args.pg_tenant_only_auth_public_key_path {
|
||||
if pb.as_os_str().is_empty() {
|
||||
args.pg_tenant_only_auth_public_key_path = None;
|
||||
}
|
||||
}
|
||||
if let Some(pb) = &args.http_auth_public_key_path {
|
||||
if pb.as_os_str().is_empty() {
|
||||
args.http_auth_public_key_path = None;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(addr) = args.dump_control_file {
|
||||
let state = control_file::FileStorage::load_control_file(addr)?;
|
||||
@@ -170,13 +216,40 @@ async fn main() -> anyhow::Result<()> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let auth = match args.auth_validation_public_key_path.as_ref() {
|
||||
let pg_auth = match args.pg_auth_public_key_path.as_ref() {
|
||||
None => {
|
||||
info!("auth is disabled");
|
||||
info!("pg auth is disabled");
|
||||
None
|
||||
}
|
||||
Some(path) => {
|
||||
info!("loading JWT auth key from {}", path.display());
|
||||
info!("loading pg auth JWT key from {}", path.display());
|
||||
Some(Arc::new(
|
||||
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
|
||||
))
|
||||
}
|
||||
};
|
||||
let pg_tenant_only_auth = match args.pg_tenant_only_auth_public_key_path.as_ref() {
|
||||
None => {
|
||||
info!("pg tenant only auth is disabled");
|
||||
None
|
||||
}
|
||||
Some(path) => {
|
||||
info!(
|
||||
"loading pg tenant only auth JWT key from {}",
|
||||
path.display()
|
||||
);
|
||||
Some(Arc::new(
|
||||
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
|
||||
))
|
||||
}
|
||||
};
|
||||
let http_auth = match args.http_auth_public_key_path.as_ref() {
|
||||
None => {
|
||||
info!("http auth is disabled");
|
||||
None
|
||||
}
|
||||
Some(path) => {
|
||||
info!("loading http auth JWT key from {}", path.display());
|
||||
Some(Arc::new(
|
||||
JwtAuth::from_key_path(path).context("failed to load the auth key")?,
|
||||
))
|
||||
@@ -199,7 +272,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
max_offloader_lag_bytes: args.max_offloader_lag,
|
||||
wal_backup_enabled: !args.disable_wal_backup,
|
||||
backup_parallel_jobs: args.wal_backup_parallel_jobs,
|
||||
auth,
|
||||
pg_auth,
|
||||
pg_tenant_only_auth,
|
||||
http_auth,
|
||||
current_thread_runtime: args.current_thread_runtime,
|
||||
};
|
||||
|
||||
@@ -288,7 +363,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
.spawn(wal_service::task_main(
|
||||
conf_,
|
||||
pg_listener,
|
||||
Some(Scope::SafekeeperData),
|
||||
Scope::SafekeeperData,
|
||||
))
|
||||
// wrap with task name for error reporting
|
||||
.map(|res| ("WAL service main".to_owned(), res));
|
||||
@@ -302,7 +377,7 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
.spawn(wal_service::task_main(
|
||||
conf_,
|
||||
pg_listener_tenant_only,
|
||||
Some(Scope::Tenant),
|
||||
Scope::Tenant,
|
||||
))
|
||||
// wrap with task name for error reporting
|
||||
.map(|res| ("WAL service tenant only main".to_owned(), res));
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
use anyhow::Context;
|
||||
use std::str::FromStr;
|
||||
use std::str::{self};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, info_span, Instrument};
|
||||
|
||||
@@ -20,7 +21,7 @@ use postgres_backend::{self, PostgresBackend};
|
||||
use postgres_ffi::PG_TLI;
|
||||
use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
|
||||
use regex::Regex;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::auth::{Claims, JwtAuth, Scope};
|
||||
use utils::{
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
lsn::Lsn,
|
||||
@@ -36,8 +37,8 @@ pub struct SafekeeperPostgresHandler {
|
||||
pub ttid: TenantTimelineId,
|
||||
/// Unique connection id is logged in spans for observability.
|
||||
pub conn_id: ConnectionId,
|
||||
/// Auth scope allowed on the connections. None if auth is not configured.
|
||||
allowed_auth_scope: Option<Scope>,
|
||||
/// Auth scope allowed on the connections and public key used to check auth tokens. None if auth is not configured.
|
||||
auth: Option<(Scope, Arc<JwtAuth>)>,
|
||||
claims: Option<Claims>,
|
||||
io_metrics: Option<TrafficMetrics>,
|
||||
}
|
||||
@@ -154,18 +155,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
|
||||
) -> Result<(), QueryError> {
|
||||
// this unwrap is never triggered, because check_auth_jwt only called when auth_type is NeonJWT
|
||||
// which requires auth to be present
|
||||
let data = self
|
||||
.conf
|
||||
let (allowed_auth_scope, auth) = self
|
||||
.auth
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
|
||||
.expect("auth_type is configured but .auth of handler is missing");
|
||||
let data =
|
||||
auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
|
||||
|
||||
let scope = self
|
||||
.allowed_auth_scope
|
||||
.expect("auth is enabled but scope is not configured");
|
||||
// The handler might be configured to allow only tenant scope tokens.
|
||||
if matches!(scope, Scope::Tenant) && !matches!(data.claims.scope, Scope::Tenant) {
|
||||
if matches!(allowed_auth_scope, Scope::Tenant)
|
||||
&& !matches!(data.claims.scope, Scope::Tenant)
|
||||
{
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"passed JWT token is for full access, but only tenant scope is allowed"
|
||||
)));
|
||||
@@ -244,7 +244,7 @@ impl SafekeeperPostgresHandler {
|
||||
conf: SafeKeeperConf,
|
||||
conn_id: u32,
|
||||
io_metrics: Option<TrafficMetrics>,
|
||||
allowed_auth_scope: Option<Scope>,
|
||||
auth: Option<(Scope, Arc<JwtAuth>)>,
|
||||
) -> Self {
|
||||
SafekeeperPostgresHandler {
|
||||
conf,
|
||||
@@ -254,7 +254,7 @@ impl SafekeeperPostgresHandler {
|
||||
ttid: TenantTimelineId::empty(),
|
||||
conn_id,
|
||||
claims: None,
|
||||
allowed_auth_scope,
|
||||
auth,
|
||||
io_metrics,
|
||||
}
|
||||
}
|
||||
@@ -262,7 +262,7 @@ impl SafekeeperPostgresHandler {
|
||||
// when accessing management api supply None as an argument
|
||||
// when using to authorize tenant pass corresponding tenant id
|
||||
fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
|
||||
if self.conf.auth.is_none() {
|
||||
if self.auth.is_none() {
|
||||
// auth is set to Trust, nothing to check so just return ok
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -359,7 +359,7 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
/// Safekeeper http router.
|
||||
pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let mut router = endpoint::make_router();
|
||||
if conf.auth.is_some() {
|
||||
if conf.http_auth.is_some() {
|
||||
router = router.middleware(auth_middleware(|request| {
|
||||
#[allow(clippy::mutable_key_type)]
|
||||
static ALLOWLIST_ROUTES: Lazy<HashSet<Uri>> =
|
||||
@@ -375,7 +375,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
|
||||
// NB: on any changes do not forget to update the OpenAPI spec
|
||||
// located nearby (/safekeeper/src/http/openapi_spec.yaml).
|
||||
let auth = conf.auth.clone();
|
||||
let auth = conf.http_auth.clone();
|
||||
router
|
||||
.data(Arc::new(conf))
|
||||
.data(auth)
|
||||
|
||||
@@ -65,7 +65,9 @@ pub struct SafeKeeperConf {
|
||||
pub max_offloader_lag_bytes: u64,
|
||||
pub backup_parallel_jobs: usize,
|
||||
pub wal_backup_enabled: bool,
|
||||
pub auth: Option<Arc<JwtAuth>>,
|
||||
pub pg_auth: Option<Arc<JwtAuth>>,
|
||||
pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
|
||||
pub http_auth: Option<Arc<JwtAuth>>,
|
||||
pub current_thread_runtime: bool,
|
||||
}
|
||||
|
||||
@@ -99,7 +101,9 @@ impl SafeKeeperConf {
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
wal_backup_enabled: true,
|
||||
backup_parallel_jobs: 1,
|
||||
auth: None,
|
||||
pg_auth: None,
|
||||
pg_tenant_only_auth: None,
|
||||
http_auth: None,
|
||||
heartbeat_timeout: Duration::new(5, 0),
|
||||
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
current_thread_runtime: false,
|
||||
|
||||
@@ -16,10 +16,13 @@ use crate::SafeKeeperConf;
|
||||
use postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
/// allowed_auth_scope is either SafekeeperData (wide JWT tokens giving access
|
||||
/// to any tenant are allowed) or Tenant (only tokens giving access to specific
|
||||
/// tenant are allowed). Doesn't matter if auth is disabled in conf.
|
||||
pub async fn task_main(
|
||||
conf: SafeKeeperConf,
|
||||
pg_listener: std::net::TcpListener,
|
||||
allowed_auth_scope: Option<Scope>,
|
||||
allowed_auth_scope: Scope,
|
||||
) -> anyhow::Result<()> {
|
||||
// Tokio's from_std won't do this for us, per its comment.
|
||||
pg_listener.set_nonblocking(true)?;
|
||||
@@ -50,7 +53,7 @@ async fn handle_socket(
|
||||
socket: TcpStream,
|
||||
conf: SafeKeeperConf,
|
||||
conn_id: ConnectionId,
|
||||
allowed_auth_scope: Option<Scope>,
|
||||
allowed_auth_scope: Scope,
|
||||
) -> Result<(), QueryError> {
|
||||
socket.set_nodelay(true)?;
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
@@ -82,16 +85,17 @@ async fn handle_socket(
|
||||
},
|
||||
);
|
||||
|
||||
let auth_type = match conf.auth {
|
||||
let auth_key = match allowed_auth_scope {
|
||||
Scope::Tenant => conf.pg_tenant_only_auth.clone(),
|
||||
_ => conf.pg_auth.clone(),
|
||||
};
|
||||
let auth_type = match auth_key {
|
||||
None => AuthType::Trust,
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let mut conn_handler = SafekeeperPostgresHandler::new(
|
||||
conf,
|
||||
conn_id,
|
||||
Some(traffic_metrics.clone()),
|
||||
allowed_auth_scope,
|
||||
);
|
||||
let auth_pair = auth_key.map(|key| (allowed_auth_scope, key));
|
||||
let mut conn_handler =
|
||||
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()), auth_pair);
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
|
||||
581
scripts/plumber.py
Normal file
581
scripts/plumber.py
Normal file
@@ -0,0 +1,581 @@
|
||||
import argparse
|
||||
import asyncio
|
||||
import enum
|
||||
import json
|
||||
import os
|
||||
import pprint
|
||||
import tempfile
|
||||
from asyncio import subprocess
|
||||
from datetime import date, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
|
||||
"""
|
||||
This is the automation tool that was mostly helpful during our big aws account migration,
|
||||
but may be helpful in other day to day tasks and concentrate knowledge about operations
|
||||
that can help during on-call.
|
||||
|
||||
|
||||
This script executes commands on remote hosts using ssh multiplexing. See references:
|
||||
https://blog.scottlowe.org/2015/12/11/using-ssh-multiplexing/
|
||||
https://github.com/openssh-rust/openssh/blob/master/src/builder.rs
|
||||
https://github.com/openssh-rust/openssh/blob/master/src/process_impl/session.rs
|
||||
https://en.wikibooks.org/wiki/OpenSSH/Cookbook/Multiplexing
|
||||
https://docs.rs/openssh/0.9.8/openssh/
|
||||
|
||||
For use with teleport you'll need to setup nsh script mentioned here:
|
||||
https://github.com/neondatabase/cloud/wiki/Cloud%3A-access#3-access-the-nodes-with-ssm
|
||||
"""
|
||||
|
||||
|
||||
def show_line(output_label: Optional[str], line: str):
    """Echo one line of subprocess output to stdout.

    The line is printed without an added newline, prefixed with
    ``(output_label)`` when a label is given and with a single space
    otherwise. An empty ``line`` is followed by an explicit newline.
    """
    prefix = " " if output_label is None else f"({output_label})"
    print(prefix, line, end="")
    if not line:
        # nothing (and in particular no newline) was printed after the prefix
        print()
|
||||
|
||||
|
||||
async def exec_checked(
    program: str,
    args: List[str],
    err_msg: Optional[str] = None,
    output_label: Optional[str] = None,
    show_output: bool = True,
    expected_exit_codes=frozenset((0,)),
) -> List[str]:
    """Run ``program`` with ``args`` and return its stdout as a list of lines.

    Args:
        program: executable to run.
        args: command line arguments passed to the executable.
        err_msg: message of the assertion raised on an unexpected exit code;
            defaults to "<program> failed with <exit code>".
        output_label: label used as a prefix when echoing output lines.
        show_output: when true, echo the command line and every output line.
        expected_exit_codes: collection of exit codes treated as success.

    Returns:
        The lines the program wrote to stdout, line endings included. The
        list is empty when the program printed nothing.

    Raises:
        AssertionError: the program exited with a code that is not in
            ``expected_exit_codes``.
    """
    if show_output:
        print("+", program, *args)
    proc = await subprocess.create_subprocess_exec(
        program,
        *args,
        stdout=asyncio.subprocess.PIPE,
        # stream reader buffer limit (10 MiB) so that long lines can be read
        limit=10 << 20,
    )

    assert proc.stdout is not None

    out: List[str] = []
    while True:
        line = (await proc.stdout.readline()).decode()
        # empty line means eof, actual empty line from the program is represented by "\n"
        if not line:
            break

        if show_output:
            show_line(output_label, line)
        out.append(line)

    exit_code = await proc.wait()
    # NOTE: this check is an assert, so it is skipped when running with -O.
    assert exit_code in expected_exit_codes, err_msg or f"{program} failed with {exit_code}"
    return out
|
||||
|
||||
|
||||
class Connection:
    """Handle to a multiplexed ssh session addressed through a control socket.

    The control socket is expected at ``<tempdir>/master``. ``tempdir`` is
    kept as an attribute of the object; ``target`` is only used to label the
    output of spawned commands.
    """

    def __init__(
        self,
        tempdir: tempfile.TemporaryDirectory,  # type: ignore
        target: str,
    ):
        self.tempdir = tempdir
        self.target = target

    def get_args(self, extra_args: List[str]):
        """Return ssh arguments that select the control socket, plus ``extra_args``."""
        ctl_path = os.path.join(self.tempdir.name, "master")
        # "none" fills the destination slot of the ssh command line.
        # NOTE(review): presumably the destination is ignored because the
        # session is selected by the control socket - confirm.
        return ["-S", ctl_path, "-o", "BatchMode=yes", *extra_args, "none"]

    async def check(self):
        """Run the ssh ``-O check`` control command against the master."""
        args = self.get_args(["-O", "check"])
        await exec_checked("ssh", args, err_msg="master check operation failed")

    def _spawn_args(self, cmd: str) -> List[str]:
        """Build the ssh argument list that runs ``cmd`` through ``bash -c``."""
        import shlex

        # https://github.com/openssh-rust/openssh/blob/cd8f174fafc530d8e55c2aa63add14a24cb2b94c/src/process_impl/session.rs#L72
        local_args = self.get_args(["-T", "-p", "9"])
        # Quote the command so that it reaches bash as a single argument even
        # when it contains single quotes or other shell metacharacters.
        local_args.extend(["--", f"bash -c {shlex.quote(cmd)}"])
        return local_args

    async def spawn(self, cmd: str):
        """Run ``cmd`` on the remote side and return its stdout lines."""
        return await exec_checked(
            "ssh", self._spawn_args(cmd), err_msg="spawn failed", output_label=self.target
        )

    async def close(self):
        """Run the ssh ``-O exit`` control command against the master."""
        args = self.get_args(["-O", "exit"])
        await exec_checked("ssh", args, err_msg="master exit operation failed")
|
||||
|
||||
|
||||
async def connect(target: str) -> Connection:
    """Start a backgrounded ssh master process for ``target``.

    ``target`` is passed to the ssh command as is. The control socket is
    created inside a fresh temporary directory which is handed over to the
    returned ``Connection``.
    """
    # NOTE: it is mentioned that this setup is not secure. For better security
    # the socket should be placed somewhere in ~/.ssh or in another directory
    # with proper permissions. openssh-rust does it the same way:
    # https://github.com/openssh-rust/openssh/blob/master/src/builder.rs
    connection_dir = tempfile.TemporaryDirectory(suffix=".ssh-multiplexed")
    control_socket = os.path.join(connection_dir.name, "master")

    # "-E logfile" can be added here when debugging
    master_args = [
        "-S",
        control_socket,
        "-M",  # master mode for connection sharing
        "-f",  # go to background just before command execution
        "-N",  # do not execute a remote command
        "-o",
        "BatchMode=yes",
        target,
    ]
    await exec_checked("ssh", master_args, err_msg="starting master process failed")
    return Connection(tempdir=connection_dir, target=target)
|
||||
|
||||
|
||||
class Timer:
    """Context manager that prints ``msg`` and the elapsed time on exit.

    The clock starts when the object is constructed, not when the ``with``
    block is entered.
    """

    def __init__(self, msg: str) -> None:
        self.t0 = datetime.now()
        self.msg = msg

    def __enter__(self):
        """Enter the block; nothing is bound by ``with ... as``."""
        return None

    def __exit__(self, *_):
        """Print the message followed by the time passed since construction."""
        elapsed = datetime.now() - self.t0
        print(self.msg, elapsed)
|
||||
|
||||
|
||||
def parse_date(s: str) -> date:
    """Parse a ``YYYY-MM-DD`` string into a ``date``.

    Raises ``ValueError`` when ``s`` does not match that format.
    """
    parsed = datetime.strptime(s, "%Y-%m-%d")
    return parsed.date()
|
||||
|
||||
|
||||
def write_line(f, line: str):
    """Write ``line`` to the file object ``f`` and terminate it with a newline."""
    for piece in (line, "\n"):
        f.write(piece)
|
||||
|
||||
|
||||
async def pageserver_tenant_sizes(
    pageserver_target: str, tenants_of_interest: Optional[List[str]] = None
) -> Dict[str, int]:
    """Return the on-disk size in bytes of tenants stored on a pageserver.

    Sizes are collected by running ``du`` over the tenants directory on the
    remote host.

    With ondemand it should rather look at physical size api.
    For old projects, since we don't have eviction yet,
    we can look at local fs state.

    Args:
        pageserver_target: ssh destination of the pageserver.
        tenants_of_interest: when given, only these tenants are reported.

    Returns:
        Mapping of tenant id to size in bytes.
    """
    wanted: Optional[Set[str]] = None
    if tenants_of_interest is not None:
        wanted = set(tenants_of_interest)

    ps_connection = await connect(pageserver_target)
    try:
        out = await ps_connection.spawn("du -sb /storage/pageserver/data/tenants/* | sort -rh")
    finally:
        # always stop the backgrounded ssh master, otherwise it is left running
        await ps_connection.close()

    tenants: Dict[str, int] = {}

    for line in out:
        # blank lines carry no "<size> <path>" pair to unpack
        if not line.strip():
            continue
        if line.startswith("du: cannot read directory"):
            continue

        size, tenant_path = map(str.strip, line.split())
        tenant = Path(tenant_path).stem
        if wanted is not None and tenant not in wanted:
            continue

        tenants[tenant] = int(size)
    return tenants
|
||||
|
||||
|
||||
async def fetch_ps_size(args):
    """Print per-tenant sizes of a pageserver, largest first, and their total.

    ``args.target`` is the pageserver host. ``args.input`` optionally names a
    file with one tenant per line; only those tenants are reported then.
    """
    tenants = None if args.input is None else Path(args.input).read_text().splitlines()

    sizes = await pageserver_tenant_sizes(args.target, tenants_of_interest=tenants)

    total = 0
    largest_first = sorted(sizes.items(), key=lambda item: item[1], reverse=True)
    for tenant, size in largest_first:
        total += size
        print(tenant, size)
    print("total", total)
|
||||
|
||||
|
||||
@enum.unique
class Env(enum.Enum):
    """Console environment selectable with the ``--env`` command line flag."""

    # the values double as the accepted ``--env`` choices
    STAGING = "staging"
    PRODUCTION = "production"
|
||||
|
||||
|
||||
class ConsoleAdminShortcuts:
    """Thin wrappers around console admin/management endpoints, called via curl.

    The API token is read from the ``CONSOLE_ADMIN_API_TOKEN`` environment
    variable. Every request method returns the decoded JSON response.
    """

    def __init__(self, env: Env, verbose: bool = False):
        # NOTE(review): the staging admin url points at console.neon.tech with
        # an /api/v1 suffix while production has no suffix - confirm both.
        if env is Env.STAGING:
            self.admin_base_url = "https://console.neon.tech/api/v1"
            self.management_base_url = "http://console-staging.local:3440/management/api/v2"
        elif env is Env.PRODUCTION:
            self.admin_base_url = "https://console.neon.tech"
            self.management_base_url = "http://console-release.local:3441/management/api/v2"
        else:
            # without this the base urls would silently stay undefined
            raise ValueError(f"unsupported env: {env!r}")

        self.api_token = os.getenv("CONSOLE_ADMIN_API_TOKEN")
        assert self.api_token, '"CONSOLE_ADMIN_API_TOKEN" is missing in env'

        self.verbose = verbose

    def _headers(self) -> List[str]:
        """Return the curl arguments for the auth and accept headers."""
        return [
            "-H",
            f"Authorization: Bearer {self.api_token}",
            "-H",
            "Accept: application/json",
        ]

    async def _curl_json(self, curl_args: List[str]):
        """Run curl with ``curl_args`` and decode its single line of JSON output."""
        output = await exec_checked(
            "curl",
            curl_args,
            show_output=self.verbose,
        )
        assert len(output) == 1  # output should be one line of json
        return json.loads(output.pop())

    async def check_availability(self, project_id: str):
        """POST an availability check request for ``project_id``."""
        url = f"{self.admin_base_url}/admin/projects/{project_id}/check_availability"
        return await self._curl_json(["--silent", "--fail", "-XPOST", url, *self._headers()])

    async def get_operation(self, operation_id: str):
        """GET the operation with id ``operation_id``."""
        url = f"{self.admin_base_url}/admin/operations/{operation_id}"
        return await self._curl_json(["--silent", "--fail", url, *self._headers()])

    async def get_pageservers(self):
        """GET the pageservers known to the console."""
        url = f"{self.admin_base_url}/admin/pageservers"
        return await self._curl_json(["--silent", "--fail", url, *self._headers()])

    async def set_maintenance(self, project_id: str, maintenance: bool) -> Dict[str, Any]:
        """
        PUT the maintenance flag of ``project_id``.

        Example response:
        {
            "project": {
                "id": "tight-wood-864662",
                "maintenance_set_at": "2023-01-31T13:36:45.90346Z"
            },
            "operations": [
                {
                    "id": "216142e0-fbb7-4f41-a470-e63408d4d6b4"
                }
            ]
        }
        """
        url = f"{self.management_base_url}/projects/{project_id}/maintenance"
        data = json.dumps({"maintenance": maintenance})
        # unlike the other requests, curl is only silenced when not verbose
        args: List[str] = [] if self.verbose else ["--silent"]
        # NOTE(review): the JSON body is sent without a Content-Type header -
        # confirm the endpoint accepts that.
        args.extend(["--fail", "-XPUT", url, *self._headers(), "-d", data])
        # the assembled arguments must actually be handed to curl
        ret = await self._curl_json(args)
        assert isinstance(ret, dict)
        return ret

    async def fetch_branches(self, project_id: str):
        """GET the branches of ``project_id``."""
        url = f"{self.admin_base_url}/admin/branches?project_id={project_id}"
        return await self._curl_json(["--silent", "--fail", url, *self._headers()])
|
||||
|
||||
|
||||
async def poll_pending_ops(console: ConsoleAdminShortcuts, pending_ops: Set[str]):
    """Query the console once per pending operation and drop the finished ones.

    ``pending_ops`` is modified in place: ids whose status is "finished" are
    removed. Operations reported as "failed" or with a non-zero
    ``failures_count`` are only logged and stay in the set.
    """
    # NOTE(review): a "failed" operation is never removed, so a caller looping
    # until the set is empty keeps polling it - confirm this is intended.
    completed = set()  # collected separately: a set cannot change while iterated
    for op_id in pending_ops:
        response = await console.get_operation(op_id)
        operation = response["operation"]
        status = operation["status"]

        if status == "failed":
            print(f"ERROR: operation {op_id} failed")
        elif operation["failures_count"] != 0:
            print(f"WARN: operation {op_id} has failures != 0")
        elif status == "finished":
            print(f"operation {op_id} finished")
            completed.add(op_id)
        else:
            print(f"operation {op_id} is still pending: {status}")

    pending_ops.difference_update(completed)
|
||||
|
||||
|
||||
async def check_availability(args):
    """Run availability checks for every project listed in ``args.input``.

    At most ``args.max_concurrent_checks`` operations are kept pending while
    new checks are being scheduled; afterwards the remaining operations are
    polled until none is pending.
    """
    console = ConsoleAdminShortcuts(env=Env(args.env))
    max_concurrent_checks = args.max_concurrent_checks

    # stored reversed so that popping from the end keeps the file order
    projects: List[str] = Path(args.input).read_text().splitlines()[::-1]
    print("n_projects", len(projects))

    pending_ops: Set[str] = set()
    while projects:
        # walk through pending ops
        if pending_ops:
            print("pending", len(pending_ops), pending_ops)
            await poll_pending_ops(console, pending_ops)

        # schedule new ops if limit allows
        while projects and len(pending_ops) < max_concurrent_checks:
            project = projects.pop()
            print("starting:", project, len(projects))
            # there can be many operations, one for each endpoint
            data = await console.check_availability(project)
            # NOTE(review): this reads the key "ID" while maintain() reads
            # "id" - confirm the response really uses the upper case key.
            pending_ops.update(operation["ID"] for operation in data["operations"])
            # wait a bit before starting next one
            await asyncio.sleep(2)

        if projects:
            # sleep a little bit to give operations time to finish
            await asyncio.sleep(5)

    print("all scheduled, poll pending", len(pending_ops), pending_ops, projects)
    while pending_ops:
        await poll_pending_ops(console, pending_ops)
        await asyncio.sleep(5)
|
||||
|
||||
|
||||
async def maintain(args):
    """Set or clear the maintenance flag of every project in ``args.input``.

    The flag is cleared when ``args.finish`` is set and set otherwise. When
    setting it, the operations returned by the console are polled until none
    is pending.
    """
    console = ConsoleAdminShortcuts(env=Env(args.env))
    finish_flag = args.finish

    projects: List[str] = Path(args.input).read_text().splitlines()
    print("n_projects", len(projects))

    pending_ops: Set[str] = set()
    for project in projects:
        data = await console.set_maintenance(project, maintenance=not finish_flag)
        operations = data["operations"]
        print(project, len(operations))
        pending_ops.update(operation["id"] for operation in operations)

    if finish_flag:
        # the code expects no operations when the flag is cleared
        assert len(pending_ops) == 0
        return

    print("all scheduled, poll pending", len(pending_ops), pending_ops)
    while pending_ops:
        await poll_pending_ops(console, pending_ops)
        print("n pending ops:", len(pending_ops))
        if pending_ops:
            await asyncio.sleep(5)
|
||||
|
||||
|
||||
# S3 bucket that fetch_sk_s3_size lists objects from.
SOURCE_BUCKET = "zenith-storage-oregon"
# NOTE(review): not referenced anywhere in the visible part of this script.
AWS_REGION = "us-west-2"
# Key prefix inside SOURCE_BUCKET; the tenant id is appended to it.
SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET = "prod-1/wal"
|
||||
|
||||
|
||||
async def fetch_sk_s3_size(args):
    """Print object count and size of the WAL stored in S3 for each tenant.

    Tenants are read from ``args.input``, one per line. The numbers are taken
    from the last two lines of ``aws s3 ls --recursive --summarize``; totals
    over all tenants are printed at the end.
    """
    tenants: List[str] = Path(args.input).read_text().splitlines()

    def trailing_int(text: str) -> int:
        # the number is the last whitespace separated field of the line
        return int(text.rsplit(maxsplit=1)[-1])

    total_objects = 0
    total_size = 0
    for tenant in tenants:
        wal_prefix = f"s3://{SOURCE_BUCKET}/{SAFEKEEPER_SOURCE_PREFIX_IN_BUCKET}/{tenant}"
        aws_args = [
            "--profile",
            "neon_main",
            "s3",
            "ls",
            "--recursive",
            "--summarize",
            wal_prefix,
        ]
        # exit code 1 is accepted in addition to 0
        result = await exec_checked(
            "aws",
            aws_args,
            expected_exit_codes={0, 1},
            show_output=False,
        )

        objects = trailing_int(result[-2])
        total_objects += objects

        size = trailing_int(result[-1])
        total_size += size

        print(tenant, "objects", objects, "size", size)

    print("total_objects", total_objects, "total_size", total_size)
|
||||
|
||||
|
||||
async def fetch_branches(args):
    """Pretty-print the console response listing branches of ``args.project_id``."""
    console = ConsoleAdminShortcuts(env=Env(args.env))
    project_id = args.project_id

    branches = await console.fetch_branches(project_id=project_id)
    pprint.pprint(branches)
|
||||
|
||||
|
||||
async def get_pageservers(args):
    """Pretty-print the console response listing pageservers."""
    console = ConsoleAdminShortcuts(env=Env(args.env))

    pageservers = await console.get_pageservers()
    pprint.pprint(pageservers)
|
||||
|
||||
|
||||
async def main():
    """Parse the command line and run the handler of the selected sub-command.

    The parser help is printed when no sub-command is given or when the
    selected one has no handler.
    """
    parser = argparse.ArgumentParser("migrator")
    sub = parser.add_subparsers(title="commands", dest="subparser_name")

    def add_env_argument(command_parser):
        # every console-backed command selects its environment the same way
        command_parser.add_argument(
            "--env", choices=["staging", "production"], default="staging"
        )

    # NOTE(review): "split" is declared here but has no entry in `handlers`
    # below, so selecting it only prints the help text.
    split_parser = sub.add_parser(
        "split",
    )
    split_parser.add_argument(
        "--input",
        help="CSV file with results from snowflake query mentioned in README.",
        required=True,
    )
    split_parser.add_argument(
        "--out",
        help="Directory to store groups of projects. Directory name is pageserver id.",
        required=True,
    )
    split_parser.add_argument(
        "--last-usage-cutoff",
        dest="last_usage_cutoff",
        help="Projects which do not have compute time starting from passed date (e g 2022-12-01) wil be considered not used recently",
        required=True,
    )
    split_parser.add_argument(
        "--select-pageserver-id",
        help="Filter input for this pageserver id",
        required=True,
    )

    fetch_ps_size_parser = sub.add_parser("fetch-ps-size")
    fetch_ps_size_parser.add_argument(
        "--target",
        help="Target pageserver host as resolvable by ssh",
        required=True,
    )
    # optional: without it all tenants are reported
    fetch_ps_size_parser.add_argument(
        "--input",
        help="File containing list of tenants to include",
    )

    # The handlers below read --input / --project-id unconditionally, so the
    # flags are required to fail early with a usage error.
    check_availability_parser = sub.add_parser("check-availability")
    check_availability_parser.add_argument(
        "--input",
        help="File containing list of projects to run availability checks for",
        required=True,
    )
    add_env_argument(check_availability_parser)
    check_availability_parser.add_argument(
        "--max-concurrent-checks",
        help="Max number of simultaneously active availability checks",
        type=int,
        default=50,
    )

    maintain_parser = sub.add_parser("maintain")
    maintain_parser.add_argument(
        "--input",
        help="File containing list of projects",
        required=True,
    )
    add_env_argument(maintain_parser)
    maintain_parser.add_argument(
        "--finish",
        action="store_true",
    )

    fetch_sk_s3_size_parser = sub.add_parser("fetch-sk-s3-size")
    fetch_sk_s3_size_parser.add_argument(
        "--input",
        help="File containing list of tenants",
        required=True,
    )

    fetch_branches_parser = sub.add_parser("fetch-branches")
    fetch_branches_parser.add_argument("--project-id", required=True)
    add_env_argument(fetch_branches_parser)

    get_pageservers_parser = sub.add_parser("get-pageservers")
    add_env_argument(get_pageservers_parser)

    args = parser.parse_args()

    handlers = {
        "fetch-ps-size": fetch_ps_size,
        "check-availability": check_availability,
        "maintain": maintain,
        "fetch-sk-s3-size": fetch_sk_s3_size,
        "fetch-branches": fetch_branches,
        "get-pageservers": get_pageservers,
    }

    handler = handlers.get(args.subparser_name)
    if handler:
        await handler(args)
    else:
        parser.print_help()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1313,12 +1313,20 @@ class NeonCli(AbstractNeonCli):
|
||||
log.info(f"Stopping pageserver with {cmd}")
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def safekeeper_start(self, id: int) -> "subprocess.CompletedProcess[str]":
|
||||
def safekeeper_start(
|
||||
self, id: int, extra_opts: Optional[List[str]] = None
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
s3_env_vars = None
|
||||
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
|
||||
s3_env_vars = self.env.remote_storage.access_env_vars()
|
||||
|
||||
return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
|
||||
if extra_opts is not None:
|
||||
extra_opts = [f"-e={opt}" for opt in extra_opts]
|
||||
else:
|
||||
extra_opts = []
|
||||
return self.raw_cli(
|
||||
["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
|
||||
)
|
||||
|
||||
def safekeeper_stop(
|
||||
self, id: Optional[int] = None, immediate=False
|
||||
@@ -1761,6 +1769,15 @@ class VanillaPostgres(PgProtocol):
|
||||
with open(os.path.join(self.pgdatadir, "postgresql.conf"), "a") as conf_file:
|
||||
conf_file.write("\n".join(options))
|
||||
|
||||
def edit_hba(self, hba: List[str]):
    """Prepend hba lines into pg_hba.conf file."""
    assert not self.running
    hba_path = os.path.join(self.pgdatadir, "pg_hba.conf")
    with open(hba_path, "r+") as conf_file:
        previous = conf_file.read()
        # rewind so that the new lines land in front of the old content
        conf_file.seek(0)
        for chunk in ("\n".join(hba) + "\n", previous):
            conf_file.write(chunk)
|
||||
|
||||
def start(self, log_path: Optional[str] = None):
|
||||
assert not self.running
|
||||
self.running = True
|
||||
@@ -2158,15 +2175,18 @@ def static_proxy(
|
||||
) -> Iterator[NeonProxy]:
|
||||
"""Neon proxy that routes directly to vanilla postgres."""
|
||||
|
||||
# For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
|
||||
|
||||
port = vanilla_pg.default_options["port"]
|
||||
host = vanilla_pg.default_options["host"]
|
||||
dbname = vanilla_pg.default_options["dbname"]
|
||||
auth_endpoint = f"postgres://proxy:password@{host}:{port}/{dbname}"
|
||||
|
||||
# require password for 'http_auth' user
|
||||
vanilla_pg.edit_hba([f"host {dbname} http_auth {host} password"])
|
||||
|
||||
# For simplicity, we use the same user for both `--auth-endpoint` and `safe_psql`
|
||||
vanilla_pg.start()
|
||||
vanilla_pg.safe_psql("create user proxy with login superuser password 'password'")
|
||||
|
||||
proxy_port = port_distributor.get_port()
|
||||
mgmt_port = port_distributor.get_port()
|
||||
http_port = port_distributor.get_port()
|
||||
@@ -2507,9 +2527,9 @@ class Safekeeper:
|
||||
id: int
|
||||
running: bool = False
|
||||
|
||||
def start(self) -> "Safekeeper":
|
||||
def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
|
||||
assert self.running is False
|
||||
self.env.neon_cli.safekeeper_start(self.id)
|
||||
self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
|
||||
self.running = True
|
||||
# wait for wal acceptor start by checking its status
|
||||
started_at = time.time()
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional
|
||||
|
||||
from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.remote_storage import RemoteStorageKind, S3Storage
|
||||
@@ -191,7 +193,11 @@ def wait_timeline_detail_404(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
iterations: int,
|
||||
interval: Optional[float] = None,
|
||||
):
|
||||
if interval is None:
|
||||
interval = 0.25
|
||||
|
||||
def timeline_is_missing():
|
||||
data = {}
|
||||
try:
|
||||
@@ -204,7 +210,7 @@ def wait_timeline_detail_404(
|
||||
|
||||
raise RuntimeError(f"Timeline exists state {data.get('state')}")
|
||||
|
||||
wait_until(iterations, interval=0.250, func=timeline_is_missing)
|
||||
wait_until(iterations, interval, func=timeline_is_missing)
|
||||
|
||||
|
||||
def timeline_delete_wait_completed(
|
||||
@@ -212,10 +218,11 @@ def timeline_delete_wait_completed(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
iterations: int = 20,
|
||||
interval: Optional[float] = None,
|
||||
**delete_args,
|
||||
):
|
||||
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -225,6 +232,24 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
    """
    Assert that listing the remote storage under `prefix` returns no keys.

    The listing is obtained from `list_prefix`; on failure the assertion
    message includes the listed "Contents" to show what was left behind.
    """
    listing = list_prefix(neon_env_builder, prefix)
    leftovers = listing.get("Contents")
    failure = f"remote dir with prefix {prefix} is not empty after deletion: {leftovers}"
    assert listing["KeyCount"] == 0, failure
|
||||
|
||||
|
||||
def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
    """
    Assert that listing the remote storage under `prefix` returns at least one key.

    The listing is obtained from `list_prefix`; on failure the whole listing
    response is included in the assertion message.
    """
    listing = list_prefix(neon_env_builder, prefix)
    found_keys = listing["KeyCount"]
    assert found_keys != 0, f"remote dir with prefix {prefix} is empty: {listing}"
|
||||
|
||||
|
||||
def list_prefix(
|
||||
neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None
|
||||
) -> ListObjectsV2OutputTypeDef:
|
||||
"""
|
||||
Note that this function takes into account prefix_in_bucket.
|
||||
"""
|
||||
# For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
|
||||
assert neon_env_builder.remote_storage_kind in (
|
||||
RemoteStorageKind.MOCK_S3,
|
||||
@@ -234,15 +259,21 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
|
||||
assert isinstance(neon_env_builder.remote_storage, S3Storage)
|
||||
assert neon_env_builder.remote_storage_client is not None
|
||||
|
||||
prefix_in_bucket = neon_env_builder.remote_storage.prefix_in_bucket or ""
|
||||
if not prefix:
|
||||
prefix = prefix_in_bucket
|
||||
else:
|
||||
# real s3 tests have a unique per-test prefix
|
||||
# mock_s3 tests use special pageserver prefix for pageserver stuff
|
||||
prefix = "/".join((prefix_in_bucket, prefix))
|
||||
|
||||
# Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
|
||||
response = neon_env_builder.remote_storage_client.list_objects_v2(
|
||||
Delimiter="/",
|
||||
Bucket=neon_env_builder.remote_storage.bucket_name,
|
||||
Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
|
||||
Prefix=prefix,
|
||||
)
|
||||
objects = response.get("Contents")
|
||||
assert (
|
||||
response["KeyCount"] == 0
|
||||
), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
|
||||
return response
|
||||
|
||||
|
||||
def wait_tenant_status_404(
|
||||
@@ -284,4 +315,4 @@ MANY_SMALL_LAYERS_TENANT_CONFIG = {
|
||||
|
||||
|
||||
def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
    """
    Return the number of polling iterations to use for the given remote storage kind.

    REAL_S3 gets 40 iterations; every other kind gets 10.
    """
    return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 10
|
||||
|
||||
@@ -7,6 +7,9 @@ from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.types import TenantId, TimelineId
|
||||
|
||||
TIMELINE_INDEX_PART_FILE_NAME = "index_part.json"
|
||||
|
||||
|
||||
class MockS3Server:
|
||||
@@ -89,6 +92,19 @@ def available_s3_storages() -> List[RemoteStorageKind]:
|
||||
class LocalFsStorage:
|
||||
root: Path
|
||||
|
||||
def tenant_path(self, tenant_id: TenantId) -> Path:
    """Return ``<root>/tenants/<tenant_id>`` for this storage's `root`."""
    return self.root.joinpath("tenants", str(tenant_id))
|
||||
|
||||
def timeline_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
    """Return ``<tenant_path>/timelines/<timeline_id>`` for the given tenant and timeline."""
    tenant_root = self.tenant_path(tenant_id)
    return tenant_root.joinpath("timelines", str(timeline_id))
|
||||
|
||||
def index_path(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
    """Return the path of the timeline's index file (TIMELINE_INDEX_PART_FILE_NAME) inside its timeline directory."""
    timeline_dir = self.timeline_path(tenant_id, timeline_id)
    return timeline_dir / TIMELINE_INDEX_PART_FILE_NAME
|
||||
|
||||
def index_content(self, tenant_id: TenantId, timeline_id: TimelineId):
    """
    Load and return the parsed JSON content of the timeline's index file.

    The file location comes from `self.index_path(tenant_id, timeline_id)`.
    Raises whatever `open` / `json.load` raise if the file is missing or
    is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with self.index_path(tenant_id, timeline_id).open("r", encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
@dataclass
|
||||
class S3Storage:
|
||||
|
||||
@@ -394,13 +394,7 @@ def check_neon_works(
|
||||
test_output_dir / "dump-from-wal.filediff",
|
||||
)
|
||||
|
||||
# TODO: Run pg_amcheck unconditionally after the next release
|
||||
try:
|
||||
pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"])
|
||||
except subprocess.CalledProcessError:
|
||||
log.info("Extension amcheck is not available, skipping pg_amcheck")
|
||||
else:
|
||||
pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
|
||||
pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"])
|
||||
|
||||
# Check that we can interact with the data
|
||||
pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr])
|
||||
|
||||
@@ -265,16 +265,23 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy):
|
||||
def test_sql_over_http_batch(static_proxy: NeonProxy):
|
||||
static_proxy.safe_psql("create role http with login password 'http' superuser")
|
||||
|
||||
def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any:
|
||||
def qq(
|
||||
queries: List[Tuple[str, Optional[List[Any]]]],
|
||||
read_only: bool = False,
|
||||
deferrable: bool = False,
|
||||
) -> Any:
|
||||
connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
|
||||
response = requests.post(
|
||||
f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
|
||||
data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))),
|
||||
data=json.dumps(
|
||||
{"queries": list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))}
|
||||
),
|
||||
headers={
|
||||
"Content-Type": "application/sql",
|
||||
"Neon-Connection-String": connstr,
|
||||
"Neon-Batch-Isolation-Level": "Serializable",
|
||||
"Neon-Batch-Read-Only": "true" if read_only else "false",
|
||||
"Neon-Batch-Deferrable": "true" if deferrable else "false",
|
||||
},
|
||||
verify=str(static_proxy.test_output_dir / "proxy.crt"),
|
||||
)
|
||||
@@ -297,7 +304,8 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
|
||||
)
|
||||
|
||||
assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
|
||||
assert headers["Neon-Batch-Read-Only"] == "false"
|
||||
assert "Neon-Batch-Read-Only" not in headers
|
||||
assert "Neon-Batch-Deferrable" not in headers
|
||||
|
||||
assert result[0]["rows"] == [{"answer": 42}]
|
||||
assert result[1]["rows"] == [{"answer": "42"}]
|
||||
@@ -325,8 +333,57 @@ def test_sql_over_http_batch(static_proxy: NeonProxy):
|
||||
("select 42 as answer", None),
|
||||
],
|
||||
True,
|
||||
True,
|
||||
)
|
||||
assert headers["Neon-Batch-Isolation-Level"] == "Serializable"
|
||||
assert headers["Neon-Batch-Read-Only"] == "true"
|
||||
assert headers["Neon-Batch-Deferrable"] == "true"
|
||||
|
||||
assert result[0]["rows"] == [{"answer": 42}]
|
||||
|
||||
|
||||
def test_sql_over_http_pool(static_proxy: NeonProxy):
    """
    Exercise the opt-in connection pool of the SQL-over-HTTP endpoint.

    Checks that repeated requests with the same credentials report the same
    backend pid, that a wrong password is rejected with HTTP 400, and that
    after the password is changed the new password yields a different pid
    while the old password is rejected.
    """
    static_proxy.safe_psql("create user http_auth with password 'http' superuser")

    auth_failure = "password authentication failed for user"

    def get_pid(status: int, pw: str) -> Any:
        """POST the pid query as `http_auth` with password `pw`, assert the HTTP status, return the JSON body."""
        connstr = (
            f"postgresql://http_auth:{pw}@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
        )
        sql_url = f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql"
        payload = {
            "query": "SELECT pid FROM pg_stat_activity WHERE state = 'active'",
            "params": [],
        }
        request_headers = {
            "Content-Type": "application/sql",
            "Neon-Connection-String": connstr,
            "Neon-Pool-Opt-In": "true",
        }
        response = requests.post(
            sql_url,
            data=json.dumps(payload),
            headers=request_headers,
            verify=str(static_proxy.test_output_dir / "proxy.crt"),
        )
        assert response.status_code == status
        return response.json()

    first_body = get_pid(200, "http")
    pid1 = first_body["rows"][0]["pid"]

    # a second query with the same credentials should be on the same connection
    repeat_rows = get_pid(200, "http")["rows"]
    assert repeat_rows == [{"pid": pid1}]

    # an incorrect password should not work
    rejected = get_pid(400, "foobar")
    assert auth_failure in rejected["message"]

    static_proxy.safe_psql("alter user http_auth with password 'http2'")

    # after the password change, a new connection should be opened to verify it
    pid2 = get_pid(200, "http2")["rows"][0]["pid"]
    assert pid1 != pid2

    # the next query should be on an existing connection
    reused_pid = get_pid(200, "http2")["rows"][0]["pid"]
    assert reused_pid in [pid1, pid2]

    # the old password should not work any more
    rejected = get_pid(400, "http")
    assert auth_failure in rejected["message"]
|
||||
|
||||
@@ -24,6 +24,7 @@ from fixtures.pageserver.utils import (
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import (
|
||||
TIMELINE_INDEX_PART_FILE_NAME,
|
||||
LocalFsStorage,
|
||||
RemoteStorageKind,
|
||||
available_remote_storages,
|
||||
@@ -173,9 +174,7 @@ def test_remote_storage_backup_and_restore(
|
||||
#
|
||||
# The initiated attach operation should survive the restart, and continue from where it was.
|
||||
env.pageserver.stop()
|
||||
layer_download_failed_regex = (
|
||||
r"download.*[0-9A-F]+-[0-9A-F]+.*open a download stream for layer.*simulated failure"
|
||||
)
|
||||
layer_download_failed_regex = r"Failed to download a remote file: simulated failure of remote operation Download.*[0-9A-F]+-[0-9A-F]+"
|
||||
assert not env.pageserver.log_contains(
|
||||
layer_download_failed_regex
|
||||
), "we shouldn't have tried any layer downloads yet since list remote timelines has a failpoint"
|
||||
@@ -208,7 +207,7 @@ def test_remote_storage_backup_and_restore(
|
||||
== f"{data}|{checkpoint_number}"
|
||||
)
|
||||
|
||||
log.info("ensure that we neede to retry downloads due to test_remote_failures=1")
|
||||
log.info("ensure that we needed to retry downloads due to test_remote_failures=1")
|
||||
assert env.pageserver.log_contains(layer_download_failed_regex)
|
||||
|
||||
|
||||
@@ -271,7 +270,7 @@ def test_remote_storage_upload_queue_retries(
|
||||
f"""
|
||||
INSERT INTO foo (id, val)
|
||||
SELECT g, '{data}'
|
||||
FROM generate_series(1, 10000) g
|
||||
FROM generate_series(1, 20000) g
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET val = EXCLUDED.val
|
||||
""",
|
||||
@@ -372,7 +371,7 @@ def test_remote_storage_upload_queue_retries(
|
||||
log.info("restarting postgres to validate")
|
||||
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
|
||||
with endpoint.cursor() as cur:
|
||||
assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
|
||||
assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
@@ -420,7 +419,7 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
f"""
|
||||
INSERT INTO foo (id, val)
|
||||
SELECT g, '{data}'
|
||||
FROM generate_series(1, 10000) g
|
||||
FROM generate_series(1, 20000) g
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET val = EXCLUDED.val
|
||||
""",
|
||||
@@ -511,7 +510,7 @@ def test_remote_timeline_client_calls_started_metric(
|
||||
log.info("restarting postgres to validate")
|
||||
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
|
||||
with endpoint.cursor() as cur:
|
||||
assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 10000
|
||||
assert query_scalar(cur, "SELECT COUNT(*) FROM foo WHERE val = 'd'") == 20000
|
||||
|
||||
# ensure that we updated the calls_started download metric
|
||||
fetch_calls_started()
|
||||
@@ -609,15 +608,15 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
|
||||
".* ERROR .*Error processing HTTP request: InternalServerError\\(timeline is Stopping"
|
||||
)
|
||||
|
||||
timeline_delete_wait_completed(client, tenant_id, timeline_id)
|
||||
# Generous timeout, because currently deletions can get blocked waiting for compaction
|
||||
# This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed.
|
||||
timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1)
|
||||
|
||||
assert not timeline_path.exists()
|
||||
|
||||
# to please mypy
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
remote_timeline_path = (
|
||||
env.remote_storage.root / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
|
||||
)
|
||||
remote_timeline_path = env.remote_storage.timeline_path(tenant_id, timeline_id)
|
||||
|
||||
assert not list(remote_timeline_path.iterdir())
|
||||
|
||||
@@ -722,15 +721,14 @@ def test_empty_branch_remote_storage_upload_on_restart(
|
||||
# index upload is now hitting the failpoint, it should block the shutdown
|
||||
env.pageserver.stop(immediate=True)
|
||||
|
||||
timeline_path = (
|
||||
Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
|
||||
)
|
||||
|
||||
local_metadata = env.repo_dir / timeline_path / "metadata"
|
||||
local_metadata = env.timeline_dir(env.initial_tenant, new_branch_timeline_id) / "metadata"
|
||||
assert local_metadata.is_file()
|
||||
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
new_branch_on_remote_storage = env.remote_storage.root / timeline_path
|
||||
|
||||
new_branch_on_remote_storage = env.remote_storage.timeline_path(
|
||||
env.initial_tenant, new_branch_timeline_id
|
||||
)
|
||||
assert (
|
||||
not new_branch_on_remote_storage.exists()
|
||||
), "failpoint should had prohibited index_part.json upload"
|
||||
@@ -779,7 +777,7 @@ def test_empty_branch_remote_storage_upload_on_restart(
|
||||
assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
|
||||
|
||||
assert (
|
||||
new_branch_on_remote_storage / "index_part.json"
|
||||
new_branch_on_remote_storage / TIMELINE_INDEX_PART_FILE_NAME
|
||||
).is_file(), "uploads scheduled during initial load should had been awaited for"
|
||||
finally:
|
||||
create_thread.join()
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import enum
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
@@ -13,13 +15,18 @@ from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import (
|
||||
MANY_SMALL_LAYERS_TENANT_CONFIG,
|
||||
assert_prefix_empty,
|
||||
assert_prefix_not_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
tenant_delete_wait_completed,
|
||||
wait_tenant_status_404,
|
||||
wait_until_tenant_active,
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
|
||||
from fixtures.remote_storage import (
|
||||
RemoteStorageKind,
|
||||
available_remote_storages,
|
||||
available_s3_storages,
|
||||
)
|
||||
from fixtures.types import TenantId
|
||||
from fixtures.utils import run_pg_bench_small
|
||||
|
||||
@@ -32,6 +39,8 @@ def test_tenant_delete_smoke(
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
test_name="test_tenant_delete_smoke",
|
||||
@@ -62,6 +71,17 @@ def test_tenant_delete_smoke(
|
||||
run_pg_bench_small(pg_bin, endpoint.connstr())
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
|
||||
|
||||
if remote_storage_kind in available_s3_storages():
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
parent = timeline
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
@@ -71,7 +91,7 @@ def test_tenant_delete_smoke(
|
||||
tenant_path = env.tenant_dir(tenant_id=tenant_id)
|
||||
assert not tenant_path.exists()
|
||||
|
||||
if remote_storage_kind in [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]:
|
||||
if remote_storage_kind in available_s3_storages():
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
@@ -123,25 +143,35 @@ def combinations():
|
||||
|
||||
for remote_storage_kind in remotes:
|
||||
for delete_failpoint in FAILPOINTS:
|
||||
if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
|
||||
if remote_storage_kind is RemoteStorageKind.NOOP and delete_failpoint in (
|
||||
"timeline-delete-before-index-delete",
|
||||
):
|
||||
# the above failpoint are not relevant for config without remote storage
|
||||
continue
|
||||
|
||||
result.append((remote_storage_kind, delete_failpoint))
|
||||
# Simulate failures for only one type of remote storage
|
||||
# to avoid log pollution and make tests run faster
|
||||
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
|
||||
simulate_failures = True
|
||||
else:
|
||||
simulate_failures = False
|
||||
result.append((remote_storage_kind, delete_failpoint, simulate_failures))
|
||||
return result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
|
||||
@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
|
||||
@pytest.mark.parametrize("check", list(Check))
|
||||
def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
failpoint: str,
|
||||
simulate_failures: bool,
|
||||
check: Check,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
if simulate_failures:
|
||||
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
|
||||
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
|
||||
)
|
||||
@@ -177,6 +207,17 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
else:
|
||||
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
if remote_storage_kind in available_s3_storages():
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
ps_http.configure_failpoints((failpoint, "return"))
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
@@ -229,8 +270,12 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
|
||||
|
||||
# Check remote is impty
|
||||
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
|
||||
tenant_dir = env.tenant_dir(tenant_id)
|
||||
# Check local is empty
|
||||
assert not tenant_dir.exists()
|
||||
|
||||
# Check remote is empty
|
||||
if remote_storage_kind in available_s3_storages():
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
@@ -241,10 +286,118 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
),
|
||||
)
|
||||
|
||||
tenant_dir = env.tenant_dir(tenant_id)
|
||||
# Check local is empty
|
||||
assert not tenant_dir.exists()
|
||||
|
||||
# TODO resume deletion (https://github.com/neondatabase/neon/issues/5006)
|
||||
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
def test_deleted_tenant_ignored_on_attach(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
    pg_bin: PgBin,
):
    """
    A tenant whose deletion was interrupted must be refused on attach.

    The tenant deletion is made to fail at the
    "timeline-delete-before-index-delete" failpoint, which leaves the tenant
    in the Broken state. The pageserver is then restarted with an empty local
    tenants directory, and attaching the tenant is expected to raise
    PageserverApiException ("Tenant is marked as deleted on remote storage")
    without creating the tenant directory on disk.
    """
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=remote_storage_kind,
        test_name="test_deleted_tenant_ignored_on_attach",
    )

    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
    tenant_id = env.initial_tenant
    ps_http = env.pageserver.http_client()

    def check_remote_tenant_data_present():
        # Only checked for the storage kinds returned by available_s3_storages().
        if remote_storage_kind not in available_s3_storages():
            return
        assert_prefix_not_empty(
            neon_env_builder,
            prefix="/".join(("tenants", str(tenant_id))),
        )

    # create two timelines and put some data into each
    for timeline in ["first", "second"]:
        timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
            run_pg_bench_small(pg_bin, endpoint.connstr())
            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)

    # sanity check, data should be there
    check_remote_tenant_data_present()

    # failpoint before we remove index_part from s3
    failpoint = "timeline-delete-before-index-delete"
    ps_http.configure_failpoints((failpoint, "return"))

    expected_log_errors = (
        # allow errors caused by failpoints
        f".*failpoint: {failpoint}",
        # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
        ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
        # error from http response is also logged
        ".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
        '.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
    )
    env.pageserver.allowed_errors.extend(expected_log_errors)

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

    ps_http.tenant_delete(tenant_id)

    tenant_info = wait_until_tenant_state(
        pageserver_http=ps_http,
        tenant_id=tenant_id,
        expected_state="Broken",
        iterations=iterations,
    )

    # the interrupted deletion must not have emptied the remote prefix
    check_remote_tenant_data_present()

    reason = tenant_info["state"]["data"]["reason"]
    # failpoint may not be the only error in the stack
    assert reason.endswith(f"failpoint: {failpoint}"), reason

    # now we stop pageserver and remove local tenant state
    env.endpoints.stop_all()
    env.pageserver.stop()

    dir_to_clear = Path(env.repo_dir) / "tenants"
    shutil.rmtree(dir_to_clear)
    os.mkdir(dir_to_clear)

    env.pageserver.start()

    # now we call attach
    with pytest.raises(
        PageserverApiException, match="Tenant is marked as deleted on remote storage"
    ):
        ps_http.tenant_attach(tenant_id=tenant_id)

    # delete should be resumed (not yet)
    # wait_tenant_status_404(ps_http, tenant_id, iterations)

    # we shouldn't have created the tenant dir on disk
    tenant_path = env.tenant_dir(tenant_id=tenant_id)
    assert not tenant_path.exists()

    # the refused attach must leave the remote data in place
    check_remote_tenant_data_present()
|
||||
|
||||
|
||||
# TODO test concurrent deletions with "hang" failpoint
|
||||
# TODO test tenant delete continues after attach
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
@@ -225,10 +224,11 @@ def test_tenants_attached_after_download(
|
||||
# FIXME: test index_part.json getting downgraded from imaginary new version
|
||||
|
||||
|
||||
@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
|
||||
def test_tenant_redownloads_truncated_file_on_startup(
|
||||
neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
remote_storage_kind = RemoteStorageKind.LOCAL_FS
|
||||
|
||||
# since we now store the layer file length metadata, we notice on startup that a layer file is of wrong size, and proceed to redownload it.
|
||||
neon_env_builder.enable_remote_storage(
|
||||
remote_storage_kind=remote_storage_kind,
|
||||
@@ -237,6 +237,8 @@ def test_tenant_redownloads_truncated_file_on_startup(
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*removing local file .* because it has unexpected length.*"
|
||||
)
|
||||
@@ -279,7 +281,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
|
||||
(path, expected_size) = local_layer_truncated
|
||||
|
||||
# ensure the same size is found from the index_part.json
|
||||
index_part = local_fs_index_part(env, tenant_id, timeline_id)
|
||||
index_part = env.remote_storage.index_content(tenant_id, timeline_id)
|
||||
assert index_part["layer_metadata"][path.name]["file_size"] == expected_size
|
||||
|
||||
## Start the pageserver. It will notice that the file size doesn't match, and
|
||||
@@ -309,7 +311,7 @@ def test_tenant_redownloads_truncated_file_on_startup(
|
||||
assert os.stat(path).st_size == expected_size, "truncated layer should had been re-downloaded"
|
||||
|
||||
# the remote side of local_layer_truncated
|
||||
remote_layer_path = local_fs_index_part_path(env, tenant_id, timeline_id).parent / path.name
|
||||
remote_layer_path = env.remote_storage.timeline_path(tenant_id, timeline_id) / path.name
|
||||
|
||||
# if the upload ever was ongoing, this check would be racy, but at least one
|
||||
# extra http request has been made in between so assume it's enough delay
|
||||
@@ -334,27 +336,3 @@ def test_tenant_redownloads_truncated_file_on_startup(
|
||||
assert (
|
||||
os.stat(remote_layer_path).st_size == expected_size
|
||||
), "truncated file should not had been uploaded after next checkpoint"
|
||||
|
||||
|
||||
def local_fs_index_part(env, tenant_id, timeline_id):
|
||||
"""
|
||||
Return json.load parsed index_part.json of tenant and timeline from LOCAL_FS
|
||||
"""
|
||||
timeline_path = local_fs_index_part_path(env, tenant_id, timeline_id)
|
||||
with open(timeline_path, "r") as timeline_file:
|
||||
return json.load(timeline_file)
|
||||
|
||||
|
||||
def local_fs_index_part_path(env, tenant_id, timeline_id):
|
||||
"""
|
||||
Return path to the LOCAL_FS index_part.json of the tenant and timeline.
|
||||
"""
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
return (
|
||||
env.remote_storage.root
|
||||
/ "tenants"
|
||||
/ str(tenant_id)
|
||||
/ "timelines"
|
||||
/ str(timeline_id)
|
||||
/ "index_part.json"
|
||||
)
|
||||
|
||||
@@ -18,6 +18,7 @@ from fixtures.neon_fixtures import (
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_prefix_empty,
|
||||
assert_prefix_not_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
timeline_delete_wait_completed,
|
||||
wait_for_last_record_lsn,
|
||||
@@ -27,8 +28,10 @@ from fixtures.pageserver.utils import (
|
||||
wait_until_timeline_state,
|
||||
)
|
||||
from fixtures.remote_storage import (
|
||||
LocalFsStorage,
|
||||
RemoteStorageKind,
|
||||
available_remote_storages,
|
||||
available_s3_storages,
|
||||
)
|
||||
from fixtures.types import Lsn, TenantId, TimelineId
|
||||
from fixtures.utils import query_scalar, wait_until
|
||||
@@ -211,6 +214,19 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
else:
|
||||
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
|
||||
|
||||
if remote_storage_kind in available_s3_storages():
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(env.initial_tenant),
|
||||
"timelines",
|
||||
str(timeline_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}")
|
||||
# It appears when we stopped flush loop during deletion and then pageserver is stopped
|
||||
env.pageserver.allowed_errors.append(
|
||||
@@ -297,7 +313,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
|
||||
ps_http, env.initial_tenant, timeline_id, iterations=iterations
|
||||
)
|
||||
|
||||
# Check remote is impty
|
||||
# Check remote is empty
|
||||
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
|
||||
assert_prefix_empty(
|
||||
neon_env_builder,
|
||||
@@ -738,6 +754,19 @@ def test_timeline_delete_works_for_remote_smoke(
|
||||
|
||||
timeline_ids.append(timeline_id)
|
||||
|
||||
for timeline_id in timeline_ids:
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(env.initial_tenant),
|
||||
"timelines",
|
||||
str(timeline_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
for timeline_id in reversed(timeline_ids):
|
||||
# note that we need to finish previous deletion before scheduling next one
|
||||
# otherwise we can get an "HasChildren" error if deletion is not fast enough (real_s3)
|
||||
@@ -757,8 +786,65 @@ def test_timeline_delete_works_for_remote_smoke(
|
||||
|
||||
# for some reason the check above doesnt immediately take effect for the below.
|
||||
# Assume it is mock server inconsistency and check twice.
|
||||
wait_until(
|
||||
2,
|
||||
0.5,
|
||||
lambda: assert_prefix_empty(neon_env_builder),
|
||||
wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder))
|
||||
|
||||
|
||||
def test_delete_orphaned_objects(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
remote_storage_kind = RemoteStorageKind.LOCAL_FS
|
||||
neon_env_builder.enable_remote_storage(remote_storage_kind, "test_delete_orphaned_objects")
|
||||
|
||||
env = neon_env_builder.init_start(
|
||||
initial_tenant_conf={
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
"checkpoint_distance": f"{1024 ** 2}",
|
||||
"image_creation_threshold": "100",
|
||||
}
|
||||
)
|
||||
|
||||
assert isinstance(env.remote_storage, LocalFsStorage)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
timeline_id = env.neon_cli.create_timeline("delete")
|
||||
with env.endpoints.create_start("delete") as endpoint:
|
||||
# generate enough layers
|
||||
pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()])
|
||||
last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id)
|
||||
|
||||
# write orphaned file that is missing from the index
|
||||
remote_timeline_path = env.remote_storage.timeline_path(env.initial_tenant, timeline_id)
|
||||
orphans = [remote_timeline_path / f"orphan_{i}" for i in range(3)]
|
||||
for orphan in orphans:
|
||||
orphan.write_text("I shouldnt be there")
|
||||
|
||||
# trigger failpoint after orphaned file deletion to check that index_part is not deleted as well.
|
||||
failpoint = "timeline-delete-before-index-delete"
|
||||
ps_http.configure_failpoints((failpoint, "return"))
|
||||
|
||||
env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}")
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
|
||||
ps_http.timeline_delete(env.initial_tenant, timeline_id)
|
||||
timeline_info = wait_until_timeline_state(
|
||||
pageserver_http=ps_http,
|
||||
tenant_id=env.initial_tenant,
|
||||
timeline_id=timeline_id,
|
||||
expected_state="Broken",
|
||||
iterations=iterations,
|
||||
)
|
||||
|
||||
reason = timeline_info["state"]["Broken"]["reason"]
|
||||
assert reason.endswith(f"failpoint: {failpoint}"), reason
|
||||
|
||||
for orphan in orphans:
|
||||
assert not orphan.exists()
|
||||
assert env.pageserver.log_contains(
|
||||
f"deleting a file not referenced from index_part.json name={orphan.stem}"
|
||||
)
|
||||
|
||||
assert env.remote_storage.index_path(env.initial_tenant, timeline_id).exists()
|
||||
|
||||
@@ -543,8 +543,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
|
||||
last_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
|
||||
|
||||
for sk in env.safekeepers:
|
||||
# require WAL to be trimmed, so no more than one segment is left on disk
|
||||
target_size_mb = 16 * 1.5
|
||||
# require WAL to be trimmed, so no more than one segment is left
|
||||
# on disk
|
||||
# TODO: WAL removal uses persistent values and control
|
||||
# file is fsynced roughly once in a segment, so there is a small
|
||||
# chance that two segments are left on disk, not one. We can
|
||||
# force persist cf and have 16 instead of 32 here.
|
||||
target_size_mb = 32 * 1.5
|
||||
wait(
|
||||
partial(is_wal_trimmed, sk, tenant_id, timeline_id, target_size_mb),
|
||||
f"sk_id={sk.id} to trim WAL to {target_size_mb:.2f}MB",
|
||||
@@ -912,7 +917,7 @@ def test_start_replication_term(neon_env_builder: NeonEnvBuilder):
|
||||
assert "failed to acquire term 3" in str(excinfo.value)
|
||||
|
||||
|
||||
# Test auth on WAL service (postgres protocol) ports.
|
||||
# Test auth on all ports: WAL service (postgres protocol), WAL service tenant only and http.
|
||||
def test_sk_auth(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.auth_enabled = True
|
||||
env = neon_env_builder.init_start()
|
||||
@@ -946,6 +951,64 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder):
|
||||
with pytest.raises(psycopg2.OperationalError):
|
||||
connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only, password=full_token)
|
||||
|
||||
# Now test that auth on http/pg can be enabled separately.
|
||||
|
||||
# By default, neon_local enables auth on all services if auth is configured,
|
||||
# so http must require the token.
|
||||
sk_http_cli_noauth = sk.http_client()
|
||||
sk_http_cli_auth = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
|
||||
with pytest.raises(sk_http_cli_noauth.HTTPError, match="Forbidden|Unauthorized"):
|
||||
sk_http_cli_noauth.timeline_status(tenant_id, timeline_id)
|
||||
sk_http_cli_auth.timeline_status(tenant_id, timeline_id)
|
||||
|
||||
# now, disable auth on http
|
||||
sk.stop()
|
||||
sk.start(extra_opts=["--http-auth-public-key-path="])
|
||||
sk_http_cli_noauth.timeline_status(tenant_id, timeline_id) # must work without token
|
||||
# but pg should still require the token
|
||||
with pytest.raises(psycopg2.OperationalError):
|
||||
connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg)
|
||||
connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg, password=tenant_token)
|
||||
|
||||
# now also disable auth on pg, but leave on pg tenant only
|
||||
sk.stop()
|
||||
sk.start(extra_opts=["--http-auth-public-key-path=", "--pg-auth-public-key-path="])
|
||||
sk_http_cli_noauth.timeline_status(tenant_id, timeline_id) # must work without token
|
||||
connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg) # must work without token
|
||||
# but pg tenant only should still require the token
|
||||
with pytest.raises(psycopg2.OperationalError):
|
||||
connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only)
|
||||
connector.safe_psql("IDENTIFY_SYSTEM", port=sk.port.pg_tenant_only, password=tenant_token)
|
||||
|
||||
|
||||
# Try restarting endpoint with enabled auth.
|
||||
def test_restart_endpoint(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.auth_enabled = True
|
||||
neon_env_builder.num_safekeepers = 3
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
env.neon_cli.create_branch("test_sk_auth_restart_endpoint")
|
||||
endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint")
|
||||
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("create table t(i int)")
|
||||
|
||||
# Restarting endpoints and random safekeepers, to trigger recovery.
|
||||
for _i in range(3):
|
||||
random_sk = random.choice(env.safekeepers)
|
||||
random_sk.stop()
|
||||
|
||||
with closing(endpoint.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
start = random.randint(1, 100000)
|
||||
end = start + random.randint(1, 10000)
|
||||
cur.execute("insert into t select generate_series(%s,%s)", (start, end))
|
||||
|
||||
endpoint.stop()
|
||||
random_sk.start()
|
||||
endpoint.start()
|
||||
|
||||
|
||||
class SafekeeperEnv:
|
||||
def __init__(
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 28bf5ccfa2...5d5cfee127
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 553f2d3618...026d6b093d
4
vendor/revisions.json
vendored
4
vendor/revisions.json
vendored
@@ -1,4 +1,4 @@
|
||||
{
|
||||
"postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118",
|
||||
"postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055"
|
||||
"postgres-v15": "026d6b093d49e25cec44dd04598152329ceac027",
|
||||
"postgres-v14": "5d5cfee12783f0989a9c9fe13bb40b5585812568"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user