Mirror of https://github.com/neondatabase/neon.git (synced 2026-02-05 19:50:36 +00:00)

Compare commits: ephemerals...quantumish (38 commits)
Commits (SHA1):

cee8c10582, 362fa1af8f, 24e6c68772, 93a45708ff, 6a76bc63f9,
610ea22c46, 477648b8cd, bb1e359872, ac87544e79, b6b122e07b,
16d6898e44, 10b936bf03, 6145cfd1c2, 96b4de1de6, 9fdf5fbb7e,
f64eb0cbaf, 6ae4b89000, f7ec7668a2, 038e967daf, 6a43f23eca,
868f194a3b, 9c6c780201, 6123fe2d5e, 1577665c20, d8ebd1d771,
c8a96cf722, 56d505bce6, dae203ef69, 1fb1315aed, 838622c594,
3fd5a94a85, e7d6f525b3, e4ca3ac745, b69d103b90, 208cbd52d4,
c567ed0de0, c698cee19a, 4a3f32bf4a
129  Cargo.lock  (generated)
@@ -1086,6 +1086,25 @@ version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 
+[[package]]
+name = "cbindgen"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff"
+dependencies = [
+ "clap",
+ "heck 0.4.1",
+ "indexmap 2.9.0",
+ "log",
+ "proc-macro2",
+ "quote",
+ "serde",
+ "serde_json",
+ "syn 2.0.100",
+ "tempfile",
+ "toml",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.16"
@@ -1212,7 +1231,7 @@ version = "4.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "syn 2.0.100",
@@ -1270,6 +1289,14 @@ dependencies = [
  "unicode-width",
 ]
 
+[[package]]
+name = "communicator"
+version = "0.1.0"
+dependencies = [
+ "cbindgen",
+ "neon-shmem",
+]
+
 [[package]]
 name = "compute_api"
 version = "0.1.0"
@@ -1936,7 +1963,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc"
 dependencies = [
  "darling",
  "either",
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "syn 2.0.100",
@@ -2500,6 +2527,18 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasi 0.14.2+wasi-0.2.4",
+]
+
 [[package]]
 name = "gettid"
 version = "0.1.3"
@@ -2712,6 +2751,12 @@ dependencies = [
  "http 1.1.0",
 ]
 
+[[package]]
+name = "heck"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -3648,7 +3693,7 @@ version = "0.0.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "syn 2.0.100",
@@ -3710,7 +3755,7 @@ dependencies = [
  "procfs",
  "prometheus",
  "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
  "twox-hash",
 ]
 
@@ -3798,7 +3843,11 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 name = "neon-shmem"
 version = "0.1.0"
 dependencies = [
+ "criterion",
+ "nix 0.30.1",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
  "rustc-hash 1.1.0",
  "tempfile",
  "thiserror 1.0.69",
  "workspace_hack",
@@ -4237,6 +4286,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "async-trait",
+ "bytes",
  "camino",
  "clap",
  "futures",
@@ -5091,7 +5141,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
 dependencies = [
  "bytes",
- "heck",
+ "heck 0.5.0",
  "itertools 0.12.1",
  "log",
  "multimap",
@@ -5112,7 +5162,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
 dependencies = [
  "bytes",
- "heck",
+ "heck 0.5.0",
  "itertools 0.12.1",
  "log",
  "multimap",
@@ -5237,7 +5287,7 @@ dependencies = [
  "postgres_backend",
  "pq_proto",
  "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
  "rcgen",
  "redis",
  "regex",
@@ -5341,6 +5391,12 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "r-efi"
+version = "5.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5365,6 +5421,16 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "rand"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_chacha"
 version = "0.2.2"
@@ -5385,6 +5451,16 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_core"
 version = "0.5.1"
@@ -5403,6 +5479,15 @@ dependencies = [
  "getrandom 0.2.11",
 ]
 
+[[package]]
+name = "rand_core"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
+dependencies = [
+ "getrandom 0.3.3",
+]
+
 [[package]]
 name = "rand_distr"
 version = "0.4.3"
@@ -5413,6 +5498,16 @@ dependencies = [
  "rand 0.8.5",
 ]
 
+[[package]]
+name = "rand_distr"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
+dependencies = [
+ "num-traits",
+ "rand 0.9.1",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
@@ -6899,7 +6994,7 @@ version = "0.26.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
 dependencies = [
- "heck",
+ "heck 0.5.0",
  "proc-macro2",
  "quote",
  "rustversion",
@@ -8198,6 +8293,15 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
+[[package]]
+name = "wasi"
+version = "0.14.2+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
+dependencies = [
+ "wit-bindgen-rt",
+]
+
 [[package]]
 name = "wasite"
 version = "0.1.0"
@@ -8555,6 +8659,15 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "wit-bindgen-rt"
+version = "0.39.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "workspace_hack"
 version = "0.1.0"
Cargo.toml

@@ -44,6 +44,7 @@ members = [
     "libs/proxy/postgres-types2",
     "libs/proxy/tokio-postgres2",
     "endpoint_storage",
+    "pgxn/neon/communicator",
 ]
 
 [workspace.package]
@@ -251,6 +252,7 @@ desim = { version = "0.1", path = "./libs/desim" }
 endpoint_storage = { version = "0.0.1", path = "./endpoint_storage/" }
 http-utils = { version = "0.1", path = "./libs/http-utils/" }
 metrics = { version = "0.1", path = "./libs/metrics/" }
+neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
@@ -278,6 +280,7 @@ walproposer = { version = "0.1", path = "./libs/walproposer/" }
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 
 ## Build dependencies
+cbindgen = "0.28.0"
 criterion = "0.5.1"
 rcgen = "0.13"
 rstest = "0.18"
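The new communicator crate depends on cbindgen at build time, which generates a C header that the pgxn/neon extension sources can compile against. As a minimal sketch of how such a build script is typically wired up (the builder options and the output header name are assumptions, not the communicator crate's actual build.rs):

// build.rs -- a minimal sketch of typical cbindgen usage; the options and
// the "communicator.h" output name are assumptions, not the actual script.
fn main() {
    let crate_dir = std::env::var("CARGO_MANIFEST_DIR")
        .expect("CARGO_MANIFEST_DIR is set by cargo");

    cbindgen::Builder::new()
        .with_crate(&crate_dir)
        // Emit a plain C header for the extern "C" functions in the crate.
        .with_language(cbindgen::Language::C)
        .generate()
        .expect("unable to generate C bindings")
        .write_to_file("communicator.h");
}

With something like this in place, the Makefile change below only has to point the pgxn build at cargo's artifact directory (LIBCOMMUNICATOR_PATH) so the C extension can link the built library.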
7  Makefile
@@ -18,10 +18,12 @@ ifeq ($(BUILD_TYPE),release)
 	PG_LDFLAGS = $(LDFLAGS)
 	# Unfortunately, `--profile=...` is a nightly feature
 	CARGO_BUILD_FLAGS += --release
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
 	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
+	NEON_CARGO_ARTIFACT_TARGET_DIR = $(ROOT_PROJECT_DIR)/target/debug
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
 endif
@@ -180,11 +182,16 @@ postgres-check-%: postgres-%
 
 .PHONY: neon-pg-ext-%
 neon-pg-ext-%: postgres-%
+	@echo "Compiling communicator $*"
+	$(CARGO_CMD_PREFIX) cargo build -p communicator $(CARGO_BUILD_FLAGS)
 
+	@echo "Compiling neon $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
+		LIBCOMMUNICATOR_PATH=$(NEON_CARGO_ARTIFACT_TARGET_DIR) \
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile install
 
+	@echo "Compiling neon_walredo $*"
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$*
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config COPT='$(COPT)' \
compute/compute-node.Dockerfile

@@ -297,6 +297,7 @@ RUN ./autogen.sh && \
     ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
+    make staged-install && \
     cd extensions/postgis && \
     make clean && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -602,7 +603,7 @@ RUN case "${PG_VERSION:?}" in \
     ;; \
     esac && \
     wget https://github.com/knizhnik/online_advisor/archive/refs/tags/1.0.tar.gz -O online_advisor.tar.gz && \
-    echo "059b7d9e5a90013a58bdd22e9505b88406ce05790675eb2d8434e5b215652d54 online_advisor.tar.gz" | sha256sum --check && \
+    echo "37dcadf8f7cc8d6cc1f8831276ee245b44f1b0274f09e511e47a67738ba9ed0f online_advisor.tar.gz" | sha256sum --check && \
     mkdir online_advisor-src && cd online_advisor-src && tar xzf ../online_advisor.tar.gz --strip-components=1 -C .
 
 FROM pg-build AS online_advisor-build
@@ -1842,10 +1843,25 @@ RUN make PG_VERSION="${PG_VERSION:?}" -C compute
 
 FROM pg-build AS extension-tests
 ARG PG_VERSION
+# This is required for the PostGIS test
+RUN apt-get update && case $DEBIAN_VERSION in \
+    bullseye) \
+        apt-get install -y libproj19 libgdal28 time; \
+    ;; \
+    bookworm) \
+        apt-get install -y libgdal32 libproj25 time; \
+    ;; \
+    *) \
+        echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \
+    ;; \
+    esac
 
 COPY docker-compose/ext-src/ /ext-src/
 
 COPY --from=pg-build /postgres /postgres
+#COPY --from=postgis-src /ext-src/ /ext-src/
+COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=postgis-build /ext-src/postgis-src /ext-src/postgis-src
+COPY --from=postgis-build /sfcgal/* /usr
 COPY --from=plv8-src /ext-src/ /ext-src/
 COPY --from=h3-pg-src /ext-src/h3-pg-src /ext-src/h3-pg-src
 COPY --from=postgresql-unit-src /ext-src/ /ext-src/
@@ -1886,6 +1902,7 @@ COPY compute/patches/pg_repack.patch /ext-src
 RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /ext-src/pg_repack.patch
 
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
+RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
 RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
     && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
 ENV PATH=/usr/local/pgsql/bin:$PATH
121  compute/manifest.yaml  (new file)
@@ -0,0 +1,121 @@
pg_settings:
  # Common settings for primaries and replicas of all versions.
  common:
    # Check for client disconnection every 1 minute. By default, Postgres will detect the
    # loss of the connection only at the next interaction with the socket, when it waits
    # for, receives or sends data, so it will likely waste resources till the end of the
    # query execution. There should be no drawbacks in setting this for everyone, so enable
    # it by default. If anyone complains, we can allow editing it.
    # https://www.postgresql.org/docs/16/runtime-config-connection.html#GUC-CLIENT-CONNECTION-CHECK-INTERVAL
    client_connection_check_interval: "60000" # 1 minute

    # ---- IO ----
    effective_io_concurrency: "20"
    maintenance_io_concurrency: "100"

    fsync: "off"
    hot_standby: "off"

    # We allow users to change this if needed, but by default we
    # just don't want to see long-lasting idle transactions, as they
    # prevent the activity monitor from suspending projects.
    idle_in_transaction_session_timeout: "300000" # 5 minutes

    listen_addresses: "*"

    # ---- LOGGING ---- helps investigations
    log_connections: "on"
    log_disconnections: "on"
    # 1GB, unit is KB
    log_temp_files: "1048576"
    # Disable dumping customer data to logs, both to increase data privacy
    # and to reduce the amount of logs.
    log_error_verbosity: "terse"
    log_min_error_statement: "panic"

    max_connections: "100"

    # ---- WAL ----
    # - flush lag is the max amount of WAL that has been generated but not yet stored
    # to disk in the page server. A smaller value means less delay after a pageserver
    # restart, but if you set it too small you might again need to slow down writes if the
    # pageserver cannot flush incoming WAL to disk fast enough. This must be larger
    # than the pageserver's checkpoint interval, currently 1 GB! Otherwise you get a
    # deadlock where the compute node refuses to generate more WAL before the
    # old WAL has been uploaded to S3, but the pageserver is waiting for more WAL
    # to be generated before it is uploaded to S3.
    max_replication_flush_lag: "10GB"
    max_replication_slots: "10"
    # Backpressure configuration:
    # - write lag is the max amount of WAL that has been generated by Postgres but not yet
    # processed by the page server. Making this smaller reduces the worst case latency
    # of a GetPage request, if you request a page that was recently modified. On the other
    # hand, if this is too small, the compute node might need to wait on a write if there is a
    # hiccup in the network or page server so that the page server has temporarily fallen
    # behind.
    #
    # Previously it was set to 500 MB, but it caused the compute to become unresponsive under load:
    # https://github.com/neondatabase/neon/issues/2028
    max_replication_write_lag: "500MB"
    max_wal_senders: "10"
    # A Postgres checkpoint is cheap in storage, as it doesn't involve any significant amount
    # of real I/O. Only the SLRU buffers and some other small files are flushed to disk.
    # However, as long as we have full_page_writes=on, page updates after a checkpoint
    # include full-page images, which bloat the WAL. So we may want to bump max_wal_size to
    # reduce the WAL bloat, but at the same time it will increase the pg_wal directory size on
    # compute and can lead to out-of-disk errors on k8s nodes.
    max_wal_size: "1024"
    wal_keep_size: "0"
    wal_level: "replica"
    # Reduce amount of WAL generated by default.
    wal_log_hints: "off"
    # - without wal_sender_timeout set we don't get feedback messages,
    # required for backpressure.
    wal_sender_timeout: "10000"

    # We have some experimental extensions, which we don't want users to install unintentionally.
    # To install them, users would need to set the `neon.allow_unstable_extensions` setting.
    # There are two of them currently:
    # - `pgrag` - https://github.com/neondatabase-labs/pgrag - the extension is actually called just `rag`,
    #   and two dependencies:
    #   - `rag_bge_small_en_v15`
    #   - `rag_jina_reranker_v1_tiny_en`
    # - `pg_mooncake` - https://github.com/Mooncake-Labs/pg_mooncake/
    neon.unstable_extensions: "rag,rag_bge_small_en_v15,rag_jina_reranker_v1_tiny_en,pg_mooncake,anon"

    neon.protocol_version: "3"

    password_encryption: "scram-sha-256"

    # This is important to prevent Postgres from trying to perform
    # a local WAL redo after a backend crash. It should exit and let
    # systemd or k8s do a fresh startup with compute_ctl.
    restart_after_crash: "off"

    # By default 3. We have the following persistent connections in the VM:
    # * compute_activity_monitor (from compute_ctl)
    # * postgres-exporter (metrics collector; it has 2 connections)
    # * sql_exporter (metrics collector; we have 2 instances [1 for us & users; 1 for autoscaling])
    # * vm-monitor (to query & change file cache size)
    # i.e. a total of 6. Let's reserve 7, so there's still at least one left over.
    superuser_reserved_connections: "7"

    synchronous_standby_names: "walproposer"

  replica:
    hot_standby: "on"

  per_version:
    17:
      common:
        # PostgreSQL 17 has a new IO system called "read stream", which can combine IOs up to some
        # size. It still has some issues with readahead, though, so we default to disabled /
        # "no combining of IOs" to make sure we get the maximum prefetch depth.
        # See also: https://github.com/neondatabase/neon/pull/9860
        io_combine_limit: "1"
      replica:
        # prefetching of blocks referenced in WAL doesn't make sense for us;
        # Neon hot standby ignores pages that are not in shared_buffers
        recovery_prefetch: "off"
    16:
      common:
      replica:
        # prefetching of blocks referenced in WAL doesn't make sense for us;
        # Neon hot standby ignores pages that are not in shared_buffers
        recovery_prefetch: "off"
    15:
      common:
      replica:
        # prefetching of blocks referenced in WAL doesn't make sense for us;
        # Neon hot standby ignores pages that are not in shared_buffers
        recovery_prefetch: "off"
    14:
      common:
      replica:
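The manifest is layered: `common` applies everywhere, the per-version `common` block overrides it for one Postgres major version, and the `replica` blocks override both on hot standbys (e.g. `hot_standby: on`, `recovery_prefetch: off`). A minimal sketch of that merge order, assuming flat string-to-string maps rather than whatever types the manifest consumer actually uses:

use std::collections::HashMap;

/// A sketch of the merge order implied by manifest.yaml; the types and
/// function are illustrative assumptions, not the real implementation.
fn effective_settings(
    common: &HashMap<String, String>,
    version_common: &HashMap<String, String>,
    replica: Option<&HashMap<String, String>>,
) -> HashMap<String, String> {
    let mut out = common.clone();
    // Per-version settings override the shared defaults (e.g. io_combine_limit on v17).
    out.extend(version_common.iter().map(|(k, v)| (k.clone(), v.clone())));
    // Replica-only settings override both (only applied on hot standbys).
    if let Some(replica) = replica {
        out.extend(replica.iter().map(|(k, v)| (k.clone(), v.clone())));
    }
    out
}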
compute_tools/src/bin/compute_ctl.rs

@@ -40,7 +40,7 @@ use std::sync::mpsc;
 use std::thread;
 use std::time::Duration;
 
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, bail};
 use clap::Parser;
 use compute_api::responses::ComputeConfig;
 use compute_tools::compute::{
@@ -57,14 +57,14 @@ use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;
 
-#[derive(Parser)]
+#[derive(Debug, Parser)]
 #[command(rename_all = "kebab-case")]
 struct Cli {
     #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
     pub pgbin: String,
 
     /// The base URL for the remote extension storage proxy gateway.
-    #[arg(short = 'r', long)]
+    #[arg(short = 'r', long, value_parser = Self::parse_remote_ext_base_url)]
     pub remote_ext_base_url: Option<Url>,
 
     /// The port to bind the external listening HTTP server to. Clients running
@@ -126,6 +126,25 @@ struct Cli {
     pub installed_extensions_collection_interval: u64,
 }
 
+impl Cli {
+    /// Parse a URL from an argument. By default, this isn't necessary, but we
+    /// want to do some sanity checking.
+    fn parse_remote_ext_base_url(value: &str) -> Result<Url> {
+        // Remove extra trailing slashes, and add one. We use Url::join() later
+        // when downloading remote extensions. If the base URL is something like
+        // http://example.com/pg-ext-s3-gateway, and join() is called with
+        // something like "xyz", the resulting URL is http://example.com/xyz.
+        let value = value.trim_end_matches('/').to_owned() + "/";
+        let url = Url::parse(&value)?;
+
+        if url.query_pairs().count() != 0 {
+            bail!("parameters detected in remote extensions base URL")
+        }
+
+        Ok(url)
+    }
+}
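The trailing-slash handling matters because of how the url crate's Url::join resolves relative references. A standalone illustration of the two cases the comment above describes (plain url crate behavior, runnable on its own):

use url::Url;

fn main() {
    // Without a trailing slash, join() treats the last path segment as a
    // file name and replaces it:
    let base = Url::parse("http://example.com/pg-ext-s3-gateway").unwrap();
    assert_eq!(base.join("xyz").unwrap().as_str(), "http://example.com/xyz");

    // With the slash that parse_remote_ext_base_url() appends, join()
    // appends the new segment instead:
    let base = Url::parse("http://example.com/pg-ext-s3-gateway/").unwrap();
    assert_eq!(
        base.join("xyz").unwrap().as_str(),
        "http://example.com/pg-ext-s3-gateway/xyz"
    );
}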
 fn main() -> Result<()> {
     let cli = Cli::parse();
 
@@ -252,7 +271,8 @@ fn handle_exit_signal(sig: i32) {
 
 #[cfg(test)]
 mod test {
-    use clap::CommandFactory;
+    use clap::{CommandFactory, Parser};
+    use url::Url;
 
     use super::Cli;
 
@@ -260,4 +280,43 @@ mod test {
     fn verify_cli() {
         Cli::command().debug_assert()
     }
+
+    #[test]
+    fn verify_remote_ext_base_url() {
+        let cli = Cli::parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--remote-ext-base-url",
+            "https://example.com/subpath",
+        ]);
+        assert_eq!(
+            cli.remote_ext_base_url.unwrap(),
+            Url::parse("https://example.com/subpath/").unwrap()
+        );
+
+        let cli = Cli::parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--remote-ext-base-url",
+            "https://example.com//",
+        ]);
+        assert_eq!(
+            cli.remote_ext_base_url.unwrap(),
+            Url::parse("https://example.com").unwrap()
+        );
+
+        Cli::try_parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--remote-ext-base-url",
+            "https://example.com?hello=world",
+        ])
+        .expect_err("URL parameters are not allowed");
+    }
 }
compute_tools/src/compute.rs

@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
     ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
-    LfcPrewarmState,
+    LfcPrewarmState, TlsConfig,
 };
 use compute_api::spec::{
     ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
@@ -396,7 +396,7 @@ impl ComputeNode {
         // because QEMU will already have its memory allocated from the host, and
         // the necessary binaries will already be cached.
         if cli_spec.is_none() {
-            this.prewarm_postgres()?;
+            this.prewarm_postgres_vm_memory()?;
         }
 
         // Set the up metric with Empty status before starting the HTTP server.
@@ -603,6 +603,8 @@ impl ComputeNode {
             });
         }
 
+        let tls_config = self.tls_config(&pspec.spec);
+
         // If there are any remote extensions in shared_preload_libraries, start downloading them
         if pspec.spec.remote_extensions.is_some() {
             let (this, spec) = (self.clone(), pspec.spec.clone());
@@ -659,7 +661,7 @@ impl ComputeNode {
             info!("tuning pgbouncer");
 
             let pgbouncer_settings = pgbouncer_settings.clone();
-            let tls_config = self.compute_ctl_config.tls.clone();
+            let tls_config = tls_config.clone();
 
             // Spawn a background task to do the tuning,
             // so that we don't block the main thread that starts Postgres.
@@ -678,7 +680,10 @@ impl ComputeNode {
 
             // Spawn a background task to do the configuration,
             // so that we don't block the main thread that starts Postgres.
-            let local_proxy = local_proxy.clone();
+            let mut local_proxy = local_proxy.clone();
+            local_proxy.tls = tls_config.clone();
+
             let _handle = tokio::spawn(async move {
                 if let Err(err) = local_proxy::configure(&local_proxy) {
                     error!("error while configuring local_proxy: {err:?}");
@@ -779,7 +784,7 @@ impl ComputeNode {
         // Spawn the extension stats background task
         self.spawn_extension_stats_task();
 
-        if pspec.spec.prewarm_lfc_on_startup {
+        if pspec.spec.autoprewarm {
             self.prewarm_lfc();
         }
         Ok(())
@@ -1205,13 +1210,15 @@ impl ComputeNode {
         let spec = &pspec.spec;
         let pgdata_path = Path::new(&self.params.pgdata);
 
+        let tls_config = self.tls_config(&pspec.spec);
+
         // Remove/create an empty pgdata directory and put configuration there.
         self.create_pgdata()?;
         config::write_postgres_conf(
             pgdata_path,
             &pspec.spec,
             self.params.internal_http_port,
-            &self.compute_ctl_config.tls,
+            tls_config,
         )?;
 
         // Syncing safekeepers is only safe with primary nodes: if a primary
@@ -1307,8 +1314,8 @@ impl ComputeNode {
     }
 
     /// Start and stop a postgres process to warm up the VM for startup.
-    pub fn prewarm_postgres(&self) -> Result<()> {
-        info!("prewarming");
+    pub fn prewarm_postgres_vm_memory(&self) -> Result<()> {
+        info!("prewarming VM memory");
 
         // Create pgdata
         let pgdata = &format!("{}.warmup", self.params.pgdata);
@@ -1350,7 +1357,7 @@ impl ComputeNode {
         kill(pm_pid, Signal::SIGQUIT)?;
         info!("sent SIGQUIT signal");
         pg.wait()?;
-        info!("done prewarming");
+        info!("done prewarming vm memory");
 
         // clean up
         let _ok = fs::remove_dir_all(pgdata);
@@ -1536,14 +1543,22 @@ impl ComputeNode {
                 .clone(),
         );
 
+        let mut tls_config = None::<TlsConfig>;
+        if spec.features.contains(&ComputeFeature::TlsExperimental) {
+            tls_config = self.compute_ctl_config.tls.clone();
+        }
+
         let max_concurrent_connections = self.max_service_connections(compute_state, &spec);
 
         // Merge-apply spec & changes to PostgreSQL state.
         self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?;
 
         if let Some(local_proxy) = &spec.clone().local_proxy_config {
+            let mut local_proxy = local_proxy.clone();
+            local_proxy.tls = tls_config.clone();
+
             info!("configuring local_proxy");
-            local_proxy::configure(local_proxy).context("apply_config local_proxy")?;
+            local_proxy::configure(&local_proxy).context("apply_config local_proxy")?;
         }
 
         // Run migrations separately to not hold up cold starts
@@ -1595,11 +1610,13 @@ impl ComputeNode {
     pub fn reconfigure(&self) -> Result<()> {
         let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;
 
+        let tls_config = self.tls_config(&spec);
+
         if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
             info!("tuning pgbouncer");
 
             let pgbouncer_settings = pgbouncer_settings.clone();
-            let tls_config = self.compute_ctl_config.tls.clone();
+            let tls_config = tls_config.clone();
 
             // Spawn a background task to do the tuning,
             // so that we don't block the main thread that starts Postgres.
@@ -1617,7 +1634,7 @@ impl ComputeNode {
             // Spawn a background task to do the configuration,
             // so that we don't block the main thread that starts Postgres.
             let mut local_proxy = local_proxy.clone();
-            local_proxy.tls = self.compute_ctl_config.tls.clone();
+            local_proxy.tls = tls_config.clone();
             tokio::spawn(async move {
                 if let Err(err) = local_proxy::configure(&local_proxy) {
                     error!("error while configuring local_proxy: {err:?}");
@@ -1635,7 +1652,7 @@ impl ComputeNode {
             pgdata_path,
             &spec,
             self.params.internal_http_port,
-            &self.compute_ctl_config.tls,
+            tls_config,
         )?;
 
         if !spec.skip_pg_catalog_updates {
@@ -1755,6 +1772,14 @@ impl ComputeNode {
         }
     }
 
+    pub fn tls_config(&self, spec: &ComputeSpec) -> &Option<TlsConfig> {
+        if spec.features.contains(&ComputeFeature::TlsExperimental) {
+            &self.compute_ctl_config.tls
+        } else {
+            &None::<TlsConfig>
+        }
+    }
+
     /// Update the `last_active` in the shared state, but ensure that it's a more recent one.
     pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
         let mut state = self.state.lock().unwrap();
compute_tools/src/extension_server.rs

@@ -166,7 +166,7 @@ pub async fn download_extension(
 
     // TODO add retry logic
     let download_buffer =
-        match download_extension_tar(remote_ext_base_url.as_str(), &ext_path.to_string()).await {
+        match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await {
             Ok(buffer) => buffer,
             Err(error_message) => {
                 return Err(anyhow::anyhow!(
@@ -271,10 +271,14 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 }
 
 // Do request to extension storage proxy, e.g.,
-// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
+// curl http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local/latest/v15/extensions/anon.tar.zst
 // using HTTP GET and return the response body as bytes.
-async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result<Bytes> {
-    let uri = format!("{}/{}", remote_ext_base_url, ext_path);
+async fn download_extension_tar(remote_ext_base_url: &Url, ext_path: &str) -> Result<Bytes> {
+    let uri = remote_ext_base_url.join(ext_path).with_context(|| {
+        format!(
+            "failed to create the remote extension URI for {ext_path} using {remote_ext_base_url}"
+        )
+    })?;
     let filename = Path::new(ext_path)
         .file_name()
         .unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
@@ -284,7 +288,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
 
     info!("Downloading extension file '{}' from uri {}", filename, uri);
 
-    match do_extension_server_request(&uri).await {
+    match do_extension_server_request(uri).await {
         Ok(resp) => {
             info!("Successfully downloaded remote extension data {}", ext_path);
             REMOTE_EXT_REQUESTS_TOTAL
@@ -303,7 +307,7 @@ async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Re
 
 // Do a single remote extensions server request.
 // Return result or (error message + stringified status code) in case of any failures.
-async fn do_extension_server_request(uri: &str) -> Result<Bytes, (String, String)> {
+async fn do_extension_server_request(uri: Url) -> Result<Bytes, (String, String)> {
     let resp = reqwest::get(uri).await.map_err(|e| {
         (
             format!(
@@ -48,11 +48,9 @@ impl JsonResponse {
 
     /// Create an error response related to the compute being in an invalid state
     pub(self) fn invalid_status(status: ComputeStatus) -> Response {
-        Self::create_response(
+        Self::error(
             StatusCode::PRECONDITION_FAILED,
-            &GenericAPIError {
-                error: format!("invalid compute status: {status}"),
-            },
+            format!("invalid compute status: {status}"),
         )
     }
 }
@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
     State(compute): State<Arc<ComputeNode>>,
     request: Json<ConfigurationRequest>,
 ) -> Response {
-    let pspec = match ParsedSpec::try_from(request.spec.clone()) {
+    let pspec = match ParsedSpec::try_from(request.0.spec) {
         Ok(p) => p,
         Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
     };
compute_tools/src/monitor.rs

@@ -13,6 +13,12 @@ use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};
 
 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 
+/// Struct to store runtime state of the compute monitor thread.
+/// In theory, this could be a part of `Compute`, but i)
+/// this state is expected to be accessed only by single thread,
+/// so we don't need to care about locking; ii) `Compute` is
+/// already quite big. Thus, it seems to be a good idea to keep
+/// all the activity/health monitoring parts here.
 struct ComputeMonitor {
     compute: Arc<ComputeNode>,
 
@@ -70,12 +76,36 @@ impl ComputeMonitor {
         )
     }
 
+    /// Check if compute is in some terminal or soon-to-be-terminal
+    /// state, then return `true`, signalling the caller that it
+    /// should exit gracefully. Otherwise, return `false`.
+    fn check_interrupts(&mut self) -> bool {
+        let compute_status = self.compute.get_status();
+        if matches!(
+            compute_status,
+            ComputeStatus::Terminated | ComputeStatus::TerminationPending | ComputeStatus::Failed
+        ) {
+            info!(
+                "compute is in {} status, stopping compute monitor",
+                compute_status
+            );
+            return true;
+        }
+
+        false
+    }
+
     /// Spin in a loop and figure out the last activity time in the Postgres.
-    /// Then update it in the shared state. This function never errors out.
+    /// Then update it in the shared state. This function currently never
+    /// errors out explicitly, but there is a graceful termination path.
+    /// Every time we receive an error trying to check Postgres, we use
+    /// [`ComputeMonitor::check_interrupts()`] because it could be that
+    /// compute is being terminated already, then we can exit gracefully
+    /// to not produce errors' noise in the log.
     /// NB: the only expected panic is at `Mutex` unwrap(), all other errors
     /// should be handled gracefully.
     #[instrument(skip_all)]
-    pub fn run(&mut self) {
+    pub fn run(&mut self) -> anyhow::Result<()> {
         // Suppose that `connstr` doesn't change
         let connstr = self.compute.params.connstr.clone();
         let conf = self
@@ -93,6 +123,10 @@ impl ComputeMonitor {
         info!("starting compute monitor for {}", connstr);
 
         loop {
+            if self.check_interrupts() {
+                break;
+            }
+
             match &mut client {
                 Ok(cli) => {
                     if cli.is_closed() {
@@ -100,6 +134,10 @@ impl ComputeMonitor {
                             downtime_info = self.downtime_info(),
                             "connection to Postgres is closed, trying to reconnect"
                         );
+                        if self.check_interrupts() {
+                            break;
+                        }
+
                         self.report_down();
 
                         // Connection is closed, reconnect and try again.
@@ -111,15 +149,19 @@ impl ComputeMonitor {
                         self.compute.update_last_active(self.last_active);
                     }
                     Err(e) => {
-                        error!(
-                            downtime_info = self.downtime_info(),
-                            "could not check Postgres: {}", e
-                        );
+                        if self.check_interrupts() {
+                            break;
+                        }
+
                         // Although we have many places where we can return errors in `check()`,
                         // normally it shouldn't happen. I.e., we will likely return error if
                         // connection got broken, query timed out, Postgres returned invalid data, etc.
                         // In all such cases it's suspicious, so let's report this as downtime.
                         self.report_down();
+                        error!(
+                            downtime_info = self.downtime_info(),
+                            "could not check Postgres: {}", e
+                        );
 
                         // Reconnect to Postgres just in case. During tests, I noticed
                         // that queries in `check()` can fail with `connection closed`,
@@ -136,6 +178,10 @@ impl ComputeMonitor {
                             downtime_info = self.downtime_info(),
                             "could not connect to Postgres: {}, retrying", e
                         );
+                        if self.check_interrupts() {
+                            break;
+                        }
+
                         self.report_down();
 
                         // Establish a new connection and try again.
@@ -147,6 +193,9 @@ impl ComputeMonitor {
             self.last_checked = Utc::now();
             thread::sleep(MONITOR_CHECK_INTERVAL);
         }
+
+        // Graceful termination path
+        Ok(())
     }
 
     #[instrument(skip_all)]
@@ -429,7 +478,10 @@ pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
         .spawn(move || {
            let span = span!(Level::INFO, "compute_monitor");
            let _enter = span.enter();
-            monitor.run();
+            match monitor.run() {
+                Ok(_) => info!("compute monitor thread terminated gracefully"),
+                Err(err) => error!("compute monitor thread terminated abnormally {:?}", err),
+            }
         })
         .expect("cannot launch compute monitor thread")
 }
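Taken together, the monitor changes above implement one pattern: a polling loop whose every retry path first asks whether the compute is already shutting down, and which now reports how it exited instead of returning silently. A distilled, self-contained sketch of that shape, using an AtomicBool as an illustrative stand-in for the real ComputeStatus checks (anyhow assumed as a dependency):

use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::{thread, time::Duration};

// Illustrative stand-in for check_interrupts(); not the real types.
fn run(shutting_down: Arc<AtomicBool>) -> anyhow::Result<()> {
    loop {
        // Check for a terminal state before every unit of work, so a failing
        // check never spams the log of a compute that is simply terminating.
        if shutting_down.load(Ordering::Relaxed) {
            break;
        }
        // ... poll Postgres here, report downtime on errors ...
        thread::sleep(Duration::from_millis(500));
    }
    // Graceful termination path, logged by the caller at join time.
    Ok(())
}

fn main() {
    let flag = Arc::new(AtomicBool::new(false));
    let handle = {
        let flag = flag.clone();
        thread::spawn(move || match run(flag) {
            Ok(_) => println!("monitor terminated gracefully"),
            Err(err) => eprintln!("monitor terminated abnormally: {err:?}"),
        })
    };
    flag.store(true, Ordering::Relaxed);
    handle.join().unwrap();
}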
@@ -30,7 +30,7 @@ mod pg_helpers_tests {
         r#"fsync = off
 wal_level = logical
 hot_standby = on
-prewarm_lfc_on_startup = off
+autoprewarm = off
 neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
 wal_log_hints = on
 log_connections = on
control_plane/src/endpoint.rs

@@ -747,7 +747,7 @@ impl Endpoint {
             logs_export_host: None::<String>,
             endpoint_storage_addr: Some(endpoint_storage_addr),
             endpoint_storage_token: Some(endpoint_storage_token),
-            prewarm_lfc_on_startup: false,
+            autoprewarm: false,
         };
 
         // this strange code is needed to support respec() in tests
control_plane/src/pageserver.rs

@@ -513,11 +513,6 @@
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'timeline_offloading' as bool")?,
-            wal_receiver_protocol_override: settings
-                .remove("wal_receiver_protocol_override")
-                .map(serde_json::from_str)
-                .transpose()
-                .context("parse `wal_receiver_protocol_override` from json")?,
             rel_size_v2_enabled: settings
                 .remove("rel_size_v2_enabled")
                 .map(|x| x.parse::<bool>())
@@ -13,6 +13,6 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
     jq \
     netcat-openbsd
 #This is required for the pg_hintplan test
-RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw
+RUN mkdir -p /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src/ && chown postgres /ext-src/pg_hint_plan-src /postgres/contrib/file_fdw /ext-src/postgis-src
 
 USER postgres
@@ -1,18 +1,18 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -eux
 
 # Generate a random tenant or timeline ID
 #
 # Takes a variable name as argument. The result is stored in that variable.
 generate_id() {
-    local -n resvar=$1
-    printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
+    local -n resvar=${1}
+    printf -v resvar '%08x%08x%08x%08x' ${SRANDOM} ${SRANDOM} ${SRANDOM} ${SRANDOM}
 }
 
 PG_VERSION=${PG_VERSION:-14}
 
-CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
-CONFIG_FILE=/tmp/config.json
+readonly CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
+readonly CONFIG_FILE=/tmp/config.json
 
 # Test that the first library path that the dynamic loader looks in is the path
 # that we use for custom compiled software
@@ -20,17 +20,17 @@ first_path="$(ldconfig --verbose 2>/dev/null \
     | grep --invert-match ^$'\t' \
     | cut --delimiter=: --fields=1 \
     | head --lines=1)"
-test "$first_path" == '/usr/local/lib'
+test "${first_path}" = '/usr/local/lib'
 
 echo "Waiting pageserver become ready."
 while ! nc -z pageserver 6400; do
-    sleep 1;
+    sleep 1
 done
 echo "Page server is ready."
 
-cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
+cp "${CONFIG_FILE_ORG}" "${CONFIG_FILE}"
 
-if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
+if [[ -n "${TENANT_ID:-}" && -n "${TIMELINE_ID:-}" ]]; then
     tenant_id=${TENANT_ID}
     timeline_id=${TIMELINE_ID}
 else
@@ -41,7 +41,7 @@ else
         "http://pageserver:9898/v1/tenant"
     )
     tenant_id=$(curl "${PARAMS[@]}" | jq -r .[0].id)
-    if [ -z "${tenant_id}" ] || [ "${tenant_id}" = null ]; then
+    if [[ -z "${tenant_id}" || "${tenant_id}" = null ]]; then
         echo "Create a tenant"
         generate_id tenant_id
         PARAMS=(
@@ -51,7 +51,7 @@ else
         "http://pageserver:9898/v1/tenant/${tenant_id}/location_config"
     )
     result=$(curl "${PARAMS[@]}")
-    echo $result | jq .
+    printf '%s\n' "${result}" | jq .
     fi
 
     echo "Check if a timeline present"
@@ -61,7 +61,7 @@ else
        "http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
    )
    timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
-    if [ -z "${timeline_id}" ] || [ "${timeline_id}" = null ]; then
+    if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
        generate_id timeline_id
        PARAMS=(
            -sbf
@@ -71,7 +71,7 @@ else
        "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/"
    )
    result=$(curl "${PARAMS[@]}")
-    echo $result | jq .
+    printf '%s\n' "${result}" | jq .
    fi
 fi
 
@@ -82,10 +82,10 @@ else
 fi
 echo "Adding pgx_ulid"
 shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
-sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
+sed -i "s|${shared_libraries}|${shared_libraries},${ulid_extension}|" ${CONFIG_FILE}
 echo "Overwrite tenant id and timeline id in spec file"
-sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
-sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
+sed -i "s|TENANT_ID|${tenant_id}|" ${CONFIG_FILE}
+sed -i "s|TIMELINE_ID|${timeline_id}|" ${CONFIG_FILE}
 
 cat ${CONFIG_FILE}
 
@@ -93,5 +93,5 @@ echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
     -C "postgresql://cloud_admin@localhost:55433/postgres" \
     -b /usr/local/bin/postgres \
-    --compute-id "compute-$RANDOM" \
-    --config "$CONFIG_FILE"
+    --compute-id "compute-${RANDOM}" \
+    --config "${CONFIG_FILE}"
@@ -186,13 +186,14 @@ services:
 
   neon-test-extensions:
     profiles: ["test-extensions"]
-    image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
+    image: ${REPOSITORY:-ghcr.io/neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-${PG_VERSION:-16}}:${TEST_EXTENSIONS_TAG:-${TAG:-latest}}
     environment:
-      - PGPASSWORD=cloud_admin
+      - PGUSER=${PGUSER:-cloud_admin}
+      - PGPASSWORD=${PGPASSWORD:-cloud_admin}
     entrypoint:
       - "/bin/bash"
       - "-c"
     command:
-      - sleep 1800
+      - sleep 3600
     depends_on:
      - compute
@@ -54,6 +54,15 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
     # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
     echo Adding dummy config
     docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
+    # Prepare for the PostGIS test
+    docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
+    TMPDIR=$(mktemp -d)
+    docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}"
+    docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}"
+    docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
+    docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test
+    docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress
+    rm -rf "${TMPDIR}"
     # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
     TMPDIR=$(mktemp -d)
     docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
@@ -68,7 +77,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
     docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
     # We are running tests now
     rm -f testout.txt testout_contrib.txt
-    docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
+    docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
       neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
     docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
       neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
70  docker-compose/ext-src/postgis-src/README-Neon.md  (new file)
@@ -0,0 +1,70 @@
# PostGIS Testing in Neon

This directory contains configuration files and patches for running PostGIS tests in the Neon database environment.

## Overview

PostGIS is a spatial database extension for PostgreSQL that adds support for geographic objects. Testing PostGIS ensures that Neon's modifications to PostgreSQL don't break compatibility with this critical extension.

## PostGIS Versions

- PostgreSQL v17: PostGIS 3.5.0
- PostgreSQL v14/v15/v16: PostGIS 3.3.3

## Test Configuration

The test setup includes:

- `postgis-no-upgrade-test.patch`: Disables upgrade tests by removing the upgrade test section from regress/runtest.mk
- `postgis-regular-v16.patch`: Version-specific patch for PostgreSQL v16
- `postgis-regular-v17.patch`: Version-specific patch for PostgreSQL v17
- `regular-test.sh`: Script to run PostGIS tests as a regular user
- `neon-test.sh`: Script to handle version-specific test configurations
- `raster_outdb_template.sql`: Template for raster tests with explicit file paths

## Excluded Tests

**Important Note:** The test exclusions listed below are specifically for regular-user tests against staging instances. These exclusions are necessary because staging instances run with limited privileges and cannot perform operations requiring superuser access. Docker-compose based tests are not affected by these exclusions.

### Tests Requiring Superuser Permissions

These tests cannot be run as a regular user:

- `estimatedextent`
- `regress/core/legacy`
- `regress/core/typmod`
- `regress/loader/TestSkipANALYZE`
- `regress/loader/TestANALYZE`

### Tests Requiring Filesystem Access

These tests need direct filesystem access that is only possible for superusers:

- `loader/load_outdb`

### Tests with Flaky Results

These tests make assumptions that don't always hold:

- `regress/core/computed_columns` - Assumes computed columns always outperform alternatives, which is not consistently true

### Tests Requiring Tunable Parameter Modifications

These tests attempt to modify the `postgis.gdal_enabled_drivers` parameter, which is only accessible to superusers:

- `raster/test/regress/rt_wkb`
- `raster/test/regress/rt_addband`
- `raster/test/regress/rt_setbandpath`
- `raster/test/regress/rt_fromgdalraster`
- `raster/test/regress/rt_asgdalraster`
- `raster/test/regress/rt_astiff`
- `raster/test/regress/rt_asjpeg`
- `raster/test/regress/rt_aspng`
- `raster/test/regress/permitted_gdal_drivers`
- Loader tests: `BasicOutDB`, `Tiled10x10`, `Tiled10x10Copy`, `Tiled8x8`, `TiledAuto`, `TiledAutoSkipNoData`, `TiledAutoCopyn`

### Topology Tests (v17 only)

- `populate_topology_layer`
- `renametopogeometrycolumn`

## Other Modifications

- Binary.sql tests are modified to use explicit file paths
- Server-side SQL COPY commands (which require superuser privileges) are converted to client-side `\copy` commands
- Upgrade tests are disabled
6  docker-compose/ext-src/postgis-src/neon-test.sh  (new executable file)
@@ -0,0 +1,6 @@
#!/bin/sh
set -ex
cd "$(dirname "$0")"
patch -p1 <"postgis-common-${PG_VERSION}.patch"
trap 'echo Cleaning up; patch -R -p1 <postgis-common-${PG_VERSION}.patch' EXIT
make installcheck-base
37  docker-compose/ext-src/postgis-src/postgis-common-v16.patch  (new file)
@@ -0,0 +1,37 @@
|
||||
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
|
||||
index 3abd7bc..64a9254 100644
|
||||
--- a/regress/core/tests.mk
|
||||
+++ b/regress/core/tests.mk
|
||||
@@ -144,11 +144,6 @@ TESTS_SLOW = \
|
||||
$(top_srcdir)/regress/core/concave_hull_hard \
|
||||
$(top_srcdir)/regress/core/knn_recheck
|
||||
|
||||
-ifeq ($(shell expr "$(POSTGIS_PGSQL_VERSION)" ">=" 120),1)
|
||||
- TESTS += \
|
||||
- $(top_srcdir)/regress/core/computed_columns
|
||||
-endif
|
||||
-
|
||||
ifeq ($(shell expr "$(POSTGIS_GEOS_VERSION)" ">=" 30700),1)
|
||||
# GEOS-3.7 adds:
|
||||
# ST_FrechetDistance
|
||||
diff --git a/regress/runtest.mk b/regress/runtest.mk
|
||||
index c051f03..010e493 100644
|
||||
--- a/regress/runtest.mk
|
||||
+++ b/regress/runtest.mk
|
||||
@@ -24,16 +24,6 @@ check-regress:
|
||||
|
||||
POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
|
||||
|
||||
- @if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
|
||||
- echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
|
||||
- POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
|
||||
- --upgrade \
|
||||
- $(RUNTESTFLAGS) \
|
||||
- $(RUNTESTFLAGS_INTERNAL) \
|
||||
- $(TESTS); \
|
||||
- else \
|
||||
- echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
|
||||
- fi
|
||||
|
||||
check-long:
|
||||
$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)
|
||||
35  docker-compose/ext-src/postgis-src/postgis-common-v17.patch  (new file)
@@ -0,0 +1,35 @@
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
index 9e05244..90987df 100644
--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
@@ -143,8 +143,7 @@ TESTS += \
 	$(top_srcdir)/regress/core/oriented_envelope \
 	$(top_srcdir)/regress/core/point_coordinates \
 	$(top_srcdir)/regress/core/out_geojson \
-	$(top_srcdir)/regress/core/wrapx \
-	$(top_srcdir)/regress/core/computed_columns
+	$(top_srcdir)/regress/core/wrapx
 
 # Slow slow tests
 TESTS_SLOW = \
diff --git a/regress/runtest.mk b/regress/runtest.mk
index 4b95b7e..449d5a2 100644
--- a/regress/runtest.mk
+++ b/regress/runtest.mk
@@ -24,16 +24,6 @@ check-regress:
 
 	@POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(RUNTESTFLAGS_INTERNAL) $(TESTS)
 
-	@if echo "$(RUNTESTFLAGS)" | grep -vq -- --upgrade; then \
-		echo "Running upgrade test as RUNTESTFLAGS did not contain that"; \
-		POSTGIS_TOP_BUILD_DIR=$(abs_top_builddir) $(PERL) $(top_srcdir)/regress/run_test.pl \
-			--upgrade \
-			$(RUNTESTFLAGS) \
-			$(RUNTESTFLAGS_INTERNAL) \
-			$(TESTS); \
-	else \
-		echo "Skipping upgrade test as RUNTESTFLAGS already requested upgrades"; \
-	fi
 
 check-long:
 	$(PERL) $(top_srcdir)/regress/run_test.pl $(RUNTESTFLAGS) $(TESTS) $(TESTS_SLOW)
186  docker-compose/ext-src/postgis-src/postgis-regular-v16.patch  (new file)
@@ -0,0 +1,186 @@
diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
index 00918e1..7e2b6cd 100644
--- a/raster/test/regress/tests.mk
+++ b/raster/test/regress/tests.mk
@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
$(RUNTESTFLAGS_INTERNAL) \
--after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql

-RASTER_TEST_FIRST = \
- $(top_srcdir)/raster/test/regress/check_gdal \
- $(top_srcdir)/raster/test/regress/loader/load_outdb
+RASTER_TEST_FIRST =

RASTER_TEST_LAST = \
$(top_srcdir)/raster/test/regress/clean
@@ -33,9 +31,7 @@ RASTER_TEST_IO = \

RASTER_TEST_BASIC_FUNC = \
$(top_srcdir)/raster/test/regress/rt_bytea \
- $(top_srcdir)/raster/test/regress/rt_wkb \
$(top_srcdir)/raster/test/regress/box3d \
- $(top_srcdir)/raster/test/regress/rt_addband \
$(top_srcdir)/raster/test/regress/rt_band \
$(top_srcdir)/raster/test/regress/rt_tile

@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
$(top_srcdir)/raster/test/regress/rt_neighborhood \
$(top_srcdir)/raster/test/regress/rt_nearestvalue \
$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
- $(top_srcdir)/raster/test/regress/rt_polygon \
- $(top_srcdir)/raster/test/regress/rt_setbandpath
+ $(top_srcdir)/raster/test/regress/rt_polygon

RASTER_TEST_UTILITY = \
$(top_srcdir)/raster/test/regress/rt_utility \
- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \
- $(top_srcdir)/raster/test/regress/rt_asgdalraster \
- $(top_srcdir)/raster/test/regress/rt_astiff \
- $(top_srcdir)/raster/test/regress/rt_asjpeg \
- $(top_srcdir)/raster/test/regress/rt_aspng \
$(top_srcdir)/raster/test/regress/rt_reclass \
$(top_srcdir)/raster/test/regress/rt_gdalwarp \
$(top_srcdir)/raster/test/regress/rt_gdalcontour \
@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \

RASTER_TEST_BUGS = \
$(top_srcdir)/raster/test/regress/bug_test_car5 \
- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
$(top_srcdir)/raster/test/regress/tickets

RASTER_TEST_LOADER = \
$(top_srcdir)/raster/test/regress/loader/Basic \
$(top_srcdir)/raster/test/regress/loader/Projected \
$(top_srcdir)/raster/test/regress/loader/BasicCopy \
- $(top_srcdir)/raster/test/regress/loader/BasicFilename \
- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
- $(top_srcdir)/raster/test/regress/loader/TiledAuto \
- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
+ $(top_srcdir)/raster/test/regress/loader/BasicFilename

RASTER_TESTS := $(RASTER_TEST_FIRST) \
$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
diff --git a/regress/core/binary.sql b/regress/core/binary.sql
index 7a36b65..ad78fc7 100644
--- a/regress/core/binary.sql
+++ b/regress/core/binary.sql
@@ -1,4 +1,5 @@
SET client_min_messages TO warning;
+
CREATE SCHEMA tm;

CREATE TABLE tm.geoms (id serial, g geometry);
@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
INSERT INTO tm.geoms(g)
SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;

-COPY tm.geoms TO :tmpfile WITH BINARY;
+-- define temp file path
+\set tmpfile '/tmp/postgis_binary_test.dat'
+
+-- export
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
- AND ST_OrderingEquals(i.g, o.g);
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);

CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
WHERE geometrytype(g) NOT LIKE '%CURVE%'
AND geometrytype(g) NOT LIKE '%CIRCULAR%'
AND geometrytype(g) NOT LIKE '%SURFACE%'
AND geometrytype(g) NOT LIKE 'TRIANGLE%'
- AND geometrytype(g) NOT LIKE 'TIN%'
-;
+ AND geometrytype(g) NOT LIKE 'TIN%';

-COPY tm.geogs TO :tmpfile WITH BINARY;
+-- export
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);

DROP SCHEMA tm CASCADE;
+
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
index 64a9254..94903c3 100644
--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
@@ -23,7 +23,6 @@ current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
RUNTESTFLAGS_INTERNAL += \
--before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
--after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \
--before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql

TESTS += \
@@ -40,7 +39,6 @@ TESTS += \
$(top_srcdir)/regress/core/dumppoints \
$(top_srcdir)/regress/core/dumpsegments \
$(top_srcdir)/regress/core/empty \
- $(top_srcdir)/regress/core/estimatedextent \
$(top_srcdir)/regress/core/forcecurve \
$(top_srcdir)/regress/core/flatgeobuf \
$(top_srcdir)/regress/core/geography \
@@ -55,7 +53,6 @@ TESTS += \
$(top_srcdir)/regress/core/out_marc21 \
$(top_srcdir)/regress/core/in_encodedpolyline \
$(top_srcdir)/regress/core/iscollection \
- $(top_srcdir)/regress/core/legacy \
$(top_srcdir)/regress/core/letters \
$(top_srcdir)/regress/core/long_xact \
$(top_srcdir)/regress/core/lwgeom_regress \
@@ -112,7 +109,6 @@ TESTS += \
$(top_srcdir)/regress/core/temporal_knn \
$(top_srcdir)/regress/core/tickets \
$(top_srcdir)/regress/core/twkb \
- $(top_srcdir)/regress/core/typmod \
$(top_srcdir)/regress/core/wkb \
$(top_srcdir)/regress/core/wkt \
$(top_srcdir)/regress/core/wmsservers \
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
index 1fc77ac..c3cb9de 100644
--- a/regress/loader/tests.mk
+++ b/regress/loader/tests.mk
@@ -38,7 +38,5 @@ TESTS += \
$(top_srcdir)/regress/loader/Latin1 \
$(top_srcdir)/regress/loader/Latin1-implicit \
$(top_srcdir)/regress/loader/mfile \
- $(top_srcdir)/regress/loader/TestSkipANALYZE \
- $(top_srcdir)/regress/loader/TestANALYZE \
$(top_srcdir)/regress/loader/CharNoWidth

diff --git a/regress/run_test.pl b/regress/run_test.pl
index 0ec5b2d..1c331f4 100755
--- a/regress/run_test.pl
+++ b/regress/run_test.pl
@@ -147,7 +147,6 @@ $ENV{"LANG"} = "C";
# Add locale info to the psql options
# Add pg12 precision suppression
my $PGOPTIONS = $ENV{"PGOPTIONS"};
-$PGOPTIONS .= " -c lc_messages=C";
$PGOPTIONS .= " -c client_min_messages=NOTICE";
$PGOPTIONS .= " -c extra_float_digits=0";
$ENV{"PGOPTIONS"} = $PGOPTIONS;
docker-compose/ext-src/postgis-src/postgis-regular-v17.patch (Normal file, 208 lines)
@@ -0,0 +1,208 @@
diff --git a/raster/test/regress/tests.mk b/raster/test/regress/tests.mk
index 00918e1..7e2b6cd 100644
--- a/raster/test/regress/tests.mk
+++ b/raster/test/regress/tests.mk
@@ -17,9 +17,7 @@ override RUNTESTFLAGS_INTERNAL := \
$(RUNTESTFLAGS_INTERNAL) \
--after-upgrade-script $(top_srcdir)/raster/test/regress/hooks/hook-after-upgrade-raster.sql

-RASTER_TEST_FIRST = \
- $(top_srcdir)/raster/test/regress/check_gdal \
- $(top_srcdir)/raster/test/regress/loader/load_outdb
+RASTER_TEST_FIRST =

RASTER_TEST_LAST = \
$(top_srcdir)/raster/test/regress/clean
@@ -33,9 +31,7 @@ RASTER_TEST_IO = \

RASTER_TEST_BASIC_FUNC = \
$(top_srcdir)/raster/test/regress/rt_bytea \
- $(top_srcdir)/raster/test/regress/rt_wkb \
$(top_srcdir)/raster/test/regress/box3d \
- $(top_srcdir)/raster/test/regress/rt_addband \
$(top_srcdir)/raster/test/regress/rt_band \
$(top_srcdir)/raster/test/regress/rt_tile

@@ -73,16 +69,10 @@ RASTER_TEST_BANDPROPS = \
$(top_srcdir)/raster/test/regress/rt_neighborhood \
$(top_srcdir)/raster/test/regress/rt_nearestvalue \
$(top_srcdir)/raster/test/regress/rt_pixelofvalue \
- $(top_srcdir)/raster/test/regress/rt_polygon \
- $(top_srcdir)/raster/test/regress/rt_setbandpath
+ $(top_srcdir)/raster/test/regress/rt_polygon

RASTER_TEST_UTILITY = \
$(top_srcdir)/raster/test/regress/rt_utility \
- $(top_srcdir)/raster/test/regress/rt_fromgdalraster \
- $(top_srcdir)/raster/test/regress/rt_asgdalraster \
- $(top_srcdir)/raster/test/regress/rt_astiff \
- $(top_srcdir)/raster/test/regress/rt_asjpeg \
- $(top_srcdir)/raster/test/regress/rt_aspng \
$(top_srcdir)/raster/test/regress/rt_reclass \
$(top_srcdir)/raster/test/regress/rt_gdalwarp \
$(top_srcdir)/raster/test/regress/rt_gdalcontour \
@@ -120,21 +110,13 @@ RASTER_TEST_SREL = \

RASTER_TEST_BUGS = \
$(top_srcdir)/raster/test/regress/bug_test_car5 \
- $(top_srcdir)/raster/test/regress/permitted_gdal_drivers \
$(top_srcdir)/raster/test/regress/tickets

RASTER_TEST_LOADER = \
$(top_srcdir)/raster/test/regress/loader/Basic \
$(top_srcdir)/raster/test/regress/loader/Projected \
$(top_srcdir)/raster/test/regress/loader/BasicCopy \
- $(top_srcdir)/raster/test/regress/loader/BasicFilename \
- $(top_srcdir)/raster/test/regress/loader/BasicOutDB \
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10 \
- $(top_srcdir)/raster/test/regress/loader/Tiled10x10Copy \
- $(top_srcdir)/raster/test/regress/loader/Tiled8x8 \
- $(top_srcdir)/raster/test/regress/loader/TiledAuto \
- $(top_srcdir)/raster/test/regress/loader/TiledAutoSkipNoData \
- $(top_srcdir)/raster/test/regress/loader/TiledAutoCopyn
+ $(top_srcdir)/raster/test/regress/loader/BasicFilename

RASTER_TESTS := $(RASTER_TEST_FIRST) \
$(RASTER_TEST_METADATA) $(RASTER_TEST_IO) $(RASTER_TEST_BASIC_FUNC) \
diff --git a/regress/core/binary.sql b/regress/core/binary.sql
index 7a36b65..ad78fc7 100644
--- a/regress/core/binary.sql
+++ b/regress/core/binary.sql
@@ -1,4 +1,5 @@
SET client_min_messages TO warning;
+
CREATE SCHEMA tm;

CREATE TABLE tm.geoms (id serial, g geometry);
@@ -31,24 +32,39 @@ SELECT st_force4d(g) FROM tm.geoms WHERE id < 15 ORDER BY id;
INSERT INTO tm.geoms(g)
SELECT st_setsrid(g,4326) FROM tm.geoms ORDER BY id;

-COPY tm.geoms TO :tmpfile WITH BINARY;
+-- define temp file path
+\set tmpfile '/tmp/postgis_binary_test.dat'
+
+-- export
+\set command '\\copy tm.geoms TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
CREATE TABLE tm.geoms_in AS SELECT * FROM tm.geoms LIMIT 0;
-COPY tm.geoms_in FROM :tmpfile WITH BINARY;
-SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o WHERE i.id = o.id
- AND ST_OrderingEquals(i.g, o.g);
+\set command '\\copy tm.geoms_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geoms_in i, tm.geoms o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g, o.g);

CREATE TABLE tm.geogs AS SELECT id,g::geography FROM tm.geoms
WHERE geometrytype(g) NOT LIKE '%CURVE%'
AND geometrytype(g) NOT LIKE '%CIRCULAR%'
AND geometrytype(g) NOT LIKE '%SURFACE%'
AND geometrytype(g) NOT LIKE 'TRIANGLE%'
- AND geometrytype(g) NOT LIKE 'TIN%'
-;
+ AND geometrytype(g) NOT LIKE 'TIN%';

-COPY tm.geogs TO :tmpfile WITH BINARY;
+-- export
+\set command '\\copy tm.geogs TO ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+-- import
CREATE TABLE tm.geogs_in AS SELECT * FROM tm.geogs LIMIT 0;
-COPY tm.geogs_in FROM :tmpfile WITH BINARY;
-SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o WHERE i.id = o.id
- AND ST_OrderingEquals(i.g::geometry, o.g::geometry);
+\set command '\\copy tm.geogs_in FROM ':tmpfile' WITH (FORMAT BINARY)'
+:command
+
+SELECT 'geometry', count(*) FROM tm.geogs_in i, tm.geogs o
+WHERE i.id = o.id AND ST_OrderingEquals(i.g::geometry, o.g::geometry);

DROP SCHEMA tm CASCADE;
+
diff --git a/regress/core/tests.mk b/regress/core/tests.mk
index 90987df..74fe3f1 100644
--- a/regress/core/tests.mk
+++ b/regress/core/tests.mk
@@ -16,14 +16,13 @@ POSTGIS_PGSQL_VERSION=170
POSTGIS_GEOS_VERSION=31101
HAVE_JSON=yes
HAVE_SPGIST=yes
-INTERRUPTTESTS=yes
+INTERRUPTTESTS=no

current_dir := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

RUNTESTFLAGS_INTERNAL += \
--before-upgrade-script $(top_srcdir)/regress/hooks/hook-before-upgrade.sql \
--after-upgrade-script $(top_srcdir)/regress/hooks/hook-after-upgrade.sql \
- --after-create-script $(top_srcdir)/regress/hooks/hook-after-create.sql \
--before-uninstall-script $(top_srcdir)/regress/hooks/hook-before-uninstall.sql

TESTS += \
@@ -40,7 +39,6 @@ TESTS += \
$(top_srcdir)/regress/core/dumppoints \
$(top_srcdir)/regress/core/dumpsegments \
$(top_srcdir)/regress/core/empty \
- $(top_srcdir)/regress/core/estimatedextent \
$(top_srcdir)/regress/core/forcecurve \
$(top_srcdir)/regress/core/flatgeobuf \
$(top_srcdir)/regress/core/frechet \
@@ -60,7 +58,6 @@ TESTS += \
$(top_srcdir)/regress/core/out_marc21 \
$(top_srcdir)/regress/core/in_encodedpolyline \
$(top_srcdir)/regress/core/iscollection \
- $(top_srcdir)/regress/core/legacy \
$(top_srcdir)/regress/core/letters \
$(top_srcdir)/regress/core/lwgeom_regress \
$(top_srcdir)/regress/core/measures \
@@ -119,7 +116,6 @@ TESTS += \
$(top_srcdir)/regress/core/temporal_knn \
$(top_srcdir)/regress/core/tickets \
$(top_srcdir)/regress/core/twkb \
- $(top_srcdir)/regress/core/typmod \
$(top_srcdir)/regress/core/wkb \
$(top_srcdir)/regress/core/wkt \
$(top_srcdir)/regress/core/wmsservers \
diff --git a/regress/loader/tests.mk b/regress/loader/tests.mk
index ac4f8ad..4bad4fc 100644
--- a/regress/loader/tests.mk
+++ b/regress/loader/tests.mk
@@ -38,7 +38,5 @@ TESTS += \
$(top_srcdir)/regress/loader/Latin1 \
$(top_srcdir)/regress/loader/Latin1-implicit \
$(top_srcdir)/regress/loader/mfile \
- $(top_srcdir)/regress/loader/TestSkipANALYZE \
- $(top_srcdir)/regress/loader/TestANALYZE \
$(top_srcdir)/regress/loader/CharNoWidth \

diff --git a/regress/run_test.pl b/regress/run_test.pl
index cac4b2e..4c7c82b 100755
--- a/regress/run_test.pl
+++ b/regress/run_test.pl
@@ -238,7 +238,6 @@ $ENV{"LANG"} = "C";
# Add locale info to the psql options
# Add pg12 precision suppression
my $PGOPTIONS = $ENV{"PGOPTIONS"};
-$PGOPTIONS .= " -c lc_messages=C";
$PGOPTIONS .= " -c client_min_messages=NOTICE";
$PGOPTIONS .= " -c extra_float_digits=0";
$ENV{"PGOPTIONS"} = $PGOPTIONS;
diff --git a/topology/test/tests.mk b/topology/test/tests.mk
index cbe2633..2c7c18f 100644
--- a/topology/test/tests.mk
+++ b/topology/test/tests.mk
@@ -46,9 +46,7 @@ TESTS += \
$(top_srcdir)/topology/test/regress/legacy_query.sql \
$(top_srcdir)/topology/test/regress/legacy_validate.sql \
$(top_srcdir)/topology/test/regress/polygonize.sql \
- $(top_srcdir)/topology/test/regress/populate_topology_layer.sql \
$(top_srcdir)/topology/test/regress/removeunusedprimitives.sql \
- $(top_srcdir)/topology/test/regress/renametopogeometrycolumn.sql \
$(top_srcdir)/topology/test/regress/renametopology.sql \
$(top_srcdir)/topology/test/regress/share_sequences.sql \
$(top_srcdir)/topology/test/regress/sqlmm.sql \
docker-compose/ext-src/postgis-src/raster_outdb_template.sql (Normal file, 46 lines)
File diff suppressed because one or more lines are too long
docker-compose/ext-src/postgis-src/regular-test.sh (Executable file, 17 lines)
@@ -0,0 +1,17 @@
#!/bin/bash
set -ex
cd "$(dirname "${0}")"
dropdb --if-exists contrib_regression
createdb contrib_regression
psql -d contrib_regression -c "ALTER DATABASE contrib_regression SET TimeZone='UTC'" \
    -c "ALTER DATABASE contrib_regression SET DateStyle='ISO, MDY'" \
    -c "CREATE EXTENSION postgis SCHEMA public" \
    -c "CREATE EXTENSION postgis_topology" \
    -c "CREATE EXTENSION postgis_tiger_geocoder CASCADE" \
    -c "CREATE EXTENSION postgis_raster SCHEMA public" \
    -c "CREATE EXTENSION postgis_sfcgal SCHEMA public"
patch -p1 <"postgis-common-${PG_VERSION}.patch"
patch -p1 <"postgis-regular-${PG_VERSION}.patch"
psql -d contrib_regression -f raster_outdb_template.sql
trap 'patch -R -p1 <postgis-regular-${PG_VERSION}.patch && patch -R -p1 <"postgis-common-${PG_VERSION}.patch"' EXIT
POSTGIS_REGRESS_DB=contrib_regression RUNTESTFLAGS=--nocreate make installcheck-base
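Note on usage: the script selects its patch pair through the PG_VERSION environment variable (for example PG_VERSION=v17 to match the postgis-common-v17.patch and postgis-regular-v17.patch files added above), and the trap ... EXIT line reverses both patches even when make installcheck-base fails.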
@@ -63,5 +63,9 @@ done
for d in ${FAILED}; do
cat "$(find $d -name regression.diffs)"
done
for postgis_diff in /tmp/pgis_reg/*_diff; do
echo "${postgis_diff}:"
cat "${postgis_diff}"
done
echo "${FAILED}"
exit 1

@@ -178,9 +178,9 @@
/// JWT for authorizing requests to endpoint storage service
pub endpoint_storage_token: Option<String>,

/// If true, download LFC state from endpoint_storage and pass it to Postgres on startup
/// Download LFC state from endpoint_storage and pass it to Postgres on startup
#[serde(default)]
pub prewarm_lfc_on_startup: bool,
pub autoprewarm: bool,
}

/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -192,6 +192,9 @@ pub enum ComputeFeature {
/// track short-lived connections as user activity.
ActivityMonitorExperimental,

/// Enable TLS functionality.
TlsExperimental,

/// This is a special feature flag that is used to represent unknown feature flags.
/// Basically all unknown to enum flags are represented as this one. See unit test
/// `parse_unknown_features()` for more details.
@@ -250,34 +253,44 @@ impl RemoteExtSpec {
}

match self.extension_data.get(real_ext_name) {
Some(_ext_data) => {
// We have decided to use the Go naming convention due to Kubernetes.

let arch = match std::env::consts::ARCH {
"x86_64" => "amd64",
"aarch64" => "arm64",
arch => arch,
};

// Construct the path to the extension archive
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
//
// Keep it in sync with path generation in
// https://github.com/neondatabase/build-custom-extensions/tree/main
let archive_path_str = format!(
"{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
);
Ok((
real_ext_name.to_string(),
RemotePath::from_string(&archive_path_str)?,
))
}
Some(_ext_data) => Ok((
real_ext_name.to_string(),
Self::build_remote_path(build_tag, pg_major_version, real_ext_name)?,
)),
None => Err(anyhow::anyhow!(
"real_ext_name {} is not found",
real_ext_name
)),
}
}

/// Get the architecture-specific portion of the remote extension path. We
/// use the Go naming convention due to Kubernetes.
fn get_arch() -> &'static str {
match std::env::consts::ARCH {
"x86_64" => "amd64",
"aarch64" => "arm64",
arch => arch,
}
}

/// Build a [`RemotePath`] for an extension.
fn build_remote_path(
build_tag: &str,
pg_major_version: &str,
ext_name: &str,
) -> anyhow::Result<RemotePath> {
let arch = Self::get_arch();

// Construct the path to the extension archive
// BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
//
// Keep it in sync with path generation in
// https://github.com/neondatabase/build-custom-extensions/tree/main
RemotePath::from_string(&format!(
"{build_tag}/{arch}/{pg_major_version}/extensions/{ext_name}.tar.zst"
))
}
}

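To make the refactored path construction concrete, here is a hedged sketch of what the helper produces; the extension name pg_hypothetical is invented for illustration, and the call assumes module scope because build_remote_path is a private associated function:

    // On an x86_64 build host, get_arch() maps "x86_64" -> "amd64", so:
    #[cfg(target_arch = "x86_64")]
    fn demo() -> anyhow::Result<()> {
        let path = RemoteExtSpec::build_remote_path("latest", "v17", "pg_hypothetical")?;
        assert_eq!(
            path.to_string(),
            "latest/amd64/v17/extensions/pg_hypothetical.tar.zst"
        );
        Ok(())
    }

On an aarch64 host the arch segment would be arm64 instead.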
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
@@ -518,6 +531,37 @@ mod tests {
.expect("Library should be found");
}

#[test]
fn remote_extension_path() {
let rspec: RemoteExtSpec = serde_json::from_value(serde_json::json!({
"public_extensions": ["ext"],
"custom_extensions": [],
"library_index": {
"extlib": "ext",
},
"extension_data": {
"ext": {
"control_data": {
"ext.control": ""
},
"archive_path": ""
}
},
}))
.unwrap();

let (_ext_name, ext_path) = rspec
.get_ext("ext", false, "latest", "v17")
.expect("Extension should be found");
// Starting with a forward slash would have consequences for the
// Url::join() that occurs when downloading a remote extension.
assert!(!ext_path.to_string().starts_with("/"));
assert_eq!(
ext_path,
RemoteExtSpec::build_remote_path("latest", "v17", "ext").unwrap()
);
}

#[test]
fn parse_spec_file() {
let file = File::open("tests/cluster_spec.json").unwrap();

@@ -85,7 +85,7 @@
"vartype": "bool"
},
{
"name": "prewarm_lfc_on_startup",
"name": "autoprewarm",
"value": "off",
"vartype": "bool"
},

@@ -6,8 +6,20 @@ license.workspace = true

[dependencies]
thiserror.workspace = true
nix.workspace=true
nix.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
rustc-hash = { version = "2.1.1" }

[dev-dependencies]
criterion = { workspace = true, features = ["html_reports"] }
rand = "0.9.1"
rand_distr = "0.5.1"
xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
ahash.workspace = true

[target.'cfg(target_os = "macos")'.dependencies]
tempfile = "3.14.0"

[[bench]]
name = "hmap_resize"
harness = false

libs/neon-shmem/src/hash.rs (Normal file, 438 lines)
@@ -0,0 +1,438 @@
//! Hash table implementation on top of 'shmem'
//!
//! Features required in the long run by the communicator project:
//!
//! [X] Accessible from both Postgres processes and rust threads in the communicator process
//! [X] Low latency
//! [ ] Scalable to lots of concurrent accesses (currently relies on caller for locking)
//! [ ] Resizable

use std::fmt::Debug;
use std::hash::{Hash, Hasher, BuildHasher};
use std::mem::MaybeUninit;

use rustc_hash::FxBuildHasher;

use crate::shmem::ShmemHandle;

mod core;
pub mod entry;

#[cfg(test)]
mod tests;

mod optim;

use core::{CoreHashMap, INVALID_POS};
use entry::{Entry, OccupiedEntry};

pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
    // Hash table can be allocated in a fixed memory area, or in a resizeable ShmemHandle.
    shmem_handle: Option<ShmemHandle>,
    shared_ptr: *mut HashMapShared<'a, K, V>,
    shared_size: usize,
    hasher: S,
    num_buckets: u32,
}

pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
    shmem_handle: Option<ShmemHandle>,
    shared_ptr: *mut HashMapShared<'a, K, V>,
    hasher: S,
}

unsafe impl<'a, K: Sync, V: Sync, S> Sync for HashMapAccess<'a, K, V, S> {}
unsafe impl<'a, K: Send, V: Send, S> Send for HashMapAccess<'a, K, V, S> {}

impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
    pub fn with_hasher(self, hasher: S) -> HashMapInit<'a, K, V, S> {
        Self { hasher, ..self }
    }

    pub fn estimate_size(num_buckets: u32) -> usize {
        // add some margin to cover alignment etc.
        CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
    }

    pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
        let mut ptr: *mut u8 = self.shared_ptr.cast();
        let end_ptr: *mut u8 = unsafe { ptr.add(self.shared_size) };
        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
        let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };

        // carve out the buckets
        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::LinkedKey<K>>())) };
        let keys_ptr = ptr;
        ptr = unsafe { ptr.add(size_of::<core::LinkedKey<K>>() * self.num_buckets as usize) };

        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<Option<V>>())) };
        let vals_ptr = ptr;
        ptr = unsafe { ptr.add(size_of::<Option<V>>() * self.num_buckets as usize) };

        // use remaining space for the dictionary
        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
        assert!(ptr.addr() < end_ptr.addr());
        let dictionary_ptr = ptr;
        let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
        assert!(dictionary_size > 0);

        let keys =
            unsafe { std::slice::from_raw_parts_mut(keys_ptr.cast(), self.num_buckets as usize) };
        let vals =
            unsafe { std::slice::from_raw_parts_mut(vals_ptr.cast(), self.num_buckets as usize) };
        let dictionary = unsafe {
            std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
        };
        let hashmap = CoreHashMap::new(keys, vals, dictionary);
        unsafe {
            std::ptr::write(shared_ptr, HashMapShared { inner: hashmap });
        }

        HashMapAccess {
            shmem_handle: self.shmem_handle,
            shared_ptr: self.shared_ptr,
            hasher: self.hasher,
        }
    }

    pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
        // no difference to attach_writer currently
        self.attach_writer()
    }
}

/// This is stored in the shared memory area
///
/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
/// area as follows:
///
/// HashMapShared
/// [buckets]
/// [dictionary]
///
/// In between the above parts, there can be padding bytes to align the parts correctly.
struct HashMapShared<'a, K, V> {
    inner: CoreHashMap<'a, K, V>
}

impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
where
    K: Clone + Hash + Eq
{
    pub fn with_fixed(
        num_buckets: u32,
        area: &'a mut [MaybeUninit<u8>],
    ) -> HashMapInit<'a, K, V> {
        Self {
            num_buckets,
            shmem_handle: None,
            shared_ptr: area.as_mut_ptr().cast(),
            shared_size: area.len(),
            hasher: rustc_hash::FxBuildHasher::default(),
        }
    }

    /// Initialize a new hash map in the given shared memory area
    pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> HashMapInit<'a, K, V> {
        let size = Self::estimate_size(num_buckets);
        shmem
            .set_size(size)
            .expect("could not resize shared memory area");
        Self {
            num_buckets,
            shared_ptr: shmem.data_ptr.as_ptr().cast(),
            shmem_handle: Some(shmem),
            shared_size: size,
            hasher: rustc_hash::FxBuildHasher::default()
        }
    }

    pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> HashMapInit<'a, K, V> {
        let size = Self::estimate_size(num_buckets);
        let max_size = Self::estimate_size(max_buckets);
        let shmem = ShmemHandle::new(name, size, max_size)
            .expect("failed to make shared memory area");

        Self {
            num_buckets,
            shared_ptr: shmem.data_ptr.as_ptr().cast(),
            shmem_handle: Some(shmem),
            shared_size: size,
            hasher: rustc_hash::FxBuildHasher::default()
        }
    }

    pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> HashMapInit<'a, K, V> {
        use std::sync::atomic::{AtomicUsize, Ordering};
        // NB: this must be a `static`, not a `const`: a `const` would create a
        // fresh AtomicUsize on every call, so the counter would never advance
        // and every map would get the same shmem name.
        static COUNTER: AtomicUsize = AtomicUsize::new(0);
        let val = COUNTER.fetch_add(1, Ordering::Relaxed);
        let name = format!("neon_shmem_hmap{}", val);
        Self::new_resizeable_named(num_buckets, max_buckets, &name)
    }
}

impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
where
    K: Clone + Hash + Eq,
{
    pub fn get_hash_value(&self, key: &K) -> u64 {
        self.hasher.hash_one(key)
    }

    pub fn get_with_hash<'e>(&'e self, key: &K, hash: u64) -> Option<&'e V> {
        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();

        map.inner.get_with_hash(key, hash)
    }

    pub fn entry_with_hash(&mut self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();

        map.inner.entry_with_hash(key, hash)
    }

    pub fn remove_with_hash(&mut self, key: &K, hash: u64) {
        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();

        match map.inner.entry_with_hash(key.clone(), hash) {
            Entry::Occupied(e) => {
                e.remove();
            }
            Entry::Vacant(_) => {}
        };
    }

    pub fn entry_at_bucket(&mut self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
        map.inner.entry_at_bucket(pos)
    }

    pub fn get_num_buckets(&self) -> usize {
        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
        map.inner.get_num_buckets()
    }

    /// Return the key and value stored in bucket with given index. This can be used to
    /// iterate through the hash map. (An Iterator might be nicer. The communicator's
    /// clock algorithm needs to _slowly_ iterate through all buckets with its clock hand,
    /// without holding a lock. If we switch to an Iterator, it must not hold the lock.)
    pub fn get_at_bucket(&self, pos: usize) -> Option<(&K, &V)> {
        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();

        if pos >= map.inner.keys.len() {
            return None;
        }
        let key = &map.inner.keys[pos];
        key.inner.as_ref().map(|k| (k, map.inner.vals[pos].as_ref().unwrap()))
    }

    pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();

        let origin = map.inner.vals.as_ptr();
        let idx = (val_ptr as usize - origin as usize) / (size_of::<V>() as usize);
        assert!(idx < map.inner.vals.len());

        idx
    }

    // for metrics
    pub fn get_num_buckets_in_use(&self) -> usize {
        let map = unsafe { self.shared_ptr.as_ref() }.unwrap();
        map.inner.buckets_in_use as usize
    }

    pub fn clear(&mut self) {
        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
        let inner = &mut map.inner;
        inner.clear()
    }

    /// Helper function that abstracts the common logic between growing and shrinking.
    /// The only significant difference in the rehashing step is how many buckets to rehash.
    fn rehash_dict(
        &mut self,
        inner: &mut CoreHashMap<'a, K, V>,
        keys_ptr: *mut core::LinkedKey<K>,
        end_ptr: *mut u8,
        num_buckets: u32,
        rehash_buckets: u32,
    ) {
        inner.free_head = INVALID_POS;

        // Recalculate the dictionary
        let keys;
        let dictionary;
        unsafe {
            let keys_end_ptr = keys_ptr.add(num_buckets as usize);
            let buckets_end_ptr: *mut u8 = (keys_end_ptr as *mut u8)
                .add(size_of::<Option<V>>() * num_buckets as usize);
            let dictionary_ptr: *mut u32 = buckets_end_ptr
                .byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
                .cast();
            let dictionary_size: usize =
                end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();

            keys = std::slice::from_raw_parts_mut(keys_ptr, num_buckets as usize);
            dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
        }
        for i in 0..dictionary.len() {
            dictionary[i] = INVALID_POS;
        }

        for i in 0..rehash_buckets as usize {
            if keys[i].inner.is_none() {
                keys[i].next = inner.free_head;
                inner.free_head = i as u32;
                continue;
            }

            let hash = self.hasher.hash_one(&keys[i].inner.as_ref().unwrap());
            let pos: usize = (hash % dictionary.len() as u64) as usize;
            keys[i].next = dictionary[pos];
            dictionary[pos] = i as u32;
        }

        // Finally, update the CoreHashMap struct
        inner.dictionary = dictionary;
        inner.keys = keys;
    }

    /// Rehash the map. Intended for benchmarking only.
    pub fn shuffle(&mut self) {
        let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
        let inner = &mut map.inner;
        let num_buckets = inner.get_num_buckets() as u32;
        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
        let end_ptr: *mut u8 = unsafe { (self.shared_ptr as *mut u8).add(size_bytes) };
        let keys_ptr = inner.keys.as_mut_ptr();
        self.rehash_dict(inner, keys_ptr, end_ptr, num_buckets, num_buckets);
    }

    // /// Grow
    // ///
    // /// 1. grow the underlying shared memory area
    // /// 2. Initialize new buckets. This overwrites the current dictionary
    // /// 3. Recalculate the dictionary
    // pub fn grow(&mut self, num_buckets: u32) -> Result<(), crate::shmem::Error> {
    //     let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
    //     let inner = &mut map.inner;
    //     let old_num_buckets = inner.buckets.len() as u32;
    //     if num_buckets < old_num_buckets {
    //         panic!("grow called with a smaller number of buckets");
    //     }
    //     if num_buckets == old_num_buckets {
    //         return Ok(());
    //     }
    //     let shmem_handle = self
    //         .shmem_handle
    //         .as_ref()
    //         .expect("grow called on a fixed-size hash table");

    //     let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
    //     shmem_handle.set_size(size_bytes)?;
    //     let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };

    //     // Initialize new buckets. The new buckets are linked to the free list. NB: This overwrites
    //     // the dictionary!
    //     let keys_ptr = inner.keys.as_mut_ptr();
    //     unsafe {
    //         for i in old_num_buckets..num_buckets {
    //             let bucket_ptr = buckets_ptr.add(i as usize);
    //             bucket_ptr.write(core::Bucket {
    //                 next: if i < num_buckets-1 {
    //                     i as u32 + 1
    //                 } else {
    //                     inner.free_head
    //                 },
    //                 prev: if i > 0 {
    //                     PrevPos::Chained(i as u32 - 1)
    //                 } else {
    //                     PrevPos::First(INVALID_POS)
    //                 },
    //                 inner: None,
    //             });
    //         }
    //     }
    //     self.rehash_dict(inner, keys_ptr, end_ptr, num_buckets, old_num_buckets);
    //     inner.free_head = old_num_buckets;

    //     Ok(())
    // }

    // /// Begin a shrink, limiting all new allocations to be in buckets with index less than `num_buckets`.
    // pub fn begin_shrink(&mut self, num_buckets: u32) {
    //     let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
    //     if num_buckets > map.inner.get_num_buckets() as u32 {
    //         panic!("shrink called with a larger number of buckets");
    //     }
    //     _ = self
    //         .shmem_handle
    //         .as_ref()
    //         .expect("shrink called on a fixed-size hash table");
    //     map.inner.alloc_limit = num_buckets;
    // }

    // /// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
    // pub fn finish_shrink(&mut self) -> Result<(), crate::shmem::Error> {
    //     let map = unsafe { self.shared_ptr.as_mut() }.unwrap();
    //     let inner = &mut map.inner;
    //     if !inner.is_shrinking() {
    //         panic!("called finish_shrink when no shrink is in progress");
    //     }

    //     let num_buckets = inner.alloc_limit;

    //     if inner.get_num_buckets() == num_buckets as usize {
    //         return Ok(());
    //     }

    //     for i in (num_buckets as usize)..inner.buckets.len() {
    //         if inner.buckets[i].inner.is_some() {
    //             // TODO(quantumish) Do we want to treat this as a violation of an invariant
    //             // or a legitimate error the caller can run into? Originally I thought this
    //             // could return something like a UnevictedError(index) as soon as it runs
    //             // into something (that way a caller could clear their soon-to-be-shrinked
    //             // buckets by repeatedly trying to call `finish_shrink`).
    //             //
    //             // Would require making a wider error type enum with this and shmem errors.
    //             panic!("unevicted entries in shrinked space")
    //         }
    //         match inner.buckets[i].prev {
    //             PrevPos::First(_) => {
    //                 let next_pos = inner.buckets[i].next;
    //                 inner.free_head = next_pos;
    //                 if next_pos != INVALID_POS {
    //                     inner.buckets[next_pos as usize].prev = PrevPos::First(INVALID_POS);
    //                 }
    //             },
    //             PrevPos::Chained(j) => {
    //                 let next_pos = inner.buckets[i].next;
    //                 inner.buckets[j as usize].next = next_pos;
    //                 if next_pos != INVALID_POS {
    //                     inner.buckets[next_pos as usize].prev = PrevPos::Chained(j);
    //                 }
    //             }
    //         }
    //     }

    //     let shmem_handle = self
    //         .shmem_handle
    //         .as_ref()
    //         .expect("shrink called on a fixed-size hash table");

    //     let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
    //     shmem_handle.set_size(size_bytes)?;
    //     let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
    //     let buckets_ptr = inner.buckets.as_mut_ptr();
    //     self.rehash_dict(inner, buckets_ptr, end_ptr, num_buckets, num_buckets);
    //     inner.alloc_limit = INVALID_POS;

    //     Ok(())
    // }

}
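A minimal usage sketch of the API above, assuming the crate is consumed as neon_shmem and that Entry is reached through the public entry module; the map name "demo_map" is invented for illustration and this is not part of the commit:

    use neon_shmem::hash::HashMapInit;
    use neon_shmem::hash::entry::Entry;

    fn demo() {
        // 1024 buckets up front, with shmem reserved to grow to 4096.
        let mut map = HashMapInit::<u64, u64>::new_resizeable_named(1024, 4096, "demo_map")
            .attach_writer();

        // Callers compute (and may cache) the hash themselves.
        let key = 42u64;
        let hash = map.get_hash_value(&key);
        match map.entry_with_hash(key, hash) {
            Entry::Occupied(mut e) => { e.insert(1); }
            Entry::Vacant(e) => { e.insert(1).expect("out of buckets"); }
        }
        assert_eq!(map.get_with_hash(&key, hash), Some(&1));
    }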
libs/neon-shmem/src/hash/core.rs (Normal file, 247 lines)
@@ -0,0 +1,247 @@
//! Simple hash table with chaining
//!
//! # Resizing
//!

use std::hash::Hash;
use std::mem::MaybeUninit;

use crate::hash::entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};

pub(crate) const INVALID_POS: u32 = u32::MAX;

pub(crate) struct LinkedKey<K> {
    pub(crate) inner: Option<K>,
    pub(crate) next: u32,
}

pub(crate) struct CoreHashMap<'a, K, V> {
    /// Dictionary used to map hashes to bucket indices.
    pub(crate) dictionary: &'a mut [u32],
    pub(crate) keys: &'a mut [LinkedKey<K>],
    pub(crate) vals: &'a mut [Option<V>],
    /// Head of the freelist.
    pub(crate) free_head: u32,

    pub(crate) _user_list_head: u32,
    /// Maximum index of a bucket allowed to be allocated. INVALID_POS if no limit.
    pub(crate) alloc_limit: u32,

    // metrics
    pub(crate) buckets_in_use: u32,
}

#[derive(Debug)]
pub struct FullError();

impl<'a, K: Hash + Eq, V> CoreHashMap<'a, K, V>
where
    K: Clone + Hash + Eq,
{
    const FILL_FACTOR: f32 = 0.60;

    pub fn estimate_size(num_buckets: u32) -> usize {
        let mut size = 0;

        // buckets
        size += (size_of::<LinkedKey<K>>() + size_of::<Option<V>>())
            * num_buckets as usize;

        // dictionary
        size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
            as usize;

        size
    }

    pub fn new(
        keys: &'a mut [MaybeUninit<LinkedKey<K>>],
        vals: &'a mut [MaybeUninit<Option<V>>],
        dictionary: &'a mut [MaybeUninit<u32>],
    ) -> CoreHashMap<'a, K, V> {
        // Initialize the buckets
        for i in 0..keys.len() {
            keys[i].write(LinkedKey {
                next: if i < keys.len() - 1 {
                    i as u32 + 1
                } else {
                    INVALID_POS
                },
                inner: None,
            });
        }
        for i in 0..vals.len() {
            vals[i].write(None);
        }

        // Initialize the dictionary
        for i in 0..dictionary.len() {
            dictionary[i].write(INVALID_POS);
        }

        // TODO: use std::slice::assume_init_mut() once it stabilizes
        let keys =
            unsafe { std::slice::from_raw_parts_mut(keys.as_mut_ptr().cast(), keys.len()) };
        let vals =
            unsafe { std::slice::from_raw_parts_mut(vals.as_mut_ptr().cast(), vals.len()) };
        let dictionary = unsafe {
            std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
        };

        CoreHashMap {
            dictionary,
            keys,
            vals,
            free_head: 0,
            buckets_in_use: 0,
            _user_list_head: INVALID_POS,
            alloc_limit: INVALID_POS,
        }
    }

    pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
        let mut next = self.dictionary[hash as usize % self.dictionary.len()];
        loop {
            if next == INVALID_POS {
                return None;
            }

            let keylink = &self.keys[next as usize];
            let bucket_key = keylink.inner.as_ref().expect("entry is in use");
            if bucket_key == key {
                return Some(self.vals[next as usize].as_ref().unwrap());
            }
            next = keylink.next;
        }
    }

    // all updates are done through Entry
    pub fn entry_with_hash(&mut self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
        let dict_pos = hash as usize % self.dictionary.len();
        let first = self.dictionary[dict_pos];
        if first == INVALID_POS {
            // no existing entry
            return Entry::Vacant(VacantEntry {
                map: self,
                key,
                dict_pos: dict_pos as u32,
            });
        }

        let mut prev_pos = PrevPos::First(dict_pos as u32);
        let mut next = first;
        loop {
            let keylink = &mut self.keys[next as usize];
            let bucket_key = keylink.inner.as_mut().expect("entry is in use");
            if *bucket_key == key {
                // found existing entry
                return Entry::Occupied(OccupiedEntry {
                    map: self,
                    _key: key,
                    prev_pos,
                    bucket_pos: next,
                });
            }

            if keylink.next == INVALID_POS {
                // No existing entry
                return Entry::Vacant(VacantEntry {
                    map: self,
                    key,
                    dict_pos: dict_pos as u32,
                });
            }
            prev_pos = PrevPos::Chained(next);
            next = keylink.next;
        }
    }

    pub fn get_num_buckets(&self) -> usize {
        self.keys.len()
    }

    pub fn is_shrinking(&self) -> bool {
        self.alloc_limit != INVALID_POS
    }

    /// Clears all entries from the hashmap.
    /// Does not reset any allocation limits, but does clear any entries beyond them.
    pub fn clear(&mut self) {
        for i in 0..self.keys.len() {
            self.keys[i] = LinkedKey {
                next: if i < self.keys.len() - 1 {
                    i as u32 + 1
                } else {
                    INVALID_POS
                },
                inner: None,
            }
        }
        for i in 0..self.vals.len() {
            self.vals[i] = None;
        }

        for i in 0..self.dictionary.len() {
            self.dictionary[i] = INVALID_POS;
        }

        self.buckets_in_use = 0;
    }

    pub fn entry_at_bucket(&mut self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
        if pos >= self.keys.len() {
            return None;
        }

        let entry = self.keys[pos].inner.as_ref();
        match entry {
            Some(key) => Some(OccupiedEntry {
                _key: key.clone(),
                bucket_pos: pos as u32,
                prev_pos: PrevPos::Unknown,
                map: self,
            }),
            _ => None,
        }
    }

    /// Find the position of an unused bucket via the freelist and initialize it.
    pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
        let mut pos = self.free_head;

        // Find the first bucket we're *allowed* to use.
        let mut prev = PrevPos::First(self.free_head);
        while pos != INVALID_POS && pos >= self.alloc_limit {
            let keylink = &mut self.keys[pos as usize];
            prev = PrevPos::Chained(pos);
            pos = keylink.next;
        }
        if pos == INVALID_POS {
            return Err(FullError());
        }

        // Repair the freelist.
        match prev {
            PrevPos::First(_) => {
                let next_pos = self.keys[pos as usize].next;
                self.free_head = next_pos;
            }
            PrevPos::Chained(p) => if p != INVALID_POS {
                let next_pos = self.keys[pos as usize].next;
                self.keys[p as usize].next = next_pos;
            },
            PrevPos::Unknown => unreachable!()
        }

        // Initialize the bucket.
        let keylink = &mut self.keys[pos as usize];
        self.buckets_in_use += 1;
        keylink.next = INVALID_POS;
        keylink.inner = Some(key);
        self.vals[pos as usize] = Some(value);

        return Ok(pos);
    }
}
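Complementing the resizeable example above, a sketch of the fixed-area path under the same assumptions (crate consumed as neon_shmem; bucket count arbitrary). The dictionary is sized at num_buckets / 0.60 slots per the FILL_FACTOR above, so chains stay short even at full occupancy:

    use std::mem::MaybeUninit;
    use neon_shmem::hash::HashMapInit;

    fn demo_fixed() {
        // estimate_size covers the key/value arrays, the oversized dictionary,
        // and a margin for alignment padding between the carved-out parts.
        let num_buckets = 10_000;
        let bytes = HashMapInit::<u64, u64>::estimate_size(num_buckets);
        let mut area = vec![MaybeUninit::<u8>::uninit(); bytes];
        let _map = HashMapInit::<u64, u64>::with_fixed(num_buckets, &mut area).attach_writer();
    }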
libs/neon-shmem/src/hash/entry.rs (Normal file, 107 lines)
@@ -0,0 +1,107 @@
//! Like std::collections::hash_map::Entry

use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};

use std::hash::Hash;
use std::mem;

pub enum Entry<'a, 'b, K, V> {
    Occupied(OccupiedEntry<'a, 'b, K, V>),
    Vacant(VacantEntry<'a, 'b, K, V>),
}

/// Helper enum representing the previous position within a hashmap chain.
#[derive(Clone, Copy)]
pub(crate) enum PrevPos {
    /// Starting index within the dictionary.
    First(u32),
    /// Regular index within the buckets.
    Chained(u32),
    /// Unknown - e.g. the associated entry was retrieved by index instead of chain.
    Unknown,
}

impl PrevPos {
    /// Unwrap an index from a `PrevPos::First`, panicking otherwise.
    pub fn unwrap_first(&self) -> u32 {
        match self {
            Self::First(i) => *i,
            _ => panic!("not first entry in chain")
        }
    }
}

pub struct OccupiedEntry<'a, 'b, K, V> {
    pub(crate) map: &'b mut CoreHashMap<'a, K, V>,
    /// The key of the occupied entry
    pub(crate) _key: K,
    /// The index of the previous entry in the chain.
    pub(crate) prev_pos: PrevPos,
    /// The position of the bucket in the CoreHashMap's buckets array.
    pub(crate) bucket_pos: u32,
}

impl<'a, 'b, K, V> OccupiedEntry<'a, 'b, K, V> {
    pub fn get(&self) -> &V {
        self.map.vals[self.bucket_pos as usize]
            .as_ref()
            .unwrap()
    }

    pub fn get_mut(&mut self) -> &mut V {
        self.map.vals[self.bucket_pos as usize]
            .as_mut()
            .unwrap()
    }

    pub fn insert(&mut self, value: V) -> V {
        let bucket = &mut self.map.vals[self.bucket_pos as usize];
        // This assumes inner is Some, which it must be for an OccupiedEntry
        let old_value = mem::replace(bucket.as_mut().unwrap(), value);
        old_value
    }

    pub fn remove(self) -> V {
        // Unlink the bucket from its chain, push it on the freelist, and take
        // the value out. The value must be Some for an OccupiedEntry.
        let keylink = &mut self.map.keys[self.bucket_pos as usize];

        // unlink it from the chain
        match self.prev_pos {
            PrevPos::First(dict_pos) => self.map.dictionary[dict_pos as usize] = keylink.next,
            PrevPos::Chained(bucket_pos) => {
                self.map.keys[bucket_pos as usize].next = keylink.next
            },
            PrevPos::Unknown => panic!("can't safely remove entry with unknown previous entry"),
        }

        // and add it to the freelist
        let keylink = &mut self.map.keys[self.bucket_pos as usize];
        keylink.inner = None;
        keylink.next = self.map.free_head;
        let old_value = self.map.vals[self.bucket_pos as usize].take();
        self.map.free_head = self.bucket_pos;
        self.map.buckets_in_use -= 1;

        return old_value.unwrap();
    }
}

pub struct VacantEntry<'a, 'b, K, V> {
    pub(crate) map: &'b mut CoreHashMap<'a, K, V>,
    pub(crate) key: K, // The key to insert
    pub(crate) dict_pos: u32,
}

impl<'a, 'b, K: Clone + Hash + Eq, V> VacantEntry<'a, 'b, K, V> {
    pub fn insert(self, value: V) -> Result<&'b mut V, FullError> {
        let pos = self.map.alloc_bucket(self.key, value)?;
        if pos == INVALID_POS {
            return Err(FullError());
        }
        self.map.keys[pos as usize].next = self.map.dictionary[self.dict_pos as usize];
        self.map.dictionary[self.dict_pos as usize] = pos;

        let result = self.map.vals[pos as usize].as_mut().unwrap();
        return Ok(result);
    }
}
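As with the standard library's Entry, the typical read-modify-write avoids a second lookup; a sketch assuming the public types from this diff (the function name bump is invented, and insertion can fail with FullError because the bucket array is fixed):

    use neon_shmem::hash::HashMapAccess;
    use neon_shmem::hash::entry::Entry;

    fn bump(map: &mut HashMapAccess<u64, u64>, key: u64) {
        let hash = map.get_hash_value(&key);
        match map.entry_with_hash(key, hash) {
            // In-place update through the occupied entry.
            Entry::Occupied(mut e) => { *e.get_mut() += 1; }
            // First sighting: insert may fail when no free bucket exists.
            Entry::Vacant(e) => { e.insert(1).expect("map full"); }
        }
    }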
libs/neon-shmem/src/hash/optim.rs (Normal file, 85 lines)
@@ -0,0 +1,85 @@
//! Adapted from https://github.com/jsnell/parallel-xxhash (TODO: license?)

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

const PRIME32_1: u32 = 2654435761;
const PRIME32_2: u32 = 2246822519;
const PRIME32_3: u32 = 3266489917;
const PRIME32_4: u32 = 668265263;
const PRIME32_5: u32 = 374761393;

/// Rotate each 32-bit lane left by `r` bits. The shift counts are passed in an
/// xmm register so `r` may be a runtime value.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn mm256_rol32(x: __m256i, r: i32) -> __m256i {
    _mm256_or_si256(
        _mm256_sll_epi32(x, _mm_cvtsi32_si128(r)),
        _mm256_srl_epi32(x, _mm_cvtsi32_si128(32 - r)),
    )
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn mm256_fmix32(mut h: __m256i) -> __m256i {
    h = _mm256_xor_si256(h, _mm256_srli_epi32::<15>(h));
    h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_2 as i32));
    h = _mm256_xor_si256(h, _mm256_srli_epi32::<13>(h));
    h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_3 as i32));
    h = _mm256_xor_si256(h, _mm256_srli_epi32::<16>(h));
    h
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn mm256_round(mut seed: __m256i, input: __m256i) -> __m256i {
    seed = _mm256_add_epi32(
        seed,
        _mm256_mullo_epi32(input, _mm256_set1_epi32(PRIME32_2 as i32))
    );
    seed = mm256_rol32(seed, 13);
    seed = _mm256_mullo_epi32(seed, _mm256_set1_epi32(PRIME32_1 as i32));
    seed
}

/// Computes xxHash for 8 keys of size 4*N bytes in column-major order.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn xxhash_many<const N: usize>(keys: *const u32, seed: u32) -> [u32; 8] {
    let mut res = [0u32; 8];
    let mut h = _mm256_set1_epi32(seed.wrapping_add(PRIME32_5) as i32);
    if N >= 4 {
        let mut v1 = _mm256_set1_epi32(seed.wrapping_add(PRIME32_1).wrapping_add(PRIME32_2) as i32);
        let mut v2 = _mm256_set1_epi32(seed.wrapping_add(PRIME32_2) as i32);
        let mut v3 = _mm256_set1_epi32(seed as i32);
        let mut v4 = _mm256_set1_epi32(seed.wrapping_sub(PRIME32_1) as i32);
        let mut i = 0;
        while i < (N & !3) {
            let k1 = _mm256_loadu_si256(keys.add((i + 0) * 8).cast());
            let k2 = _mm256_loadu_si256(keys.add((i + 1) * 8).cast());
            let k3 = _mm256_loadu_si256(keys.add((i + 2) * 8).cast());
            let k4 = _mm256_loadu_si256(keys.add((i + 3) * 8).cast());
            v1 = mm256_round(v1, k1);
            v2 = mm256_round(v2, k2);
            v3 = mm256_round(v3, k3);
            v4 = mm256_round(v4, k4);
            i += 4;
        }
        h = _mm256_add_epi32(
            _mm256_add_epi32(mm256_rol32(v1, 1), mm256_rol32(v2, 7)),
            _mm256_add_epi32(mm256_rol32(v3, 12), mm256_rol32(v4, 18)),
        );
    }

    // Unneeded, keeps bitwise parity with xxhash though.
    h = _mm256_add_epi32(h, _mm256_set1_epi32((N * 4) as i32));

    // Mix in the trailing N % 4 words of each key.
    for i in (N & !3)..N {
        let v = _mm256_loadu_si256(keys.add(i * 8).cast());
        h = _mm256_add_epi32(
            h,
            _mm256_mullo_epi32(v, _mm256_set1_epi32(PRIME32_3 as i32))
        );
        h = _mm256_mullo_epi32(
            mm256_rol32(h, 17),
            _mm256_set1_epi32(PRIME32_4 as i32)
        );
    }

    _mm256_storeu_si256((&mut res as *mut [u32; 8]).cast(), mm256_fmix32(h));
    res
}
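Column-major here means word w of key k sits at keys[w * 8 + k], so each 256-bit load fetches the same word of all eight keys at once. A hypothetical call site (not in this commit, and assuming module scope since xxhash_many is private) that transposes eight 16-byte keys and hashes them in one shot, guarded by a runtime AVX2 check:

    #[cfg(target_arch = "x86_64")]
    fn hash8(keys: &[[u32; 4]; 8], seed: u32) -> [u32; 8] {
        let mut cols = [0u32; 32];
        for (k, key) in keys.iter().enumerate() {
            for (w, word) in key.iter().enumerate() {
                cols[w * 8 + k] = *word; // transpose row-major -> column-major
            }
        }
        assert!(is_x86_feature_detected!("avx2"));
        // SAFETY: AVX2 presence checked above; `cols` holds 4 words for all 8 keys.
        unsafe { xxhash_many::<4>(cols.as_ptr(), seed) }
    }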
libs/neon-shmem/src/hash/tests.rs (Normal file, 382 lines)
@@ -0,0 +1,382 @@
use std::collections::BTreeMap;
use std::collections::HashSet;
use std::fmt::{Debug, Formatter};
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicUsize, Ordering};

use crate::hash::Entry;
use crate::hash::HashMapAccess;
use crate::hash::HashMapInit;

use rand::seq::SliceRandom;
use rand::{Rng, RngCore};
use rand_distr::Zipf;

const TEST_KEY_LEN: usize = 16;

#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct TestKey([u8; TEST_KEY_LEN]);

impl From<&TestKey> for u128 {
    fn from(val: &TestKey) -> u128 {
        u128::from_be_bytes(val.0)
    }
}

impl From<u128> for TestKey {
    fn from(val: u128) -> TestKey {
        TestKey(val.to_be_bytes())
    }
}

impl<'a> From<&'a [u8]> for TestKey {
    fn from(bytes: &'a [u8]) -> TestKey {
        TestKey(bytes.try_into().unwrap())
    }
}

fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
    let mut w = HashMapInit::<TestKey, usize>::new_resizeable_named(
        100000, 120000, "test_inserts"
    ).attach_writer();

    for (idx, k) in keys.iter().enumerate() {
        let hash = w.get_hash_value(&(*k).into());
        let res = w.entry_with_hash((*k).into(), hash);
        match res {
            Entry::Occupied(mut e) => { e.insert(idx); }
            Entry::Vacant(e) => {
                let res = e.insert(idx);
                assert!(res.is_ok());
            },
        };
    }

    for (idx, k) in keys.iter().enumerate() {
        let hash = w.get_hash_value(&(*k).into());
        let x = w.get_with_hash(&(*k).into(), hash);
        let value = x.as_deref().copied();
        assert_eq!(value, Some(idx));
    }
}

#[test]
fn dense() {
    // This exercises splitting a node with a prefix.
    let keys: &[u128] = &[0, 1, 2, 3, 256];
    test_inserts(keys);

    // Dense keys
    let mut keys: Vec<u128> = (0..10000).collect();
    test_inserts(&keys);

    // Do the same in random orders
    for _ in 1..10 {
        keys.shuffle(&mut rand::rng());
        test_inserts(&keys);
    }
}

#[test]
fn sparse() {
    // Sparse keys
    let mut keys: Vec<TestKey> = Vec::new();
    let mut used_keys = HashSet::new();
    for _ in 0..10000 {
        loop {
            let key = rand::random::<u128>();
            if used_keys.contains(&key) {
                continue;
            }
            used_keys.insert(key);
            keys.push(key.into());
            break;
        }
    }
    test_inserts(&keys);
}

struct TestValue(AtomicUsize);

impl TestValue {
    fn new(val: usize) -> TestValue {
        TestValue(AtomicUsize::new(val))
    }

    fn load(&self) -> usize {
        self.0.load(Ordering::Relaxed)
    }
}

impl Clone for TestValue {
    fn clone(&self) -> TestValue {
        TestValue::new(self.load())
    }
}

impl Debug for TestValue {
    fn fmt(&self, fmt: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        write!(fmt, "{:?}", self.load())
    }
}

#[derive(Clone, Debug)]
struct TestOp(TestKey, Option<usize>);

fn apply_op(
    op: &TestOp,
    map: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
) {
    // Apply the change to the shadow tree first.
    let shadow_existing = if let Some(v) = op.1 {
        shadow.insert(op.0, v)
    } else {
        shadow.remove(&op.0)
    };

    let hash = map.get_hash_value(&op.0);
    let entry = map.entry_with_hash(op.0, hash);
    let hash_existing = match op.1 {
        Some(new) => {
            match entry {
                Entry::Occupied(mut e) => Some(e.insert(new)),
                Entry::Vacant(e) => { e.insert(new).unwrap(); None },
            }
        },
        None => {
            match entry {
                Entry::Occupied(e) => Some(e.remove()),
                Entry::Vacant(_) => None,
            }
        },
    };

    assert_eq!(shadow_existing, hash_existing);
}

fn do_random_ops(
    num_ops: usize,
    size: u32,
    insert_prob: f64,
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
    rng: &mut rand::rngs::ThreadRng,
) {
    for i in 0..num_ops {
        let key: TestKey = ((rng.next_u32() % size) as u128).into();
        // Insert with probability 'insert_prob', otherwise delete.
        let op = TestOp(key, if rng.random_bool(insert_prob) { Some(i) } else { None });
        apply_op(&op, writer, shadow);
    }
}

fn do_deletes(
    num_ops: usize,
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
) {
    for _ in 0..num_ops {
        let (k, _) = shadow.pop_first().unwrap();
        let hash = writer.get_hash_value(&k);
        writer.remove_with_hash(&k, hash);
    }
}

fn do_shrink(
    writer: &mut HashMapAccess<TestKey, usize>,
    shadow: &mut BTreeMap<TestKey, usize>,
    _from: u32,
    to: u32,
) {
    writer.begin_shrink(to);
    while writer.get_num_buckets_in_use() > to as usize {
        let (k, _) = shadow.pop_first().unwrap();
        let hash = writer.get_hash_value(&k);
        let entry = writer.entry_with_hash(k, hash);
        if let Entry::Occupied(e) = entry {
            e.remove();
        }
    }
    writer.finish_shrink().unwrap();
}

#[test]
fn random_ops() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        100000, 120000, "test_random"
    ).attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();

    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
    let mut rng = rand::rng();
    for i in 0..100000 {
        let key: TestKey = (rng.sample(distribution) as u128).into();

        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });

        apply_op(&op, &mut writer, &mut shadow);

        if i % 1000 == 0 {
            eprintln!("{i} ops processed");
        }
    }
}

#[test]
fn test_shuffle() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1000, 1200, "test_shuf"
    ).attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
    writer.shuffle();
    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
}

#[test]
fn test_grow() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1000, 2000, "test_grow"
    ).attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
    writer.grow(1500).unwrap();
    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
}

#[test]
fn test_shrink() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1500, 2000, "test_shrink"
    ).attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
    do_shrink(&mut writer, &mut shadow, 1500, 1000);
    do_deletes(500, &mut writer, &mut shadow);
    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
    assert!(writer.get_num_buckets_in_use() <= 1000);
}

#[test]
fn test_shrink_grow_seq() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1000, 20000, "test_grow_seq"
    ).attach_writer();
    let mut shadow: BTreeMap<TestKey, usize> = BTreeMap::new();
    let mut rng = rand::rng();

    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
    eprintln!("Shrinking to 750");
    do_shrink(&mut writer, &mut shadow, 1000, 750);
    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
    eprintln!("Growing to 1500");
    writer.grow(1500).unwrap();
    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
    eprintln!("Shrinking to 200");
    do_shrink(&mut writer, &mut shadow, 1500, 200);
    do_deletes(100, &mut writer, &mut shadow);
    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
    eprintln!("Growing to 10k");
    writer.grow(10000).unwrap();
    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
}

#[test]
fn test_bucket_ops() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1000, 1200, "test_bucket_ops"
    ).attach_writer();
    let hash = writer.get_hash_value(&1.into());
    match writer.entry_with_hash(1.into(), hash) {
        Entry::Occupied(mut e) => { e.insert(2); },
        Entry::Vacant(e) => { e.insert(2).unwrap(); },
    }
    assert_eq!(writer.get_num_buckets_in_use(), 1);
    assert_eq!(writer.get_num_buckets(), 1000);
    assert_eq!(writer.get_with_hash(&1.into(), hash), Some(&2));
    match writer.entry_with_hash(1.into(), hash) {
        Entry::Occupied(e) => {
            assert_eq!(e._key, 1.into());
            let pos = e.bucket_pos as usize;
            assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
            assert_eq!(writer.get_at_bucket(pos), Some(&(1.into(), 2)));
        },
        Entry::Vacant(_) => { panic!("Insert didn't affect entry"); },
    }
    writer.remove_with_hash(&1.into(), hash);
    assert_eq!(writer.get_with_hash(&1.into(), hash), None);
}

#[test]
fn test_shrink_zero() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1500, 2000, "test_shrink_zero"
    ).attach_writer();
    writer.begin_shrink(0);
    for i in 0..1500 {
        if let Some(e) = writer.entry_at_bucket(i) {
            e.remove();
        }
    }
    writer.finish_shrink().unwrap();
    assert_eq!(writer.get_num_buckets_in_use(), 0);
    let hash = writer.get_hash_value(&1.into());
    let entry = writer.entry_with_hash(1.into(), hash);
    if let Entry::Vacant(v) = entry {
        assert!(v.insert(2).is_err());
    } else {
        panic!("Somehow got a non-vacant entry in an empty map.")
    }
    writer.grow(50).unwrap();
    let entry = writer.entry_with_hash(1.into(), hash);
    if let Entry::Vacant(v) = entry {
        assert!(v.insert(2).is_ok());
    } else {
        panic!("Somehow got a non-vacant entry in an empty map.")
    }
    assert_eq!(writer.get_num_buckets_in_use(), 1);
}

#[test]
#[should_panic]
fn test_grow_oom() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1500, 2000, "test_grow_oom"
    ).attach_writer();
    writer.grow(20000).unwrap();
}

#[test]
#[should_panic]
fn test_shrink_bigger() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1500, 2500, "test_shrink_bigger"
    ).attach_writer();
    writer.begin_shrink(2000);
}

#[test]
#[should_panic]
fn test_shrink_early_finish() {
    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(
        1500, 2500, "test_shrink_early_finish"
    ).attach_writer();
    writer.finish_shrink().unwrap();
}

#[test]
#[should_panic]
fn test_shrink_fixed_size() {
    let mut area = [MaybeUninit::uninit(); 10000];
    let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
    let mut writer = init_struct.attach_writer();
    writer.begin_shrink(1);
}
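The insert-or-update pattern these tests exercise reduces to a short sketch. The `Entry` API names are taken from the tests above; the key type, sizes, and map name here are placeholders:

```rust
use crate::hash::{Entry, HashMapInit};

fn upsert_sketch() {
    // 1000 buckets now, resizable up to 1200; the name identifies the shmem area.
    let mut w = HashMapInit::<u128, usize>::new_resizeable_named(1000, 1200, "sketch")
        .attach_writer();

    let key = 42u128;
    let hash = w.get_hash_value(&key);
    match w.entry_with_hash(key, hash) {
        // Overwrite an existing value; insert() returns the old one.
        Entry::Occupied(mut e) => { e.insert(7); }
        // Insert into an empty bucket; this can fail with Err when the map is full.
        Entry::Vacant(e) => { e.insert(7).expect("map full"); }
    }
}
```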
@@ -1,418 +1,4 @@
//! Shared memory utilities for neon communicator

pub mod hash;
pub mod shmem;
418  libs/neon-shmem/src/shmem.rs  Normal file
@@ -0,0 +1,418 @@
//! Dynamically resizable contiguous chunk of shared memory

use std::num::NonZeroUsize;
use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
use std::ptr::NonNull;
use std::sync::atomic::{AtomicUsize, Ordering};

use nix::errno::Errno;
use nix::sys::mman::MapFlags;
use nix::sys::mman::ProtFlags;
use nix::sys::mman::mmap as nix_mmap;
use nix::sys::mman::munmap as nix_munmap;
use nix::unistd::ftruncate as nix_ftruncate;

/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
/// Unlike shared memory allocated by Postgres, this area is resizable, up to the 'max_size'
/// specified at creation.
///
/// The area is backed by an anonymous file created with memfd_create(). The full address space for
/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
/// future.
pub struct ShmemHandle {
    /// memfd file descriptor
    fd: OwnedFd,

    max_size: usize,

    // Pointer to the beginning of the shared memory area. The header is stored there.
    shared_ptr: NonNull<SharedStruct>,

    // Pointer to the beginning of the user data
    pub data_ptr: NonNull<u8>,
}

/// This is stored at the beginning of the shared memory area.
struct SharedStruct {
    max_size: usize,

    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS
    /// flag.
    current_size: AtomicUsize,
}

const RESIZE_IN_PROGRESS: usize = 1 << 63;

const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();

/// Error type returned by the ShmemHandle functions.
#[derive(thiserror::Error, Debug)]
#[error("{msg}: {errno}")]
pub struct Error {
    pub msg: String,
    pub errno: Errno,
}

impl Error {
    fn new(msg: &str, errno: Errno) -> Error {
        Error {
            msg: msg.to_string(),
            errno,
        }
    }
}

impl ShmemHandle {
    /// Create a new shared memory area. To communicate between processes, the processes need to be
    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
    ///
    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
    /// processes can continue using it, however.
    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
        // Create the backing anonymous file.
        let fd = create_backing_file(name)?;

        Self::new_with_fd(fd, initial_size, max_size)
    }

    fn new_with_fd(
        fd: OwnedFd,
        initial_size: usize,
        max_size: usize,
    ) -> Result<ShmemHandle, Error> {
        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
        // is a little larger than this because of the SharedStruct header. Make the upper limit
        // somewhat smaller than that, because with anything close to that, you'll run out of
        // memory anyway.
        if max_size >= 1 << 48 {
            panic!("max size {max_size} too large");
        }
        if initial_size > max_size {
            panic!("initial size {initial_size} larger than max size {max_size}");
        }

        // The actual initial/max size is the one given by the caller, plus the size of
        // 'SharedStruct'.
        let initial_size = HEADER_SIZE + initial_size;
        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();

        // Reserve address space for it with mmap
        //
        // TODO: Use MAP_HUGETLB if possible
        let start_ptr = unsafe {
            nix_mmap(
                None,
                max_size,
                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
                MapFlags::MAP_SHARED,
                &fd,
                0,
            )
        }
        .map_err(|e| Error::new("mmap failed", e))?;

        // Reserve space for the initial size
        enlarge_file(fd.as_fd(), initial_size as u64)?;

        // Initialize the header
        let shared: NonNull<SharedStruct> = start_ptr.cast();
        unsafe {
            shared.write(SharedStruct {
                max_size: max_size.into(),
                current_size: AtomicUsize::new(initial_size),
            })
        };

        // The user data begins after the header
        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };

        Ok(ShmemHandle {
            fd,
            max_size: max_size.into(),
            shared_ptr: shared,
            data_ptr,
        })
    }

    // Return a reference to the header.
    fn shared(&self) -> &SharedStruct {
        unsafe { self.shared_ptr.as_ref() }
    }

    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
    /// when creating the area.
    ///
    /// This may only be called from one process/thread concurrently. We detect that case
    /// and return an Error.
    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
        let new_size = new_size + HEADER_SIZE;
        let shared = self.shared();

        if new_size > self.max_size {
            panic!(
                "new size ({}) is greater than max size ({})",
                new_size, self.max_size
            );
        }
        assert_eq!(self.max_size, shared.max_size);

        // Lock the area by setting the bit in 'current_size'.
        //
        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
        // since this is not performance-critical, better safe than sorry.
        let mut old_size = shared.current_size.load(Ordering::Acquire);
        loop {
            if (old_size & RESIZE_IN_PROGRESS) != 0 {
                return Err(Error::new(
                    "concurrent resize detected",
                    Errno::UnknownErrno,
                ));
            }
            match shared.current_size.compare_exchange(
                old_size,
                new_size | RESIZE_IN_PROGRESS,
                Ordering::Acquire,
                Ordering::Relaxed,
            ) {
                Ok(_) => break,
                Err(x) => old_size = x,
            }
        }

        // Ok, we got the lock.
        //
        // NB: If anything goes wrong, we *must* clear the bit!
        let result = {
            use std::cmp::Ordering::{Equal, Greater, Less};
            match new_size.cmp(&old_size) {
                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
                    Error::new("could not shrink shmem segment, ftruncate failed", e)
                }),
                Equal => Ok(()),
                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
            }
        };

        // Unlock
        shared.current_size.store(
            if result.is_ok() { new_size } else { old_size },
            Ordering::Release,
        );

        result
    }

    /// Returns the current user-visible size of the shared memory segment.
    ///
    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
    /// responsibility not to access the area beyond the current size.
    pub fn current_size(&self) -> usize {
        let total_current_size =
            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
        total_current_size - HEADER_SIZE
    }
}

impl Drop for ShmemHandle {
    fn drop(&mut self) {
        // SAFETY: The pointer was obtained from mmap() with the given size.
        // We unmap the entire region.
        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
        // The fd is dropped automatically by OwnedFd.
    }
}

/// Create a "backing file" for the shared memory area. On Linux, use memfd_create() to create an
/// anonymous in-memory file. On macOS, fall back to a regular file. That's good enough for
/// development and testing, but in production we want the file to stay in memory.
///
/// Disable 'unused_variables' warnings, because 'name' is unused in the macOS path.
#[allow(unused_variables)]
fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
    #[cfg(not(target_os = "macos"))]
    {
        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
            .map_err(|e| Error::new("memfd_create failed", e))
    }
    #[cfg(target_os = "macos")]
    {
        let file = tempfile::tempfile().map_err(|e| {
            Error::new(
                "could not create temporary file to back shmem area",
                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
            )
        })?;
        Ok(OwnedFd::from(file))
    }
}

fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
    // we don't get a segfault later when trying to actually use it.
    #[cfg(not(target_os = "macos"))]
    {
        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
            Error::new(
                "could not grow shmem segment, posix_fallocate failed",
                e,
            )
        })
    }
    // As a fallback on macOS, which doesn't have posix_fallocate, use plain ftruncate()
    #[cfg(target_os = "macos")]
    {
        nix::unistd::ftruncate(fd, size as i64)
            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use nix::unistd::ForkResult;
    use std::ops::Range;

    /// Check that all bytes in the given range have the expected value.
    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
        for i in range {
            let b = unsafe { *(ptr.add(i)) };
            assert_eq!(expected, b, "unexpected byte at offset {}", i);
        }
    }

    /// Write 'b' to all bytes in the given range.
    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
    }

    // Simple single-process test of growing and shrinking.
    #[test]
    fn test_shmem_resize() -> Result<(), Error> {
        let max_size = 1024 * 1024;
        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;

        assert_eq!(init_struct.current_size(), 0);

        // Initial grow
        let size1 = 10000;
        init_struct.set_size(size1).unwrap();
        assert_eq!(init_struct.current_size(), size1);

        // Write some data
        let data_ptr = init_struct.data_ptr.as_ptr();
        write_range(data_ptr, 0xAA, 0..size1);
        assert_range(data_ptr, 0xAA, 0..size1);

        // Shrink
        let size2 = 5000;
        init_struct.set_size(size2).unwrap();
        assert_eq!(init_struct.current_size(), size2);

        // Grow again
        let size3 = 20000;
        init_struct.set_size(size3).unwrap();
        assert_eq!(init_struct.current_size(), size3);

        // Try to read it. The area that was shrunk and grown again should read as all zeros now.
        assert_range(data_ptr, 0xAA, 0..5000);
        assert_range(data_ptr, 0, 5000..size1);

        // Try to grow beyond max_size
        //let size4 = max_size + 1;
        //assert!(init_struct.set_size(size4).is_err());

        // Dropping init_struct should unmap the memory
        drop(init_struct);

        Ok(())
    }

    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
    /// but is stored in the shared memory area and works across processes. It's implemented by
    /// polling, because e.g. standard Rust mutexes are not guaranteed to work across processes.
    struct SimpleBarrier {
        num_procs: usize,
        count: AtomicUsize,
    }

    impl SimpleBarrier {
        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
            unsafe {
                *ptr = SimpleBarrier {
                    num_procs,
                    count: AtomicUsize::new(0),
                }
            }
        }

        pub fn wait(&self) {
            let old = self.count.fetch_add(1, Ordering::Relaxed);

            let generation = old / self.num_procs;

            let mut current = old + 1;
            while current < (generation + 1) * self.num_procs {
                std::thread::sleep(std::time::Duration::from_millis(10));
                current = self.count.load(Ordering::Relaxed);
            }
        }
    }

    #[test]
    fn test_multi_process() {
        // Initialize
        let max_size = 1_000_000_000_000;
        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
        let ptr = init_struct.data_ptr.as_ptr();

        // Store the SimpleBarrier in the first 1k of the area.
        init_struct.set_size(10000).unwrap();
        let barrier_ptr: *mut SimpleBarrier = unsafe {
            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
                .cast()
        };
        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };

        // Fork another test process. The code after this runs in both processes concurrently.
        let fork_result = unsafe { nix::unistd::fork().unwrap() };

        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000.
        if fork_result.is_parent() {
            write_range(ptr, 0xAA, 1000..2000);
        } else {
            write_range(ptr, 0xBB, 2000..3000);
        }
        barrier.wait();
        // Verify the contents. (in both processes)
        assert_range(ptr, 0xAA, 1000..2000);
        assert_range(ptr, 0xBB, 2000..3000);

        // Grow, from the child this time
        let size = 10_000_000;
        if !fork_result.is_parent() {
            init_struct.set_size(size).unwrap();
        }
        barrier.wait();

        // Make some writes at the end.
        if fork_result.is_parent() {
            write_range(ptr, 0xAA, (size - 10)..size);
        } else {
            write_range(ptr, 0xBB, (size - 20)..(size - 10));
        }
        barrier.wait();

        // Verify the contents. (This runs in both processes)
        assert_range(ptr, 0, (size - 1000)..(size - 20));
        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
        assert_range(ptr, 0xAA, (size - 10)..size);

        if let ForkResult::Parent { child } = fork_result {
            nix::sys::wait::waitpid(child, None).unwrap();
        }
    }
}
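Putting the `ShmemHandle` API above together, here is a minimal single-process usage sketch, assuming the crate is consumed as `neon_shmem` and with error handling elided:

```rust
use neon_shmem::shmem::ShmemHandle;

fn shmem_sketch() -> Result<(), neon_shmem::shmem::Error> {
    // Reserve 1 MiB of address space up-front, start with a 4 KiB backing file.
    let shmem = ShmemHandle::new("sketch", 4096, 1024 * 1024)?;
    assert_eq!(shmem.current_size(), 4096);

    // Write through the raw pointer, staying below current_size().
    unsafe { std::ptr::write_bytes(shmem.data_ptr.as_ptr(), 0xAB, 4096) };

    // Grow the backing file; the mapping itself was reserved up-front and never moves.
    shmem.set_size(64 * 1024)?;
    assert_eq!(shmem.current_size(), 64 * 1024);
    Ok(())
}
```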
@@ -20,7 +20,6 @@ use postgres_backend::AuthType;
use remote_storage::RemoteStorageConfig;
use serde_with::serde_as;
use utils::logging::LogFormat;
use utils::postgres_client::PostgresClientProtocol;

use crate::models::{ImageCompressionAlgorithm, LsnLease};

@@ -189,7 +188,6 @@ pub struct ConfigToml {
    pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub no_sync: Option<bool>,
    pub wal_receiver_protocol: PostgresClientProtocol,
    pub page_service_pipelining: PageServicePipeliningConfig,
    pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
    pub enable_read_path_debugging: Option<bool>,
@@ -330,6 +328,8 @@ pub struct TimelineImportConfig {
    pub import_job_concurrency: NonZeroUsize,
    pub import_job_soft_size_limit: NonZeroUsize,
    pub import_job_checkpoint_threshold: NonZeroUsize,
    /// Max size of the remote storage partial read done by any job
    pub import_job_max_byte_range_size: NonZeroUsize,
}

#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -525,8 +525,6 @@ pub struct TenantConfigToml {
    /// (either this flag or the pageserver-global one need to be set)
    pub timeline_offloading: bool,

    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,

    /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
    /// `index_part.json`, and it cannot be reversed.
    pub rel_size_v2_enabled: bool,
@@ -607,12 +605,6 @@ pub mod defaults {

    pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;

    pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
        utils::postgres_client::PostgresClientProtocol::Interpreted {
            format: utils::postgres_client::InterpretedFormat::Protobuf,
            compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
        };

    pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
    pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
}
@@ -711,7 +703,6 @@ impl Default for ConfigToml {
            virtual_file_io_mode: None,
            tenant_config: TenantConfigToml::default(),
            no_sync: None,
            wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
            page_service_pipelining: PageServicePipeliningConfig::Pipelined(
                PageServicePipeliningConfigPipelined {
                    max_batch_size: NonZeroUsize::new(32).unwrap(),
@@ -735,6 +726,7 @@ impl Default for ConfigToml {
                import_job_concurrency: NonZeroUsize::new(32).unwrap(),
                import_job_soft_size_limit: NonZeroUsize::new(256 * 1024 * 1024).unwrap(),
                import_job_checkpoint_threshold: NonZeroUsize::new(32).unwrap(),
                import_job_max_byte_range_size: NonZeroUsize::new(4 * 1024 * 1024).unwrap(),
            },
            basebackup_cache_config: None,
            posthog_config: None,
@@ -855,7 +847,6 @@ impl Default for TenantConfigToml {
            lsn_lease_length: LsnLease::DEFAULT_LENGTH,
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
            timeline_offloading: true,
            wal_receiver_protocol_override: None,
            rel_size_v2_enabled: false,
            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
            gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,

@@ -20,7 +20,6 @@ use serde_with::serde_as;
pub use utilization::PageserverUtilization;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::postgres_client::PostgresClientProtocol;
use utils::{completion, serde_system_time};

use crate::config::Ratio;

@@ -622,8 +621,6 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub timeline_offloading: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub wal_receiver_protocol_override: FieldPatch<PostgresClientProtocol>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub rel_size_v2_enabled: FieldPatch<bool>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_enabled: FieldPatch<bool>,
@@ -748,9 +745,6 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timeline_offloading: Option<bool>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub rel_size_v2_enabled: Option<bool>,

@@ -812,7 +806,6 @@ impl TenantConfig {
            mut lsn_lease_length,
            mut lsn_lease_length_for_ts,
            mut timeline_offloading,
            mut wal_receiver_protocol_override,
            mut rel_size_v2_enabled,
            mut gc_compaction_enabled,
            mut gc_compaction_verification,
@@ -905,9 +898,6 @@ impl TenantConfig {
            .map(|v| humantime::parse_duration(&v))?
            .apply(&mut lsn_lease_length_for_ts);
        patch.timeline_offloading.apply(&mut timeline_offloading);
        patch
            .wal_receiver_protocol_override
            .apply(&mut wal_receiver_protocol_override);
        patch.rel_size_v2_enabled.apply(&mut rel_size_v2_enabled);
        patch
            .gc_compaction_enabled
@@ -960,7 +950,6 @@ impl TenantConfig {
            lsn_lease_length,
            lsn_lease_length_for_ts,
            timeline_offloading,
            wal_receiver_protocol_override,
            rel_size_v2_enabled,
            gc_compaction_enabled,
            gc_compaction_verification,
@@ -1058,9 +1047,6 @@ impl TenantConfig {
            timeline_offloading: self
                .timeline_offloading
                .unwrap_or(global_conf.timeline_offloading),
            wal_receiver_protocol_override: self
                .wal_receiver_protocol_override
                .or(global_conf.wal_receiver_protocol_override),
            rel_size_v2_enabled: self
                .rel_size_v2_enabled
                .unwrap_or(global_conf.rel_size_v2_enabled),

@@ -6,7 +6,7 @@ use arc_swap::ArcSwap;
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, info_span};

use crate::{FeatureStore, PostHogClient, PostHogClientConfig};
use crate::{CaptureEvent, FeatureStore, PostHogClient, PostHogClientConfig};

/// A background loop that fetches feature flags from PostHog and updates the feature store.
pub struct FeatureResolverBackgroundLoop {
@@ -24,9 +24,16 @@ impl FeatureResolverBackgroundLoop {
        }
    }

    pub fn spawn(self: Arc<Self>, handle: &tokio::runtime::Handle, refresh_period: Duration) {
    pub fn spawn(
        self: Arc<Self>,
        handle: &tokio::runtime::Handle,
        refresh_period: Duration,
        fake_tenants: Vec<CaptureEvent>,
    ) {
        let this = self.clone();
        let cancel = self.cancel.clone();

        // Main loop of updating the feature flags.
        handle.spawn(
            async move {
                tracing::info!("Starting PostHog feature resolver");
@@ -56,6 +63,22 @@ impl FeatureResolverBackgroundLoop {
            }
            .instrument(info_span!("posthog_feature_resolver")),
        );

        // Report fake tenants to PostHog so that we have the combination of all the properties in the UI.
        // Do one report per pageserver restart.
        let this = self.clone();
        handle.spawn(
            async move {
                tracing::info!("Starting PostHog feature reporter");
                for tenant in &fake_tenants {
                    tracing::info!("Reporting fake tenant: {:?}", tenant);
                }
                if let Err(e) = this.posthog_client.capture_event_batch(&fake_tenants).await {
                    tracing::warn!("Cannot report fake tenants: {}", e);
                }
            }
            .instrument(info_span!("posthog_feature_reporter")),
        );
    }

    pub fn feature_store(&self) -> Arc<FeatureStore> {
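A sketch of how a caller might wire up the new `spawn` signature; the resolver, runtime handle, and event values here are hypothetical, only the signature comes from the diff above:

```rust
// Hypothetical wiring for illustration only.
let fake_tenants = vec![CaptureEvent {
    event: "fake_tenant".to_string(),
    distinct_id: "tenant-0000".to_string(),
    properties: serde_json::json!({ "plan": "free" }),
}];
resolver.spawn(runtime.handle(), Duration::from_secs(60), fake_tenants);
```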
@@ -22,6 +22,16 @@ pub enum PostHogEvaluationError {
    Internal(String),
}

impl PostHogEvaluationError {
    pub fn as_variant_str(&self) -> &'static str {
        match self {
            PostHogEvaluationError::NotAvailable(_) => "not_available",
            PostHogEvaluationError::NoConditionGroupMatched => "no_condition_group_matched",
            PostHogEvaluationError::Internal(_) => "internal",
        }
    }
}

#[derive(Deserialize)]
pub struct LocalEvaluationResponse {
    pub flags: Vec<LocalEvaluationFlag>,
@@ -54,7 +64,7 @@ pub struct LocalEvaluationFlagFilterProperty {
    operator: String,
}

#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(untagged)]
pub enum PostHogFlagFilterPropertyValue {
    String(String),
@@ -497,6 +507,13 @@ pub struct PostHogClient {
    client: reqwest::Client,
}

#[derive(Serialize, Debug)]
pub struct CaptureEvent {
    pub event: String,
    pub distinct_id: String,
    pub properties: serde_json::Value,
}

impl PostHogClient {
    pub fn new(config: PostHogClientConfig) -> Self {
        let client = reqwest::Client::new();
@@ -560,12 +577,12 @@ impl PostHogClient {
        &self,
        event: &str,
        distinct_id: &str,
        properties: &HashMap<String, PostHogFlagFilterPropertyValue>,
        properties: &serde_json::Value,
    ) -> anyhow::Result<()> {
        // PUBLIC_URL/capture/
        // with bearer token of self.client_api_key
        let url = format!("{}/capture/", self.config.public_api_url);
        self.client
        let response = self
            .client
            .post(url)
            .body(serde_json::to_string(&json!({
                "api_key": self.config.client_api_key,
@@ -575,6 +592,39 @@ impl PostHogClient {
            }))?)
            .send()
            .await?;
        let status = response.status();
        let body = response.text().await?;
        if !status.is_success() {
            return Err(anyhow::anyhow!(
                "Failed to capture events: {}, {}",
                status,
                body
            ));
        }
        Ok(())
    }

    pub async fn capture_event_batch(&self, events: &[CaptureEvent]) -> anyhow::Result<()> {
        // PUBLIC_URL/batch/
        let url = format!("{}/batch/", self.config.public_api_url);
        let response = self
            .client
            .post(url)
            .body(serde_json::to_string(&json!({
                "api_key": self.config.client_api_key,
                "batch": events,
            }))?)
            .send()
            .await?;
        let status = response.status();
        let body = response.text().await?;
        if !status.is_success() {
            return Err(anyhow::anyhow!(
                "Failed to capture events: {}, {}",
                status,
                body
            ));
        }
        Ok(())
    }
}
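Usage of the new batch endpoint is symmetric with the single-event path. A sketch, where the client construction and event values are hypothetical and only `capture_event_batch()` comes from the code above:

```rust
// Hypothetical events for illustration only.
let events = vec![CaptureEvent {
    event: "pageserver_restart".to_string(),
    distinct_id: "pageserver-1".to_string(),
    properties: serde_json::json!({ "region": "local-test" }),
}];
if let Err(e) = client.capture_event_batch(&events).await {
    tracing::warn!("could not report events: {e}");
}
```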
@@ -439,6 +439,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
        currentClusterSize: crate::bindings::pg_atomic_uint64 { value: 0 },
        shard_ps_feedback: [empty_feedback; 128],
        num_shards: 0,
        replica_promote: false,
        min_ps_feedback: empty_feedback,
    }
}
@@ -9,6 +9,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
bytes.workspace = true
camino.workspace = true
clap.workspace = true
futures.workspace = true
@@ -1,4 +1,4 @@
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
use std::future::Future;
|
||||
use std::num::NonZeroUsize;
|
||||
use std::pin::Pin;
|
||||
@@ -8,12 +8,12 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpaceAccum;
|
||||
use pageserver_api::models::{
|
||||
PagestreamGetPageRequest, PagestreamGetPageResponse, PagestreamRequest,
|
||||
};
|
||||
use pageserver_api::models::{PagestreamGetPageRequest, PagestreamRequest};
|
||||
use pageserver_api::reltag::RelTag;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_page_api::proto;
|
||||
use rand::prelude::*;
|
||||
@@ -77,6 +77,16 @@ pub(crate) struct Args {
|
||||
#[clap(long, default_value = "1")]
|
||||
queue_depth: NonZeroUsize,
|
||||
|
||||
/// Batch size of contiguous pages generated by each client. This is equivalent to how Postgres
|
||||
/// will request page batches (e.g. prefetches or vectored reads). A batch counts as 1 RPS and
|
||||
/// 1 queue depth.
|
||||
///
|
||||
/// The libpq protocol does not support client-side batching, and will submit batches as many
|
||||
/// individual requests, in the hope that the server will batch them. Each batch still counts as
|
||||
/// 1 RPS and 1 queue depth.
|
||||
#[clap(long, default_value = "1")]
|
||||
batch_size: NonZeroUsize,
|
||||
|
||||
#[clap(long)]
|
||||
only_relnode: Option<u32>,
|
||||
|
||||
@@ -392,7 +402,16 @@ async fn run_worker(
|
||||
shared_state.start_work_barrier.wait().await;
|
||||
let client_start = Instant::now();
|
||||
let mut ticks_processed = 0;
|
||||
let mut inflight = VecDeque::new();
|
||||
let mut req_id = 0;
|
||||
let batch_size: usize = args.batch_size.into();
|
||||
|
||||
// Track inflight requests by request ID and start time. This times the request duration, and
|
||||
// ensures responses match requests. We don't expect responses back in any particular order.
//
// NB: this does not check that all requests received a response, because we don't wait for the
// inflight requests to complete when the duration elapses.
let mut inflight: HashMap<u64, Instant> = HashMap::new();

while !cancel.is_cancelled() {
// Detect if a request took longer than the RPS rate
if let Some(period) = &rps_period {
@@ -408,36 +427,72 @@ async fn run_worker(
}

while inflight.len() < args.queue_depth.get() {
req_id += 1;
let start = Instant::now();
let req = {
let (req_lsn, mod_lsn, rel, blks) = {
/// Converts a compact i128 key to a relation tag and block number.
fn key_to_block(key: i128) -> (RelTag, u32) {
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
key.to_rel_block()
.expect("we filter non-rel-block keys out above")
}

// Pick a random page from a random relation.
let mut rng = rand::thread_rng();
let r = &ranges[weights.sample(&mut rng)];
let key: i128 = rng.gen_range(r.start..r.end);
let key = Key::from_i128(key);
assert!(key.is_rel_block_key());
let (rel_tag, block_no) = key
.to_rel_block()
.expect("we filter non-rel-block keys out above");
PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid: 0,
request_lsn: if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
},
not_modified_since: r.timeline_lsn,
},
rel: rel_tag,
blkno: block_no,
let (rel_tag, block_no) = key_to_block(key);

let mut blks = VecDeque::with_capacity(batch_size);
blks.push_back(block_no);

// If requested, populate a batch of sequential pages. This is how Postgres will
// request page batches (e.g. prefetches). If we hit the end of the relation, we
// grow the batch towards the start too.
for i in 1..batch_size {
let (r, b) = key_to_block(key + i as i128);
if r != rel_tag {
break; // went outside relation
}
blks.push_back(b)
}

if blks.len() < batch_size {
// Grow batch backwards if needed.
for i in 1..batch_size {
let (r, b) = key_to_block(key - i as i128);
if r != rel_tag {
break; // went outside relation
}
blks.push_front(b)
}
}

// We assume that the entire batch can fit within the relation.
assert_eq!(blks.len(), batch_size, "incomplete batch");

let req_lsn = if rng.gen_bool(args.req_latest_probability) {
Lsn::MAX
} else {
r.timeline_lsn
};
(req_lsn, r.timeline_lsn, rel_tag, blks.into())
};
client.send_get_page(req).await.unwrap();
inflight.push_back(start);
client
.send_get_page(req_id, req_lsn, mod_lsn, rel, blks)
.await
.unwrap();
let old = inflight.insert(req_id, start);
assert!(old.is_none(), "duplicate request ID {req_id}");
}

let start = inflight.pop_front().unwrap();
client.recv_get_page().await.unwrap();
let (req_id, pages) = client.recv_get_page().await.unwrap();
assert_eq!(pages.len(), batch_size, "unexpected page count");
assert!(pages.iter().all(|p| !p.is_empty()), "empty page");
let start = inflight
.remove(&req_id)
.expect("response for unknown request ID");
let end = Instant::now();
shared_state.live_stats.request_done();
ticks_processed += 1;
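The bookkeeping above is the whole correctness story for the benchmark: every request ID is inserted exactly once on send and removed exactly once on receive. A minimal sketch of the same pattern, independent of the pageserver types (the IDs and flow here are illustrative, not from the patch):

```rust
use std::collections::HashMap;
use std::time::Instant;

fn main() {
    // Track the start time of each in-flight request by its unique ID.
    let mut inflight: HashMap<u64, Instant> = HashMap::new();

    // On send: the ID must be fresh, otherwise a response could match two requests.
    let req_id: u64 = 42;
    let old = inflight.insert(req_id, Instant::now());
    assert!(old.is_none(), "duplicate request ID {req_id}");

    // On receive: the ID must be known, otherwise the response is orphaned.
    let start = inflight.remove(&req_id).expect("response for unknown request ID");
    println!("latency: {:?}", start.elapsed());
}
```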
@@ -467,15 +522,24 @@ async fn run_worker(
#[async_trait]
trait Client: Send {
/// Sends an asynchronous GetPage request to the pageserver.
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()>;
async fn send_get_page(
&mut self,
req_id: u64,
req_lsn: Lsn,
mod_lsn: Lsn,
rel: RelTag,
blks: Vec<u32>,
) -> anyhow::Result<()>;

/// Receives the next GetPage response from the pageserver.
async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse>;
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)>;
}
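Both transports implement the same contract: one `send_get_page` call is answered by exactly one `recv_get_page` result carrying the same request ID and one page image per requested block. With a queue depth of one, a hypothetical driver over the trait (the `drive` helper is illustrative, not part of the patch) would look like:

```rust
// Hypothetical helper, assuming the `Client` trait above plus the `Lsn`,
// `RelTag` and `anyhow` types it references.
async fn drive(
    client: &mut dyn Client,
    req_id: u64,
    req_lsn: Lsn,
    mod_lsn: Lsn,
    rel: RelTag,
    blks: Vec<u32>,
) -> anyhow::Result<()> {
    let expected = blks.len();
    client.send_get_page(req_id, req_lsn, mod_lsn, rel, blks).await?;
    // The response must echo the request ID and return one page per block.
    let (resp_id, pages) = client.recv_get_page().await?;
    anyhow::ensure!(resp_id == req_id, "request ID mismatch");
    anyhow::ensure!(pages.len() == expected, "unexpected page count");
    Ok(())
}
```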

/// A libpq-based Pageserver client.
struct LibpqClient {
inner: pageserver_client::page_service::PagestreamClient,
// Track sent batches, so we know how many responses to expect.
batch_sizes: VecDeque<usize>,
}

impl LibpqClient {
@@ -484,18 +548,55 @@ impl LibpqClient {
.await?
.pagestream(ttid.tenant_id, ttid.timeline_id)
.await?;
Ok(Self { inner })
Ok(Self {
inner,
batch_sizes: VecDeque::new(),
})
}
}

#[async_trait]
impl Client for LibpqClient {
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
self.inner.getpage_send(req).await
async fn send_get_page(
&mut self,
req_id: u64,
req_lsn: Lsn,
mod_lsn: Lsn,
rel: RelTag,
blks: Vec<u32>,
) -> anyhow::Result<()> {
// libpq doesn't support client-side batches, so we send a bunch of individual requests
// instead in the hope that the server will batch them for us. We use the same request ID
// for all, because we'll return a single batch response.
self.batch_sizes.push_back(blks.len());
for blkno in blks {
let req = PagestreamGetPageRequest {
hdr: PagestreamRequest {
reqid: req_id,
request_lsn: req_lsn,
not_modified_since: mod_lsn,
},
rel,
blkno,
};
self.inner.getpage_send(req).await?;
}
Ok(())
}

async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
self.inner.getpage_recv().await
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
let batch_size = self.batch_sizes.pop_front().unwrap();
let mut batch = Vec::with_capacity(batch_size);
let mut req_id = None;
for _ in 0..batch_size {
let resp = self.inner.getpage_recv().await?;
if req_id.is_none() {
req_id = Some(resp.req.hdr.reqid);
}
assert_eq!(req_id, Some(resp.req.hdr.reqid), "request ID mismatch");
batch.push(resp.page);
}
Ok((req_id.unwrap(), batch))
}
}

@@ -532,31 +633,35 @@ impl GrpcClient {

#[async_trait]
impl Client for GrpcClient {
async fn send_get_page(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> {
async fn send_get_page(
&mut self,
req_id: u64,
req_lsn: Lsn,
mod_lsn: Lsn,
rel: RelTag,
blks: Vec<u32>,
) -> anyhow::Result<()> {
let req = proto::GetPageRequest {
request_id: 0,
request_id: req_id,
request_class: proto::GetPageClass::Normal as i32,
read_lsn: Some(proto::ReadLsn {
request_lsn: req.hdr.request_lsn.0,
not_modified_since_lsn: req.hdr.not_modified_since.0,
request_lsn: req_lsn.0,
not_modified_since_lsn: mod_lsn.0,
}),
rel: Some(req.rel.into()),
block_number: vec![req.blkno],
rel: Some(rel.into()),
block_number: blks,
};
self.req_tx.send(req).await?;
Ok(())
}

async fn recv_get_page(&mut self) -> anyhow::Result<PagestreamGetPageResponse> {
async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
let resp = self.resp_rx.message().await?.unwrap();
anyhow::ensure!(
resp.status_code == proto::GetPageStatusCode::Ok as i32,
"unexpected status code: {}",
resp.status_code
);
Ok(PagestreamGetPageResponse {
page: resp.page_image[0].clone(),
req: PagestreamGetPageRequest::default(), // dummy
})
Ok((resp.request_id, resp.page_image))
}
}

@@ -158,7 +158,6 @@ fn main() -> anyhow::Result<()> {
// (maybe we should automate this with a visitor?).
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation");
info!(?conf.page_service_pipelining, "starting with page service pipelining config");
info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");
@@ -819,6 +818,7 @@ fn start_pageserver(
tenant_manager.clone(),
grpc_auth,
otel_guard.as_ref().map(|g| g.dispatch.clone()),
conf.get_vectored_concurrent_io,
grpc_listener,
)?);
}

@@ -27,7 +27,6 @@ use reqwest::Url;
use storage_broker::Uri;
use utils::id::{NodeId, TimelineId};
use utils::logging::{LogFormat, SecretString};
use utils::postgres_client::PostgresClientProtocol;

use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -211,8 +210,6 @@ pub struct PageServerConf {
/// Optionally disable disk syncs (unsafe!)
pub no_sync: bool,

pub wal_receiver_protocol: PostgresClientProtocol,

pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig,

pub get_vectored_concurrent_io: pageserver_api::config::GetVectoredConcurrentIo,
@@ -421,7 +418,6 @@ impl PageServerConf {
virtual_file_io_engine,
tenant_config,
no_sync,
wal_receiver_protocol,
page_service_pipelining,
get_vectored_concurrent_io,
enable_read_path_debugging,
@@ -484,7 +480,6 @@ impl PageServerConf {
import_pgdata_upcall_api,
import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from),
import_pgdata_aws_endpoint_url,
wal_receiver_protocol,
page_service_pipelining,
get_vectored_concurrent_io,
tracing,

@@ -1,21 +1,28 @@
use std::{collections::HashMap, sync::Arc, time::Duration};

use posthog_client_lite::{
FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
CaptureEvent, FeatureResolverBackgroundLoop, PostHogClientConfig, PostHogEvaluationError,
PostHogFlagFilterPropertyValue,
};
use remote_storage::RemoteStorageKind;
use serde_json::json;
use tokio_util::sync::CancellationToken;
use utils::id::TenantId;

use crate::config::PageServerConf;
use crate::{config::PageServerConf, metrics::FEATURE_FLAG_EVALUATION};

#[derive(Clone)]
pub struct FeatureResolver {
inner: Option<Arc<FeatureResolverBackgroundLoop>>,
internal_properties: Option<Arc<HashMap<String, PostHogFlagFilterPropertyValue>>>,
}

impl FeatureResolver {
pub fn new_disabled() -> Self {
Self { inner: None }
Self {
inner: None,
internal_properties: None,
}
}

pub fn spawn(
@@ -36,14 +43,114 @@ impl FeatureResolver {
shutdown_pageserver,
);
let inner = Arc::new(inner);
// TODO: make this configurable
inner.clone().spawn(handle, Duration::from_secs(60));
Ok(FeatureResolver { inner: Some(inner) })

// The properties shared by all tenants on this pageserver.
let internal_properties = {
let mut properties = HashMap::new();
properties.insert(
"pageserver_id".to_string(),
PostHogFlagFilterPropertyValue::String(conf.id.to_string()),
);
if let Some(availability_zone) = &conf.availability_zone {
properties.insert(
"availability_zone".to_string(),
PostHogFlagFilterPropertyValue::String(availability_zone.clone()),
);
}
// Infer region based on the remote storage config.
if let Some(remote_storage) = &conf.remote_storage_config {
match &remote_storage.storage {
RemoteStorageKind::AwsS3(config) => {
properties.insert(
"region".to_string(),
PostHogFlagFilterPropertyValue::String(format!(
"aws-{}",
config.bucket_region
)),
);
}
RemoteStorageKind::AzureContainer(config) => {
properties.insert(
"region".to_string(),
PostHogFlagFilterPropertyValue::String(format!(
"azure-{}",
config.container_region
)),
);
}
RemoteStorageKind::LocalFs { .. } => {
properties.insert(
"region".to_string(),
PostHogFlagFilterPropertyValue::String("local".to_string()),
);
}
}
}
// TODO: add pageserver URL.
Arc::new(properties)
};
let fake_tenants = {
let mut tenants = Vec::new();
for i in 0..10 {
let distinct_id = format!(
"fake_tenant_{}_{}_{}",
conf.availability_zone.as_deref().unwrap_or_default(),
conf.id,
i
);
let properties = Self::collect_properties_inner(
distinct_id.clone(),
Some(&internal_properties),
);
tenants.push(CaptureEvent {
event: "initial_tenant_report".to_string(),
distinct_id,
properties: json!({ "$set": properties }), // use `$set` to set the person properties instead of the event properties
});
}
tenants
};
// TODO: make refresh period configurable
inner
.clone()
.spawn(handle, Duration::from_secs(60), fake_tenants);
Ok(FeatureResolver {
inner: Some(inner),
internal_properties: Some(internal_properties),
})
} else {
Ok(FeatureResolver { inner: None })
Ok(FeatureResolver {
inner: None,
internal_properties: None,
})
}
}

fn collect_properties_inner(
tenant_id: String,
internal_properties: Option<&HashMap<String, PostHogFlagFilterPropertyValue>>,
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
let mut properties = HashMap::new();
if let Some(internal_properties) = internal_properties {
for (key, value) in internal_properties.iter() {
properties.insert(key.clone(), value.clone());
}
}
properties.insert(
"tenant_id".to_string(),
PostHogFlagFilterPropertyValue::String(tenant_id),
);
properties
}

/// Collect all properties available for the feature flag evaluation.
pub(crate) fn collect_properties(
&self,
tenant_id: TenantId,
) -> HashMap<String, PostHogFlagFilterPropertyValue> {
Self::collect_properties_inner(tenant_id.to_string(), self.internal_properties.as_deref())
}

/// Evaluate a multivariate feature flag. Currently, we do not support any properties.
///
/// Error handling: the caller should inspect the error and decide the behavior when a feature flag
@@ -55,11 +162,24 @@ impl FeatureResolver {
tenant_id: TenantId,
) -> Result<String, PostHogEvaluationError> {
if let Some(inner) = &self.inner {
inner.feature_store().evaluate_multivariate(
let res = inner.feature_store().evaluate_multivariate(
flag_key,
&tenant_id.to_string(),
&HashMap::new(),
)
&self.collect_properties(tenant_id),
);
match &res {
Ok(value) => {
FEATURE_FLAG_EVALUATION
.with_label_values(&[flag_key, "ok", value])
.inc();
}
Err(e) => {
FEATURE_FLAG_EVALUATION
.with_label_values(&[flag_key, "error", e.as_variant_str()])
.inc();
}
}
res
} else {
Err(PostHogEvaluationError::NotAvailable(
"PostHog integration is not enabled".to_string(),
@@ -80,11 +200,24 @@ impl FeatureResolver {
tenant_id: TenantId,
) -> Result<(), PostHogEvaluationError> {
if let Some(inner) = &self.inner {
inner.feature_store().evaluate_boolean(
let res = inner.feature_store().evaluate_boolean(
flag_key,
&tenant_id.to_string(),
&HashMap::new(),
)
&self.collect_properties(tenant_id),
);
match &res {
Ok(()) => {
FEATURE_FLAG_EVALUATION
.with_label_values(&[flag_key, "ok", "true"])
.inc();
}
Err(e) => {
FEATURE_FLAG_EVALUATION
.with_label_values(&[flag_key, "error", e.as_variant_str()])
.inc();
}
}
res
} else {
Err(PostHogEvaluationError::NotAvailable(
"PostHog integration is not enabled".to_string(),

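The doc comments above leave the failure policy to the caller. One reasonable caller-side policy, as a hedged sketch (the flag name and the fallback choice are illustrative, not from the patch): treat any evaluation error, including `NotAvailable`, as the feature being off.

```rust
// Illustrative caller; `feature_resolver`, the flag key and the fallback
// behavior are assumptions, not part of this change.
let gc_compaction_enabled =
    match feature_resolver.evaluate_boolean("gc-compaction", tenant_id) {
        Ok(()) => true,
        // Any error (flag unknown, PostHog disabled, evaluation failed)
        // falls back to the conservative default: feature off.
        Err(_) => false,
    };
```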
@@ -43,6 +43,7 @@ use pageserver_api::models::{
use pageserver_api::shard::{ShardCount, TenantShardId};
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
use scopeguard::defer;
use serde_json::json;
use tenant_size_model::svg::SvgBranchKind;
use tenant_size_model::{SizeResult, StorageModel};
use tokio::time::Instant;
@@ -3679,23 +3680,24 @@ async fn tenant_evaluate_feature_flag(
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
let properties = tenant.feature_resolver.collect_properties(tenant_shard_id.tenant_id);
if as_type == "boolean" {
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
let result = result.map(|_| true).map_err(|e| e.to_string());
json_response(StatusCode::OK, result)
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else if as_type == "multivariate" {
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
json_response(StatusCode::OK, result)
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else {
// Auto infer the type of the feature flag.
let is_boolean = tenant.feature_resolver.is_feature_flag_boolean(&flag).map_err(|e| ApiError::InternalServerError(anyhow::anyhow!("{e}")))?;
if is_boolean {
let result = tenant.feature_resolver.evaluate_boolean(&flag, tenant_shard_id.tenant_id);
let result = result.map(|_| true).map_err(|e| e.to_string());
json_response(StatusCode::OK, result)
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
} else {
let result = tenant.feature_resolver.evaluate_multivariate(&flag, tenant_shard_id.tenant_id).map_err(|e| e.to_string());
json_response(StatusCode::OK, result)
json_response(StatusCode::OK, json!({ "result": result, "properties": properties }))
}
}
}

@@ -446,6 +446,15 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
.expect("failed to define a metric")
});

pub(crate) static FEATURE_FLAG_EVALUATION: Lazy<CounterVec> = Lazy::new(|| {
register_counter_vec!(
"pageserver_feature_flag_evaluation",
"Number of times a feature flag is evaluated",
&["flag_key", "status", "value"],
)
.unwrap()
});

#[derive(IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum PageCacheErrorKind {
@@ -2846,7 +2855,6 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_received: IntCounter,
pub(crate) records_observed: IntCounter,
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) values_committed_metadata_images: IntCounter,
pub(crate) values_committed_metadata_deltas: IntCounter,
pub(crate) values_committed_data_images: IntCounter,
@@ -2902,11 +2910,6 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| {
"Number of WAL records which resulted in writes to pageserver storage"
)
.expect("failed to define a metric"),
records_filtered: register_int_counter!(
"pageserver_wal_ingest_records_filtered",
"Number of WAL records filtered out due to sharding"
)
.expect("failed to define a metric"),
values_committed_metadata_images: values_committed.with_label_values(&["metadata", "image"]),
values_committed_metadata_deltas: values_committed.with_label_values(&["metadata", "delta"]),
values_committed_data_images: values_committed.with_label_values(&["data", "image"]),

@@ -178,6 +178,7 @@ pub fn spawn_grpc(
tenant_manager: Arc<TenantManager>,
auth: Option<Arc<SwappableJwtAuth>>,
perf_trace_dispatch: Option<Dispatch>,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
listener: std::net::TcpListener,
) -> anyhow::Result<CancellableTask> {
let cancel = CancellationToken::new();
@@ -214,6 +215,8 @@ pub fn spawn_grpc(
let page_service_handler = GrpcPageServiceHandler {
tenant_manager,
ctx,
gate_guard: gate.enter().expect("gate was just created"),
get_vectored_concurrent_io,
};

let observability_layer = ObservabilityLayer;
@@ -497,10 +500,6 @@ async fn page_service_conn_main(
}

/// Page service connection handler.
///
/// TODO: for gRPC, this will be shared by all requests from all connections.
/// Decompose it into global state and per-connection/request state, and make
/// libpq-specific options (e.g. pipelining) separate.
struct PageServerHandler {
auth: Option<Arc<SwappableJwtAuth>>,
claims: Option<Claims>,
@@ -3362,6 +3361,8 @@ where
pub struct GrpcPageServiceHandler {
tenant_manager: Arc<TenantManager>,
ctx: RequestContext,
gate_guard: GateGuard,
get_vectored_concurrent_io: GetVectoredConcurrentIo,
}

impl GrpcPageServiceHandler {
@@ -3721,6 +3722,14 @@ impl proto::PageService for GrpcPageServiceHandler {
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
.await?;

// Spawn an IoConcurrency sidecar, if enabled.
let Ok(gate_guard) = self.gate_guard.try_clone() else {
return Err(tonic::Status::unavailable("shutting down"));
};
let io_concurrency =
IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard);

// Spawn a task to handle the GetPageRequest stream.
let span = Span::current();
let ctx = self.ctx.attached_child();
let mut reqs = req.into_inner();
@@ -3731,8 +3740,7 @@ impl proto::PageService for GrpcPageServiceHandler {
.await?
.downgrade();
while let Some(req) = reqs.message().await? {
// TODO: implement IoConcurrency sidecar.
yield Self::get_page(&ctx, &timeline, req, IoConcurrency::Sequential)
yield Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
.instrument(span.clone()) // propagate request span
.await?
}

@@ -2506,6 +2506,13 @@ impl Timeline {
// Preparing basebackup doesn't make sense for shards other than shard zero.
return;
}
if !self.is_active() {
// May happen during initial timeline creation.
// Such a timeline is not in the global timeline map yet,
// so the basebackup cache will not be able to find it.
// TODO(diko): We can prepare such timelines in finish_creation().
return;
}

let res = self
.basebackup_prepare_sender
@@ -2845,21 +2852,6 @@ impl Timeline {
)
}

/// Resolve the effective WAL receiver protocol to use for this tenant.
///
/// Priority order is:
/// 1. Tenant config override
/// 2. Default value for tenant config override
/// 3. Pageserver config override
/// 4. Pageserver config default
pub fn resolve_wal_receiver_protocol(&self) -> PostgresClientProtocol {
let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
tenant_conf
.wal_receiver_protocol_override
.or(self.conf.default_tenant_conf.wal_receiver_protocol_override)
.unwrap_or(self.conf.wal_receiver_protocol)
}

pub(super) fn tenant_conf_updated(&self, new_conf: &AttachedTenantConf) {
// NB: Most tenant conf options are read by background loops, so,
// changes will automatically be picked up.
@@ -3215,10 +3207,16 @@ impl Timeline {
guard.is_none(),
"multiple launches / re-launches of WAL receiver are not supported"
);

let protocol = PostgresClientProtocol::Interpreted {
format: utils::postgres_client::InterpretedFormat::Protobuf,
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
};

*guard = Some(WalReceiver::start(
Arc::clone(self),
WalReceiverConf {
protocol: self.resolve_wal_receiver_protocol(),
protocol,
wal_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,

@@ -100,6 +100,7 @@ async fn run_v1(
.unwrap(),
import_job_concurrency: base.import_job_concurrency,
import_job_checkpoint_threshold: base.import_job_checkpoint_threshold,
import_job_max_byte_range_size: base.import_job_max_byte_range_size,
}
}
None => timeline.conf.timeline_import_config.clone(),
@@ -441,6 +442,7 @@ impl Plan {

let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0);
let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into();
let max_byte_range_size: usize = import_config.import_job_max_byte_range_size.into();

// Run import jobs concurrently up to the limit specified by the pageserver configuration.
// Note that we process completed futures in the order of insertion. This will be the
@@ -456,7 +458,7 @@ impl Plan {

work.push_back(tokio::task::spawn(async move {
let _permit = permit;
let res = job.run(job_timeline, &ctx).await;
let res = job.run(job_timeline, max_byte_range_size, &ctx).await;
(job_idx, res)
}));
},
@@ -679,6 +681,7 @@ trait ImportTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize>;
}
@@ -715,6 +718,7 @@ impl ImportTask for ImportSingleKeyTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
_max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
layer_writer.put_image(self.key, self.buf, ctx).await?;
@@ -768,10 +772,9 @@ impl ImportTask for ImportRelBlocksTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
const MAX_BYTE_RANGE_SIZE: usize = 4 * 1024 * 1024;

debug!("Importing relation file");

let (rel_tag, start_blk) = self.key_range.start.to_rel_block()?;
@@ -796,7 +799,7 @@ impl ImportTask for ImportRelBlocksTask {
assert_eq!(key.len(), 1);
assert!(!acc.is_empty());
assert!(acc_end > acc_start);
if acc_end == start && end - acc_start <= MAX_BYTE_RANGE_SIZE {
if acc_end == start && end - acc_start <= max_byte_range_size {
acc.push(key.pop().unwrap());
Ok((acc, acc_start, end))
} else {
@@ -860,6 +863,7 @@ impl ImportTask for ImportSlruBlocksTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
_max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
debug!("Importing SLRU segment file {}", self.path);
@@ -906,12 +910,13 @@ impl ImportTask for AnyImportTask {
async fn doit(
self,
layer_writer: &mut ImageLayerWriter,
max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<usize> {
match self {
Self::SingleKey(t) => t.doit(layer_writer, ctx).await,
Self::RelBlocks(t) => t.doit(layer_writer, ctx).await,
Self::SlruBlocks(t) => t.doit(layer_writer, ctx).await,
Self::SingleKey(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
Self::RelBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
Self::SlruBlocks(t) => t.doit(layer_writer, max_byte_range_size, ctx).await,
}
}
}
@@ -952,7 +957,12 @@ impl ChunkProcessingJob {
}
}

async fn run(self, timeline: Arc<Timeline>, ctx: &RequestContext) -> anyhow::Result<()> {
async fn run(
self,
timeline: Arc<Timeline>,
max_byte_range_size: usize,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut writer = ImageLayerWriter::new(
timeline.conf,
timeline.timeline_id,
@@ -967,7 +977,7 @@ impl ChunkProcessingJob {

let mut nimages = 0;
for task in self.tasks {
nimages += task.doit(&mut writer, ctx).await?;
nimages += task.doit(&mut writer, max_byte_range_size, ctx).await?;
}

let resident_layer = if nimages > 0 {

@@ -32,9 +32,7 @@ use utils::backoff::{
};
use utils::id::{NodeId, TenantTimelineId};
use utils::lsn::Lsn;
use utils::postgres_client::{
ConnectionConfigArgs, PostgresClientProtocol, wal_stream_connection_config,
};
use utils::postgres_client::{ConnectionConfigArgs, wal_stream_connection_config};

use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
@@ -991,19 +989,12 @@ impl ConnectionManagerState {
return None; // no connection string, ignore sk
}

let (shard_number, shard_count, shard_stripe_size) = match self.conf.protocol {
PostgresClientProtocol::Vanilla => {
(None, None, None)
},
PostgresClientProtocol::Interpreted { .. } => {
let shard_identity = self.timeline.get_shard_identity();
(
Some(shard_identity.number.0),
Some(shard_identity.count.0),
Some(shard_identity.stripe_size.0),
)
}
};
let shard_identity = self.timeline.get_shard_identity();
let (shard_number, shard_count, shard_stripe_size) = (
Some(shard_identity.number.0),
Some(shard_identity.count.0),
Some(shard_identity.stripe_size.0),
);

let connection_conf_args = ConnectionConfigArgs {
protocol: self.conf.protocol,
@@ -1120,8 +1111,8 @@ impl ReconnectReason {

#[cfg(test)]
mod tests {
use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
use url::Host;
use utils::postgres_client::PostgresClientProtocol;

use super::*;
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
@@ -1552,6 +1543,11 @@ mod tests {
.await
.expect("Failed to create an empty timeline for dummy wal connection manager");

let protocol = PostgresClientProtocol::Interpreted {
format: utils::postgres_client::InterpretedFormat::Protobuf,
compression: Some(utils::postgres_client::Compression::Zstd { level: 1 }),
};

ConnectionManagerState {
id: TenantTimelineId {
tenant_id: harness.tenant_shard_id.tenant_id,
@@ -1560,7 +1556,7 @@ mod tests {
timeline,
cancel: CancellationToken::new(),
conf: WalReceiverConf {
protocol: DEFAULT_WAL_RECEIVER_PROTOCOL,
protocol,
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),

@@ -15,7 +15,7 @@ use postgres_backend::is_expected_io_error;
use postgres_connection::PgConnectionConfig;
use postgres_ffi::WAL_SEGMENT_SIZE;
use postgres_ffi::v14::xlog_utils::normalize_lsn;
use postgres_ffi::waldecoder::{WalDecodeError, WalStreamDecoder};
use postgres_ffi::waldecoder::WalDecodeError;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use tokio::sync::watch;
@@ -31,7 +31,7 @@ use utils::lsn::Lsn;
use utils::pageserver_feedback::PageserverFeedback;
use utils::postgres_client::PostgresClientProtocol;
use utils::sync::gate::GateError;
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord, InterpretedWalRecords};
use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecords};
use wal_decoder::wire_format::FromWireFormat;

use super::TaskStateUpdate;
@@ -275,8 +275,6 @@ pub(super) async fn handle_walreceiver_connection(
let copy_stream = replication_client.copy_both_simple(&query).await?;
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));

let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);

let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
.await
.map_err(|e| match e.kind {
@@ -284,14 +282,16 @@ pub(super) async fn handle_walreceiver_connection(
_ => WalReceiverError::Other(e.into()),
})?;

let shard = vec![*timeline.get_shard_identity()];

let interpreted_proto_config = match protocol {
PostgresClientProtocol::Vanilla => None,
let (format, compression) = match protocol {
PostgresClientProtocol::Interpreted {
format,
compression,
} => Some((format, compression)),
} => (format, compression),
PostgresClientProtocol::Vanilla => {
return Err(WalReceiverError::Other(anyhow!(
"Vanilla WAL receiver protocol is no longer supported for ingest"
)));
}
};

let mut expected_wal_start = startpoint;
@@ -313,16 +313,6 @@ pub(super) async fn handle_walreceiver_connection(
// Update the connection status before processing the message. If the message processing
// fails (e.g. in walingest), we still want to know the latest LSNs from the safekeeper.
match &replication_message {
ReplicationMessage::XLogData(xlog_data) => {
connection_status.latest_connection_update = now;
connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
connection_status.streaming_lsn = Some(Lsn::from(
xlog_data.wal_start() + xlog_data.data().len() as u64,
));
if !xlog_data.data().is_empty() {
connection_status.latest_wal_update = now;
}
}
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
connection_status.latest_connection_update = now;
connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
@@ -353,7 +343,6 @@ pub(super) async fn handle_walreceiver_connection(
// were interpreted.
let streaming_lsn = Lsn::from(raw.streaming_lsn());

let (format, compression) = interpreted_proto_config.unwrap();
let batch = InterpretedWalRecords::from_wire(raw.data(), format, compression)
.await
.with_context(|| {
@@ -509,138 +498,6 @@ pub(super) async fn handle_walreceiver_connection(
Some(streaming_lsn)
}

ReplicationMessage::XLogData(xlog_data) => {
async fn commit(
modification: &mut DatadirModification<'_>,
uncommitted: &mut u64,
filtered: &mut u64,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let stats = modification.stats();
modification.commit(ctx).await?;
WAL_INGEST
.records_committed
.inc_by(*uncommitted - *filtered);
WAL_INGEST.inc_values_committed(&stats);
*uncommitted = 0;
*filtered = 0;
Ok(())
}

// Pass the WAL data to the decoder, and see if we can decode
// more records as a result.
let data = xlog_data.data();
let startlsn = Lsn::from(xlog_data.wal_start());
let endlsn = startlsn + data.len() as u64;

trace!("received XLogData between {startlsn} and {endlsn}");

WAL_INGEST.bytes_received.inc_by(data.len() as u64);
waldecoder.feed_bytes(data);

{
let mut modification = timeline.begin_modification(startlsn);
let mut uncommitted_records = 0;
let mut filtered_records = 0;

while let Some((next_record_lsn, recdata)) = waldecoder.poll_decode()? {
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
if !next_record_lsn.is_aligned() {
return Err(WalReceiverError::Other(anyhow!("LSN not aligned")));
}

// Deserialize and interpret WAL record
let interpreted = InterpretedWalRecord::from_bytes_filtered(
recdata,
&shard,
next_record_lsn,
modification.tline.pg_version,
)?
.remove(timeline.get_shard_identity())
.unwrap();

if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
&& uncommitted_records > 0
{
// Special case: legacy PG database creations operate by reading pages from a 'template' database:
// these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure
// all earlier writes of data blocks are visible by committing any modification in flight.
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}

// Ingest the records without immediately committing them.
timeline.metrics.wal_records_received.inc();
let ingested = walingest
.ingest_record(interpreted, &mut modification, &ctx)
.await
.with_context(|| {
format!("could not ingest record at {next_record_lsn}")
})
.inspect_err(|err| {
// TODO: we can't differentiate cancellation errors with
// anyhow::Error, so just ignore it if we're cancelled.
if !cancellation.is_cancelled() && !timeline.is_stopping() {
critical!("{err:?}")
}
})?;
if !ingested {
tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
WAL_INGEST.records_filtered.inc();
filtered_records += 1;
}

// FIXME: this cannot be made pausable_failpoint without fixing the
// failpoint library; in tests, the added amount of debugging will cause us
// to timeout the tests.
fail_point!("walreceiver-after-ingest");

last_rec_lsn = next_record_lsn;

// Commit every ingest_batch_size records. Even if we filtered out
// all records, we still need to call commit to advance the LSN.
uncommitted_records += 1;
if uncommitted_records >= ingest_batch_size
|| modification.approx_pending_bytes()
> DatadirModification::MAX_PENDING_BYTES
{
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
}

// Commit the remaining records.
if uncommitted_records > 0 {
commit(
&mut modification,
&mut uncommitted_records,
&mut filtered_records,
&ctx,
)
.await?;
}
}

if !caught_up && endlsn >= end_of_wal {
info!("caught up at LSN {endlsn}");
caught_up = true;
}

Some(endlsn)
}

ReplicationMessage::PrimaryKeepAlive(keepalive) => {
let wal_end = keepalive.wal_end();
let timestamp = keepalive.timestamp();

@@ -1,6 +1,5 @@
# pgxs/neon/Makefile

MODULE_big = neon
OBJS = \
$(WIN32RES) \
@@ -22,7 +21,8 @@ OBJS = \
walproposer.o \
walproposer_pg.o \
control_plane_connector.o \
walsender_hooks.o
walsender_hooks.o \
$(LIBCOMMUNICATOR_PATH)/libcommunicator.a

PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)

pgxn/neon/communicator/Cargo.toml (new file)
@@ -0,0 +1,13 @@
[package]
name = "communicator"
version = "0.1.0"
edition = "2024"

[lib]
crate-type = ["staticlib"]

[dependencies]
neon-shmem.workspace = true

[build-dependencies]
cbindgen.workspace = true
pgxn/neon/communicator/README.md (new file)
@@ -0,0 +1,8 @@
This package will evolve into a "compute-pageserver communicator"
process and machinery. For now, it just provides wrappers around the
neon-shmem Rust crate, to allow using it from the C implementation of
the LFC.

At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked into the neon.so extension
library.
pgxn/neon/communicator/build.rs (new file)
@@ -0,0 +1,22 @@
use std::env;

fn main() -> Result<(), Box<dyn std::error::Error>> {
let crate_dir = env::var("CARGO_MANIFEST_DIR").unwrap();

cbindgen::generate(crate_dir).map_or_else(
|error| match error {
cbindgen::Error::ParseSyntaxError { .. } => {
// This means there was a syntax error in the Rust sources. Don't panic, because
// we want the build to continue and the Rust compiler to hit the error. The
// Rust compiler produces a better error message than cbindgen.
eprintln!("Generating C bindings failed because of a Rust syntax error");
}
e => panic!("Unable to generate C bindings: {:?}", e),
},
|bindings| {
bindings.write_to_file("communicator_bindings.h");
},
);

Ok(())
}
pgxn/neon/communicator/cbindgen.toml (new file)
@@ -0,0 +1,4 @@
language = "C"

[enum]
prefix_with_name = true
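With `prefix_with_name = true`, cbindgen prefixes each enum variant with the enum's name in the generated header, which avoids collisions in C's flat namespace. A hedged illustration (the enum declaration below is for demonstration, not copied from this patch, and the C output is approximate):

```rust
// Rust input to cbindgen:
#[repr(C)]
pub enum FileCacheBlockState {
    Unavailable,
    Available,
}

// Approximate C output with `prefix_with_name = true`:
//
//   typedef enum FileCacheBlockState {
//     FileCacheBlockState_Unavailable,
//     FileCacheBlockState_Available,
//   } FileCacheBlockState;
```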
pgxn/neon/communicator/src/file_cache_hashmap.rs (new file)
@@ -0,0 +1,240 @@
//! Glue code to allow using the Rust shmem hash map implementation from C code
//!
//! For convenience of adapting existing code, the interface provided somewhat resembles the dynahash
//! interface.
//!
//! NOTE: The caller is responsible for locking! The caller is expected to hold the PostgreSQL
//! LWLock, 'lfc_lock', while accessing the hash table, in shared or exclusive mode as appropriate.

use std::ffi::c_void;
use std::marker::PhantomData;

use neon_shmem::hash::entry::Entry;
use neon_shmem::hash::{HashMapAccess, HashMapInit};
use neon_shmem::shmem::ShmemHandle;

/// NB: This must match the definition of BufferTag in Postgres C headers. We could use bindgen to
/// generate this from the C headers, but prefer not to introduce a dependency on bindgen for now.
///
/// Note that there are no padding bytes. If the corresponding C struct has padding bytes, the C
/// code must clear them.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
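Since nothing enforces the layout match at the language boundary, a compile-time size check on the Rust side is a cheap guard. A sketch (the expected size is an assumption based on the five `u32` fields; this patch only adds the corresponding `StaticAssertDecl` checks on the C side):

```rust
// Five u32 fields, #[repr(C)], no padding: 20 bytes, matching BufferTag.
const _: () = assert!(std::mem::size_of::<FileCacheKey>() == 20);
```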

/// Like with FileCacheKey, this must match the definition of FileCacheEntry in file_cache.c. We
/// don't look at the contents here though, it's sufficient that the size and alignment match.
#[derive(Clone, Debug, Default)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}

/// XXX: This could be just:
///
/// ```ignore
/// type FileCacheHashMapHandle = HashMapInit<'a, FileCacheKey, FileCacheEntry>
/// ```
///
/// but with that, cbindgen generates a broken typedef in the C header file which doesn't
/// compile. It apparently gets confused by the generics.
#[repr(transparent)]
pub struct FileCacheHashMapHandle<'a>(
pub *mut c_void,
PhantomData<HashMapInit<'a, FileCacheKey, FileCacheEntry>>,
);
impl<'a> From<Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>>> for FileCacheHashMapHandle<'a> {
fn from(x: Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>>) -> Self {
FileCacheHashMapHandle(Box::into_raw(x) as *mut c_void, PhantomData::default())
}
}
impl<'a> From<FileCacheHashMapHandle<'a>> for Box<HashMapInit<'a, FileCacheKey, FileCacheEntry>> {
fn from(x: FileCacheHashMapHandle) -> Self {
unsafe { Box::from_raw(x.0.cast()) }
}
}

/// XXX: same for this
#[repr(transparent)]
pub struct FileCacheHashMapAccess<'a>(
pub *mut c_void,
PhantomData<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>,
);
impl<'a> From<Box<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>> for FileCacheHashMapAccess<'a> {
fn from(x: Box<HashMapAccess<'a, FileCacheKey, FileCacheEntry>>) -> Self {
// Convert the Box into a raw mutable pointer to the HashMapAccess itself.
// This transfers ownership of the HashMapAccess (and its contained ShmemHandle)
// to the raw pointer. The C caller is now responsible for managing this memory.
FileCacheHashMapAccess(Box::into_raw(x) as *mut c_void, PhantomData::default())
}
}
impl<'a> FileCacheHashMapAccess<'a> {
fn as_ref(self) -> &'a HashMapAccess<'a, FileCacheKey, FileCacheEntry> {
let ptr: *mut HashMapAccess<'_, FileCacheKey, FileCacheEntry> = self.0.cast();
unsafe { ptr.as_ref().unwrap() }
}
fn as_mut(self) -> &'a mut HashMapAccess<'a, FileCacheKey, FileCacheEntry> {
let ptr: *mut HashMapAccess<'_, FileCacheKey, FileCacheEntry> = self.0.cast();
unsafe { ptr.as_mut().unwrap() }
}
}

/// Initialize the shared memory area at postmaster startup. The returned handle is inherited
/// by all the backend processes across fork()
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_shmem_init<'a>(
initial_num_buckets: u32,
max_num_buckets: u32,
) -> FileCacheHashMapHandle<'a> {
let max_bytes = HashMapInit::<FileCacheKey, FileCacheEntry>::estimate_size(max_num_buckets);
let shmem_handle =
ShmemHandle::new("lfc mapping", 0, max_bytes).expect("shmem initialization failed");

let handle = HashMapInit::<FileCacheKey, FileCacheEntry>::init_in_shmem(
initial_num_buckets,
shmem_handle,
);

Box::new(handle).into()
}

/// Initialize the access to the shared memory area in a backend process.
///
/// XXX: I'm not sure if this actually gets called in each process, or if the returned struct
/// is also inherited across fork(). It currently works either way but if this did more
/// initialization that needed to be done after fork(), then it would matter.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_shmem_access<'a>(
handle: FileCacheHashMapHandle<'a>,
) -> FileCacheHashMapAccess<'a> {
let handle: Box<HashMapInit<'_, FileCacheKey, FileCacheEntry>> = handle.into();
Box::new(handle.attach_writer()).into()
}

/// Return the current number of buckets in the hash table
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_get_num_buckets<'a>(
map: FileCacheHashMapAccess<'static>,
) -> u32 {
let map = map.as_ref();
map.get_num_buckets().try_into().unwrap()
}

/// Look up the entry with the given key and hash.
///
/// This is similar to dynahash's hash_search(... , HASH_FIND)
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_find<'a>(
map: FileCacheHashMapAccess<'static>,
key: &FileCacheKey,
hash: u64,
) -> Option<&'static FileCacheEntry> {
let map = map.as_ref();
map.get_with_hash(key, hash)
}

/// Look up the entry at the given bucket position
///
/// This has no direct equivalent in the dynahash interface, but can be used to
/// iterate through all entries in the hash table.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_get_at_pos<'a>(
map: FileCacheHashMapAccess<'static>,
pos: u32,
) -> Option<&'static FileCacheEntry> {
let map = map.as_ref();
map.get_at_bucket(pos as usize).map(|(_k, v)| v)
}

/// Remove an entry, given a pointer to the value.
///
/// This is equivalent to dynahash hash_search(entry->key, HASH_REMOVE), where 'entry'
/// is an entry you have previously looked up.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_remove_entry<'a, 'b>(
map: FileCacheHashMapAccess,
entry: *mut FileCacheEntry,
) {
let map = map.as_mut();
let pos = map.get_bucket_for_value(entry);
match map.entry_at_bucket(pos) {
Some(e) => {
e.remove();
}
None => {
// todo: shouldn't happen, panic?
}
}
}

/// Compute the hash for the given key
///
/// This is equivalent to dynahash's get_hash_value() function. We use Rust's default hasher
/// for calculating the hash though.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_get_hash_value<'a, 'b>(
map: FileCacheHashMapAccess<'static>,
key: &FileCacheKey,
) -> u64 {
map.as_ref().get_hash_value(key)
}

/// Insert a new entry into the hash table
///
/// This is equivalent to dynahash hash_search(..., HASH_ENTER).
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_enter<'a, 'b>(
map: FileCacheHashMapAccess,
key: &FileCacheKey,
hash: u64,
found: &mut bool,
) -> *mut FileCacheEntry {
match map.as_mut().entry_with_hash(key.clone(), hash) {
Entry::Occupied(mut e) => {
*found = true;
e.get_mut()
}
Entry::Vacant(e) => {
*found = false;
let initial_value = FileCacheEntry::default();
e.insert(initial_value).expect("TODO: hash table full")
}
}
}

/// Get the key for a given entry, which must be present in the hash table.
///
/// Dynahash requires the key to be part of the "value" struct, so you can always
/// access the key with something like `entry->key`. The Rust implementation however
/// stores the key separately. This function extracts the separately stored key.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_get_key_for_entry<'a, 'b>(
map: FileCacheHashMapAccess,
entry: *const FileCacheEntry,
) -> Option<&FileCacheKey> {
let map = map.as_ref();
let pos = map.get_bucket_for_value(entry);
map.get_at_bucket(pos as usize).map(|(k, _v)| k)
}

/// Remove all entries from the hash table
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_file_cache_hash_reset<'a, 'b>(map: FileCacheHashMapAccess) {
let map = map.as_mut();
let num_buckets = map.get_num_buckets();
for i in 0..num_buckets {
if let Some(e) = map.entry_at_bucket(i) {
e.remove();
}
}
}
pgxn/neon/communicator/src/lib.rs (new file)
@@ -0,0 +1 @@
pub mod file_cache_hashmap;
@@ -21,7 +21,7 @@
#include "access/xlog.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "common/hashfn.h"
#include "common/file_utils.h"
#include "pgstat.h"
#include "port/pg_iovec.h"
#include "postmaster/bgworker.h"
@@ -36,7 +36,6 @@
#include "storage/procsignal.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/dynahash.h"
#include "utils/guc.h"

#if PG_VERSION_NUM >= 150000
@@ -46,6 +45,7 @@
#include "hll.h"
#include "bitmap.h"
#include "file_cache.h"
#include "file_cache_rust_hash.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
@@ -64,7 +64,7 @@
 *
 * Cache is always reconstructed at node startup, so we do not need to save the mapping somewhere and worry about
 * its consistency.
 *
 *
 * ## Holes
 *
@@ -76,13 +76,15 @@
 * fallocate(FALLOC_FL_PUNCH_HOLE) call. The nominal size of the file doesn't
 * shrink, but the disk space it uses does.
 *
 * Each hole is tracked by a dummy FileCacheEntry, which are kept in the
 * 'holes' linked list. They are entered into the chunk hash table, with a
 * special key where the blockNumber is used to store the 'offset' of the
 * hole, and all other fields are zero. Holes are never looked up in the hash
 * table, we only enter them there to have a FileCacheEntry that we can keep
 * in the linked list. If the soft limit is raised again, we reuse the holes
 * before extending the nominal size of the file.
 * Each hole is tracked in a freelist. The freelist consists of two parts: a
 * fixed-size array in shared memory, and a linked chain of on-disk
 * blocks. When the in-memory array fills up, it's flushed to a new on-disk
 * chunk. If the soft limit is raised again, we reuse the holes before
 * extending the nominal size of the file.
 *
 * The in-memory freelist array is protected by 'lfc_lock', while the on-disk
 * chain is protected by a separate 'lfc_freelist_lock'. Locking rule to
 * avoid deadlocks: always acquire lfc_freelist_lock first, then lfc_lock.
 */

/* Local file storage allocation chunk.
@@ -92,13 +94,15 @@
 * 1Mb chunks can reduce hash map size to 320Mb.
 * 2. Improve access locality, subsequent pages will be allocated together, improving seqscan speed
 */
#define MAX_BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */
#define MAX_BLOCKS_PER_CHUNK (1 << MAX_BLOCKS_PER_CHUNK_LOG)
#define BLOCKS_PER_CHUNK_LOG 7 /* 1Mb chunk */
#define BLOCKS_PER_CHUNK (1 << BLOCKS_PER_CHUNK_LOG)

#define MB ((uint64)1024*1024)

#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> lfc_chunk_size_log))
#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (lfc_blocks_per_chunk-1))
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ >> BLOCKS_PER_CHUNK_LOG))
#define BLOCK_TO_CHUNK_OFF(blkno) ((blkno) & (BLOCKS_PER_CHUNK-1))

#define INVALID_OFFSET (0xffffffff)

/*
 * Blocks are read or written to LFC file outside LFC critical section.
@@ -119,15 +123,18 @@ typedef enum FileCacheBlockState

typedef struct FileCacheEntry
{
BufferTag key;
uint32 hash;
uint32 offset;
uint32 access_count;
dlist_node list_node; /* LRU/holes list node */
uint32 state[FLEXIBLE_ARRAY_MEMBER]; /* two bits per block */
dlist_node list_node; /* LRU list node */
uint32 state[(BLOCKS_PER_CHUNK * 2 + 31) / 32]; /* two bits per block */
} FileCacheEntry;

#define FILE_CACHE_ENRTY_SIZE MAXALIGN(offsetof(FileCacheEntry, state) + (lfc_blocks_per_chunk*2+31)/32*4)
/* Todo: alignment must be the same too */
StaticAssertDecl(sizeof(FileCacheEntry) == sizeof(RustFileCacheEntry),
"Rust and C declarations of FileCacheEntry are incompatible");
StaticAssertDecl(sizeof(BufferTag) == sizeof(RustFileCacheKey),
"Rust and C declarations of FileCacheKey are incompatible");

#define GET_STATE(entry, i) (((entry)->state[(i) / 16] >> ((i) % 16 * 2)) & 3)
#define SET_STATE(entry, i, new_state) (entry)->state[(i) / 16] = ((entry)->state[(i) / 16] & ~(3 << ((i) % 16 * 2))) | ((new_state) << ((i) % 16 * 2))
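Each `state` word packs 16 two-bit block states, so `GET_STATE`/`SET_STATE` index word `i / 16` and shift by `(i % 16) * 2`. The same arithmetic as a Rust sketch (function names are illustrative, not from the patch):

```rust
// Read the two-bit state of block `i`: word i/16, bits (i%16)*2 .. (i%16)*2+2.
fn get_state(state: &[u32], i: usize) -> u32 {
    (state[i / 16] >> (i % 16 * 2)) & 3
}

// Overwrite those two bits, leaving the other 15 states in the word intact.
fn set_state(state: &mut [u32], i: usize, new_state: u32) {
    let shift = i % 16 * 2;
    state[i / 16] = (state[i / 16] & !(3u32 << shift)) | (new_state << shift);
}
```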

@@ -136,6 +143,9 @@ typedef struct FileCacheEntry

#define MAX_PREWARM_WORKERS 8

#define FREELIST_ENTRIES_PER_CHUNK (BLOCKS_PER_CHUNK * BLCKSZ / sizeof(uint32) - 2)

typedef struct PrewarmWorkerState
{
uint32 prewarmed_pages;
@@ -161,7 +171,6 @@ typedef struct FileCacheControl
uint64 evicted_pages; /* number of evicted pages */
dlist_head lru; /* double linked list for LRU replacement
 * algorithm */
dlist_head holes; /* double linked list of punched holes */
HyperLogLogState wss_estimation; /* estimation of working set size */
ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
@@ -172,23 +181,39 @@ typedef struct FileCacheControl
bool prewarm_active;
bool prewarm_canceled;
dsm_handle prewarm_lfc_state_handle;

/*
 * Free list. This is large enough to hold one chunk's worth of entries.
 */
uint32 freelist_size;
uint32 freelist_head;
uint32 num_free_pages;
uint32 free_pages[FREELIST_ENTRIES_PER_CHUNK];
} FileCacheControl;

typedef struct FreeListChunk
{
uint32 next;
uint32 num_free_pages;
uint32 free_pages[FREELIST_ENTRIES_PER_CHUNK];
} FreeListChunk;
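The two structures above implement the two-tier freelist from the header comment: `free_pages` in `FileCacheControl` is the in-memory tier, and each `FreeListChunk` is one chunk-sized on-disk block holding `next` (the offset of the following chunk) plus its own array. A hedged, language-neutral sketch of the push path, in Rust for brevity (names and the spill callback are illustrative; the real logic lives in the `freelist_push` C function declared below):

```rust
// Illustrative two-tier freelist push: fill the in-memory array first, and
// when it is full, spill it to a newly allocated on-disk chunk that is
// linked at the head of the on-disk chain.
struct Freelist {
    in_memory: Vec<u32>,       // bounded by FREELIST_ENTRIES_PER_CHUNK
    capacity: usize,
    on_disk_head: Option<u32>, // offset of the first spilled chunk, if any
}

impl Freelist {
    fn push(&mut self, offset: u32, spill: impl FnOnce(&[u32], Option<u32>) -> u32) {
        if self.in_memory.len() == self.capacity {
            // Write the full array to a free chunk on disk; the new chunk
            // records the previous head, forming a linked chain.
            let chunk_off = spill(&self.in_memory, self.on_disk_head);
            self.on_disk_head = Some(chunk_off);
            self.in_memory.clear();
        }
        self.in_memory.push(offset);
    }
}
```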
|
||||
|
||||
#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
|
||||
|
||||
#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
|
||||
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8)
|
||||
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * BLOCKS_PER_CHUNK)+7)/8)
|
||||
#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)

static HTAB *lfc_hash;
static FileCacheHashMapHandle lfc_hash_handle;
static FileCacheHashMapAccess lfc_hash;
static int	lfc_desc = -1;
static LWLockId lfc_lock;
static LWLockId lfc_freelist_lock;
static int	lfc_max_size;
static int	lfc_size_limit;
static int	lfc_prewarm_limit;
static int	lfc_prewarm_batch;
static int	lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int	lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
static int	lfc_blocks_per_chunk_ro = BLOCKS_PER_CHUNK;
static char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;

@@ -205,6 +230,11 @@ bool AmPrewarmWorker;

#define LFC_ENABLED() (lfc_ctl->limit != 0)

static bool freelist_push(uint32 offset);
static bool freelist_prepare_pop(void);
static uint32 freelist_pop(void);
static bool freelist_is_empty(void);

/*
 * Close LFC file if opened.
 * All backends should close their LFC files once LFC is disabled.

@@ -232,15 +262,9 @@ lfc_switch_off(void)

	if (LFC_ENABLED())
	{
		HASH_SEQ_STATUS status;
		FileCacheEntry *entry;

		/* Invalidate hash */
		hash_seq_init(&status, lfc_hash);
		while ((entry = hash_seq_search(&status)) != NULL)
		{
			hash_search_with_hash_value(lfc_hash, &entry->key, entry->hash, HASH_REMOVE, NULL);
		}
		file_cache_hash_reset(lfc_hash);

		lfc_ctl->generation += 1;
		lfc_ctl->size = 0;
		lfc_ctl->pinned = 0;
@@ -248,7 +272,9 @@ lfc_switch_off(void)
		lfc_ctl->used_pages = 0;
		lfc_ctl->limit = 0;
		dlist_init(&lfc_ctl->lru);
		dlist_init(&lfc_ctl->holes);

		lfc_ctl->freelist_head = INVALID_OFFSET;
		lfc_ctl->num_free_pages = 0;

		/*
		 * We need to use unlink to avoid races in LFC write, because it is not
@@ -317,8 +343,8 @@ lfc_ensure_opened(void)
static void
lfc_shmem_startup(void)
{
	size_t		size;
	bool		found;
	static HASHCTL info;

	if (prev_shmem_startup_hook)
	{
@@ -327,27 +353,29 @@ lfc_shmem_startup(void)

	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);

	lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
	size = sizeof(FileCacheControl);

	lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", size, &found);
	if (!found)
	{
		int			fd;
		uint32		n_chunks = SIZE_MB_TO_CHUNKS(lfc_max_size);

		lfc_lock = (LWLockId) GetNamedLWLockTranche("lfc_lock");
		info.keysize = sizeof(BufferTag);
		info.entrysize = FILE_CACHE_ENRTY_SIZE;
		lfc_freelist_lock = (LWLockId) GetNamedLWLockTranche("lfc_freelist_lock");

		/*
		 * n_chunks+1 because we add new element to hash table before eviction
		 * of victim
		 */
		lfc_hash = ShmemInitHash("lfc_hash",
								 n_chunks + 1, n_chunks + 1,
								 &info,
								 HASH_ELEM | HASH_BLOBS);
		memset(lfc_ctl, 0, sizeof(FileCacheControl));
		lfc_hash_handle = file_cache_hash_shmem_init(n_chunks + 1, n_chunks + 1);

		memset(lfc_ctl, 0, offsetof(FileCacheControl, free_pages));
		dlist_init(&lfc_ctl->lru);
		dlist_init(&lfc_ctl->holes);

		lfc_ctl->freelist_size = FREELIST_ENTRIES_PER_CHUNK;
		lfc_ctl->freelist_head = INVALID_OFFSET;
		lfc_ctl->num_free_pages = 0;

		/* Initialize hyper-log-log structure for estimating working set size */
		initSHLL(&lfc_ctl->wss_estimation);
@@ -371,18 +399,25 @@ lfc_shmem_startup(void)

	}
	LWLockRelease(AddinShmemInitLock);

	lfc_hash = file_cache_hash_shmem_access(lfc_hash_handle);
}

static void
lfc_shmem_request(void)
{
	size_t		size;

#if PG_VERSION_NUM>=150000
	if (prev_shmem_request_hook)
		prev_shmem_request_hook();
#endif

	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
	size = sizeof(FileCacheControl);

	RequestAddinShmemSpace(size);
	RequestNamedLWLockTranche("lfc_lock", 1);
	RequestNamedLWLockTranche("lfc_freelist_lock", 2);
}

static bool
@@ -398,24 +433,6 @@ is_normal_backend(void)
	return lfc_ctl && MyProc && UsedShmemSegAddr && !IsParallelWorker();
}

static bool
lfc_check_chunk_size(int *newval, void **extra, GucSource source)
{
	if (*newval & (*newval - 1))
	{
		elog(ERROR, "LFC chunk size should be power of two");
		return false;
	}
	return true;
}

static void
lfc_change_chunk_size(int newval, void* extra)
{
	lfc_chunk_size_log = pg_ceil_log2_32(newval);
}
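
lfc_check_chunk_size relies on the classic bit trick: a power of two has exactly one set bit, so newval & (newval - 1) clears that bit and yields zero, while any other value leaves bits behind. A self-contained illustration, with a plain loop standing in for pg_ceil_log2_32:

#include <assert.h>

static int is_power_of_two(unsigned n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static int ceil_log2(unsigned n)	/* stand-in for pg_ceil_log2_32 */
{
	int log = 0;

	while ((1u << log) < n)
		log++;
	return log;
}

int main(void)
{
	assert(is_power_of_two(128) && !is_power_of_two(96));
	assert(ceil_log2(128) == 7);	/* so a 128-block chunk gets chunk_size_log 7 */
	return 0;
}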

static bool
lfc_check_limit_hook(int *newval, void **extra, GucSource source)
{
@@ -435,12 +452,14 @@ lfc_change_limit_hook(int newval, void *extra)
	if (!lfc_ctl || !is_normal_backend())
		return;

	LWLockAcquire(lfc_freelist_lock, LW_EXCLUSIVE);
	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);

	/* Open LFC file only if LFC was enabled or we are going to reenable it */
	if (newval == 0 && !LFC_ENABLED())
	{
		LWLockRelease(lfc_lock);
		LWLockRelease(lfc_freelist_lock);
		/* File should be reopened if LFC is reenabled */
		lfc_close_file();
		return;
@@ -449,6 +468,7 @@ lfc_change_limit_hook(int newval, void *extra)
	if (!lfc_ensure_opened())
	{
		LWLockRelease(lfc_lock);
		LWLockRelease(lfc_freelist_lock);
		return;
	}

@@ -464,35 +484,30 @@ lfc_change_limit_hook(int newval, void *extra)
		 * returning their space to file system
		 */
		FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru));
		FileCacheEntry *hole;
		uint32		offset = victim->offset;
		uint32		hash;
		bool		found;
		BufferTag	holetag;

		CriticalAssert(victim->access_count == 0);
#ifdef FALLOC_FL_PUNCH_HOLE
		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * lfc_blocks_per_chunk * BLCKSZ, lfc_blocks_per_chunk * BLCKSZ) < 0)
		if (fallocate(lfc_desc, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, (off_t) victim->offset * BLOCKS_PER_CHUNK * BLCKSZ, BLOCKS_PER_CHUNK * BLCKSZ) < 0)
			neon_log(LOG, "Failed to punch hole in file: %m");
#endif
		/* We remove the old entry, and re-enter a hole to the hash table */
		for (int i = 0; i < lfc_blocks_per_chunk; i++)
		/* We remove the entry, and enter a hole to the freelist */
		for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
		{
			bool		is_page_cached = GET_STATE(victim, i) == AVAILABLE;
			lfc_ctl->used_pages -= is_page_cached;
			lfc_ctl->evicted_pages += is_page_cached;
		}
		hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL);
		file_cache_hash_remove_entry(lfc_hash, victim);

		memset(&holetag, 0, sizeof(holetag));
		holetag.blockNum = offset;
		hash = get_hash_value(lfc_hash, &holetag);
		hole = hash_search_with_hash_value(lfc_hash, &holetag, hash, HASH_ENTER, &found);
		hole->hash = hash;
		hole->offset = offset;
		hole->access_count = 0;
		CriticalAssert(!found);
		dlist_push_tail(&lfc_ctl->holes, &hole->list_node);
		if (!freelist_push(offset))
		{
			/* freelist_push already logged the error */
			lfc_switch_off();
			LWLockRelease(lfc_lock);
			LWLockRelease(lfc_freelist_lock);
			return;
		}

		lfc_ctl->used -= 1;
	}
@@ -504,6 +519,7 @@ lfc_change_limit_hook(int newval, void *extra)
	neon_log(DEBUG1, "set local file cache limit to %d", new_size);

	LWLockRelease(lfc_lock);
	LWLockRelease(lfc_freelist_lock);
}

void
@@ -579,14 +595,14 @@ lfc_init(void)
	DefineCustomIntVariable("neon.file_cache_chunk_size",
							"LFC chunk size in blocks (should be power of two)",
							NULL,
							&lfc_blocks_per_chunk,
							MAX_BLOCKS_PER_CHUNK,
							1,
							MAX_BLOCKS_PER_CHUNK,
							PGC_POSTMASTER,
							&lfc_blocks_per_chunk_ro,
							BLOCKS_PER_CHUNK,
							BLOCKS_PER_CHUNK,
							BLOCKS_PER_CHUNK,
							PGC_INTERNAL,
							GUC_UNIT_BLOCKS,
							lfc_check_chunk_size,
							lfc_change_chunk_size,
							NULL,
							NULL,
							NULL);

	DefineCustomIntVariable("neon.file_cache_prewarm_limit",
@@ -649,19 +665,19 @@ lfc_get_state(size_t max_entries)
		fcs = (FileCacheState*)palloc0(state_size);
		SET_VARSIZE(fcs, state_size);
		fcs->magic = FILE_CACHE_STATE_MAGIC;
		fcs->chunk_size_log = lfc_chunk_size_log;
		fcs->chunk_size_log = BLOCKS_PER_CHUNK_LOG;
		fcs->n_chunks = n_entries;
		bitmap = FILE_CACHE_STATE_BITMAP(fcs);

		dlist_reverse_foreach(iter, &lfc_ctl->lru)
		{
			FileCacheEntry *entry = dlist_container(FileCacheEntry, list_node, iter.cur);
			fcs->chunks[i] = entry->key;
			for (int j = 0; j < lfc_blocks_per_chunk; j++)
			fcs->chunks[i] = *file_cache_hash_get_key_for_entry(lfc_hash, entry);
			for (int j = 0; j < BLOCKS_PER_CHUNK; j++)
			{
				if (GET_STATE(entry, j) != UNAVAILABLE)
				{
					BITMAP_SET(bitmap, i*lfc_blocks_per_chunk + j);
					BITMAP_SET(bitmap, i*BLOCKS_PER_CHUNK + j);
					n_pages += 1;
				}
			}
@@ -670,7 +686,7 @@ lfc_get_state(size_t max_entries)
		}
		Assert(i == n_entries);
		fcs->n_pages = n_pages;
		Assert(pg_popcount((char*)bitmap, ((n_entries << lfc_chunk_size_log) + 7)/8) == n_pages);
		Assert(pg_popcount((char*)bitmap, ((n_entries << BLOCKS_PER_CHUNK_LOG) + 7)/8) == n_pages);
		elog(LOG, "LFC: save state of %d chunks %d pages", (int)n_entries, (int)n_pages);
	}

@@ -726,7 +742,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
	}

	fcs_chunk_size_log = fcs->chunk_size_log;
	if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
	if (fcs_chunk_size_log > BLOCKS_PER_CHUNK_LOG)
	{
		elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
	}
@@ -945,7 +961,7 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
	BufferTag	tag;
	FileCacheEntry *entry;
	uint32		hash;
	uint64		hash;

	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
		return;
@@ -958,14 +974,14 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
	if (LFC_ENABLED())
	{
		for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk)
		for (BlockNumber blkno = 0; blkno < nblocks; blkno += BLOCKS_PER_CHUNK)
		{
			tag.blockNum = blkno;
			hash = get_hash_value(lfc_hash, &tag);
			entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
			hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
			entry = file_cache_hash_find(lfc_hash, &tag, hash);
			if (entry != NULL)
			{
				for (int i = 0; i < lfc_blocks_per_chunk; i++)
				for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
				{
					if (GET_STATE(entry, i) == AVAILABLE)
					{
@@ -990,7 +1006,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
	FileCacheEntry *entry;
	int			chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
	bool		found = false;
	uint32		hash;
	uint64		hash;

	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
		return false;
@@ -1000,12 +1016,12 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
	tag.blockNum = blkno - chunk_offs;

	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
	hash = get_hash_value(lfc_hash, &tag);
	hash = file_cache_hash_get_hash_value(lfc_hash, &tag);

	LWLockAcquire(lfc_lock, LW_SHARED);
	if (LFC_ENABLED())
	{
		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
		entry = file_cache_hash_find(lfc_hash, &tag, hash);
		found = entry != NULL && GET_STATE(entry, chunk_offs) != UNAVAILABLE;
	}
	LWLockRelease(lfc_lock);
@@ -1024,7 +1040,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	FileCacheEntry *entry;
	uint32		chunk_offs;
	int			found = 0;
	uint32		hash;
	uint64		hash;
	int			i = 0;

	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
@@ -1037,7 +1053,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

	chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
	tag.blockNum = blkno - chunk_offs;
	hash = get_hash_value(lfc_hash, &tag);
	hash = file_cache_hash_get_hash_value(lfc_hash, &tag);

	LWLockAcquire(lfc_lock, LW_SHARED);

@@ -1048,12 +1064,12 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	}
	while (true)
	{
		int			this_chunk = Min(nblocks - i, lfc_blocks_per_chunk - chunk_offs);
		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
		int			this_chunk = Min(nblocks - i, BLOCKS_PER_CHUNK - chunk_offs);
		entry = file_cache_hash_find(lfc_hash, &tag, hash);

		if (entry != NULL)
		{
			for (; chunk_offs < lfc_blocks_per_chunk && i < nblocks; chunk_offs++, i++)
			for (; chunk_offs < BLOCKS_PER_CHUNK && i < nblocks; chunk_offs++, i++)
			{
				if (GET_STATE(entry, chunk_offs) != UNAVAILABLE)
				{
@@ -1079,7 +1095,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
			 */
			chunk_offs = BLOCK_TO_CHUNK_OFF(blkno + i);
			tag.blockNum = (blkno + i) - chunk_offs;
			hash = get_hash_value(lfc_hash, &tag);
			hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
		}

	LWLockRelease(lfc_lock);
@@ -1128,7 +1144,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	BufferTag	tag;
	FileCacheEntry *entry;
	ssize_t		rc;
	uint32		hash;
	uint64		hash;
	uint64		generation;
	uint32		entry_offset;
	int			blocks_read = 0;
@@ -1154,9 +1170,9 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	while (nblocks > 0)
	{
		struct iovec iov[PG_IOV_MAX];
		uint8		chunk_mask[MAX_BLOCKS_PER_CHUNK / 8] = {0};
		uint8		chunk_mask[BLOCKS_PER_CHUNK / 8] = {0};
		int			chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
		int			blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs);
		int			blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
		int			iteration_hits = 0;
		int			iteration_misses = 0;
		uint64		io_time_us = 0;
@@ -1206,7 +1222,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
		Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read);

		tag.blockNum = blkno - chunk_offs;
		hash = get_hash_value(lfc_hash, &tag);
		hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
		cv = &lfc_ctl->cv[hash % N_COND_VARS];

		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
@@ -1219,13 +1235,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
			return blocks_read;
		}

		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
		entry = file_cache_hash_find(lfc_hash, &tag, hash);

		/* Approximate working set for the blocks assumed in this entry */
		for (int i = 0; i < blocks_in_chunk; i++)
		{
			tag.blockNum = blkno + i;
			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
			addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
		}

		if (entry == NULL)
@@ -1296,7 +1312,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
		if (iteration_hits != 0)
		{
			/* chunk offset (# of pages) into the LFC file */
			off_t		first_read_offset = (off_t) entry_offset * lfc_blocks_per_chunk;
			off_t		first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK;
			int			nwrite = iov_last_used - first_block_in_chunk_read;
			/* offset of first IOV */
			first_read_offset += chunk_offs + first_block_in_chunk_read;
@@ -1373,14 +1389,14 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 * Returns false if there are no unpinned entries and chunk can not be added.
 */
static bool
lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
lfc_init_new_entry(FileCacheEntry *entry)
{
	/*-----------
	 * If the chunk wasn't already in the LFC then we have these
	 * options, in order of preference:
	 *
	 * Unless there is no space available, we can:
	 * 1. Use an entry from the `holes` list, and
	 * 1. Use an entry from the freelist, and
	 * 2. Create a new entry.
	 * We can always, regardless of space in the LFC:
	 * 3. evict an entry from LRU, and
@@ -1388,17 +1404,10 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
	 */
	if (lfc_ctl->used < lfc_ctl->limit)
	{
		if (!dlist_is_empty(&lfc_ctl->holes))
		if (!freelist_is_empty())
		{
			/* We can reuse a hole that was left behind when the LFC was shrunk previously */
			FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node,
												   dlist_pop_head_node(&lfc_ctl->holes));
			uint32		offset = hole->offset;
			bool		hole_found;

			hash_search_with_hash_value(lfc_hash, &hole->key,
										hole->hash, HASH_REMOVE, &hole_found);
			CriticalAssert(hole_found);
			uint32		offset = freelist_pop();

			lfc_ctl->used += 1;
			entry->offset = offset; /* reuse the hole */
@@ -1427,7 +1436,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
		FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node,
												 dlist_pop_head_node(&lfc_ctl->lru));

		for (int i = 0; i < lfc_blocks_per_chunk; i++)
		for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
		{
			bool		is_page_cached = GET_STATE(victim, i) == AVAILABLE;
			lfc_ctl->used_pages -= is_page_cached;
@@ -1436,24 +1445,21 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)

		CriticalAssert(victim->access_count == 0);
		entry->offset = victim->offset; /* grab victim's chunk */
		hash_search_with_hash_value(lfc_hash, &victim->key,
									victim->hash, HASH_REMOVE, NULL);
		file_cache_hash_remove_entry(lfc_hash, victim);
		neon_log(DEBUG2, "Swap file cache page");
	}
	else
	{
		/* Can't add this chunk - we don't have the space for it */
		hash_search_with_hash_value(lfc_hash, &entry->key, hash,
									HASH_REMOVE, NULL);
		file_cache_hash_remove_entry(lfc_hash, entry);
		lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */
		return false;
	}

	entry->access_count = 1;
	entry->hash = hash;
	lfc_ctl->pinned += 1;

	for (int i = 0; i < lfc_blocks_per_chunk; i++)
	for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
		SET_STATE(entry, i, UNAVAILABLE);

	return true;
@@ -1490,7 +1496,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
	FileCacheEntry *entry;
	ssize_t		rc;
	bool		found;
	uint32		hash;
	uint64		hash;
	uint64		generation;
	uint32		entry_offset;
	instr_time	io_start, io_end;
@@ -1509,9 +1515,10 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);

	tag.blockNum = blkno - chunk_offs;
	hash = get_hash_value(lfc_hash, &tag);
	hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
	cv = &lfc_ctl->cv[hash % N_COND_VARS];

retry:
	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);

	if (!LFC_ENABLED() || !lfc_ensure_opened())
@@ -1520,6 +1527,9 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
		return false;
	}

	if (!freelist_prepare_pop())
		goto retry;

	lwlsn = neon_get_lwlsn(rinfo, forknum, blkno);

	if (lwlsn > lsn)
@@ -1530,12 +1540,12 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
		return false;
	}

	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
	entry = file_cache_hash_enter(lfc_hash, &tag, hash, &found);

	if (lfc_prewarm_update_ws_estimation)
	{
		tag.blockNum = blkno;
		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
		addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
	}
	if (found)
	{
@@ -1557,7 +1567,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
	}
	else
	{
		if (!lfc_init_new_entry(entry, hash))
		if (!lfc_init_new_entry(entry))
		{
			/*
			 * We can't process this chunk due to lack of space in LFC,
@@ -1578,7 +1588,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
	pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
	INSTR_TIME_SET_CURRENT(io_start);
	rc = pwrite(lfc_desc, buffer, BLCKSZ,
				((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
				((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
	INSTR_TIME_SET_CURRENT(io_end);
	pgstat_report_wait_end();

@@ -1640,7 +1650,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	FileCacheEntry *entry;
	ssize_t		rc;
	bool		found;
	uint32		hash;
	uint64		hash;
	uint64		generation;
	uint32		entry_offset;
	int			buf_offset = 0;
@@ -1653,6 +1663,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);

retry:
	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);

	if (!LFC_ENABLED() || !lfc_ensure_opened())
@@ -1662,6 +1673,9 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	}
	generation = lfc_ctl->generation;

	if (!freelist_prepare_pop())
		goto retry;

	/*
	 * For every chunk that has blocks we're interested in, we
	 * 1. get the chunk header
@@ -1675,7 +1689,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	{
		struct iovec iov[PG_IOV_MAX];
		int			chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
		int			blocks_in_chunk = Min(nblocks, lfc_blocks_per_chunk - chunk_offs);
		int			blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
		instr_time	io_start, io_end;
		ConditionVariable* cv;

@@ -1688,16 +1702,16 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
		}

		tag.blockNum = blkno - chunk_offs;
		hash = get_hash_value(lfc_hash, &tag);
		hash = file_cache_hash_get_hash_value(lfc_hash, &tag);
		cv = &lfc_ctl->cv[hash % N_COND_VARS];

		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
		entry = file_cache_hash_enter(lfc_hash, &tag, hash, &found);

		/* Approximate working set for the blocks assumed in this entry */
		for (int i = 0; i < blocks_in_chunk; i++)
		{
			tag.blockNum = blkno + i;
			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
			addSHLL(&lfc_ctl->wss_estimation, file_cache_hash_get_hash_value(lfc_hash, &tag));
		}

		if (found)
@@ -1714,7 +1728,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
		}
		else
		{
			if (!lfc_init_new_entry(entry, hash))
			if (!lfc_init_new_entry(entry))
			{
				/*
				 * We can't process this chunk due to lack of space in LFC,
@@ -1763,7 +1777,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
		pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE);
		INSTR_TIME_SET_CURRENT(io_start);
		rc = pwritev(lfc_desc, iov, blocks_in_chunk,
					 ((off_t) entry_offset * lfc_blocks_per_chunk + chunk_offs) * BLCKSZ);
					 ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
		INSTR_TIME_SET_CURRENT(io_end);
		pgstat_report_wait_end();

@@ -1823,6 +1837,140 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
	LWLockRelease(lfc_lock);
}

/**** freelist management ****/

/*
 * Prerequisites:
 * - The caller is holding 'lfc_lock'. XXX
 */
static bool
freelist_prepare_pop(void)
{
	/*
	 * If the in-memory freelist is empty, but there are more blocks available, load them.
	 *
	 * TODO: if there
	 */
	if (lfc_ctl->num_free_pages == 0 && lfc_ctl->freelist_head != INVALID_OFFSET)
	{
		uint32		freelist_head;
		FreeListChunk *freelist_chunk;
		size_t		bytes_read;

		LWLockRelease(lfc_lock);
		LWLockAcquire(lfc_freelist_lock, LW_EXCLUSIVE);

		if (!(lfc_ctl->num_free_pages == 0 && lfc_ctl->freelist_head != INVALID_OFFSET))
		{
			/* someone else did the work for us while we were not holding the lock */
			LWLockRelease(lfc_freelist_lock);
			return false;
		}

		freelist_head = lfc_ctl->freelist_head;
		freelist_chunk = palloc(BLOCKS_PER_CHUNK * BLCKSZ);

		bytes_read = 0;
		while (bytes_read < BLOCKS_PER_CHUNK * BLCKSZ)
		{
			ssize_t		rc;

			rc = pread(lfc_desc, (char *) freelist_chunk + bytes_read, BLOCKS_PER_CHUNK * BLCKSZ - bytes_read, (off_t) freelist_head * BLOCKS_PER_CHUNK * BLCKSZ + bytes_read);
			if (rc < 0)
			{
				lfc_disable("read freelist page");
				return false;
			}
			bytes_read += rc;
		}

		LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
		if (lfc_generation != lfc_ctl->generation)
		{
			LWLockRelease(lfc_lock);
			return false;
		}

		Assert(lfc_ctl->freelist_head == freelist_head);
		Assert(lfc_ctl->num_free_pages == 0);
		lfc_ctl->freelist_head = freelist_chunk->next;
		lfc_ctl->num_free_pages = freelist_chunk->num_free_pages;
		memcpy(lfc_ctl->free_pages, freelist_chunk->free_pages, lfc_ctl->num_free_pages * sizeof(uint32));
		pfree(freelist_chunk);

		LWLockRelease(lfc_lock);
		LWLockRelease(lfc_freelist_lock);
		return false;
	}

	return true;
}

/*
 * Prerequisites:
 * - The caller is holding 'lfc_lock' and 'lfc_freelist_lock'.
 *
 * Returns 'false' on error.
 */
static bool
freelist_push(uint32 offset)
{
	Assert(lfc_ctl->freelist_size == FREELIST_ENTRIES_PER_CHUNK);
	if (lfc_ctl->num_free_pages == lfc_ctl->freelist_size)
	{
		FreeListChunk *freelist_chunk;
		struct iovec iov;
		ssize_t		rc;

		freelist_chunk = palloc(BLOCKS_PER_CHUNK * BLCKSZ);

		/* write the existing entries to the chunk on disk */
		freelist_chunk->next = lfc_ctl->freelist_head;
		freelist_chunk->num_free_pages = lfc_ctl->num_free_pages;
		memcpy(freelist_chunk->free_pages, lfc_ctl->free_pages, lfc_ctl->num_free_pages * sizeof(uint32));

		/* Use the passed-in offset to hold the freelist chunk itself */
		iov.iov_base = freelist_chunk;
		iov.iov_len = BLOCKS_PER_CHUNK * BLCKSZ;
		rc = pg_pwritev_with_retry(lfc_desc, &iov, 1, (off_t) offset * BLOCKS_PER_CHUNK * BLCKSZ);

		pfree(freelist_chunk);

		if (rc < 0)
			return false;

		lfc_ctl->freelist_head = offset;
		lfc_ctl->num_free_pages = 0;
	}
	else
	{
		lfc_ctl->free_pages[lfc_ctl->num_free_pages] = offset;
		lfc_ctl->num_free_pages++;
	}
	return true;
}

static uint32
freelist_pop(void)
{
	uint32		result;

	/* The caller should've checked that the list is not empty */
	Assert(lfc_ctl->num_free_pages > 0);

	result = lfc_ctl->free_pages[lfc_ctl->num_free_pages - 1];
	lfc_ctl->num_free_pages--;

	return result;
}

static bool
freelist_is_empty(void)
{
	return lfc_ctl->num_free_pages == 0;
}
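
Taken together: freelist_push spills a full in-memory array into the on-disk chunk addressed by the very offset being pushed, chaining it via 'next', and freelist_prepare_pop reloads one chain link when the array runs dry. A toy, self-contained model of that spill/reload cycle (all names and sizes are invented; the real code moves the data with pwrite/pread on the LFC file under the two LWLocks, and, like the hunks above, this sketch does not immediately return the page that held a spilled chunk to the pool):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define CAP 4                      /* stand-in for FREELIST_ENTRIES_PER_CHUNK */
#define INVALID UINT32_MAX

typedef struct { uint32_t next, n, pages[CAP]; } Chunk;

static Chunk    disk[64];          /* stand-in for chunk slots in the LFC file */
static uint32_t head = INVALID;    /* like lfc_ctl->freelist_head */
static uint32_t n_free;            /* like lfc_ctl->num_free_pages */
static uint32_t free_pages[CAP];

static void push(uint32_t off)
{
	if (n_free == CAP)
	{
		/* Spill: the pushed offset itself becomes storage for the chunk. */
		disk[off].next = head;
		disk[off].n = n_free;
		memcpy(disk[off].pages, free_pages, sizeof free_pages);
		head = off;
		n_free = 0;
	}
	else
		free_pages[n_free++] = off;
}

static uint32_t pop(void)
{
	if (n_free == 0 && head != INVALID)
	{
		/* Reload one chain link, as freelist_prepare_pop() does. */
		Chunk *c = &disk[head];
		memcpy(free_pages, c->pages, sizeof free_pages);
		n_free = c->n;
		head = c->next;
	}
	assert(n_free > 0);            /* caller must check emptiness first */
	return free_pages[--n_free];
}

int main(void)
{
	for (uint32_t off = 0; off < 10; off++)
		push(off);                 /* pushes of offsets 4 and 9 spill to "disk" */
	for (int i = 0; i < 8; i++)
		pop();                     /* drains memory, then reloads a chain link */
	return 0;
}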

typedef struct
{
	TupleDesc	tupdesc;
@@ -1919,7 +2067,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
			break;
		case 8:
			key = "file_cache_chunk_size_pages";
			value = lfc_blocks_per_chunk;
			value = BLOCKS_PER_CHUNK;
			break;
		case 9:
			key = "file_cache_chunks_pinned";
@@ -1990,7 +2138,6 @@ local_cache_pages(PG_FUNCTION_ARGS)

	if (SRF_IS_FIRSTCALL())
	{
		HASH_SEQ_STATUS status;
		FileCacheEntry *entry;
		uint32		n_pages = 0;

@@ -2046,15 +2193,16 @@ local_cache_pages(PG_FUNCTION_ARGS)

		if (LFC_ENABLED())
		{
			hash_seq_init(&status, lfc_hash);
			while ((entry = hash_seq_search(&status)) != NULL)
			uint32		num_buckets = file_cache_hash_get_num_buckets(lfc_hash);

			for (uint32 pos = 0; pos < num_buckets; pos++)
			{
				/* Skip hole tags */
				if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
				{
					for (int i = 0; i < lfc_blocks_per_chunk; i++)
						n_pages += GET_STATE(entry, i) == AVAILABLE;
				}
				entry = file_cache_hash_get_at_pos(lfc_hash, pos);
				if (entry == NULL)
					continue;

				for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
					n_pages += GET_STATE(entry, i) == AVAILABLE;
			}
		}
	}
@@ -2076,25 +2224,28 @@ local_cache_pages(PG_FUNCTION_ARGS)
		 * in the fctx->record structure.
		 */
		uint32		n = 0;
		uint32		num_buckets = file_cache_hash_get_num_buckets(lfc_hash);

		hash_seq_init(&status, lfc_hash);
		while ((entry = hash_seq_search(&status)) != NULL)
		for (uint32 pos = 0; pos < num_buckets; pos++)
		{
			for (int i = 0; i < lfc_blocks_per_chunk; i++)
			entry = file_cache_hash_get_at_pos(lfc_hash, pos);
			if (entry == NULL)
				continue;

			for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
			{
				if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
				const BufferTag *key = file_cache_hash_get_key_for_entry(lfc_hash, entry);

				if (GET_STATE(entry, i) == AVAILABLE)
				{
					if (GET_STATE(entry, i) == AVAILABLE)
					{
						fctx->record[n].pageoffs = entry->offset * lfc_blocks_per_chunk + i;
						fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
						fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
						fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
						fctx->record[n].forknum = entry->key.forkNum;
						fctx->record[n].blocknum = entry->key.blockNum + i;
						fctx->record[n].accesscount = entry->access_count;
						n += 1;
					}
					fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
					fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(*key));
					fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(*key));
					fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(*key));
					fctx->record[n].forknum = key->forkNum;
					fctx->record[n].blocknum = key->blockNum + i;
					fctx->record[n].accesscount = entry->access_count;
					n += 1;
				}
			}
		}

@@ -16,6 +16,7 @@
#if PG_MAJORVERSION_NUM >= 15
#include "access/xlogrecovery.h"
#endif
#include "executor/instrument.h"
#include "replication/logical.h"
#include "replication/logicallauncher.h"
#include "replication/slot.h"
@@ -33,6 +34,7 @@
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
#include "neon_perf_counters.h"
#include "control_plane_connector.h"
#include "logical_replication_monitor.h"
#include "unstable_extensions.h"
@@ -46,6 +48,13 @@ void _PG_init(void);

static int	running_xacts_overflow_policy;
static bool monitor_query_exec_time = false;

static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;

static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
static void neon_ExecutorEnd(QueryDesc *queryDesc);

#if PG_MAJORVERSION_NUM >= 16
static shmem_startup_hook_type prev_shmem_startup_hook;
@@ -470,6 +479,16 @@ _PG_init(void)
							 0,
							 NULL, NULL, NULL);

	DefineCustomBoolVariable(
							 "neon.monitor_query_exec_time",
							 "Collect information about query execution time",
							 NULL,
							 &monitor_query_exec_time,
							 false,
							 PGC_USERSET,
							 0,
							 NULL, NULL, NULL);

	DefineCustomBoolVariable(
							 "neon.allow_replica_misconfig",
							 "Allow replica startup when some critical GUCs have smaller value than on primary node",
@@ -508,6 +527,11 @@ _PG_init(void)
	EmitWarningsOnPlaceholders("neon");

	ReportSearchPath();

	prev_ExecutorStart = ExecutorStart_hook;
	ExecutorStart_hook = neon_ExecutorStart;
	prev_ExecutorEnd = ExecutorEnd_hook;
	ExecutorEnd_hook = neon_ExecutorEnd;
}

PG_FUNCTION_INFO_V1(pg_cluster_size);
@@ -581,3 +605,55 @@ neon_shmem_startup_hook(void)
#endif
}
#endif

/*
 * ExecutorStart hook: start up tracking if needed
 */
static void
neon_ExecutorStart(QueryDesc *queryDesc, int eflags)
{
	if (prev_ExecutorStart)
		prev_ExecutorStart(queryDesc, eflags);
	else
		standard_ExecutorStart(queryDesc, eflags);

	if (monitor_query_exec_time)
	{
		/*
		 * Set up to track total elapsed time in ExecutorRun. Make sure the
		 * space is allocated in the per-query context so it will go away at
		 * ExecutorEnd.
		 */
		if (queryDesc->totaltime == NULL)
		{
			MemoryContext oldcxt;

			oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
			queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_TIMER, false);
			MemoryContextSwitchTo(oldcxt);
		}
	}
}

/*
 * ExecutorEnd hook: store results if needed
 */
static void
neon_ExecutorEnd(QueryDesc *queryDesc)
{
	if (monitor_query_exec_time && queryDesc->totaltime)
	{
		/*
		 * Make sure stats accumulation is done. (Note: it's okay if several
		 * levels of hook all do this.)
		 */
		InstrEndLoop(queryDesc->totaltime);

		inc_query_time(queryDesc->totaltime->total * 1000000);	/* convert to usec */
	}

	if (prev_ExecutorEnd)
		prev_ExecutorEnd(queryDesc);
	else
		standard_ExecutorEnd(queryDesc);
}

@@ -71,6 +71,27 @@ inc_iohist(IOHistogram hist, uint64 latency_us)
	hist->wait_us_count++;
}

static inline void
inc_qthist(QTHistogram hist, uint64 elapsed_us)
{
	int			lo = 0;
	int			hi = NUM_QT_BUCKETS - 1;

	/* Find the right bucket with binary search */
	while (lo < hi)
	{
		int			mid = (lo + hi) / 2;

		if (elapsed_us < qt_bucket_thresholds[mid])
			hi = mid;
		else
			lo = mid + 1;
	}
	hist->elapsed_us_bucket[lo]++;
	hist->elapsed_us_sum += elapsed_us;
	hist->elapsed_us_count++;
}
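
The loop above is an upper-bound binary search: it lands on the first bucket whose threshold is strictly greater than the sample. A self-contained check against a truncated copy of the threshold table (only the first twelve thresholds plus the UINT64_MAX catch-all are reproduced here):

#include <assert.h>
#include <stdint.h>

static const uint64_t thr[] = { 2, 3, 6, 10, 20, 30, 60, 100,
                                200, 300, 600, 1000, UINT64_MAX };

static int bucket_for(uint64_t us)
{
	int lo = 0, hi = (int) (sizeof(thr) / sizeof(thr[0])) - 1;

	while (lo < hi)
	{
		int mid = (lo + hi) / 2;

		if (us < thr[mid])
			hi = mid;
		else
			lo = mid + 1;
	}
	return lo;
}

int main(void)
{
	assert(bucket_for(0) == 0);     /* below 2 us  -> first bucket      */
	assert(bucket_for(250) == 9);   /* 250 us      -> "< 300 us" bucket */
	assert(bucket_for(1000) == 12); /* exact match -> the next bucket   */
	return 0;
}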

/*
 * Count a GetPage wait operation.
 */
@@ -98,6 +119,13 @@ inc_page_cache_write_wait(uint64 latency)
	inc_iohist(&MyNeonCounters->file_cache_write_hist, latency);
}

void
inc_query_time(uint64 elapsed)
{
	inc_qthist(&MyNeonCounters->query_time_hist, elapsed);
}

/*
 * Support functions for the views, neon_backend_perf_counters and
 * neon_perf_counters.
@@ -112,11 +140,11 @@ typedef struct
} metric_t;

static int
histogram_to_metrics(IOHistogram histogram,
					 metric_t *metrics,
					 const char *count,
					 const char *sum,
					 const char *bucket)
io_histogram_to_metrics(IOHistogram histogram,
						metric_t *metrics,
						const char *count,
						const char *sum,
						const char *bucket)
{
	int			i = 0;
	uint64		bucket_accum = 0;
@@ -145,10 +173,44 @@ histogram_to_metrics(IOHistogram histogram,
	return i;
}

static int
qt_histogram_to_metrics(QTHistogram histogram,
						metric_t *metrics,
						const char *count,
						const char *sum,
						const char *bucket)
{
	int			i = 0;
	uint64		bucket_accum = 0;

	metrics[i].name = count;
	metrics[i].is_bucket = false;
	metrics[i].value = (double) histogram->elapsed_us_count;
	i++;
	metrics[i].name = sum;
	metrics[i].is_bucket = false;
	metrics[i].value = (double) histogram->elapsed_us_sum / 1000000.0;
	i++;
	for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++)
	{
		uint64		threshold = qt_bucket_thresholds[bucketno];

		bucket_accum += histogram->elapsed_us_bucket[bucketno];

		metrics[i].name = bucket;
		metrics[i].is_bucket = true;
		metrics[i].bucket_le = (threshold == UINT64_MAX) ? INFINITY : ((double) threshold) / 1000000.0;
		metrics[i].value = (double) bucket_accum;
		i++;
	}

	return i;
}

static metric_t *
neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
{
#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 12)
#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + (2 + NUM_QT_BUCKETS) + 12)
	metric_t   *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t));
	int			i = 0;

@@ -159,10 +221,10 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)
		i++; \
	} while (false)

	i += histogram_to_metrics(&counters->getpage_hist, &metrics[i],
							  "getpage_wait_seconds_count",
							  "getpage_wait_seconds_sum",
							  "getpage_wait_seconds_bucket");
	i += io_histogram_to_metrics(&counters->getpage_hist, &metrics[i],
								 "getpage_wait_seconds_count",
								 "getpage_wait_seconds_sum",
								 "getpage_wait_seconds_bucket");

	APPEND_METRIC(getpage_prefetch_requests_total);
	APPEND_METRIC(getpage_sync_requests_total);
@@ -178,14 +240,19 @@ neon_perf_counters_to_metrics(neon_per_backend_counters *counters)

	APPEND_METRIC(file_cache_hits_total);

	i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i],
							  "file_cache_read_wait_seconds_count",
							  "file_cache_read_wait_seconds_sum",
							  "file_cache_read_wait_seconds_bucket");
	i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i],
							  "file_cache_write_wait_seconds_count",
							  "file_cache_write_wait_seconds_sum",
							  "file_cache_write_wait_seconds_bucket");
	i += io_histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i],
								 "file_cache_read_wait_seconds_count",
								 "file_cache_read_wait_seconds_sum",
								 "file_cache_read_wait_seconds_bucket");
	i += io_histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i],
								 "file_cache_write_wait_seconds_count",
								 "file_cache_write_wait_seconds_sum",
								 "file_cache_write_wait_seconds_bucket");

	i += qt_histogram_to_metrics(&counters->query_time_hist, &metrics[i],
								 "query_time_seconds_count",
								 "query_time_seconds_sum",
								 "query_time_seconds_bucket");

	Assert(i == NUM_METRICS);

@@ -257,7 +324,7 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS)
}

static inline void
histogram_merge_into(IOHistogram into, IOHistogram from)
io_histogram_merge_into(IOHistogram into, IOHistogram from)
{
	into->wait_us_count += from->wait_us_count;
	into->wait_us_sum += from->wait_us_sum;
@@ -265,6 +332,15 @@ histogram_merge_into(IOHistogram into, IOHistogram from)
		into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno];
}

static inline void
qt_histogram_merge_into(QTHistogram into, QTHistogram from)
{
	into->elapsed_us_count += from->elapsed_us_count;
	into->elapsed_us_sum += from->elapsed_us_sum;
	for (int bucketno = 0; bucketno < NUM_QT_BUCKETS; bucketno++)
		into->elapsed_us_bucket[bucketno] += from->elapsed_us_bucket[bucketno];
}

PG_FUNCTION_INFO_V1(neon_get_perf_counters);
Datum
neon_get_perf_counters(PG_FUNCTION_ARGS)
@@ -283,7 +359,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
	{
		neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno];

		histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist);
		io_histogram_merge_into(&totals.getpage_hist, &counters->getpage_hist);
		totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total;
		totals.getpage_sync_requests_total += counters->getpage_sync_requests_total;
		totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total;
@@ -294,13 +370,13 @@ neon_get_perf_counters(PG_FUNCTION_ARGS)
		totals.pageserver_open_requests += counters->pageserver_open_requests;
		totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered;
		totals.file_cache_hits_total += counters->file_cache_hits_total;
		histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
		histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);

		totals.compute_getpage_stuck_requests_total += counters->compute_getpage_stuck_requests_total;
		totals.compute_getpage_max_inflight_stuck_time_ms = Max(
			totals.compute_getpage_max_inflight_stuck_time_ms,
			counters->compute_getpage_max_inflight_stuck_time_ms);
		io_histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist);
		io_histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist);
		qt_histogram_merge_into(&totals.query_time_hist, &counters->query_time_hist);
	}

	metrics = neon_perf_counters_to_metrics(&totals);

@@ -36,6 +36,28 @@ typedef struct IOHistogramData

typedef IOHistogramData *IOHistogram;

static const uint64 qt_bucket_thresholds[] = {
	2, 3, 6, 10,							/* 0 us - 10 us */
	20, 30, 60, 100,						/* 10 us - 100 us */
	200, 300, 600, 1000,					/* 100 us - 1 ms */
	2000, 3000, 6000, 10000,				/* 1 ms - 10 ms */
	20000, 30000, 60000, 100000,			/* 10 ms - 100 ms */
	200000, 300000, 600000, 1000000,		/* 100 ms - 1 s */
	2000000, 3000000, 6000000, 10000000,	/* 1 s - 10 s */
	20000000, 30000000, 60000000, 100000000,	/* 10 s - 100 s */
	UINT64_MAX,
};
#define NUM_QT_BUCKETS (lengthof(qt_bucket_thresholds))
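
The table is roughly logarithmic: within each decade the thresholds step 2-3-6-10, giving 32 finite thresholds from 2 us up to 100 s, with the UINT64_MAX sentinel acting as the catch-all bucket that qt_histogram_to_metrics above reports with a bucket_le of INFINITY.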

typedef struct QTHistogramData
{
	uint64		elapsed_us_count;
	uint64		elapsed_us_sum;
	uint64		elapsed_us_bucket[NUM_QT_BUCKETS];
} QTHistogramData;

typedef QTHistogramData *QTHistogram;

typedef struct
{
	/*
@@ -127,6 +149,11 @@ typedef struct
	/* LFC I/O time buckets */
	IOHistogramData file_cache_read_hist;
	IOHistogramData file_cache_write_hist;

	/*
	 * Histogram of query execution time.
	 */
	QTHistogramData query_time_hist;
} neon_per_backend_counters;

/* Pointer to the shared memory array of neon_per_backend_counters structs */
@@ -149,6 +176,7 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared;
extern void inc_getpage_wait(uint64 latency);
extern void inc_page_cache_read_wait(uint64 latency);
extern void inc_page_cache_write_wait(uint64 latency);
extern void inc_query_time(uint64 elapsed);

extern Size NeonPerfCountersShmemSize(void);
extern void NeonPerfCountersShmemInit(void);

@@ -5,6 +5,7 @@

#include "funcapi.h"
#include "miscadmin.h"
#include "access/xlog.h"
#include "utils/tuplestore.h"

#include "neon_pgversioncompat.h"
@@ -41,5 +42,12 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
	rsinfo->setDesc = stored_tupdesc;
	MemoryContextSwitchTo(old_context);
}

TimeLineID GetWALInsertionTimeLine(void)
{
	return ThisTimeLineID + 1;
}

#endif

@@ -162,6 +162,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,

#if PG_MAJORVERSION_NUM < 15
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
extern TimeLineID GetWALInsertionTimeLine(void);
#endif

#endif							/* NEON_PGVERSIONCOMPAT_H */

@@ -69,6 +69,7 @@ struct NeonWALReader
	WALSegmentContext segcxt;
	WALOpenSegment seg;
	int			wre_errno;
	TimeLineID	local_active_tlid;
	/* Explains failure to read, static for simplicity. */
	char		err_msg[NEON_WALREADER_ERR_MSG_LEN];

@@ -106,7 +107,7 @@ struct NeonWALReader

/* palloc and initialize NeonWALReader */
NeonWALReader *
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix)
NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid)
{
	NeonWALReader *reader;

@@ -118,6 +119,7 @@ NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_
		MemoryContextAllocZero(TopMemoryContext, sizeof(NeonWALReader));

	reader->available_lsn = available_lsn;
	reader->local_active_tlid = tlid;
	reader->seg.ws_file = -1;
	reader->seg.ws_segno = 0;
	reader->seg.ws_tli = 0;
@@ -577,6 +579,17 @@ NeonWALReaderIsRemConnEstablished(NeonWALReader *state)
	return state->rem_state == RS_ESTABLISHED;
}

/*
 * Return the timeline ID used for local reads of the WAL.
 */
TimeLineID
NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state)
{
	return state->local_active_tlid;
}

/*
 * Returns events user should wait on connection socket or 0 if remote
 * connection is not active.

@@ -19,9 +19,10 @@ typedef enum
	NEON_WALREAD_ERROR,
} NeonWALReadResult;

extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix);
extern NeonWALReader *NeonWALReaderAllocate(int wal_segment_size, XLogRecPtr available_lsn, char *log_prefix, TimeLineID tlid);
extern void NeonWALReaderFree(NeonWALReader *state);
extern void NeonWALReaderResetRemote(NeonWALReader *state);
extern TimeLineID NeonWALReaderLocalActiveTimeLineID(NeonWALReader *state);
extern NeonWALReadResult NeonWALRead(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli);
extern pgsocket NeonWALReaderSocket(NeonWALReader *state);
extern uint32 NeonWALReaderEvents(NeonWALReader *state);

@@ -98,6 +98,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
	wp = palloc0(sizeof(WalProposer));
	wp->config = config;
	wp->api = api;
	wp->localTimeLineID = config->pgTimeline;
	wp->state = WPS_COLLECTING_TERMS;
	wp->mconf.generation = INVALID_GENERATION;
	wp->mconf.members.len = 0;
@@ -119,6 +120,10 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
		{
			wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
		}
		if (*endptr != ':')
		{
			wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation");
		}
		/* Skip past : to the first hostname. */
		host = endptr + 1;
	}
@@ -1380,7 +1385,7 @@ ProcessPropStartPos(WalProposer *wp)
	 * we must bail out, as clog and other non rel data is inconsistent.
	 */
	walprop_shared = wp->api.get_shmem_state(wp);
	if (!wp->config->syncSafekeepers)
	if (!wp->config->syncSafekeepers && !walprop_shared->replica_promote)
	{
		/*
		 * Basebackup LSN always points to the beginning of the record (not

@@ -391,6 +391,7 @@ typedef struct WalproposerShmemState
	/* last feedback from each shard */
	PageserverFeedback shard_ps_feedback[MAX_SHARDS];
	int			num_shards;
	bool		replica_promote;

	/* aggregated feedback with min LSNs across shards */
	PageserverFeedback min_ps_feedback;
@@ -806,6 +807,9 @@ typedef struct WalProposer
	/* Safekeepers walproposer is connecting to. */
	Safekeeper	safekeeper[MAX_SAFEKEEPERS];

	/* Current local TimeLineId in use */
	TimeLineID	localTimeLineID;

	/* WAL has been generated up to this point */
	XLogRecPtr	availableLsn;

@@ -35,6 +35,7 @@
#include "storage/proc.h"
#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "tcop/tcopprot.h"
@@ -159,12 +160,19 @@ WalProposerMain(Datum main_arg)
{
	WalProposer *wp;

	if (*wal_acceptors_list == '\0')
	{
		wpg_log(WARNING, "Safekeepers list is empty");
		return;
	}

	init_walprop_config(false);
	walprop_pg_init_bgworker();
	am_walproposer = true;
	walprop_pg_load_libpqwalreceiver();

	wp = WalProposerCreate(&walprop_config, walprop_pg);
	wp->localTimeLineID = GetWALInsertionTimeLine();
	wp->last_reconnect_attempt = walprop_pg_get_current_timestamp(wp);

	walprop_pg_init_walsender();
@@ -272,6 +280,30 @@ split_safekeepers_list(char *safekeepers_list, char *safekeepers[])
	return n_safekeepers;
}

static char *split_off_safekeepers_generation(char *safekeepers_list, uint32 *generation)
{
	char	   *endptr;

	if (strncmp(safekeepers_list, "g#", 2) != 0)
	{
		return safekeepers_list;
	}
	else
	{
		errno = 0;
		*generation = strtoul(safekeepers_list + 2, &endptr, 10);
		if (errno != 0)
		{
			wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
		}
		if (*endptr != ':')
		{
			wp_log(FATAL, "failed to parse neon.safekeepers: no colon after generation");
		}
		return endptr + 1;
	}
}
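
The convention parsed above is an optional "g#<generation>:" prefix in front of the comma-separated safekeeper list. A self-contained sketch of the same strtoul-based parse (the generation value and host names are invented for illustration):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *list = "g#42:sk1:5454,sk2:5454";
	const char *hosts = list;
	unsigned long gen = 0;

	if (strncmp(list, "g#", 2) == 0)
	{
		char *endptr;

		gen = strtoul(list + 2, &endptr, 10);
		assert(*endptr == ':');	/* the real code FATALs on a missing colon */
		hosts = endptr + 1;
	}
	printf("generation=%lu hosts=%s\n", gen, hosts);	/* 42, sk1:5454,... */
	return 0;
}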

/*
 * Accept two comma-separated strings with list of safekeeper host:port addresses.
 * Split them into arrays and return false if two sets do not match, ignoring the order.
|
||||
@@ -283,6 +315,16 @@ safekeepers_cmp(char *old, char *new)
|
||||
char *safekeepers_new[MAX_SAFEKEEPERS];
|
||||
int len_old = 0;
|
||||
int len_new = 0;
|
||||
uint32 gen_old = INVALID_GENERATION;
|
||||
uint32 gen_new = INVALID_GENERATION;
|
||||
|
||||
old = split_off_safekeepers_generation(old, &gen_old);
|
||||
new = split_off_safekeepers_generation(new, &gen_new);
|
||||
|
||||
if (gen_old != gen_new)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
len_old = split_safekeepers_list(old, safekeepers_old);
|
||||
len_new = split_safekeepers_list(new, safekeepers_new);
|
||||
@@ -316,6 +358,9 @@ assign_neon_safekeepers(const char *newval, void *extra)
|
||||
char *newval_copy;
|
||||
char *oldval;
|
||||
|
||||
if (newval && *newval != '\0' && UsedShmemSegAddr && walprop_shared && RecoveryInProgress())
|
||||
walprop_shared->replica_promote = true;
|
||||
|
||||
if (!am_walproposer)
|
||||
return;
|
||||
|
||||
@@ -506,16 +551,15 @@ BackpressureThrottlingTime(void)
|
||||
|
||||
/*
|
||||
* Register a background worker proposing WAL to wal acceptors.
|
||||
* We start walproposer bgworker even for replicas in order to support possible replica promotion.
|
||||
* When pg_promote() function is called, then walproposer bgworker registered with BgWorkerStart_RecoveryFinished
|
||||
* is automatically launched when promotion is completed.
|
||||
*/
|
||||
static void
|
||||
walprop_register_bgworker(void)
|
||||
{
|
||||
BackgroundWorker bgw;
|
||||
|
||||
/* If no wal acceptors are specified, don't start the background worker. */
|
||||
if (*wal_acceptors_list == '\0')
|
||||
return;
|
||||
|
||||
memset(&bgw, 0, sizeof(bgw));
|
||||
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
||||
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
|
||||
@@ -1292,9 +1336,7 @@ StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd)
|
||||
|
||||
#if PG_VERSION_NUM < 150000
|
||||
if (ThisTimeLineID == 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
||||
errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
|
||||
ThisTimeLineID = 1;
|
||||
#endif
|
||||
|
||||
/*
|
||||
@@ -1508,7 +1550,7 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk)
|
||||
|
||||
snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port);
|
||||
Assert(!sk->xlogreader);
|
||||
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix);
|
||||
sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propTermStartLsn, log_prefix, sk->wp->localTimeLineID);
|
||||
if (sk->xlogreader == NULL)
|
||||
wpg_log(FATAL, "failed to allocate xlog reader");
|
||||
}
|
||||
@@ -1522,7 +1564,7 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count,
|
||||
buf,
|
||||
startptr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
sk->wp->localTimeLineID);
|
||||
|
||||
if (res == NEON_WALREAD_SUCCESS)
|
||||
{
|
||||
|
||||
@@ -111,7 +111,7 @@ NeonWALPageRead(
|
||||
readBuf,
|
||||
targetPagePtr,
|
||||
count,
|
||||
walprop_pg_get_timeline_id());
|
||||
NeonWALReaderLocalActiveTimeLineID(wal_reader));
|
||||
|
||||
if (res == NEON_WALREAD_SUCCESS)
|
||||
{
|
||||
@@ -202,7 +202,7 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr)
|
||||
{
|
||||
elog(ERROR, "unable to start walsender when basebackupLsn is 0");
|
||||
}
|
||||
wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ");
|
||||
wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] ", 1);
|
||||
}
|
||||
xlr->page_read = NeonWALPageRead;
|
||||
xlr->segment_open = NeonWALReadSegmentOpen;
|
||||
|
@@ -15,9 +15,9 @@ use crate::context::RequestContext;
use crate::control_plane::client::cplane_proxy_v1;
use crate::control_plane::{self, CachedNodeInfo, NodeInfo};
use crate::error::{ReportableError, UserFacingError};
use crate::pglb::connect_compute::ComputeConnectBackend;
use crate::pqproto::BeMessage;
use crate::proxy::NeonOptions;
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::stream::PqStream;
use crate::types::RoleName;
use crate::{auth, compute, waiters};

@@ -25,9 +25,9 @@ use crate::control_plane::{
    RoleAccessControl,
};
use crate::intern::EndpointIdInt;
use crate::pglb::connect_compute::ComputeConnectBackend;
use crate::pqproto::BeMessage;
use crate::proxy::NeonOptions;
use crate::proxy::connect_compute::ComputeConnectBackend;
use crate::rate_limiter::EndpointRateLimiter;
use crate::stream::Stream;
use crate::types::{EndpointCacheKey, EndpointId, RoleName};

@@ -221,8 +221,7 @@ struct ProxyCliArgs {
    is_private_access_proxy: bool,

    /// Configure whether all incoming requests have a Proxy Protocol V2 packet.
    // TODO(conradludgate): switch default to rejected or required once we've updated all deployments
    #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)]
    #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Rejected)]
    proxy_protocol_v2: ProxyProtocolV2,

    /// Time the proxy waits for the webauth session to be confirmed by the control plane.

@@ -39,8 +39,6 @@ pub struct ComputeConfig {
pub enum ProxyProtocolV2 {
    /// Connection will error if PROXY protocol v2 header is missing
    Required,
    /// Connection will parse PROXY protocol v2 header, but accept the connection if it's missing.
    Supported,
    /// Connection will error if PROXY protocol v2 header is provided
    Rejected,
}
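Taken together, the hunks above and below change the proxy from sniffing for a PROXY protocol header on every connection to consulting the configured mode first: under `Required` the header must parse (with `Local` healthchecks closed immediately), while under `Rejected` no header bytes are read at all and the plain TCP peer address is used. A minimal sketch of that decision, using hypothetical stand-in types rather than the crate's own:

```rust
use std::net::SocketAddr;

// Hypothetical stand-ins for the crate's ProxyProtocolV2 / ConnectHeader types.
#[derive(Clone, Copy)]
enum Mode {
    Required,
    Rejected,
}

enum Header {
    Local,             // healthcheck from the load balancer, carries no client
    Proxy(SocketAddr), // real client address carried in the header
}

/// Decide which address to attribute the connection to, or why to drop it.
fn connection_addr(
    mode: Mode,
    parsed: Option<Header>, // what header parsing would yield under Required
    peer_addr: SocketAddr,  // the raw TCP peer address
) -> Result<SocketAddr, &'static str> {
    match mode {
        Mode::Required => match parsed {
            Some(Header::Proxy(addr)) => Ok(addr),
            Some(Header::Local) => Err("healthcheck received; close immediately"),
            None => Err("missing required proxy protocol header"),
        },
        // Rejected: never read a header; attribute the connection to the TCP peer.
        Mode::Rejected => Ok(peer_addr),
    }
}

fn main() {
    let peer: SocketAddr = "127.0.0.1:5432".parse().unwrap();
    assert_eq!(connection_addr(Mode::Rejected, None, peer), Ok(peer));
}
```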
@@ -11,10 +11,10 @@ use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::context::RequestContext;
use crate::error::ReportableError;
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute};
use crate::pglb::handshake::{HandshakeData, handshake};
use crate::pglb::passthrough::ProxyPassthrough;
use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol};
use crate::proxy::connect_compute::{TcpMechanism, connect_to_compute};
use crate::proxy::handshake::{HandshakeData, handshake};
use crate::proxy::passthrough::ProxyPassthrough;
use crate::proxy::{
    ClientRequestError, ErrorSource, prepare_client_connection, run_until_cancelled,
};

@@ -54,30 +54,24 @@ pub async fn task_main(
    debug!(protocol = "tcp", %session_id, "accepted new TCP connection");

    connections.spawn(async move {
        let (socket, peer_addr) = match read_proxy_protocol(socket).await {
            Err(e) => {
                error!("per-client task finished with an error: {e:#}");
                return;
        let (socket, conn_info) = match config.proxy_protocol_v2 {
            ProxyProtocolV2::Required => {
                match read_proxy_protocol(socket).await {
                    Err(e) => {
                        error!("per-client task finished with an error: {e:#}");
                        return;
                    }
                    // our load balancers will not send any more data. let's just exit immediately
                    Ok((_socket, ConnectHeader::Local)) => {
                        debug!("healthcheck received");
                        return;
                    }
                    Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
                }
            }
            // our load balancers will not send any more data. let's just exit immediately
            Ok((_socket, ConnectHeader::Local)) => {
                debug!("healthcheck received");
                return;
            }
            Ok((_socket, ConnectHeader::Missing))
                if config.proxy_protocol_v2 == ProxyProtocolV2::Required =>
            {
                error!("missing required proxy protocol header");
                return;
            }
            Ok((_socket, ConnectHeader::Proxy(_)))
                if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected =>
            {
                error!("proxy protocol header not supported");
                return;
            }
            Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
            Ok((socket, ConnectHeader::Missing)) => (
            // ignore the header - it cannot be confused for a postgres or http connection so will
            // error later.
            ProxyProtocolV2::Rejected => (
                socket,
                ConnectionInfo {
                    addr: peer_addr,

@@ -86,7 +80,7 @@ pub async fn task_main(
            ),
        };

        match socket.inner.set_nodelay(true) {
        match socket.set_nodelay(true) {
            Ok(()) => {}
            Err(e) => {
                error!(

@@ -98,7 +92,7 @@ pub async fn task_main(

        let ctx = RequestContext::new(
            session_id,
            peer_addr,
            conn_info,
            crate::metrics::Protocol::Tcp,
            &config.region,
        );
@@ -2,7 +2,6 @@ use async_trait::async_trait;
use tokio::time;
use tracing::{debug, info, warn};

use super::retry::ShouldRetryWakeCompute;
use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo};
use crate::compute::{self, COULD_NOT_CONNECT, PostgresConnection};
use crate::config::{ComputeConfig, RetryConfig};

@@ -15,7 +14,7 @@ use crate::metrics::{
    ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType,
};
use crate::pqproto::StartupMessageParams;
use crate::proxy::retry::{CouldRetry, retry_after, should_retry};
use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute, retry_after, should_retry};
use crate::proxy::wake_compute::wake_compute;
use crate::types::Host;
@@ -1 +1,5 @@
pub mod connect_compute;
pub mod copy_bidirectional;
pub mod handshake;
pub mod inprocess;
pub mod passthrough;
@@ -53,7 +53,7 @@ pub(crate) async fn proxy_pass(

    // Starting from here we only proxy the client's traffic.
    debug!("performing the proxy pass...");
    let _ = crate::proxy::copy_bidirectional::copy_bidirectional_client_compute(
    let _ = crate::pglb::copy_bidirectional::copy_bidirectional_client_compute(
        &mut client,
        &mut compute,
    )
@@ -4,60 +4,13 @@
use core::fmt;
use std::io;
use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr};
use std::pin::Pin;
use std::task::{Context, Poll};

use bytes::{Buf, Bytes, BytesMut};
use pin_project_lite::pin_project;
use bytes::Buf;
use smol_str::SmolStr;
use strum_macros::FromRepr;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf};
use tokio::io::{AsyncRead, AsyncReadExt};
use zerocopy::{FromBytes, Immutable, KnownLayout, Unaligned, network_endian};

pin_project! {
    /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough
    pub(crate) struct ChainRW<T> {
        #[pin]
        pub(crate) inner: T,
        buf: BytesMut,
    }
}

impl<T: AsyncWrite> AsyncWrite for ChainRW<T> {
    #[inline]
    fn poll_write(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &[u8],
    ) -> Poll<Result<usize, io::Error>> {
        self.project().inner.poll_write(cx, buf)
    }

    #[inline]
    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
        self.project().inner.poll_flush(cx)
    }

    #[inline]
    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), io::Error>> {
        self.project().inner.poll_shutdown(cx)
    }

    #[inline]
    fn poll_write_vectored(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        bufs: &[io::IoSlice<'_>],
    ) -> Poll<Result<usize, io::Error>> {
        self.project().inner.poll_write_vectored(cx, bufs)
    }

    #[inline]
    fn is_write_vectored(&self) -> bool {
        self.inner.is_write_vectored()
    }
}

/// Proxy Protocol Version 2 Header
const SIGNATURE: [u8; 12] = [
    0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A,

@@ -79,7 +32,6 @@ pub struct ConnectionInfo {

#[derive(PartialEq, Eq, Clone, Debug)]
pub enum ConnectHeader {
    Missing,
    Local,
    Proxy(ConnectionInfo),
}

@@ -106,47 +58,24 @@ pub enum ConnectionInfoExtra {

pub(crate) async fn read_proxy_protocol<T: AsyncRead + Unpin>(
    mut read: T,
) -> std::io::Result<(ChainRW<T>, ConnectHeader)> {
    let mut buf = BytesMut::with_capacity(128);
    let header = loop {
        let bytes_read = read.read_buf(&mut buf).await?;

        // exit for bad header signature
        let len = usize::min(buf.len(), SIGNATURE.len());
        if buf[..len] != SIGNATURE[..len] {
            return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
        }

        // if no more bytes available then exit
        if bytes_read == 0 {
            return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing));
        }

        // check if we have enough bytes to continue
        if let Some(header) = buf.try_get::<ProxyProtocolV2Header>() {
            break header;
        }
    };

    let remaining_length = usize::from(header.len.get());

    while buf.len() < remaining_length {
        if read.read_buf(&mut buf).await? == 0 {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "stream closed while waiting for proxy protocol addresses",
            ));
        }
    }
) -> std::io::Result<(T, ConnectHeader)> {
    let mut header = [0; size_of::<ProxyProtocolV2Header>()];
    read.read_exact(&mut header).await?;
    let header: ProxyProtocolV2Header = zerocopy::transmute!(header);
    if header.signature != SIGNATURE {
        return Err(std::io::Error::other("invalid proxy protocol header"));
    }
    let payload = buf.split_to(remaining_length);

    let res = process_proxy_payload(header, payload)?;
    Ok((ChainRW { inner: read, buf }, res))
    let mut payload = vec![0; usize::from(header.len.get())];
    read.read_exact(&mut payload).await?;

    let res = process_proxy_payload(header, &payload)?;
    Ok((read, res))
}
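The rewritten `read_proxy_protocol` above leans on the header being a fixed-size, padding-free struct, so a single `read_exact` into a byte array plus `zerocopy::transmute!` decodes it, and the `ChainRW` buffering layer can be deleted. A compilable sketch of that pattern under zerocopy 0.8; the fields other than `signature`, `version_and_command`, and `len` are assumed names, not the crate's exact layout:

```rust
use zerocopy::{FromBytes, Immutable, KnownLayout, Unaligned, network_endian};

// Sketch of a fixed 16-byte PROXY v2 preamble: 12-byte signature,
// version/command byte, protocol family byte, big-endian payload length.
#[derive(FromBytes, Immutable, KnownLayout, Unaligned)]
#[repr(C)]
struct Header {
    signature: [u8; 12],
    version_and_command: u8,
    family: u8, // assumed field name
    len: network_endian::U16,
}

fn main() {
    let raw: [u8; 16] = [
        0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, // signature
        0x21, // version 2, PROXY command
        0x11, // TCP over IPv4
        0x00, 0x0C, // 12 bytes of address payload follow
    ];
    // Same-size plain-old-data conversion; no parsing code, no allocation.
    let header: Header = zerocopy::transmute!(raw);
    assert_eq!(header.signature[0], 0x0D);
    assert_eq!(header.version_and_command, 0x21);
    assert_eq!(header.len.get(), 12);
}
```

A caller would then `read_exact` exactly `header.len.get()` more bytes for the addresses and TLVs, which is why the old speculative buffering loop is no longer needed.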
fn process_proxy_payload(
    header: ProxyProtocolV2Header,
    mut payload: BytesMut,
    mut payload: &[u8],
) -> std::io::Result<ConnectHeader> {
    match header.version_and_command {
        // the connection was established on purpose by the proxy

@@ -162,13 +91,12 @@ fn process_proxy_payload(
        PROXY_V2 => {}
        // other values are unassigned and must not be emitted by senders. Receivers
        // must drop connections presenting unexpected values here.
        #[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384
        _ => return Err(io::Error::other(
            format!(
        _ => {
            return Err(io::Error::other(format!(
                "invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)",
                header.version_and_command
            ),
        )),
            )));
        }
    }

    let size_err =

@@ -206,7 +134,7 @@ fn process_proxy_payload(
    }
    let subtype = tlv.value.get_u8();
    match Pp2AwsType::from_repr(subtype) {
        Some(Pp2AwsType::VpceId) => match std::str::from_utf8(&tlv.value) {
        Some(Pp2AwsType::VpceId) => match std::str::from_utf8(tlv.value) {
            Ok(s) => {
                extra = Some(ConnectionInfoExtra::Aws { vpce_id: s.into() });
            }

@@ -282,65 +210,28 @@ enum Pp2AzureType {
    PrivateEndpointLinkId = 0x01,
}

impl<T: AsyncRead> AsyncRead for ChainRW<T> {
    #[inline]
    fn poll_read(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<io::Result<()>> {
        if self.buf.is_empty() {
            self.project().inner.poll_read(cx, buf)
        } else {
            self.read_from_buf(buf)
        }
    }
}

impl<T: AsyncRead> ChainRW<T> {
    #[cold]
    fn read_from_buf(self: Pin<&mut Self>, buf: &mut ReadBuf<'_>) -> Poll<io::Result<()>> {
        debug_assert!(!self.buf.is_empty());
        let this = self.project();

        let write = usize::min(this.buf.len(), buf.remaining());
        let slice = this.buf.split_to(write).freeze();
        buf.put_slice(&slice);

        // reset the allocation so it can be freed
        if this.buf.is_empty() {
            *this.buf = BytesMut::new();
        }

        Poll::Ready(Ok(()))
    }
}

#[derive(Debug)]
struct Tlv {
struct Tlv<'a> {
    kind: u8,
    value: Bytes,
    value: &'a [u8],
}

fn read_tlv(b: &mut BytesMut) -> Option<Tlv> {
fn read_tlv<'a>(b: &mut &'a [u8]) -> Option<Tlv<'a>> {
    let tlv_header = b.try_get::<TlvHeader>()?;
    let len = usize::from(tlv_header.len.get());
    if b.len() < len {
        return None;
    }
    Some(Tlv {
        kind: tlv_header.kind,
        value: b.split_to(len).freeze(),
        value: b.split_off(..len)?,
    })
}

trait BufExt: Sized {
    fn try_get<T: FromBytes>(&mut self) -> Option<T>;
}
impl BufExt for BytesMut {
impl BufExt for &[u8] {
    fn try_get<T: FromBytes>(&mut self) -> Option<T> {
        let (res, _) = T::read_from_prefix(self).ok()?;
        self.advance(size_of::<T>());
        let (res, rest) = T::read_from_prefix(self).ok()?;
        *self = rest;
        Some(res)
    }
}
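With `BytesMut` gone, the cursor through the TLV parser is now a plain `&[u8]`, advanced by reassigning it to the suffix that `read_from_prefix` returns; `slice::split_off` (stable since Rust 1.87, which the diff itself relies on) does the same for variable-length values. A small self-contained sketch of that pattern, parsing a made-up TLV rather than one of the AWS/Azure kinds handled above:

```rust
use zerocopy::{FromBytes, network_endian};

// The slice-advancing trick used by `BufExt::try_get` above: copy a
// fixed-size value off the front of the slice, then shrink the slice to the
// remainder so the caller's cursor moves forward without allocating.
fn take<T: FromBytes>(b: &mut &[u8]) -> Option<T> {
    let (value, rest) = T::read_from_prefix(*b).ok()?;
    *b = rest;
    Some(value)
}

fn main() {
    // A hypothetical TLV: kind 0xEA, big-endian length 2, then two value bytes.
    let mut buf: &[u8] = &[0xEA, 0x00, 0x02, 0x01, 0x02];
    let kind: u8 = take(&mut buf).unwrap();
    let len: network_endian::U16 = take(&mut buf).unwrap();
    // split_off(..n) detaches the value bytes and leaves `buf` pointing past them.
    let value = buf.split_off(..usize::from(len.get())).unwrap();
    assert_eq!((kind, value), (0xEA, &[0x01, 0x02][..]));
    assert!(buf.is_empty());
}
```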
@@ -481,27 +372,19 @@ mod tests {
    }

    #[tokio::test]
    #[should_panic = "invalid proxy protocol header"]
    async fn test_invalid() {
        let data = [0x55; 256];

        let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap();

        let mut bytes = vec![];
        read.read_to_end(&mut bytes).await.unwrap();
        assert_eq!(bytes, data);
        assert_eq!(info, ConnectHeader::Missing);
        read_proxy_protocol(data.as_slice()).await.unwrap();
    }

    #[tokio::test]
    #[should_panic = "early eof"]
    async fn test_short() {
        let data = [0x55; 10];

        let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap();

        let mut bytes = vec![];
        read.read_to_end(&mut bytes).await.unwrap();
        assert_eq!(bytes, data);
        assert_eq!(info, ConnectHeader::Missing);
        read_proxy_protocol(data.as_slice()).await.unwrap();
    }

    #[tokio::test]
@@ -1,15 +1,10 @@
#[cfg(test)]
mod tests;

pub(crate) mod connect_compute;
mod copy_bidirectional;
pub(crate) mod handshake;
pub(crate) mod passthrough;
pub(crate) mod retry;
pub(crate) mod wake_compute;
use std::sync::Arc;

pub use copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute};
use futures::FutureExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;

@@ -21,16 +16,17 @@ use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, warn};

use self::connect_compute::{TcpMechanism, connect_to_compute};
use self::passthrough::ProxyPassthrough;
use crate::cancellation::{self, CancellationHandler};
use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig};
use crate::context::RequestContext;
use crate::error::{ReportableError, UserFacingError};
use crate::metrics::{Metrics, NumClientConnectionsGuard};
use crate::pglb::connect_compute::{TcpMechanism, connect_to_compute};
pub use crate::pglb::copy_bidirectional::{ErrorSource, copy_bidirectional_client_compute};
use crate::pglb::handshake::{HandshakeData, HandshakeError, handshake};
use crate::pglb::passthrough::ProxyPassthrough;
use crate::pqproto::{BeMessage, CancelKeyData, StartupMessageParams};
use crate::protocol2::{ConnectHeader, ConnectionInfo, ConnectionInfoExtra, read_proxy_protocol};
use crate::proxy::handshake::{HandshakeData, handshake};
use crate::rate_limiter::EndpointRateLimiter;
use crate::stream::{PqStream, Stream};
use crate::types::EndpointCacheKey;
@@ -102,30 +98,24 @@ pub async fn task_main(
    let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();

    connections.spawn(async move {
        let (socket, conn_info) = match read_proxy_protocol(socket).await {
            Err(e) => {
                warn!("per-client task finished with an error: {e:#}");
                return;
        let (socket, conn_info) = match config.proxy_protocol_v2 {
            ProxyProtocolV2::Required => {
                match read_proxy_protocol(socket).await {
                    Err(e) => {
                        warn!("per-client task finished with an error: {e:#}");
                        return;
                    }
                    // our load balancers will not send any more data. let's just exit immediately
                    Ok((_socket, ConnectHeader::Local)) => {
                        debug!("healthcheck received");
                        return;
                    }
                    Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
                }
            }
            // our load balancers will not send any more data. let's just exit immediately
            Ok((_socket, ConnectHeader::Local)) => {
                debug!("healthcheck received");
                return;
            }
            Ok((_socket, ConnectHeader::Missing))
                if config.proxy_protocol_v2 == ProxyProtocolV2::Required =>
            {
                warn!("missing required proxy protocol header");
                return;
            }
            Ok((_socket, ConnectHeader::Proxy(_)))
                if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected =>
            {
                warn!("proxy protocol header not supported");
                return;
            }
            Ok((socket, ConnectHeader::Proxy(info))) => (socket, info),
            Ok((socket, ConnectHeader::Missing)) => (
            // ignore the header - it cannot be confused for a postgres or http connection so will
            // error later.
            ProxyProtocolV2::Rejected => (
                socket,
                ConnectionInfo {
                    addr: peer_addr,

@@ -134,7 +124,7 @@ pub async fn task_main(
            ),
        };

        match socket.inner.set_nodelay(true) {
        match socket.set_nodelay(true) {
            Ok(()) => {}
            Err(e) => {
                error!(
@@ -248,7 +238,7 @@ pub(crate) enum ClientRequestError {
    #[error("{0}")]
    Cancellation(#[from] cancellation::CancelError),
    #[error("{0}")]
    Handshake(#[from] handshake::HandshakeError),
    Handshake(#[from] HandshakeError),
    #[error("{0}")]
    HandshakeTimeout(#[from] tokio::time::error::Elapsed),
    #[error("{0}")]
@@ -17,7 +17,6 @@ use rustls::pki_types;
use tokio::io::DuplexStream;
use tracing_test::traced_test;

use super::connect_compute::ConnectMechanism;
use super::retry::CouldRetry;
use super::*;
use crate::auth::backend::{

@@ -28,6 +27,7 @@ use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient};
use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status};
use crate::control_plane::{self, CachedNodeInfo, NodeInfo, NodeInfoCache};
use crate::error::ErrorKind;
use crate::pglb::connect_compute::ConnectMechanism;
use crate::tls::client_config::compute_client_config_with_certs;
use crate::tls::postgres_rustls::MakeRustlsConnect;
use crate::tls::server_config::CertResolver;

@@ -173,7 +173,6 @@ async fn dummy_proxy(
    tls: Option<TlsConfig>,
    auth: impl TestAuth + Send,
) -> anyhow::Result<()> {
    let (client, _) = read_proxy_protocol(client).await?;
    let mut stream = match handshake(&RequestContext::test(), client, tls.as_ref(), false).await? {
        HandshakeData::Startup(stream, _) => stream,
        HandshakeData::Cancel(_) => bail!("cancellation not supported"),
@@ -1,6 +1,5 @@
use tracing::{error, info};

use super::connect_compute::ComputeConnectBackend;
use crate::config::RetryConfig;
use crate::context::RequestContext;
use crate::control_plane::CachedNodeInfo;

@@ -9,6 +8,7 @@ use crate::error::ReportableError;
use crate::metrics::{
    ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,
};
use crate::pglb::connect_compute::ComputeConnectBackend;
use crate::proxy::retry::{retry_after, should_retry};

// Use macro to retain original callsite.
@@ -35,7 +35,7 @@ use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError};
use crate::control_plane::locks::ApiLocks;
use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::intern::EndpointIdInt;
use crate::proxy::connect_compute::ConnectMechanism;
use crate::pglb::connect_compute::ConnectMechanism;
use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute};
use crate::rate_limiter::EndpointRateLimiter;
use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX};

@@ -182,7 +182,7 @@ impl PoolingBackend {
        tracing::Span::current().record("conn_id", display(conn_id));
        info!(%conn_id, "pool: opening a new connection '{conn_info}'");
        let backend = self.auth_backend.as_ref().map(|()| keys);
        crate::proxy::connect_compute::connect_to_compute(
        crate::pglb::connect_compute::connect_to_compute(
            ctx,
            &TokioMechanism {
                conn_id,

@@ -226,7 +226,7 @@ impl PoolingBackend {
            },
            keys: crate::auth::backend::ComputeCredentialKeys::None,
        });
        crate::proxy::connect_compute::connect_to_compute(
        crate::pglb::connect_compute::connect_to_compute(
            ctx,
            &HyperMechanism {
                conn_id,
@@ -49,7 +49,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2};
use crate::context::RequestContext;
use crate::ext::TaskExt;
use crate::metrics::Metrics;
use crate::protocol2::{ChainRW, ConnectHeader, ConnectionInfo, read_proxy_protocol};
use crate::protocol2::{ConnectHeader, ConnectionInfo, read_proxy_protocol};
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;

@@ -207,12 +207,12 @@ pub(crate) type AsyncRW = Pin<Box<dyn AsyncReadWrite>>;

#[async_trait]
trait MaybeTlsAcceptor: Send + Sync + 'static {
    async fn accept(&self, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW>;
    async fn accept(&self, conn: TcpStream) -> std::io::Result<AsyncRW>;
}

#[async_trait]
impl MaybeTlsAcceptor for &'static ArcSwapOption<crate::config::TlsConfig> {
    async fn accept(&self, conn: ChainRW<TcpStream>) -> std::io::Result<AsyncRW> {
    async fn accept(&self, conn: TcpStream) -> std::io::Result<AsyncRW> {
        match &*self.load() {
            Some(config) => Ok(Box::pin(
                TlsAcceptor::from(config.http_config.clone())

@@ -235,33 +235,30 @@ async fn connection_startup(
    peer_addr: SocketAddr,
) -> Option<(AsyncRW, ConnectionInfo)> {
    // handle PROXY protocol
    let (conn, peer) = match read_proxy_protocol(conn).await {
        Ok(c) => c,
        Err(e) => {
            tracing::warn!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
            return None;
    let (conn, conn_info) = match config.proxy_protocol_v2 {
        ProxyProtocolV2::Required => {
            match read_proxy_protocol(conn).await {
                Err(e) => {
                    warn!("per-client task finished with an error: {e:#}");
                    return None;
                }
                // our load balancers will not send any more data. let's just exit immediately
                Ok((_conn, ConnectHeader::Local)) => {
                    tracing::debug!("healthcheck received");
                    return None;
                }
                Ok((conn, ConnectHeader::Proxy(info))) => (conn, info),
            }
        }
    };

    let conn_info = match peer {
        // our load balancers will not send any more data. let's just exit immediately
        ConnectHeader::Local => {
            tracing::debug!("healthcheck received");
            return None;
        }
        ConnectHeader::Missing if config.proxy_protocol_v2 == ProxyProtocolV2::Required => {
            tracing::warn!("missing required proxy protocol header");
            return None;
        }
        ConnectHeader::Proxy(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => {
            tracing::warn!("proxy protocol header not supported");
            return None;
        }
        ConnectHeader::Proxy(info) => info,
        ConnectHeader::Missing => ConnectionInfo {
            addr: peer_addr,
            extra: None,
        },
        // ignore the header - it cannot be confused for a postgres or http connection so will
        // error later.
        ProxyProtocolV2::Rejected => (
            conn,
            ConnectionInfo {
                addr: peer_addr,
                extra: None,
            },
        ),
    };

    let has_private_peer_addr = match conn_info.addr.ip() {
@@ -357,31 +357,6 @@ class PgProtocol:
        return TimelineId(cast("str", self.safe_psql("show neon.timeline_id")[0][0]))


class PageserverWalReceiverProtocol(StrEnum):
    VANILLA = "vanilla"
    INTERPRETED = "interpreted"

    @staticmethod
    def to_config_key_value(proto) -> tuple[str, dict[str, Any]]:
        if proto == PageserverWalReceiverProtocol.VANILLA:
            return (
                "wal_receiver_protocol",
                {
                    "type": "vanilla",
                },
            )
        elif proto == PageserverWalReceiverProtocol.INTERPRETED:
            return (
                "wal_receiver_protocol",
                {
                    "type": "interpreted",
                    "args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}},
                },
            )
        else:
            raise ValueError(f"Unknown protocol type: {proto}")


@dataclass
class PageserverTracingConfig:
    sampling_ratio: tuple[int, int]

@@ -423,6 +398,7 @@ class PageserverImportConfig:
            "import_job_concurrency": self.import_job_concurrency,
            "import_job_soft_size_limit": self.import_job_soft_size_limit,
            "import_job_checkpoint_threshold": self.import_job_checkpoint_threshold,
            "import_job_max_byte_range_size": 4 * 1024 * 1024,  # Pageserver default
        }
        return ("timeline_import_config", value)

@@ -474,7 +450,6 @@ class NeonEnvBuilder:
        safekeeper_extra_opts: list[str] | None = None,
        storage_controller_port_override: int | None = None,
        pageserver_virtual_file_io_mode: str | None = None,
        pageserver_wal_receiver_protocol: PageserverWalReceiverProtocol | None = None,
        pageserver_get_vectored_concurrent_io: str | None = None,
        pageserver_tracing_config: PageserverTracingConfig | None = None,
        pageserver_import_config: PageserverImportConfig | None = None,

@@ -551,11 +526,6 @@ class NeonEnvBuilder:

        self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode

        if pageserver_wal_receiver_protocol is not None:
            self.pageserver_wal_receiver_protocol = pageserver_wal_receiver_protocol
        else:
            self.pageserver_wal_receiver_protocol = PageserverWalReceiverProtocol.INTERPRETED

        assert test_name.startswith("test_"), (
            "Unexpectedly instantiated from outside a test function"
        )

@@ -1201,7 +1171,6 @@ class NeonEnv:

        self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine
        self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode
        self.pageserver_wal_receiver_protocol = config.pageserver_wal_receiver_protocol
        self.pageserver_get_vectored_concurrent_io = config.pageserver_get_vectored_concurrent_io
        self.pageserver_tracing_config = config.pageserver_tracing_config
        if config.pageserver_import_config is None:

@@ -1333,13 +1302,6 @@ class NeonEnv:
            for key, value in override.items():
                ps_cfg[key] = value

        if self.pageserver_wal_receiver_protocol is not None:
            key, value = PageserverWalReceiverProtocol.to_config_key_value(
                self.pageserver_wal_receiver_protocol
            )
            if key not in ps_cfg:
                ps_cfg[key] = value

        if self.pageserver_tracing_config is not None:
            key, value = self.pageserver_tracing_config.to_config_key_value()

@@ -4710,7 +4672,7 @@ class EndpointFactory:
        origin: Endpoint,
        endpoint_id: str | None = None,
        config_lines: list[str] | None = None,
    ):
    ) -> Endpoint:
        branch_name = origin.branch_name
        assert origin in self.endpoints
        assert branch_name is not None

@@ -4729,7 +4691,7 @@ class EndpointFactory:
        origin: Endpoint,
        endpoint_id: str | None = None,
        config_lines: list[str] | None = None,
    ):
    ) -> Endpoint:
        branch_name = origin.branch_name
        assert origin in self.endpoints
        assert branch_name is not None
@@ -15,19 +15,10 @@ from fixtures.neon_fixtures import (

@pytest.mark.timeout(1200)
@pytest.mark.parametrize("shard_count", [1, 8, 32])
@pytest.mark.parametrize(
    "wal_receiver_protocol",
    [
        "vanilla",
        "interpreted-bincode-compressed",
        "interpreted-protobuf-compressed",
    ],
)
def test_sharded_ingest(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
    shard_count: int,
    wal_receiver_protocol: str,
):
    """
    Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper

@@ -39,36 +30,6 @@ def test_sharded_ingest(
    neon_env_builder.num_pageservers = shard_count
    env = neon_env_builder.init_configs()

    for ps in env.pageservers:
        if wal_receiver_protocol == "vanilla":
            ps.patch_config_toml_nonrecursive(
                {
                    "wal_receiver_protocol": {
                        "type": "vanilla",
                    }
                }
            )
        elif wal_receiver_protocol == "interpreted-bincode-compressed":
            ps.patch_config_toml_nonrecursive(
                {
                    "wal_receiver_protocol": {
                        "type": "interpreted",
                        "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
                    }
                }
            )
        elif wal_receiver_protocol == "interpreted-protobuf-compressed":
            ps.patch_config_toml_nonrecursive(
                {
                    "wal_receiver_protocol": {
                        "type": "interpreted",
                        "args": {"format": "protobuf", "compression": {"zstd": {"level": 1}}},
                    }
                }
            )
        else:
            raise AssertionError("Test must use explicit wal receiver protocol config")

    env.start()

    # Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure
@@ -182,10 +182,6 @@ def test_fully_custom_config(positive_env: NeonEnv):
    "lsn_lease_length": "1m",
    "lsn_lease_length_for_ts": "5s",
    "timeline_offloading": False,
    "wal_receiver_protocol_override": {
        "type": "interpreted",
        "args": {"format": "bincode", "compression": {"zstd": {"level": 1}}},
    },
    "rel_size_v2_enabled": True,
    "relsize_snapshot_cache_capacity": 10000,
    "gc_compaction_enabled": True,
@@ -26,6 +26,10 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):
    ps = env.pageserver
    ps_http = ps.http_client()

    storcon_managed_timelines = (env.storage_controller_config or {}).get(
        "timelines_onto_safekeepers", False
    )

    # 1. Check that we always hit the cache after compute restart.
    for i in range(3):
        ep.start()

@@ -33,15 +37,26 @@ def test_basebackup_cache(neon_env_builder: NeonEnvBuilder):

        def check_metrics(i=i):
            metrics = ps_http.get_metrics()
            # Never miss.
            # The first time compute_ctl sends `get_basebackup` with lsn=None, we do not cache such requests.
            # All other requests should be a hit.
            assert (
                metrics.query_one(
                    "pageserver_basebackup_cache_read_total", {"result": "miss"}
                ).value
                == 0
            )
            if storcon_managed_timelines:
                # We do not cache the initial basebackup yet,
                # so the first compute startup should be a miss.
                assert (
                    metrics.query_one(
                        "pageserver_basebackup_cache_read_total", {"result": "miss"}
                    ).value
                    == 1
                )
            else:
                # If the timeline is not initialized on safekeepers,
                # compute_ctl sends `get_basebackup` with lsn=None for the first startup.
                # We do not use the cache for such requests, so it's neither a hit nor a miss.
                assert (
                    metrics.query_one(
                        "pageserver_basebackup_cache_read_total", {"result": "miss"}
                    ).value
                    == 0
                )

            # All but the first requests are hits.
            assert (
                metrics.query_one("pageserver_basebackup_cache_read_total", {"result": "hit"}).value
@@ -10,7 +10,6 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnvBuilder,
    PageserverWalReceiverProtocol,
    generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException

@@ -68,14 +67,9 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = {


@skip_in_debug_build("only run with release build")
@pytest.mark.parametrize(
    "wal_receiver_protocol",
    [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
@pytest.mark.timeout(900)
def test_pageserver_compaction_smoke(
    neon_env_builder: NeonEnvBuilder,
    wal_receiver_protocol: PageserverWalReceiverProtocol,
):
    """
    This is a smoke test that compaction kicks in. The workload repeatedly churns

@@ -85,8 +79,6 @@ def test_pageserver_compaction_smoke(
    observed bounds.
    """

    neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol

    # Effectively disable the page cache to rely only on image layers
    # to shorten reads.
    neon_env_builder.pageserver_config_override = """

@@ -466,8 +466,13 @@ def test_perf_counters(neon_simple_env: NeonEnv):
        #
        # 1.5 is the minimum version to contain these views.
        cur.execute("CREATE EXTENSION neon VERSION '1.5'")
        cur.execute("set neon.monitor_query_exec_time = on")
        cur.execute("SELECT * FROM neon_perf_counters")
        cur.execute("SELECT * FROM neon_backend_perf_counters")
        cur.execute(
            "select value from neon_backend_perf_counters where metric='query_time_seconds_count' and pid=pg_backend_pid()"
        )
        assert cur.fetchall()[0][0] == 2


def collect_metric(

@@ -1,9 +1,13 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import pytest
from fixtures.log_helper import log
from fixtures.neon_cli import WalCraft
from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder

# Restart nodes with WAL end having specially crafted shape, like last record
# crossing segment boundary, to test decoding issues.

@@ -19,17 +23,10 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PageserverWalReceiverProtocol
        "wal_record_crossing_segment_followed_by_small_one",
    ],
)
@pytest.mark.parametrize(
    "wal_receiver_protocol",
    [PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
def test_crafted_wal_end(
    neon_env_builder: NeonEnvBuilder,
    wal_type: str,
    wal_receiver_protocol: PageserverWalReceiverProtocol,
):
    neon_env_builder.pageserver_wal_receiver_protocol = wal_receiver_protocol

    env = neon_env_builder.init_start()
    env.create_branch("test_crafted_wal_end")
    env.pageserver.allowed_errors.extend(
@@ -159,7 +159,8 @@ def test_remote_extensions(

    # Setup a mock nginx S3 gateway which will return our test extension.
    (host, port) = httpserver_listen_address
    extensions_endpoint = f"http://{host}:{port}/pg-ext-s3-gateway"
    remote_ext_base_url = f"http://{host}:{port}/pg-ext-s3-gateway"
    log.info(f"remote extensions base URL: {remote_ext_base_url}")

    extension.build(pg_config, test_output_dir)
    tarball = extension.package(test_output_dir)

@@ -221,7 +222,7 @@ def test_remote_extensions(

    endpoint.create_remote_extension_spec(spec)

    endpoint.start(remote_ext_base_url=extensions_endpoint)
    endpoint.start(remote_ext_base_url=remote_ext_base_url)

    with endpoint.connect() as conn:
        with conn.cursor() as cur:

@@ -249,7 +250,7 @@ def test_remote_extensions(
    # Remove the extension files to force a redownload of the extension.
    extension.remove(test_output_dir, pg_version)

    endpoint.start(remote_ext_base_url=extensions_endpoint)
    endpoint.start(remote_ext_base_url=remote_ext_base_url)

    # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions.
    with endpoint.connect() as conn:
@@ -74,8 +74,9 @@ def test_hot_standby(neon_simple_env: NeonEnv):
    for query in queries:
        with s_con.cursor() as secondary_cursor:
            secondary_cursor.execute(query)
            response = secondary_cursor.fetchone()
            assert response is not None
            res = secondary_cursor.fetchone()
            assert res is not None
            response = res
            assert response == responses[query]

    # Check for corrupted WAL messages which might otherwise go unnoticed if

@@ -164,7 +165,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):

    s_cur.execute("SELECT COUNT(*) FROM test")
    res = s_cur.fetchone()
    assert res[0] == 10000
    assert res == (10000,)

    # Clear the cache in the standby, so that when we
    # re-execute the query, it will make GetPage

@@ -195,7 +196,7 @@ def test_hot_standby_gc(neon_env_builder: NeonEnvBuilder, pause_apply: bool):
    s_cur.execute("SELECT COUNT(*) FROM test")
    log_replica_lag(primary, secondary)
    res = s_cur.fetchone()
    assert res[0] == 10000
    assert res == (10000,)


def run_pgbench(connstr: str, pg_bin: PgBin):
test_runner/regress/test_replica_promotes.py (new file, +133)
@@ -0,0 +1,133 @@
"""
File with secondary->primary promotion testing.

So far, it only contains a test that we don't break and that the data is persisted.
"""

import psycopg2
from fixtures.log_helper import log
from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from pytest import raises


def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
    """
    Test that a replica safely promotes, and can commit data updates which
    show up when the primary boots up after the promoted secondary endpoint
    shut down.
    """

    # Initialize the primary, a test table, and a helper function to create lots
    # of subtransactions.
    env: NeonEnv = neon_simple_env
    primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")

    with primary.connect() as primary_conn:
        primary_cur = primary_conn.cursor()
        primary_cur.execute(
            "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
        )
        primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
        primary_cur.execute(
            """
            SELECT pg_current_wal_insert_lsn(),
                   pg_current_wal_lsn(),
                   pg_current_wal_flush_lsn()
            """
        )
        log.info(f"Primary: Current LSN after workload is {primary_cur.fetchone()}")
        primary_cur.execute("show neon.safekeepers")
        safekeepers = primary_cur.fetchall()[0][0]

    wait_replica_caughtup(primary, secondary)

    with secondary.connect() as secondary_conn:
        secondary_cur = secondary_conn.cursor()
        secondary_cur.execute("select count(*) from t")

        assert secondary_cur.fetchone() == (100,)

        with raises(psycopg2.Error):
            secondary_cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200)")
            secondary_conn.commit()

        secondary_conn.rollback()
        secondary_cur.execute("select count(*) from t")
        assert secondary_cur.fetchone() == (100,)

    primary.stop_and_destroy(mode="immediate")

    # Reconnect to the secondary to make sure we get a read-write connection
    promo_conn = secondary.connect()
    promo_cur = promo_conn.cursor()
    promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
    promo_cur.execute("select pg_reload_conf()")

    promo_cur.execute("SELECT * FROM pg_promote()")
    assert promo_cur.fetchone() == (True,)
    promo_cur.execute(
        """
        SELECT pg_current_wal_insert_lsn(),
               pg_current_wal_lsn(),
               pg_current_wal_flush_lsn()
        """
    )
    log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")

    # Reconnect to the secondary to make sure we get a read-write connection
    with secondary.connect() as new_primary_conn:
        new_primary_cur = new_primary_conn.cursor()
        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (100,)

        new_primary_cur.execute(
            "INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload"
        )
        assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]

        new_primary_cur = new_primary_conn.cursor()
        new_primary_cur.execute("select payload from t")
        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (200,)
        new_primary_cur.execute(
            """
            SELECT pg_current_wal_insert_lsn(),
                   pg_current_wal_lsn(),
                   pg_current_wal_flush_lsn()
            """
        )
        log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")

    with secondary.connect() as second_viewpoint_conn:
        new_primary_cur = second_viewpoint_conn.cursor()
        new_primary_cur.execute("select payload from t")
        assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]

    # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)

    secondary.stop_and_destroy()

    primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")

    with primary.connect() as new_primary:
        new_primary_cur = new_primary.cursor()
        new_primary_cur.execute(
            """
            SELECT pg_current_wal_insert_lsn(),
                   pg_current_wal_lsn(),
                   pg_current_wal_flush_lsn()
            """
        )
        log.info(f"New primary: Boot LSN is {new_primary_cur.fetchone()}")

        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (200,)
        new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
        new_primary_cur.execute("select count(*) from t")
        assert new_primary_cur.fetchone() == (300,)

    primary.stop(mode="immediate")
@@ -1388,10 +1388,12 @@ def test_sharding_split_failures(
    with pytest.raises(failure.expect_exception()):
        env.storage_controller.tenant_shard_split(tenant_id, shard_count=4)

    def assert_shard_count(shard_count: int, exclude_ps_id: int | None = None) -> None:
    # We expect that the overall operation will fail, but some split requests
    # will have succeeded: the net result should be to return to a clean state, including
    # detaching any child shards.
    def assert_rolled_back(exclude_ps_id=None) -> None:
        secondary_count = 0
        attached_count = 0
        log.info(f"Iterating over {len(env.pageservers)} pageservers to check shard count")
        for ps in env.pageservers:
            if exclude_ps_id is not None and ps.id == exclude_ps_id:
                continue

@@ -1402,23 +1404,35 @@ def test_sharding_split_failures(
                if tenant_shard_id.tenant_id != tenant_id:
                    continue  # skip bystanders
                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
                assert tenant_shard_id.shard_count == shard_count
                assert tenant_shard_id.shard_count == initial_shard_count
                if loc[1]["mode"] == "Secondary":
                    secondary_count += 1
                else:
                    attached_count += 1

        assert secondary_count == shard_count
        assert attached_count == shard_count

    # We expect that the overall operation will fail, but some split requests
    # will have succeeded: the net result should be to return to a clean state, including
    # detaching any child shards.
    def assert_rolled_back(exclude_ps_id: int | None = None) -> None:
        assert_shard_count(initial_shard_count, exclude_ps_id)
        assert secondary_count == initial_shard_count
        assert attached_count == initial_shard_count

    def assert_split_done(exclude_ps_id: int | None = None) -> None:
        assert_shard_count(split_shard_count, exclude_ps_id)
        secondary_count = 0
        attached_count = 0
        for ps in env.pageservers:
            if exclude_ps_id is not None and ps.id == exclude_ps_id:
                continue

            locations = ps.http_client().tenant_list_locations()["tenant_shards"]
            for loc in locations:
                tenant_shard_id = TenantShardId.parse(loc[0])
                if tenant_shard_id.tenant_id != tenant_id:
                    continue  # skip bystanders
                log.info(f"Shard {tenant_shard_id} seen on node {ps.id} in mode {loc[1]['mode']}")
                assert tenant_shard_id.shard_count == split_shard_count
                if loc[1]["mode"] == "Secondary":
                    secondary_count += 1
                else:
                    attached_count += 1
        assert attached_count == split_shard_count
        assert secondary_count == split_shard_count

    def finish_split():
        # Having failed+rolled back, we should be able to split again

@@ -1454,7 +1468,6 @@ def test_sharding_split_failures(

    # The split should appear to be rolled back from the point of view of all pageservers
    # apart from the one that is offline
    env.storage_controller.reconcile_until_idle()
    wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id))

    finish_split()

@@ -1469,7 +1482,6 @@ def test_sharding_split_failures(
    log.info("Clearing failure...")
    failure.clear(env)

    env.storage_controller.reconcile_until_idle()
    wait_until(assert_rolled_back)

    # Having rolled back, the tenant should be working
Some files were not shown because too many files have changed in this diff.