From 5529bd786da8b885c52d3b06010bc5102a0aa26d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 8 Nov 2024 11:52:02 -0600 Subject: [PATCH] Switch compute-related locales to C.UTF-8 by default (#6746) Right now, our environments create databases with the C locale, which is really unfortunate for users who have data stored in other languages that they want to analyze. For instance, show_trgm on Hebrew text currently doesn't work in staging or production. I don't envision this being the final solution. I think this is just a way to set a known value so the pageserver doesn't use its parent environment. The final solution to me is exposing initdb parameters to users in the console. Then they could use a different locale or encoding if they so chose. Signed-off-by: Tristan Partin --- compute_tools/src/config.rs | 6 ++ libs/pageserver_api/src/config.rs | 3 + libs/utils/scripts/restore_from_wal.sh | 39 ++++++++++++- pageserver/src/config.rs | 3 + pageserver/src/tenant.rs | 28 ++++++++-- test_runner/regress/test_compute_locales.py | 61 +++++++++++++++++++++ test_runner/regress/test_wal_restore.py | 2 + 7 files changed, 136 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_compute_locales.py diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 479100eb89..50e2a95e9d 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -73,6 +73,12 @@ pub fn write_postgres_conf( )?; } + // Locales + writeln!(file, "lc_messages='C.UTF-8'")?; + writeln!(file, "lc_monetary='C.UTF-8'")?; + writeln!(file, "lc_time='C.UTF-8'")?; + writeln!(file, "lc_numeric='C.UTF-8'")?; + match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 6de34fdd35..4272181954 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -64,6 +64,7 @@ pub struct ConfigToml { #[serde(with = "humantime_serde")] pub wal_redo_timeout: Duration, pub superuser: String, + pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, pub pg_distrib_dir: Option, @@ -276,6 +277,7 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; + pub const DEFAULT_LOCALE: &str = "C.UTF-8"; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -326,6 +328,7 @@ impl Default for ConfigToml { wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) .expect("cannot parse default wal redo timeout")), superuser: (DEFAULT_SUPERUSER.to_string()), + locale: DEFAULT_LOCALE.to_string(), page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir() diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 316ec8ed0d..93448369a0 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euxo pipefail @@ -6,9 +6,44 @@ PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 PORT=$4 +PG_VERSION=$5 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) + +# The way that initdb is invoked must match how the pageserver runs initdb. +function initdb_with_args { + local cmd=( + "$PG_BIN"/initdb + -E utf8 + -U cloud_admin + -D "$DATA_DIR" + --locale 'C.UTF-8' + --lc-collate 'C.UTF-8' + --lc-ctype 'C.UTF-8' + --lc-messages 'C.UTF-8' + --lc-monetary 'C.UTF-8' + --lc-numeric 'C.UTF-8' + --lc-time 'C.UTF-8' + --sysid="$SYSID" + ) + + case "$PG_VERSION" in + 14) + # Postgres 14 and below didn't support --locale-provider + ;; + 15 | 16) + cmd+=(--locale-provider 'libc') + ;; + *) + # Postgres 17 added the builtin provider + cmd+=(--locale-provider 'builtin') + ;; + esac + + eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}" +} + rm -fr "$DATA_DIR" -env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID" +initdb_with_args echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index d62066ac22..b694a43599 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -69,6 +69,7 @@ pub struct PageServerConf { pub wal_redo_timeout: Duration, pub superuser: String, + pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, @@ -301,6 +302,7 @@ impl PageServerConf { wait_lsn_timeout, wal_redo_timeout, superuser, + locale, page_cache_size, max_file_descriptors, pg_distrib_dir, @@ -348,6 +350,7 @@ impl PageServerConf { wait_lsn_timeout, wal_redo_timeout, superuser, + locale, page_cache_size, max_file_descriptors, http_auth_type, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d45c99a41b..34ea6dae1f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4779,10 +4779,18 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; - let initdb_command = tokio::process::Command::new(&initdb_bin_path) + let mut initdb_command = tokio::process::Command::new(&initdb_bin_path); + initdb_command .args(["--pgdata", initdb_target_dir.as_ref()]) .args(["--username", &conf.superuser]) .args(["--encoding", "utf8"]) + .args(["--locale", &conf.locale]) + .args(["--lc-collate", &conf.locale]) + .args(["--lc-ctype", &conf.locale]) + .args(["--lc-messages", &conf.locale]) + .args(["--lc-monetary", &conf.locale]) + .args(["--lc-numeric", &conf.locale]) + .args(["--lc-time", &conf.locale]) .arg("--no-instructions") .arg("--no-sync") .env_clear() @@ -4792,15 +4800,27 @@ async fn run_initdb( // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) // we would be interested in the stderr output, if there was any - .stderr(std::process::Stdio::piped()) - .spawn()?; + .stderr(std::process::Stdio::piped()); + + // Before version 14, only the libc provide was available. + if pg_version > 14 { + // Version 17 brought with it a builtin locale provider which only provides + // C and C.UTF-8. While being safer for collation purposes since it is + // guaranteed to be consistent throughout a major release, it is also more + // performant. + let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + + initdb_command.args(["--locale-provider", locale_provider]); + } + + let initdb_proc = initdb_command.spawn()?; // Ideally we'd select here with the cancellation token, but the problem is that // we can't safely terminate initdb: it launches processes of its own, and killing // initdb doesn't kill them. After we return from this function, we want the target // directory to be able to be cleaned up. // See https://github.com/neondatabase/neon/issues/6385 - let initdb_output = initdb_command.wait_with_output().await?; + let initdb_output = initdb_proc.wait_with_output().await?; if !initdb_output.status.success() { return Err(InitdbError::Failed( initdb_output.status, diff --git a/test_runner/regress/test_compute_locales.py b/test_runner/regress/test_compute_locales.py new file mode 100644 index 0000000000..00ef32fb5e --- /dev/null +++ b/test_runner/regress/test_compute_locales.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from collections.abc import Sequence + + from fixtures.neon_fixtures import NeonEnv + + +def test_default_locales(neon_simple_env: NeonEnv): + """ + Test that the default locales for compute databases is C.UTF-8. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + domain_locales = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT current_setting('lc_messages') AS lc_messages," + + "current_setting('lc_monetary') AS lc_monetary," + + "current_setting('lc_numeric') AS lc_numeric," + + "current_setting('lc_time') AS lc_time" + )[0], + ) + for dl in domain_locales: + assert dl == "C.UTF-8" + + # Postgres 15 added the locale providers + if env.pg_version < PgVersion.V15: + results = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT datcollate, datctype FROM pg_database WHERE datname = current_database()" + )[0], + ) + + datcollate = results[0] + datctype = results[1] + else: + results = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT datlocprovider, datcollate, datctype FROM pg_database WHERE datname = current_database()" + )[0], + ) + datlocprovider = results[0] + datcollate = results[1] + datctype = results[2] + + if env.pg_version >= PgVersion.V17: + assert datlocprovider == "b", "The locale provider is not builtin" + else: + assert datlocprovider == "c", "The locale provider is not libc" + + assert datcollate == "C.UTF-8" + assert datctype == "C.UTF-8" diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 05b6ad8a9b..c8e51fde13 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -64,6 +64,7 @@ def test_wal_restore( ), str(data_dir), str(port), + env.pg_version, ] ) restored.start() @@ -127,6 +128,7 @@ def test_wal_restore_initdb( ), str(data_dir), str(port), + env.pg_version, ] ) restored.start()