mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-16 12:40:36 +00:00
Switch compute-related locales to C.UTF-8 by default (#6746)
Right now, our environments create databases with the C locale, which is really unfortunate for users who have data stored in other languages that they want to analyze. For instance, show_trgm on Hebrew text currently doesn't work in staging or production. I don't envision this being the final solution. I think this is just a way to set a known value so the pageserver doesn't use its parent environment. The final solution to me is exposing initdb parameters to users in the console. Then they could use a different locale or encoding if they so chose. Signed-off-by: Tristan Partin <tristan@neon.tech>
This commit is contained in:
@@ -73,6 +73,12 @@ pub fn write_postgres_conf(
|
||||
)?;
|
||||
}
|
||||
|
||||
// Locales
|
||||
writeln!(file, "lc_messages='C.UTF-8'")?;
|
||||
writeln!(file, "lc_monetary='C.UTF-8'")?;
|
||||
writeln!(file, "lc_time='C.UTF-8'")?;
|
||||
writeln!(file, "lc_numeric='C.UTF-8'")?;
|
||||
|
||||
match spec.mode {
|
||||
ComputeMode::Primary => {}
|
||||
ComputeMode::Static(lsn) => {
|
||||
|
||||
@@ -64,6 +64,7 @@ pub struct ConfigToml {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub wal_redo_timeout: Duration,
|
||||
pub superuser: String,
|
||||
pub locale: String,
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
pub pg_distrib_dir: Option<Utf8PathBuf>,
|
||||
@@ -276,6 +277,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
|
||||
pub const DEFAULT_LOCALE: &str = "C.UTF-8";
|
||||
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
@@ -326,6 +328,7 @@ impl Default for ConfigToml {
|
||||
wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||
.expect("cannot parse default wal redo timeout")),
|
||||
superuser: (DEFAULT_SUPERUSER.to_string()),
|
||||
locale: DEFAULT_LOCALE.to_string(),
|
||||
page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
|
||||
max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
|
||||
pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/bin/bash
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -euxo pipefail
|
||||
|
||||
@@ -6,9 +6,44 @@ PG_BIN=$1
|
||||
WAL_PATH=$2
|
||||
DATA_DIR=$3
|
||||
PORT=$4
|
||||
PG_VERSION=$5
|
||||
SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
|
||||
|
||||
# The way that initdb is invoked must match how the pageserver runs initdb.
|
||||
function initdb_with_args {
|
||||
local cmd=(
|
||||
"$PG_BIN"/initdb
|
||||
-E utf8
|
||||
-U cloud_admin
|
||||
-D "$DATA_DIR"
|
||||
--locale 'C.UTF-8'
|
||||
--lc-collate 'C.UTF-8'
|
||||
--lc-ctype 'C.UTF-8'
|
||||
--lc-messages 'C.UTF-8'
|
||||
--lc-monetary 'C.UTF-8'
|
||||
--lc-numeric 'C.UTF-8'
|
||||
--lc-time 'C.UTF-8'
|
||||
--sysid="$SYSID"
|
||||
)
|
||||
|
||||
case "$PG_VERSION" in
|
||||
14)
|
||||
# Postgres 14 and below didn't support --locale-provider
|
||||
;;
|
||||
15 | 16)
|
||||
cmd+=(--locale-provider 'libc')
|
||||
;;
|
||||
*)
|
||||
# Postgres 17 added the builtin provider
|
||||
cmd+=(--locale-provider 'builtin')
|
||||
;;
|
||||
esac
|
||||
|
||||
eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
|
||||
}
|
||||
|
||||
rm -fr "$DATA_DIR"
|
||||
env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
|
||||
initdb_with_args
|
||||
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
|
||||
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
|
||||
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
|
||||
|
||||
@@ -69,6 +69,7 @@ pub struct PageServerConf {
|
||||
pub wal_redo_timeout: Duration,
|
||||
|
||||
pub superuser: String,
|
||||
pub locale: String,
|
||||
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
@@ -301,6 +302,7 @@ impl PageServerConf {
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
superuser,
|
||||
locale,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
pg_distrib_dir,
|
||||
@@ -348,6 +350,7 @@ impl PageServerConf {
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
superuser,
|
||||
locale,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
http_auth_type,
|
||||
|
||||
@@ -4779,10 +4779,18 @@ async fn run_initdb(
|
||||
|
||||
let _permit = INIT_DB_SEMAPHORE.acquire().await;
|
||||
|
||||
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
|
||||
let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
|
||||
initdb_command
|
||||
.args(["--pgdata", initdb_target_dir.as_ref()])
|
||||
.args(["--username", &conf.superuser])
|
||||
.args(["--encoding", "utf8"])
|
||||
.args(["--locale", &conf.locale])
|
||||
.args(["--lc-collate", &conf.locale])
|
||||
.args(["--lc-ctype", &conf.locale])
|
||||
.args(["--lc-messages", &conf.locale])
|
||||
.args(["--lc-monetary", &conf.locale])
|
||||
.args(["--lc-numeric", &conf.locale])
|
||||
.args(["--lc-time", &conf.locale])
|
||||
.arg("--no-instructions")
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
@@ -4792,15 +4800,27 @@ async fn run_initdb(
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
// we would be interested in the stderr output, if there was any
|
||||
.stderr(std::process::Stdio::piped())
|
||||
.spawn()?;
|
||||
.stderr(std::process::Stdio::piped());
|
||||
|
||||
// Before version 14, only the libc provide was available.
|
||||
if pg_version > 14 {
|
||||
// Version 17 brought with it a builtin locale provider which only provides
|
||||
// C and C.UTF-8. While being safer for collation purposes since it is
|
||||
// guaranteed to be consistent throughout a major release, it is also more
|
||||
// performant.
|
||||
let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };
|
||||
|
||||
initdb_command.args(["--locale-provider", locale_provider]);
|
||||
}
|
||||
|
||||
let initdb_proc = initdb_command.spawn()?;
|
||||
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_command.wait_with_output().await?;
|
||||
let initdb_output = initdb_proc.wait_with_output().await?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
initdb_output.status,
|
||||
|
||||
61
test_runner/regress/test_compute_locales.py
Normal file
61
test_runner/regress/test_compute_locales.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, cast
|
||||
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
|
||||
def test_default_locales(neon_simple_env: NeonEnv):
|
||||
"""
|
||||
Test that the default locales for compute databases is C.UTF-8.
|
||||
"""
|
||||
env = neon_simple_env
|
||||
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
domain_locales = cast(
|
||||
"Sequence[str]",
|
||||
endpoint.safe_psql(
|
||||
"SELECT current_setting('lc_messages') AS lc_messages,"
|
||||
+ "current_setting('lc_monetary') AS lc_monetary,"
|
||||
+ "current_setting('lc_numeric') AS lc_numeric,"
|
||||
+ "current_setting('lc_time') AS lc_time"
|
||||
)[0],
|
||||
)
|
||||
for dl in domain_locales:
|
||||
assert dl == "C.UTF-8"
|
||||
|
||||
# Postgres 15 added the locale providers
|
||||
if env.pg_version < PgVersion.V15:
|
||||
results = cast(
|
||||
"Sequence[str]",
|
||||
endpoint.safe_psql(
|
||||
"SELECT datcollate, datctype FROM pg_database WHERE datname = current_database()"
|
||||
)[0],
|
||||
)
|
||||
|
||||
datcollate = results[0]
|
||||
datctype = results[1]
|
||||
else:
|
||||
results = cast(
|
||||
"Sequence[str]",
|
||||
endpoint.safe_psql(
|
||||
"SELECT datlocprovider, datcollate, datctype FROM pg_database WHERE datname = current_database()"
|
||||
)[0],
|
||||
)
|
||||
datlocprovider = results[0]
|
||||
datcollate = results[1]
|
||||
datctype = results[2]
|
||||
|
||||
if env.pg_version >= PgVersion.V17:
|
||||
assert datlocprovider == "b", "The locale provider is not builtin"
|
||||
else:
|
||||
assert datlocprovider == "c", "The locale provider is not libc"
|
||||
|
||||
assert datcollate == "C.UTF-8"
|
||||
assert datctype == "C.UTF-8"
|
||||
@@ -64,6 +64,7 @@ def test_wal_restore(
|
||||
),
|
||||
str(data_dir),
|
||||
str(port),
|
||||
env.pg_version,
|
||||
]
|
||||
)
|
||||
restored.start()
|
||||
@@ -127,6 +128,7 @@ def test_wal_restore_initdb(
|
||||
),
|
||||
str(data_dir),
|
||||
str(port),
|
||||
env.pg_version,
|
||||
]
|
||||
)
|
||||
restored.start()
|
||||
|
||||
Reference in New Issue
Block a user