From a7b84cca5aef3dacc90895f39f0e0ad2b6c950cc Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 20 May 2024 12:07:25 +0200 Subject: [PATCH] Upgrade of pgvector to 0.7.0 (#7726) Upgrade pgvector to 0.7.0. This PR is based on Heikki's PR #6753 and just uses pgvector 0.7.0 instead of 0.6.0 I have now done all planned manual tests. The pull request is ready to be reviewed and merged and can be deployed in production together / after swap enablement. See (https://github.com/neondatabase/autoscaling/issues/800) Fixes https://github.com/neondatabase/neon/issues/6516 Fixes https://github.com/neondatabase/neon/issues/7780 ## Documentation input for usage recommendations ### maintenance_work_mem In Neon `maintenance_work_mem` is very small by default (depends on configured RAM for your compute but can be as low as 64 MB). To optimize pgvector index build time you may have to bump it up according to your working set size (size of tuples for vector index creation). You can do so in the current session using `SET maintenance_work_mem='10 GB';` The target value you choose should fit into the memory of your compute size and not exceed 50-60% of available RAM. The value above has been successfully used on a 7CU endpoint. ### max_parallel_maintenance_workers max_parallel_maintenance_workers is also small by default (2). For efficient parallel pgvector index creation you have to bump it up with `SET max_parallel_maintenance_workers = 7` to make use of all the CPUs available, assuming you have configured your endpoint to use 7CU. ## ID input for changelog pgvector extension in Neon has been upgraded from version 0.5.1 to version 0.7.0. Please see https://github.com/pgvector/pgvector/ for documentation of new capabilities in pgvector version 0.7.0 If you have existing databases with pgvector 0.5.1 already installed there is a slight difference in behavior in the following corner cases even if you don't run `ALTER EXTENSION UPDATE`: ### L2 distance from NULL::vector For the following script, comparing the NULL::vector to non-null vectors the resulting output changes: ```sql SET enable_seqscan = off; CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t USING hnsw (val vector_l2_ops); INSERT INTO t (val) VALUES ('[1,2,4]'); SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); ``` and now the output is ``` val --------- [1,1,1] [1,2,4] [1,2,3] [0,0,0] (4 rows) ``` For the following script ```sql SET enable_seqscan = off; CREATE TABLE t (val vector(3)); INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); CREATE INDEX ON t USING ivfflat (val vector_l2_ops) WITH (lists = 1); INSERT INTO t (val) VALUES ('[1,2,4]'); SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector); ``` the output now is ``` val --------- [0,0,0] [1,2,3] [1,1,1] [1,2,4] (4 rows) ``` ### changed error messages If you provide invalid literals for datatype vector you may get improved/changed error messages, for example: ```sql neondb=> SELECT '[4e38,1]'::vector; ERROR: "4e38" is out of range for type vector LINE 1: SELECT '[4e38,1]'::vector; ^ ``` --------- Co-authored-by: Heikki Linnakangas --- .dockerignore | 1 + Dockerfile.compute-node | 7 +++- patches/pgvector.patch | 78 ++++++++++++++++++++++++++++++++++++++ pgxn/neon/pagestore_smgr.c | 19 +++++++++- 4 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 patches/pgvector.patch diff --git a/.dockerignore b/.dockerignore index f7a6232ba1..1258532db8 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,6 +17,7 @@ !libs/ !neon_local/ !pageserver/ +!patches/ !pgxn/ !proxy/ !s3_scrubber/ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index bd4534ce1d..5bf3246f34 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ +COPY patches/pgvector.patch /pgvector.patch + +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -O pgvector.tar.gz && \ + echo "1b5503a35c265408b6eb282621c5e1e75f7801afc04eecb950796cfee2e3d1d8 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..84ac6644c5 --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,78 @@ +From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001 +From: Heikki Linnakangas +Date: Fri, 2 Feb 2024 22:26:45 +0200 +Subject: [PATCH 1/1] Make v0.6.0 work with Neon + +Now that the WAL-logging happens as a separate step at the end of the +build, we need a few neon-specific hints to make it work. +--- + src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index 680789b..ec54dea 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) + + hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); ++#endif ++ + /* Perform inserts */ + HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); ++#endif ++ + /* Close relations within worker */ + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); ++#endif ++ + if (RelationNeedsWAL(index)) ++ { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + FreeBuildState(buildstate); + } + +-- +2.39.2 + diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index e3b841f526..249ad313b0 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,6 +45,7 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -2822,10 +2823,14 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* + * Create the local file. In a parallel build, the leader is expected to + * call this first and do it. + * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ - mdcreate(reln, MAIN_FORKNUM, false); + if (!IsParallelWorker()) + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2849,7 +2854,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + /* + * In a parallel build, (only) the leader process performs the 2nd + * phase. + */ + if (IsParallelWorker()) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + } + else + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /*