From 647b85fc15a31861608dfe767b625ce889471359 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Feb 2024 22:28:45 +0200 Subject: [PATCH] Update pgvector to v0.6.0, third attempt This includes a compatibility patch that is needed because pgvector now skips WAL-logging during the index build, and WAL-logs the index only in one go at the end. That's how GIN, GiST and SP-GIST index builds work in core PostgreSQL too, but we need some Neon-specific calls to mark the beginning and end of those build phases. pgvector is the first index AM that does that with parallel workers, so I had to modify those functions in the Neon extension to be aware of parallel workers. Only the leader needs to create the underlying file and perform the WAL-logging. (In principle, the parallel workers could participate in the WAL-logging too, but pgvector doesn't do that. This will need some further work if that changes). The previous attempt at this (#6592) missed that parallel workers needed those changes, and segfaulted in parallel build that spilled to disk. Testing ------- We don't have a place for regression tests of extensions at the moment. I tested this manually with the following script: ``` CREATE EXTENSION IF NOT EXISTS vector; DROP TABLE IF EXISTS tst; CREATE TABLE tst (i serial, v vector(3)); INSERT INTO tst (v) SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 15000) g; -- Serial build, in memory ALTER TABLE tst SET (parallel_workers=0); SET maintenance_work_mem='50 MB'; CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops); -- Test that the index works. (The table contents are random, and the -- search is approximate anyway, so we cannot check the exact values. -- For now, just eyeball that they look reasonable) set enable_seqscan=off; explain SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; DROP INDEX idx; -- Serial build, spills to on disk ALTER TABLE tst SET (parallel_workers=0); SET maintenance_work_mem='5 MB'; CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops); SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; DROP INDEX idx; -- Parallel build, in memory ALTER TABLE tst SET (parallel_workers=4); SET maintenance_work_mem='50 MB'; CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops); SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; DROP INDEX idx; -- Parallel build, spills to disk ALTER TABLE tst SET (parallel_workers=4); SET maintenance_work_mem='5 MB'; CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops); SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5; DROP INDEX idx; ``` --- .dockerignore | 1 + Dockerfile.compute-node | 7 +++- patches/pgvector.patch | 78 ++++++++++++++++++++++++++++++++++++++ pgxn/neon/pagestore_smgr.c | 19 +++++++++- 4 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 patches/pgvector.patch diff --git a/.dockerignore b/.dockerignore index 8b378b5dab..29abdc37aa 100644 --- a/.dockerignore +++ b/.dockerignore @@ -17,6 +17,7 @@ !libs/ !neon_local/ !pageserver/ +!patches/ !pgxn/ !proxy/ !s3_scrubber/ diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d91c7cfd72..b13225172d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -241,9 +241,12 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz - FROM build-deps AS vector-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \ - echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \ +COPY patches/pgvector.patch /pgvector.patch + +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.6.0.tar.gz -O pgvector.tar.gz && \ + echo "b0cf4ba1ab016335ac8fb1cada0d2106235889a194fffeece217c5bda90b2f19 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \ + patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control diff --git a/patches/pgvector.patch b/patches/pgvector.patch new file mode 100644 index 0000000000..84ac6644c5 --- /dev/null +++ b/patches/pgvector.patch @@ -0,0 +1,78 @@ +From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001 +From: Heikki Linnakangas +Date: Fri, 2 Feb 2024 22:26:45 +0200 +Subject: [PATCH 1/1] Make v0.6.0 work with Neon + +Now that the WAL-logging happens as a separate step at the end of the +build, we need a few neon-specific hints to make it work. +--- + src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/src/hnswbuild.c b/src/hnswbuild.c +index 680789b..ec54dea 100644 +--- a/src/hnswbuild.c ++++ b/src/hnswbuild.c +@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc) + + hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false); + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(indexRel)); ++#endif ++ + /* Perform inserts */ + HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel)); ++#endif ++ + /* Close relations within worker */ + index_close(indexRel, indexLockmode); + table_close(heapRel, heapLockmode); +@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, + SeedRandom(42); + #endif + ++#ifdef NEON_SMGR ++ smgr_start_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + InitBuildState(buildstate, heap, index, indexInfo, forkNum); + + BuildGraph(buildstate, forkNum); + ++#ifdef NEON_SMGR ++ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); ++#endif ++ + if (RelationNeedsWAL(index)) ++ { + log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true); + ++#ifdef NEON_SMGR ++ { ++#if PG_VERSION_NUM >= 160000 ++ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator; ++#else ++ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node; ++#endif ++ ++ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator, ++ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); ++ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM); ++ } ++#endif ++ } ++ ++#ifdef NEON_SMGR ++ smgr_end_unlogged_build(RelationGetSmgr(index)); ++#endif ++ + FreeBuildState(buildstate); + } + +-- +2.39.2 + diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 63e8b8dc1f..f54c86702f 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -45,6 +45,7 @@ */ #include "postgres.h" +#include "access/parallel.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xlogdefs.h" @@ -2712,10 +2713,14 @@ neon_start_unlogged_build(SMgrRelation reln) reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; /* + * Create the local file. In a parallel build, the leader is expected to + * call this first and do it. + * * FIXME: should we pass isRedo true to create the tablespace dir if it * doesn't exist? Is it needed? */ - mdcreate(reln, MAIN_FORKNUM, false); + if (!IsParallelWorker()) + mdcreate(reln, MAIN_FORKNUM, false); } /* @@ -2739,7 +2744,17 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); - unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; + /* + * In a parallel build, (only) the leader process performs the 2nd + * phase. + */ + if (IsParallelWorker()) + { + unlogged_build_rel = NULL; + unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; + } + else + unlogged_build_phase = UNLOGGED_BUILD_PHASE_2; } /*