Files
neon/patches/pgvector.patch
Heikki Linnakangas 647b85fc15 Update pgvector to v0.6.0, third attempt
This includes a compatibility patch that is needed because pgvector
now skips WAL-logging during the index build, and WAL-logs the index
only in one go at the end. That's how GIN, GiST and SP-GIST index
builds work in core PostgreSQL too, but we need some Neon-specific
calls to mark the beginning and end of those build phases.

pgvector is the first index AM that does that with parallel workers,
so I had to modify those functions in the Neon extension to be aware
of parallel workers. Only the leader needs to create the underlying
file and perform the WAL-logging. (In principle, the parallel workers
could participate in the WAL-logging too, but pgvector doesn't do
that. This will need some further work if that changes).

The previous attempt at this (#6592) missed that parallel workers
needed those changes, and segfaulted in parallel build that spilled to
disk.

Testing
-------

We don't have a place for regression tests of extensions at the
moment. I tested this manually with the following script:

```
CREATE EXTENSION IF NOT EXISTS vector;

DROP TABLE IF EXISTS tst;
CREATE TABLE tst (i serial, v vector(3));

INSERT INTO tst (v) SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 15000) g;

-- Serial build, in memory
ALTER TABLE tst SET (parallel_workers=0);
SET maintenance_work_mem='50 MB';
CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);

-- Test that the index works. (The table contents are random, and the
-- search is approximate anyway, so we cannot check the exact values.
-- For now, just eyeball that they look reasonable)
set enable_seqscan=off;
explain SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;
SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;

DROP INDEX idx;

-- Serial build, spills to on disk

ALTER TABLE tst SET (parallel_workers=0);
SET maintenance_work_mem='5 MB';
CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);
SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;
DROP INDEX idx;

-- Parallel build, in memory

ALTER TABLE tst SET (parallel_workers=4);
SET maintenance_work_mem='50 MB';
CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);
SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;
DROP INDEX idx;

-- Parallel build, spills to disk

ALTER TABLE tst SET (parallel_workers=4);
SET maintenance_work_mem='5 MB';
CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);
SELECT * FROM tst ORDER BY v <-> ARRAY[0, 0, 0]::vector LIMIT 5;
DROP INDEX idx;
```
2024-02-03 09:19:37 +02:00

79 lines
2.1 KiB
Diff

From 0b0194a57bd0f3598bd57dbedd0df3932330169d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 2 Feb 2024 22:26:45 +0200
Subject: [PATCH 1/1] Make v0.6.0 work with Neon
Now that the WAL-logging happens as a separate step at the end of the
build, we need a few neon-specific hints to make it work.
---
src/hnswbuild.c | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index 680789b..ec54dea 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -840,9 +840,17 @@ HnswParallelBuildMain(dsm_segment *seg, shm_toc *toc)
hnswarea = shm_toc_lookup(toc, PARALLEL_KEY_HNSW_AREA, false);
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(RelationGetSmgr(indexRel));
+#endif
+
/* Perform inserts */
HnswParallelScanAndInsert(heapRel, indexRel, hnswshared, hnswarea, false);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(indexRel));
+#endif
+
/* Close relations within worker */
index_close(indexRel, indexLockmode);
table_close(heapRel, heapLockmode);
@@ -1089,13 +1097,41 @@ BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo,
SeedRandom(42);
#endif
+#ifdef NEON_SMGR
+ smgr_start_unlogged_build(RelationGetSmgr(index));
+#endif
+
InitBuildState(buildstate, heap, index, indexInfo, forkNum);
BuildGraph(buildstate, forkNum);
+#ifdef NEON_SMGR
+ smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index));
+#endif
+
if (RelationNeedsWAL(index))
+ {
log_newpage_range(index, forkNum, 0, RelationGetNumberOfBlocks(index), true);
+#ifdef NEON_SMGR
+ {
+#if PG_VERSION_NUM >= 160000
+ RelFileLocator rlocator = RelationGetSmgr(index)->smgr_rlocator.locator;
+#else
+ RelFileNode rlocator = RelationGetSmgr(index)->smgr_rnode.node;
+#endif
+
+ SetLastWrittenLSNForBlockRange(XactLastRecEnd, rlocator,
+ MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+ SetLastWrittenLSNForRelation(XactLastRecEnd, rlocator, MAIN_FORKNUM);
+ }
+#endif
+ }
+
+#ifdef NEON_SMGR
+ smgr_end_unlogged_build(RelationGetSmgr(index));
+#endif
+
FreeBuildState(buildstate);
}
--
2.39.2