From fabeff822fac24b7ba45214e907295003874252a Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 28 May 2024 13:05:33 +0200 Subject: [PATCH] Performance test for pgvector HNSW index build and queries (#7873) ## Problem We want to regularly verify the performance of pgvector HNSW parallel index builds and parallel similarity search using HNSW indexes. The first release that considerably improved the index-build parallelism was pgvector 0.7.0 and we want to make sure that we do not regress by our neon compute VM settings (swap, memory over commit, pg conf etc.) ## Summary of changes Prepare a Neon project with 1 million openAI vector embeddings (vector size 1536). Run HNSW indexing operations in the regression test for the various distance metrics. Run similarity queries using pgbench with 100 concurrent clients. I have also added the relevant metrics to the grafana dashboards pgbench and olape --------- Co-authored-by: Alexander Bayandin --- .github/workflows/benchmarking.yml | 100 +++++++++++++++++- pyproject.toml | 1 + .../performance/pgvector/HNSW_build.sql | 47 ++++++++ .../performance/pgvector/IVFFLAT_build.sql | 52 +++++++++ test_runner/performance/pgvector/README.md | 38 +++++++ test_runner/performance/pgvector/loaddata.py | 72 +++++++++++++ ...ch_custom_script_pgvector_hsnw_queries.sql | 10 ++ .../pgvector/pgbench_hnsw_queries.sql | 13 +++ test_runner/performance/test_perf_olap.py | 34 ++++++ test_runner/performance/test_perf_pgbench.py | 31 ++++++ 10 files changed, 395 insertions(+), 3 deletions(-) create mode 100644 test_runner/performance/pgvector/HNSW_build.sql create mode 100644 test_runner/performance/pgvector/IVFFLAT_build.sql create mode 100644 test_runner/performance/pgvector/README.md create mode 100644 test_runner/performance/pgvector/loaddata.py create mode 100644 test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql create mode 100644 test_runner/performance/pgvector/pgbench_hnsw_queries.sql diff --git 
a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 1eaf05cd54..d5a375d704 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -38,6 +38,11 @@ on: description: 'AWS-RDS and AWS-AURORA normally only run on Saturday. Set this to true to run them on every workflow_dispatch' required: false default: false + run_only_pgvector_tests: + type: boolean + description: 'Run pgvector tests but no other tests. If not set, all tests including pgvector tests will be run' + required: false + default: false defaults: run: @@ -50,6 +55,7 @@ concurrency: jobs: bench: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} env: TEST_PG_BENCH_DURATIONS_MATRIX: "300" TEST_PG_BENCH_SCALES_MATRIX: "10,100" @@ -120,6 +126,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} generate-matrices: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) # # Available platforms: @@ -197,6 +204,7 @@ jobs: echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT pgbench-compare: + if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} needs: [ generate-matrices ] strategy: @@ -343,6 +351,92 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + pgbench-pgvector: + env: + TEST_PG_BENCH_DURATIONS_MATRIX: "15m" + TEST_PG_BENCH_SCALES_MATRIX: "1" + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 16 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-captest-pgvector" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 
369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Add Postgres binaries to PATH + run: | + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version + echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH + + - name: Set up Connection String + id: set-up-connstr + run: | + CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} + + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT + + QUERIES=("SELECT version()") + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done + + - name: Benchmark pgvector hnsw indexing + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_perf_olap.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + + - name: Benchmark pgvector hnsw queries + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_pgvector + env: + BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - 
name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + clickbench-compare: # ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters # we use for performance testing in pgbench-compare. @@ -351,7 +445,7 @@ jobs: # # *_CLICKBENCH_CONNSTR: Genuine ClickBench DB with ~100M rows # *_CLICKBENCH_10M_CONNSTR: DB with the first 10M rows of ClickBench DB - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, pgbench-compare ] strategy: @@ -455,7 +549,7 @@ jobs: # We might change it after https://github.com/neondatabase/neon/issues/2900. 
# # *_TPCH_S10_CONNSTR: DB generated with scale factor 10 (~10 GB) - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, clickbench-compare ] strategy: @@ -557,7 +651,7 @@ jobs: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} user-examples-compare: - if: ${{ !cancelled() }} + if: ${{ !cancelled() && (github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null) }} needs: [ generate-matrices, tpch-compare ] strategy: diff --git a/pyproject.toml b/pyproject.toml index 131d1121f7..c7f1a07512 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ build-backend = "poetry.core.masonry.api" exclude = [ "^vendor/", "^target/", + "test_runner/performance/pgvector/loaddata.py", ] check_untyped_defs = true # Help mypy find imports when running against list of individual files. diff --git a/test_runner/performance/pgvector/HNSW_build.sql b/test_runner/performance/pgvector/HNSW_build.sql new file mode 100644 index 0000000000..9e6918b755 --- /dev/null +++ b/test_runner/performance/pgvector/HNSW_build.sql @@ -0,0 +1,47 @@ + +\set ECHO queries +\timing + +-- prepare test table +DROP TABLE IF EXISTS hnsw_test_table; +CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA; +INSERT INTO hnsw_test_table SELECT * FROM documents; +CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries +-- tune index build params +SET max_parallel_maintenance_workers = 7; +SET maintenance_work_mem = '8GB'; +-- create HNSW index for the supported distance metrics +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops); +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops); +CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops); +CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) 
bit_hamming_ops); +CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops); +-- note: in a second psql session we can monitor the progress of the index build phases using +-- the following query: +-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index; + +-- show all indexes built on the table +SELECT + idx.relname AS index_name, + tbl.relname AS table_name, + am.amname AS access_method, + a.attname AS column_name, + opc.opcname AS operator_class +FROM + pg_index i +JOIN + pg_class idx ON idx.oid = i.indexrelid +JOIN + pg_class tbl ON tbl.oid = i.indrelid +JOIN + pg_am am ON am.oid = idx.relam +JOIN + pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey) +JOIN + pg_opclass opc ON opc.oid = i.indclass[0] +WHERE + tbl.relname = 'hnsw_test_table' + AND a.attname = 'embeddings'; + +-- show table sizes +\dt+ diff --git a/test_runner/performance/pgvector/IVFFLAT_build.sql b/test_runner/performance/pgvector/IVFFLAT_build.sql new file mode 100644 index 0000000000..338980831a --- /dev/null +++ b/test_runner/performance/pgvector/IVFFLAT_build.sql @@ -0,0 +1,52 @@ + +\set ECHO queries +\timing + +-- prepare test table +DROP TABLE IF EXISTS ivfflat_test_table; +CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA; +INSERT INTO ivfflat_test_table SELECT * FROM documents; +CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries +-- tune index build params +SET max_parallel_maintenance_workers = 7; +SET maintenance_work_mem = '8GB'; +-- create ivfflat index for the supported distance metrics +-- the formula for lists is # rows / 1000 or sqrt(# rows) if # rows > 1 million +-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents +-- so we use 1000 lists +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING
ivfflat (embeddings vector_ip_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings::halfvec(1536) halfvec_l2_ops) WITH (lists = 1000); +CREATE INDEX ON ivfflat_test_table + USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000); + +\d ivfflat_test_table + + +-- show all indexes built on the table +SELECT + idx.relname AS index_name, + tbl.relname AS table_name, + am.amname AS access_method, + a.attname AS column_name, + opc.opcname AS operator_class +FROM + pg_index i +JOIN + pg_class idx ON idx.oid = i.indexrelid +JOIN + pg_class tbl ON tbl.oid = i.indrelid +JOIN + pg_am am ON am.oid = idx.relam +JOIN + pg_attribute a ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey) +JOIN + pg_opclass opc ON opc.oid = i.indclass[0] +WHERE + tbl.relname = 'ivfflat_test_table' + AND a.attname = 'embeddings'; +-- show table sizes +\dt+ + + diff --git a/test_runner/performance/pgvector/README.md b/test_runner/performance/pgvector/README.md new file mode 100644 index 0000000000..c55db12e74 --- /dev/null +++ b/test_runner/performance/pgvector/README.md @@ -0,0 +1,38 @@ +--- +dataset_info: + features: + - name: _id + dtype: string + - name: title + dtype: string + - name: text + dtype: string + - name: text-embedding-3-large-1536-embedding + sequence: float64 + splits: + - name: train + num_bytes: 12679725776 + num_examples: 1000000 + download_size: 9551862565 + dataset_size: 12679725776 +configs: +- config_name: default + data_files: + - split: train + path: data/train-* +license: mit +task_categories: +- feature-extraction +language: +- en +size_categories: +- 1M ") + + +def main(conn_str, directory_path): + # Connection to PostgreSQL + with psycopg2.connect(conn_str) as conn: + with conn.cursor() as cursor: + # Run SQL statements + cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;") + 
register_vector(conn) + cursor.execute("DROP TABLE IF EXISTS documents;") + cursor.execute( + """ + CREATE TABLE documents ( + _id TEXT PRIMARY KEY, + title TEXT, + text TEXT, + embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI) + ); + """ + ) + conn.commit() + + # List and sort Parquet files + parquet_files = sorted(Path(directory_path).glob("*.parquet")) + + for file in parquet_files: + print(f"Loading {file} into PostgreSQL") + df = pd.read_parquet(file) + + print(df.head()) + + data_list = [ + ( + row["_id"], + row["title"], + row["text"], + np.array(row["text-embedding-3-large-1536-embedding"]), + ) + for index, row in df.iterrows() + ] + # Use execute_values to perform batch insertion + execute_values( + cursor, + "INSERT INTO documents (_id, title, text, embeddings) VALUES %s", + data_list, + ) + # Commit after we insert all embeddings + conn.commit() + + print(f"Loaded {file} into PostgreSQL") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print_usage() + sys.exit(1) + + conn_str = sys.argv[1] + directory_path = sys.argv[2] + main(conn_str, directory_path) diff --git a/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql new file mode 100644 index 0000000000..886ae9645b --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql @@ -0,0 +1,10 @@ +with x (x) as ( + select "embeddings" as x + from hnsw_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM hnsw_test_table +ORDER BY 2 +LIMIT 30; diff --git a/test_runner/performance/pgvector/pgbench_hnsw_queries.sql b/test_runner/performance/pgvector/pgbench_hnsw_queries.sql new file mode 100644 index 0000000000..5034063c1b --- /dev/null +++ b/test_runner/performance/pgvector/pgbench_hnsw_queries.sql @@ -0,0 +1,13 @@ +-- run with pooled connection +-- pgbench -T 300 
-c 100 -j20 -f pgbench_hnsw_queries.sql "postgresql://neondb_owner:@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require" + +with x (x) as ( + select "embeddings" as x + from hnsw_test_table + TABLESAMPLE SYSTEM (1) + LIMIT 1 +) +SELECT title, "embeddings" <=> (select x from x) as distance +FROM hnsw_test_table +ORDER BY 2 +LIMIT 30; diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 8a9509ea44..2367676e67 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -100,6 +100,25 @@ QUERIES: Tuple[LabelledQuery, ...] = ( ) # fmt: on +# A list of pgvector HNSW index builds to run. +# Please do not alter the label for the query, as it is used to identify it. +# +# Disable auto formatting for the list of queries so that it's easier to read +# fmt: off +PGVECTOR_QUERIES: Tuple[LabelledQuery, ...] = ( + LabelledQuery("PGV0", r"DROP TABLE IF EXISTS hnsw_test_table;"), + LabelledQuery("PGV1", r"CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;"), + LabelledQuery("PGV2", r"INSERT INTO hnsw_test_table SELECT * FROM documents;"), + LabelledQuery("PGV3", r"CREATE INDEX ON hnsw_test_table (_id);"), + LabelledQuery("PGV4", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);"), + LabelledQuery("PGV5", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);"), + LabelledQuery("PGV6", r"CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);"), + LabelledQuery("PGV7", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);"), + LabelledQuery("PGV8", r"CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);"), +) +# fmt: on + + EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" @@ -245,3 +264,18 @@ def
test_clickbench_collect_pg_stat_statements(remote_compare: RemoteCompare): log.info("Collecting pg_stat_statements") query = LabelledQuery("Q_COLLECT_PG_STAT_STATEMENTS", r"SELECT * from pg_stat_statements;") run_psql(remote_compare, query, times=1, explain=False) + + +@pytest.mark.parametrize("query", PGVECTOR_QUERIES) +@pytest.mark.remote_cluster +def test_pgvector_indexing(query: LabelledQuery, remote_compare: RemoteCompare): + """ + A pgvector test that tests HNSW index build performance and parallelism. + + The DB is prepared manually in advance. + See + - test_runner/performance/pgvector/README.md + - test_runner/performance/pgvector/loaddata.py + - test_runner/performance/pgvector/HNSW_build.sql + """ + run_psql(remote_compare, query, times=1, explain=False) diff --git a/test_runner/performance/test_perf_pgbench.py b/test_runner/performance/test_perf_pgbench.py index 2b8760dff2..d756d6eeca 100644 --- a/test_runner/performance/test_perf_pgbench.py +++ b/test_runner/performance/test_perf_pgbench.py @@ -17,6 +17,7 @@ class PgBenchLoadType(enum.Enum): INIT = "init" SIMPLE_UPDATE = "simple-update" SELECT_ONLY = "select-only" + PGVECTOR_HNSW = "pgvector-hnsw" def utc_now_timestamp() -> int: @@ -132,6 +133,26 @@ def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: P password=password, ) + if workload_type == PgBenchLoadType.PGVECTOR_HNSW: + # Run pgvector HNSW similarity-search workload + run_pgbench( + env, + "pgvector-hnsw", + [ + "pgbench", + "-f", + "test_runner/performance/pgvector/pgbench_custom_script_pgvector_hsnw_queries.sql", + "-c100", + "-j20", + f"-T{duration}", + "-P2", + "--protocol=prepared", + "--progress-timestamp", + connstr, + ], + password=password, + ) + env.report_size() @@ -201,3 +222,13 @@ def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, dur @pytest.mark.remote_cluster +def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int): run_test_pgbench(remote_compare, scale, duration,
PgBenchLoadType.SELECT_ONLY) + + +# The following test runs on an existing database that has the pgvector extension installed +# and a table with 1 million embedding vectors loaded and indexed with HNSW. +# +# Run this pgbench test against an existing remote Postgres cluster with the necessary setup. +@pytest.mark.parametrize("duration", get_durations_matrix()) +@pytest.mark.remote_cluster +def test_pgbench_remote_pgvector(remote_compare: PgCompare, duration: int): + run_test_pgbench(remote_compare, 1, duration, PgBenchLoadType.PGVECTOR_HNSW)