Performance test for pgvector HNSW index build and queries (#7873)

## Problem

We want to regularly verify the performance of pgvector HNSW parallel
index builds and parallel similarity search using HNSW indexes.
pgvector 0.7.0 was the first release that considerably improved index-build
parallelism, and we want to make sure that our Neon compute VM settings
(swap, memory overcommit, Postgres configuration, etc.) do not cause a regression.

## Summary of changes

- Prepare a Neon project with 1 million OpenAI vector embeddings (vector size 1536).
- Run HNSW indexing operations in the regression test for the various distance metrics.
- Run similarity queries using pgbench with 100 concurrent clients (see the example invocation below).
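
A representative pgbench invocation, matching the comment checked in with the pgbench query script (connection string left as the committed placeholder):

```bash
pgbench -T 300 -c 100 -j 20 -f pgbench_hnsw_queries.sql \
  "postgresql://neondb_owner:<secret>@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require"
```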

I have also added the relevant metrics to the Grafana dashboards for pgbench and olape.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Committed by Peter Bendel on 2024-05-28 13:05:33 +02:00 (committed via GitHub).
Commit fabeff822f, parent 4a0ce9512b.
10 changed files with 395 additions and 3 deletions.


@@ -0,0 +1,47 @@
\set ECHO queries
\timing
-- prepare test table
DROP TABLE IF EXISTS hnsw_test_table;
CREATE TABLE hnsw_test_table AS TABLE documents WITH NO DATA;
INSERT INTO hnsw_test_table SELECT * FROM documents;
CREATE INDEX ON hnsw_test_table (_id); -- needed later for random tuple queries
-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';
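-- note: with max_parallel_maintenance_workers = 7 the leader plus up to 7 workers
-- (8 processes in total) can cooperate on each index build, assuming the compute has enough vCPUs;
-- builds are fastest when the graph fits into maintenance_work_mem.
-- The HNSW indexes below use pgvector's default build parameters (m = 16, ef_construction = 64).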
-- create HNSW index for the supported distance metrics
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_cosine_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_ip_ops);
CREATE INDEX ON hnsw_test_table USING hnsw (embeddings vector_l1_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops);
CREATE INDEX ON hnsw_test_table USING hnsw ((binary_quantize(embeddings)::bit(1536)) bit_jaccard_ops);
-- note: in a second psql session we can monitor the progress of the index build phases using
-- the following query:
-- SELECT phase, round(100.0 * blocks_done / nullif(blocks_total, 0), 1) AS "%" FROM pg_stat_progress_create_index;
-- show all indexes built on the table
SELECT
    idx.relname AS index_name,
    tbl.relname AS table_name,
    am.amname   AS access_method,
    a.attname   AS column_name,
    opc.opcname AS operator_class
FROM pg_index i
JOIN pg_class idx     ON idx.oid = i.indexrelid
JOIN pg_class tbl     ON tbl.oid = i.indrelid
JOIN pg_am am         ON am.oid = idx.relam
JOIN pg_attribute a   ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN pg_opclass opc   ON opc.oid = i.indclass[0]
WHERE tbl.relname = 'hnsw_test_table'
  AND a.attname = 'embeddings';
-- show table sizes
\dt+


@@ -0,0 +1,52 @@
\set ECHO queries
\timing
-- prepare test table
DROP TABLE IF EXISTS ivfflat_test_table;
CREATE TABLE ivfflat_test_table AS TABLE documents WITH NO DATA;
INSERT INTO ivfflat_test_table SELECT * FROM documents;
CREATE INDEX ON ivfflat_test_table (_id); -- needed later for random tuple queries
-- tune index build params
SET max_parallel_maintenance_workers = 7;
SET maintenance_work_mem = '8GB';
-- create ivfflat index for the supported distance metrics
-- the recommended formula for lists is rows / 1000 (or sqrt(rows) once rows > 1 million)
-- we have 1 million embeddings of vector size 1536 in column embeddings of table documents,
-- so we use 1,000,000 / 1000 = 1000 lists
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_ip_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat (embeddings vector_cosine_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table USING ivfflat ((embeddings::halfvec(1536)) halfvec_l2_ops) WITH (lists = 1000);
CREATE INDEX ON ivfflat_test_table
USING ivfflat ((binary_quantize(embeddings)::bit(1536)) bit_hamming_ops) WITH (lists = 1000);
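-- note: lists only affects the build; at query time ivfflat.probes (pgvector default: 1)
-- controls how many lists are scanned and trades latency for recall.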
\d ivfflat_test_table
-- show all indexes built on the table
SELECT
    idx.relname AS index_name,
    tbl.relname AS table_name,
    am.amname   AS access_method,
    a.attname   AS column_name,
    opc.opcname AS operator_class
FROM pg_index i
JOIN pg_class idx     ON idx.oid = i.indexrelid
JOIN pg_class tbl     ON tbl.oid = i.indrelid
JOIN pg_am am         ON am.oid = idx.relam
JOIN pg_attribute a   ON a.attrelid = tbl.oid AND a.attnum = ANY(i.indkey)
JOIN pg_opclass opc   ON opc.oid = i.indclass[0]
WHERE tbl.relname = 'ivfflat_test_table'
  AND a.attname = 'embeddings';
-- show table sizes
\dt+


@@ -0,0 +1,38 @@
---
dataset_info:
  features:
  - name: _id
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  - name: text-embedding-3-large-1536-embedding
    sequence: float64
  splits:
  - name: train
    num_bytes: 12679725776
    num_examples: 1000000
  download_size: 9551862565
  dataset_size: 12679725776
configs:
- config_name: default
  data_files:
  - split: train
    path: data/train-*
license: mit
task_categories:
- feature-extraction
language:
- en
size_categories:
- 1M<n<10M
---
1M OpenAI Embeddings: text-embedding-3-large 1536 dimensions
- Created: February 2024.
- Text used for Embedding: title (string) + text (string)
- Embedding Model: OpenAI text-embedding-3-large
- This dataset was generated from the first 1M entries of https://huggingface.co/datasets/BeIR/dbpedia-entity, extracted by @KShivendu_ [here](https://huggingface.co/datasets/KShivendu/dbpedia-entities-openai-1M)


@@ -0,0 +1,72 @@
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values


def print_usage():
    print("Usage: loaddata.py <CONNSTR> <DATADIR>")


def main(conn_str, directory_path):
    # Connection to PostgreSQL
    with psycopg2.connect(conn_str) as conn:
        with conn.cursor() as cursor:
            # Run SQL statements
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
            register_vector(conn)
            cursor.execute("DROP TABLE IF EXISTS documents;")
            cursor.execute(
                """
                CREATE TABLE documents (
                    _id TEXT PRIMARY KEY,
                    title TEXT,
                    text TEXT,
                    embeddings vector(1536) -- text-embedding-3-large-1536-embedding (OpenAI)
                );
                """
            )
            conn.commit()

            # List and sort Parquet files
            parquet_files = sorted(Path(directory_path).glob("*.parquet"))

            for file in parquet_files:
                print(f"Loading {file} into PostgreSQL")
                df = pd.read_parquet(file)
                print(df.head())

                data_list = [
                    (
                        row["_id"],
                        row["title"],
                        row["text"],
                        np.array(row["text-embedding-3-large-1536-embedding"]),
                    )
                    for index, row in df.iterrows()
                ]
                # Use execute_values to perform batch insertion
                execute_values(
                    cursor,
                    "INSERT INTO documents (_id, title, text, embeddings) VALUES %s",
                    data_list,
                )
                # Commit after we insert all embeddings
                conn.commit()
                print(f"Loaded {file} into PostgreSQL")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print_usage()
        sys.exit(1)

    conn_str = sys.argv[1]
    directory_path = sys.argv[2]
    main(conn_str, directory_path)
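
# Example invocation (hypothetical connection string and data directory):
#   python loaddata.py "postgresql://neondb_owner:<password>@<endpoint>/neondb" ./data
# <DATADIR> is expected to contain the parquet shards of the 1M-embedding dataset described above.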


@@ -0,0 +1,10 @@
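-- note: TABLESAMPLE SYSTEM (1) reads a random ~1% sample of the table's pages and LIMIT 1
-- keeps one embedding from it, so each execution searches with a different query vector.
-- <=> is pgvector's cosine distance operator, so the hnsw vector_cosine_ops index can be used;
-- query-time recall/latency is governed by hnsw.ef_search (pgvector default: 40).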
WITH x (x) AS (
    SELECT "embeddings" AS x
    FROM hnsw_test_table
    TABLESAMPLE SYSTEM (1)
    LIMIT 1
)
SELECT title, "embeddings" <=> (SELECT x FROM x) AS distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;


@@ -0,0 +1,13 @@
-- run with a pooled connection, e.g.:
-- pgbench -T 300 -c 100 -j 20 -f pgbench_hnsw_queries.sql "postgresql://neondb_owner:<secret>@ep-floral-thunder-w1gzhaxi-pooler.eu-west-1.aws.neon.build/neondb?sslmode=require"
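-- each pgbench transaction runs one top-30 similarity search against a random query vector;
-- -c 100 keeps 100 client connections busy from -j 20 worker threads, and pgbench reports
-- latency and throughput (TPS) at the end of the 300-second (-T 300) run.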
WITH x (x) AS (
    SELECT "embeddings" AS x
    FROM hnsw_test_table
    TABLESAMPLE SYSTEM (1)
    LIMIT 1
)
SELECT title, "embeddings" <=> (SELECT x FROM x) AS distance
FROM hnsw_test_table
ORDER BY 2
LIMIT 30;