From 89e6232aebc41aaa76f3f7f0199763eb301f2a40 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:40:40 -0700 Subject: [PATCH 1/6] Make distance metric configurable during search --- python/lancedb/query.py | 17 +++++++++++++++++ python/tests/test_query.py | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/python/lancedb/query.py b/python/lancedb/query.py index 21333bec..949e5ef7 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -24,6 +24,7 @@ class LanceQueryBuilder: """ def __init__(self, table: "lancedb.table.LanceTable", query: np.ndarray): + self._metric = "l2" self._nprobes = 20 self._refine_factor = None self._table = table @@ -77,6 +78,21 @@ class LanceQueryBuilder: self._where = where return self + def metric(self, metric: str) -> LanceQueryBuilder: + """Set the distance metric to use. + + Parameters + ---------- + metric: str + The distance metric to use. By default "l2" is used. + + Returns + ------- + The LanceQueryBuilder object. + """ + self._metric = metric + return self + def nprobes(self, nprobes: int) -> LanceQueryBuilder: """Set the number of probes to use. 
@@ -118,6 +134,7 @@ class LanceQueryBuilder: "column": VECTOR_COLUMN_NAME, "q": self._query, "k": self._limit, + "metric": self._metric, "nprobes": self._nprobes, "refine_factor": self._refine_factor, }, diff --git a/python/tests/test_query.py b/python/tests/test_query.py index c08cdd8f..ae1bebda 100644 --- a/python/tests/test_query.py +++ b/python/tests/test_query.py @@ -14,7 +14,9 @@ import lance from lancedb.query import LanceQueryBuilder +import numpy as np import pandas as pd +import pandas.testing as tm import pyarrow as pa import pytest @@ -60,3 +62,20 @@ def test_query_builder_with_filter(table): df = LanceQueryBuilder(table, [0, 0]).where("id = 2").to_df() assert df["id"].values[0] == 2 assert all(df["vector"].values[0] == [3, 4]) + + +def test_query_builder_with_metric(table): + query = [4, 8] + df_default = LanceQueryBuilder(table, query).to_df() + df_l2 = LanceQueryBuilder(table, query).metric("l2").to_df() + tm.assert_frame_equal(df_default, df_l2) + + df_cosine = LanceQueryBuilder(table, query).metric("cosine").limit(1).to_df() + assert df_cosine.score[0] == pytest.approx( + cosine_distance(query, df_cosine.vector[0]) + ) + assert 0 <= df_cosine.score[0] <= 1 + + +def cosine_distance(vec1, vec2): + return 1 - np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) From b0e578c60905f88f97d5a3d8b8f39987b11aca11 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:42:30 -0700 Subject: [PATCH 2/6] add documentation for metric --- docs/src/ann_indexes.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 96b93e2b..75de5c43 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -28,11 +28,11 @@ tbl.create_index(num_partitions=256, num_sub_vectors=96) Since `create_index` has a training step, it can take a few minutes to finish for large tables. 
You can control the index creation by providing the following parameters: -- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table -with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional. -A higher number leads to faster queries, but it makes index generation slower. +- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table +with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional. +A higher number leads to faster queries, but it makes index generation slower. - **num_sub_vectors** (default: 96): The number of subvectors (M) that will be created during Product Quantization (PQ). A larger number makes -search more accurate, but also makes the index larger and slower to build. +search more accurate, but also makes the index larger and slower to build. ## Querying an ANN Index @@ -41,8 +41,9 @@ Querying vector indexes is done via the [search](https://lancedb.github.io/lance There are a couple of parameters that can be used to fine-tune the search: - **limit** (default: 10): The amount of results that will be returned +- **metric** (default: "l2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance. - **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower. -- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes +- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes search more accurate but also slower. 
```python From 72a44eb927f9a8ebf47fc924c9bce4396371dfbe Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:45:37 -0700 Subject: [PATCH 3/6] specify metric during index creation --- docs/src/ann_indexes.md | 2 +- python/lancedb/query.py | 2 +- python/lancedb/table.py | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 75de5c43..41f26985 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -41,7 +41,7 @@ Querying vector indexes is done via the [search](https://lancedb.github.io/lance There are a couple of parameters that can be used to fine-tune the search: - **limit** (default: 10): The amount of results that will be returned -- **metric** (default: "l2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance. +- **metric** (default: "L2"): The distance metric to use. The default, "L2", is Euclidean distance; "cosine" (cosine distance) is also supported. - **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower. - **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes search more accurate but also slower. 
diff --git a/python/lancedb/query.py b/python/lancedb/query.py index 949e5ef7..1adb8ccb 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -24,7 +24,7 @@ class LanceQueryBuilder: """ def __init__(self, table: "lancedb.table.LanceTable", query: np.ndarray): - self._metric = "l2" + self._metric = "L2" self._nprobes = 20 self._refine_factor = None self._table = table diff --git a/python/lancedb/table.py b/python/lancedb/table.py index f798fb37..f633cce5 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -106,11 +106,14 @@ class LanceTable: def _dataset_uri(self) -> str: return os.path.join(self._conn.uri, f"{self.name}.lance") - def create_index(self, num_partitions=256, num_sub_vectors=96): + def create_index(self, metric="L2", num_partitions=256, num_sub_vectors=96): """Create an index on the table. Parameters ---------- + metric: str, default "L2" + The distance metric to use when creating the index. Valid values are "L2" or "cosine". + L2 is euclidean distance. num_partitions: int The number of IVF partitions to use when creating the index. Default is 256. @@ -121,6 +124,7 @@ class LanceTable: self._dataset.create_index( column=VECTOR_COLUMN_NAME, index_type="IVF_PQ", + metric=metric, num_partitions=num_partitions, num_sub_vectors=num_sub_vectors, ) From 6592b4c13b36c82c72fd75874a27a6241f9e3f6b Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:46:21 -0700 Subject: [PATCH 4/6] document metric in create_index --- docs/src/ann_indexes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 41f26985..be98cc45 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -28,6 +28,7 @@ tbl.create_index(num_partitions=256, num_sub_vectors=96) Since `create_index` has a training step, it can take a few minutes to finish for large tables. 
You can control the index creation by providing the following parameters: +- **metric** (default: "L2"): The distance metric to use. The default, "L2", is Euclidean distance; "cosine" (cosine distance) is also supported. - **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional. A higher number leads to faster queries, but it makes index generation slower. From 7a375185a1cf21771c025e21c6dec924b85fcf80 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Tue, 25 Apr 2023 19:57:58 -0700 Subject: [PATCH 5/6] increment lance version to include cosine distance fix --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index b30d6494..2884c8ee 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "lancedb" version = "0.1" -dependencies = ["pylance>=0.4.3", "ratelimiter", "retry", "tqdm"] +dependencies = ["pylance>=0.4.4", "ratelimiter", "retry", "tqdm"] description = "lancedb" authors = [ { name = "Lance Devs", email = "dev@eto.ai" }, From a8db7f56d24792a8f52e3021698cf820e55d595c Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Tue, 25 Apr 2023 20:08:18 -0700 Subject: [PATCH 6/6] tolerance --- python/tests/test_query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tests/test_query.py b/python/tests/test_query.py index ae1bebda..9ad7c928 100644 --- a/python/tests/test_query.py +++ b/python/tests/test_query.py @@ -72,7 +72,8 @@ def test_query_builder_with_metric(table): df_cosine = LanceQueryBuilder(table, query).metric("cosine").limit(1).to_df() assert df_cosine.score[0] == pytest.approx( -
cosine_distance(query, df_cosine.vector[0]) + cosine_distance(query, df_cosine.vector[0]), + abs=1e-6, ) assert 0 <= df_cosine.score[0] <= 1