From 5e748e6e70071cf16d12f7f4556a84d8a76b3cd6 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 18:46:05 -0700 Subject: [PATCH 1/7] Minor notebook fix. Closes #40 --- notebooks/youtube_transcript_search.ipynb | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/notebooks/youtube_transcript_search.ipynb b/notebooks/youtube_transcript_search.ipynb index b3bfd83d..987373e8 100644 --- a/notebooks/youtube_transcript_search.ipynb +++ b/notebooks/youtube_transcript_search.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "42bf01fb", "metadata": {}, @@ -22,10 +21,10 @@ "output_type": "stream", "text": [ "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" ] } @@ -88,7 +87,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "5ac2b6a3", "metadata": {}, @@ -231,7 +229,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "2106b5bb", "metadata": {}, @@ -251,7 +248,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "39f3161f3ef54a129cd65fb296332b54", + "model_id": "c6f1c76d9567421d88911923388d2530", "version_major": 2, "version_minor": 0 }, @@ -574,7 +571,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "559a095b", "metadata": {}, @@ -631,7 +627,7 @@ " " + "" ] }, "execution_count": 15, @@ -651,7 +647,7 @@ "from IPython.display import YouTubeVideo\n", "\n", "top_match = context.iloc[0]\n", - "YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=top_match[\"start\"])" + "YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=int(top_match[\"start\"]))" ] }, { From 89e6232aebc41aaa76f3f7f0199763eb301f2a40 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:40:40 -0700 Subject: [PATCH 2/7] Make distance metric configurable during search --- python/lancedb/query.py | 17 +++++++++++++++++ python/tests/test_query.py | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/python/lancedb/query.py b/python/lancedb/query.py index 21333bec..949e5ef7 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -24,6 +24,7 @@ class LanceQueryBuilder: """ def __init__(self, table: "lancedb.table.LanceTable", query: np.ndarray): + self._metric = "l2" self._nprobes = 20 self._refine_factor = None self._table = table @@ -77,6 +78,21 @@ class LanceQueryBuilder: self._where = where return self + def metric(self, metric: str) -> LanceQueryBuilder: + """Set the distance metric to use. + + Parameters + ---------- + metric: str + The distance metric to use. By default "l2" is used. + + Returns + ------- + The LanceQueryBuilder object. + """ + self._metric = metric + return self + def nprobes(self, nprobes: int) -> LanceQueryBuilder: """Set the number of probes to use. @@ -118,6 +134,7 @@ class LanceQueryBuilder: "column": VECTOR_COLUMN_NAME, "q": self._query, "k": self._limit, + "metric": self._metric, "nprobes": self._nprobes, "refine_factor": self._refine_factor, }, diff --git a/python/tests/test_query.py b/python/tests/test_query.py index c08cdd8f..ae1bebda 100644 --- a/python/tests/test_query.py +++ b/python/tests/test_query.py @@ -14,7 +14,9 @@ import lance from lancedb.query import LanceQueryBuilder +import numpy as np import pandas as pd +import pandas.testing as tm import pyarrow as pa import pytest @@ -60,3 +62,20 @@ def test_query_builder_with_filter(table): df = LanceQueryBuilder(table, [0, 0]).where("id = 2").to_df() assert df["id"].values[0] == 2 assert all(df["vector"].values[0] == [3, 4]) + + +def test_query_builder_with_metric(table): + query = [4, 8] + df_default = LanceQueryBuilder(table, query).to_df() + df_l2 = LanceQueryBuilder(table, query).metric("l2").to_df() + tm.assert_frame_equal(df_default, df_l2) + + df_cosine = LanceQueryBuilder(table, query).metric("cosine").limit(1).to_df() + assert df_cosine.score[0] == pytest.approx( + cosine_distance(query, df_cosine.vector[0]) + ) + assert 0 <= df_cosine.score[0] <= 1 + + +def cosine_distance(vec1, vec2): + return 1 - np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) From b0e578c60905f88f97d5a3d8b8f39987b11aca11 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:42:30 -0700 Subject: [PATCH 3/7] add documentation for metric --- docs/src/ann_indexes.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 96b93e2b..75de5c43 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -28,11 +28,11 @@ tbl.create_index(num_partitions=256, num_sub_vectors=96) Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index creation by providing the following parameters: -- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table -with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional. -A higher number leads to faster queries, but it makes index generation slower. +- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table +with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional. +A higher number leads to faster queries, but it makes index generation slower. - **num_sub_vectors** (default: 96): The number of subvectors (M) that will be created during Product Quantization (PQ). A larger number makes -search more accurate, but also makes the index larger and slower to build. +search more accurate, but also makes the index larger and slower to build. ## Querying an ANN Index @@ -41,8 +41,9 @@ Querying vector indexes is done via the [search](https://lancedb.github.io/lance There are a couple of parameters that can be used to fine-tune the search: - **limit** (default: 10): The amount of results that will be returned +- **metric** (default: "l2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance. - **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower. -- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes +- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes search more accurate but also slower. ```python From 72a44eb927f9a8ebf47fc924c9bce4396371dfbe Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:45:37 -0700 Subject: [PATCH 4/7] specify metric during index creation --- docs/src/ann_indexes.md | 2 +- python/lancedb/query.py | 2 +- python/lancedb/table.py | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 75de5c43..41f26985 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -41,7 +41,7 @@ Querying vector indexes is done via the [search](https://lancedb.github.io/lance There are a couple of parameters that can be used to fine-tune the search: - **limit** (default: 10): The amount of results that will be returned -- **metric** (default: "l2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance. +- **metric** (default: "L2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance. - **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower. - **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes search more accurate but also slower. diff --git a/python/lancedb/query.py b/python/lancedb/query.py index 949e5ef7..1adb8ccb 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -24,7 +24,7 @@ class LanceQueryBuilder: """ def __init__(self, table: "lancedb.table.LanceTable", query: np.ndarray): - self._metric = "l2" + self._metric = "L2" self._nprobes = 20 self._refine_factor = None self._table = table diff --git a/python/lancedb/table.py b/python/lancedb/table.py index f798fb37..f633cce5 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -106,11 +106,14 @@ class LanceTable: def _dataset_uri(self) -> str: return os.path.join(self._conn.uri, f"{self.name}.lance") - def create_index(self, num_partitions=256, num_sub_vectors=96): + def create_index(self, metric="L2", num_partitions=256, num_sub_vectors=96): """Create an index on the table. Parameters ---------- + metric: str, default "L2" + The distance metric to use when creating the index. Valid values are "L2" or "cosine". + L2 is euclidean distance. num_partitions: int The number of IVF partitions to use when creating the index. Default is 256. @@ -121,6 +124,7 @@ class LanceTable: self._dataset.create_index( column=VECTOR_COLUMN_NAME, index_type="IVF_PQ", + metric=metric, num_partitions=num_partitions, num_sub_vectors=num_sub_vectors, ) From 6592b4c13b36c82c72fd75874a27a6241f9e3f6b Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 24 Apr 2023 22:46:21 -0700 Subject: [PATCH 5/7] document metric in create_index --- docs/src/ann_indexes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 41f26985..be98cc45 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -28,6 +28,7 @@ tbl.create_index(num_partitions=256, num_sub_vectors=96) Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index creation by providing the following parameters: +- **metric** (default: "L2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance. - **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional. A higher number leads to faster queries, but it makes index generation slower. From 7a375185a1cf21771c025e21c6dec924b85fcf80 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Tue, 25 Apr 2023 19:57:58 -0700 Subject: [PATCH 6/7] increment lance version to include cosine distance fix --- python/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index b30d6494..2884c8ee 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "lancedb" version = "0.1" -dependencies = ["pylance>=0.4.3", "ratelimiter", "retry", "tqdm"] +dependencies = ["pylance>=0.4.4", "ratelimiter", "retry", "tqdm"] description = "lancedb" authors = [ { name = "Lance Devs", email = "dev@eto.ai" }, From a8db7f56d24792a8f52e3021698cf820e55d595c Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Tue, 25 Apr 2023 20:08:18 -0700 Subject: [PATCH 7/7] tolerance --- python/tests/test_query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tests/test_query.py b/python/tests/test_query.py index ae1bebda..9ad7c928 100644 --- a/python/tests/test_query.py +++ b/python/tests/test_query.py @@ -72,7 +72,8 @@ def test_query_builder_with_metric(table): df_cosine = LanceQueryBuilder(table, query).metric("cosine").limit(1).to_df() assert df_cosine.score[0] == pytest.approx( - cosine_distance(query, df_cosine.vector[0]) + cosine_distance(query, df_cosine.vector[0]), + abs=1e-6, ) assert 0 <= df_cosine.score[0] <= 1