From 5e748e6e70071cf16d12f7f4556a84d8a76b3cd6 Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Mon, 24 Apr 2023 18:46:05 -0700
Subject: [PATCH 1/7] Minor notebook fix. Closes #40

---
 notebooks/youtube_transcript_search.ipynb | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/notebooks/youtube_transcript_search.ipynb b/notebooks/youtube_transcript_search.ipynb
index b3bfd83d..987373e8 100644
--- a/notebooks/youtube_transcript_search.ipynb
+++ b/notebooks/youtube_transcript_search.ipynb
@@ -1,7 +1,6 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "42bf01fb",
    "metadata": {},
@@ -22,10 +21,10 @@
      "output_type": "stream",
      "text": [
       "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
       "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.1\u001b[0m\n",
       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
      ]
     }
@@ -88,7 +87,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "5ac2b6a3",
    "metadata": {},
@@ -231,7 +229,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "2106b5bb",
    "metadata": {},
@@ -251,7 +248,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "39f3161f3ef54a129cd65fb296332b54",
+       "model_id": "c6f1c76d9567421d88911923388d2530",
        "version_major": 2,
        "version_minor": 0
       },
@@ -574,7 +571,6 @@
    ]
   },
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "559a095b",
    "metadata": {},
@@ -631,7 +627,7 @@
        "        <iframe\n",
        "            width=\"400\"\n",
        "            height=\"300\"\n",
-       "            src=\"https://www.youtube.com/embed/pNvujJ1XyeQ?start=289.76\"\n",
+       "            src=\"https://www.youtube.com/embed/pNvujJ1XyeQ?start=289\"\n",
        "            frameborder=\"0\"\n",
        "            allowfullscreen\n",
        "            \n",
@@ -639,7 +635,7 @@
        "        "
       ],
       "text/plain": [
-       "<IPython.lib.display.YouTubeVideo at 0x177fde4d0>"
+       "<IPython.lib.display.YouTubeVideo at 0x13ec062c0>"
       ]
      },
      "execution_count": 15,
@@ -651,7 +647,7 @@
     "from IPython.display import YouTubeVideo\n",
     "\n",
     "top_match = context.iloc[0]\n",
-    "YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=top_match[\"start\"])"
+    "YouTubeVideo(top_match[\"url\"].split(\"/\")[-1], start=int(top_match[\"start\"]))"
    ]
   },
   {

From 89e6232aebc41aaa76f3f7f0199763eb301f2a40 Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Mon, 24 Apr 2023 22:40:40 -0700
Subject: [PATCH 2/7] Make distance metric configurable during search

---
 python/lancedb/query.py    | 17 +++++++++++++++++
 python/tests/test_query.py | 19 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/python/lancedb/query.py b/python/lancedb/query.py
index 21333bec..949e5ef7 100644
--- a/python/lancedb/query.py
+++ b/python/lancedb/query.py
@@ -24,6 +24,7 @@ class LanceQueryBuilder:
     """
 
     def __init__(self, table: "lancedb.table.LanceTable", query: np.ndarray):
+        self._metric = "l2"
         self._nprobes = 20
         self._refine_factor = None
         self._table = table
@@ -77,6 +78,21 @@ class LanceQueryBuilder:
         self._where = where
         return self
 
+    def metric(self, metric: str) -> LanceQueryBuilder:
+        """Set the distance metric to use.
+
+        Parameters
+        ----------
+        metric: str
+            The distance metric to use: "L2" (Euclidean, the default) or "cosine".
+
+        Returns
+        -------
+        The LanceQueryBuilder object.
+        """
+        self._metric = metric
+        return self
+
     def nprobes(self, nprobes: int) -> LanceQueryBuilder:
         """Set the number of probes to use.
 
@@ -118,6 +134,7 @@ class LanceQueryBuilder:
                 "column": VECTOR_COLUMN_NAME,
                 "q": self._query,
                 "k": self._limit,
+                "metric": self._metric,
                 "nprobes": self._nprobes,
                 "refine_factor": self._refine_factor,
             },
diff --git a/python/tests/test_query.py b/python/tests/test_query.py
index c08cdd8f..ae1bebda 100644
--- a/python/tests/test_query.py
+++ b/python/tests/test_query.py
@@ -14,7 +14,9 @@
 import lance
 from lancedb.query import LanceQueryBuilder
 
+import numpy as np
 import pandas as pd
+import pandas.testing as tm
 import pyarrow as pa
 
 import pytest
@@ -60,3 +62,20 @@ def test_query_builder_with_filter(table):
     df = LanceQueryBuilder(table, [0, 0]).where("id = 2").to_df()
     assert df["id"].values[0] == 2
     assert all(df["vector"].values[0] == [3, 4])
+
+
+def test_query_builder_with_metric(table):
+    query = [4, 8]
+    df_default = LanceQueryBuilder(table, query).to_df()
+    df_l2 = LanceQueryBuilder(table, query).metric("l2").to_df()
+    tm.assert_frame_equal(df_default, df_l2)
+
+    df_cosine = LanceQueryBuilder(table, query).metric("cosine").limit(1).to_df()
+    assert df_cosine.score[0] == pytest.approx(
+        cosine_distance(query, df_cosine.vector[0])
+    )
+    assert 0 <= df_cosine.score[0] <= 1
+
+
+def cosine_distance(vec1, vec2):
+    return 1 - np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

From b0e578c60905f88f97d5a3d8b8f39987b11aca11 Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Mon, 24 Apr 2023 22:42:30 -0700
Subject: [PATCH 3/7] add documentation for metric

---
 docs/src/ann_indexes.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md
index 96b93e2b..75de5c43 100644
--- a/docs/src/ann_indexes.md
+++ b/docs/src/ann_indexes.md
@@ -28,11 +28,11 @@ tbl.create_index(num_partitions=256, num_sub_vectors=96)
 Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index
 creation by providing the following parameters:
 
-- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table 
-with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional. 
-A higher number leads to faster queries, but it makes index generation slower. 
+- **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table
+with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional.
+A higher number leads to faster queries, but it makes index generation slower.
 - **num_sub_vectors** (default: 96): The number of subvectors (M) that will be created during Product Quantization (PQ). A larger number makes
-search more accurate, but also makes the index larger and slower to build. 
+search more accurate, but also makes the index larger and slower to build.
 
 ## Querying an ANN Index
 
@@ -41,8 +41,9 @@ Querying vector indexes is done via the [search](https://lancedb.github.io/lance
 There are a couple of parameters that can be used to fine-tune the search:
 
 - **limit** (default: 10): The amount of results that will be returned
+- **metric** (default: "l2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance.
 - **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower.
-- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes 
+- **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes
 search more accurate but also slower.
 
 ```python

From 72a44eb927f9a8ebf47fc924c9bce4396371dfbe Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Mon, 24 Apr 2023 22:45:37 -0700
Subject: [PATCH 4/7] specify metric during index creation

---
 docs/src/ann_indexes.md | 2 +-
 python/lancedb/query.py | 2 +-
 python/lancedb/table.py | 6 +++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md
index 75de5c43..41f26985 100644
--- a/docs/src/ann_indexes.md
+++ b/docs/src/ann_indexes.md
@@ -41,7 +41,7 @@ Querying vector indexes is done via the [search](https://lancedb.github.io/lance
 There are a couple of parameters that can be used to fine-tune the search:
 
 - **limit** (default: 10): The amount of results that will be returned
-- **metric** (default: "l2"): The distance metric to use. By default we use euclidean distance. We also support cosine distance.
+- **metric** (default: "L2"): The distance metric to use. By default we use Euclidean distance. We also support cosine distance.
 - **nprobes** (default: 20): The number of probes used. A higher number makes search more accurate but also slower.
 - **refine_factor** (default: None): Refine the results by reading extra elements and re-ranking them in memory. A higher number makes
 search more accurate but also slower.
diff --git a/python/lancedb/query.py b/python/lancedb/query.py
index 949e5ef7..1adb8ccb 100644
--- a/python/lancedb/query.py
+++ b/python/lancedb/query.py
@@ -24,7 +24,7 @@ class LanceQueryBuilder:
     """
 
     def __init__(self, table: "lancedb.table.LanceTable", query: np.ndarray):
-        self._metric = "l2"
+        self._metric = "L2"
         self._nprobes = 20
         self._refine_factor = None
         self._table = table
diff --git a/python/lancedb/table.py b/python/lancedb/table.py
index f798fb37..f633cce5 100644
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -106,11 +106,14 @@ class LanceTable:
     def _dataset_uri(self) -> str:
         return os.path.join(self._conn.uri, f"{self.name}.lance")
 
-    def create_index(self, num_partitions=256, num_sub_vectors=96):
+    def create_index(self, metric="L2", num_partitions=256, num_sub_vectors=96):
         """Create an index on the table.
 
         Parameters
         ----------
+        metric: str, default "L2"
+            The distance metric to use when creating the index. Valid values are "L2" or "cosine".
+            "L2" is Euclidean distance.
         num_partitions: int
             The number of IVF partitions to use when creating the index.
             Default is 256.
@@ -121,6 +124,7 @@ class LanceTable:
         self._dataset.create_index(
             column=VECTOR_COLUMN_NAME,
             index_type="IVF_PQ",
+            metric=metric,
             num_partitions=num_partitions,
             num_sub_vectors=num_sub_vectors,
         )

From 6592b4c13b36c82c72fd75874a27a6241f9e3f6b Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Mon, 24 Apr 2023 22:46:21 -0700
Subject: [PATCH 5/7] document metric in create_index

---
 docs/src/ann_indexes.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md
index 41f26985..be98cc45 100644
--- a/docs/src/ann_indexes.md
+++ b/docs/src/ann_indexes.md
@@ -28,6 +28,7 @@ tbl.create_index(num_partitions=256, num_sub_vectors=96)
 Since `create_index` has a training step, it can take a few minutes to finish for large tables. You can control the index
 creation by providing the following parameters:
 
+- **metric** (default: "L2"): The distance metric to use when building the index. Valid values are "L2" (Euclidean distance) and "cosine".
 - **num_partitions** (default: 256): The number of partitions of the index. The number of partitions should be configured so each partition has 3-5K vectors. For example, a table
 with ~1M vectors should use 256 partitions. You can specify arbitrary number of partitions but powers of 2 is most conventional.
 A higher number leads to faster queries, but it makes index generation slower.

From 7a375185a1cf21771c025e21c6dec924b85fcf80 Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Tue, 25 Apr 2023 19:57:58 -0700
Subject: [PATCH 6/7] increment lance version to include cosine distance fix

---
 python/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyproject.toml b/python/pyproject.toml
index b30d6494..2884c8ee 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "lancedb"
 version = "0.1"
-dependencies = ["pylance>=0.4.3", "ratelimiter", "retry", "tqdm"]
+dependencies = ["pylance>=0.4.4", "ratelimiter", "retry", "tqdm"]
 description = "lancedb"
 authors = [
     { name = "Lance Devs", email = "dev@eto.ai" },

From a8db7f56d24792a8f52e3021698cf820e55d595c Mon Sep 17 00:00:00 2001
From: Chang She <759245+changhiskhan@users.noreply.github.com>
Date: Tue, 25 Apr 2023 20:08:18 -0700
Subject: [PATCH 7/7] Add absolute tolerance to cosine distance test assertion

---
 python/tests/test_query.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/tests/test_query.py b/python/tests/test_query.py
index ae1bebda..9ad7c928 100644
--- a/python/tests/test_query.py
+++ b/python/tests/test_query.py
@@ -72,7 +72,8 @@ def test_query_builder_with_metric(table):
 
     df_cosine = LanceQueryBuilder(table, query).metric("cosine").limit(1).to_df()
     assert df_cosine.score[0] == pytest.approx(
-        cosine_distance(query, df_cosine.vector[0])
+        cosine_distance(query, df_cosine.vector[0]),
+        abs=1e-6,
     )
     assert 0 <= df_cosine.score[0] <= 1