fix: metric type inconsistency (#2122)

PR fixes #2113

---------

Co-authored-by: Will Jones <willjones127@gmail.com>
This commit is contained in:
Gagan Bhullar
2025-03-12 11:28:37 -06:00
committed by GitHub
parent dd22a379b2
commit 14677d7c18
24 changed files with 104 additions and 89 deletions

View File

@@ -150,7 +150,7 @@ class HnswPq:
Parameters
----------
distance_type: str, default "L2"
distance_type: str, default "l2"
The distance metric used to train the index.
@@ -158,18 +158,18 @@ class HnswPq:
"l2" - Euclidean distance. This is a very common distance metric that
accounts for both magnitude and direction when determining the distance
between vectors. L2 distance has a range of [0, ∞).
between vectors. l2 distance has a range of [0, ∞).
"cosine" - Cosine distance. Cosine distance is a distance metric
calculated from the cosine similarity between two vectors. Cosine
similarity is a measure of similarity between two non-zero vectors of an
inner product space. It is defined to equal the cosine of the angle
between them. Unlike L2, the cosine distance is not affected by the
between them. Unlike l2, the cosine distance is not affected by the
magnitude of the vectors. Cosine distance has a range of [0, 2].
"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
L2 norm is 1), then dot distance is equivalent to the cosine distance.
l2 norm is 1), then dot distance is equivalent to the cosine distance.
num_partitions: int, default sqrt(num_rows)
@@ -271,7 +271,7 @@ class HnswSq:
Parameters
----------
distance_type: str, default "L2"
distance_type: str, default "l2"
The distance metric used to train the index.
@@ -279,18 +279,18 @@ class HnswSq:
"l2" - Euclidean distance. This is a very common distance metric that
accounts for both magnitude and direction when determining the distance
between vectors. L2 distance has a range of [0, ∞).
between vectors. l2 distance has a range of [0, ∞).
"cosine" - Cosine distance. Cosine distance is a distance metric
calculated from the cosine similarity between two vectors. Cosine
similarity is a measure of similarity between two non-zero vectors of an
inner product space. It is defined to equal the cosine of the angle
between them. Unlike L2, the cosine distance is not affected by the
between them. Unlike l2, the cosine distance is not affected by the
magnitude of the vectors. Cosine distance has a range of [0, 2].
"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
L2 norm is 1), then dot distance is equivalent to the cosine distance.
l2 norm is 1), then dot distance is equivalent to the cosine distance.
num_partitions: int, default sqrt(num_rows)
@@ -369,7 +369,7 @@ class IvfFlat:
Attributes
----------
distance_type: str, default "L2"
distance_type: str, default "l2"
The distance metric used to train the index
This is used when training the index to calculate the IVF partitions
@@ -383,13 +383,13 @@ class IvfFlat:
"l2" - Euclidean distance. This is a very common distance metric that
accounts for both magnitude and direction when determining the distance
between vectors. L2 distance has a range of [0, ∞).
between vectors. l2 distance has a range of [0, ∞).
"cosine" - Cosine distance. Cosine distance is a distance metric
calculated from the cosine similarity between two vectors. Cosine
similarity is a measure of similarity between two non-zero vectors of an
inner product space. It is defined to equal the cosine of the angle
between them. Unlike L2, the cosine distance is not affected by the
between them. Unlike l2, the cosine distance is not affected by the
magnitude of the vectors. Cosine distance has a range of [0, 2].
Note: the cosine distance is undefined when one (or both) of the vectors
@@ -398,7 +398,7 @@ class IvfFlat:
"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
L2 norm is 1), then dot distance is equivalent to the cosine distance.
l2 norm is 1), then dot distance is equivalent to the cosine distance.
"hamming" - Hamming distance. Hamming distance is a distance metric
calculated as the number of positions at which the corresponding bits are
@@ -475,7 +475,7 @@ class IvfPq:
Attributes
----------
distance_type: str, default "L2"
distance_type: str, default "l2"
The distance metric used to train the index
This is used when training the index to calculate the IVF partitions
@@ -489,13 +489,13 @@ class IvfPq:
"l2" - Euclidean distance. This is a very common distance metric that
accounts for both magnitude and direction when determining the distance
between vectors. L2 distance has a range of [0, ∞).
between vectors. l2 distance has a range of [0, ∞).
"cosine" - Cosine distance. Cosine distance is a distance metric
calculated from the cosine similarity between two vectors. Cosine
similarity is a measure of similarity between two non-zero vectors of an
inner product space. It is defined to equal the cosine of the angle
between them. Unlike L2, the cosine distance is not affected by the
between them. Unlike l2, the cosine distance is not affected by the
magnitude of the vectors. Cosine distance has a range of [0, 2].
Note: the cosine distance is undefined when one (or both) of the vectors
@@ -504,7 +504,7 @@ class IvfPq:
"dot" - Dot product. Dot distance is the dot product of two vectors. Dot
distance has a range of (-∞, ∞). If the vectors are normalized (i.e. their
L2 norm is 1), then dot distance is equivalent to the cosine distance.
l2 norm is 1), then dot distance is equivalent to the cosine distance.
num_partitions: int, default sqrt(num_rows)
The number of IVF partitions to create.

View File

@@ -68,7 +68,7 @@ class Query(pydantic.BaseModel):
metric : str
the distance metric between a pair of vectors,
can support L2 (default), Cosine and Dot.
can support "l2" (default), "cosine" and "dot".
[metric definitions][search]
columns : Optional[List[str]]
which columns to return in the results
@@ -113,7 +113,7 @@ class Query(pydantic.BaseModel):
k: Optional[int] = None
# # metrics
metric: str = "L2"
metric: str = "l2"
# which columns to return in the results
columns: Optional[Union[List[str], Dict[str, str]]] = None
@@ -597,7 +597,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
if self._limit is None:
self._limit = 10
self._query = query
self._distance_type = "L2"
self._distance_type = "l2"
self._nprobes = 20
self._lower_bound = None
self._upper_bound = None
@@ -608,7 +608,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
self._str_query = str_query
self._fast_search = fast_search
def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
def metric(self, metric: Literal["l2", "cosine", "dot"]) -> LanceVectorQueryBuilder:
"""Set the distance metric to use.
This is an alias for distance_type() and may be deprecated in the future.
@@ -616,8 +616,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
Parameters
----------
metric: "L2" or "cosine" or "dot"
The distance metric to use. By default "L2" is used.
metric: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.
Returns
-------
@@ -627,7 +627,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
return self.distance_type(metric)
def distance_type(
self, distance_type: Literal["L2", "cosine", "dot"]
self, distance_type: Literal["l2", "cosine", "dot"]
) -> "LanceVectorQueryBuilder":
"""Set the distance metric to use.
@@ -641,8 +641,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
Parameters
----------
distance_type: "L2" or "cosine" or "dot"
The distance metric to use. By default "L2" is used.
distance_type: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.
Returns
-------
@@ -1414,7 +1414,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
self._ef = ef
return self
def metric(self, metric: Literal["L2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
def metric(self, metric: Literal["l2", "cosine", "dot"]) -> LanceHybridQueryBuilder:
"""Set the distance metric to use.
This is an alias for distance_type() and may be deprecated in the future.
@@ -1422,8 +1422,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
Parameters
----------
metric: "L2" or "cosine" or "dot"
The distance metric to use. By default "L2" is used.
metric: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.
Returns
-------
@@ -1433,7 +1433,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
return self.distance_type(metric)
def distance_type(
self, distance_type: Literal["L2", "cosine", "dot"]
self, distance_type: Literal["l2", "cosine", "dot"]
) -> "LanceHybridQueryBuilder":
"""Set the distance metric to use.
@@ -1447,8 +1447,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
Parameters
----------
distance_type: "L2" or "cosine" or "dot"
The distance metric to use. By default "L2" is used.
distance_type: "l2" or "cosine" or "dot"
The distance metric to use. By default "l2" is used.
Returns
-------

View File

@@ -154,7 +154,7 @@ class RemoteTable(Table):
def create_index(
self,
metric="L2",
metric="l2",
vector_column_name: str = VECTOR_COLUMN_NAME,
index_cache_size: Optional[int] = None,
num_partitions: Optional[int] = None,
@@ -170,7 +170,7 @@ class RemoteTable(Table):
Parameters
----------
metric : str
The metric to use for the index. Default is "L2".
The metric to use for the index. Default is "l2".
vector_column_name : str
The name of the vector column. Default is "vector".
@@ -193,7 +193,7 @@ class RemoteTable(Table):
... table_name, # doctest: +SKIP
... schema=schema, # doctest: +SKIP
... )
>>> table.create_index("L2", "vector") # doctest: +SKIP
>>> table.create_index("l2", "vector") # doctest: +SKIP
"""
if num_partitions is not None:

View File

@@ -577,7 +577,7 @@ class Table(ABC):
def create_index(
self,
metric="L2",
metric="l2",
num_partitions=256,
num_sub_vectors=96,
vector_column_name: str = VECTOR_COLUMN_NAME,
@@ -596,10 +596,10 @@ class Table(ABC):
Parameters
----------
metric: str, default "L2"
metric: str, default "l2"
The distance metric to use when creating the index.
Valid values are "L2", "cosine", "dot", or "hamming".
L2 is euclidean distance.
Valid values are "l2", "cosine", "dot", or "hamming".
l2 is Euclidean distance.
Hamming is available only for binary vectors.
num_partitions: int, default 256
The number of IVF partitions to use when creating the index.

View File

@@ -452,7 +452,7 @@ def test_query_builder_with_metric(table):
df_default = LanceVectorQueryBuilder(table, query, vector_column_name).to_pandas()
df_l2 = (
LanceVectorQueryBuilder(table, query, vector_column_name)
.distance_type("L2")
.distance_type("l2")
.to_pandas()
)
tm.assert_frame_equal(df_default, df_l2)

View File

@@ -480,7 +480,7 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
)
table.create_index(
metric="L2",
metric="l2",
num_partitions=256,
num_sub_vectors=96,
vector_column_name="vector",
@@ -489,7 +489,7 @@ def test_create_index_method(mock_create_index, mem_db: DBConnection):
num_bits=4,
)
expected_config = IvfPq(
distance_type="L2",
distance_type="l2",
num_partitions=256,
num_sub_vectors=96,
num_bits=4,
@@ -1237,7 +1237,7 @@ def test_hybrid_search_metric_type(tmp_db: DBConnection):
# This test uses an FTS index
pytest.importorskip("lancedb.fts")
# Need to use nonnorm as the embedding function so L2 and dot results
# Need to use nonnorm as the embedding function so l2 and dot results
# are different
table, _, _ = setup_hybrid_search_table(tmp_db, "nonnorm")