mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-21 14:00:40 +00:00
chore: upgrade Lance and rename score to _distance (#398)
BREAKING CHANGE: The `score` column has been renamed to `_distance` to more accurately describe the semantics (smaller means closer / better). --------- Co-authored-by: Lei Xu <lei@lancedb.com>
This commit is contained in:
10
Cargo.toml
10
Cargo.toml
@@ -6,11 +6,11 @@ members = [
|
||||
resolver = "2"
|
||||
|
||||
[workspace.dependencies]
|
||||
lance = "=0.5.9"
|
||||
arrow-array = "42.0"
|
||||
arrow-data = "42.0"
|
||||
arrow-schema = "42.0"
|
||||
arrow-ipc = "42.0"
|
||||
lance = "=0.6.1"
|
||||
arrow-array = "43.0"
|
||||
arrow-data = "43.0"
|
||||
arrow-schema = "43.0"
|
||||
arrow-ipc = "43.0"
|
||||
half = { "version" = "=2.2.1", default-features = false }
|
||||
object_store = "0.6.1"
|
||||
snafu = "0.7.4"
|
||||
|
||||
@@ -94,7 +94,7 @@ There are a couple of parameters that can be used to fine-tune the search:
|
||||
.to_df()
|
||||
```
|
||||
```
|
||||
vector item score
|
||||
vector item _distance
|
||||
0 [0.44949695, 0.8444449, 0.06281311, 0.23338133... item 1141 103.575333
|
||||
1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867
|
||||
```
|
||||
@@ -109,9 +109,8 @@ There are a couple of parameters that can be used to fine-tune the search:
|
||||
.execute()
|
||||
```
|
||||
|
||||
The search will return the data requested in addition to the score of each item.
|
||||
The search will return the data requested in addition to the distance of each item.
|
||||
|
||||
**Note:** The `_distance` value is the distance between the query vector and the element. A lower number means that the result is more relevant.
|
||||
|
||||
### Filtering (where clause)
|
||||
|
||||
@@ -139,7 +138,7 @@ You can select the columns returned by the query using a select clause.
|
||||
tbl.search(np.random.random((1536))).select(["vector"]).to_df()
|
||||
```
|
||||
```
|
||||
vector score
|
||||
vector _distance
|
||||
0 [0.30928212, 0.022668175, 0.1756372, 0.4911822... 93.971092
|
||||
1 [0.2525465, 0.01723831, 0.261568, 0.002007689,... 95.173485
|
||||
...
|
||||
|
||||
@@ -99,7 +99,7 @@ Output of `results`:
|
||||
id: 5,
|
||||
text: 'Banana',
|
||||
type: 'fruit',
|
||||
score: 0.4919965863227844
|
||||
_distance: 0.4919965863227844
|
||||
},
|
||||
{
|
||||
vector: Float32Array(384) [
|
||||
@@ -111,7 +111,7 @@ Output of `results`:
|
||||
id: 1,
|
||||
text: 'Cherry',
|
||||
type: 'fruit',
|
||||
score: 0.5540297031402588
|
||||
_distance: 0.5540297031402588
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
@@ -79,7 +79,7 @@ print(df)
|
||||
```
|
||||
|
||||
```
|
||||
vector item price score
|
||||
vector item price _distance
|
||||
0 [5.9, 26.5] bar 20.0 14257.05957
|
||||
```
|
||||
|
||||
|
||||
@@ -107,9 +107,9 @@ describe('LanceDB client', function () {
|
||||
const table = await con.openTable('vectors')
|
||||
const results = await table.search([0.1, 0.1]).select(['is_active']).execute()
|
||||
assert.equal(results.length, 2)
|
||||
// vector and score are always returned
|
||||
// vector and _distance are always returned
|
||||
assert.isDefined(results[0].vector)
|
||||
assert.isDefined(results[0].score)
|
||||
assert.isDefined(results[0]._distance)
|
||||
assert.isDefined(results[0].is_active)
|
||||
|
||||
assert.isUndefined(results[0].id)
|
||||
|
||||
@@ -73,8 +73,8 @@ class LanceQueryBuilder:
|
||||
... .select(["b"])
|
||||
... .limit(2)
|
||||
... .to_df())
|
||||
b vector score
|
||||
0 6 [0.4, 0.4] 0.0
|
||||
b vector _distance
|
||||
0 6 [0.4, 0.4] 0.0
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -205,7 +205,7 @@ class LanceQueryBuilder:
|
||||
"""
|
||||
Execute the query and return the results as a pandas DataFrame.
|
||||
In addition to the selected columns, LanceDB also returns a vector
|
||||
and also the "score" column which is the distance between the query
|
||||
and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vector.
|
||||
"""
|
||||
|
||||
@@ -217,7 +217,7 @@ class LanceQueryBuilder:
|
||||
[Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table).
|
||||
|
||||
In addition to the selected columns, LanceDB also returns a vector
|
||||
and also the "score" column which is the distance between the query
|
||||
and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vectors.
|
||||
"""
|
||||
vector = self._query if isinstance(self._query, list) else self._query.tolist()
|
||||
|
||||
@@ -47,10 +47,15 @@ def _sanitize_data(data, schema, on_bad_vectors, fill_value):
|
||||
if isinstance(data, dict):
|
||||
data = vec_to_table(data)
|
||||
if pd is not None and isinstance(data, pd.DataFrame):
|
||||
data = pa.Table.from_pandas(data)
|
||||
data = pa.Table.from_pandas(data, preserve_index=False)
|
||||
data = _sanitize_schema(
|
||||
data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
|
||||
)
|
||||
# Do not serialize Pandas metadata
|
||||
metadata = data.schema.metadata if data.schema.metadata is not None else {}
|
||||
metadata = {k: v for k, v in metadata.items() if k != b"pandas"}
|
||||
schema = data.schema.with_metadata(metadata)
|
||||
data = pa.Table.from_arrays(data.columns, schema=schema)
|
||||
if not isinstance(data, (pa.Table, Iterable)):
|
||||
raise TypeError(f"Unsupported data type: {type(data)}")
|
||||
return data
|
||||
@@ -85,9 +90,9 @@ class Table(ABC):
|
||||
Can query the table with [Table.search][lancedb.table.Table.search].
|
||||
|
||||
>>> table.search([0.4, 0.4]).select(["b"]).to_df()
|
||||
b vector score
|
||||
0 4 [0.5, 1.3] 0.82
|
||||
1 2 [1.1, 1.2] 1.13
|
||||
b vector _distance
|
||||
0 4 [0.5, 1.3] 0.82
|
||||
1 2 [1.1, 1.2] 1.13
|
||||
|
||||
Search queries are much faster when an index is created. See
|
||||
[Table.create_index][lancedb.table.Table.create_index].
|
||||
@@ -196,7 +201,7 @@ class Table(ABC):
|
||||
LanceQueryBuilder
|
||||
A query builder object representing the query.
|
||||
Once executed, the query returns selected columns, the vector,
|
||||
and also the "score" column which is the distance between the query
|
||||
and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vector.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
@@ -457,7 +462,7 @@ class LanceTable(Table):
|
||||
LanceQueryBuilder
|
||||
A query builder object representing the query.
|
||||
Once executed, the query returns selected columns, the vector,
|
||||
and also the "score" column which is the distance between the query
|
||||
and also the "_distance" column which is the distance between the query
|
||||
vector and the returned vector.
|
||||
"""
|
||||
if isinstance(query, str):
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
name = "lancedb"
|
||||
version = "0.1.16"
|
||||
dependencies = [
|
||||
"pylance==0.5.10",
|
||||
"pylance==0.6.1",
|
||||
"ratelimiter",
|
||||
"retry",
|
||||
"tqdm",
|
||||
|
||||
@@ -66,7 +66,7 @@ def test_search_index(tmp_path, table):
|
||||
results = ldb.fts.search_index(index, query="puppy", limit=10)
|
||||
assert len(results) == 2
|
||||
assert len(results[0]) == 10 # row_ids
|
||||
assert len(results[1]) == 10 # scores
|
||||
assert len(results[1]) == 10 # _distance
|
||||
|
||||
|
||||
def test_create_index_from_table(tmp_path, table):
|
||||
|
||||
@@ -108,11 +108,11 @@ def test_query_builder_with_metric(table):
|
||||
.limit(1)
|
||||
.to_df()
|
||||
)
|
||||
assert df_cosine.score[0] == pytest.approx(
|
||||
assert df_cosine._distance[0] == pytest.approx(
|
||||
cosine_distance(query, df_cosine.vector[0]),
|
||||
abs=1e-6,
|
||||
)
|
||||
assert 0 <= df_cosine.score[0] <= 1
|
||||
assert 0 <= df_cosine._distance[0] <= 1
|
||||
|
||||
|
||||
def test_query_builder_with_different_vector_column():
|
||||
|
||||
Reference in New Issue
Block a user