diff --git a/Cargo.toml b/Cargo.toml index 19440f2c..07aef08b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,11 +6,11 @@ members = [ resolver = "2" [workspace.dependencies] -lance = "=0.5.9" -arrow-array = "42.0" -arrow-data = "42.0" -arrow-schema = "42.0" -arrow-ipc = "42.0" +lance = "=0.6.1" +arrow-array = "43.0" +arrow-data = "43.0" +arrow-schema = "43.0" +arrow-ipc = "43.0" half = { "version" = "=2.2.1", default-features = false } object_store = "0.6.1" snafu = "0.7.4" diff --git a/docs/src/ann_indexes.md b/docs/src/ann_indexes.md index 1a2b4cbd..58c336e6 100644 --- a/docs/src/ann_indexes.md +++ b/docs/src/ann_indexes.md @@ -94,7 +94,7 @@ There are a couple of parameters that can be used to fine-tune the search: .to_df() ``` ``` - vector item score + vector item _distance 0 [0.44949695, 0.8444449, 0.06281311, 0.23338133... item 1141 103.575333 1 [0.48587373, 0.269207, 0.15095535, 0.65531915,... item 3953 108.393867 ``` @@ -109,9 +109,8 @@ There are a couple of parameters that can be used to fine-tune the search: .execute() ``` -The search will return the data requested in addition to the score of each item. +The search will return the data requested in addition to the distance of each item. -**Note:** The score is the distance between the query vector and the element. A lower number means that the result is more relevant. ### Filtering (where clause) @@ -139,7 +138,7 @@ You can select the columns returned by the query using a select clause. tbl.search(np.random.random((1536))).select(["vector"]).to_df() ``` ``` - vector score + vector _distance 0 [0.30928212, 0.022668175, 0.1756372, 0.4911822... 93.971092 1 [0.2525465, 0.01723831, 0.261568, 0.002007689,... 95.173485 ... 
diff --git a/docs/src/examples/transformerjs_embedding_search_nodejs.md b/docs/src/examples/transformerjs_embedding_search_nodejs.md index de5ec012..e018008f 100644 --- a/docs/src/examples/transformerjs_embedding_search_nodejs.md +++ b/docs/src/examples/transformerjs_embedding_search_nodejs.md @@ -99,7 +99,7 @@ Output of `results`: id: 5, text: 'Banana', type: 'fruit', - score: 0.4919965863227844 + _distance: 0.4919965863227844 }, { vector: Float32Array(384) [ @@ -111,7 +111,7 @@ Output of `results`: id: 1, text: 'Cherry', type: 'fruit', - score: 0.5540297031402588 + _distance: 0.5540297031402588 } ] ``` diff --git a/docs/src/python/arrow.md b/docs/src/python/arrow.md index 10dd33ae..0e8bd3ee 100644 --- a/docs/src/python/arrow.md +++ b/docs/src/python/arrow.md @@ -79,7 +79,7 @@ print(df) ``` ``` - vector item price score + vector item price _distance 0 [5.9, 26.5] bar 20.0 14257.05957 ``` diff --git a/node/src/test/test.ts b/node/src/test/test.ts index 4fb6fe36..16c81f75 100644 --- a/node/src/test/test.ts +++ b/node/src/test/test.ts @@ -107,9 +107,9 @@ describe('LanceDB client', function () { const table = await con.openTable('vectors') const results = await table.search([0.1, 0.1]).select(['is_active']).execute() assert.equal(results.length, 2) - // vector and score are always returned + // vector and _distance are always returned assert.isDefined(results[0].vector) - assert.isDefined(results[0].score) + assert.isDefined(results[0]._distance) assert.isDefined(results[0].is_active) assert.isUndefined(results[0].id) diff --git a/python/lancedb/query.py b/python/lancedb/query.py index 3092a760..29355220 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -73,8 +73,8 @@ class LanceQueryBuilder: ... .select(["b"]) ... .limit(2) ... .to_df()) - b vector score - 0 6 [0.4, 0.4] 0.0 + b vector _distance + 0 6 [0.4, 0.4] 0.0 """ def __init__( @@ -205,7 +205,7 @@ class LanceQueryBuilder: """ Execute the query and return the results as a pandas DataFrame. 
In addition to the selected columns, LanceDB also returns a vector - and also the "score" column which is the distance between the query + and also the "_distance" column which is the distance between the query vector and the returned vector. """ @@ -217,7 +217,7 @@ class LanceQueryBuilder: [Apache Arrow Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table). In addition to the selected columns, LanceDB also returns a vector - and also the "score" column which is the distance between the query + and also the "_distance" column which is the distance between the query vector and the returned vectors. """ vector = self._query if isinstance(self._query, list) else self._query.tolist() diff --git a/python/lancedb/table.py b/python/lancedb/table.py index e1586aa0..10366849 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -47,10 +47,15 @@ def _sanitize_data(data, schema, on_bad_vectors, fill_value): if isinstance(data, dict): data = vec_to_table(data) if pd is not None and isinstance(data, pd.DataFrame): - data = pa.Table.from_pandas(data) + data = pa.Table.from_pandas(data, preserve_index=False) data = _sanitize_schema( data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value ) + # Do not serialize Pandas metadata + metadata = data.schema.metadata if data.schema.metadata is not None else {} + metadata = {k: v for k, v in metadata.items() if k != b"pandas"} + schema = data.schema.with_metadata(metadata) + data = pa.Table.from_arrays(data.columns, schema=schema) if not isinstance(data, (pa.Table, Iterable)): raise TypeError(f"Unsupported data type: {type(data)}") return data @@ -85,9 +90,9 @@ class Table(ABC): Can query the table with [Table.search][lancedb.table.Table.search]. 
>>> table.search([0.4, 0.4]).select(["b"]).to_df() - b vector score - 0 4 [0.5, 1.3] 0.82 - 1 2 [1.1, 1.2] 1.13 + b vector _distance + 0 4 [0.5, 1.3] 0.82 + 1 2 [1.1, 1.2] 1.13 Search queries are much faster when an index is created. See [Table.create_index][lancedb.table.Table.create_index]. @@ -196,7 +201,7 @@ class Table(ABC): LanceQueryBuilder A query builder object representing the query. Once executed, the query returns selected columns, the vector, - and also the "score" column which is the distance between the query + and also the "_distance" column which is the distance between the query vector and the returned vector. """ raise NotImplementedError @@ -457,7 +462,7 @@ class LanceTable(Table): LanceQueryBuilder A query builder object representing the query. Once executed, the query returns selected columns, the vector, - and also the "score" column which is the distance between the query + and also the "_distance" column which is the distance between the query vector and the returned vector. 
""" if isinstance(query, str): diff --git a/python/pyproject.toml b/python/pyproject.toml index bcbacc6f..cd6af89a 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -2,7 +2,7 @@ name = "lancedb" version = "0.1.16" dependencies = [ - "pylance==0.5.10", + "pylance==0.6.1", "ratelimiter", "retry", "tqdm", diff --git a/python/tests/test_fts.py b/python/tests/test_fts.py index 99156710..48a999f9 100644 --- a/python/tests/test_fts.py +++ b/python/tests/test_fts.py @@ -66,7 +66,7 @@ def test_search_index(tmp_path, table): results = ldb.fts.search_index(index, query="puppy", limit=10) assert len(results) == 2 assert len(results[0]) == 10 # row_ids - assert len(results[1]) == 10 # scores + assert len(results[1]) == 10 # scores def test_create_index_from_table(tmp_path, table): diff --git a/python/tests/test_query.py b/python/tests/test_query.py index 21646111..b2135a56 100644 --- a/python/tests/test_query.py +++ b/python/tests/test_query.py @@ -108,11 +108,11 @@ def test_query_builder_with_metric(table): .limit(1) .to_df() ) - assert df_cosine.score[0] == pytest.approx( + assert df_cosine._distance[0] == pytest.approx( cosine_distance(query, df_cosine.vector[0]), abs=1e-6, ) - assert 0 <= df_cosine.score[0] <= 1 + assert 0 <= df_cosine._distance[0] <= 1 def test_query_builder_with_different_vector_column():