feat(python): enable polars predicate pushdown

fix hybrid search example (#922)
2025-12-25 22:29:58 +00:00 · 2024-02-03 19:33:45 -08:00 · 2024-02-03 09:26:32 +05:30
3 changed files with 17 additions and 9 deletions
--- a/docs/src/hybrid_search.md
+++ b/docs/src/hybrid_search.md
@@ -6,17 +6,24 @@ LanceDB supports both semantic and keyword-based search. In real world applicati
 You can perform hybrid search in LanceDB by combining the results of semantic and full-text search via a reranking algorithm of your choice. LanceDB provides multiple rerankers out of the box. However, you can always write a custom reranker if your use case need more sophisticated logic .

 ```python
+import os
+
 import lancedb
+import openai
 from lancedb.embeddings import get_registry
-from lancedb.pydanatic import LanceModel, Vector
+from lancedb.pydantic import LanceModel, Vector

 db = lancedb.connect("~/.lancedb")

 # Ingest embedding function in LanceDB table
+# Set the OPENAI_API_KEY environment variable before running this example,
+if "OPENAI_API_KEY" not in os.environ:
+    # or, if it is not set, assign the key directly here
+    openai.api_key = "sk-..."
 embeddings = get_registry().get("openai").create()

 class Documents(LanceModel):
-    vector: Vector(embeddings.ndims) = embeddings.VectorField()
+    vector: Vector(embeddings.ndims()) = embeddings.VectorField()
    text: str = embeddings.SourceField()

 table = db.create_table("documents", schema=Documents)
@@ -31,17 +38,19 @@ data = [
 # ingest docs with auto-vectorization
 table.add(data)

+# Create an FTS (full-text search) index before running the hybrid search
+table.create_fts_index("text")
 # hybrid search with default re-ranker
 results = table.search("flower moon", query_type="hybrid").to_pandas()
 ```

-By default, LanceDB uses `LinearCombinationReranker(weights=0.7)` to combine and rerank the results of semantic and full-text search. You can customize the hyperparameters as needed or write your own custom reranker. Here's how you can use any of the available rerankers:
+By default, LanceDB uses `LinearCombinationReranker(weight=0.7)` to combine and rerank the results of semantic and full-text search. You can customize the hyperparameters as needed or write your own custom reranker. Here's how you can use any of the available rerankers:


 ### `rerank()` arguments
 * `normalize`: `str`, default `"score"`:
    The method to normalize the scores. Can be "rank" or "score". If "rank", the scores are converted to ranks and then normalized. If "score", the scores are normalized directly.
-* `reranker`: `Reranker`, default `LinearCombinationReranker(weights=0.7)`.
+* `reranker`: `Reranker`, default `LinearCombinationReranker(weight=0.7)`.
    The reranker to use. If not specified, the default reranker is used.


@@ -55,7 +64,7 @@ This is the default re-ranker used by LanceDB. It combines the results of semant
 ```python
 from lancedb.rerankers import LinearCombinationReranker

-reranker = LinearCombinationReranker(weights=0.3) # Use 0.3 as the weight for vector search
+reranker = LinearCombinationReranker(weight=0.3) # Use 0.3 as the weight for vector search

 results = table.search("rebel", query_type="hybrid").rerank(reranker=reranker).to_pandas()
 ```
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -836,9 +836,7 @@ class LanceTable(Table):
        -------
        pl.LazyFrame
        """
-        return pl.scan_pyarrow_dataset(
-            self.to_lance(), allow_pyarrow_filter=False, batch_size=batch_size
-        )
+        return pl.scan_pyarrow_dataset(self.to_lance(), batch_size=batch_size)

    @property
    def _dataset_uri(self) -> str:
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -14,7 +14,8 @@ dependencies = [
    "pyyaml>=6.0",
    "click>=8.1.7",
    "requests>=2.31.0",
-    "overrides>=0.7"
+    "overrides>=0.7",
+    "pyarrow>=14.0"
 ]
 description = "lancedb"
 authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
Author	SHA1	Message	Date
Chang She	d1b0c8f0fe	feat(python): enable polars predicate pushdown	2024-02-03 19:33:45 -08:00
QianZhu	e412194008	fix hybrid search example (#922)	2024-02-03 09:26:32 +05:30