feat(python): hybrid search updates, examples, & latency benchmarks (#964)

- Rename safe_import -> attempt_import_or_raise (closes https://github.com/lancedb/lancedb/pull/923) - Update docs - Add Notebook example (@changhiskhan you can use it for the talk. Comes with "open in colab" button) - Latency benchmark & results comparison, sanity check on real-world data - Updates the default openai model to gpt-4
2026-01-06 03:42:57 +00:00 · 2024-02-13 17:58:39 +05:30
parent 1045af6c09
commit 510e8378bc
20 changed files with 1209 additions and 80 deletions
--- a/docs/src/embeddings/api.md
+++ b/docs/src/embeddings/api.md
@@ -17,6 +17,7 @@ Let's implement `SentenceTransformerEmbeddings` class. All you need to do is imp

 ```python
 from lancedb.embeddings import register
+from lancedb.util import attempt_import_or_raise

@register("sentence-transformers")
 class SentenceTransformerEmbeddings(TextEmbeddingFunction):
@@ -81,7 +82,7 @@ class OpenClipEmbeddings(EmbeddingFunction):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        open_clip = self.safe_import("open_clip", "open-clip") # EmbeddingFunction util to import external libs and raise if not found
+        open_clip = attempt_import_or_raise("open_clip", "open-clip") # lancedb.util helper that imports an external lib and raises if it is not found
        model, _, preprocess = open_clip.create_model_and_transforms(
            self.name, pretrained=self.pretrained
        )
@@ -109,14 +110,14 @@ class OpenClipEmbeddings(EmbeddingFunction):
        if isinstance(query, str):
            return [self.generate_text_embeddings(query)]
        else:
-            PIL = self.safe_import("PIL", "pillow")
+            PIL = attempt_import_or_raise("PIL", "pillow")
            if isinstance(query, PIL.Image.Image):
                return [self.generate_image_embedding(query)]
            else:
                raise TypeError("OpenClip supports str or PIL Image as query")

    def generate_text_embeddings(self, text: str) -> np.ndarray:
-        torch = self.safe_import("torch")
+        torch = attempt_import_or_raise("torch")
        text = self.sanitize_input(text)
        text = self._tokenizer(text)
        text.to(self.device)
@@ -175,7 +176,7 @@ class OpenClipEmbeddings(EmbeddingFunction):
            The image to embed. If the image is a str, it is treated as a uri.
            If the image is bytes, it is treated as the raw image bytes.
        """
-        torch = self.safe_import("torch")
+        torch = attempt_import_or_raise("torch")
        # TODO handle retry and errors for https
        image = self._to_pil(image)
        image = self._preprocess(image).unsqueeze(0)
@@ -183,7 +184,7 @@ class OpenClipEmbeddings(EmbeddingFunction):
            return self._encode_and_normalize_image(image)

    def _to_pil(self, image: Union[str, bytes]):
-        PIL = self.safe_import("PIL", "pillow")
+        PIL = attempt_import_or_raise("PIL", "pillow")
        if isinstance(image, bytes):
            return PIL.Image.open(io.BytesIO(image))
        if isinstance(image, PIL.Image.Image):
--- a/docs/src/embeddings/default_embedding_functions.md
+++ b/docs/src/embeddings/default_embedding_functions.md
@@ -9,6 +9,9 @@ Contains the text embedding functions registered by default.
 ### Sentence transformers
 Allows you to set parameters when registering a `sentence-transformers` object.

+!!! info
+    Sentence transformer embeddings are normalized by default. It is recommended to use normalized embeddings for similarity search.
+
 | Parameter | Type | Default Value | Description |
 |---|---|---|---|
 | `name` | `str` | `all-MiniLM-L6-v2` | The name of the model |
--- a/docs/src/hybrid_search/hybrid_search.md
+++ b/docs/src/hybrid_search/hybrid_search.md
@@ -69,7 +69,7 @@ reranker = LinearCombinationReranker(weight=0.3) # Use 0.3 as the weight for vec
 results = table.search("rebel", query_type="hybrid").rerank(reranker=reranker).to_pandas()
 ```

-Arguments
+### Arguments
 ----------------
 * `weight`: `float`, default `0.7`:
    The weight to use for the semantic search score. The weight for the full-text search score is `1 - weight`.
@@ -91,9 +91,9 @@ reranker = CohereReranker()
 results = table.search("vampire weekend", query_type="hybrid").rerank(reranker=reranker).to_pandas()
 ```

-Arguments
+### Arguments
 ----------------
-* `model_name`` : str, default `"rerank-english-v2.0"``
+* `model_name` : str, default `"rerank-english-v2.0"`
        The name of the cross encoder model to use. Available cohere models are:
        - rerank-english-v2.0
        - rerank-multilingual-v2.0
@@ -117,7 +117,7 @@ results = table.search("harmony hall", query_type="hybrid").rerank(reranker=rera
 ```


-Arguments
+### Arguments
 ----------------
 * `model` : str, default `"cross-encoder/ms-marco-TinyBERT-L-6"`
        The name of the cross encoder model to use. Available cross encoder models can be found [here](https://www.sbert.net/docs/pretrained_cross-encoders.html)
@@ -143,7 +143,7 @@ reranker = ColbertReranker()
 results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
 ```

-Arguments
+### Arguments
 ----------------
 * `model_name` : `str`, default `"colbert-ir/colbertv2.0"`
        The name of the cross encoder model to use.
@@ -162,7 +162,8 @@ This reranker uses the OpenAI API to combine the results of semantic and full-te
    This prompts a chat model to rerank results; it is not a dedicated reranker model. This should be treated as experimental.

 !!! Tip
-    You might run out of token limit so set the search `limits` based on your token limit.
+    - You might run out of token limit so set the search `limits` based on your token limit.
+    - It is recommended to use gpt-4-turbo-preview, the default model; older models might lead to undesired behaviour.

 ```python
 from lancedb.rerankers import OpenaiReranker
@@ -172,15 +173,15 @@ reranker = OpenaiReranker()
 results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas()
 ```

-Arguments
+### Arguments
 ----------------
-`model_name` : `str`, default `"gpt-3.5-turbo-1106"`
+* `model_name` : `str`, default `"gpt-4-turbo-preview"`
    The name of the OpenAI chat model to use.
-`column` : `str`, default `"text"`
+* `column` : `str`, default `"text"`
    The name of the column to use as input to the cross encoder model.
-`return_score` : `str`, default `"relevance"`
+* `return_score` : `str`, default `"relevance"`
    options are "relevance" or "all". Only "relevance" is supported for now.
-`api_key` : `str`, default `None`
+* `api_key` : `str`, default `None`
    The API key to use. If None, will use the OPENAI_API_KEY environment variable.


@@ -212,24 +213,30 @@ class MyReranker(Reranker):

 ```

-You can also accept additional arguments like a filter along with fts and vector search results
+### Example of a Custom Reranker
+For the sake of simplicity, let's build a custom reranker that just enhances the Cohere Reranker by accepting a filter query, and accepts other CohereReranker params as kwargs.

 ```python

-from lancedb.rerankers import Reranker
-import pyarrow as pa
+from typing import List, Union
+import pandas as pd
+from lancedb.rerankers import CohereReranker

-class MyReranker(Reranker):
-    ...
-    
-    def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table, filter: str):
-        # Use the built-in merging function
-        combined_result = self.merge_results(vector_results, fts_results)
-        
-        # Do something with the combined results & filter
-        # ...
+class MofidifiedCohereReranker(CohereReranker):
+    def __init__(self, filters: Union[str, List[str]], **kwargs):
+        super().__init__(**kwargs)
+        filters = filters if isinstance(filters, list) else [filters]
+        self.filters = filters

-        # Return the combined results
-        return combined_result
+    def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table)-> pa.Table:
+        combined_result = super().rerank_hybrid(query, vector_results, fts_results)
+        df = combined_result.to_pandas()
+        for filter in self.filters:
+            df = df.query("not text.str.contains(@filter)")
+
+        return pa.Table.from_pandas(df)

 ```
+
+!!! tip
+    The `vector_results` and `fts_results` are pyarrow tables. You can convert them to pandas dataframes using the `to_pandas()` method and perform any operations you want. After you are done, you can convert the dataframe back to a pyarrow table using the `pa.Table.from_pandas()` method and return it.
--- a/docs/src/notebooks/hybrid_search.ipynb
+++ b/docs/src/notebooks/hybrid_search.ipynb