From eb31d95fef6814f534ce78d6f93d2344363808d4 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Tue, 13 Feb 2024 17:58:39 +0530 Subject: [PATCH] feat(python): hybrid search updates, examples, & latency benchmarks (#964) - Rename safe_import -> attempt_import_or_raise (closes https://github.com/lancedb/lancedb/pull/923) - Update docs - Add Notebook example (@changhiskhan you can use it for the talk. Comes with "open in colab" button) - Latency benchmark & results comparison, sanity check on real-world data - Updates the default openai model to gpt-4 --- docs/mkdocs.yml | 8 +- docs/src/embeddings/api.md | 11 +- .../embeddings/default_embedding_functions.md | 3 + docs/src/{ => hybrid_search}/hybrid_search.md | 57 +- docs/src/notebooks/hybrid_search.ipynb | 1122 +++++++++++++++++ docs/test/md_testing.py | 2 +- python/lancedb/embeddings/base.py | 20 - python/lancedb/embeddings/bedrock.py | 5 +- python/lancedb/embeddings/cohere.py | 3 +- python/lancedb/embeddings/gemini_text.py | 3 +- python/lancedb/embeddings/gte.py | 3 +- python/lancedb/embeddings/instructor.py | 5 +- python/lancedb/embeddings/open_clip.py | 11 +- python/lancedb/embeddings/openai.py | 3 +- .../embeddings/sentence_transformers.py | 3 +- python/lancedb/rerankers/cohere.py | 4 +- python/lancedb/rerankers/colbert.py | 8 +- python/lancedb/rerankers/cross_encoder.py | 6 +- python/lancedb/rerankers/openai.py | 10 +- python/lancedb/util.py | 2 +- 20 files changed, 1209 insertions(+), 80 deletions(-) rename docs/src/{ => hybrid_search}/hybrid_search.md (85%) create mode 100644 docs/src/notebooks/hybrid_search.ipynb diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index b2cbe9eb..b517117a 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -90,7 +90,9 @@ nav: - Building an ANN index: ann_indexes.md - Vector Search: search.md - Full-text search: fts.md - - Hybrid search: hybrid_search.md + - Hybrid search: + - hybrid_search/hybrid_search.md + - AirBNB financial data example: notebooks/hybrid_search.ipynb - 
Filtering: sql.md - Versioning & Reproducibility: notebooks/reproducibility.ipynb - Configuring Storage: guides/storage.md @@ -152,7 +154,9 @@ nav: - Building an ANN index: ann_indexes.md - Vector Search: search.md - Full-text search: fts.md - - Hybrid search: hybrid_search.md + - Hybrid search: + - hybrid_search/hybrid_search.md + - AirBNB financial data example: notebooks/hybrid_search.ipynb - Filtering: sql.md - Versioning & Reproducibility: notebooks/reproducibility.ipynb - Configuring Storage: guides/storage.md diff --git a/docs/src/embeddings/api.md b/docs/src/embeddings/api.md index e91b46e5..b306640d 100644 --- a/docs/src/embeddings/api.md +++ b/docs/src/embeddings/api.md @@ -17,6 +17,7 @@ Let's implement `SentenceTransformerEmbeddings` class. All you need to do is imp ```python from lancedb.embeddings import register +from lancedb.util import attempt_import_or_raise @register("sentence-transformers") class SentenceTransformerEmbeddings(TextEmbeddingFunction): @@ -81,7 +82,7 @@ class OpenClipEmbeddings(EmbeddingFunction): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - open_clip = self.safe_import("open_clip", "open-clip") # EmbeddingFunction util to import external libs and raise if not found + open_clip = attempt_import_or_raise("open_clip", "open-clip") # EmbeddingFunction util to import external libs and raise if not found model, _, preprocess = open_clip.create_model_and_transforms( self.name, pretrained=self.pretrained ) @@ -109,14 +110,14 @@ class OpenClipEmbeddings(EmbeddingFunction): if isinstance(query, str): return [self.generate_text_embeddings(query)] else: - PIL = self.safe_import("PIL", "pillow") + PIL = attempt_import_or_raise("PIL", "pillow") if isinstance(query, PIL.Image.Image): return [self.generate_image_embedding(query)] else: raise TypeError("OpenClip supports str or PIL Image as query") def generate_text_embeddings(self, text: str) -> np.ndarray: - torch = self.safe_import("torch") + torch = 
attempt_import_or_raise("torch") text = self.sanitize_input(text) text = self._tokenizer(text) text.to(self.device) @@ -175,7 +176,7 @@ class OpenClipEmbeddings(EmbeddingFunction): The image to embed. If the image is a str, it is treated as a uri. If the image is bytes, it is treated as the raw image bytes. """ - torch = self.safe_import("torch") + torch = attempt_import_or_raise("torch") # TODO handle retry and errors for https image = self._to_pil(image) image = self._preprocess(image).unsqueeze(0) @@ -183,7 +184,7 @@ class OpenClipEmbeddings(EmbeddingFunction): return self._encode_and_normalize_image(image) def _to_pil(self, image: Union[str, bytes]): - PIL = self.safe_import("PIL", "pillow") + PIL = attempt_import_or_raise("PIL", "pillow") if isinstance(image, bytes): return PIL.Image.open(io.BytesIO(image)) if isinstance(image, PIL.Image.Image): diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md index 7b5446c4..7d0125d7 100644 --- a/docs/src/embeddings/default_embedding_functions.md +++ b/docs/src/embeddings/default_embedding_functions.md @@ -9,6 +9,9 @@ Contains the text embedding functions registered by default. ### Sentence transformers Allows you to set parameters when registering a `sentence-transformers` object. +!!! info + Sentence transformer embeddings are normalized by default. It is recommended to use normalized embeddings for similarity search. 
+ | Parameter | Type | Default Value | Description | |---|---|---|---| | `name` | `str` | `all-MiniLM-L6-v2` | The name of the model | diff --git a/docs/src/hybrid_search.md b/docs/src/hybrid_search/hybrid_search.md similarity index 85% rename from docs/src/hybrid_search.md rename to docs/src/hybrid_search/hybrid_search.md index c6d26656..4a0440a7 100644 --- a/docs/src/hybrid_search.md +++ b/docs/src/hybrid_search/hybrid_search.md @@ -69,7 +69,7 @@ reranker = LinearCombinationReranker(weight=0.3) # Use 0.3 as the weight for vec results = table.search("rebel", query_type="hybrid").rerank(reranker=reranker).to_pandas() ``` -Arguments +### Arguments ---------------- * `weight`: `float`, default `0.7`: The weight to use for the semantic search score. The weight for the full-text search score is `1 - weights`. @@ -91,9 +91,9 @@ reranker = CohereReranker() results = table.search("vampire weekend", query_type="hybrid").rerank(reranker=reranker).to_pandas() ``` -Arguments +### Arguments ---------------- -* `model_name`` : str, default `"rerank-english-v2.0"`` +* `model_name` : str, default `"rerank-english-v2.0"` The name of the cross encoder model to use. Available cohere models are: - rerank-english-v2.0 - rerank-multilingual-v2.0 @@ -117,7 +117,7 @@ results = table.search("harmony hall", query_type="hybrid").rerank(reranker=rera ``` -Arguments +### Arguments ---------------- * `model` : str, default `"cross-encoder/ms-marco-TinyBERT-L-6"` The name of the cross encoder model to use. Available cross encoder models can be found [here](https://www.sbert.net/docs/pretrained_cross-encoders.html) @@ -143,7 +143,7 @@ reranker = ColbertReranker() results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas() ``` -Arguments +### Arguments ---------------- * `model_name` : `str`, default `"colbert-ir/colbertv2.0"` The name of the cross encoder model to use. 
@@ -162,7 +162,8 @@ This reranker uses the OpenAI API to combine the results of semantic and full-te This prompts chat model to rerank results which is not a dedicated reranker model. This should be treated as experimental. !!! Tip - You might run out of token limit so set the search `limits` based on your token limit. + - You might run out of token limit so set the search `limits` based on your token limit. + - It is recommended to use gpt-4-turbo-preview, the default model; older models might lead to undesired behaviour ```python from lancedb.rerankers import OpenaiReranker @@ -172,15 +173,15 @@ reranker = OpenaiReranker() results = table.search("harmony hall", query_type="hybrid").rerank(reranker=reranker).to_pandas() ``` -Arguments +### Arguments ---------------- -`model_name` : `str`, default `"gpt-3.5-turbo-1106"` +* `model_name` : `str`, default `"gpt-4-turbo-preview"` The name of the cross encoder model to use. -`column` : `str`, default `"text"` +* `column` : `str`, default `"text"` The name of the column to use as input to the cross encoder model. -`return_score` : `str`, default `"relevance"` +* `return_score` : `str`, default `"relevance"` options are "relevance" or "all". Only "relevance" is supported for now. -`api_key` : `str`, default `None` +* `api_key` : `str`, default `None` The API key to use. If None, will use the OPENAI_API_KEY environment variable. @@ -212,24 +213,30 @@ class MyReranker(Reranker): ``` -You can also accept additional arguments like a filter along with fts and vector search results +### Example of a Custom Reranker +For the sake of simplicity, let's build a custom reranker that just enhances the Cohere Reranker by accepting a filter query, and accepts other CohereReranker params as kwargs. ```python -from lancedb.rerankers import Reranker -import pyarrow as pa +from typing import List, Union +import pandas as pd +from lancedb.rerankers import CohereReranker -class MyReranker(Reranker): - ... 
- - def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table, filter: str): - # Use the built-in merging function - combined_result = self.merge_results(vector_results, fts_results) - - # Do something with the combined results & filter - # ... +class MofidifiedCohereReranker(CohereReranker): + def __init__(self, filters: Union[str, List[str]], **kwargs): + super().__init__(**kwargs) + filters = filters if isinstance(filters, list) else [filters] + self.filters = filters - # Return the combined results - return combined_result + def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table)-> pa.Table: + combined_result = super().rerank_hybrid(query, vector_results, fts_results) + df = combined_result.to_pandas() + for filter in self.filters: + df = df.query("not text.str.contains(@filter)") + + return pa.Table.from_pandas(df) ``` + +!!! tip + The `vector_results` and `fts_results` are pyarrow tables. You can convert them to pandas dataframes using the `to_pandas()` method and perform any operations you want. After you are done, you can convert the dataframe back to a pyarrow table using the `pa.Table.from_pandas()` method and return it. diff --git a/docs/src/notebooks/hybrid_search.ipynb b/docs/src/notebooks/hybrid_search.ipynb new file mode 100644 index 00000000..08869e17 --- /dev/null +++ b/docs/src/notebooks/hybrid_search.ipynb @@ -0,0 +1,1122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0daef1cd-9130-46b8-8eb8-1b721860e239", + "metadata": {}, + "source": [ + "# [Example] AirBNB financial data search\n", + "\n", + "\"Open \n", + "\n", + "LanceDB supports both semantic and keyword-based search. In real world applications, it is often useful to combine these two approaches to get the best results. For example, you may want to search for a document that is semantically similar to a query document, but also contains a specific keyword. 
This is an example of hybrid search, a search algorithm that combines multiple search techniques.\n", + "\n", + "Let's get started with an example. In this notebook we'll use an AirBNB financial data document to search for \"the specific reasons for higher operating costs\" in that particular year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "819fa612", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup\n", + "!pip install lancedb pandas langchain langchain-community" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b6864d97-7f85-4d9c-bf05-e9cf9db29e81", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "# Set your OpenAI API key\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cfce9804-cd1c-48c3-acd2-e74eb4e290c7", + "metadata": {}, + "outputs": [], + "source": [ + "def pretty_print(docs):\n", + " for doc in docs:\n", + " print(doc + \"\\n\\n\") " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "efb22cec-5a06-46ac-91c3-53f9b9090109", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PyPDFLoader\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "\n", + "# Load $ABNB's financial report. 
This may take 1-2 minutes since the PDF is large\n", + "sec_filing_pdf = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001559720/8a9ebed0-815a-469a-87eb-1767d21d8cec.pdf\"\n", + "\n", + "# Create your PDF loader\n", + "loader = PyPDFLoader(sec_filing_pdf)\n", + "\n", + "# Load the PDF document\n", + "documents = loader.load()\n", + "\n", + "# Chunk the financial report\n", + "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d3c5ce69-0f75-44cb-9e49-9be665fc156e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[2024-02-12T20:00:04Z WARN lance::dataset] No existing dataset at /Users/ayushchaurasia/langchain/airbnb.lance, it will be created\n" + ] + } + ], + "source": [ + "from langchain_community.vectorstores import LanceDB\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "import lancedb\n", + "from lancedb.embeddings import get_registry\n", + "from lancedb.pydantic import Vector, LanceModel\n", + "\n", + "openai = get_registry().get(\"openai\").create()\n", + "\n", + "class Schema(LanceModel):\n", + " text: str = openai.SourceField()\n", + " vector: Vector(openai.ndims()) = openai.VectorField()\n", + "\n", + "embedding_function = OpenAIEmbeddings()\n", + "\n", + "db = lancedb.connect(\"~/langchain\")\n", + "table = db.create_table(\n", + " \"airbnb\",\n", + " schema=Schema,\n", + " mode=\"overwrite\",\n", + ")\n", + "\n", + "# Load the document into LanceDB\n", + "db = LanceDB.from_documents(docs, embedding_function, connection=table)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4284e67e-3a39-4486-a060-11a18f7c0e1f", + "metadata": {}, + "outputs": [], + "source": [ + "table.create_fts_index(\"text\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d959a80f-d568-48f4-9d14-7367bcc1ce8d", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textvector
0Table of Contents\\nUNITED STATES\\nSECURITIES A...[-0.003405824, -0.03212391, 0.012812538, -0.02...
1Class A common stock, par value $0.0001 per sh...[-0.019193485, -0.02273649, 0.009623382, -0.02...
2this chapter) during the preceding 12 months (...[-0.020692078, -0.016187502, -0.008877442, -0....
3Indicate by check mark whether the registrant ...[-0.019304628, -0.0034501317, -0.011525051, -0...
4As of June 30, 2022, the aggregate market valu...[-0.014594535, -0.011274607, -0.007967828, -0....
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 Table of Contents\\nUNITED STATES\\nSECURITIES A... \n", + "1 Class A common stock, par value $0.0001 per sh... \n", + "2 this chapter) during the preceding 12 months (... \n", + "3 Indicate by check mark whether the registrant ... \n", + "4 As of June 30, 2022, the aggregate market valu... \n", + "\n", + " vector \n", + "0 [-0.003405824, -0.03212391, 0.012812538, -0.02... \n", + "1 [-0.019193485, -0.02273649, 0.009623382, -0.02... \n", + "2 [-0.020692078, -0.016187502, -0.008877442, -0.... \n", + "3 [-0.019304628, -0.0034501317, -0.011525051, -0... \n", + "4 [-0.014594535, -0.011274607, -0.007967828, -0.... " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.to_pandas().head()" + ] + }, + { + "cell_type": "markdown", + "id": "667f4e4a-6ff1-4f1c-ad57-4a2a8b036670", + "metadata": {}, + "source": [ + "## Vector Search\n", + "\n", + "avg latency - `3.48 ms ± 71.6 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)`" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "8a5ab2de-6d75-4785-b838-ed6a825dfa6e", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"What are the specific factors contributing to Airbnb's increased operational expenses in the last fiscal year?\"\n", + "docs = table.search(query).limit(5).to_pandas()[\"text\"].to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5423d333-0f6d-4951-ab3f-6941ad30ba8a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatened\n", + "enforcement of laws and regulations, including short-term occupancy and tax laws; private groups, such as homeowners, landlords, and condominium and neighborhood\n", + "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", + "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", + "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", + "\n", + "\n", + "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", + "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", + "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch 
of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", + "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", + "General and Administrative\n", + "2021 2022 % Change\n", + "(in millions, except percentages)\n", + "General and administrative $ 836 $ 950 14 %\n", + "Percentage of revenue 14 % 11 %\n", + "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", + "\n", + "\n", + "Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. Our ability to attract and retain guests could be materially\n", + "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", + "•events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", + "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-related\n", + "events, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", + "climate change on seasonal destinations;\n", + "•political, social, or economic instability;\n", + "•Hosts failing to meet guests’ expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;\n", + "\n", + "\n", + "Table of Contents\n", + "Airbnb, Inc.\n", + "Consolidated Statements of Operations\n", + "(in millions, except per share amounts)\n", + "Year Ended December 31,\n", + "2020 2021 2022\n", + "Revenue $ 3,378 $ 5,992 $ 8,399 \n", + "Costs and expenses:\n", + "Cost of revenue 876 1,156 1,499 \n", + "Operations and support 878 847 1,041 \n", + "Product 
development 2,753 1,425 1,502 \n", + "Sales and marketing 1,175 1,186 1,516 \n", + "General and administrative 1,135 836 950 \n", + "Restructuring charges 151 113 89 \n", + "Total costs and expenses 6,968 5,563 6,597 \n", + "Income (loss) from operations (3,590) 429 1,802 \n", + "Interest income 27 13 186 \n", + "Interest expense (172) (438) (24)\n", + "Other income (expense), net (947) (304) 25 \n", + "Income (loss) before income taxes (4,682) (300) 1,989 \n", + "Provision for (benefit from) income taxes (97) 52 96 \n", + "Net income (loss) $ (4,585)$ (352)$ 1,893 \n", + "Net income (loss) per share attributable to Class A and Class B common stockholders:\n", + "Basic $ (16.12)$ (0.57)$ 2.97 \n", + "Diluted $ (16.12)$ (0.57)$ 2.79\n", + "\n", + "\n", + "Our future revenue growth depends on the growth of supply and demand for listings on our platform, and our business is affected by general economic and business conditions\n", + "worldwide as well as trends in the global travel and hospitality industries and the short and long-term accommodation regulatory landscape. In addition, we believe that our revenue\n", + "growth depends upon a number of factors, including:\n", + "•global macroeconomic conditions, including inflation and rising interest rates and recessionary concerns;\n", + "•our ability to retain and grow the number of guests and Nights and Experiences Booked;\n", + "•our ability to retain and grow the number of Hosts and the number of available listings on our platform;\n", + "•events beyond our control such as pandemics and other health concerns, restrictions on travel and immigration, political, social or economic instability, including international\n", + "\n", + "\n" + ] + } + ], + "source": [ + "pretty_print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "8b0150fe-00dc-4aa0-9c8f-33cbf2ed5ac6", + "metadata": {}, + "source": [ + "## Hybrid Search\n", + "LanceDB support hybrid search with custom Rerankers. 
Here's the summary of latency numbers of some of the Reranking methods available\n", + "![1_yWDh0Klw8Upsw1V54kkkdQ](https://github.com/AyushExel/assets/assets/15766192/a515fbf7-0553-437e-899e-67691eae3fef)\n", + "\n", + "Let us now perform hybrid search by combining vector and FTS search results. First, we'll cover the default Reranker.\n", + "\n", + "### Linear Combination Reranker\n", + "`LinearCombinationReranker(weight=0.7)` is used as the default reranker for reranking the hybrid search results if the reranker isn't specified explicitly.\n", + "The `weight` param controls the weightage provided to vector search score. The weight of `1-weight` is applied to FTS scores when reranking.\n", + "\n", + "Latency - `71 ms ± 25.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)`" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d2aa5893-30c4-4beb-9dae-a55665bd82c7", + "metadata": {}, + "outputs": [], + "source": [ + "docs = table.search(query, query_type=\"hybrid\").limit(5).to_pandas()[\"text\"].to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8d6a99c3-92ef-4677-96bb-9b54a11a79fe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatened\n", + "enforcement of laws and regulations, including short-term occupancy and tax laws; private groups, such as homeowners, landlords, and condominium and neighborhood\n", + "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", + "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", + "and 
safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", + "\n", + "\n", + "(a) The Borrower may, at its election, deliver a Pricing Certificate to the Administrative Agent in respect of the most recently\n", + "ended fiscal year, commencing with the fiscal year ended December 31, 2022, on any date prior to the date that is 270 days following the last\n", + "day of such fiscal year (the\n", + "-50-\n", + "\n", + "\n", + "“Initial Delivery Date”); provided that the Pricing Certificate for any fiscal year may be delivered on any date following the Initial Delivery\n", + "Date that is prior to the date that is 365 days following the last day of the preceding fiscal year, so long as such Pricing Certificate includes a\n", + "certification that delivery of such Pricing Certificate on or before the Initial Delivery Date was not possible because (i) the information\n", + "required to calculate the KPI Metrics for such preceding fiscal year was not available at such time or (ii) the report of the KPI Metrics Auditor,\n", + "if relevant, was not available at such time (the date of the Administrative Agent’s receipt thereof, each a “Pricing Certificate Date”). Upon\n", + "delivery of a Pricing Certificate in respect of a fiscal year, (i) the Applicable Rate for the Loans incurred by the Borrower shall be increased or\n", + "decreased (or neither increased nor decreased), as applicable, pursuant to the Sustainability Margin Adjustment as set forth in the KPI Metrics\n", + "\n", + "\n", + "including such Sustainability Pricing Adjustment Date and ending on the date immediately preceding the next Sustainability Pricing\n", + "Adjustment Date.\n", + "(b) For the avoidance of doubt, only one Pricing Certificate may be delivered in respect of any fiscal year. 
It is further understood\n", + "and agreed that the Applicable Rate for Loans incurred by the Borrower will never be reduced or increased by more than 0.050% and that the\n", + "Applicable Rate for the Revolving Commitment Fee will never be reduced or increased by more than 0.010%, pursuant to the Sustainability\n", + "Margin Adjustment and the Sustainability Fee Adjustment, respectively, on any Sustainability Pricing Adjustment Date. For the avoidance of\n", + "doubt, any adjustment to the Applicable Rate for such Loans or such Revolving Commitment Fee by reason of meeting one or both KPI\n", + "Metrics in any fiscal year shall not be cumulative year-over-year. The adjustments pursuant to this Section made on any Sustainability Pricing\n", + "\n", + "\n", + "Adjustment Date shall only apply for the period until the date immediately preceding the next Sustainability Pricing Adjustment Date.\n", + "(c) If, for any fiscal year, either (i) no Pricing Certificate shall have been delivered for such fiscal year or (ii) the Pricing\n", + "Certificate delivered for such fiscal year shall fail to include the Diverse Supplier Spend Percentage or GHG Emissions Intensity for such\n", + "fiscal year, then the Sustainability Margin Adjustment will be positive 0.050% and/or the Sustainability Fee Adjustment will be positive\n", + "0.010%, as applicable, in each case commencing on the last day such Pricing Certificate could have been delivered in accordance with the\n", + "terms of clause (a) above (it being understood that, in the case of the foregoing clause (ii), the Sustainability Margin Adjustment or the\n", + "Sustainability Fee Adjustment will be determined in accordance with such Pricing Certificate to the extent the (A) Sustainability Margin\n", + "\n", + "\n" + ] + } + ], + "source": [ + "pretty_print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "c4d3e0f3-8d96-47f5-ad1d-514475f1ae55", + "metadata": {}, + "source": [ + "### Cohere Reranker\n", + "This uses 
Cohere's Reranking API to re-rank the results. It accepts the reranking model name as a parameter. By Default it uses the english-v3 model but you can easily switch to a multi-lingual model.\n", + "\n", + "latency - `605 ms ± 78.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)`" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ce2c43c7-1a96-4856-ad9b-28385164f187", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], + "source": [ + "# Free API key\n", + "os.environ[\"COHERE_API_KEY\"] = getpass.getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "4adbb3f1-4d21-427b-9bf0-3d7bebf68cf6", + "metadata": {}, + "outputs": [], + "source": [ + "from lancedb.rerankers import CohereReranker\n", + "\n", + "reranker = CohereReranker()\n", + "docs = table.search(query, query_type=\"hybrid\").limit(5).rerank(reranker=reranker).to_pandas()[\"text\"].to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a071b3e7-3b8b-42e4-a089-4d6c4094873f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Increased operating expenses, decreased revenue, negative publicity, negative reaction from our Hosts and guests and other stakeholders, or other adverse impacts from any of the\n", + "above factors or other risks related to our international operations could materially adversely affect our brand, reputation, business, results of operations, and financial condition.\n", + "In addition, we will continue to incur significant expenses to operate our outbound business in China, and we may never achieve profitability in that market. These factors, combined\n", + "with sentiment of the workforce in China, and China’s policy towards foreign direct investment may particularly impact our operations in China. 
In addition, we need to ensure that\n", + "our business practices in China are compliant with local laws and regulations, which may be interpreted and enforced in ways that are different from our interpretation, and/or create\n", + "\n", + "\n", + "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", + "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", + "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", + "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", + "General and Administrative\n", + "2021 2022 % Change\n", + "(in millions, except percentages)\n", + "General and administrative $ 836 $ 950 14 %\n", + "Percentage of revenue 14 % 11 %\n", + "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", + "\n", + "\n", + "Table of Contents\n", + "Airbnb, Inc.\n", + "Consolidated Statements of Operations\n", + "(in millions, except per share amounts)\n", + "Year Ended December 31,\n", + "2020 2021 2022\n", + "Revenue $ 3,378 $ 5,992 $ 8,399 \n", + "Costs and expenses:\n", + "Cost of revenue 876 1,156 1,499 \n", + "Operations and support 878 847 1,041 \n", + "Product development 2,753 1,425 1,502 \n", + "Sales and marketing 1,175 1,186 1,516 \n", + "General and administrative 1,135 836 950 \n", + "Restructuring charges 151 113 89 \n", + "Total costs and expenses 6,968 5,563 6,597 \n", + "Income (loss) from operations (3,590) 429 1,802 \n", + "Interest income 27 13 186 \n", + "Interest expense (172) (438) (24)\n", + "Other 
income (expense), net (947) (304) 25 \n", + "Income (loss) before income taxes (4,682) (300) 1,989 \n", + "Provision for (benefit from) income taxes (97) 52 96 \n", + "Net income (loss) $ (4,585)$ (352)$ 1,893 \n", + "Net income (loss) per share attributable to Class A and Class B common stockholders:\n", + "Basic $ (16.12)$ (0.57)$ 2.97 \n", + "Diluted $ (16.12)$ (0.57)$ 2.79\n", + "\n", + "\n", + "Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. Our ability to attract and retain guests could be materially\n", + "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", + "•events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", + "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-related\n", + "events, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", + "climate change on seasonal destinations;\n", + "•political, social, or economic instability;\n", + "•Hosts failing to meet guests’ expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;\n", + "\n", + "\n", + "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatened\n", + "enforcement of laws and regulations, including short-term occupancy and tax laws; private groups, such as homeowners, landlords, and condominium and neighborhood\n", + "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or 
otherwise restrict\n", + "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", + "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", + "\n", + "\n" + ] + } + ], + "source": [ + "pretty_print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "6630f0c0-6070-4ea7-a191-99092e69ca05", + "metadata": {}, + "source": [ + "Relevance score is returned by Cohere API and is independent of individual FTS and vector search scores." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "80dc61bb-929c-4fbb-b2cb-20c5d31bc65c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textvector_relevance_score
0Increased operating expenses, decreased revenu...[0.0034929817, -0.024774546, 0.012623285, -0.0...0.985328
1Made Possible by Hosts, Strangers, AirCover, C...[-0.0042489874, -0.005382498, 0.007190078, -0....0.979036
2Table of Contents\\nAirbnb, Inc.\\nConsolidated ...[-0.008569201, -0.019810658, 0.014144964, -0.0...0.696578
3Our success depends significantly on existing ...[0.0027109187, -0.028220002, 0.022864284, -0.0...0.539923
4In addition, the number of listings on Airbnb ...[0.0068983347, -0.0147690065, 0.042441186, -0....0.460713
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 Increased operating expenses, decreased revenu... \n", + "1 Made Possible by Hosts, Strangers, AirCover, C... \n", + "2 Table of Contents\\nAirbnb, Inc.\\nConsolidated ... \n", + "3 Our success depends significantly on existing ... \n", + "4 In addition, the number of listings on Airbnb ... \n", + "\n", + " vector _relevance_score \n", + "0 [0.0034929817, -0.024774546, 0.012623285, -0.0... 0.985328 \n", + "1 [-0.0042489874, -0.005382498, 0.007190078, -0.... 0.979036 \n", + "2 [-0.008569201, -0.019810658, 0.014144964, -0.0... 0.696578 \n", + "3 [0.0027109187, -0.028220002, 0.022864284, -0.0... 0.539923 \n", + "4 [0.0068983347, -0.0147690065, 0.042441186, -0.... 0.460713 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.search(query, query_type=\"hybrid\").limit(5).rerank(reranker=reranker).to_pandas()" + ] + }, + { + "cell_type": "markdown", + "id": "41147a46-7ef8-4266-9cec-08a992697de2", + "metadata": {}, + "source": [ + "### ColBERT Reranker\n", + "ColBERT Reranker is powered by the ColBERT model. It runs locally using the Hugging Face implementation.\n", + "\n", + "Latency - `950 ms ± 5.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)`\n", + "\n", + "Note: First query might be slow. It is recommended to reuse the `Reranker` objects as the models are cached. 
Subsequent runs will be faster on reusing the same reranker object" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "91b06b43-c971-4177-b62f-f941bbbc2ef4", + "metadata": {}, + "outputs": [], + "source": [ + "from lancedb.rerankers import ColbertReranker\n", + "\n", + "reranker = ColbertReranker()\n", + "docs = table.search(query, query_type=\"hybrid\").limit(5).rerank(reranker=reranker).to_pandas()[\"text\"].to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "e42c46bd-7cdd-4d31-9dbb-ddd1bdf979fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", + "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", + "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", + "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", + "General and Administrative\n", + "2021 2022 % Change\n", + "(in millions, except percentages)\n", + "General and administrative $ 836 $ 950 14 %\n", + "Percentage of revenue 14 % 11 %\n", + "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", + "\n", + "\n", + "Our future revenue growth depends on the growth of supply and demand for listings on our platform, and our business is affected by general economic and business conditions\n", + "worldwide as well as trends in the global travel and hospitality industries and the short and long-term accommodation regulatory 
landscape. In addition, we believe that our revenue\n", + "growth depends upon a number of factors, including:\n", + "•global macroeconomic conditions, including inflation and rising interest rates and recessionary concerns;\n", + "•our ability to retain and grow the number of guests and Nights and Experiences Booked;\n", + "•our ability to retain and grow the number of Hosts and the number of available listings on our platform;\n", + "•events beyond our control such as pandemics and other health concerns, restrictions on travel and immigration, political, social or economic instability, including international\n", + "\n", + "\n", + "Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. Our ability to attract and retain guests could be materially\n", + "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", + "•events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", + "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-related\n", + "events, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", + "climate change on seasonal destinations;\n", + "•political, social, or economic instability;\n", + "•Hosts failing to meet guests’ expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;\n", + "\n", + "\n", + "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatened\n", + "enforcement of laws and regulations, including short-term occupancy and tax laws; private groups, 
such as homeowners, landlords, and condominium and neighborhood\n", + "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", + "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", + "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", + "\n", + "\n", + "Table of Contents\n", + "Airbnb, Inc.\n", + "Consolidated Statements of Operations\n", + "(in millions, except per share amounts)\n", + "Year Ended December 31,\n", + "2020 2021 2022\n", + "Revenue $ 3,378 $ 5,992 $ 8,399 \n", + "Costs and expenses:\n", + "Cost of revenue 876 1,156 1,499 \n", + "Operations and support 878 847 1,041 \n", + "Product development 2,753 1,425 1,502 \n", + "Sales and marketing 1,175 1,186 1,516 \n", + "General and administrative 1,135 836 950 \n", + "Restructuring charges 151 113 89 \n", + "Total costs and expenses 6,968 5,563 6,597 \n", + "Income (loss) from operations (3,590) 429 1,802 \n", + "Interest income 27 13 186 \n", + "Interest expense (172) (438) (24)\n", + "Other income (expense), net (947) (304) 25 \n", + "Income (loss) before income taxes (4,682) (300) 1,989 \n", + "Provision for (benefit from) income taxes (97) 52 96 \n", + "Net income (loss) $ (4,585)$ (352)$ 1,893 \n", + "Net income (loss) per share attributable to Class A and Class B common stockholders:\n", + "Basic $ (16.12)$ (0.57)$ 2.97 \n", + "Diluted $ (16.12)$ (0.57)$ 2.79\n", + "\n", + "\n" + ] + } + ], + "source": [ + "pretty_print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "2ba9bc9a-29b0-4faa-b74d-a32af105ed45", + "metadata": {}, + "source": [ + "### Cross Encoder Reranker\n", + "Uses cross encoder models 
as rerankers. Uses the sentence transformer implementation locally\n", + "\n", + "Latency - `1.38 s ± 64.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)`" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4b9ea674-c8c6-498a-a3cf-9b7fa9cb7334", + "metadata": {}, + "outputs": [], + "source": [ + "from lancedb.rerankers import CrossEncoderReranker\n", + "\n", + "reranker=CrossEncoderReranker()\n", + "docs = table.search(query, query_type=\"hybrid\").limit(5).rerank(reranker=reranker).to_pandas()[\"text\"].to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6fe32845-17f1-4977-9bd5-c18528b84656", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table of Contents\n", + "Airbnb, Inc.\n", + "Consolidated Statements of Operations\n", + "(in millions, except per share amounts)\n", + "Year Ended December 31,\n", + "2020 2021 2022\n", + "Revenue $ 3,378 $ 5,992 $ 8,399 \n", + "Costs and expenses:\n", + "Cost of revenue 876 1,156 1,499 \n", + "Operations and support 878 847 1,041 \n", + "Product development 2,753 1,425 1,502 \n", + "Sales and marketing 1,175 1,186 1,516 \n", + "General and administrative 1,135 836 950 \n", + "Restructuring charges 151 113 89 \n", + "Total costs and expenses 6,968 5,563 6,597 \n", + "Income (loss) from operations (3,590) 429 1,802 \n", + "Interest income 27 13 186 \n", + "Interest expense (172) (438) (24)\n", + "Other income (expense), net (947) (304) 25 \n", + "Income (loss) before income taxes (4,682) (300) 1,989 \n", + "Provision for (benefit from) income taxes (97) 52 96 \n", + "Net income (loss) $ (4,585)$ (352)$ 1,893 \n", + "Net income (loss) per share attributable to Class A and Class B common stockholders:\n", + "Basic $ (16.12)$ (0.57)$ 2.97 \n", + "Diluted $ (16.12)$ (0.57)$ 2.79\n", + "\n", + "\n", + "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our 
search engine marketing and advertising\n", + "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", + "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", + "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", + "General and Administrative\n", + "2021 2022 % Change\n", + "(in millions, except percentages)\n", + "General and administrative $ 836 $ 950 14 %\n", + "Percentage of revenue 14 % 11 %\n", + "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", + "\n", + "\n", + "Increased operating expenses, decreased revenue, negative publicity, negative reaction from our Hosts and guests and other stakeholders, or other adverse impacts from any of the\n", + "above factors or other risks related to our international operations could materially adversely affect our brand, reputation, business, results of operations, and financial condition.\n", + "In addition, we will continue to incur significant expenses to operate our outbound business in China, and we may never achieve profitability in that market. These factors, combined\n", + "with sentiment of the workforce in China, and China’s policy towards foreign direct investment may particularly impact our operations in China. 
In addition, we need to ensure that\n", + "our business practices in China are compliant with local laws and regulations, which may be interpreted and enforced in ways that are different from our interpretation, and/or create\n", + "\n", + "\n", + "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatened\n", + "enforcement of laws and regulations, including short-term occupancy and tax laws; private groups, such as homeowners, landlords, and condominium and neighborhood\n", + "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", + "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", + "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", + "\n", + "\n", + "Our future revenue growth depends on the growth of supply and demand for listings on our platform, and our business is affected by general economic and business conditions\n", + "worldwide as well as trends in the global travel and hospitality industries and the short and long-term accommodation regulatory landscape. 
In addition, we believe that our revenue\n", + "growth depends upon a number of factors, including:\n", + "•global macroeconomic conditions, including inflation and rising interest rates and recessionary concerns;\n", + "•our ability to retain and grow the number of guests and Nights and Experiences Booked;\n", + "•our ability to retain and grow the number of Hosts and the number of available listings on our platform;\n", + "•events beyond our control such as pandemics and other health concerns, restrictions on travel and immigration, political, social or economic instability, including international\n", + "\n", + "\n" + ] + } + ], + "source": [ + "pretty_print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "a32f41ea-e087-4e64-b9ec-f6224308fa6d", + "metadata": {}, + "source": [ + "### (Experimental) OpenAI Reranker\n", + "\n", + "This prompts chat model to rerank results which is not a dedicated reranker model. This should be treated as experimental. You might run out of token limit so set the search limits based on your token limit. 
\n", + "NOTE: It is recommended to use `gpt-4-turbo-preview`, older models might lead to bad behaviour\n", + "\n", + "Latency - `Can take 10s of seconds if using GPT-4 model`" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "da78b250-9938-4e81-825f-c17b7a57e541", + "metadata": {}, + "outputs": [], + "source": [ + "from lancedb.rerankers import OpenaiReranker\n", + "\n", + "reranker=OpenaiReranker(model_name=\"gpt-4-turbo-preview\")\n", + "docs = table.search(query, query_type=\"hybrid\").limit(5).rerank(reranker=reranker).to_pandas()[\"text\"].to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "98e83f73-1ef3-485f-9871-9bd32937863f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", + "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", + "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", + "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", + "General and Administrative\n", + "2021 2022 % Change\n", + "(in millions, except percentages)\n", + "General and administrative $ 836 $ 950 14 %\n", + "Percentage of revenue 14 % 11 %\n", + "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", + "\n", + "\n", + "Table of Contents\n", + "Airbnb, Inc.\n", + "Consolidated Statements of Operations\n", + "(in millions, except per share amounts)\n", + "Year Ended December 31,\n", + 
"2020 2021 2022\n", + "Revenue $ 3,378 $ 5,992 $ 8,399 \n", + "Costs and expenses:\n", + "Cost of revenue 876 1,156 1,499 \n", + "Operations and support 878 847 1,041 \n", + "Product development 2,753 1,425 1,502 \n", + "Sales and marketing 1,175 1,186 1,516 \n", + "General and administrative 1,135 836 950 \n", + "Restructuring charges 151 113 89 \n", + "Total costs and expenses 6,968 5,563 6,597 \n", + "Income (loss) from operations (-3,590) 429 1,802 \n", + "Interest income 27 13 186 \n", + "Interest expense (-172) (-438) (-24)\n", + "Other income (expense), net (-947) (-304) 25 \n", + "Income (loss) before income taxes (-4,682) (-300) 1,989 \n", + "Provision for (benefit from) income taxes (-97) 52 96 \n", + "Net income (loss) $ (-4,585)$ (-352)$ 1,893 \n", + "Net income (loss) per share attributable to Class A and Class B common stockholders:\n", + "Basic $ (-16.12)$ (-0.57)$ 2.97 \n", + "Diluted $ (-16.12)$ (-0.57)$ 2.79\n", + "\n", + "\n", + "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatened\n", + "enforcement of laws and regulations, including short-term occupancy and tax laws; private groups, such as homeowners, landlords, and condominium and neighborhood\n", + "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", + "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", + "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", + "\n", + "\n", + "Our success depends significantly on existing guests continuing to book and attracting new guests 
to book on our platform. Our ability to attract and retain guests could be materially\n", + "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", + "•events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", + "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-related\n", + "events, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", + "climate change on seasonal destinations;\n", + "•political, social, or economic instability;\n", + "•Hosts failing to meet guests’ expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;\n", + "\n", + "\n", + "Our future revenue growth depends on the growth of supply and demand for listings on our platform, and our business is affected by general economic and business conditions\n", + "worldwide as well as trends in the global travel and hospitality industries and the short and long-term accommodation regulatory landscape. 
In addition, we believe that our revenue\n", + "growth depends upon a number of factors, including:\n", + "•global macroeconomic conditions, including inflation and rising interest rates and recessionary concerns;\n", + "•our ability to retain and grow the number of guests and Nights and Experiences Booked;\n", + "•our ability to retain and grow the number of Hosts and the number of available listings on our platform;\n", + "•events beyond our control such as pandemics and other health concerns, restrictions on travel and immigration, political, social or economic instability, including international\n", + "\n", + "\n" + ] + } + ], + "source": [ + "pretty_print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "42dfdbc5-9006-4398-8465-03828ad48e49", + "metadata": {}, + "source": [ + "## Use your custom Reranker\n", + "Hybrid search in LanceDB is designed to be very flexible. You can easily plug in your own reranking logic. To do so, you simply need to implement the base Reranker class" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "e14503fe-5e9f-4d61-a96b-a5e95d501f61", + "metadata": {}, + "outputs": [], + "source": [ + "from lancedb.rerankers import Reranker\n", + "import pyarrow as pa\n", + "\n", + "class MyCustomReranker(Reranker):\n", + " def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table)-> pa.Table:\n", + " combined_results = self.merge(vector_results, fts_results) # Or custom merge algo\n", + " # Custom Reranking logic here\n", + "\n", + " return combined_results" + ] + }, + { + "cell_type": "markdown", + "id": "0606d4fb-96ef-4440-9363-f5461284d00c", + "metadata": {}, + "source": [ + "### Custom Reranker based on CohereReranker\n", + "\n", + "For the sake of simplicity let's build a custom reranker that just enhances the Cohere Reranker by accepting a filter query, and accepts other CohereReranker params as kwargs.\n", + "\n", + "For this toy example let's say we want to get rid of docs that 
represent a table of contents, appendix, etc., as these are semantically close to documents about costs, but they aren't something we are interested in because they don't represent the specific reasons why operating costs were high. They simply represent the costs." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "dd1e8110-72c4-423c-90de-ce2b386742c1", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List, Union\n", + "import pandas as pd\n", + "from lancedb.rerankers import CohereReranker\n", + "\n", + "class MofidifiedCohereReranker(CohereReranker):\n", + " def __init__(self, filters: Union[str, List[str]], **kwargs):\n", + " super().__init__(**kwargs)\n", + " filters = filters if isinstance(filters, list) else [filters]\n", + " self.filters = filters\n", + "\n", + " def rerank_hybrid(self, query: str, vector_results: pa.Table, fts_results: pa.Table)-> pa.Table:\n", + " combined_result = super().rerank_hybrid(query, vector_results, fts_results)\n", + " df = combined_result.to_pandas()\n", + " for filter in self.filters:\n", + " df = df.query(\"not text.str.contains(@filter)\")\n", + "\n", + " return pa.Table.from_pandas(df)\n", + "\n", + "reranker = MofidifiedCohereReranker(filters=\"Table of Contents\")" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "f4e6b496-e0c1-4944-8a6d-127f566812d3", + "metadata": {}, + "outputs": [], + "source": [ + "docs = table.search(query, query_type=\"hybrid\").limit(5).rerank(reranker=reranker).to_pandas()[\"text\"].to_list()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "5a29d0a2-793a-40a2-ac2d-2edda1102d6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Increased operating expenses, decreased revenue, negative publicity, negative reaction from our Hosts and guests and other stakeholders, or other adverse impacts from any of the\n", + "above factors or other risks related to our international 
operations could materially adversely affect our brand, reputation, business, results of operations, and financial condition.\n", + "In addition, we will continue to incur significant expenses to operate our outbound business in China, and we may never achieve profitability in that market. These factors, combined\n", + "with sentiment of the workforce in China, and China’s policy towards foreign direct investment may particularly impact our operations in China. In addition, we need to ensure that\n", + "our business practices in China are compliant with local laws and regulations, which may be interpreted and enforced in ways that are different from our interpretation, and/or create\n", + "\n", + "\n", + "Made Possible by Hosts, Strangers, AirCover, Categories, and OMG marketing campaigns and launches, a $67.9 million increase in our search engine marketing and advertising\n", + "spend, a $25.1 million increase in payroll-related expenses due to growth in headcount and increase in compensation costs, a $22.0 million increase in third-party service provider\n", + "expenses, and a $11.1 million increase in coupon expense in line with increase in revenue and launch of AirCover for guests, partially offset by a decrease of $22.9 million related to\n", + "the changes in the fair value of contingent consideration related to a 2019 acquisition.\n", + "General and Administrative\n", + "2021 2022 % Change\n", + "(in millions, except percentages)\n", + "General and administrative $ 836 $ 950 14 %\n", + "Percentage of revenue 14 % 11 %\n", + "General and administrative expense increased $114.0 million, or 14%, in 2022 compared to 2021, primarily due to an increase in other business and operational taxes of $41.3\n", + "\n", + "\n", + "Our success depends significantly on existing guests continuing to book and attracting new guests to book on our platform. 
Our ability to attract and retain guests could be materially\n", + "adversely affected by a number of factors discussed elsewhere in these “Risk Factors,” including:\n", + "•events beyond our control such as the ongoing COVID-19 pandemic, other pandemics and health concerns, restrictions on travel, immigration, trade disputes, economic\n", + "downturns, and the impact of climate change on travel including the availability of preferred destinations and the increase in the frequency and severity of weather-related\n", + "events, including fires, floods, droughts, extreme temperatures and ambient temperature increases, severe weather and other natural disasters, and the impact of other\n", + "climate change on seasonal destinations;\n", + "•political, social, or economic instability;\n", + "•Hosts failing to meet guests’ expectations, including increased expectations for cleanliness in light of the COVID-19 pandemic;\n", + "\n", + "\n", + "In addition, the number of listings on Airbnb may decline as a result of a number of other factors affecting Hosts, including: the COVID-19 pandemic; enforcement or threatened\n", + "enforcement of laws and regulations, including short-term occupancy and tax laws; private groups, such as homeowners, landlords, and condominium and neighborhood\n", + "associations, adopting and enforcing contracts that prohibit or restrict home sharing; leases, mortgages, and other agreements, or regulations that purport to ban or otherwise restrict\n", + "home sharing; Hosts opting for long-term rentals on other third-party platforms as an alternative to listing on our platform; economic, social, and political factors; perceptions of trust\n", + "and safety on and off our platform; negative experiences with guests, including guests who damage Host property, throw unauthorized parties, or engage in violent and unlawful\n", + "\n", + "\n", + "Our future revenue growth depends on the growth of supply and demand for listings on our platform, and our 
business is affected by general economic and business conditions\n", + "worldwide as well as trends in the global travel and hospitality industries and the short and long-term accommodation regulatory landscape. In addition, we believe that our revenue\n", + "growth depends upon a number of factors, including:\n", + "•global macroeconomic conditions, including inflation and rising interest rates and recessionary concerns;\n", + "•our ability to retain and grow the number of guests and Nights and Experiences Booked;\n", + "•our ability to retain and grow the number of Hosts and the number of available listings on our platform;\n", + "•events beyond our control such as pandemics and other health concerns, restrictions on travel and immigration, political, social or economic instability, including international\n", + "\n", + "\n" + ] + } + ], + "source": [ + "pretty_print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "b3b5464a-7252-4eab-aaac-9b0eae37496f", + "metadata": {}, + "source": [ + "As you can see, the document containing the Table of Contents of spending no longer shows up" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/test/md_testing.py b/docs/test/md_testing.py index 0ea8431f..a3ff6f52 100644 --- a/docs/test/md_testing.py +++ b/docs/test/md_testing.py @@ -14,7 +14,7 @@ excluded_globs = [ "../src/concepts/*.md", "../src/ann_indexes.md", "../src/basic.md", - "../src/hybrid_search.md", + "../src/hybrid_search/hybrid_search.md", ] python_prefix = "py" diff --git a/python/lancedb/embeddings/base.py b/python/lancedb/embeddings/base.py index 
41474227..3d940810 100644 --- a/python/lancedb/embeddings/base.py +++ b/python/lancedb/embeddings/base.py @@ -10,7 +10,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import importlib from abc import ABC, abstractmethod from typing import List, Union @@ -91,25 +90,6 @@ class EmbeddingFunction(BaseModel, ABC): texts = texts.combine_chunks().to_pylist() return texts - @classmethod - def safe_import(cls, module: str, mitigation=None): - """ - Import the specified module. If the module is not installed, - raise an ImportError with a helpful message. - - Parameters - ---------- - module : str - The name of the module to import - mitigation : Optional[str] - The package(s) to install to mitigate the error. - If not provided then the module name will be used. - """ - try: - return importlib.import_module(module) - except ImportError: - raise ImportError(f"Please install {mitigation or module}") - def safe_model_dump(self): from ..pydantic import PYDANTIC_VERSION diff --git a/python/lancedb/embeddings/bedrock.py b/python/lancedb/embeddings/bedrock.py index 953b9ddd..8b0ccbc2 100644 --- a/python/lancedb/embeddings/bedrock.py +++ b/python/lancedb/embeddings/bedrock.py @@ -19,6 +19,7 @@ import numpy as np from lancedb.pydantic import PYDANTIC_VERSION +from ..util import attempt_import_or_raise from .base import TextEmbeddingFunction from .registry import register from .utils import TEXT @@ -183,8 +184,8 @@ class BedRockText(TextEmbeddingFunction): boto3.client The boto3 client for Amazon Bedrock service """ - botocore = self.safe_import("botocore") - boto3 = self.safe_import("boto3") + botocore = attempt_import_or_raise("botocore") + boto3 = attempt_import_or_raise("boto3") session_kwargs = {"region_name": self.region} client_kwargs = {**session_kwargs} diff --git a/python/lancedb/embeddings/cohere.py b/python/lancedb/embeddings/cohere.py 
index 0084c857..29d203c0 100644 --- a/python/lancedb/embeddings/cohere.py +++ b/python/lancedb/embeddings/cohere.py @@ -16,6 +16,7 @@ from typing import ClassVar, List, Union import numpy as np +from ..util import attempt_import_or_raise from .base import TextEmbeddingFunction from .registry import register from .utils import api_key_not_found_help @@ -84,7 +85,7 @@ class CohereEmbeddingFunction(TextEmbeddingFunction): return [emb for emb in rs.embeddings] def _init_client(self): - cohere = self.safe_import("cohere") + cohere = attempt_import_or_raise("cohere") if CohereEmbeddingFunction.client is None: if os.environ.get("COHERE_API_KEY") is None: api_key_not_found_help("cohere") diff --git a/python/lancedb/embeddings/gemini_text.py b/python/lancedb/embeddings/gemini_text.py index e0f103c2..bdbd304c 100644 --- a/python/lancedb/embeddings/gemini_text.py +++ b/python/lancedb/embeddings/gemini_text.py @@ -19,6 +19,7 @@ import numpy as np from lancedb.pydantic import PYDANTIC_VERSION +from ..util import attempt_import_or_raise from .base import TextEmbeddingFunction from .registry import register from .utils import TEXT, api_key_not_found_help @@ -134,7 +135,7 @@ class GeminiText(TextEmbeddingFunction): @cached_property def client(self): - genai = self.safe_import("google.generativeai", "google.generativeai") + genai = attempt_import_or_raise("google.generativeai", "google.generativeai") if not os.environ.get("GOOGLE_API_KEY"): api_key_not_found_help("google") diff --git a/python/lancedb/embeddings/gte.py b/python/lancedb/embeddings/gte.py index d2433b88..34038889 100644 --- a/python/lancedb/embeddings/gte.py +++ b/python/lancedb/embeddings/gte.py @@ -14,6 +14,7 @@ from typing import List, Union import numpy as np +from ..util import attempt_import_or_raise from .base import TextEmbeddingFunction from .registry import register from .utils import weak_lru @@ -122,7 +123,7 @@ class GteEmbeddings(TextEmbeddingFunction): return Model() else: - sentence_transformers = 
self.safe_import( + sentence_transformers = attempt_import_or_raise( "sentence_transformers", "sentence-transformers" ) return sentence_transformers.SentenceTransformer( diff --git a/python/lancedb/embeddings/instructor.py b/python/lancedb/embeddings/instructor.py index 8d3311ec..e6481e19 100644 --- a/python/lancedb/embeddings/instructor.py +++ b/python/lancedb/embeddings/instructor.py @@ -14,6 +14,7 @@ from typing import List import numpy as np +from ..util import attempt_import_or_raise from .base import TextEmbeddingFunction from .registry import register from .utils import TEXT, weak_lru @@ -131,10 +132,10 @@ class InstructorEmbeddingFunction(TextEmbeddingFunction): @weak_lru(maxsize=1) def get_model(self): - instructor_embedding = self.safe_import( + instructor_embedding = attempt_import_or_raise( "InstructorEmbedding", "InstructorEmbedding" ) - torch = self.safe_import("torch", "torch") + torch = attempt_import_or_raise("torch", "torch") model = instructor_embedding.INSTRUCTOR(self.name) if self.quantize: diff --git a/python/lancedb/embeddings/open_clip.py b/python/lancedb/embeddings/open_clip.py index 6392b0ef..4d0a3a32 100644 --- a/python/lancedb/embeddings/open_clip.py +++ b/python/lancedb/embeddings/open_clip.py @@ -21,6 +21,7 @@ import pyarrow as pa from pydantic import PrivateAttr from tqdm import tqdm +from ..util import attempt_import_or_raise from .base import EmbeddingFunction from .registry import register from .utils import IMAGES, url_retrieve @@ -50,7 +51,7 @@ class OpenClipEmbeddings(EmbeddingFunction): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - open_clip = self.safe_import("open_clip", "open-clip") + open_clip = attempt_import_or_raise("open_clip", "open-clip") model, _, preprocess = open_clip.create_model_and_transforms( self.name, pretrained=self.pretrained ) @@ -78,14 +79,14 @@ class OpenClipEmbeddings(EmbeddingFunction): if isinstance(query, str): return [self.generate_text_embeddings(query)] else: - PIL = 
self.safe_import("PIL", "pillow") + PIL = attempt_import_or_raise("PIL", "pillow") if isinstance(query, PIL.Image.Image): return [self.generate_image_embedding(query)] else: raise TypeError("OpenClip supports str or PIL Image as query") def generate_text_embeddings(self, text: str) -> np.ndarray: - torch = self.safe_import("torch") + torch = attempt_import_or_raise("torch") text = self.sanitize_input(text) text = self._tokenizer(text) text.to(self.device) @@ -144,7 +145,7 @@ class OpenClipEmbeddings(EmbeddingFunction): The image to embed. If the image is a str, it is treated as a uri. If the image is bytes, it is treated as the raw image bytes. """ - torch = self.safe_import("torch") + torch = attempt_import_or_raise("torch") # TODO handle retry and errors for https image = self._to_pil(image) image = self._preprocess(image).unsqueeze(0) @@ -152,7 +153,7 @@ class OpenClipEmbeddings(EmbeddingFunction): return self._encode_and_normalize_image(image) def _to_pil(self, image: Union[str, bytes]): - PIL = self.safe_import("PIL", "pillow") + PIL = attempt_import_or_raise("PIL", "pillow") if isinstance(image, bytes): return PIL.Image.open(io.BytesIO(image)) if isinstance(image, PIL.Image.Image): diff --git a/python/lancedb/embeddings/openai.py b/python/lancedb/embeddings/openai.py index 12200edd..023f3b2f 100644 --- a/python/lancedb/embeddings/openai.py +++ b/python/lancedb/embeddings/openai.py @@ -16,6 +16,7 @@ from typing import List, Optional, Union import numpy as np +from ..util import attempt_import_or_raise from .base import TextEmbeddingFunction from .registry import register from .utils import api_key_not_found_help @@ -68,7 +69,7 @@ class OpenAIEmbeddings(TextEmbeddingFunction): @cached_property def _openai_client(self): - openai = self.safe_import("openai") + openai = attempt_import_or_raise("openai") if not os.environ.get("OPENAI_API_KEY"): api_key_not_found_help("openai") diff --git a/python/lancedb/embeddings/sentence_transformers.py 
b/python/lancedb/embeddings/sentence_transformers.py index d958e054..97fe1318 100644 --- a/python/lancedb/embeddings/sentence_transformers.py +++ b/python/lancedb/embeddings/sentence_transformers.py @@ -14,6 +14,7 @@ from typing import List, Union import numpy as np +from ..util import attempt_import_or_raise from .base import TextEmbeddingFunction from .registry import register from .utils import weak_lru @@ -75,7 +76,7 @@ class SentenceTransformerEmbeddings(TextEmbeddingFunction): TODO: use lru_cache instead with a reasonable/configurable maxsize """ - sentence_transformers = self.safe_import( + sentence_transformers = attempt_import_or_raise( "sentence_transformers", "sentence-transformers" ) return sentence_transformers.SentenceTransformer(self.name, device=self.device) diff --git a/python/lancedb/rerankers/cohere.py b/python/lancedb/rerankers/cohere.py index db5449e7..611da9f8 100644 --- a/python/lancedb/rerankers/cohere.py +++ b/python/lancedb/rerankers/cohere.py @@ -4,7 +4,7 @@ from typing import Union import pyarrow as pa -from ..util import safe_import +from ..util import attempt_import_or_raise from .base import Reranker @@ -41,7 +41,7 @@ class CohereReranker(Reranker): @cached_property def _client(self): - cohere = safe_import("cohere") + cohere = attempt_import_or_raise("cohere") if os.environ.get("COHERE_API_KEY") is None and self.api_key is None: raise ValueError( "COHERE_API_KEY not set. 
Either set it in your environment or \ diff --git a/python/lancedb/rerankers/colbert.py b/python/lancedb/rerankers/colbert.py index 308b7473..e3a0aa77 100644 --- a/python/lancedb/rerankers/colbert.py +++ b/python/lancedb/rerankers/colbert.py @@ -2,7 +2,7 @@ from functools import cached_property import pyarrow as pa -from ..util import safe_import +from ..util import attempt_import_or_raise from .base import Reranker @@ -29,7 +29,9 @@ class ColbertReranker(Reranker): super().__init__(return_score) self.model_name = model_name self.column = column - self.torch = safe_import("torch") # import here for faster ops later + self.torch = attempt_import_or_raise( + "torch" + ) # import here for faster ops later def rerank_hybrid( self, @@ -80,7 +82,7 @@ class ColbertReranker(Reranker): @cached_property def _model(self): - transformers = safe_import("transformers") + transformers = attempt_import_or_raise("transformers") tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name) model = transformers.AutoModel.from_pretrained(self.model_name) diff --git a/python/lancedb/rerankers/cross_encoder.py b/python/lancedb/rerankers/cross_encoder.py index 08c89096..ea2ea099 100644 --- a/python/lancedb/rerankers/cross_encoder.py +++ b/python/lancedb/rerankers/cross_encoder.py @@ -3,7 +3,7 @@ from typing import Union import pyarrow as pa -from ..util import safe_import +from ..util import attempt_import_or_raise from .base import Reranker @@ -32,7 +32,7 @@ class CrossEncoderReranker(Reranker): return_score="relevance", ): super().__init__(return_score) - torch = safe_import("torch") + torch = attempt_import_or_raise("torch") self.model_name = model_name self.column = column self.device = device @@ -41,7 +41,7 @@ class CrossEncoderReranker(Reranker): @cached_property def model(self): - sbert = safe_import("sentence_transformers") + sbert = attempt_import_or_raise("sentence_transformers") cross_encoder = sbert.CrossEncoder(self.model_name) return cross_encoder diff --git 
a/python/lancedb/rerankers/openai.py b/python/lancedb/rerankers/openai.py index 0e99beb0..ca21c9b7 100644 --- a/python/lancedb/rerankers/openai.py +++ b/python/lancedb/rerankers/openai.py @@ -5,7 +5,7 @@ from typing import Optional import pyarrow as pa -from ..util import safe_import +from ..util import attempt_import_or_raise from .base import Reranker @@ -17,7 +17,7 @@ class OpenaiReranker(Reranker): Parameters ---------- - model_name : str, default "gpt-3.5-turbo-1106 " + model_name : str, default "gpt-4-turbo-preview" The name of the cross encoder model to use. column : str, default "text" The name of the column to use as input to the cross encoder model. @@ -29,7 +29,7 @@ class OpenaiReranker(Reranker): def __init__( self, - model_name: str = "gpt-3.5-turbo-1106", + model_name: str = "gpt-4-turbo-preview", column: str = "text", return_score="relevance", api_key: Optional[str] = None, @@ -93,7 +93,9 @@ class OpenaiReranker(Reranker): @cached_property def _client(self): - openai = safe_import("openai") # TODO: force version or handle versions < 1.0 + openai = attempt_import_or_raise( + "openai" + ) # TODO: force version or handle versions < 1.0 if os.environ.get("OPENAI_API_KEY") is None and self.api_key is None: raise ValueError( "OPENAI_API_KEY not set. Either set it in your environment or \ diff --git a/python/lancedb/util.py b/python/lancedb/util.py index 915b660a..14f9e530 100644 --- a/python/lancedb/util.py +++ b/python/lancedb/util.py @@ -116,7 +116,7 @@ def join_uri(base: Union[str, pathlib.Path], *parts: str) -> str: return "/".join([p.rstrip("/") for p in [base, *parts]]) -def safe_import(module: str, mitigation=None): +def attempt_import_or_raise(module: str, mitigation=None): """ Import the specified module. If the module is not installed, raise an ImportError with a helpful message.