From 2b8e872be006d900317aa0b14b89a6a734432f8b Mon Sep 17 00:00:00 2001 From: Jon X Date: Thu, 5 Sep 2024 17:10:38 +0800 Subject: [PATCH 1/9] docs: removed the unnecessary fence code tag (#1599) --- docs/src/guides/tables.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/guides/tables.md b/docs/src/guides/tables.md index 4be12858..9de2ba94 100644 --- a/docs/src/guides/tables.md +++ b/docs/src/guides/tables.md @@ -416,7 +416,6 @@ You can create an empty table for scenarios where you want to add data to the ta === "Python" - ```python An empty table can be initialized via a PyArrow schema. From b24810a01146bfc9ce52e3bc7bbcaa0b9f3db8c3 Mon Sep 17 00:00:00 2001 From: Gagan Bhullar Date: Thu, 5 Sep 2024 09:33:07 -0600 Subject: [PATCH 2/9] feat(python, rust): expose offset in query (#1556) PR is part of #1555 --- python/python/lancedb/_lancedb.pyi | 2 ++ python/python/lancedb/query.py | 38 ++++++++++++++++++++++++++++++ python/python/lancedb/table.py | 1 + python/python/tests/test_query.py | 11 +++++++++ python/src/query.rs | 8 +++++++ rust/lancedb/src/query.rs | 33 ++++++++++++++++++++++++++ rust/lancedb/src/table.rs | 9 ++++++- 7 files changed, 101 insertions(+), 1 deletion(-) diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi index c4642637..55c3db99 100644 --- a/python/python/lancedb/_lancedb.pyi +++ b/python/python/lancedb/_lancedb.pyi @@ -73,6 +73,7 @@ class Query: def where(self, filter: str): ... def select(self, columns: Tuple[str, str]): ... def limit(self, limit: int): ... + def offset(self, offset: int): ... def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ... def nearest_to_text(self, query: dict) -> Query: ... async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ... @@ -83,6 +84,7 @@ class VectorQuery: def select(self, columns: List[str]): ... def select_with_projection(self, columns: Tuple[str, str]): ... def limit(self, limit: int): ... + def offset(self, offset: int): ... 
def column(self, column: str): ... def distance_type(self, distance_type: str): ... def postfilter(self): ... diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 9da90987..9c9c69ae 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -85,6 +85,8 @@ class Query(pydantic.BaseModel): - See discussion in [Querying an ANN Index][querying-an-ann-index] for tuning advice. + offset: int + The offset to start fetching results from """ vector_column: Optional[str] = None @@ -119,6 +121,8 @@ class Query(pydantic.BaseModel): with_row_id: bool = False + offset: int = 0 + class LanceQueryBuilder(ABC): """An abstract query builder. Subclasses are defined for vector search, @@ -233,6 +237,7 @@ class LanceQueryBuilder(ABC): def __init__(self, table: "Table"): self._table = table self._limit = 10 + self._offset = 0 self._columns = None self._where = None self._prefilter = False @@ -371,6 +376,25 @@ class LanceQueryBuilder(ABC): self._limit = limit return self + def offset(self, offset: int) -> LanceQueryBuilder: + """Set the offset for the results. + + Parameters + ---------- + offset: int + The offset to start fetching results from. + + Returns + ------- + LanceQueryBuilder + The LanceQueryBuilder object. + """ + if offset is None or offset <= 0: + self._offset = 0 + else: + self._offset = offset + return self + def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder: """Set the columns to return. 
@@ -649,6 +673,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder): refine_factor=self._refine_factor, vector_column=self._vector_column, with_row_id=self._with_row_id, + offset=self._offset, ) result_set = self._table._execute_query(query, batch_size) if self._reranker is not None: @@ -780,6 +805,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): "columns": self._fts_columns, }, vector=[], + offset=self._offset, ) results = self._table._execute_query(query) results = results.read_all() @@ -1220,6 +1246,18 @@ class AsyncQueryBase(object): self._inner.limit(limit) return self + def offset(self, offset: int) -> AsyncQuery: + """ + Set the offset for the results. + + Parameters + ---------- + offset: int + The offset to start fetching results from. + """ + self._inner.offset(offset) + return self + async def to_batches( self, *, max_batch_length: Optional[int] = None ) -> AsyncRecordBatchReader: diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 46df91c2..7d3ebaa0 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -1708,6 +1708,7 @@ class LanceTable(Table): full_text_query=query.full_text_query, with_row_id=query.with_row_id, batch_size=batch_size, + offset=query.offset, ).to_reader() def _do_merge( diff --git a/python/python/tests/test_query.py b/python/python/tests/test_query.py index ae50c991..11750e4d 100644 --- a/python/python/tests/test_query.py +++ b/python/python/tests/test_query.py @@ -51,6 +51,7 @@ class MockTable: "refine_factor": query.refine_factor, }, batch_size=batch_size, + offset=query.offset, ).to_reader() @@ -106,6 +107,13 @@ def test_cast(table): assert r0.float_field == 1.0 +def test_offset(table): + results_without_offset = LanceVectorQueryBuilder(table, [0, 0], "vector") + assert len(results_without_offset.to_pandas()) == 2 + results_with_offset = LanceVectorQueryBuilder(table, [0, 0], "vector").offset(1) + assert len(results_with_offset.to_pandas()) == 1 + + def 
test_query_builder(table): rs = ( LanceVectorQueryBuilder(table, [0, 0], "vector") @@ -269,7 +277,10 @@ async def test_query_async(table_async: AsyncTable): table_async.query().select({"foo": "id", "bar": "id + 1"}), expected_columns=["foo", "bar"], ) + await check_query(table_async.query().limit(1), expected_num_rows=1) + await check_query(table_async.query().offset(1), expected_num_rows=1) + await check_query( table_async.query().nearest_to(pa.array([1, 2])), expected_num_rows=2 ) diff --git a/python/src/query.rs b/python/src/query.rs index f88e60b4..42bd4a13 100644 --- a/python/src/query.rs +++ b/python/src/query.rs @@ -64,6 +64,10 @@ impl Query { self.inner = self.inner.clone().limit(limit as usize); } + pub fn offset(&mut self, offset: u32) { + self.inner = self.inner.clone().offset(offset as usize); + } + pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult { let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?; let array = make_array(data); @@ -138,6 +142,10 @@ impl VectorQuery { self.inner = self.inner.clone().limit(limit as usize); } + pub fn offset(&mut self, offset: u32) { + self.inner = self.inner.clone().offset(offset as usize); + } + pub fn column(&mut self, column: String) { self.inner = self.inner.clone().column(&column); } diff --git a/rust/lancedb/src/query.rs b/rust/lancedb/src/query.rs index 714200ae..d2895668 100644 --- a/rust/lancedb/src/query.rs +++ b/rust/lancedb/src/query.rs @@ -338,6 +338,12 @@ pub trait QueryBase { /// it will default to 10. fn limit(self, limit: usize) -> Self; + /// Set the offset of the query. + + /// By default, it fetches starting with the first row. + /// This method can be used to skip the first `offset` rows. + fn offset(self, offset: usize) -> Self; + /// Only return rows which match the filter. /// /// The filter should be supplied as an SQL query string. 
For example: @@ -408,6 +414,11 @@ impl QueryBase for T { self } + fn offset(mut self, offset: usize) -> Self { + self.mut_query().offset = Some(offset); + self + } + fn only_if(mut self, filter: impl AsRef) -> Self { self.mut_query().filter = Some(filter.as_ref().to_string()); self @@ -520,6 +531,9 @@ pub struct Query { /// limit the number of rows to return. pub(crate) limit: Option, + /// Offset of the query. + pub(crate) offset: Option, + /// Apply filter to the returned rows. pub(crate) filter: Option, @@ -541,6 +555,7 @@ impl Query { Self { parent, limit: None, + offset: None, filter: None, full_text_search: None, select: Select::All, @@ -858,6 +873,7 @@ mod tests { let query = table .query() .limit(100) + .offset(1) .nearest_to(&[9.8, 8.7]) .unwrap() .nprobes(1000) @@ -870,6 +886,7 @@ mod tests { new_vector ); assert_eq!(query.base.limit.unwrap(), 100); + assert_eq!(query.base.offset.unwrap(), 1); assert_eq!(query.nprobes, 1000); assert!(query.use_index); assert_eq!(query.distance_type, Some(DistanceType::Cosine)); @@ -916,10 +933,26 @@ mod tests { let result = query.execute().await; let mut stream = result.expect("should have result"); // should only have one batch + while let Some(batch) = stream.next().await { // pre filter should return 10 rows assert!(batch.expect("should be Ok").num_rows() == 10); } + + let query = table + .query() + .limit(10) + .offset(1) + .only_if(String::from("id % 2 == 0")) + .nearest_to(&[0.1; 4]) + .unwrap(); + let result = query.execute().await; + let mut stream = result.expect("should have result"); + // should only have one batch + while let Some(batch) = stream.next().await { + // pre filter should return 10 rows + assert!(batch.expect("should be Ok").num_rows() == 9); + } } #[tokio::test] diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index 12254819..f1942f0e 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -1852,9 +1852,16 @@ impl TableInternal for NativeTable { query_vector, 
query.base.limit.unwrap_or(DEFAULT_TOP_K), )?; + scanner.limit( + query.base.limit.map(|limit| limit as i64), + query.base.offset.map(|offset| offset as i64), + )?; } else { // If there is no vector query, it's ok to not have a limit - scanner.limit(query.base.limit.map(|limit| limit as i64), None)?; + scanner.limit( + query.base.limit.map(|limit| limit as i64), + query.base.offset.map(|offset| offset as i64), + )?; } scanner.nprobs(query.nprobes); From 2bc7dca3ca5ba0010e324d2306aea7e64ec42049 Mon Sep 17 00:00:00 2001 From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com> Date: Thu, 5 Sep 2024 22:19:08 +0530 Subject: [PATCH 3/9] docs: add changes to Embeddings-> Available models-> overview page (#1596) adding features and improvements to - Manage Embeddings page Before: ![Screenshot 2024-09-04 223743](https://github.com/user-attachments/assets/f1e116b5-6ebb-4d59-9d29-b20084998cd0) After: ![Screenshot 2024-09-05 214214](https://github.com/user-attachments/assets/8c94318e-68af-447e-97e1-8153860a2914) ![Screenshot 2024-09-05 213623](https://github.com/user-attachments/assets/55c82770-6df9-4bab-9c5c-1ea1552138de) ![Screenshot 2024-09-05 215931](https://github.com/user-attachments/assets/9bfac7d4-16a6-454e-801e-50789ff75261) --- docs/mkdocs.yml | 7 ++ .../cohere_embedding.md | 15 +-- .../embeddings/default_embedding_functions.md | 98 ++++++++++++++----- 3 files changed, 91 insertions(+), 29 deletions(-) diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 0230caef..bb0c456c 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -26,6 +26,7 @@ theme: - content.code.copy - content.tabs.link - content.action.edit + - content.tooltips - toc.follow - navigation.top - navigation.tabs @@ -35,6 +36,7 @@ theme: - navigation.instant icon: repo: fontawesome/brands/github + annotation: material/arrow-right-circle custom_dir: overrides plugins: @@ -76,7 +78,12 @@ markdown_extensions: - pymdownx.tabbed: alternate_style: true - md_in_html + - abbr - attr_list + - 
pymdownx.snippets + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg nav: - Home: diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md index 39eba18c..fd99f2ca 100644 --- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md +++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md @@ -4,13 +4,14 @@ Using cohere API requires cohere package, which can be installed using `pip inst You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API. Supported models are: -* embed-english-v3.0 -* embed-multilingual-v3.0 -* embed-english-light-v3.0 -* embed-multilingual-light-v3.0 -* embed-english-v2.0 -* embed-english-light-v2.0 -* embed-multilingual-v2.0 + +- embed-english-v3.0 +- embed-multilingual-v3.0 +- embed-english-light-v3.0 +- embed-multilingual-light-v3.0 +- embed-english-v2.0 +- embed-english-light-v2.0 +- embed-multilingual-v2.0 Supported parameters (to be passed in `create` method) are: diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md index ced97048..5457dc9f 100644 --- a/docs/src/embeddings/default_embedding_functions.md +++ b/docs/src/embeddings/default_embedding_functions.md @@ -1,30 +1,84 @@ -There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models. +# 📚 Available Embedding Models -## Text embedding functions -Contains the text embedding functions registered by default. +There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly.
We're actively working on adding other popular embedding APIs and models. 🚀 -* Embedding functions have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with exponential backoff. -* Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7. +Before jumping on the list of available models, let's understand how to get an embedding model initialized and configured to use in our code: -**Available Text Embeddings**: +!!! example "Example usage" + ```python + model = get_registry() + .get("openai") + .create(name="text-embedding-ada-002") + ``` -- [Sentence Transformers](available_embedding_models/text_embedding_functions/sentence_transformers.md) -- [Huggingface Embedding Models](available_embedding_models/text_embedding_functions/huggingface_embedding.md) -- [Ollama Embeddings](available_embedding_models/text_embedding_functions/ollama_embedding.md) -- [OpenAI Embeddings](available_embedding_models/text_embedding_functions/openai_embedding.md) -- [Instructor Embeddings](available_embedding_models/text_embedding_functions/instructor_embedding.md) -- [Gemini Embeddings](available_embedding_models/text_embedding_functions/gemini_embedding.md) -- [Cohere Embeddings](available_embedding_models/text_embedding_functions/cohere_embedding.md) -- [Jina Embeddings](available_embedding_models/text_embedding_functions/jina_embedding.md) -- [AWS Bedrock Text Embedding Functions](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) -- [IBM Watsonx.ai Embeddings](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) +Now let's understand the above syntax: +```python +model = get_registry().get("model_id").create(...params) +``` +**This👆 line effectively creates a configured instance of an `embedding function` with `model` of choice that is ready for use.** + +- `get_registry()` : This function call returns an instance
of a `EmbeddingFunctionRegistry` object. This registry manages the registration and retrieval of embedding functions. + +- `.get("model_id")` : This method call on the registry object and retrieves the **embedding models functions** associated with the `"model_id"` (1) . + { .annotate } + + 1. Hover over the names in table below to find out the `model_id` of different embedding functions. + +- `.create(...params)` : This method call is on the object returned by the `get` method. It instantiates an embedding model function using the **specified parameters**. + +??? question "What parameters does the `.create(...params)` method accept?" + **Checkout the documentation of specific embedding models (links in the table below👇) to know what parameters it takes**. + +!!! tip "Moving on" + Now that we know how to get the **desired embedding model** and use it in our code, let's explore the comprehensive **list** of embedding models **supported by LanceDB**, in the tables below. + +## Text Embedding Functions 📝 +These functions are registered by default to handle text embeddings. + +- 🔄 **Embedding functions** have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with **exponential backoff**. + +- 🌕 Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7. + +🌟 **Available Text Embeddings** + +| **Embedding** :material-information-outline:{ title="Hover over the name to find out the model_id" } | **Description** | **Documentation** | +|-----------|-------------|---------------| +| [**Sentence Transformers**](available_embedding_models/text_embedding_functions/sentence_transformers.md "sentence-transformers") | 🧠 **SentenceTransformers** is a Python framework for state-of-the-art sentence, text, and image embeddings.
| [Sentence Transformers Icon](available_embedding_models/text_embedding_functions/sentence_transformers.md)| +| [**Huggingface Models**](available_embedding_models/text_embedding_functions/huggingface_embedding.md "huggingface") |🤗 We offer support for all **Huggingface** models. The default model is `colbert-ir/colbertv2.0`. | [Huggingface Icon](available_embedding_models/text_embedding_functions/huggingface_embedding.md) | +| [**Ollama Embeddings**](available_embedding_models/text_embedding_functions/ollama_embedding.md "ollama") | 🔍 Generate embeddings via the **Ollama** python library. Ollama supports embedding models, making it possible to build RAG apps. | [Ollama Icon](available_embedding_models/text_embedding_functions/ollama_embedding.md)| +| [**OpenAI Embeddings**](available_embedding_models/text_embedding_functions/openai_embedding.md "openai")| 🔑 **OpenAI’s** text embeddings measure the relatedness of text strings. **LanceDB** supports state-of-the-art embeddings from OpenAI. | [OpenAI Icon](available_embedding_models/text_embedding_functions/openai_embedding.md)| +| [**Instructor Embeddings**](available_embedding_models/text_embedding_functions/instructor_embedding.md "instructor") | 📚 **Instructor**: An instruction-finetuned text embedding model that can generate text embeddings tailored to any task and domains by simply providing the task instruction, without any finetuning. | [Instructor Embedding Icon](available_embedding_models/text_embedding_functions/instructor_embedding.md) | +| [**Gemini Embeddings**](available_embedding_models/text_embedding_functions/gemini_embedding.md "gemini-text") | 🌌 Google’s Gemini API generates state-of-the-art embeddings for words, phrases, and sentences.
| [Gemini Icon](available_embedding_models/text_embedding_functions/gemini_embedding.md) | +| [**Cohere Embeddings**](available_embedding_models/text_embedding_functions/cohere_embedding.md "cohere") | 💬 This will help you get started with **Cohere** embedding models using LanceDB. Using cohere API requires cohere package. Install it via `pip`. | [Cohere Icon](available_embedding_models/text_embedding_functions/cohere_embedding.md) | +| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [Jina Icon](available_embedding_models/text_embedding_functions/jina_embedding.md) | +| [ **AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | ☁️ AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. | [AWS Bedrock Icon](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) | +| [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | 💡 Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [Watsonx Icon](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) | -## Multi-modal embedding functions -Multi-modal embedding functions allow you to query your table using both images and text.
-**Available Multi-modal Embeddings** : +[st-key]: "sentence-transformers" +[hf-key]: "huggingface" +[ollama-key]: "ollama" +[openai-key]: "openai" +[instructor-key]: "instructor" +[gemini-key]: "gemini-text" +[cohere-key]: "cohere" +[jina-key]: "jina" +[aws-key]: "bedrock-text" +[watsonx-key]: "watsonx" -- [OpenClip Embeddings](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) -- [Imagebind Embeddings](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md) -- [Jina Embeddings](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) \ No newline at end of file + +## Multi-modal Embedding Functions🖼️ + +Multi-modal embedding functions allow you to query your table using both images and text. 💬🖼️ + +🌐 **Available Multi-modal Embeddings** + +| Embedding :material-information-outline:{ title="Hover over the name to find out the model_id" } | Description | Documentation | +|-----------|-------------|---------------| +| [**OpenClip Embeddings**](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md "open-clip") | 🎨 We support CLIP model embeddings using the open source alternative, **open-clip** which supports various customizations. | [openclip Icon](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) | +| [**Imagebind Embeddings**](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md "imagebind") | 🌌 We have support for **imagebind model embeddings**. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`.
| [imagebind Icon](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)| +| [**Jina Multi-modal Embeddings**](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md "jina") | 🔗 **Jina embeddings** can also be used to embed both **text** and **image** data, only some of the models support image data and you can check the detailed documentation. 👉 | [jina Icon](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) | + +!!! note + If you'd like to request support for additional **embedding functions**, please feel free to open an issue on our LanceDB [GitHub issue page](https://github.com/lancedb/lancedb/issues). \ No newline at end of file From 4ee7225e917b44379267a4579119ae587bd6cc5f Mon Sep 17 00:00:00 2001 From: Lei Xu Date: Thu, 5 Sep 2024 11:48:48 -0700 Subject: [PATCH 4/9] ci: public java package (#1485) Co-authored-by: Lu Qiu --- .github/workflows/java-publish.yml | 109 ++++++++++++++++++++++ java/core/pom.xml | 4 +- java/pom.xml | 142 ++++++++++++++++++++++++++++- 3 files changed, 249 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/java-publish.yml diff --git a/.github/workflows/java-publish.yml b/.github/workflows/java-publish.yml new file mode 100644 index 00000000..12d13522 --- /dev/null +++ b/.github/workflows/java-publish.yml @@ -0,0 +1,109 @@ +name: Build and publish Java packages +on: + release: + types: [released] + pull_request: + paths: + - .github/workflows/java-publish.yml + +jobs: + macos-arm64: + name: Build on MacOS Arm64 + runs-on: macos-14 + timeout-minutes: 45 + defaults: + run: + working-directory: ./java/core/lancedb-jni + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - uses: Swatinem/rust-cache@v2 + - name: Install dependencies + run: | + brew install protobuf + - name: Build release + run: | + cargo build --release + - uses: actions/upload-artifact@v4 + with: + name:
liblancedb_jni_darwin_aarch64.zip + path: target/release/liblancedb_jni.dylib + retention-days: 1 + if-no-files-found: error + linux-arm64: + name: Build on Linux Arm64 + runs-on: warp-ubuntu-2204-arm64-8x + timeout-minutes: 45 + defaults: + run: + working-directory: ./java/core/lancedb-jni + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - uses: Swatinem/rust-cache@v2 + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: "1.79.0" + cache-workspaces: "./java/core/lancedb-jni" + # Disable full debug symbol generation to speed up CI build and keep memory down + # "1" means line tables only, which is useful for panic tracebacks. + rustflags: "-C debuginfo=1" + - name: Install dependencies + run: | + sudo apt -y -qq update + sudo apt install -y protobuf-compiler libssl-dev pkg-config + - name: Build release + run: | + cargo build --release + - uses: actions/upload-artifact@v4 + with: + name: liblancedb_jni_linux_aarch64.zip + path: target/release/liblancedb_jni.so + retention-days: 1 + if-no-files-found: error + linux-x86: + runs-on: warp-ubuntu-2204-x64-8x + timeout-minutes: 30 + needs: [macos-arm64, linux-arm64] + defaults: + run: + working-directory: ./java + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - uses: Swatinem/rust-cache@v2 + - name: Set up Java 8 + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: 8 + cache: "maven" + server-id: ossrh + server-username: SONATYPE_USER + server-password: SONATYPE_TOKEN + gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }} + gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }} + - name: Install dependencies + run: | + sudo apt -y -qq update + sudo apt install -y protobuf-compiler libssl-dev pkg-config + - name: Download artifact + uses: actions/download-artifact@v4 + - name: Copy native libs + run: | + mkdir -p ./core/target/classes/nativelib/darwin-aarch64 ./core/target/classes/nativelib/linux-aarch64 + cp 
../liblancedb_jni_darwin_aarch64.zip/liblancedb_jni.dylib ./core/target/classes/nativelib/darwin-aarch64/liblancedb_jni.dylib + cp ../liblancedb_jni_linux_aarch64.zip/liblancedb_jni.so ./core/target/classes/nativelib/linux-aarch64/liblancedb_jni.so + - name: Set github + run: | + git config --global user.email "LanceDB Github Runner" + git config --global user.name "dev+gha@lancedb.com" + - name: Publish with Java 8 + run: | + echo "use-agent" >> ~/.gnupg/gpg.conf + echo "pinentry-mode loopback" >> ~/.gnupg/gpg.conf + export GPG_TTY=$(tty) + mvn --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase=${{ secrets.GPG_PASSPHRASE }} deploy -P deploy-to-ossrh + env: + SONATYPE_USER: ${{ secrets.SONATYPE_USER }} + SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }} diff --git a/java/core/pom.xml b/java/core/pom.xml index a469c3ae..b6fedc19 100644 --- a/java/core/pom.xml +++ b/java/core/pom.xml @@ -8,7 +8,7 @@ com.lancedb lancedb-parent - 0.1-SNAPSHOT + 0.0.3 ../pom.xml @@ -68,7 +68,7 @@ lancedb-jni - + true ${project.build.directory}/classes/nativelib true diff --git a/java/pom.xml b/java/pom.xml index 48a64c12..6a0a95a7 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -6,15 +6,28 @@ com.lancedb lancedb-parent - 0.1-SNAPSHOT + 0.0.3 pom - Lance Parent + LanceDB Parent + LanceDB vector database Java API + http://lancedb.com/ + + + + Lance DB Dev Group + dev@lancedb.com + + + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + + UTF-8 - 11 - 11 15.0.0 @@ -22,6 +35,12 @@ core + + scm:git:https://github.com/lancedb/lancedb.git + scm:git:ssh://git@github.com/lancedb/lancedb.git + https://github.com/lancedb/lancedb + + @@ -62,8 +81,45 @@ + + + ossrh + https://s01.oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + org.apache.maven.plugins + maven-source-plugin + 2.2.1 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + 
maven-javadoc-plugin + 2.9.1 + + + attach-javadocs + + jar + + + + org.apache.maven.plugins maven-checkstyle-plugin @@ -126,4 +182,82 @@ + + + + jdk8 + + [1.8,1.8.999] + + + 1.8 + 1.8 + + + + jdk11+ + + [11,) + + + 11 + 11 + + + + + maven-surefire-plugin + 3.2.5 + + --add-opens=java.base/java.nio=ALL-UNNAMED + + false + + + + + + + deploy-to-ossrh + + + + org.sonatype.central + central-publishing-maven-plugin + 0.4.0 + true + + ossrh + true + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.13 + true + + ossrh + https://s01.oss.sonatype.org/ + true + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.5 + + + sign-artifacts + verify + + sign + + + + + + + + From 1d61717d0e1a7cb3c818ab598f88fbf3d46c75bd Mon Sep 17 00:00:00 2001 From: Philip Zeyliger Date: Thu, 5 Sep 2024 13:18:24 -0700 Subject: [PATCH 5/9] docs: fix get_registry() usage (#1601) Docs used `get_registry.get(...)` whereas what works is `get_registry().get(...)`. Fixing the two instances I found. I tested the open clip version by trying it locally in a Jupyter notebook. 
--- .../multimodal_embedding_functions/imagebind_embedding.md | 2 +- .../multimodal_embedding_functions/openclip_embedding.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md index 4aa8b3db..72a7e825 100644 --- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md +++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md @@ -17,7 +17,7 @@ from lancedb.pydantic import LanceModel, Vector from lancedb.embeddings import get_registry db = lancedb.connect(tmp_path) -func = get_registry.get("imagebind").create() +func = get_registry().get("imagebind").create() class ImageBindModel(LanceModel): text: str diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md index bf50dfd2..eb6139f5 100644 --- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md +++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md @@ -20,7 +20,7 @@ from lancedb.pydantic import LanceModel, Vector from lancedb.embeddings import get_registry db = lancedb.connect(tmp_path) -func = get_registry.get("open-clip").create() +func = get_registry().get("open-clip").create() class Images(LanceModel): label: str From 8dcd328dce8ed5482cb604b5a4d608ab2d5c6ee7 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Fri, 6 Sep 2024 10:41:38 +0800 Subject: [PATCH 6/9] feat: support to create table from record batch iterator (#1593) --- python/python/lancedb/db.py | 45 ++++--------- python/python/lancedb/query.py | 2 +- python/python/lancedb/remote/arrow.py | 5 
+- python/python/lancedb/remote/db.py | 26 ++------ python/python/lancedb/table.py | 95 +++++++++++++++++---------- python/python/tests/test_db.py | 37 +++++++++++ 6 files changed, 119 insertions(+), 91 deletions(-) diff --git a/python/python/lancedb/db.py b/python/python/lancedb/db.py index 1c77b299..d2345e4a 100644 --- a/python/python/lancedb/db.py +++ b/python/python/lancedb/db.py @@ -14,7 +14,6 @@ from __future__ import annotations import asyncio -import inspect import os from abc import abstractmethod from pathlib import Path @@ -27,8 +26,13 @@ from pyarrow import fs from lancedb.common import data_to_reader, validate_schema from ._lancedb import connect as lancedb_connect -from .pydantic import LanceModel -from .table import AsyncTable, LanceTable, Table, _sanitize_data, _table_path +from .table import ( + AsyncTable, + LanceTable, + Table, + _table_path, + sanitize_create_table, +) from .util import ( fs_from_uri, get_uri_location, @@ -37,6 +41,7 @@ from .util import ( ) if TYPE_CHECKING: + from .pydantic import LanceModel from datetime import timedelta from ._lancedb import Connection as LanceDbConnection @@ -722,12 +727,6 @@ class AsyncConnection(object): ... await db.create_table("table4", make_batches(), schema=schema) >>> asyncio.run(iterable_example()) """ - if inspect.isclass(schema) and issubclass(schema, LanceModel): - # convert LanceModel to pyarrow schema - # note that it's possible this contains - # embedding function metadata already - schema = schema.to_arrow_schema() - metadata = None # Defining defaults here and not in function prototype. 
In the future @@ -738,31 +737,9 @@ class AsyncConnection(object): if fill_value is None: fill_value = 0.0 - if data is not None: - data, schema = _sanitize_data( - data, - schema, - metadata=metadata, - on_bad_vectors=on_bad_vectors, - fill_value=fill_value, - ) - - if schema is None: - if data is None: - raise ValueError("Either data or schema must be provided") - elif hasattr(data, "schema"): - schema = data.schema - elif isinstance(data, Iterable): - if metadata: - raise TypeError( - ( - "Persistent embedding functions not yet " - "supported for generator data input" - ) - ) - - if metadata: - schema = schema.with_metadata(metadata) + data, schema = sanitize_create_table( + data, schema, metadata, on_bad_vectors, fill_value + ) validate_schema(schema) if exist_ok is None: diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 9c9c69ae..c6b14c0f 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -852,7 +852,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder): ) if len(row_ids) == 0: empty_schema = pa.schema([pa.field("_score", pa.float32())]) - return pa.Table.from_pylist([], schema=empty_schema) + return pa.Table.from_batches([], schema=empty_schema) scores = pa.array(scores) output_tbl = self._table.to_lance().take(row_ids, columns=self._columns) output_tbl = output_tbl.append_column("_score", scores) diff --git a/python/python/lancedb/remote/arrow.py b/python/python/lancedb/remote/arrow.py index 753087cf..ac39e247 100644 --- a/python/python/lancedb/remote/arrow.py +++ b/python/python/lancedb/remote/arrow.py @@ -11,12 +11,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Iterable, Union import pyarrow as pa -def to_ipc_binary(table: pa.Table) -> bytes: +def to_ipc_binary(table: Union[pa.Table, Iterable[pa.RecordBatch]]) -> bytes: """Serialize a PyArrow Table to IPC binary.""" sink = pa.BufferOutputStream() + if isinstance(table, Iterable): + table = pa.Table.from_batches(table) with pa.ipc.new_stream(sink, table.schema) as writer: writer.write_table(table) return sink.getvalue().to_pybytes() diff --git a/python/python/lancedb/remote/db.py b/python/python/lancedb/remote/db.py index 0dd6bb6d..bb7554a4 100644 --- a/python/python/lancedb/remote/db.py +++ b/python/python/lancedb/remote/db.py @@ -11,7 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import inspect import logging import uuid from concurrent.futures import ThreadPoolExecutor @@ -26,7 +25,7 @@ from ..common import DATA from ..db import DBConnection from ..embeddings import EmbeddingFunctionConfig from ..pydantic import LanceModel -from ..table import Table, _sanitize_data +from ..table import Table, sanitize_create_table from ..util import validate_table_name from .arrow import to_ipc_binary from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient @@ -228,8 +227,6 @@ class RemoteDBConnection(DBConnection): """ validate_table_name(name) - if data is None and schema is None: - raise ValueError("Either data or schema must be provided.") if embedding_functions is not None: logging.warning( "embedding_functions is not yet supported on LanceDB Cloud." 
@@ -239,24 +236,9 @@ class RemoteDBConnection(DBConnection): if mode is not None: logging.warning("mode is not yet supported on LanceDB Cloud.") - if inspect.isclass(schema) and issubclass(schema, LanceModel): - # convert LanceModel to pyarrow schema - # note that it's possible this contains - # embedding function metadata already - schema = schema.to_arrow_schema() - - if data is not None: - data, schema = _sanitize_data( - data, - schema, - metadata=None, - on_bad_vectors=on_bad_vectors, - fill_value=fill_value, - ) - else: - if schema is None: - raise ValueError("Either data or schema must be provided") - data = pa.Table.from_pylist([], schema=schema) + data, schema = sanitize_create_table( + data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value + ) from .table import RemoteTable diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 7d3ebaa0..53e624a0 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -117,15 +117,50 @@ def _sanitize_data( data = _sanitize_schema( data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value ) + if schema is None: + schema = data.schema elif isinstance(data, Iterable): data = _to_record_batch_generator( data, schema, metadata, on_bad_vectors, fill_value ) + if schema is None: + data, schema = _generator_to_data_and_schema(data) + if schema is None: + raise ValueError("Cannot infer schema from generator data") else: raise TypeError(f"Unsupported data type: {type(data)}") return data, schema +def sanitize_create_table( + data, schema, metadata=None, on_bad_vectors="error", fill_value=0.0 +): + if inspect.isclass(schema) and issubclass(schema, LanceModel): + # convert LanceModel to pyarrow schema + # note that it's possible this contains + # embedding function metadata already + schema = schema.to_arrow_schema() + + if data is not None: + data, schema = _sanitize_data( + data, + schema, + metadata=metadata, + on_bad_vectors=on_bad_vectors, + 
fill_value=fill_value, + ) + if schema is None: + if data is None: + raise ValueError("Either data or schema must be provided") + elif hasattr(data, "schema"): + schema = data.schema + + if metadata: + schema = schema.with_metadata(metadata) + + return data, schema + + def _schema_from_hf(data, schema): """ Extract pyarrow schema from HuggingFace DatasetDict @@ -187,8 +222,30 @@ def _append_vector_col(data: pa.Table, metadata: dict, schema: Optional[pa.Schem return data +def _generator_to_data_and_schema( + data: Iterable, +) -> Tuple[Iterable[pa.RecordBatch], pa.Schema]: + def _with_first_generator(first, data): + yield first + yield from data + + first = next(data, None) + schema = None + if isinstance(first, pa.RecordBatch): + schema = first.schema + data = _with_first_generator(first, data) + elif isinstance(first, pa.Table): + schema = first.schema + data = _with_first_generator(first.to_batches(), data) + return data, schema + + def _to_record_batch_generator( - data: Iterable, schema, metadata, on_bad_vectors, fill_value + data: Iterable, + schema, + metadata, + on_bad_vectors, + fill_value, ): for batch in data: # always convert to table because we need to sanitize the data @@ -1569,12 +1626,6 @@ class LanceTable(Table): The embedding functions to use when creating the table. 
""" tbl = LanceTable(db, name) - if inspect.isclass(schema) and issubclass(schema, LanceModel): - # convert LanceModel to pyarrow schema - # note that it's possible this contains - # embedding function metadata already - schema = schema.to_arrow_schema() - metadata = None if embedding_functions is not None: # If we passed in embedding functions explicitly @@ -1583,33 +1634,11 @@ class LanceTable(Table): registry = EmbeddingFunctionRegistry.get_instance() metadata = registry.get_table_metadata(embedding_functions) - if data is not None: - data, schema = _sanitize_data( - data, - schema, - metadata=metadata, - on_bad_vectors=on_bad_vectors, - fill_value=fill_value, - ) + data, schema = sanitize_create_table( + data, schema, metadata, on_bad_vectors, fill_value + ) - if schema is None: - if data is None: - raise ValueError("Either data or schema must be provided") - elif hasattr(data, "schema"): - schema = data.schema - elif isinstance(data, Iterable): - if metadata: - raise TypeError( - ( - "Persistent embedding functions not yet " - "supported for generator data input" - ) - ) - - if metadata: - schema = schema.with_metadata(metadata) - - empty = pa.Table.from_pylist([], schema=schema) + empty = pa.Table.from_batches([], schema=schema) try: lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode) except OSError as err: diff --git a/python/python/tests/test_db.py b/python/python/tests/test_db.py index 373ae2b6..5b7f3c42 100644 --- a/python/python/tests/test_db.py +++ b/python/python/tests/test_db.py @@ -233,6 +233,43 @@ def test_create_mode(tmp_path): assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"] +def test_create_table_from_iterator(tmp_path): + db = lancedb.connect(tmp_path) + + def gen_data(): + for _ in range(10): + yield pa.RecordBatch.from_arrays( + [ + pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2)), + pa.array(["foo"]), + pa.array([10.0]), + ], + ["vector", "item", "price"], + ) + + table = db.create_table("test", data=gen_data()) + 
assert table.count_rows() == 10 + + +@pytest.mark.asyncio +async def test_create_table_from_iterator_async(tmp_path): + db = await lancedb.connect_async(tmp_path) + + def gen_data(): + for _ in range(10): + yield pa.RecordBatch.from_arrays( + [ + pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2)), + pa.array(["foo"]), + pa.array([10.0]), + ], + ["vector", "item", "price"], + ) + + table = await db.create_table("test", data=gen_data()) + assert await table.count_rows() == 10 + + def test_create_exist_ok(tmp_path): db = lancedb.connect(tmp_path) data = pd.DataFrame( From 7eb3b52297dea2ca63b2f454b3fbd19ddb1022e5 Mon Sep 17 00:00:00 2001 From: Jon X Date: Fri, 6 Sep 2024 12:08:19 +0800 Subject: [PATCH 7/9] docs: added a blank line between a paragraph and a list block (#1604) Although the markdown renders well on GitHub (GFM style?), it seems that a blank line must be inserted between a paragraph and a list block for it to render well with `mkdocs`. See also the web page: https://lancedb.github.io/lancedb/concepts/index_hnsw/ --- docs/src/concepts/index_hnsw.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/src/concepts/index_hnsw.md b/docs/src/concepts/index_hnsw.md index 9e8dc948..8bfaf39c 100644 --- a/docs/src/concepts/index_hnsw.md +++ b/docs/src/concepts/index_hnsw.md @@ -15,11 +15,13 @@ HNSW also combines this with the ideas behind a classic 1-dimensional search dat ## k-Nearest Neighbor Graphs and k-approximate Nearest neighbor Graphs The k-nearest neighbor graph actually predates its use for ANN search. Its construction is quite simple: + * Each vector in the dataset is given an associated vertex. * Each vertex has outgoing edges to its k nearest neighbors. That is, the k closest other vertices by Euclidean distance between the two corresponding vectors. This can be thought of as a "friend list" for the vertex. * For some applications (including nearest-neighbor search), the incoming edges are also added.
Eventually, it was realized that the following greedy search method over such a graph typically results in good approximate nearest neighbors: + * Given a query vector, start at some fixed "entry point" vertex (e.g. the approximate center node). * Look at that vertex's neighbors. If any of them are closer to the query vector than the current vertex, then move to that vertex. * Repeat until a local optimum is found. @@ -36,15 +38,18 @@ One downside of k-NN and k-ANN graphs alone is that one must typically build the ## HNSW: Hierarchical Navigable Small Worlds HNSW builds on k-ANN in two main ways: + * Instead of getting the k-approximate nearest neighbors for a large value of k, it sparsifies the k-ANN graph using a carefully chosen "edge pruning" heuristic, allowing for the number of edges per vertex to be limited to a relatively small constant. * The "entry point" vertex is chosen dynamically using a recursively constructed data structure on a subset of the data, similarly to a skip list. This recursive structure can be thought of as separating into layers: + * At the bottom-most layer, an k-ANN graph on the whole dataset is present. * At the second layer, a k-ANN graph on a fraction of the dataset (e.g. 10%) is present. * At the Lth layer, a k-ANN graph is present. It is over a (constant) fraction (e.g. 10%) of the vectors/vertices present in the L-1th layer. Then the greedy search routine operates as follows: + * At the top layer (using an arbitrary vertex as an entry point), use the greedy local search routine on the k-ANN graph to get an approximate nearest neighbor at that layer. * Using the approximate nearest neighbor found in the previous layer as an entry point, find an approximate nearest neighbor in the next layer with the same method. * Repeat until the bottom-most layer is reached. Then use the entry point to find multiple nearest neighbors (e.g. top 10). 
From cd32944e5499028f4ff6d2e0c8578949671e00a1 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 6 Sep 2024 14:10:02 -0700 Subject: [PATCH 8/9] feat: upgrade lance to v0.17.0 (#1608) Changelog: https://github.com/lancedb/lance/releases/tag/v0.17.0 Highlights: * You can do "phrase queries" by adding double quotes around phrases (multiple tokens) in FTS. Added follow ups in: https://github.com/lancedb/lancedb/issues/1611 --- Cargo.toml | 12 ++++++------ python/pyproject.toml | 2 +- rust/ffi/node/src/table.rs | 2 +- rust/lancedb/src/table.rs | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 522342e8..e16528c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,12 +20,12 @@ keywords = ["lancedb", "lance", "database", "vector", "search"] categories = ["database-implementations"] [workspace.dependencies] -lance = { "version" = "=0.16.1", "features" = ["dynamodb"] } -lance-index = { "version" = "=0.16.1" } -lance-linalg = { "version" = "=0.16.1" } -lance-testing = { "version" = "=0.16.1" } -lance-datafusion = { "version" = "=0.16.1" } -lance-encoding = { "version" = "=0.16.1" } +lance = { "version" = "=0.17.0", "features" = ["dynamodb"] } +lance-index = { "version" = "=0.17.0" } +lance-linalg = { "version" = "=0.17.0" } +lance-testing = { "version" = "=0.17.0" } +lance-datafusion = { "version" = "=0.17.0" } +lance-encoding = { "version" = "=0.17.0" } # Note that this one does not include pyarrow arrow = { version = "52.2", optional = false } arrow-array = "52.2" diff --git a/python/pyproject.toml b/python/pyproject.toml index 5e22fd47..7d41d891 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -3,7 +3,7 @@ name = "lancedb" # version in Cargo.toml dependencies = [ "deprecation", - "pylance==0.16.1", + "pylance==0.17.0", "ratelimiter~=1.0", "requests>=2.31.0", "retry>=0.9.2", diff --git a/rust/ffi/node/src/table.rs b/rust/ffi/node/src/table.rs index 10e7f19b..3e49d742 100644 --- a/rust/ffi/node/src/table.rs +++ 
b/rust/ffi/node/src/table.rs @@ -391,7 +391,7 @@ impl JsTable { materialize_deletions_threshold.value(&mut cx) as f32; } if let Some(num_threads) = js_options.get_opt::<JsNumber, _, _>(&mut cx, "numThreads")? { - options.num_threads = num_threads.value(&mut cx) as usize; + options.num_threads = Some(num_threads.value(&mut cx) as usize); } rt.spawn(async move { diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs index f1942f0e..88c23533 100644 --- a/rust/lancedb/src/table.rs +++ b/rust/lancedb/src/table.rs @@ -2788,7 +2788,7 @@ mod tests { .get_index_type(index_uuid) .await .unwrap(), - Some("IVF".to_string()) + Some("IVF_PQ".to_string()) ); assert_eq!( table From 029b01bbbf800a96a9764fa6c93acf10dfad040b Mon Sep 17 00:00:00 2001 From: James Wu Date: Fri, 6 Sep 2024 20:28:05 -0700 Subject: [PATCH 9/9] feat: enable phrase_query(bool) for hybrid search queries (#1578) first off, apologies for any folly since i'm new to contributing to lancedb. this PR is the continuation of [a discord thread](https://discord.com/channels/1030247538198061086/1030247538667827251/1278844345713299599): ## user story here's the lance db search query i'd like to run: ``` def search(phrase): logger.info(f'Searching for phrase: {phrase}') phrase_embedding = get_embedding(phrase) df = (table.search((phrase_embedding, phrase), query_type='hybrid') .limit(10).to_list()) logger.info(f'Success search with row count: {len(df)}') search('howdy (howdy)') search('howdy(howdy)') ``` the second search fails due to `ValueError: Syntax Error: howdy(howdy)` i saw on the [docs](https://lancedb.github.io/lancedb/fts/#phrase-queries-vs-terms-queries) that i can use `phrase_query()` to [enable a flag](https://github.com/lancedb/lancedb/blob/main/python/python/lancedb/query.py#L790-L792) to wrap the query in double quotes (as well as sanitize single quotes) prior to sending the query to search.
this works for [normal FTS](https://lancedb.github.io/lancedb/fts/), but the command is unavailable on [hybrid search](https://lancedb.github.io/lancedb/hybrid_search/hybrid_search/). ## changes i added `phrase_query()` function to `LanceHybridQueryBuilder` by propagating the call down to its `self._fts_query` object. i'm not too familiar with the codebase and am not sure if this is the best way to implement the functionality. feel free to riff on this PR or discard ## tests ``` (lancedb) JamesMPB:python james$ pwd /Users/james/src/lancedb/python (lancedb) JamesMPB:python james$ pytest python/tests/test_table.py python/tests/test_table.py ....................................... [100%] ====================================================== 39 passed, 1 warning in 2.23s ======================================================= ``` --- python/python/lancedb/query.py | 22 +++++++++++++++++++++- python/python/tests/test_table.py | 12 +++++++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index c6b14c0f..13b0460c 100644 --- a/python/python/lancedb/query.py +++ b/python/python/lancedb/query.py @@ -42,9 +42,9 @@ if TYPE_CHECKING: import PIL import polars as pl - from .common import VEC from ._lancedb import Query as LanceQuery from ._lancedb import VectorQuery as LanceVectorQuery + from .common import VEC from .pydantic import LanceModel from .table import Table @@ -965,6 +965,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder): self._reranker = RRFReranker() self._nprobes = None self._refine_factor = None + self._phrase_query = False def _validate_query(self, query, vector=None, text=None): if query is not None and (vector is not None or text is not None): @@ -986,6 +987,23 @@ class LanceHybridQueryBuilder(LanceQueryBuilder): return vector_query, text_query + def phrase_query(self, phrase_query: bool = True) -> LanceHybridQueryBuilder: + """Set whether to use phrase query.
+ + Parameters + ---------- + phrase_query: bool, default True + If True, then the query will be wrapped in quotes and + double quotes replaced by single quotes. + + Returns + ------- + LanceHybridQueryBuilder + The LanceHybridQueryBuilder object. + """ + self._phrase_query = phrase_query + return self + def to_arrow(self) -> pa.Table: vector_query, fts_query = self._validate_query( self._query, self._vector, self._text @@ -1012,6 +1030,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder): if self._with_row_id: self._vector_query.with_row_id(True) self._fts_query.with_row_id(True) + if self._phrase_query: + self._fts_query.phrase_query(True) if self._nprobes: self._vector_query.nprobes(self._nprobes) if self._refine_factor: diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py index 6ca2f5f1..65cf0c9d 100644 --- a/python/python/tests/test_table.py +++ b/python/python/tests/test_table.py @@ -2,13 +2,13 @@ # SPDX-FileCopyrightText: Copyright The Lance Authors import functools +import os from copy import copy from datetime import date, datetime, timedelta from pathlib import Path from time import sleep from typing import List from unittest.mock import PropertyMock, patch -import os import lance import lancedb @@ -907,6 +907,16 @@ def test_hybrid_search(db, tmp_path): "Our father who art in heaven", query_type="hybrid" ).to_pydantic(MyTable) + # Test that double and single quote characters are handled with phrase_query() + ( + table.search( + '"Aren\'t you a little short for a stormtrooper?" -- Leia', + query_type="hybrid", + ) + .phrase_query(True) + .to_pydantic(MyTable) + ) + assert result1 == result3 # with post filters