From 2b8e872be006d900317aa0b14b89a6a734432f8b Mon Sep 17 00:00:00 2001
From: Jon X <ousiax@hotmail.com>
Date: Thu, 5 Sep 2024 17:10:38 +0800
Subject: [PATCH 1/9] docs: removed the unnecessary fence code tag (#1599)

---
 docs/src/guides/tables.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/src/guides/tables.md b/docs/src/guides/tables.md
index 4be12858..9de2ba94 100644
--- a/docs/src/guides/tables.md
+++ b/docs/src/guides/tables.md
@@ -416,7 +416,6 @@ You can create an empty table for scenarios where you want to add data to the ta
 
 === "Python"
 
-    ```python
 
     An empty table can be initialized via a PyArrow schema.
 

From b24810a01146bfc9ce52e3bc7bbcaa0b9f3db8c3 Mon Sep 17 00:00:00 2001
From: Gagan Bhullar <g.deepsingh1@gmail.com>
Date: Thu, 5 Sep 2024 09:33:07 -0600
Subject: [PATCH 2/9] feat(python, rust): expose offset in query (#1556)

PR is part of #1555
---
 python/python/lancedb/_lancedb.pyi |  2 ++
 python/python/lancedb/query.py     | 38 ++++++++++++++++++++++++++++++
 python/python/lancedb/table.py     |  1 +
 python/python/tests/test_query.py  | 11 +++++++++
 python/src/query.rs                |  8 +++++++
 rust/lancedb/src/query.rs          | 33 ++++++++++++++++++++++++++
 rust/lancedb/src/table.rs          |  9 ++++++-
 7 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/python/python/lancedb/_lancedb.pyi b/python/python/lancedb/_lancedb.pyi
index c4642637..55c3db99 100644
--- a/python/python/lancedb/_lancedb.pyi
+++ b/python/python/lancedb/_lancedb.pyi
@@ -73,6 +73,7 @@ class Query:
     def where(self, filter: str): ...
     def select(self, columns: Tuple[str, str]): ...
     def limit(self, limit: int): ...
+    def offset(self, offset: int): ...
     def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
     def nearest_to_text(self, query: dict) -> Query: ...
     async def execute(self, max_batch_legnth: Optional[int]) -> RecordBatchStream: ...
@@ -83,6 +84,7 @@ class VectorQuery:
     def select(self, columns: List[str]): ...
     def select_with_projection(self, columns: Tuple[str, str]): ...
     def limit(self, limit: int): ...
+    def offset(self, offset: int): ...
     def column(self, column: str): ...
     def distance_type(self, distance_type: str): ...
     def postfilter(self): ...
diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index 9da90987..9c9c69ae 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -85,6 +85,8 @@ class Query(pydantic.BaseModel):
 
         - See discussion in [Querying an ANN Index][querying-an-ann-index] for
           tuning advice.
+    offset: int
+        The offset to start fetching results from
     """
 
     vector_column: Optional[str] = None
@@ -119,6 +121,8 @@ class Query(pydantic.BaseModel):
 
     with_row_id: bool = False
 
+    offset: int = 0
+
 
 class LanceQueryBuilder(ABC):
     """An abstract query builder. Subclasses are defined for vector search,
@@ -233,6 +237,7 @@ class LanceQueryBuilder(ABC):
     def __init__(self, table: "Table"):
         self._table = table
         self._limit = 10
+        self._offset = 0
         self._columns = None
         self._where = None
         self._prefilter = False
@@ -371,6 +376,25 @@ class LanceQueryBuilder(ABC):
             self._limit = limit
         return self
 
+    def offset(self, offset: int) -> LanceQueryBuilder:
+        """Set the offset for the results.
+
+        Parameters
+        ----------
+        offset: int
+            Number of leading results to skip; None or values <= 0 mean 0.
+
+        Returns
+        -------
+        LanceQueryBuilder
+            The LanceQueryBuilder object.
+        """
+        if offset is None or offset <= 0:
+            self._offset = 0
+        else:
+            self._offset = offset
+        return self
+
     def select(self, columns: Union[list[str], dict[str, str]]) -> LanceQueryBuilder:
         """Set the columns to return.
 
@@ -649,6 +673,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
             refine_factor=self._refine_factor,
             vector_column=self._vector_column,
             with_row_id=self._with_row_id,
+            offset=self._offset,
         )
         result_set = self._table._execute_query(query, batch_size)
         if self._reranker is not None:
@@ -780,6 +805,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
                 "columns": self._fts_columns,
             },
             vector=[],
+            offset=self._offset,
         )
         results = self._table._execute_query(query)
         results = results.read_all()
@@ -1220,6 +1246,18 @@ class AsyncQueryBase(object):
         self._inner.limit(limit)
         return self
 
+    def offset(self, offset: int) -> AsyncQuery:
+        """
+        Set the offset for the results.
+
+        Parameters
+        ----------
+        offset: int
+            The offset to start fetching results from.
+        """
+        self._inner.offset(offset)
+        return self
+
     async def to_batches(
         self, *, max_batch_length: Optional[int] = None
     ) -> AsyncRecordBatchReader:
diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index 46df91c2..7d3ebaa0 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -1708,6 +1708,7 @@ class LanceTable(Table):
             full_text_query=query.full_text_query,
             with_row_id=query.with_row_id,
             batch_size=batch_size,
+            offset=query.offset,
         ).to_reader()
 
     def _do_merge(
diff --git a/python/python/tests/test_query.py b/python/python/tests/test_query.py
index ae50c991..11750e4d 100644
--- a/python/python/tests/test_query.py
+++ b/python/python/tests/test_query.py
@@ -51,6 +51,7 @@ class MockTable:
                 "refine_factor": query.refine_factor,
             },
             batch_size=batch_size,
+            offset=query.offset,
         ).to_reader()
 
 
@@ -106,6 +107,13 @@ def test_cast(table):
     assert r0.float_field == 1.0
 
 
+def test_offset(table):
+    results_without_offset = LanceVectorQueryBuilder(table, [0, 0], "vector")
+    assert len(results_without_offset.to_pandas()) == 2
+    results_with_offset = LanceVectorQueryBuilder(table, [0, 0], "vector").offset(1)
+    assert len(results_with_offset.to_pandas()) == 1
+
+
 def test_query_builder(table):
     rs = (
         LanceVectorQueryBuilder(table, [0, 0], "vector")
@@ -269,7 +277,10 @@ async def test_query_async(table_async: AsyncTable):
         table_async.query().select({"foo": "id", "bar": "id + 1"}),
         expected_columns=["foo", "bar"],
     )
+
     await check_query(table_async.query().limit(1), expected_num_rows=1)
+    await check_query(table_async.query().offset(1), expected_num_rows=1)
+
     await check_query(
         table_async.query().nearest_to(pa.array([1, 2])), expected_num_rows=2
     )
diff --git a/python/src/query.rs b/python/src/query.rs
index f88e60b4..42bd4a13 100644
--- a/python/src/query.rs
+++ b/python/src/query.rs
@@ -64,6 +64,10 @@ impl Query {
         self.inner = self.inner.clone().limit(limit as usize);
     }
 
+    pub fn offset(&mut self, offset: u32) {
+        self.inner = self.inner.clone().offset(offset as usize);
+    }
+
     pub fn nearest_to(&mut self, vector: Bound<'_, PyAny>) -> PyResult<VectorQuery> {
         let data: ArrayData = ArrayData::from_pyarrow_bound(&vector)?;
         let array = make_array(data);
@@ -138,6 +142,10 @@ impl VectorQuery {
         self.inner = self.inner.clone().limit(limit as usize);
     }
 
+    pub fn offset(&mut self, offset: u32) {
+        self.inner = self.inner.clone().offset(offset as usize);
+    }
+
     pub fn column(&mut self, column: String) {
         self.inner = self.inner.clone().column(&column);
     }
diff --git a/rust/lancedb/src/query.rs b/rust/lancedb/src/query.rs
index 714200ae..d2895668 100644
--- a/rust/lancedb/src/query.rs
+++ b/rust/lancedb/src/query.rs
@@ -338,6 +338,12 @@ pub trait QueryBase {
     /// it will default to 10.
     fn limit(self, limit: usize) -> Self;
 
+    /// Set the offset of the query.
+    ///
+    /// By default, it fetches starting with the first row.
+    /// This method can be used to skip the first `offset` rows.
+    fn offset(self, offset: usize) -> Self;
+
     /// Only return rows which match the filter.
     ///
     /// The filter should be supplied as an SQL query string.  For example:
@@ -408,6 +414,11 @@ impl<T: HasQuery> QueryBase for T {
         self
     }
 
+    fn offset(mut self, offset: usize) -> Self {
+        self.mut_query().offset = Some(offset);
+        self
+    }
+
     fn only_if(mut self, filter: impl AsRef<str>) -> Self {
         self.mut_query().filter = Some(filter.as_ref().to_string());
         self
@@ -520,6 +531,9 @@ pub struct Query {
     /// limit the number of rows to return.
     pub(crate) limit: Option<usize>,
 
+    /// Offset of the query.
+    pub(crate) offset: Option<usize>,
+
     /// Apply filter to the returned rows.
     pub(crate) filter: Option<String>,
 
@@ -541,6 +555,7 @@ impl Query {
         Self {
             parent,
             limit: None,
+            offset: None,
             filter: None,
             full_text_search: None,
             select: Select::All,
@@ -858,6 +873,7 @@ mod tests {
         let query = table
             .query()
             .limit(100)
+            .offset(1)
             .nearest_to(&[9.8, 8.7])
             .unwrap()
             .nprobes(1000)
@@ -870,6 +886,7 @@ mod tests {
             new_vector
         );
         assert_eq!(query.base.limit.unwrap(), 100);
+        assert_eq!(query.base.offset.unwrap(), 1);
         assert_eq!(query.nprobes, 1000);
         assert!(query.use_index);
         assert_eq!(query.distance_type, Some(DistanceType::Cosine));
@@ -916,10 +933,26 @@ mod tests {
         let result = query.execute().await;
         let mut stream = result.expect("should have result");
         // should only have one batch
+
         while let Some(batch) = stream.next().await {
             // pre filter should return 10 rows
             assert!(batch.expect("should be Ok").num_rows() == 10);
         }
+
+        let query = table
+            .query()
+            .limit(10)
+            .offset(1)
+            .only_if(String::from("id % 2 == 0"))
+            .nearest_to(&[0.1; 4])
+            .unwrap();
+        let result = query.execute().await;
+        let mut stream = result.expect("should have result");
+        // should only have one batch
+        while let Some(batch) = stream.next().await {
+            // pre filter with offset 1 should return 9 rows
+            assert!(batch.expect("should be Ok").num_rows() == 9);
+        }
     }
 
     #[tokio::test]
diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs
index 12254819..f1942f0e 100644
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -1852,9 +1852,16 @@ impl TableInternal for NativeTable {
                 query_vector,
                 query.base.limit.unwrap_or(DEFAULT_TOP_K),
             )?;
+            scanner.limit(
+                query.base.limit.map(|limit| limit as i64),
+                query.base.offset.map(|offset| offset as i64),
+            )?;
         } else {
             // If there is no vector query, it's ok to not have a limit
-            scanner.limit(query.base.limit.map(|limit| limit as i64), None)?;
+            scanner.limit(
+                query.base.limit.map(|limit| limit as i64),
+                query.base.offset.map(|offset| offset as i64),
+            )?;
         }
 
         scanner.nprobs(query.nprobes);

From 2bc7dca3ca5ba0010e324d2306aea7e64ec42049 Mon Sep 17 00:00:00 2001
From: Rithik Kumar <46047011+rithikJha@users.noreply.github.com>
Date: Thu, 5 Sep 2024 22:19:08 +0530
Subject: [PATCH 3/9] docs: add changes to Embeddings-> Available models->
 overview page (#1596)

adding features and improvements to - Manage Embeddings page

Before:
![Screenshot 2024-09-04
223743](https://github.com/user-attachments/assets/f1e116b5-6ebb-4d59-9d29-b20084998cd0)

After:


![Screenshot 2024-09-05
214214](https://github.com/user-attachments/assets/8c94318e-68af-447e-97e1-8153860a2914)

![Screenshot 2024-09-05
213623](https://github.com/user-attachments/assets/55c82770-6df9-4bab-9c5c-1ea1552138de)

![Screenshot 2024-09-05
215931](https://github.com/user-attachments/assets/9bfac7d4-16a6-454e-801e-50789ff75261)
---
 docs/mkdocs.yml                               |  7 ++
 .../cohere_embedding.md                       | 15 +--
 .../embeddings/default_embedding_functions.md | 98 ++++++++++++++-----
 3 files changed, 91 insertions(+), 29 deletions(-)

diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 0230caef..bb0c456c 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -26,6 +26,7 @@ theme:
     - content.code.copy
     - content.tabs.link
     - content.action.edit
+    - content.tooltips
     - toc.follow
     - navigation.top
     - navigation.tabs
@@ -35,6 +36,7 @@ theme:
     - navigation.instant
   icon:
     repo: fontawesome/brands/github
+    annotation: material/arrow-right-circle
   custom_dir: overrides
 
 plugins:
@@ -76,7 +78,12 @@ markdown_extensions:
   - pymdownx.tabbed:
       alternate_style: true
   - md_in_html
+  - abbr
   - attr_list
+  - pymdownx.snippets
+  - pymdownx.emoji:
+      emoji_index: !!python/name:material.extensions.emoji.twemoji
+      emoji_generator: !!python/name:material.extensions.emoji.to_svg
 
 nav:
   - Home:
diff --git a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
index 39eba18c..fd99f2ca 100644
--- a/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
+++ b/docs/src/embeddings/available_embedding_models/text_embedding_functions/cohere_embedding.md
@@ -4,13 +4,14 @@ Using cohere API requires cohere package, which can be installed using `pip inst
 You also need to set the `COHERE_API_KEY` environment variable to use the Cohere API.
 
 Supported models are:
-* embed-english-v3.0
-* embed-multilingual-v3.0
-* embed-english-light-v3.0
-* embed-multilingual-light-v3.0
-* embed-english-v2.0
-* embed-english-light-v2.0
-* embed-multilingual-v2.0
+
+- embed-english-v3.0
+- embed-multilingual-v3.0
+- embed-english-light-v3.0
+- embed-multilingual-light-v3.0
+- embed-english-v2.0
+- embed-english-light-v2.0
+- embed-multilingual-v2.0
 
 
 Supported parameters (to be passed in `create` method) are:
diff --git a/docs/src/embeddings/default_embedding_functions.md b/docs/src/embeddings/default_embedding_functions.md
index ced97048..5457dc9f 100644
--- a/docs/src/embeddings/default_embedding_functions.md
+++ b/docs/src/embeddings/default_embedding_functions.md
@@ -1,30 +1,84 @@
-There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models.
+# 📚 Available Embedding Models
 
-## Text embedding functions
-Contains the text embedding functions registered by default.
+There are various embedding functions available out of the box with LanceDB to manage your embeddings implicitly. We're actively working on adding other popular embedding APIs and models. 🚀
 
-* Embedding functions have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with exponential backoff. 
-* Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7.
+Before jumping into the list of available models, let's understand how to initialize and configure an embedding model for use in our code:
 
-**Available Text Embeddings**:
+!!! example "Example usage"
+    ```python
+    model = (
+        get_registry().get("openai").create(name="text-embedding-ada-002")
+    )
+    ```
 
-- [Sentence Transformers](available_embedding_models/text_embedding_functions/sentence_transformers.md)
-- [Huggingface Embedding Models](available_embedding_models/text_embedding_functions/huggingface_embedding.md)
-- [Ollama Embeddings](available_embedding_models/text_embedding_functions/ollama_embedding.md)
-- [OpenAI Embeddings](available_embedding_models/text_embedding_functions/openai_embedding.md)
-- [Instructor Embeddings](available_embedding_models/text_embedding_functions/instructor_embedding.md)
-- [Gemini Embeddings](available_embedding_models/text_embedding_functions/gemini_embedding.md)
-- [Cohere Embeddings](available_embedding_models/text_embedding_functions/cohere_embedding.md)
-- [Jina Embeddings](available_embedding_models/text_embedding_functions/jina_embedding.md)
-- [AWS Bedrock Text Embedding Functions](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md)
-- [IBM Watsonx.ai Embeddings](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md)
+Now let's understand the above syntax: 
+```python
+model = get_registry().get("model_id").create(...params)
+```
+**This👆 line effectively creates a configured instance of an `embedding function` with `model` of choice that is ready for use.**
+
+- `get_registry()` : This function call returns an `EmbeddingFunctionRegistry` instance. This registry manages the registration and retrieval of embedding functions.
+
+- `.get("model_id")` : This method is called on the registry object and retrieves the **embedding function** associated with the `"model_id"` (1).
+    { .annotate }
+
+    1.  Hover over the names in table below to find out the `model_id` of different embedding functions.
+
+- `.create(...params)` : This method is called on the object returned by the `get` method. It instantiates an embedding model function using the **specified parameters**.
+
+??? question "What parameters does the `.create(...params)` method accept?"
+    **Check out the documentation of the specific embedding models (links in the table below👇) to see which parameters each one takes**.
+
+!!! tip "Moving on"
+    Now that we know how to get the **desired embedding model** and use it in our code, let's explore the comprehensive **list** of embedding models **supported by LanceDB**, in the tables below.
+
+## Text Embedding Functions 📝 
+These functions are registered by default to handle text embeddings.
+
+- 🔄 **Embedding functions** have an inbuilt rate limit handler wrapper for source and query embedding function calls that retry with **exponential backoff**. 
+
+- 🌕 Each `EmbeddingFunction` implementation automatically takes `max_retries` as an argument which has the default value of 7. 
+
+🌟 **Available Text Embeddings**
+
+| **Embedding** :material-information-outline:{ title="Hover over the name to find out the model_id" } | **Description** | **Documentation** |
+|-----------|-------------|---------------|
+| [**Sentence Transformers**](available_embedding_models/text_embedding_functions/sentence_transformers.md "sentence-transformers")  | 🧠 **SentenceTransformers** is a Python framework for state-of-the-art sentence, text, and image embeddings. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/sbert_2.png" alt="Sentence Transformers Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/sentence_transformers.md)|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [**Huggingface Models**](available_embedding_models/text_embedding_functions/huggingface_embedding.md "huggingface") |🤗 We offer support for all **Huggingface** models. The default model is `colbert-ir/colbertv2.0`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/hugging_face.png" alt="Huggingface Icon" width="130" height="35">](available_embedding_models/text_embedding_functions/huggingface_embedding.md) |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [**Ollama Embeddings**](available_embedding_models/text_embedding_functions/ollama_embedding.md "ollama") | 🔍 Generate embeddings via the **Ollama** python library. Ollama supports embedding models, making it possible to build RAG apps. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/Ollama.png" alt="Ollama Icon" width="110" height="35">](available_embedding_models/text_embedding_functions/ollama_embedding.md)|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [**OpenAI Embeddings**](available_embedding_models/text_embedding_functions/openai_embedding.md "openai")| 🔑 **OpenAI’s** text embeddings measure the relatedness of text strings. **LanceDB** supports state-of-the-art embeddings from OpenAI. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/openai.png" alt="OpenAI Icon" width="100" height="35">](available_embedding_models/text_embedding_functions/openai_embedding.md)|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [**Instructor Embeddings**](available_embedding_models/text_embedding_functions/instructor_embedding.md "instructor") | 📚 **Instructor**: An instruction-finetuned text embedding model that can generate text embeddings tailored to any task and domains by simply providing the task instruction, without any finetuning. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/instructor_embedding.png" alt="Instructor Embedding Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/instructor_embedding.md) |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [**Gemini Embeddings**](available_embedding_models/text_embedding_functions/gemini_embedding.md "gemini-text") | 🌌 Google’s Gemini API generates state-of-the-art embeddings for words, phrases, and sentences. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/gemini.png" alt="Gemini Icon" width="95" height="35">](available_embedding_models/text_embedding_functions/gemini_embedding.md) |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [**Cohere Embeddings**](available_embedding_models/text_embedding_functions/cohere_embedding.md "cohere") | 💬 This will help you get started with **Cohere** embedding models using LanceDB. Using cohere API requires cohere package. Install it via `pip`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/cohere.png" alt="Cohere Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/cohere_embedding.md) |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [**Jina Embeddings**](available_embedding_models/text_embedding_functions/jina_embedding.md "jina") | 🔗 World-class embedding models to improve your search and RAG systems. You will need **jina api key**. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="Jina Icon" width="90" height="35">](available_embedding_models/text_embedding_functions/jina_embedding.md) |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [ **AWS Bedrock Functions**](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md "bedrock-text") | ☁️ AWS Bedrock supports multiple base models for generating text embeddings. You need to setup the AWS credentials to use this embedding function. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/aws_bedrock.png" alt="AWS Bedrock Icon" width="120" height="35">](available_embedding_models/text_embedding_functions/aws_bedrock_embedding.md) |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
+| [**IBM Watsonx.ai**](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md "watsonx") | 💡 Generate text embeddings using IBM's watsonx.ai platform. **Note**: watsonx.ai library is an optional dependency. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/watsonx.png" alt="Watsonx Icon" width="140" height="35">](available_embedding_models/text_embedding_functions/ibm_watsonx_ai_embedding.md) |
 
 
-## Multi-modal embedding functions
-Multi-modal embedding functions allow you to query your table using both images and text.
 
-**Available Multi-modal Embeddings** :
+[st-key]: "sentence-transformers"
+[hf-key]: "huggingface"
+[ollama-key]: "ollama"
+[openai-key]: "openai"
+[instructor-key]: "instructor"
+[gemini-key]: "gemini-text"
+[cohere-key]: "cohere"
+[jina-key]: "jina"
+[aws-key]: "bedrock-text"
+[watsonx-key]: "watsonx"
 
-- [OpenClip Embeddings](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md)
-- [Imagebind Embeddings](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)
-- [Jina Embeddings](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md)
\ No newline at end of file
+
+## Multi-modal Embedding Functions🖼️ 
+
+Multi-modal embedding functions allow you to query your table using both images and text. 💬🖼️
+
+🌐 **Available Multi-modal Embeddings**
+
+| Embedding :material-information-outline:{ title="Hover over the name to find out the model_id" }  | Description | Documentation  |
+|-----------|-------------|---------------|
+| [**OpenClip Embeddings**](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md "open-clip") | 🎨 We support CLIP model embeddings using the open source alternative, **open-clip** which supports various customizations. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/openclip_github.png" alt="openclip Icon" width="150" height="35">](available_embedding_models/multimodal_embedding_functions/openclip_embedding.md) |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
+| [**Imagebind Embeddings**](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md "imagebind") | 🌌  We have support for **imagebind model embeddings**. You can download our version of the packaged model via - `pip install imagebind-packaged==0.1.2`. | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/imagebind_meta.png" alt="imagebind Icon" width="150" height="35">](available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md)|
+| [**Jina Multi-modal Embeddings**](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md "jina") | 🔗 **Jina embeddings** can also be used to embed both **text** and **image** data, only some of the models support image data and you can check the detailed documentation. 👉 | [<img src="https://raw.githubusercontent.com/lancedb/assets/main/docs/assets/logos/jina.png" alt="jina Icon" width="90" height="35">](available_embedding_models/multimodal_embedding_functions/jina_multimodal_embedding.md) |
+
+!!! note
+    If you'd like to request support for additional **embedding functions**, please feel free to open an issue on our LanceDB [GitHub issue page](https://github.com/lancedb/lancedb/issues).
\ No newline at end of file

From 4ee7225e917b44379267a4579119ae587bd6cc5f Mon Sep 17 00:00:00 2001
From: Lei Xu <lei@lancedb.com>
Date: Thu, 5 Sep 2024 11:48:48 -0700
Subject: [PATCH 4/9] ci: public java package (#1485)

Co-authored-by: Lu Qiu <luqiujob@gmail.com>
---
 .github/workflows/java-publish.yml | 109 ++++++++++++++++++++++
 java/core/pom.xml                  |   4 +-
 java/pom.xml                       | 142 ++++++++++++++++++++++++++++-
 3 files changed, 249 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/java-publish.yml

diff --git a/.github/workflows/java-publish.yml b/.github/workflows/java-publish.yml
new file mode 100644
index 00000000..12d13522
--- /dev/null
+++ b/.github/workflows/java-publish.yml
@@ -0,0 +1,109 @@
+name: Build and publish Java packages
+on:
+  release:
+    types: [released]
+  pull_request:
+    paths:
+      - .github/workflows/java-publish.yml
+
+jobs:
+  macos-arm64:
+    name: Build on MacOS Arm64
+    runs-on: macos-14
+    timeout-minutes: 45
+    defaults:
+      run:
+        working-directory: ./java/core/lancedb-jni
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - uses: Swatinem/rust-cache@v2
+      - name: Install dependencies
+        run: |
+          brew install protobuf
+      - name: Build release
+        run: |
+          cargo build --release
+      - uses: actions/upload-artifact@v4
+        with:
+          name: liblancedb_jni_darwin_aarch64.zip
+          path: target/release/liblancedb_jni.dylib
+          retention-days: 1
+          if-no-files-found: error
+  linux-arm64:
+    name: Build on Linux Arm64
+    runs-on: warp-ubuntu-2204-arm64-8x
+    timeout-minutes: 45
+    defaults:
+      run:
+        working-directory: ./java/core/lancedb-jni
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - uses: Swatinem/rust-cache@v2
+      - uses: actions-rust-lang/setup-rust-toolchain@v1
+        with:
+          toolchain: "1.79.0"
+          cache-workspaces: "./java/core/lancedb-jni"
+          # Disable full debug symbol generation to speed up CI build and keep memory down
+          # "1" means line tables only, which is useful for panic tracebacks.
+          rustflags: "-C debuginfo=1"
+      - name: Install dependencies
+        run: |
+          sudo apt -y -qq update
+          sudo apt install -y protobuf-compiler libssl-dev pkg-config
+      - name: Build release
+        run: |
+          cargo build --release
+      - uses: actions/upload-artifact@v4
+        with:
+          name: liblancedb_jni_linux_aarch64.zip
+          path: target/release/liblancedb_jni.so
+          retention-days: 1
+          if-no-files-found: error
+  linux-x86:
+    runs-on: warp-ubuntu-2204-x64-8x
+    timeout-minutes: 30
+    needs: [macos-arm64, linux-arm64]
+    defaults:
+      run:
+        working-directory: ./java
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - uses: Swatinem/rust-cache@v2
+      - name: Set up Java 8
+        uses: actions/setup-java@v4
+        with:
+          distribution: temurin
+          java-version: 8
+          cache: "maven"
+          server-id: ossrh
+          server-username: SONATYPE_USER
+          server-password: SONATYPE_TOKEN
+          gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
+          gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
+      - name: Install dependencies
+        run: |
+          sudo apt -y -qq update
+          sudo apt install -y protobuf-compiler libssl-dev pkg-config
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+      - name: Copy native libs
+        run: |
+          mkdir -p ./core/target/classes/nativelib/darwin-aarch64 ./core/target/classes/nativelib/linux-aarch64
+          cp ../liblancedb_jni_darwin_aarch64.zip/liblancedb_jni.dylib ./core/target/classes/nativelib/darwin-aarch64/liblancedb_jni.dylib
+          cp ../liblancedb_jni_linux_aarch64.zip/liblancedb_jni.so ./core/target/classes/nativelib/linux-aarch64/liblancedb_jni.so
+      - name: Set github
+        run: |
+          git config --global user.email "dev+gha@lancedb.com"
+          git config --global user.name "LanceDB Github Runner"
+      - name: Publish with Java 8
+        run: |
+          printf 'use-agent\npinentry-mode loopback\n' >> ~/.gnupg/gpg.conf
+          export GPG_TTY=$(tty)
+          mvn --batch-mode -DskipTests -DpushChanges=false -Dgpg.passphrase="$GPG_PASSPHRASE" deploy -P deploy-to-ossrh
+        env:
+          GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+          SONATYPE_USER: ${{ secrets.SONATYPE_USER }}
+          SONATYPE_TOKEN: ${{ secrets.SONATYPE_TOKEN }}
diff --git a/java/core/pom.xml b/java/core/pom.xml
index a469c3ae..b6fedc19 100644
--- a/java/core/pom.xml
+++ b/java/core/pom.xml
@@ -8,7 +8,7 @@
     <parent>
         <groupId>com.lancedb</groupId>
         <artifactId>lancedb-parent</artifactId>
-        <version>0.1-SNAPSHOT</version>
+        <version>0.0.3</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
 
@@ -68,7 +68,7 @@
                                 </goals>
                                 <configuration>
                                     <path>lancedb-jni</path>
-                                    <!--<release>true</release>-->
+                                    <release>true</release>
                                     <!-- Copy native libraries to target/classes for runtime access -->
                                     <copyTo>${project.build.directory}/classes/nativelib</copyTo>
                                     <copyWithPlatformDir>true</copyWithPlatformDir>
diff --git a/java/pom.xml b/java/pom.xml
index 48a64c12..6a0a95a7 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -6,15 +6,28 @@
 
     <groupId>com.lancedb</groupId>
     <artifactId>lancedb-parent</artifactId>
-    <version>0.1-SNAPSHOT</version>
+    <version>0.0.3</version>
     <packaging>pom</packaging>
 
-    <name>Lance Parent</name>
+    <name>LanceDB Parent</name>
+    <description>LanceDB vector database Java API</description>
+    <url>http://lancedb.com/</url>
+
+    <developers>
+        <developer>
+            <name>Lance DB Dev Group</name>
+            <email>dev@lancedb.com</email>
+        </developer>
+    </developers>
+    <licenses>
+        <license>
+            <name>The Apache Software License, Version 2.0</name>
+            <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+        </license>
+    </licenses>
 
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <maven.compiler.source>11</maven.compiler.source>
-        <maven.compiler.target>11</maven.compiler.target>
         <arrow.version>15.0.0</arrow.version>
     </properties>
 
@@ -22,6 +35,12 @@
         <module>core</module>
     </modules>
 
+    <scm>
+        <connection>scm:git:https://github.com/lancedb/lancedb.git</connection>
+        <developerConnection>scm:git:ssh://git@github.com/lancedb/lancedb.git</developerConnection>
+        <url>https://github.com/lancedb/lancedb</url>
+    </scm>
+
     <dependencyManagement>
         <dependencies>
             <dependency>
@@ -62,8 +81,45 @@
         </dependencies>
     </dependencyManagement>
 
+    <distributionManagement>
+        <snapshotRepository>
+            <id>ossrh</id>
+            <url>https://s01.oss.sonatype.org/content/repositories/snapshots</url>
+        </snapshotRepository>
+        <repository>
+            <id>ossrh</id>
+            <url>https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/</url>
+        </repository>
+    </distributionManagement>
+
      <build>
         <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-source-plugin</artifactId>
+                <version>2.2.1</version>
+                <executions>
+                    <execution>
+                        <id>attach-sources</id>
+                        <goals>
+                            <goal>jar-no-fork</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-javadoc-plugin</artifactId>
+                <version>2.9.1</version>
+                <executions>
+                    <execution>
+                        <id>attach-javadocs</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-checkstyle-plugin</artifactId>
@@ -126,4 +182,82 @@
             </plugins>
         </pluginManagement>
     </build>
+
+   <profiles>
+        <profile>
+            <id>jdk8</id>
+            <activation>
+                <jdk>[1.8,1.8.999]</jdk>
+            </activation>
+            <properties>
+                <maven.compiler.source>1.8</maven.compiler.source>
+                <maven.compiler.target>1.8</maven.compiler.target>
+            </properties>
+        </profile>
+        <profile>
+            <id>jdk11+</id>
+            <activation>
+                <jdk>[11,)</jdk>
+            </activation>
+            <properties>
+                <maven.compiler.source>11</maven.compiler.source>
+                <maven.compiler.target>11</maven.compiler.target>
+            </properties>
+            <build>
+                <plugins>
+                    <plugin>
+                        <artifactId>maven-surefire-plugin</artifactId>
+                        <version>3.2.5</version>
+                        <configuration>
+                            <argLine>--add-opens=java.base/java.nio=ALL-UNNAMED</argLine>
+                            <forkNode implementation="org.apache.maven.plugin.surefire.extensions.SurefireForkNodeFactory" />
+                            <useSystemClassLoader>false</useSystemClassLoader>
+                        </configuration>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+        <profile>
+            <id>deploy-to-ossrh</id>
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.sonatype.central</groupId>
+                        <artifactId>central-publishing-maven-plugin</artifactId>
+                        <version>0.4.0</version>
+                        <extensions>true</extensions>
+                        <configuration>
+                            <publishingServerId>ossrh</publishingServerId>
+                            <tokenAuth>true</tokenAuth>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.sonatype.plugins</groupId>
+                        <artifactId>nexus-staging-maven-plugin</artifactId>
+                        <version>1.6.13</version>
+                        <extensions>true</extensions>
+                        <configuration>
+                            <serverId>ossrh</serverId>
+                            <nexusUrl>https://s01.oss.sonatype.org/</nexusUrl>
+                            <autoReleaseAfterClose>true</autoReleaseAfterClose>
+                        </configuration>
+                    </plugin>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-gpg-plugin</artifactId>
+                        <version>1.5</version>
+                        <executions>
+                            <execution>
+                                <id>sign-artifacts</id>
+                                <phase>verify</phase>
+                                <goals>
+                                    <goal>sign</goal>
+                                </goals>
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>
 </project>

From 1d61717d0e1a7cb3c818ab598f88fbf3d46c75bd Mon Sep 17 00:00:00 2001
From: Philip Zeyliger <philip.zeyliger@gmail.com>
Date: Thu, 5 Sep 2024 13:18:24 -0700
Subject: [PATCH 5/9] docs: fix get_registry() usage (#1601)

Docs used `get_registry.get(...)` whereas what works is
`get_registry().get(...)`. Fixing the two instances I found. I tested
the open clip version by trying it locally in a Jupyter notebook.
---
 .../multimodal_embedding_functions/imagebind_embedding.md       | 2 +-
 .../multimodal_embedding_functions/openclip_embedding.md        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
index 4aa8b3db..72a7e825 100644
--- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
+++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/imagebind_embedding.md
@@ -17,7 +17,7 @@ from lancedb.pydantic import LanceModel, Vector
 from lancedb.embeddings import get_registry
 
 db = lancedb.connect(tmp_path)
-func = get_registry.get("imagebind").create()
+func = get_registry().get("imagebind").create()
 
 class ImageBindModel(LanceModel):
     text: str
diff --git a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
index bf50dfd2..eb6139f5 100644
--- a/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
+++ b/docs/src/embeddings/available_embedding_models/multimodal_embedding_functions/openclip_embedding.md
@@ -20,7 +20,7 @@ from lancedb.pydantic import LanceModel, Vector
 from lancedb.embeddings import get_registry
 
 db = lancedb.connect(tmp_path)
-func = get_registry.get("open-clip").create()
+func = get_registry().get("open-clip").create()
 
 class Images(LanceModel):
     label: str

From 8dcd328dce8ed5482cb604b5a4d608ab2d5c6ee7 Mon Sep 17 00:00:00 2001
From: BubbleCal <bubble-cal@outlook.com>
Date: Fri, 6 Sep 2024 10:41:38 +0800
Subject: [PATCH 6/9] feat: support to create table from record batch iterator
 (#1593)

---
 python/python/lancedb/db.py           | 45 ++++---------
 python/python/lancedb/query.py        |  2 +-
 python/python/lancedb/remote/arrow.py |  5 +-
 python/python/lancedb/remote/db.py    | 26 ++------
 python/python/lancedb/table.py        | 95 +++++++++++++++++----------
 python/python/tests/test_db.py        | 37 +++++++++++
 6 files changed, 119 insertions(+), 91 deletions(-)

diff --git a/python/python/lancedb/db.py b/python/python/lancedb/db.py
index 1c77b299..d2345e4a 100644
--- a/python/python/lancedb/db.py
+++ b/python/python/lancedb/db.py
@@ -14,7 +14,6 @@
 from __future__ import annotations
 
 import asyncio
-import inspect
 import os
 from abc import abstractmethod
 from pathlib import Path
@@ -27,8 +26,13 @@ from pyarrow import fs
 from lancedb.common import data_to_reader, validate_schema
 
 from ._lancedb import connect as lancedb_connect
-from .pydantic import LanceModel
-from .table import AsyncTable, LanceTable, Table, _sanitize_data, _table_path
+from .table import (
+    AsyncTable,
+    LanceTable,
+    Table,
+    _table_path,
+    sanitize_create_table,
+)
 from .util import (
     fs_from_uri,
     get_uri_location,
@@ -37,6 +41,7 @@ from .util import (
 )
 
 if TYPE_CHECKING:
+    from .pydantic import LanceModel
     from datetime import timedelta
 
     from ._lancedb import Connection as LanceDbConnection
@@ -722,12 +727,6 @@ class AsyncConnection(object):
         ...     await db.create_table("table4", make_batches(), schema=schema)
         >>> asyncio.run(iterable_example())
         """
-        if inspect.isclass(schema) and issubclass(schema, LanceModel):
-            # convert LanceModel to pyarrow schema
-            # note that it's possible this contains
-            # embedding function metadata already
-            schema = schema.to_arrow_schema()
-
         metadata = None
 
         # Defining defaults here and not in function prototype.  In the future
@@ -738,31 +737,9 @@ class AsyncConnection(object):
         if fill_value is None:
             fill_value = 0.0
 
-        if data is not None:
-            data, schema = _sanitize_data(
-                data,
-                schema,
-                metadata=metadata,
-                on_bad_vectors=on_bad_vectors,
-                fill_value=fill_value,
-            )
-
-        if schema is None:
-            if data is None:
-                raise ValueError("Either data or schema must be provided")
-            elif hasattr(data, "schema"):
-                schema = data.schema
-            elif isinstance(data, Iterable):
-                if metadata:
-                    raise TypeError(
-                        (
-                            "Persistent embedding functions not yet "
-                            "supported for generator data input"
-                        )
-                    )
-
-        if metadata:
-            schema = schema.with_metadata(metadata)
+        data, schema = sanitize_create_table(
+            data, schema, metadata, on_bad_vectors, fill_value
+        )
         validate_schema(schema)
 
         if exist_ok is None:
diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index 9c9c69ae..c6b14c0f 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -852,7 +852,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
         )
         if len(row_ids) == 0:
             empty_schema = pa.schema([pa.field("_score", pa.float32())])
-            return pa.Table.from_pylist([], schema=empty_schema)
+            return pa.Table.from_batches([], schema=empty_schema)
         scores = pa.array(scores)
         output_tbl = self._table.to_lance().take(row_ids, columns=self._columns)
         output_tbl = output_tbl.append_column("_score", scores)
diff --git a/python/python/lancedb/remote/arrow.py b/python/python/lancedb/remote/arrow.py
index 753087cf..ac39e247 100644
--- a/python/python/lancedb/remote/arrow.py
+++ b/python/python/lancedb/remote/arrow.py
@@ -11,12 +11,15 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+from typing import Iterable, Union
 import pyarrow as pa
 
 
-def to_ipc_binary(table: pa.Table) -> bytes:
+def to_ipc_binary(table: Union[pa.Table, Iterable[pa.RecordBatch]]) -> bytes:
     """Serialize a PyArrow Table to IPC binary."""
     sink = pa.BufferOutputStream()
+    if isinstance(table, Iterable):
+        table = pa.Table.from_batches(table)
     with pa.ipc.new_stream(sink, table.schema) as writer:
         writer.write_table(table)
     return sink.getvalue().to_pybytes()
diff --git a/python/python/lancedb/remote/db.py b/python/python/lancedb/remote/db.py
index 0dd6bb6d..bb7554a4 100644
--- a/python/python/lancedb/remote/db.py
+++ b/python/python/lancedb/remote/db.py
@@ -11,7 +11,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-import inspect
 import logging
 import uuid
 from concurrent.futures import ThreadPoolExecutor
@@ -26,7 +25,7 @@ from ..common import DATA
 from ..db import DBConnection
 from ..embeddings import EmbeddingFunctionConfig
 from ..pydantic import LanceModel
-from ..table import Table, _sanitize_data
+from ..table import Table, sanitize_create_table
 from ..util import validate_table_name
 from .arrow import to_ipc_binary
 from .client import ARROW_STREAM_CONTENT_TYPE, RestfulLanceDBClient
@@ -228,8 +227,6 @@ class RemoteDBConnection(DBConnection):
 
         """
         validate_table_name(name)
-        if data is None and schema is None:
-            raise ValueError("Either data or schema must be provided.")
         if embedding_functions is not None:
             logging.warning(
                 "embedding_functions is not yet supported on LanceDB Cloud."
@@ -239,24 +236,9 @@ class RemoteDBConnection(DBConnection):
         if mode is not None:
             logging.warning("mode is not yet supported on LanceDB Cloud.")
 
-        if inspect.isclass(schema) and issubclass(schema, LanceModel):
-            # convert LanceModel to pyarrow schema
-            # note that it's possible this contains
-            # embedding function metadata already
-            schema = schema.to_arrow_schema()
-
-        if data is not None:
-            data, schema = _sanitize_data(
-                data,
-                schema,
-                metadata=None,
-                on_bad_vectors=on_bad_vectors,
-                fill_value=fill_value,
-            )
-        else:
-            if schema is None:
-                raise ValueError("Either data or schema must be provided")
-            data = pa.Table.from_pylist([], schema=schema)
+        data, schema = sanitize_create_table(
+            data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
+        )
 
         from .table import RemoteTable
 
diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py
index 7d3ebaa0..53e624a0 100644
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -117,15 +117,50 @@ def _sanitize_data(
         data = _sanitize_schema(
             data, schema=schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value
         )
+        if schema is None:
+            schema = data.schema
     elif isinstance(data, Iterable):
         data = _to_record_batch_generator(
             data, schema, metadata, on_bad_vectors, fill_value
         )
+        if schema is None:
+            data, schema = _generator_to_data_and_schema(data)
+            if schema is None:
+                raise ValueError("Cannot infer schema from generator data")
     else:
         raise TypeError(f"Unsupported data type: {type(data)}")
     return data, schema
 
 
+def sanitize_create_table(
+    data, schema, metadata=None, on_bad_vectors="error", fill_value=0.0
+):
+    """Normalize the ``data`` and ``schema`` arguments of a create-table call.
+
+    A ``LanceModel`` subclass is converted to an Arrow schema, data (when
+    given) goes through ``_sanitize_data``, and truthy ``metadata`` is attached
+    to the final schema. Raises ``ValueError`` if both inputs are ``None``.
+    """
+    is_model = inspect.isclass(schema) and issubclass(schema, LanceModel)
+    if is_model:
+        # The converted schema may already carry embedding function metadata.
+        schema = schema.to_arrow_schema()
+
+    if data is None and schema is None:
+        raise ValueError("Either data or schema must be provided")
+    if data is not None:
+        sanitize_kwargs = dict(
+            metadata=metadata, on_bad_vectors=on_bad_vectors, fill_value=fill_value
+        )
+        data, schema = _sanitize_data(data, schema, **sanitize_kwargs)
+        if schema is None and hasattr(data, "schema"):
+            schema = data.schema
+
+    if metadata:
+        schema = schema.with_metadata(metadata)
+    return data, schema
+
+
 def _schema_from_hf(data, schema):
     """
     Extract pyarrow schema from HuggingFace DatasetDict
@@ -187,8 +222,30 @@ def _append_vector_col(data: pa.Table, metadata: dict, schema: Optional[pa.Schem
     return data
 
 
+def _generator_to_data_and_schema(
+    data: Iterable,
+) -> Tuple[Iterable[pa.RecordBatch], pa.Schema]:
+    """Peek at the first item of ``data`` to infer a schema (None if unknown)."""
+
+    def _prepend(batches, rest):
+        yield from batches
+        yield from rest
+
+    data = iter(data)  # accept any iterable, not only generators
+    first = next(data, None)
+    if isinstance(first, pa.Table):
+        return _prepend(first.to_batches(), data), first.schema
+    if isinstance(first, pa.RecordBatch):
+        return _prepend([first], data), first.schema
+    return data, None
+
+
 def _to_record_batch_generator(
-    data: Iterable, schema, metadata, on_bad_vectors, fill_value
+    data: Iterable,
+    schema,
+    metadata,
+    on_bad_vectors,
+    fill_value,
 ):
     for batch in data:
         # always convert to table because we need to sanitize the data
@@ -1569,12 +1626,6 @@ class LanceTable(Table):
             The embedding functions to use when creating the table.
         """
         tbl = LanceTable(db, name)
-        if inspect.isclass(schema) and issubclass(schema, LanceModel):
-            # convert LanceModel to pyarrow schema
-            # note that it's possible this contains
-            # embedding function metadata already
-            schema = schema.to_arrow_schema()
-
         metadata = None
         if embedding_functions is not None:
             # If we passed in embedding functions explicitly
@@ -1583,33 +1634,11 @@ class LanceTable(Table):
             registry = EmbeddingFunctionRegistry.get_instance()
             metadata = registry.get_table_metadata(embedding_functions)
 
-        if data is not None:
-            data, schema = _sanitize_data(
-                data,
-                schema,
-                metadata=metadata,
-                on_bad_vectors=on_bad_vectors,
-                fill_value=fill_value,
-            )
+        data, schema = sanitize_create_table(
+            data, schema, metadata, on_bad_vectors, fill_value
+        )
 
-        if schema is None:
-            if data is None:
-                raise ValueError("Either data or schema must be provided")
-            elif hasattr(data, "schema"):
-                schema = data.schema
-            elif isinstance(data, Iterable):
-                if metadata:
-                    raise TypeError(
-                        (
-                            "Persistent embedding functions not yet "
-                            "supported for generator data input"
-                        )
-                    )
-
-        if metadata:
-            schema = schema.with_metadata(metadata)
-
-        empty = pa.Table.from_pylist([], schema=schema)
+        empty = pa.Table.from_batches([], schema=schema)
         try:
             lance.write_dataset(empty, tbl._dataset_uri, schema=schema, mode=mode)
         except OSError as err:
diff --git a/python/python/tests/test_db.py b/python/python/tests/test_db.py
index 373ae2b6..5b7f3c42 100644
--- a/python/python/tests/test_db.py
+++ b/python/python/tests/test_db.py
@@ -233,6 +233,43 @@ def test_create_mode(tmp_path):
     assert tbl.to_pandas().item.tolist() == ["fizz", "buzz"]
 
 
+def test_create_table_from_iterator(tmp_path):
+    db = lancedb.connect(tmp_path)
+
+    def gen_data():
+        for _ in range(10):
+            yield pa.RecordBatch.from_arrays(
+                [
+                    pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2)),
+                    pa.array(["foo"]),
+                    pa.array([10.0]),
+                ],
+                ["vector", "item", "price"],
+            )
+
+    table = db.create_table("test", data=gen_data())
+    assert table.count_rows() == 10
+
+
+@pytest.mark.asyncio
+async def test_create_table_from_iterator_async(tmp_path):
+    db = await lancedb.connect_async(tmp_path)
+
+    def gen_data():
+        for _ in range(10):
+            yield pa.RecordBatch.from_arrays(
+                [
+                    pa.array([[3.1, 4.1]], pa.list_(pa.float32(), 2)),
+                    pa.array(["foo"]),
+                    pa.array([10.0]),
+                ],
+                ["vector", "item", "price"],
+            )
+
+    table = await db.create_table("test", data=gen_data())
+    assert await table.count_rows() == 10
+
+
 def test_create_exist_ok(tmp_path):
     db = lancedb.connect(tmp_path)
     data = pd.DataFrame(

From 7eb3b52297dea2ca63b2f454b3fbd19ddb1022e5 Mon Sep 17 00:00:00 2001
From: Jon X <ousiax@hotmail.com>
Date: Fri, 6 Sep 2024 12:08:19 +0800
Subject: [PATCH 7/9] docs: added a blank line between a paragraph and a list
 block (#1604)

Though the markdown renders well on GitHub (GFM style?), it seems that
a blank line is required between a paragraph and a list block to make
it render well with `mkdocs`.

see also the web page:
https://lancedb.github.io/lancedb/concepts/index_hnsw/
---
 docs/src/concepts/index_hnsw.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/src/concepts/index_hnsw.md b/docs/src/concepts/index_hnsw.md
index 9e8dc948..8bfaf39c 100644
--- a/docs/src/concepts/index_hnsw.md
+++ b/docs/src/concepts/index_hnsw.md
@@ -15,11 +15,13 @@ HNSW also combines this with the ideas behind a classic 1-dimensional search dat
 
 ## k-Nearest Neighbor Graphs and k-approximate Nearest neighbor Graphs
 The k-nearest neighbor graph actually predates its use for ANN search. Its construction is quite simple:
+
 * Each vector in the dataset is given an associated vertex.
 * Each vertex has outgoing edges to its k nearest neighbors. That is, the k closest other vertices by Euclidean distance between the two corresponding vectors. This can be thought of as a "friend list" for the vertex.
 * For some applications (including nearest-neighbor search), the incoming edges are also added.
 
 Eventually, it was realized that the following greedy search method over such a graph typically results in good approximate nearest neighbors:
+
 * Given a query vector, start at some fixed "entry point" vertex (e.g. the approximate center node).
 * Look at that vertex's neighbors. If any of them are closer to the query vector than the current vertex, then move to that vertex.
 * Repeat until a local optimum is found.
@@ -36,15 +38,18 @@ One downside of k-NN and k-ANN graphs alone is that one must typically build the
 ## HNSW: Hierarchical Navigable Small Worlds
 
 HNSW builds on k-ANN in two main ways:
+
 * Instead of getting the k-approximate nearest neighbors for a large value of k, it sparsifies the k-ANN graph using a carefully chosen "edge pruning" heuristic, allowing for the number of edges per vertex to be limited to a relatively small constant.
 * The "entry point" vertex is chosen dynamically using a recursively constructed data structure on a subset of the data, similarly to a skip list.
 
 This recursive structure can be thought of as separating into layers:
+
 * At the bottom-most layer, an k-ANN graph on the whole dataset is present.
 * At the second layer, a k-ANN graph on a fraction of the dataset (e.g. 10%) is present.
 * At the Lth layer, a k-ANN graph is present. It is over a (constant) fraction (e.g. 10%) of the vectors/vertices present in the L-1th layer.
 
 Then the greedy search routine operates as follows:
+
 * At the top layer (using an arbitrary vertex as an entry point), use the greedy local search routine on the k-ANN graph to get an approximate nearest neighbor at that layer.
 * Using the approximate nearest neighbor found in the previous layer as an entry point, find an approximate nearest neighbor in the next layer with the same method.
 * Repeat until the bottom-most layer is reached. Then use the entry point to find multiple nearest neighbors (e.g. top 10).

From cd32944e5499028f4ff6d2e0c8578949671e00a1 Mon Sep 17 00:00:00 2001
From: Will Jones <willjones127@gmail.com>
Date: Fri, 6 Sep 2024 14:10:02 -0700
Subject: [PATCH 8/9] feat: upgrade lance to v0.17.0 (#1608)

Changelog: https://github.com/lancedb/lance/releases/tag/v0.17.0

Highlights:

* You can do "phrase queries" by adding double quotes around phrases
(multiple tokens) in FTS.

Added follow ups in: https://github.com/lancedb/lancedb/issues/1611
---
 Cargo.toml                 | 12 ++++++------
 python/pyproject.toml      |  2 +-
 rust/ffi/node/src/table.rs |  2 +-
 rust/lancedb/src/table.rs  |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 522342e8..e16528c3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,12 +20,12 @@ keywords = ["lancedb", "lance", "database", "vector", "search"]
 categories = ["database-implementations"]
 
 [workspace.dependencies]
-lance = { "version" = "=0.16.1", "features" = ["dynamodb"] }
-lance-index = { "version" = "=0.16.1" }
-lance-linalg = { "version" = "=0.16.1" }
-lance-testing = { "version" = "=0.16.1" }
-lance-datafusion = { "version" = "=0.16.1" }
-lance-encoding = { "version" = "=0.16.1" }
+lance = { "version" = "=0.17.0", "features" = ["dynamodb"] }
+lance-index = { "version" = "=0.17.0" }
+lance-linalg = { "version" = "=0.17.0" }
+lance-testing = { "version" = "=0.17.0" }
+lance-datafusion = { "version" = "=0.17.0" }
+lance-encoding = { "version" = "=0.17.0" }
 # Note that this one does not include pyarrow
 arrow = { version = "52.2", optional = false }
 arrow-array = "52.2"
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 5e22fd47..7d41d891 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -3,7 +3,7 @@ name = "lancedb"
 # version in Cargo.toml
 dependencies = [
     "deprecation",
-    "pylance==0.16.1",
+    "pylance==0.17.0",
     "ratelimiter~=1.0",
     "requests>=2.31.0",
     "retry>=0.9.2",
diff --git a/rust/ffi/node/src/table.rs b/rust/ffi/node/src/table.rs
index 10e7f19b..3e49d742 100644
--- a/rust/ffi/node/src/table.rs
+++ b/rust/ffi/node/src/table.rs
@@ -391,7 +391,7 @@ impl JsTable {
                 materialize_deletions_threshold.value(&mut cx) as f32;
         }
         if let Some(num_threads) = js_options.get_opt::<JsNumber, _, _>(&mut cx, "numThreads")? {
-            options.num_threads = num_threads.value(&mut cx) as usize;
+            options.num_threads = Some(num_threads.value(&mut cx) as usize);
         }
 
         rt.spawn(async move {
diff --git a/rust/lancedb/src/table.rs b/rust/lancedb/src/table.rs
index f1942f0e..88c23533 100644
--- a/rust/lancedb/src/table.rs
+++ b/rust/lancedb/src/table.rs
@@ -2788,7 +2788,7 @@ mod tests {
                 .get_index_type(index_uuid)
                 .await
                 .unwrap(),
-            Some("IVF".to_string())
+            Some("IVF_PQ".to_string())
         );
         assert_eq!(
             table

From 029b01bbbf800a96a9764fa6c93acf10dfad040b Mon Sep 17 00:00:00 2001
From: James Wu <jameswu1991@users.noreply.github.com>
Date: Fri, 6 Sep 2024 20:28:05 -0700
Subject: [PATCH 9/9] feat: enable phrase_query(bool) for hybrid search queries
 (#1578)

first off, apologies for any folly since i'm new to contributing to
lancedb. this PR is the continuation of [a discord
thread](https://discord.com/channels/1030247538198061086/1030247538667827251/1278844345713299599):

## user story

here's the lancedb search query i'd like to run:

```
def search(phrase):
    logger.info(f'Searching for phrase: {phrase}')
    phrase_embedding = get_embedding(phrase)
    df = (table.search((phrase_embedding, phrase), query_type='hybrid')
        .limit(10).to_list())
    logger.info(f'Success search with row count: {len(df)}')

search('howdy (howdy)')
search('howdy(howdy)')
```

the second search fails due to `ValueError: Syntax Error: howdy(howdy)`

i saw on the
[docs](https://lancedb.github.io/lancedb/fts/#phrase-queries-vs-terms-queries)
that i can use `phrase_query()` to [enable a
flag](https://github.com/lancedb/lancedb/blob/main/python/python/lancedb/query.py#L790-L792)
to wrap the query in double quotes (as well as sanitize single quotes)
prior to sending the query to search. this works for [normal
FTS](https://lancedb.github.io/lancedb/fts/), but the command is
unavailable on [hybrid
search](https://lancedb.github.io/lancedb/hybrid_search/hybrid_search/).

## changes

i added a `phrase_query()` function to `LanceHybridQueryBuilder` by
propagating the call down to its `self._fts_query` object. i'm not too
familiar with the codebase and am not sure if this is the best way to
implement the functionality. feel free to riff on this PR or discard it.


## tests

```
(lancedb) JamesMPB:python james$ pwd
/Users/james/src/lancedb/python
(lancedb) JamesMPB:python james$ pytest python/tests/test_table.py
python/tests/test_table.py .......................................                                                                   [100%]
====================================================== 39 passed, 1 warning in 2.23s =======================================================
```
---
 python/python/lancedb/query.py    | 22 +++++++++++++++++++++-
 python/python/tests/test_table.py | 12 +++++++++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py
index c6b14c0f..13b0460c 100644
--- a/python/python/lancedb/query.py
+++ b/python/python/lancedb/query.py
@@ -42,9 +42,9 @@ if TYPE_CHECKING:
     import PIL
     import polars as pl
 
-    from .common import VEC
     from ._lancedb import Query as LanceQuery
     from ._lancedb import VectorQuery as LanceVectorQuery
+    from .common import VEC
     from .pydantic import LanceModel
     from .table import Table
 
@@ -965,6 +965,7 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         self._reranker = RRFReranker()
         self._nprobes = None
         self._refine_factor = None
+        self._phrase_query = False
 
     def _validate_query(self, query, vector=None, text=None):
         if query is not None and (vector is not None or text is not None):
@@ -986,6 +987,23 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
 
         return vector_query, text_query
 
+    def phrase_query(self, phrase_query: bool = True) -> LanceHybridQueryBuilder:
+        """Set whether to use phrase query.
+
+        Parameters
+        ----------
+        phrase_query: bool, default True
+            If True, then the query will be wrapped in quotes and
+            double quotes replaced by single quotes.
+
+        Returns
+        -------
+        LanceHybridQueryBuilder
+            The LanceHybridQueryBuilder object.
+        """
+        self._phrase_query = phrase_query
+        return self
+
     def to_arrow(self) -> pa.Table:
         vector_query, fts_query = self._validate_query(
             self._query, self._vector, self._text
@@ -1012,6 +1030,8 @@ class LanceHybridQueryBuilder(LanceQueryBuilder):
         if self._with_row_id:
             self._vector_query.with_row_id(True)
             self._fts_query.with_row_id(True)
+        if self._phrase_query:
+            self._fts_query.phrase_query(True)
         if self._nprobes:
             self._vector_query.nprobes(self._nprobes)
         if self._refine_factor:
diff --git a/python/python/tests/test_table.py b/python/python/tests/test_table.py
index 6ca2f5f1..65cf0c9d 100644
--- a/python/python/tests/test_table.py
+++ b/python/python/tests/test_table.py
@@ -2,13 +2,13 @@
 # SPDX-FileCopyrightText: Copyright The Lance Authors
 
 import functools
+import os
 from copy import copy
 from datetime import date, datetime, timedelta
 from pathlib import Path
 from time import sleep
 from typing import List
 from unittest.mock import PropertyMock, patch
-import os
 
 import lance
 import lancedb
@@ -907,6 +907,16 @@ def test_hybrid_search(db, tmp_path):
         "Our father who art in heaven", query_type="hybrid"
     ).to_pydantic(MyTable)
 
+    # Test that double and single quote characters are handled with phrase_query()
+    (
+        table.search(
+            '"Aren\'t you a little short for a stormtrooper?" -- Leia',
+            query_type="hybrid",
+        )
+        .phrase_query(True)
+        .to_pydantic(MyTable)
+    )
+
     assert result1 == result3
 
     # with post filters