docs: add a section to describe scalar index (#1495)

2026-01-07 04:12:59 +00:00 · 2024-08-16 18:48:29 -07:00
parent 09ce6c5bb5
commit 5857cb4c6e
4 changed files with 125 additions and 11 deletions
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -58,7 +58,7 @@ plugins:
            - https://pandas.pydata.org/docs/objects.inv
  - mkdocs-jupyter
  - render_swagger:
-      allow_arbitrary_locations : true
+      allow_arbitrary_locations: true
 markdown_extensions:
  - admonition
@@ -89,9 +89,10 @@ nav:
          - Data management: concepts/data_management.md
      - 🔨 Guides:
          - Working with tables: guides/tables.md
-          - Building an ANN index: ann_indexes.md
+          - Building a vector index: ann_indexes.md
          - Vector Search: search.md
          - Full-text search: fts.md
          - Building a scalar index: guides/scalar_index.md
          - Hybrid search:
              - Overview: hybrid_search/hybrid_search.md
              - Comparing Rerankers: hybrid_search/eval.md
@@ -128,12 +129,12 @@ nav:
          - Polars: python/polars_arrow.md
          - DuckDB: python/duckdb.md
          - LangChain:
-            - LangChain 🔗: integrations/langchain.md
+              - LangChain 🔗: integrations/langchain.md
-            - LangChain demo: notebooks/langchain_demo.ipynb
+              - LangChain demo: notebooks/langchain_demo.ipynb
-            - LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
+              - LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
          - LlamaIndex 🦙:
-            - LlamaIndex docs: integrations/llamaIndex.md
+              - LlamaIndex docs: integrations/llamaIndex.md
-            - LlamaIndex demo: notebooks/llamaIndex_demo.ipynb
+              - LlamaIndex demo: notebooks/llamaIndex_demo.ipynb
          - Pydantic: python/pydantic.md
          - Voxel51: integrations/voxel51.md
          - PromptTools: integrations/prompttools.md
@@ -145,7 +146,7 @@ nav:
              - Multimodal: examples/python_examples/multimodal.md
              - Rag: examples/python_examples/rag.md
              - Vector Search: examples/python_examples/vector_search.md
-              - Miscellaneous: 
+              - Miscellaneous:
                  - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
                  - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
                  - Multimodal search using CLIP: notebooks/multimodal_search.ipynb
@@ -182,6 +183,7 @@ nav:
      - Building an ANN index: ann_indexes.md
      - Vector Search: search.md
      - Full-text search: fts.md
      - Building a scalar index: guides/scalar_index.md
      - Hybrid search:
          - Overview: hybrid_search/hybrid_search.md
          - Comparing Rerankers: hybrid_search/eval.md
@@ -231,7 +233,7 @@ nav:
          - Multimodal: examples/python_examples/multimodal.md
          - Rag: examples/python_examples/rag.md
          - Vector Search: examples/python_examples/vector_search.md
-          - Miscellaneous: 
+          - Miscellaneous:
              - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
              - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
              - Multimodal search using CLIP: notebooks/multimodal_search.ipynb
--- a/docs/src/guides/scalar_index.md
+++ b/docs/src/guides/scalar_index.md
@@ -0,0 +1,108 @@
 # Building Scalar Index
 Similar to many SQL databases, LanceDB supports several types of Scalar indices to accelerate search
 over scalar columns.
 - `BTREE`: The most common type is BTREE. This index is inspired by the btree data structure
  although only the first few layers of the btree are cached in memory.
  It will perform well on columns with a large number of unique values and few rows per value.
 - `BITMAP`: this index stores a bitmap for each unique value in the column.
  This index is useful for columns with a finite number of unique values and many rows per value.
  For example, columns that represent "categories", "labels", or "tags".
 - `LABEL_LIST`: a special index that is used to index list columns whose values have a finite set of possibilities.
  For example, a column that contains lists of tags (e.g. `["tag1", "tag2", "tag3"]`) can be indexed with a `LABEL_LIST` index.
 | Data Type                                                       | Filter                                    | Index Type   |
 | --------------------------------------------------------------- | ----------------------------------------- | ------------ |
 | Numeric, String, Temporal                                       | `<`, `=`, `>`, `in`, `between`, `is null` | `BTREE`      |
 | Boolean, numbers or strings with fewer than 1,000 unique values | `<`, `=`, `>`, `in`, `between`, `is null` | `BITMAP`     |
 | Lists of low-cardinality numbers or strings                     | `array_has_any`, `array_has_all`          | `LABEL_LIST` |
 === "Python"
    ```python
    import lancedb
    books = [
      {"book_id": 1, "publisher": "plenty of books", "tags": ["fantasy", "adventure"]},
      {"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]},
      {"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]}
    ]
    db = lancedb.connect("./db")
    table = db.create_table("books", books)
    table.create_scalar_index("book_id")  # BTree by default
    table.create_scalar_index("publisher", index_type="BITMAP")
    ```
 === "Typescript"
    === "@lancedb/lancedb"
        ```js
        const db = await lancedb.connect("data");
        const tbl = await db.openTable("my_vectors");
        await tbl.createIndex("book_id");
        await tbl.createIndex("publisher", { config: lancedb.Index.bitmap() });
        ```
 For example, the following scan will be faster if the column `book_id` has a scalar index:
 === "Python"
    ```python
    import lancedb
    table = db.open_table("books")
    my_df = table.search().where("book_id = 2").to_pandas()
    ```
 === "Typescript"
    === "@lancedb/lancedb"
        ```js
        const db = await lancedb.connect("data");
        const tbl = await db.openTable("books");
        await tbl
          .query()
          .where("book_id = 2")
          .limit(10)
          .toArray();
        ```
 Scalar indices can also speed up scans that combine a vector search or full-text search with a prefilter:
 === "Python"
    ```python
    import lancedb
    data = [
      {"book_id": 1, "vector": [1, 2]},
      {"book_id": 2, "vector": [3, 4]},
      {"book_id": 3, "vector": [5, 6]}
    ]
    table = db.create_table("book_with_embeddings", data)
    (
        table.search([1, 2])
        .where("book_id != 3", prefilter=True)
        .to_pandas()
    )
    ```
 === "Typescript"
    === "@lancedb/lancedb"
        ```js
        const db = await lancedb.connect("data/lance");
        const tbl = await db.openTable("book_with_embeddings");
        await tbl.search(Array(1536).fill(1.2))
          .where("book_id != 3")  // prefilter is default behavior.
          .limit(10)
          .toArray();
        ```
--- a/docs/test/md_testing.py
+++ b/docs/test/md_testing.py
@@ -1,3 +1,5 @@
 #!/usr/bin/env python3
 import glob
 from typing import Iterator, List
 from pathlib import Path
--- a/python/python/lancedb/table.py
+++ b/python/python/lancedb/table.py
@@ -339,9 +339,9 @@ class Table(ABC):
    def create_scalar_index(
        self,
        column: str,
        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
        *,
        replace: bool = True,
        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
    ):
        """Create a scalar index on a column.
@@ -391,6 +391,8 @@ class Table(ABC):
            or string column.
        replace : bool, default True
            Replace the existing index if it exists.
        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"], default "BTREE"
            The type of index to create.
        Examples
        --------
@@ -1232,9 +1234,9 @@ class LanceTable(Table):
    def create_scalar_index(
        self,
        column: str,
        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
        *,
        replace: bool = True,
        index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
    ):
        self._dataset_mut.create_scalar_index(
            column, index_type=index_type, replace=replace