diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index 9059ef90..bae287f4 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -58,7 +58,7 @@ plugins: - https://pandas.pydata.org/docs/objects.inv - mkdocs-jupyter - render_swagger: - allow_arbitrary_locations : true + allow_arbitrary_locations: true markdown_extensions: - admonition @@ -89,9 +89,10 @@ nav: - Data management: concepts/data_management.md - 🔨 Guides: - Working with tables: guides/tables.md - - Building an ANN index: ann_indexes.md + - Building a vector index: ann_indexes.md - Vector Search: search.md - Full-text search: fts.md + - Building a scalar index: guides/scalar_index.md - Hybrid search: - Overview: hybrid_search/hybrid_search.md - Comparing Rerankers: hybrid_search/eval.md @@ -128,12 +129,12 @@ nav: - Polars: python/polars_arrow.md - DuckDB: python/duckdb.md - LangChain: - - LangChain 🔗: integrations/langchain.md - - LangChain demo: notebooks/langchain_demo.ipynb - - LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb + - LangChain 🔗: integrations/langchain.md + - LangChain demo: notebooks/langchain_demo.ipynb + - LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb - LlamaIndex 🦙: - - LlamaIndex docs: integrations/llamaIndex.md - - LlamaIndex demo: notebooks/llamaIndex_demo.ipynb + - LlamaIndex docs: integrations/llamaIndex.md + - LlamaIndex demo: notebooks/llamaIndex_demo.ipynb - Pydantic: python/pydantic.md - Voxel51: integrations/voxel51.md - PromptTools: integrations/prompttools.md @@ -145,7 +146,7 @@ nav: - Multimodal: examples/python_examples/multimodal.md - Rag: examples/python_examples/rag.md - Vector Search: examples/python_examples/vector_search.md - - Miscellaneous: + - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb - Multimodal search using CLIP: notebooks/multimodal_search.ipynb @@ -182,6 +183,7 @@ nav: - Building an 
ANN index: ann_indexes.md - Vector Search: search.md - Full-text search: fts.md + - Building a scalar index: guides/scalar_index.md - Hybrid search: - Overview: hybrid_search/hybrid_search.md - Comparing Rerankers: hybrid_search/eval.md @@ -231,7 +233,7 @@ nav: - Multimodal: examples/python_examples/multimodal.md - Rag: examples/python_examples/rag.md - Vector Search: examples/python_examples/vector_search.md - - Miscellaneous: + - Miscellaneous: - YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb - Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb - Multimodal search using CLIP: notebooks/multimodal_search.ipynb diff --git a/docs/src/guides/scalar_index.md b/docs/src/guides/scalar_index.md new file mode 100644 index 00000000..7f3322af --- /dev/null +++ b/docs/src/guides/scalar_index.md @@ -0,0 +1,108 @@ +# Building a Scalar Index + +Similar to many SQL databases, LanceDB supports several types of scalar indices to accelerate search +over scalar columns. + +- `BTREE`: The most common type is BTREE. This index is inspired by the btree data structure + although only the first few layers of the btree are cached in memory. + It will perform well on columns with a large number of unique values and few rows per value. +- `BITMAP`: this index stores a bitmap for each unique value in the column. + This index is useful for columns with a finite number of unique values and many rows per value. + For example, columns that represent "categories", "labels", or "tags". +- `LABEL_LIST`: a special index that is used to index list columns whose values have a finite set of possibilities. + For example, a column that contains lists of tags (e.g. `["tag1", "tag2", "tag3"]`) can be indexed with a `LABEL_LIST` index.
+ +| Data Type | Filter | Index Type | +| --------------------------------------------------------------- | ----------------------------------------- | ------------ | +| Numeric, String, Temporal | `<`, `=`, `>`, `in`, `between`, `is null` | `BTREE` | +| Boolean, numbers or strings with fewer than 1,000 unique values | `<`, `=`, `>`, `in`, `between`, `is null` | `BITMAP` | +| Lists of low-cardinality numbers or strings | `array_has_any`, `array_has_all` | `LABEL_LIST` | + +=== "Python" + + ```python + import lancedb + books = [ + {"book_id": 1, "publisher": "plenty of books", "tags": ["fantasy", "adventure"]}, + {"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]}, + {"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]} + ] + + db = lancedb.connect("./db") + table = db.create_table("books", books) + table.create_scalar_index("book_id") # BTree by default + table.create_scalar_index("publisher", index_type="BITMAP") + ``` + +=== "Typescript" + + === "@lancedb/lancedb" + + ```js + const db = await lancedb.connect("data"); + const tbl = await db.openTable("my_vectors"); + + await tbl.create_index("book_id"); + await tbl.create_index("publisher", { config: lancedb.Index.bitmap() }) + ``` + +For example, the following scan will be faster if the column `book_id` has a scalar index: + +=== "Python" + + ```python + import lancedb + + table = db.open_table("books") + my_df = table.search().where("book_id = 2").to_pandas() + ``` + +=== "Typescript" + + === "@lancedb/lancedb" + + ```js + const db = await lancedb.connect("data"); + const tbl = await db.openTable("books"); + + await tbl + .query() + .where("book_id = 2") + .limit(10) + .toArray(); + ``` + +Scalar indices can also speed up scans containing a vector search or full text search, and a prefilter: + +=== "Python" + + ```python + import lancedb + + data = [ + {"book_id": 1, "vector": [1, 2]}, + {"book_id": 2, "vector": [3, 4]}, + {"book_id": 3, "vector": [5, 6]} + ] + table =
db.create_table("book_with_embeddings", data) + + ( + table.search([1, 2]) + .where("book_id != 3", prefilter=True) + .to_pandas() + ) + ``` + +=== "Typescript" + + === "@lancedb/lancedb" + + ```js + const db = await lancedb.connect("data/lance"); + const tbl = await db.openTable("book_with_embeddings"); + + await tbl.search(Array(1536).fill(1.2)) + .where("book_id != 3") // prefilter is default behavior. + .limit(10) + .toArray(); + ``` diff --git a/docs/test/md_testing.py b/docs/test/md_testing.py old mode 100644 new mode 100755 index 8bdce0fd..7f2f3a99 --- a/docs/test/md_testing.py +++ b/docs/test/md_testing.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import glob from typing import Iterator, List from pathlib import Path diff --git a/python/python/lancedb/table.py b/python/python/lancedb/table.py index 6f89e0f7..26ab53a1 100644 --- a/python/python/lancedb/table.py +++ b/python/python/lancedb/table.py @@ -339,9 +339,9 @@ class Table(ABC): def create_scalar_index( self, column: str, - index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE", *, replace: bool = True, + index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE", ): """Create a scalar index on a column. @@ -391,6 +391,8 @@ class Table(ABC): or string column. replace : bool, default True Replace the existing index if it exists. + index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"], default "BTREE" + The type of index to create. Examples -------- @@ -1232,9 +1234,9 @@ class LanceTable(Table): def create_scalar_index( self, column: str, - index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE", *, replace: bool = True, + index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE", ): self._dataset_mut.create_scalar_index( column, index_type=index_type, replace=replace