mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-07 04:12:59 +00:00
docs: add a section to describe scalar index (#1495)
This commit is contained in:
@@ -58,7 +58,7 @@ plugins:
|
|||||||
- https://pandas.pydata.org/docs/objects.inv
|
- https://pandas.pydata.org/docs/objects.inv
|
||||||
- mkdocs-jupyter
|
- mkdocs-jupyter
|
||||||
- render_swagger:
|
- render_swagger:
|
||||||
allow_arbitrary_locations : true
|
allow_arbitrary_locations: true
|
||||||
|
|
||||||
markdown_extensions:
|
markdown_extensions:
|
||||||
- admonition
|
- admonition
|
||||||
@@ -89,9 +89,10 @@ nav:
|
|||||||
- Data management: concepts/data_management.md
|
- Data management: concepts/data_management.md
|
||||||
- 🔨 Guides:
|
- 🔨 Guides:
|
||||||
- Working with tables: guides/tables.md
|
- Working with tables: guides/tables.md
|
||||||
- Building an ANN index: ann_indexes.md
|
- Building a vector index: ann_indexes.md
|
||||||
- Vector Search: search.md
|
- Vector Search: search.md
|
||||||
- Full-text search: fts.md
|
- Full-text search: fts.md
|
||||||
|
- Building a scalar index: guides/scalar_index.md
|
||||||
- Hybrid search:
|
- Hybrid search:
|
||||||
- Overview: hybrid_search/hybrid_search.md
|
- Overview: hybrid_search/hybrid_search.md
|
||||||
- Comparing Rerankers: hybrid_search/eval.md
|
- Comparing Rerankers: hybrid_search/eval.md
|
||||||
@@ -128,12 +129,12 @@ nav:
|
|||||||
- Polars: python/polars_arrow.md
|
- Polars: python/polars_arrow.md
|
||||||
- DuckDB: python/duckdb.md
|
- DuckDB: python/duckdb.md
|
||||||
- LangChain:
|
- LangChain:
|
||||||
- LangChain 🔗: integrations/langchain.md
|
- LangChain 🔗: integrations/langchain.md
|
||||||
- LangChain demo: notebooks/langchain_demo.ipynb
|
- LangChain demo: notebooks/langchain_demo.ipynb
|
||||||
- LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
|
- LangChain JS/TS 🔗: https://js.langchain.com/docs/integrations/vectorstores/lancedb
|
||||||
- LlamaIndex 🦙:
|
- LlamaIndex 🦙:
|
||||||
- LlamaIndex docs: integrations/llamaIndex.md
|
- LlamaIndex docs: integrations/llamaIndex.md
|
||||||
- LlamaIndex demo: notebooks/llamaIndex_demo.ipynb
|
- LlamaIndex demo: notebooks/llamaIndex_demo.ipynb
|
||||||
- Pydantic: python/pydantic.md
|
- Pydantic: python/pydantic.md
|
||||||
- Voxel51: integrations/voxel51.md
|
- Voxel51: integrations/voxel51.md
|
||||||
- PromptTools: integrations/prompttools.md
|
- PromptTools: integrations/prompttools.md
|
||||||
@@ -145,7 +146,7 @@ nav:
|
|||||||
- Multimodal: examples/python_examples/multimodal.md
|
- Multimodal: examples/python_examples/multimodal.md
|
||||||
- Rag: examples/python_examples/rag.md
|
- Rag: examples/python_examples/rag.md
|
||||||
- Vector Search: examples/python_examples/vector_search.md
|
- Vector Search: examples/python_examples/vector_search.md
|
||||||
- Miscellaneous:
|
- Miscellaneous:
|
||||||
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
|
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
|
||||||
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
|
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
|
||||||
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
|
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
|
||||||
@@ -182,6 +183,7 @@ nav:
|
|||||||
- Building an ANN index: ann_indexes.md
|
- Building an ANN index: ann_indexes.md
|
||||||
- Vector Search: search.md
|
- Vector Search: search.md
|
||||||
- Full-text search: fts.md
|
- Full-text search: fts.md
|
||||||
|
- Building a scalar index: guides/scalar_index.md
|
||||||
- Hybrid search:
|
- Hybrid search:
|
||||||
- Overview: hybrid_search/hybrid_search.md
|
- Overview: hybrid_search/hybrid_search.md
|
||||||
- Comparing Rerankers: hybrid_search/eval.md
|
- Comparing Rerankers: hybrid_search/eval.md
|
||||||
@@ -231,7 +233,7 @@ nav:
|
|||||||
- Multimodal: examples/python_examples/multimodal.md
|
- Multimodal: examples/python_examples/multimodal.md
|
||||||
- Rag: examples/python_examples/rag.md
|
- Rag: examples/python_examples/rag.md
|
||||||
- Vector Search: examples/python_examples/vector_search.md
|
- Vector Search: examples/python_examples/vector_search.md
|
||||||
- Miscellaneous:
|
- Miscellaneous:
|
||||||
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
|
- YouTube Transcript Search: notebooks/youtube_transcript_search.ipynb
|
||||||
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
|
- Documentation QA Bot using LangChain: notebooks/code_qa_bot.ipynb
|
||||||
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
|
- Multimodal search using CLIP: notebooks/multimodal_search.ipynb
|
||||||
|
|||||||
108
docs/src/guides/scalar_index.md
Normal file
108
docs/src/guides/scalar_index.md
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
# Building a Scalar Index
|
||||||
|
|
||||||
|
Similar to many SQL databases, LanceDB supports several types of Scalar indices to accelerate search
|
||||||
|
over scalar columns.
|
||||||
|
|
||||||
|
- `BTREE`: The most common type is BTREE. This index is inspired by the btree data structure
|
||||||
|
although only the first few layers of the btree are cached in memory.
|
||||||
|
It will perform well on columns with a large number of unique values and few rows per value.
|
||||||
|
- `BITMAP`: this index stores a bitmap for each unique value in the column.
|
||||||
|
This index is useful for columns with a finite number of unique values and many rows per value.
|
||||||
|
For example, columns that represent "categories", "labels", or "tags".
|
||||||
|
- `LABEL_LIST`: a special index that is used to index list columns whose values have a finite set of possibilities.
|
||||||
|
For example, a column that contains lists of tags (e.g. `["tag1", "tag2", "tag3"]`) can be indexed with a `LABEL_LIST` index.
|
||||||
|
|
||||||
|
| Data Type | Filter | Index Type |
|
||||||
|
| --------------------------------------------------------------- | ----------------------------------------- | ------------ |
|
||||||
|
| Numeric, String, Temporal | `<`, `=`, `>`, `in`, `between`, `is null` | `BTREE` |
|
||||||
|
| Boolean, numbers or strings with fewer than 1,000 unique values | `<`, `=`, `>`, `in`, `between`, `is null` | `BITMAP` |
|
||||||
|
| Low-cardinality lists of numbers or strings | `array_has_any`, `array_has_all` | `LABEL_LIST` |
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
|
||||||
|
```python
|
||||||
|
import lancedb
|
||||||
|
books = [
|
||||||
|
{"book_id": 1, "publisher": "plenty of books", "tags": ["fantasy", "adventure"]},
|
||||||
|
{"book_id": 2, "publisher": "book town", "tags": ["non-fiction"]},
|
||||||
|
{"book_id": 3, "publisher": "oreilly", "tags": ["textbook"]}
|
||||||
|
]
|
||||||
|
|
||||||
|
db = lancedb.connect("./db")
|
||||||
|
table = db.create_table("books", books)
|
||||||
|
table.create_scalar_index("book_id") # BTree by default
|
||||||
|
table.create_scalar_index("publisher", index_type="BITMAP")
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "Typescript"
|
||||||
|
|
||||||
|
=== "@lancedb/lancedb"
|
||||||
|
|
||||||
|
```js
|
||||||
|
const db = await lancedb.connect("data");
|
||||||
|
const tbl = await db.openTable("books");
|
||||||
|
|
||||||
|
await tbl.createIndex("book_id");
|
||||||
|
await tbl.createIndex("publisher", { config: lancedb.Index.bitmap() });
|
||||||
|
```
|
||||||
|
|
||||||
|
For example, the following scan will be faster if the column `book_id` has a scalar index:
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
|
||||||
|
```python
|
||||||
|
import lancedb
|
||||||
|
|
||||||
|
table = db.open_table("books")
|
||||||
|
my_df = table.search().where("book_id = 2").to_pandas()
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "Typescript"
|
||||||
|
|
||||||
|
=== "@lancedb/lancedb"
|
||||||
|
|
||||||
|
```js
|
||||||
|
const db = await lancedb.connect("data");
|
||||||
|
const tbl = await db.openTable("books");
|
||||||
|
|
||||||
|
await tbl
|
||||||
|
.query()
|
||||||
|
.where("book_id = 2")
|
||||||
|
.limit(10)
|
||||||
|
.toArray();
|
||||||
|
```
|
||||||
|
|
||||||
|
Scalar indices can also speed up queries that combine a vector search or full-text search with a prefilter:
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
|
||||||
|
```python
|
||||||
|
import lancedb
|
||||||
|
|
||||||
|
data = [
|
||||||
|
{"book_id": 1, "vector": [1, 2]},
|
||||||
|
{"book_id": 2, "vector": [3, 4]},
|
||||||
|
{"book_id": 3, "vector": [5, 6]}
|
||||||
|
]
|
||||||
|
table = db.create_table("book_with_embeddings", data)
|
||||||
|
|
||||||
|
(
|
||||||
|
table.search([1, 2])
|
||||||
|
.where("book_id != 3", prefilter=True)
|
||||||
|
.to_pandas()
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "Typescript"
|
||||||
|
|
||||||
|
=== "@lancedb/lancedb"
|
||||||
|
|
||||||
|
```js
|
||||||
|
const db = await lancedb.connect("data/lance");
|
||||||
|
const tbl = await db.openTable("book_with_embeddings");
|
||||||
|
|
||||||
|
await tbl.search([1, 2])
|
||||||
|
.where("book_id != 3") // prefilter is default behavior.
|
||||||
|
.limit(10)
|
||||||
|
.toArray();
|
||||||
|
```
|
||||||
2
docs/test/md_testing.py
Normal file → Executable file
2
docs/test/md_testing.py
Normal file → Executable file
@@ -1,3 +1,5 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import glob
|
import glob
|
||||||
from typing import Iterator, List
|
from typing import Iterator, List
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|||||||
@@ -339,9 +339,9 @@ class Table(ABC):
|
|||||||
def create_scalar_index(
|
def create_scalar_index(
|
||||||
self,
|
self,
|
||||||
column: str,
|
column: str,
|
||||||
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
|
|
||||||
*,
|
*,
|
||||||
replace: bool = True,
|
replace: bool = True,
|
||||||
|
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
|
||||||
):
|
):
|
||||||
"""Create a scalar index on a column.
|
"""Create a scalar index on a column.
|
||||||
|
|
||||||
@@ -391,6 +391,8 @@ class Table(ABC):
|
|||||||
or string column.
|
or string column.
|
||||||
replace : bool, default True
|
replace : bool, default True
|
||||||
Replace the existing index if it exists.
|
Replace the existing index if it exists.
|
||||||
|
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"], default "BTREE"
|
||||||
|
The type of index to create.
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
--------
|
--------
|
||||||
@@ -1232,9 +1234,9 @@ class LanceTable(Table):
|
|||||||
def create_scalar_index(
|
def create_scalar_index(
|
||||||
self,
|
self,
|
||||||
column: str,
|
column: str,
|
||||||
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
|
|
||||||
*,
|
*,
|
||||||
replace: bool = True,
|
replace: bool = True,
|
||||||
|
index_type: Literal["BTREE", "BITMAP", "LABEL_LIST"] = "BTREE",
|
||||||
):
|
):
|
||||||
self._dataset_mut.create_scalar_index(
|
self._dataset_mut.create_scalar_index(
|
||||||
column, index_type=index_type, replace=replace
|
column, index_type=index_type, replace=replace
|
||||||
|
|||||||
Reference in New Issue
Block a user