mirror of
https://github.com/lancedb/lancedb.git
synced 2026-01-06 11:52:57 +00:00
feat: support to build FTS without positions (#1621)
This commit is contained in:
@@ -62,7 +62,7 @@ Consider that we have a LanceDB table named `my_table`, whose string column `tex
|
|||||||
});
|
});
|
||||||
|
|
||||||
await tbl
|
await tbl
|
||||||
.search("puppy")
|
.search("puppy", queryType="fts")
|
||||||
.select(["text"])
|
.select(["text"])
|
||||||
.limit(10)
|
.limit(10)
|
||||||
.toArray();
|
.toArray();
|
||||||
@@ -205,7 +205,7 @@ table.create_fts_index(["text_field"], use_tantivy=True, ordering_field_names=["
|
|||||||
## Phrase queries vs. terms queries
|
## Phrase queries vs. terms queries
|
||||||
|
|
||||||
!!! warning "Warn"
|
!!! warning "Warn"
|
||||||
Phrase queries are available for only Tantivy-based FTS
|
Lance-based FTS doesn't support queries that combine terms with the boolean operators `OR` and `AND`.
|
||||||
|
|
||||||
For full-text search you can specify either a **phrase** query like `"the old man and the sea"`,
|
For full-text search you can specify either a **phrase** query like `"the old man and the sea"`,
|
||||||
or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms
|
or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms
|
||||||
|
|||||||
@@ -844,6 +844,38 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])(
|
|||||||
expect(results[0].text).toBe(data[0].text);
|
expect(results[0].text).toBe(data[0].text);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("full text search without positions", async () => {
|
||||||
|
const db = await connect(tmpDir.name);
|
||||||
|
const data = [
|
||||||
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
||||||
|
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
||||||
|
];
|
||||||
|
const table = await db.createTable("test", data);
|
||||||
|
await table.createIndex("text", {
|
||||||
|
config: Index.fts({ withPositions: false }),
|
||||||
|
});
|
||||||
|
|
||||||
|
const results = await table.search("hello").toArray();
|
||||||
|
expect(results[0].text).toBe(data[0].text);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("full text search phrase query", async () => {
|
||||||
|
const db = await connect(tmpDir.name);
|
||||||
|
const data = [
|
||||||
|
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
||||||
|
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
||||||
|
];
|
||||||
|
const table = await db.createTable("test", data);
|
||||||
|
await table.createIndex("text", {
|
||||||
|
config: Index.fts(),
|
||||||
|
});
|
||||||
|
|
||||||
|
const results = await table.search("world").toArray();
|
||||||
|
expect(results.length).toBe(2);
|
||||||
|
const phraseResults = await table.search('"hello world"').toArray();
|
||||||
|
expect(phraseResults.length).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
test.each([
|
test.each([
|
||||||
[0.4, 0.5, 0.599], // number[]
|
[0.4, 0.5, 0.599], // number[]
|
||||||
Float32Array.of(0.4, 0.5, 0.599), // Float32Array
|
Float32Array.of(0.4, 0.5, 0.599), // Float32Array
|
||||||
|
|||||||
@@ -113,6 +113,19 @@ export interface IvfPqOptions {
|
|||||||
sampleRate?: number;
|
sampleRate?: number;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Options to create a full text search index
|
||||||
|
*/
|
||||||
|
export interface FtsOptions {
|
||||||
|
/**
|
||||||
|
* Whether to build the index with positions.
|
||||||
|
* True by default.
|
||||||
|
* If set to false, the index will not store the positions of the tokens in the text,
|
||||||
|
* which will make the index smaller and faster to build, but will not support phrase queries.
|
||||||
|
*/
|
||||||
|
withPositions?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
export class Index {
|
export class Index {
|
||||||
private readonly inner: LanceDbIndex;
|
private readonly inner: LanceDbIndex;
|
||||||
private constructor(inner: LanceDbIndex) {
|
private constructor(inner: LanceDbIndex) {
|
||||||
@@ -211,8 +224,8 @@ export class Index {
|
|||||||
*
|
*
|
||||||
* For now, the full text search index only supports English, and doesn't support phrase search.
|
* For now, the full text search index only supports English. Phrase search requires the index to be built with positions (the default).
|
||||||
*/
|
*/
|
||||||
static fts() {
|
static fts(options?: Partial<FtsOptions>) {
|
||||||
return new Index(LanceDbIndex.fts());
|
return new Index(LanceDbIndex.fts(options?.withPositions));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -92,9 +92,13 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[napi(factory)]
|
#[napi(factory)]
|
||||||
pub fn fts() -> Self {
|
pub fn fts(with_position: Option<bool>) -> Self {
|
||||||
|
let mut opts = FtsIndexBuilder::default();
|
||||||
|
if let Some(with_position) = with_position {
|
||||||
|
opts = opts.with_position(with_position);
|
||||||
|
}
|
||||||
Self {
|
Self {
|
||||||
inner: Mutex::new(Some(LanceDbIndex::FTS(FtsIndexBuilder::default()))),
|
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -78,8 +78,8 @@ class FTS:
|
|||||||
For example, it works with `title`, `description`, `content`, etc.
|
For example, it works with `title`, `description`, `content`, etc.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, with_position: bool = True):
|
||||||
self._inner = LanceDbIndex.fts()
|
self._inner = LanceDbIndex.fts(with_position=with_position)
|
||||||
|
|
||||||
|
|
||||||
class IvfPq:
|
class IvfPq:
|
||||||
|
|||||||
@@ -126,6 +126,7 @@ class RemoteTable(Table):
|
|||||||
column: str,
|
column: str,
|
||||||
*,
|
*,
|
||||||
replace: bool = False,
|
replace: bool = False,
|
||||||
|
with_position: bool = True,
|
||||||
):
|
):
|
||||||
data = {
|
data = {
|
||||||
"column": column,
|
"column": column,
|
||||||
|
|||||||
@@ -468,6 +468,7 @@ class Table(ABC):
|
|||||||
ordering_field_names: Union[str, List[str]] = None,
|
ordering_field_names: Union[str, List[str]] = None,
|
||||||
*,
|
*,
|
||||||
replace: bool = False,
|
replace: bool = False,
|
||||||
|
with_position: bool = True,
|
||||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||||
tokenizer_name: str = "default",
|
tokenizer_name: str = "default",
|
||||||
use_tantivy: bool = True,
|
use_tantivy: bool = True,
|
||||||
@@ -500,6 +501,12 @@ class Table(ABC):
|
|||||||
use_tantivy: bool, default True
|
use_tantivy: bool, default True
|
||||||
If True, use the legacy full-text search implementation based on tantivy.
|
If True, use the legacy full-text search implementation based on tantivy.
|
||||||
If False, use the new full-text search implementation based on lance-index.
|
If False, use the new full-text search implementation based on lance-index.
|
||||||
|
with_position: bool, default True
|
||||||
|
Only available with use_tantivy=False
|
||||||
|
If False, do not store the positions of the terms in the text.
|
||||||
|
This can reduce the size of the index and improve indexing speed.
|
||||||
|
But it will not be possible to use phrase queries.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
@@ -1305,6 +1312,7 @@ class LanceTable(Table):
|
|||||||
ordering_field_names: Union[str, List[str]] = None,
|
ordering_field_names: Union[str, List[str]] = None,
|
||||||
*,
|
*,
|
||||||
replace: bool = False,
|
replace: bool = False,
|
||||||
|
with_position: bool = True,
|
||||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||||
tokenizer_name: str = "default",
|
tokenizer_name: str = "default",
|
||||||
use_tantivy: bool = True,
|
use_tantivy: bool = True,
|
||||||
@@ -1318,7 +1326,10 @@ class LanceTable(Table):
|
|||||||
if exist:
|
if exist:
|
||||||
fs.delete_dir(path)
|
fs.delete_dir(path)
|
||||||
self._dataset_mut.create_scalar_index(
|
self._dataset_mut.create_scalar_index(
|
||||||
field_names, index_type="INVERTED", replace=replace
|
field_names,
|
||||||
|
index_type="INVERTED",
|
||||||
|
replace=replace,
|
||||||
|
with_position=with_position,
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@@ -140,8 +140,11 @@ def test_create_index_with_stemming(tmp_path, table):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||||
def test_create_inverted_index(table, use_tantivy):
|
@pytest.mark.parametrize("with_position", [True, False])
|
||||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
def test_create_inverted_index(table, use_tantivy, with_position):
|
||||||
|
if use_tantivy and not with_position:
|
||||||
|
pytest.skip("we don't support to build tantivy index without position")
|
||||||
|
table.create_fts_index("text", use_tantivy=use_tantivy, with_position=with_position)
|
||||||
|
|
||||||
|
|
||||||
def test_populate_index(tmp_path, table):
|
def test_populate_index(tmp_path, table):
|
||||||
@@ -166,6 +169,40 @@ def test_search_fts(table, use_tantivy):
|
|||||||
assert len(results) == 5
|
assert len(results) == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_fts_phrase_query(table):
|
||||||
|
table.create_fts_index("text", use_tantivy=False, with_position=False)
|
||||||
|
try:
|
||||||
|
phrase_results = table.search('"puppy runs"').limit(100).to_list()
|
||||||
|
assert False
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
table.create_fts_index("text", use_tantivy=False, replace=True)
|
||||||
|
results = table.search("puppy").limit(100).to_list()
|
||||||
|
phrase_results = table.search('"puppy runs"').limit(100).to_list()
|
||||||
|
assert len(results) > len(phrase_results)
|
||||||
|
assert len(phrase_results) > 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_search_fts_phrase_query_async(async_table):
|
||||||
|
async_table = await async_table
|
||||||
|
await async_table.create_index("text", config=FTS(with_position=False))
|
||||||
|
try:
|
||||||
|
phrase_results = (
|
||||||
|
await async_table.query().nearest_to_text("puppy runs").limit(100).to_list()
|
||||||
|
)
|
||||||
|
assert False
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
await async_table.create_index("text", config=FTS())
|
||||||
|
results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
|
||||||
|
phrase_results = (
|
||||||
|
await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
|
||||||
|
)
|
||||||
|
assert len(results) > len(phrase_results)
|
||||||
|
assert len(phrase_results) > 0
|
||||||
|
|
||||||
|
|
||||||
def test_search_fts_specify_column(table):
|
def test_search_fts_specify_column(table):
|
||||||
table.create_fts_index("text", use_tantivy=False)
|
table.create_fts_index("text", use_tantivy=False)
|
||||||
table.create_fts_index("text2", use_tantivy=False)
|
table.create_fts_index("text2", use_tantivy=False)
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
|
|
||||||
|
use lancedb::index::scalar::FtsIndexBuilder;
|
||||||
use lancedb::{
|
use lancedb::{
|
||||||
index::{scalar::BTreeIndexBuilder, vector::IvfPqIndexBuilder, Index as LanceDbIndex},
|
index::{scalar::BTreeIndexBuilder, vector::IvfPqIndexBuilder, Index as LanceDbIndex},
|
||||||
DistanceType,
|
DistanceType,
|
||||||
@@ -100,10 +101,14 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
pub fn fts() -> PyResult<Self> {
|
pub fn fts(with_position: Option<bool>) -> Self {
|
||||||
Ok(Self {
|
let mut opts = FtsIndexBuilder::default();
|
||||||
inner: Mutex::new(Some(LanceDbIndex::FTS(Default::default()))),
|
if let Some(with_position) = with_position {
|
||||||
})
|
opts = opts.with_position(with_position);
|
||||||
|
}
|
||||||
|
Self {
|
||||||
|
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -51,9 +51,25 @@ pub struct LabelListIndexBuilder {}
|
|||||||
/// Builder for a full text search index
|
/// Builder for a full text search index
|
||||||
///
|
///
|
||||||
/// A full text search index is an index on a string column that allows for full text search
|
/// A full text search index is an index on a string column that allows for full text search
|
||||||
#[derive(Debug, Clone, Default)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct FtsIndexBuilder {}
|
pub struct FtsIndexBuilder {
|
||||||
|
pub(crate) with_position: bool,
|
||||||
|
}
|
||||||
|
|
||||||
impl FtsIndexBuilder {}
|
impl Default for FtsIndexBuilder {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
with_position: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FtsIndexBuilder {
|
||||||
|
/// Set the with_position flag
|
||||||
|
pub fn with_position(mut self, with_position: bool) -> Self {
|
||||||
|
self.with_position = with_position;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub use lance_index::scalar::FullTextSearchQuery;
|
pub use lance_index::scalar::FullTextSearchQuery;
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ use crate::arrow::IntoArrow;
|
|||||||
use crate::connection::NoData;
|
use crate::connection::NoData;
|
||||||
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
|
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
|
||||||
use crate::error::{Error, Result};
|
use crate::error::{Error, Result};
|
||||||
|
use crate::index::scalar::FtsIndexBuilder;
|
||||||
use crate::index::vector::{
|
use crate::index::vector::{
|
||||||
IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
|
IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
|
||||||
};
|
};
|
||||||
@@ -1609,7 +1610,12 @@ impl NativeTable {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn create_fts_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
async fn create_fts_index(
|
||||||
|
&self,
|
||||||
|
field: &Field,
|
||||||
|
fts_opts: FtsIndexBuilder,
|
||||||
|
replace: bool,
|
||||||
|
) -> Result<()> {
|
||||||
if !Self::supported_fts_data_type(field.data_type()) {
|
if !Self::supported_fts_data_type(field.data_type()) {
|
||||||
return Err(Error::Schema {
|
return Err(Error::Schema {
|
||||||
message: format!(
|
message: format!(
|
||||||
@@ -1621,16 +1627,16 @@ impl NativeTable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let mut dataset = self.dataset.get_mut().await?;
|
let mut dataset = self.dataset.get_mut().await?;
|
||||||
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
|
let fts_params = lance_index::scalar::InvertedIndexParams {
|
||||||
force_index_type: Some(lance_index::scalar::ScalarIndexType::Inverted),
|
with_position: fts_opts.with_position,
|
||||||
};
|
};
|
||||||
dataset
|
dataset
|
||||||
.create_index(
|
.create_index(
|
||||||
&[field.name()],
|
&[field.name()],
|
||||||
IndexType::Scalar,
|
IndexType::Inverted,
|
||||||
None,
|
None,
|
||||||
&lance_idx_params,
|
&fts_params,
|
||||||
opts.replace,
|
replace,
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -1802,7 +1808,7 @@ impl TableInternal for NativeTable {
|
|||||||
Index::BTree(_) => self.create_btree_index(field, opts).await,
|
Index::BTree(_) => self.create_btree_index(field, opts).await,
|
||||||
Index::Bitmap(_) => self.create_bitmap_index(field, opts).await,
|
Index::Bitmap(_) => self.create_bitmap_index(field, opts).await,
|
||||||
Index::LabelList(_) => self.create_label_list_index(field, opts).await,
|
Index::LabelList(_) => self.create_label_list_index(field, opts).await,
|
||||||
Index::FTS(_) => self.create_fts_index(field, opts).await,
|
Index::FTS(fts_opts) => self.create_fts_index(field, fts_opts, opts.replace).await,
|
||||||
Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
|
Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
|
||||||
Index::IvfHnswPq(ivf_hnsw_pq) => {
|
Index::IvfHnswPq(ivf_hnsw_pq) => {
|
||||||
self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)
|
self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)
|
||||||
|
|||||||
Reference in New Issue
Block a user