mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-14 02:20:40 +00:00
feat: support to build FTS without positions (#1621)
This commit is contained in:
@@ -62,7 +62,7 @@ Consider that we have a LanceDB table named `my_table`, whose string column `tex
|
||||
});
|
||||
|
||||
await tbl
|
||||
.search("puppy")
|
||||
.search("puppy", queryType="fts")
|
||||
.select(["text"])
|
||||
.limit(10)
|
||||
.toArray();
|
||||
@@ -205,7 +205,7 @@ table.create_fts_index(["text_field"], use_tantivy=True, ordering_field_names=["
|
||||
## Phrase queries vs. terms queries
|
||||
|
||||
!!! warning "Warn"
|
||||
Phrase queries are available for only Tantivy-based FTS
|
||||
Lance-based FTS doesn't support queries that combine terms with the boolean operators `OR` and `AND`.
|
||||
|
||||
For full-text search you can specify either a **phrase** query like `"the old man and the sea"`,
|
||||
or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms
|
||||
|
||||
@@ -844,6 +844,38 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])(
|
||||
expect(results[0].text).toBe(data[0].text);
|
||||
});
|
||||
|
||||
test("full text search without positions", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
||||
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts({ withPositions: false }),
|
||||
});
|
||||
|
||||
const results = await table.search("hello").toArray();
|
||||
expect(results[0].text).toBe(data[0].text);
|
||||
});
|
||||
|
||||
test("full text search phrase query", async () => {
|
||||
const db = await connect(tmpDir.name);
|
||||
const data = [
|
||||
{ text: "hello world", vector: [0.1, 0.2, 0.3] },
|
||||
{ text: "goodbye world", vector: [0.4, 0.5, 0.6] },
|
||||
];
|
||||
const table = await db.createTable("test", data);
|
||||
await table.createIndex("text", {
|
||||
config: Index.fts(),
|
||||
});
|
||||
|
||||
const results = await table.search("world").toArray();
|
||||
expect(results.length).toBe(2);
|
||||
const phraseResults = await table.search('"hello world"').toArray();
|
||||
expect(phraseResults.length).toBe(1);
|
||||
});
|
||||
|
||||
test.each([
|
||||
[0.4, 0.5, 0.599], // number[]
|
||||
Float32Array.of(0.4, 0.5, 0.599), // Float32Array
|
||||
|
||||
@@ -113,6 +113,19 @@ export interface IvfPqOptions {
|
||||
sampleRate?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Options to create a full text search index
|
||||
*/
|
||||
export interface FtsOptions {
|
||||
/**
|
||||
* Whether to build the index with positions.
|
||||
* True by default.
|
||||
* If set to false, the index will not store the positions of the tokens in the text,
|
||||
* which will make the index smaller and faster to build, but will not support phrase queries.
|
||||
*/
|
||||
withPositions?: boolean;
|
||||
}
|
||||
|
||||
export class Index {
|
||||
private readonly inner: LanceDbIndex;
|
||||
private constructor(inner: LanceDbIndex) {
|
||||
@@ -211,8 +224,8 @@ export class Index {
|
||||
*
|
||||
* For now, the full text search index only supports English. Phrase search requires an index built with positions, which is the default.
|
||||
*/
|
||||
static fts() {
|
||||
return new Index(LanceDbIndex.fts());
|
||||
static fts(options?: Partial<FtsOptions>) {
|
||||
return new Index(LanceDbIndex.fts(options?.withPositions));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -92,9 +92,13 @@ impl Index {
|
||||
}
|
||||
|
||||
#[napi(factory)]
|
||||
pub fn fts() -> Self {
|
||||
pub fn fts(with_position: Option<bool>) -> Self {
|
||||
let mut opts = FtsIndexBuilder::default();
|
||||
if let Some(with_position) = with_position {
|
||||
opts = opts.with_position(with_position);
|
||||
}
|
||||
Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(FtsIndexBuilder::default()))),
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,8 +78,8 @@ class FTS:
|
||||
For example, it works with `title`, `description`, `content`, etc.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._inner = LanceDbIndex.fts()
|
||||
def __init__(self, with_position: bool = True):
|
||||
self._inner = LanceDbIndex.fts(with_position=with_position)
|
||||
|
||||
|
||||
class IvfPq:
|
||||
|
||||
@@ -126,6 +126,7 @@ class RemoteTable(Table):
|
||||
column: str,
|
||||
*,
|
||||
replace: bool = False,
|
||||
with_position: bool = True,
|
||||
):
|
||||
data = {
|
||||
"column": column,
|
||||
|
||||
@@ -468,6 +468,7 @@ class Table(ABC):
|
||||
ordering_field_names: Union[str, List[str]] = None,
|
||||
*,
|
||||
replace: bool = False,
|
||||
with_position: bool = True,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
tokenizer_name: str = "default",
|
||||
use_tantivy: bool = True,
|
||||
@@ -500,6 +501,12 @@ class Table(ABC):
|
||||
use_tantivy: bool, default True
|
||||
If True, use the legacy full-text search implementation based on tantivy.
|
||||
If False, use the new full-text search implementation based on lance-index.
|
||||
with_position: bool, default True
|
||||
Only available with use_tantivy=False
|
||||
If False, do not store the positions of the terms in the text.
|
||||
This can reduce the size of the index and improve indexing speed.
|
||||
But it will not be possible to use phrase queries.
|
||||
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -1305,6 +1312,7 @@ class LanceTable(Table):
|
||||
ordering_field_names: Union[str, List[str]] = None,
|
||||
*,
|
||||
replace: bool = False,
|
||||
with_position: bool = True,
|
||||
writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
|
||||
tokenizer_name: str = "default",
|
||||
use_tantivy: bool = True,
|
||||
@@ -1318,7 +1326,10 @@ class LanceTable(Table):
|
||||
if exist:
|
||||
fs.delete_dir(path)
|
||||
self._dataset_mut.create_scalar_index(
|
||||
field_names, index_type="INVERTED", replace=replace
|
||||
field_names,
|
||||
index_type="INVERTED",
|
||||
replace=replace,
|
||||
with_position=with_position,
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
@@ -140,8 +140,11 @@ def test_create_index_with_stemming(tmp_path, table):
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_tantivy", [True, False])
|
||||
def test_create_inverted_index(table, use_tantivy):
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy)
|
||||
@pytest.mark.parametrize("with_position", [True, False])
|
||||
def test_create_inverted_index(table, use_tantivy, with_position):
|
||||
if use_tantivy and not with_position:
|
||||
pytest.skip("we don't support to build tantivy index without position")
|
||||
table.create_fts_index("text", use_tantivy=use_tantivy, with_position=with_position)
|
||||
|
||||
|
||||
def test_populate_index(tmp_path, table):
|
||||
@@ -166,6 +169,40 @@ def test_search_fts(table, use_tantivy):
|
||||
assert len(results) == 5
|
||||
|
||||
|
||||
def test_search_fts_phrase_query(table):
    """Phrase queries fail on a position-less index and work once positions exist."""
    # Build the Lance-based index without token positions.
    table.create_fts_index("text", use_tantivy=False, with_position=False)

    # pytest.raises fails the test when nothing is raised. The earlier
    # `try: ...; assert False; except Exception: pass` shape could never fail,
    # because the AssertionError is itself an Exception and was swallowed.
    with pytest.raises(Exception):
        table.search('"puppy runs"').limit(100).to_list()

    # Rebuild with positions (the default) so phrase queries become available.
    table.create_fts_index("text", use_tantivy=False, replace=True)
    results = table.search("puppy").limit(100).to_list()
    phrase_results = table.search('"puppy runs"').limit(100).to_list()

    # The phrase must match something, but fewer rows than the single term.
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_search_fts_phrase_query_async(async_table):
    """Async variant: phrase queries need an index built with positions."""
    async_table = await async_table

    # Build the index without token positions.
    await async_table.create_index("text", config=FTS(with_position=False))

    # The query is quoted so that it is actually a phrase query; the unquoted
    # form is a plain terms query. pytest.raises replaces the previous
    # `assert False` inside `try/except Exception: pass`, which swallowed its
    # own AssertionError and therefore could never fail.
    # NOTE(review): assumes the async path rejects phrase queries on a
    # position-less index the same way the sync path does - confirm.
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()

    # Recreate the index with the default options (positions enabled).
    await async_table.create_index("text", config=FTS())
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
    )

    # The phrase must match something, but fewer rows than the single term.
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
|
||||
|
||||
|
||||
def test_search_fts_specify_column(table):
|
||||
table.create_fts_index("text", use_tantivy=False)
|
||||
table.create_fts_index("text2", use_tantivy=False)
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
|
||||
use std::sync::Mutex;
|
||||
|
||||
use lancedb::index::scalar::FtsIndexBuilder;
|
||||
use lancedb::{
|
||||
index::{scalar::BTreeIndexBuilder, vector::IvfPqIndexBuilder, Index as LanceDbIndex},
|
||||
DistanceType,
|
||||
@@ -100,10 +101,14 @@ impl Index {
|
||||
}
|
||||
|
||||
#[staticmethod]
|
||||
pub fn fts() -> PyResult<Self> {
|
||||
Ok(Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(Default::default()))),
|
||||
})
|
||||
pub fn fts(with_position: Option<bool>) -> Self {
|
||||
let mut opts = FtsIndexBuilder::default();
|
||||
if let Some(with_position) = with_position {
|
||||
opts = opts.with_position(with_position);
|
||||
}
|
||||
Self {
|
||||
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -51,9 +51,25 @@ pub struct LabelListIndexBuilder {}
|
||||
/// Builder for a full text search index
|
||||
///
|
||||
/// A full text search index is an index on a string column that allows for full text search
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct FtsIndexBuilder {}
|
||||
#[derive(Debug, Clone)]
pub struct FtsIndexBuilder {
    /// Whether the index stores the positions of tokens in the text.
    pub(crate) with_position: bool,
}

impl Default for FtsIndexBuilder {
    /// Create a builder that stores token positions.
    fn default() -> Self {
        Self {
            with_position: true,
        }
    }
}

impl FtsIndexBuilder {
    /// Set whether the index stores token positions.
    ///
    /// Passing `false` skips the positions, which makes the index smaller and
    /// faster to build, but phrase queries are then not supported.
    ///
    /// This method consumes the builder and returns the updated one, so the
    /// result must be used; dropping it discards the setting.
    #[must_use]
    pub fn with_position(mut self, with_position: bool) -> Self {
        self.with_position = with_position;
        self
    }
}
|
||||
|
||||
pub use lance_index::scalar::FullTextSearchQuery;
|
||||
|
||||
@@ -52,6 +52,7 @@ use crate::arrow::IntoArrow;
|
||||
use crate::connection::NoData;
|
||||
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
|
||||
use crate::error::{Error, Result};
|
||||
use crate::index::scalar::FtsIndexBuilder;
|
||||
use crate::index::vector::{
|
||||
IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
|
||||
};
|
||||
@@ -1609,7 +1610,12 @@ impl NativeTable {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_fts_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> {
|
||||
async fn create_fts_index(
|
||||
&self,
|
||||
field: &Field,
|
||||
fts_opts: FtsIndexBuilder,
|
||||
replace: bool,
|
||||
) -> Result<()> {
|
||||
if !Self::supported_fts_data_type(field.data_type()) {
|
||||
return Err(Error::Schema {
|
||||
message: format!(
|
||||
@@ -1621,16 +1627,16 @@ impl NativeTable {
|
||||
}
|
||||
|
||||
let mut dataset = self.dataset.get_mut().await?;
|
||||
let lance_idx_params = lance_index::scalar::ScalarIndexParams {
|
||||
force_index_type: Some(lance_index::scalar::ScalarIndexType::Inverted),
|
||||
let fts_params = lance_index::scalar::InvertedIndexParams {
|
||||
with_position: fts_opts.with_position,
|
||||
};
|
||||
dataset
|
||||
.create_index(
|
||||
&[field.name()],
|
||||
IndexType::Scalar,
|
||||
IndexType::Inverted,
|
||||
None,
|
||||
&lance_idx_params,
|
||||
opts.replace,
|
||||
&fts_params,
|
||||
replace,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
@@ -1802,7 +1808,7 @@ impl TableInternal for NativeTable {
|
||||
Index::BTree(_) => self.create_btree_index(field, opts).await,
|
||||
Index::Bitmap(_) => self.create_bitmap_index(field, opts).await,
|
||||
Index::LabelList(_) => self.create_label_list_index(field, opts).await,
|
||||
Index::FTS(_) => self.create_fts_index(field, opts).await,
|
||||
Index::FTS(fts_opts) => self.create_fts_index(field, fts_opts, opts.replace).await,
|
||||
Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
|
||||
Index::IvfHnswPq(ivf_hnsw_pq) => {
|
||||
self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)
|
||||
|
||||
Reference in New Issue
Block a user