feat: support to build FTS without positions (#1621)

This commit is contained in:
BubbleCal
2024-09-10 22:51:32 +08:00
committed by GitHub
parent a405847f9b
commit 2bde5401eb
11 changed files with 150 additions and 25 deletions

View File

@@ -62,7 +62,7 @@ Consider that we have a LanceDB table named `my_table`, whose string column `tex
}); });
await tbl await tbl
.search("puppy") .search("puppy", queryType="fts")
.select(["text"]) .select(["text"])
.limit(10) .limit(10)
.toArray(); .toArray();
@@ -205,7 +205,7 @@ table.create_fts_index(["text_field"], use_tantivy=True, ordering_field_names=["
## Phrase queries vs. terms queries ## Phrase queries vs. terms queries
!!! warning "Warn" !!! warning "Warn"
Phrase queries are available only for Tantivy-based FTS Lance-based FTS doesn't support queries that combine terms with the boolean operators `OR` and `AND`.
For full-text search you can specify either a **phrase** query like `"the old man and the sea"`, For full-text search you can specify either a **phrase** query like `"the old man and the sea"`,
or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms

View File

@@ -844,6 +844,38 @@ describe.each([arrow13, arrow14, arrow15, arrow16, arrow17])(
expect(results[0].text).toBe(data[0].text); expect(results[0].text).toBe(data[0].text);
}); });
test("full text search without positions", async () => {
  // Two documents; only the first one contains the term "hello".
  const rows = [
    { text: "hello world", vector: [0.1, 0.2, 0.3] },
    { text: "goodbye world", vector: [0.4, 0.5, 0.6] },
  ];
  const conn = await connect(tmpDir.name);
  const tbl = await conn.createTable("test", rows);

  // Build the FTS index with token positions turned off.
  const ftsWithoutPositions = Index.fts({ withPositions: false });
  await tbl.createIndex("text", { config: ftsWithoutPositions });

  // A plain term query is still expected to work on such an index.
  const hits = await tbl.search("hello").toArray();
  expect(hits[0].text).toBe(rows[0].text);
});
test("full text search phrase query", async () => {
  // Both documents contain "world"; only the first contains "hello world".
  const rows = [
    { text: "hello world", vector: [0.1, 0.2, 0.3] },
    { text: "goodbye world", vector: [0.4, 0.5, 0.6] },
  ];
  const conn = await connect(tmpDir.name);
  const tbl = await conn.createTable("test", rows);

  // Default FTS options are used here (no explicit withPositions override).
  const defaultFts = Index.fts();
  await tbl.createIndex("text", { config: defaultFts });

  // Term query: matches every row containing the term.
  const termHits = await tbl.search("world").toArray();
  expect(termHits.length).toBe(2);

  // Quoted query: treated as a phrase, so only the exact sequence matches.
  const phraseHits = await tbl.search('"hello world"').toArray();
  expect(phraseHits.length).toBe(1);
});
test.each([ test.each([
[0.4, 0.5, 0.599], // number[] [0.4, 0.5, 0.599], // number[]
Float32Array.of(0.4, 0.5, 0.599), // Float32Array Float32Array.of(0.4, 0.5, 0.599), // Float32Array

View File

@@ -113,6 +113,19 @@ export interface IvfPqOptions {
sampleRate?: number; sampleRate?: number;
} }
/**
* Options to create a full text search index.
*
* Every field is optional; an omitted field falls back to its documented default.
*/
export interface FtsOptions {
/**
* Whether to build the index with positions.
* True by default.
* If set to false, the index will not store the positions of the tokens in the text,
* which will make the index smaller and faster to build, but will not support phrase queries.
*
* @default true
*/
withPositions?: boolean;
}
export class Index { export class Index {
private readonly inner: LanceDbIndex; private readonly inner: LanceDbIndex;
private constructor(inner: LanceDbIndex) { private constructor(inner: LanceDbIndex) {
@@ -211,8 +224,8 @@ export class Index {
* *
* For now, the full text search index only supports English, and doesn't support phrase search. * For now, the full text search index only supports English, and doesn't support phrase search.
*/ */
static fts() { static fts(options?: Partial<FtsOptions>) {
return new Index(LanceDbIndex.fts()); return new Index(LanceDbIndex.fts(options?.withPositions));
} }
} }

View File

@@ -92,9 +92,13 @@ impl Index {
} }
#[napi(factory)] #[napi(factory)]
pub fn fts() -> Self { pub fn fts(with_position: Option<bool>) -> Self {
let mut opts = FtsIndexBuilder::default();
if let Some(with_position) = with_position {
opts = opts.with_position(with_position);
}
Self { Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(FtsIndexBuilder::default()))), inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
} }
} }
} }

View File

@@ -78,8 +78,8 @@ class FTS:
For example, it works with `title`, `description`, `content`, etc. For example, it works with `title`, `description`, `content`, etc.
""" """
def __init__(self): def __init__(self, with_position: bool = True):
self._inner = LanceDbIndex.fts() self._inner = LanceDbIndex.fts(with_position=with_position)
class IvfPq: class IvfPq:

View File

@@ -126,6 +126,7 @@ class RemoteTable(Table):
column: str, column: str,
*, *,
replace: bool = False, replace: bool = False,
with_position: bool = True,
): ):
data = { data = {
"column": column, "column": column,

View File

@@ -468,6 +468,7 @@ class Table(ABC):
ordering_field_names: Union[str, List[str]] = None, ordering_field_names: Union[str, List[str]] = None,
*, *,
replace: bool = False, replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024, writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default", tokenizer_name: str = "default",
use_tantivy: bool = True, use_tantivy: bool = True,
@@ -500,6 +501,12 @@ class Table(ABC):
use_tantivy: bool, default True use_tantivy: bool, default True
If True, use the legacy full-text search implementation based on tantivy. If True, use the legacy full-text search implementation based on tantivy.
If False, use the new full-text search implementation based on lance-index. If False, use the new full-text search implementation based on lance-index.
with_position: bool, default True
Only available with use_tantivy=False
If False, do not store the positions of the terms in the text.
This can reduce the size of the index and improve indexing speed.
But it will not be possible to use phrase queries.
""" """
raise NotImplementedError raise NotImplementedError
@@ -1305,6 +1312,7 @@ class LanceTable(Table):
ordering_field_names: Union[str, List[str]] = None, ordering_field_names: Union[str, List[str]] = None,
*, *,
replace: bool = False, replace: bool = False,
with_position: bool = True,
writer_heap_size: Optional[int] = 1024 * 1024 * 1024, writer_heap_size: Optional[int] = 1024 * 1024 * 1024,
tokenizer_name: str = "default", tokenizer_name: str = "default",
use_tantivy: bool = True, use_tantivy: bool = True,
@@ -1318,7 +1326,10 @@ class LanceTable(Table):
if exist: if exist:
fs.delete_dir(path) fs.delete_dir(path)
self._dataset_mut.create_scalar_index( self._dataset_mut.create_scalar_index(
field_names, index_type="INVERTED", replace=replace field_names,
index_type="INVERTED",
replace=replace,
with_position=with_position,
) )
return return

View File

@@ -140,8 +140,11 @@ def test_create_index_with_stemming(tmp_path, table):
@pytest.mark.parametrize("use_tantivy", [True, False]) @pytest.mark.parametrize("use_tantivy", [True, False])
def test_create_inverted_index(table, use_tantivy): @pytest.mark.parametrize("with_position", [True, False])
table.create_fts_index("text", use_tantivy=use_tantivy) def test_create_inverted_index(table, use_tantivy, with_position):
if use_tantivy and not with_position:
pytest.skip("we don't support to build tantivy index without position")
table.create_fts_index("text", use_tantivy=use_tantivy, with_position=with_position)
def test_populate_index(tmp_path, table): def test_populate_index(tmp_path, table):
@@ -166,6 +169,40 @@ def test_search_fts(table, use_tantivy):
assert len(results) == 5 assert len(results) == 5
def test_search_fts_phrase_query(table):
    """Phrase queries on the native (non-tantivy) FTS index need positions.

    First builds the index with ``with_position=False`` and checks that a
    quoted phrase query is rejected, then rebuilds it with the default
    settings and checks that the phrase query returns a non-empty, strictly
    smaller result set than the single-term query.
    """
    table.create_fts_index("text", use_tantivy=False, with_position=False)
    # pytest.raises fails the test when nothing is raised. The previous
    # ``try: ...; assert False / except Exception: pass`` form caught its own
    # AssertionError, so the negative check could never fail.
    with pytest.raises(Exception):
        table.search('"puppy runs"').limit(100).to_list()

    # Rebuild the index with the default options, replacing the old one.
    table.create_fts_index("text", use_tantivy=False, replace=True)
    results = table.search("puppy").limit(100).to_list()
    phrase_results = table.search('"puppy runs"').limit(100).to_list()
    # The phrase is more selective than the single term but must still match.
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
@pytest.mark.asyncio
async def test_search_fts_phrase_query_async(async_table):
    """Async counterpart of the phrase-query test.

    Builds the FTS index without positions and checks that a quoted phrase
    query is rejected, then rebuilds it with the default ``FTS()`` config and
    checks that the phrase query returns a non-empty, strictly smaller result
    set than the single-term query.
    """
    async_table = await async_table
    await async_table.create_index("text", config=FTS(with_position=False))
    # The query must be wrapped in double quotes to be a phrase query; the
    # unquoted form used before was a plain terms query. pytest.raises also
    # replaces the old ``assert False`` inside ``try/except Exception``, which
    # swallowed its own AssertionError and could never fail.
    with pytest.raises(Exception):
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()

    # Recreate the index with the default config.
    await async_table.create_index("text", config=FTS())
    results = await async_table.query().nearest_to_text("puppy").limit(100).to_list()
    phrase_results = (
        await async_table.query().nearest_to_text('"puppy runs"').limit(100).to_list()
    )
    # The phrase is more selective than the single term but must still match.
    assert len(results) > len(phrase_results)
    assert len(phrase_results) > 0
def test_search_fts_specify_column(table): def test_search_fts_specify_column(table):
table.create_fts_index("text", use_tantivy=False) table.create_fts_index("text", use_tantivy=False)
table.create_fts_index("text2", use_tantivy=False) table.create_fts_index("text2", use_tantivy=False)

View File

@@ -14,6 +14,7 @@
use std::sync::Mutex; use std::sync::Mutex;
use lancedb::index::scalar::FtsIndexBuilder;
use lancedb::{ use lancedb::{
index::{scalar::BTreeIndexBuilder, vector::IvfPqIndexBuilder, Index as LanceDbIndex}, index::{scalar::BTreeIndexBuilder, vector::IvfPqIndexBuilder, Index as LanceDbIndex},
DistanceType, DistanceType,
@@ -100,10 +101,14 @@ impl Index {
} }
#[staticmethod] #[staticmethod]
pub fn fts() -> PyResult<Self> { pub fn fts(with_position: Option<bool>) -> Self {
Ok(Self { let mut opts = FtsIndexBuilder::default();
inner: Mutex::new(Some(LanceDbIndex::FTS(Default::default()))), if let Some(with_position) = with_position {
}) opts = opts.with_position(with_position);
}
Self {
inner: Mutex::new(Some(LanceDbIndex::FTS(opts))),
}
} }
} }

View File

@@ -51,9 +51,25 @@ pub struct LabelListIndexBuilder {}
/// Builder for a full text search index /// Builder for a full text search index
/// ///
/// A full text search index is an index on a string column that allows for full text search /// A full text search index is an index on a string column that allows for full text search
#[derive(Debug, Clone, Default)] #[derive(Debug, Clone)]
pub struct FtsIndexBuilder {} pub struct FtsIndexBuilder {
pub(crate) with_position: bool,
}
impl FtsIndexBuilder {} impl Default for FtsIndexBuilder {
fn default() -> Self {
Self {
with_position: true,
}
}
}
impl FtsIndexBuilder {
/// Set the with_position flag
pub fn with_position(mut self, with_position: bool) -> Self {
self.with_position = with_position;
self
}
}
pub use lance_index::scalar::FullTextSearchQuery; pub use lance_index::scalar::FullTextSearchQuery;

View File

@@ -52,6 +52,7 @@ use crate::arrow::IntoArrow;
use crate::connection::NoData; use crate::connection::NoData;
use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry}; use crate::embeddings::{EmbeddingDefinition, EmbeddingRegistry, MaybeEmbedded, MemoryRegistry};
use crate::error::{Error, Result}; use crate::error::{Error, Result};
use crate::index::scalar::FtsIndexBuilder;
use crate::index::vector::{ use crate::index::vector::{
IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex, IvfHnswPqIndexBuilder, IvfHnswSqIndexBuilder, IvfPqIndexBuilder, VectorIndex,
}; };
@@ -1609,7 +1610,12 @@ impl NativeTable {
Ok(()) Ok(())
} }
async fn create_fts_index(&self, field: &Field, opts: IndexBuilder) -> Result<()> { async fn create_fts_index(
&self,
field: &Field,
fts_opts: FtsIndexBuilder,
replace: bool,
) -> Result<()> {
if !Self::supported_fts_data_type(field.data_type()) { if !Self::supported_fts_data_type(field.data_type()) {
return Err(Error::Schema { return Err(Error::Schema {
message: format!( message: format!(
@@ -1621,16 +1627,16 @@ impl NativeTable {
} }
let mut dataset = self.dataset.get_mut().await?; let mut dataset = self.dataset.get_mut().await?;
let lance_idx_params = lance_index::scalar::ScalarIndexParams { let fts_params = lance_index::scalar::InvertedIndexParams {
force_index_type: Some(lance_index::scalar::ScalarIndexType::Inverted), with_position: fts_opts.with_position,
}; };
dataset dataset
.create_index( .create_index(
&[field.name()], &[field.name()],
IndexType::Scalar, IndexType::Inverted,
None, None,
&lance_idx_params, &fts_params,
opts.replace, replace,
) )
.await?; .await?;
Ok(()) Ok(())
@@ -1802,7 +1808,7 @@ impl TableInternal for NativeTable {
Index::BTree(_) => self.create_btree_index(field, opts).await, Index::BTree(_) => self.create_btree_index(field, opts).await,
Index::Bitmap(_) => self.create_bitmap_index(field, opts).await, Index::Bitmap(_) => self.create_bitmap_index(field, opts).await,
Index::LabelList(_) => self.create_label_list_index(field, opts).await, Index::LabelList(_) => self.create_label_list_index(field, opts).await,
Index::FTS(_) => self.create_fts_index(field, opts).await, Index::FTS(fts_opts) => self.create_fts_index(field, fts_opts, opts.replace).await,
Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await, Index::IvfPq(ivf_pq) => self.create_ivf_pq_index(ivf_pq, field, opts.replace).await,
Index::IvfHnswPq(ivf_hnsw_pq) => { Index::IvfHnswPq(ivf_hnsw_pq) => {
self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace) self.create_ivf_hnsw_pq_index(ivf_hnsw_pq, field, opts.replace)