diff --git a/docs/src/fts.md b/docs/src/fts.md index 8855a5e7..61665f72 100644 --- a/docs/src/fts.md +++ b/docs/src/fts.md @@ -75,21 +75,40 @@ applied on top of the full text search results. This can be invoked via the fami table.search("puppy").limit(10).where("meta='foo'").to_list() ``` -## Syntax +## Phrase queries vs. terms queries -For full-text search you can perform either a phrase query like "the old man and the sea", -or a structured search query like "(Old AND Man) AND Sea". -Double quotes are used to disambiguate. +For full-text search you can specify either a **phrase** query like `"the old man and the sea"`, +or a **terms** search query like `"(Old AND Man) AND Sea"`. For more details on the terms +query syntax, see Tantivy's [query parser rules](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html). -For example: +!!! tip "Note" + The query parser will raise an exception on queries that are ambiguous. For example, in the query `they could have been dogs OR cats`, `OR` is capitalized so it's considered a keyword query operator. But it's ambiguous how the left part should be treated. So if you submit this search query as is, you'll get `Syntax Error: they could have been dogs OR cats`. -If you intended "they could have been dogs OR cats" as a phrase query, this actually -raises a syntax error since `OR` is a recognized operator. If you make `or` lower case, -this avoids the syntax error. However, it is cumbersome to have to remember what will -conflict with the query syntax. Instead, if you search using -`table.search('"they could have been dogs OR cats"')`, then the syntax checker avoids -checking inside the quotes. + ```py + # This raises a syntax error + table.search("they could have been dogs OR cats") + ``` + On the other hand, lowercasing `OR` to `or` will work, because there are no capitalized logical operators and + the query is treated as a phrase query. + + ```py + # This works! 
+ table.search("they could have been dogs or cats") + ``` + +It can be cumbersome to have to remember what will cause a syntax error depending on the type of +query you want to perform. To make this simpler, when you want to perform a phrase query, you can +enforce it in one of two ways: + +1. Place the double-quoted query inside single quotes. For example, `table.search('"they could have been dogs OR cats"')` is treated as +a phrase query. +2. Explicitly call the `phrase_query()` method. This is useful when you have a phrase query that +itself contains double quotes. For example, `table.search('the cats OR dogs were not really "pets" at all').phrase_query()` +is treated as a phrase query. + +In general, a query that's declared as a phrase query will be wrapped in double quotes during parsing, with nested +double quotes replaced by single quotes. ## Configurations diff --git a/docs/src/python/python.md b/docs/src/python/python.md index 6438c8bd..bb47fc7a 100644 --- a/docs/src/python/python.md +++ b/docs/src/python/python.md @@ -24,6 +24,12 @@ pip install lancedb ::: lancedb.query.LanceQueryBuilder +::: lancedb.query.LanceVectorQueryBuilder + +::: lancedb.query.LanceFtsQueryBuilder + +::: lancedb.query.LanceHybridQueryBuilder + ## Embeddings ::: lancedb.embeddings.registry.EmbeddingFunctionRegistry @@ -62,10 +68,22 @@ pip install lancedb ## Integrations -### Pydantic +## Pydantic ::: lancedb.pydantic.pydantic_to_schema ::: lancedb.pydantic.vector ::: lancedb.pydantic.LanceModel + +## Reranking + +::: lancedb.rerankers.linear_combination.LinearCombinationReranker + +::: lancedb.rerankers.cohere.CohereReranker + +::: lancedb.rerankers.colbert.ColbertReranker + +::: lancedb.rerankers.cross_encoder.CrossEncoderReranker + +::: lancedb.rerankers.openai.OpenaiReranker \ No newline at end of file diff --git a/python/python/lancedb/query.py b/python/python/lancedb/query.py index 4816acf0..2c2ef71b 100644 --- a/python/python/lancedb/query.py +++ 
b/python/python/lancedb/query.py @@ -106,8 +106,8 @@ class Query(pydantic.BaseModel): class LanceQueryBuilder(ABC): - """Build LanceDB query based on specific query type: - vector or full text search. + """An abstract query builder. Subclasses are defined for vector search, + full text search, hybrid, and plain SQL filtering. """ @classmethod @@ -118,6 +118,22 @@ class LanceQueryBuilder(ABC): query_type: str, vector_column_name: str, ) -> LanceQueryBuilder: + """ + Create a query builder based on the given query and query type. + + Parameters + ---------- + table: Table + The table to query. + query: Optional[Union[np.ndarray, str, "PIL.Image.Image", Tuple]] + The query to use. If None, an empty query builder is returned + which performs simple SQL filtering. + query_type: str + The type of query to perform. One of "vector", "fts", "hybrid", or "auto". + If "auto", the query type is inferred based on the query. + vector_column_name: str + The name of the vector column to use for vector search. + """ if query is None: return LanceEmptyQueryBuilder(table) @@ -636,6 +652,16 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder): class LanceHybridQueryBuilder(LanceQueryBuilder): + """ + A query builder that performs hybrid vector and full text search. + Results are combined and reranked based on the specified reranker. + By default, the results are reranked using the LinearCombinationReranker. + + To make the vector and fts results comparable, the scores are normalized. + Instead of normalizing scores, the `normalize` parameter can be set to "rank" + in the `rerank` method to convert the scores to ranks and then normalize them. 
+ """ + def __init__(self, table: "Table", query: str, vector_column: str): super().__init__(table) self._validate_fts_index() diff --git a/python/python/tests/test_fts.py b/python/python/tests/test_fts.py index aa6bfa61..fbf74662 100644 --- a/python/python/tests/test_fts.py +++ b/python/python/tests/test_fts.py @@ -177,10 +177,18 @@ def test_syntax(table): table.create_fts_index("text") with pytest.raises(ValueError, match="Syntax Error"): table.search("they could have been dogs OR cats").limit(10).to_list() + + # these should work + + # terms queries + table.search('"they could have been dogs" OR cats').limit(10).to_list() + table.search("(they AND could) OR (have AND been AND dogs) OR cats").limit( + 10 + ).to_list() + + # phrase queries table.search("they could have been dogs OR cats").phrase_query().limit(10).to_list() - # this should work table.search('"they could have been dogs OR cats"').limit(10).to_list() - # this should work too table.search('''"the cats OR dogs were not really 'pets' at all"''').limit( 10 ).to_list()