From 121687231c2979844d3b3bc4662c848007cd1e02 Mon Sep 17 00:00:00 2001 From: Chang She <759245+changhiskhan@users.noreply.github.com> Date: Mon, 8 Jan 2024 21:49:31 -0800 Subject: [PATCH] chore(python): document phrase queries in fts (#788) closes #769 Add unit test and documentation on using quotes to perform a phrase query --- docs/src/fts.md | 16 ++++++++++++++++ python/tests/test_fts.py | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/docs/src/fts.md b/docs/src/fts.md index 721a5cf1..183371fa 100644 --- a/docs/src/fts.md +++ b/docs/src/fts.md @@ -75,6 +75,22 @@ applied on top of the full text search results. This can be invoked via the fami table.search("puppy").limit(10).where("meta='foo'").to_list() ``` +## Syntax + +For full-text search you can perform either a phrase query like "the old man and the sea", +or a structured search query like "(Old AND Man) AND Sea". +Wrap the query in double quotes to have it treated as a phrase query. + +For example: + +If you intended "they could have been dogs OR cats" as a phrase query, the search actually +raises a syntax error since `OR` is a recognized operator. Writing `or` in lowercase +avoids the syntax error. However, it is cumbersome to have to remember what will +conflict with the query syntax. Instead, if you search using +`table.search('"they could have been dogs OR cats"')`, then operators inside the +double quotes are not interpreted as query syntax. + + ## Configurations By default, LanceDB configures a 1GB heap size limit for creating the index. 
You can diff --git a/python/tests/test_fts.py b/python/tests/test_fts.py index baa07096..b7c81f61 100644 --- a/python/tests/test_fts.py +++ b/python/tests/test_fts.py @@ -162,3 +162,20 @@ def test_null_input(table): ] ) table.create_fts_index("text") + + +def test_syntax(table): + # https://github.com/lancedb/lancedb/issues/769 + table.create_fts_index("text") + with pytest.raises(ValueError, match="Syntax Error"): + table.search("they could have been dogs OR cats").limit(10).to_list() + # same text wrapped in double quotes is a phrase query: no syntax error expected + table.search('"they could have been dogs OR cats"').limit(10).to_list() + # single quotes nested inside the double-quoted phrase are expected to be accepted too + table.search('''"the cats OR dogs were not really 'pets' at all"''').limit( + 10 + ).to_list() + with pytest.raises(ValueError, match="Syntax Error"): + table.search('''"the cats OR dogs were not really "pets" at all"''').limit( + 10 + ).to_list()