Order by field support FTS (#1132)

This PR adds support for passing through a set of ordering fields at
index time (unsigned ints that tantivy can use as fast_fields) that you
can sort your results on at query time. This is useful for cases where
you want to get related hits, e.g. by keyword, but order those hits by
some other score, such as popularity.

For example, search for song descriptions that match on "sad AND jazz AND
1920" and then order those by the number of times played. Example usage
can be seen in the FTS tests.

---------

Co-authored-by: Nat Roth <natroth@Nats-MacBook-Pro.local>
Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com>
This commit is contained in:
natcharacter
2024-03-20 04:27:37 -04:00
committed by Weston Pace
parent 4466cfa958
commit f6e9f8e3f4
5 changed files with 125 additions and 10 deletions

View File

@@ -43,6 +43,7 @@ def table(tmp_path) -> ldb.table.LanceTable:
)
for _ in range(100)
]
count = [random.randint(1, 10000) for _ in range(100)]
table = db.create_table(
"test",
data=pd.DataFrame(
@@ -52,6 +53,7 @@ def table(tmp_path) -> ldb.table.LanceTable:
"text": text,
"text2": text,
"nested": [{"text": t} for t in text],
"count": count,
}
),
)
@@ -79,6 +81,39 @@ def test_search_index(tmp_path, table):
assert len(results[1]) == 10 # _distance
def test_search_ordering_field_index_table(tmp_path, table):
    """Search through the table API with results ordered by ``count``.

    Creates an FTS index on ``text`` that registers ``count`` as an
    ordering field, searches for "puppy" ordered by that field, and checks
    that every hit contains the keyword and that the hits come back in
    descending ``count`` order.
    """
    table.create_fts_index("text", ordering_field_names=["count"])

    query = table.search("puppy", ordering_field_name="count")
    hits = query.limit(20).select(["text", "count"]).to_list()

    # Every returned row must actually match the keyword.
    for hit in hits:
        assert "puppy" in hit["text"]

    # The result order must already be the descending-by-count order.
    by_count_desc = sorted(hits, key=lambda hit: hit["count"], reverse=True)
    assert by_count_desc == hits
def test_search_ordering_field_index(tmp_path, table):
    """Search through the low-level ``ldb.fts`` API ordered by ``count``.

    Creates an index under ``tmp_path`` with ``count`` as an ordering
    field, populates it from ``table``, searches for "puppy" ordered by
    that field, and checks the shape of the result, that every fetched row
    contains the keyword, and that rows are in descending ``count`` order.
    """
    index_dir = str(tmp_path / "index")
    index = ldb.fts.create_index(index_dir, ["text"], ordering_fields=["count"])
    ldb.fts.populate_index(index, table, ["text"], ordering_fields=["count"])
    index.reload()

    results = ldb.fts.search_index(
        index,
        query="puppy",
        limit=10,
        ordering_field="count",
    )
    assert len(results) == 2
    row_ids = results[0]
    distances = results[1]
    assert len(row_ids) == 10  # row_ids
    assert len(distances) == 10  # _distance

    # Fetch the matching rows from the underlying dataset by row id.
    fetched = table.to_lance().take(row_ids).to_pylist()
    for row in fetched:
        assert "puppy" in row["text"]

    # The fetched order must already be the descending-by-count order.
    by_count_desc = sorted(fetched, key=lambda row: row["count"], reverse=True)
    assert by_count_desc == fetched
def test_create_index_from_table(tmp_path, table):
table.create_fts_index("text")
df = table.search("puppy").limit(10).select(["text"]).to_pandas()
@@ -94,6 +129,7 @@ def test_create_index_from_table(tmp_path, table):
"text": "gorilla",
"text2": "gorilla",
"nested": {"text": "gorilla"},
"count": 10,
}
]
)
@@ -166,6 +202,7 @@ def test_null_input(table):
"text": None,
"text2": None,
"nested": {"text": None},
"count": 7,
}
]
)