chore(python): handle None input in fts ingestion (#763)

If the input text is None, Tantivy raises an error
complaining that it cannot add a NoneType. We now handle
this upstream so that None values are never added to the
document. If all of the indexed fields are None, the
document is skipped entirely and not written to the index.
This commit is contained in:
Chang She
2024-01-04 11:45:12 -08:00
committed by GitHub
parent 7d55a94efd
commit 60b22d84bf
2 changed files with 21 additions and 3 deletions

View File

@@ -103,10 +103,13 @@ def populate_index(index: tantivy.Index, table: LanceTable, fields: List[str]) -
b = b.flatten()
for i in range(b.num_rows):
doc = tantivy.Document()
doc.add_integer("doc_id", row_id)
for name in fields:
doc.add_text(name, b[name][i].as_py())
writer.add_document(doc)
value = b[name][i].as_py()
if value is not None:
doc.add_text(name, value)
if not doc.is_empty:
doc.add_integer("doc_id", row_id)
writer.add_document(doc)
row_id += 1
# commit changes
writer.commit()

View File

@@ -147,3 +147,18 @@ def test_search_index_with_filter(table):
assert r["id"] == 1
assert rs == rs2
def test_null_input(table):
table.add(
[
{
"vector": np.random.randn(128),
"id": 101,
"text": None,
"text2": None,
"nested": {"text": None},
}
]
)
table.create_fts_index("text")