mirror of
https://github.com/lancedb/lancedb.git
synced 2026-05-18 12:30:41 +00:00
feat(nodejs): add order_by method to Query (#3123)
This commit is contained in:
@@ -255,6 +255,11 @@ class RecordBatchStream:
|
||||
def __aiter__(self) -> "RecordBatchStream": ...
|
||||
async def __anext__(self) -> pa.RecordBatch: ...
|
||||
|
||||
class ColumnOrdering(TypedDict):
    """Sort specification for a single column, expressed as a plain dict."""

    # Name of the column whose values determine the order.
    column_name: str
    # True sorts smallest-first; False sorts largest-first.
    ascending: bool
    # True places null values ahead of non-null values.
    nulls_first: bool
|
||||
|
||||
class Query:
|
||||
def where(self, filter: str): ...
|
||||
def where_expr(self, expr: PyExpr): ...
|
||||
@@ -268,6 +273,7 @@ class Query:
|
||||
def postfilter(self): ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> VectorQuery: ...
|
||||
def nearest_to_text(self, query: dict) -> FTSQuery: ...
|
||||
# Sets the result ordering (None is forwarded unchanged); the native method returns nothing.
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> None: ...
|
||||
async def output_schema(self) -> pa.Schema: ...
|
||||
async def execute(
|
||||
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
|
||||
@@ -296,6 +302,7 @@ class FTSQuery:
|
||||
def get_query(self) -> str: ...
|
||||
def add_query_vector(self, query_vec: pa.Array) -> None: ...
|
||||
def nearest_to(self, query_vec: pa.Array) -> HybridQuery: ...
|
||||
# Sets the result ordering (None is forwarded unchanged); the native method returns nothing.
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> None: ...
|
||||
async def output_schema(self) -> pa.Schema: ...
|
||||
async def execute(
|
||||
self, max_batch_length: Optional[int], timeout: Optional[timedelta]
|
||||
@@ -321,6 +328,7 @@ class VectorQuery:
|
||||
def maximum_nprobes(self, maximum_nprobes: int): ...
|
||||
def bypass_vector_index(self): ...
|
||||
def nearest_to_text(self, query: dict) -> HybridQuery: ...
|
||||
# Sets the result ordering (None is forwarded unchanged); the native method returns nothing.
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> None: ...
|
||||
def to_query_request(self) -> PyQueryRequest: ...
|
||||
|
||||
class HybridQuery:
|
||||
@@ -339,6 +347,7 @@ class HybridQuery:
|
||||
def minimum_nprobes(self, minimum_nprobes: int): ...
|
||||
def maximum_nprobes(self, maximum_nprobes: int): ...
|
||||
def bypass_vector_index(self): ...
|
||||
# Applies the same ordering to both the vector and the FTS sub-query; returns nothing.
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> None: ...
|
||||
def to_vector_query(self) -> VectorQuery: ...
|
||||
def to_fts_query(self) -> FTSQuery: ...
|
||||
def get_limit(self) -> int: ...
|
||||
@@ -368,6 +377,7 @@ class PyQueryRequest:
|
||||
bypass_vector_index: Optional[bool]
|
||||
postfilter: Optional[bool]
|
||||
norm: Optional[str]
|
||||
order_by: Optional[List[ColumnOrdering]]
|
||||
|
||||
class CompactionStats:
|
||||
fragments_removed: int
|
||||
|
||||
@@ -92,6 +92,12 @@ def ensure_vector_query(
|
||||
return val
|
||||
|
||||
|
||||
class ColumnOrdering(pydantic.BaseModel):
    """Describes how query results are sorted on one column.

    Attributes
    ----------
    column_name: str
        The column to sort by.
    ascending: bool, default True
        Sort smallest-first when True, largest-first when False.
    nulls_first: bool, default False
        Place null values before non-null values when True.
    """

    column_name: str
    ascending: bool = True
    nulls_first: bool = False
|
||||
|
||||
|
||||
class FullTextQueryType(str, Enum):
|
||||
MATCH = "match"
|
||||
MATCH_PHRASE = "match_phrase"
|
||||
@@ -504,6 +510,8 @@ class Query(pydantic.BaseModel):
|
||||
# Bypass the vector index and use a brute force search
|
||||
bypass_vector_index: Optional[bool] = None
|
||||
|
||||
order_by: Optional[List[ColumnOrdering]] = None
|
||||
|
||||
@classmethod
|
||||
def from_inner(cls, req: PyQueryRequest) -> Self:
|
||||
query = cls()
|
||||
@@ -524,6 +532,8 @@ class Query(pydantic.BaseModel):
|
||||
query.refine_factor = req.refine_factor
|
||||
query.bypass_vector_index = req.bypass_vector_index
|
||||
query.postfilter = req.postfilter
|
||||
if req.order_by is not None:
|
||||
query.order_by = [ColumnOrdering(**o) for o in req.order_by]
|
||||
if req.full_text_search is not None:
|
||||
query.full_text_query = FullTextSearchQuery(
|
||||
columns=None,
|
||||
@@ -572,9 +582,22 @@ class LanceQueryBuilder(ABC):
|
||||
If "auto", the query type is inferred based on the query.
|
||||
vector_column_name: str
|
||||
The name of the vector column to use for vector search.
|
||||
ordering_field_name: Optional[str]
|
||||
.. deprecated:: 0.27.0
|
||||
Use ``order_by()`` method instead.
|
||||
fts_columns: Optional[Union[str, List[str]]]
|
||||
The columns to search in for full text search.
|
||||
fast_search: bool
|
||||
Skip flat search of unindexed data.
|
||||
"""
|
||||
if ordering_field_name is not None:
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"ordering_field_name is deprecated, use .order_by() method instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
# Check hybrid search first as it supports empty query pattern
|
||||
if query_type == "hybrid":
|
||||
# hybrid fts and vector query
|
||||
@@ -671,6 +694,7 @@ class LanceQueryBuilder(ABC):
|
||||
self._text = None
|
||||
self._ef = None
|
||||
self._bypass_vector_index = None
|
||||
self._order_by = None
|
||||
|
||||
@deprecation.deprecated(
|
||||
deprecated_in="0.3.1",
|
||||
@@ -947,6 +971,24 @@ class LanceQueryBuilder(ABC):
|
||||
""" # noqa: E501
|
||||
return self._table._explain_plan(self.to_query_object(), verbose=verbose)
|
||||
|
||||
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> Self:
    """
    Set the ordering for the results.

    Parameters
    ----------
    ordering: Optional[List[ColumnOrdering]]
        The ordering to use for the results, applied in list order. If None,
        then the default ordering will be used.

    Returns
    -------
    LanceQueryBuilder
        The LanceQueryBuilder object.
    """
    # Snapshot the caller's sequence so that mutating their list afterwards
    # cannot silently change an already-configured query.
    self._order_by = None if ordering is None else list(ordering)
    return self
|
||||
|
||||
def analyze_plan(self) -> str:
|
||||
"""
|
||||
Run the query and return its execution plan with runtime metrics.
|
||||
@@ -1314,6 +1356,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder):
|
||||
fast_search=self._fast_search,
|
||||
ef=self._ef,
|
||||
bypass_vector_index=self._bypass_vector_index,
|
||||
order_by=self._order_by,
|
||||
)
|
||||
|
||||
def to_batches(
|
||||
@@ -1465,7 +1508,9 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
super().__init__(table)
|
||||
self._query = query
|
||||
self._phrase_query = False
|
||||
self.ordering_field_name = ordering_field_name
|
||||
# Deprecated compatibility parameter. Native FTS ordering is now
|
||||
# configured through order_by(); LanceQueryBuilder.create emits the warning.
|
||||
_ = ordering_field_name
|
||||
self._reranker = None
|
||||
self._fast_search = fast_search
|
||||
if isinstance(fts_columns, str):
|
||||
@@ -1514,6 +1559,7 @@ class LanceFtsQueryBuilder(LanceQueryBuilder):
|
||||
),
|
||||
offset=self._offset,
|
||||
fast_search=self._fast_search,
|
||||
order_by=self._order_by,
|
||||
)
|
||||
|
||||
def output_schema(self) -> pa.Schema:
|
||||
@@ -1579,6 +1625,7 @@ class LanceEmptyQueryBuilder(LanceQueryBuilder):
|
||||
limit=self._limit,
|
||||
with_row_id=self._with_row_id,
|
||||
offset=self._offset,
|
||||
order_by=self._order_by,
|
||||
)
|
||||
|
||||
def output_schema(self) -> pa.Schema:
|
||||
@@ -2502,6 +2549,27 @@ class AsyncStandardQuery(AsyncQueryBase):
|
||||
self._inner.offset(offset)
|
||||
return self
|
||||
|
||||
def order_by(self, ordering: Optional[List[ColumnOrdering]]) -> Self:
    """
    Set the ordering for the results.

    Parameters
    ----------
    ordering: Optional[List[ColumnOrdering]]
        The ordering to use for the results. Each entry may be a
        ``ColumnOrdering`` model or an equivalent plain dict with the keys
        ``column_name``, ``ascending`` and ``nulls_first``. If None, then
        the default ordering will be used.

    Returns
    -------
    Self
        This query, to allow call chaining.
    """
    if ordering is None:
        self._inner.order_by(None)
        return self

    def _as_dict(item):
        # Plain dicts are already in the shape handed to the inner query;
        # copy them so the caller's object is never shared.
        if isinstance(item, dict):
            return dict(item)
        # pydantic v2 exposes model_dump(); pydantic v1 only offers dict().
        if hasattr(item, "model_dump"):
            return item.model_dump()
        return item.dict()

    self._inner.order_by([_as_dict(item) for item in ordering])
    return self
|
||||
|
||||
def fast_search(self) -> Self:
|
||||
"""
|
||||
Skip searching un-indexed data.
|
||||
|
||||
@@ -4512,6 +4512,8 @@ class AsyncTable:
|
||||
async_query = async_query.fast_search()
|
||||
if query.with_row_id:
|
||||
async_query = async_query.with_row_id()
|
||||
if query.order_by:
|
||||
async_query = async_query.order_by(query.order_by)
|
||||
|
||||
if query.vector:
|
||||
async_query = async_query.nearest_to(query.vector).distance_range(
|
||||
|
||||
@@ -29,6 +29,7 @@ from lancedb.query import (
|
||||
MultiMatchQuery,
|
||||
PhraseQuery,
|
||||
BooleanQuery,
|
||||
ColumnOrdering,
|
||||
Occur,
|
||||
LanceFtsQueryBuilder,
|
||||
)
|
||||
@@ -499,6 +500,36 @@ async def test_search_fts_specify_column_async(async_table):
|
||||
pass
|
||||
|
||||
|
||||
def test_search_order_by_descending(table):
    """FTS results ordered by ``count`` descending are non-increasing."""
    table.create_fts_index("text")
    ordering = [ColumnOrdering(column_name="count", ascending=False)]
    rows = (
        table.search("puppy")
        .order_by(ordering)
        .limit(20)
        .select(["text", "count"])
        .to_list()
    )

    # Guard against a vacuous pass: with no rows, every check below would
    # succeed trivially. Assumes the fixture contains "puppy" rows -- confirm.
    assert rows, "expected at least one match for 'puppy'"
    for r in rows:
        assert "puppy" in r["text"]
    # sorted() is stable, so this holds exactly when counts never increase.
    assert sorted(rows, key=lambda x: x["count"], reverse=True) == rows
|
||||
|
||||
|
||||
def test_search_order_by_ascending(table):
    """FTS results ordered by ``count`` ascending are non-decreasing."""
    table.create_fts_index("text")
    ordering = [ColumnOrdering(column_name="count", ascending=True)]
    rows = (
        table.search("puppy")
        .order_by(ordering)
        .limit(20)
        .select(["text", "count"])
        .to_list()
    )

    # Guard against a vacuous pass: with no rows, every check below would
    # succeed trivially. Assumes the fixture contains "puppy" rows -- confirm.
    assert rows, "expected at least one match for 'puppy'"
    for r in rows:
        assert "puppy" in r["text"]
    # sorted() is stable, so this holds exactly when counts never decrease.
    assert sorted(rows, key=lambda x: x["count"]) == rows
|
||||
|
||||
|
||||
def test_create_index_from_table(tmp_path, table):
|
||||
table.create_fts_index("text")
|
||||
df = table.search("puppy").limit(5).select(["text"]).to_pandas()
|
||||
|
||||
@@ -25,6 +25,7 @@ from lancedb.query import (
|
||||
AsyncHybridQuery,
|
||||
AsyncQueryBase,
|
||||
AsyncVectorQuery,
|
||||
ColumnOrdering,
|
||||
LanceVectorQueryBuilder,
|
||||
MatchQuery,
|
||||
PhraseQuery,
|
||||
@@ -164,6 +165,71 @@ def test_offset(table):
|
||||
assert len(results_with_offset.to_pandas()) == 1
|
||||
|
||||
|
||||
def test_order_by_plain_query(mem_db):
    """A plain (no vector, no FTS) sync query honours a multi-column ordering."""
    data = pa.table(
        {
            "group": [1, 1, 1, 2],
            "score": [None, 1.0, 1.0, 0.5],
            "name": ["z", "b", "a", "c"],
        }
    )
    table = mem_db.create_table("test_order_by", data)

    # Sort by group, then score with nulls leading, then name as tie-breaker.
    ordering = [
        ColumnOrdering(column_name="group", ascending=True, nulls_first=False),
        ColumnOrdering(column_name="score", ascending=True, nulls_first=True),
        ColumnOrdering(column_name="name", ascending=True, nulls_first=False),
    ]
    res = table.search().order_by(ordering).to_arrow()

    expected = [
        {"group": 1, "score": None, "name": "z"},
        {"group": 1, "score": 1.0, "name": "a"},
        {"group": 1, "score": 1.0, "name": "b"},
        {"group": 2, "score": 0.5, "name": "c"},
    ]
    assert res.select(["group", "score", "name"]).to_pylist() == expected
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_order_by_async_query(mem_db_async: AsyncConnection):
    """A plain async query honours a multi-column ordering."""
    data = pa.table(
        {
            "group": [1, 1, 1, 2],
            "score": [None, 1.0, 1.0, 0.5],
            "name": ["z", "b", "a", "c"],
        }
    )
    table = await mem_db_async.create_table("test_order_by_async", data)

    # Sort by group, then score with nulls leading, then name as tie-breaker.
    ordering = [
        ColumnOrdering(column_name="group", ascending=True, nulls_first=False),
        ColumnOrdering(column_name="score", ascending=True, nulls_first=True),
        ColumnOrdering(column_name="name", ascending=True, nulls_first=False),
    ]
    res = await table.query().order_by(ordering).to_arrow()

    expected = [
        {"group": 1, "score": None, "name": "z"},
        {"group": 1, "score": 1.0, "name": "a"},
        {"group": 1, "score": 1.0, "name": "b"},
        {"group": 2, "score": 0.5, "name": "c"},
    ]
    assert res.select(["group", "score", "name"]).to_pylist() == expected
|
||||
|
||||
|
||||
def test_query_builder(table):
|
||||
rs = (
|
||||
LanceVectorQueryBuilder(table, [0, 0], "vector")
|
||||
|
||||
@@ -16,6 +16,7 @@ from packaging.version import Version
|
||||
|
||||
import lancedb
|
||||
from lancedb.conftest import MockTextEmbeddingFunction
|
||||
from lancedb.query import ColumnOrdering
|
||||
from lancedb.remote import ClientConfig
|
||||
from lancedb.remote.errors import HttpError, RetryError
|
||||
import pytest
|
||||
@@ -660,6 +661,18 @@ def test_query_sync_maximal():
|
||||
"ef": None,
|
||||
"filter": "id > 0",
|
||||
"columns": ["id", "name"],
|
||||
"order_by": [
|
||||
{
|
||||
"column_name": "score",
|
||||
"ascending": False,
|
||||
"nulls_first": True,
|
||||
},
|
||||
{
|
||||
"column_name": "id",
|
||||
"ascending": True,
|
||||
"nulls_first": False,
|
||||
},
|
||||
],
|
||||
"vector_column": "vector2",
|
||||
"fast_search": True,
|
||||
"with_row_id": True,
|
||||
@@ -677,6 +690,14 @@ def test_query_sync_maximal():
|
||||
.refine_factor(10)
|
||||
.nprobes(5)
|
||||
.where("id > 0", prefilter=True)
|
||||
.order_by(
|
||||
[
|
||||
ColumnOrdering(
|
||||
column_name="score", ascending=False, nulls_first=True
|
||||
),
|
||||
ColumnOrdering(column_name="id", ascending=True, nulls_first=False),
|
||||
]
|
||||
)
|
||||
.with_row_id(True)
|
||||
.select(["id", "name"])
|
||||
.to_list()
|
||||
|
||||
@@ -23,7 +23,7 @@ use lancedb::query::QueryBase;
|
||||
use lancedb::query::QueryExecutionOptions;
|
||||
use lancedb::query::QueryFilter;
|
||||
use lancedb::query::{
|
||||
ExecutableQuery, Query as LanceDbQuery, Select, TakeQuery as LanceDbTakeQuery,
|
||||
ColumnOrdering, ExecutableQuery, Query as LanceDbQuery, Select, TakeQuery as LanceDbTakeQuery,
|
||||
VectorQuery as LanceDbVectorQuery,
|
||||
};
|
||||
use lancedb::table::AnyQuery;
|
||||
@@ -207,6 +207,48 @@ impl<'py> IntoPyObject<'py> for PyLanceDB<FtsQuery> {
|
||||
#[derive(Clone)]
|
||||
pub struct PyQueryVectors(Vec<Arc<dyn Array>>);
|
||||
|
||||
#[derive(Clone, FromPyObject)]
|
||||
#[pyo3(from_item_all)]
|
||||
pub struct PyColumnOrdering {
|
||||
pub column_name: String,
|
||||
pub ascending: bool,
|
||||
pub nulls_first: bool,
|
||||
}
|
||||
|
||||
impl From<ColumnOrdering> for PyColumnOrdering {
|
||||
fn from(ordering: ColumnOrdering) -> Self {
|
||||
Self {
|
||||
column_name: ordering.column_name,
|
||||
ascending: ordering.ascending,
|
||||
nulls_first: ordering.nulls_first,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<PyColumnOrdering> for ColumnOrdering {
|
||||
fn from(ordering: PyColumnOrdering) -> Self {
|
||||
Self {
|
||||
column_name: ordering.column_name,
|
||||
ascending: ordering.ascending,
|
||||
nulls_first: ordering.nulls_first,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'py> IntoPyObject<'py> for PyColumnOrdering {
|
||||
type Target = PyDict;
|
||||
type Output = Bound<'py, Self::Target>;
|
||||
type Error = PyErr;
|
||||
|
||||
fn into_pyobject(self, py: pyo3::Python<'py>) -> PyResult<Self::Output> {
|
||||
let dict = PyDict::new(py);
|
||||
dict.set_item("column_name", self.column_name)?;
|
||||
dict.set_item("ascending", self.ascending)?;
|
||||
dict.set_item("nulls_first", self.nulls_first)?;
|
||||
Ok(dict)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'py> IntoPyObject<'py> for PyQueryVectors {
|
||||
type Target = PyList;
|
||||
type Output = Bound<'py, Self::Target>;
|
||||
@@ -246,6 +288,7 @@ pub struct PyQueryRequest {
|
||||
pub bypass_vector_index: Option<bool>,
|
||||
pub postfilter: Option<bool>,
|
||||
pub norm: Option<String>,
|
||||
pub order_by: Option<Vec<PyColumnOrdering>>,
|
||||
}
|
||||
|
||||
impl From<AnyQuery> for PyQueryRequest {
|
||||
@@ -273,6 +316,9 @@ impl From<AnyQuery> for PyQueryRequest {
|
||||
bypass_vector_index: None,
|
||||
postfilter: None,
|
||||
norm: None,
|
||||
order_by: query_request
|
||||
.order_by
|
||||
.map(|order_by| order_by.into_iter().map(PyColumnOrdering::from).collect()),
|
||||
},
|
||||
AnyQuery::VectorQuery(vector_query) => Self {
|
||||
limit: vector_query.base.limit,
|
||||
@@ -297,6 +343,10 @@ impl From<AnyQuery> for PyQueryRequest {
|
||||
bypass_vector_index: Some(!vector_query.use_index),
|
||||
postfilter: Some(!vector_query.base.prefilter),
|
||||
norm: vector_query.base.norm.map(|n| n.to_string()),
|
||||
order_by: vector_query
|
||||
.base
|
||||
.order_by
|
||||
.map(|order_by| order_by.into_iter().map(PyColumnOrdering::from).collect()),
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -475,6 +525,13 @@ impl Query {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn order_by(&mut self, ordering: Option<Vec<PyColumnOrdering>>) -> PyResult<()> {
|
||||
let ordering =
|
||||
ordering.map(|ordering| ordering.into_iter().map(ColumnOrdering::from).collect());
|
||||
self.inner = self.inner.clone().order_by(ordering);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[pyo3(signature = ())]
|
||||
pub fn output_schema(self_: PyRef<'_, Self>) -> PyResult<Bound<'_, PyAny>> {
|
||||
let inner = self_.inner.clone();
|
||||
@@ -647,6 +704,13 @@ impl FTSQuery {
|
||||
self.inner = self.inner.clone().offset(offset as usize);
|
||||
}
|
||||
|
||||
pub fn order_by(&mut self, ordering: Option<Vec<PyColumnOrdering>>) -> PyResult<()> {
|
||||
let ordering =
|
||||
ordering.map(|ordering| ordering.into_iter().map(ColumnOrdering::from).collect());
|
||||
self.inner = self.inner.clone().order_by(ordering);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn fast_search(&mut self) {
|
||||
self.inner = self.inner.clone().fast_search();
|
||||
}
|
||||
@@ -782,6 +846,13 @@ impl VectorQuery {
|
||||
self.inner = self.inner.clone().offset(offset as usize);
|
||||
}
|
||||
|
||||
pub fn order_by(&mut self, ordering: Option<Vec<PyColumnOrdering>>) -> PyResult<()> {
|
||||
let ordering =
|
||||
ordering.map(|ordering| ordering.into_iter().map(ColumnOrdering::from).collect());
|
||||
self.inner = self.inner.clone().order_by(ordering);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn fast_search(&mut self) {
|
||||
self.inner = self.inner.clone().fast_search();
|
||||
}
|
||||
@@ -954,6 +1025,12 @@ impl HybridQuery {
|
||||
self.inner_fts.offset(offset);
|
||||
}
|
||||
|
||||
/// Applies the same ordering to both halves of the hybrid query.
///
/// The vector sub-query receives a copy and the full-text sub-query takes
/// ownership of the original; the first error, if any, is returned.
pub fn order_by(&mut self, ordering: Option<Vec<PyColumnOrdering>>) -> PyResult<()> {
    let for_vector = ordering.clone();
    self.inner_vec.order_by(for_vector)?;
    self.inner_fts.order_by(ordering)
}
|
||||
|
||||
pub fn fast_search(&mut self) {
|
||||
self.inner_vec.fast_search();
|
||||
self.inner_fts.fast_search();
|
||||
|
||||
Reference in New Issue
Block a user