feat(napi): Issue queries as node SDK (#868)

* Query as a fluent API and `AsyncIterator<RecordBatch>`
* Much more docs
* Add tests for auto infer vector search columns with different
dimensions.
This commit is contained in:
Lei Xu
2024-01-25 22:14:14 -08:00
committed by GitHub
parent 9a07c9aad8
commit a6cf24b359
12 changed files with 370 additions and 56 deletions

View File

@@ -16,10 +16,10 @@
use std::io::Cursor;
use arrow_array::RecordBatchReader;
use arrow_ipc::reader::StreamReader;
use arrow_array::{RecordBatch, RecordBatchReader};
use arrow_ipc::{reader::StreamReader, writer::FileWriter};
use crate::Result;
use crate::{Error, Result};
/// Convert a Arrow IPC file to a batch reader
pub fn ipc_file_to_batches(buf: Vec<u8>) -> Result<impl RecordBatchReader> {
@@ -28,6 +28,22 @@ pub fn ipc_file_to_batches(buf: Vec<u8>) -> Result<impl RecordBatchReader> {
Ok(reader)
}
/// Convert record batches to Arrow IPC file
pub fn batches_to_ipc_file(batches: &[RecordBatch]) -> Result<Vec<u8>> {
if batches.is_empty() {
return Err(Error::Store {
message: "No batches to write".to_string(),
});
}
let schema = batches[0].schema();
let mut writer = FileWriter::try_new(vec![], &schema)?;
for batch in batches {
writer.write(batch)?;
}
writer.finish()?;
Ok(writer.into_inner()?)
}
#[cfg(test)]
mod tests {

View File

@@ -22,6 +22,7 @@ use lance_linalg::distance::MetricType;
use crate::error::Result;
use crate::utils::default_vector_column;
use crate::Error;
const DEFAULT_TOP_K: usize = 10;
@@ -93,6 +94,19 @@ impl Query {
let arrow_schema = Schema::from(self.dataset.schema());
default_vector_column(&arrow_schema, Some(query.len() as i32))?
};
let field = self.dataset.schema().field(&column).ok_or(Error::Store {
message: format!("Column {} not found in dataset schema", column),
})?;
if !matches!(field.data_type(), arrow_schema::DataType::FixedSizeList(f, dim) if f.data_type().is_floating() && dim == query.len() as i32)
{
return Err(Error::Store {
message: format!(
"Vector column '{}' does not match the dimension of the query vector: dim={}",
column,
query.len(),
),
});
}
scanner.nearest(&column, query, self.limit.unwrap_or(DEFAULT_TOP_K))?;
} else {
// If there is no vector query, it's ok to not have a limit