diff --git a/docs/src/python/python.md b/docs/src/python/python.md index 5531870f..4cc3b5ce 100644 --- a/docs/src/python/python.md +++ b/docs/src/python/python.md @@ -22,8 +22,6 @@ pip install lancedb ::: lancedb.query.LanceQueryBuilder -::: lancedb.query.LanceFtsQueryBuilder - ## Embeddings ::: lancedb.embeddings.registry.EmbeddingFunctionRegistry @@ -56,7 +54,7 @@ pip install lancedb ## Utilities -::: lancedb.vector +::: lancedb.schema.vector ## Integrations diff --git a/python/lancedb/context.py b/python/lancedb/context.py index 73800d02..02051614 100644 --- a/python/lancedb/context.py +++ b/python/lancedb/context.py @@ -84,7 +84,9 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer: context windows that don't cross document boundaries. In this case, we can pass ``document_id`` as the group by. - >>> contextualize(data).window(4).stride(2).text_col('token').groupby('document_id').to_pandas() + >>> (contextualize(data) + ... .window(4).stride(2).text_col('token').groupby('document_id') + ... .to_pandas()) token document_id 0 The quick brown fox 1 2 brown fox jumped over 1 @@ -92,18 +94,24 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer: 6 the lazy dog 1 9 I love sandwiches 2 - ``min_window_size`` determines the minimum size of the context windows that are generated - This can be used to trim the last few context windows which have size less than - ``min_window_size``. By default context windows of size 1 are skipped. + ``min_window_size`` determines the minimum size of the context windows + that are generated. This can be used to trim the last few context windows + which have size less than ``min_window_size``. + By default context windows of size 1 are skipped. - >>> contextualize(data).window(6).stride(3).text_col('token').groupby('document_id').to_pandas() + >>> (contextualize(data) + ... .window(6).stride(3).text_col('token').groupby('document_id') + ... 
.to_pandas()) token document_id 0 The quick brown fox jumped over 1 3 fox jumped over the lazy dog 1 6 the lazy dog 1 9 I love sandwiches 2 - >>> contextualize(data).window(6).stride(3).min_window_size(4).text_col('token').groupby('document_id').to_pandas() + >>> (contextualize(data) + ... .window(6).stride(3).min_window_size(4).text_col('token') + ... .groupby('document_id') + ... .to_pandas()) token document_id 0 The quick brown fox jumped over 1 3 fox jumped over the lazy dog 1 @@ -113,7 +121,9 @@ def contextualize(raw_df: "pd.DataFrame") -> Contextualizer: class Contextualizer: - """Create context windows from a DataFrame. See [lancedb.context.contextualize][].""" + """Create context windows from a DataFrame. + See [lancedb.context.contextualize][]. + """ def __init__(self, raw_df): self._text_col = None @@ -183,7 +193,7 @@ class Contextualizer: deprecated_in="0.3.1", removed_in="0.4.0", current_version=__version__, - details="Use the bar function instead", + details="Use to_pandas() instead", ) def to_df(self) -> "pd.DataFrame": return self.to_pandas() diff --git a/python/lancedb/db.py b/python/lancedb/db.py index 6cdbce33..f03e9614 100644 --- a/python/lancedb/db.py +++ b/python/lancedb/db.py @@ -52,12 +52,24 @@ class DBConnection(ABC): ---------- name: str The name of the table. - data: list, tuple, dict, pd.DataFrame; optional - The data to initialize the table. User must provide at least one of `data` or `schema`. - schema: pyarrow.Schema or LanceModel; optional - The schema of the table. + data: The data to initialize the table, *optional* + User must provide at least one of `data` or `schema`. + Acceptable types are: + + - dict or list-of-dict + + - pandas.DataFrame + + - pyarrow.Table or pyarrow.RecordBatch + schema: The schema of the table, *optional* + Acceptable types are: + + - pyarrow.Schema + + - [LanceModel][lancedb.pydantic.LanceModel] mode: str; default "create" - The mode to use when creating the table. Can be either "create" or "overwrite". 
+ The mode to use when creating the table. + Can be either "create" or "overwrite". By default, if the table already exists, an exception is raised. If you want to overwrite the table, use mode="overwrite". on_bad_vectors: str, default "error" @@ -150,7 +162,8 @@ class DBConnection(ABC): ... for i in range(5): ... yield pa.RecordBatch.from_arrays( ... [ - ... pa.array([[3.1, 4.1], [5.9, 26.5]], pa.list_(pa.float32(), 2)), + ... pa.array([[3.1, 4.1], [5.9, 26.5]], + ... pa.list_(pa.float32(), 2)), ... pa.array(["foo", "bar"]), ... pa.array([10.0, 20.0]), ... ], diff --git a/python/lancedb/query.py b/python/lancedb/query.py index f8162efc..7fb31af1 100644 --- a/python/lancedb/query.py +++ b/python/lancedb/query.py @@ -30,7 +30,40 @@ pd = safe_import_pandas() class Query(pydantic.BaseModel): - """A Query""" + """The LanceDB Query + + Attributes + ---------- + vector : List[float] + the vector to search for + filter : Optional[str] + sql filter to refine the query with, optional + prefilter : bool + if True then apply the filter before vector search + k : int + top k results to return + metric : str + the distance metric between a pair of vectors, + + can support L2 (default), Cosine and Dot. + [metric definitions][search] + columns : Optional[List[str]] + which columns to return in the results + nprobes : int + The number of probes used - optional + + - A higher number makes search more accurate but also slower. + + - See discussion in [Querying an ANN Index][querying-an-ann-index] for + tuning advice. + refine_factor : Optional[int] + Refine the results by reading extra elements and re-ranking them in memory - optional + + - A higher number makes search more accurate but also slower. + + - See discussion in [Querying an ANN Index][querying-an-ann-index] for + tuning advice. 
+ """ vector_column: str = VECTOR_COLUMN_NAME @@ -61,6 +94,10 @@ class Query(pydantic.BaseModel): class LanceQueryBuilder(ABC): + """Build LanceDB query based on specific query type: + vector or full text search. + """ + @classmethod def create( cls, @@ -133,11 +170,11 @@ class LanceQueryBuilder(ABC): deprecated_in="0.3.1", removed_in="0.4.0", current_version=__version__, - details="Use the bar function instead", + details="Use to_pandas() instead", ) def to_df(self) -> "pd.DataFrame": """ - Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead. + *Deprecated alias for `to_pandas()`. Please use `to_pandas()` instead.* Execute the query and return the results as a pandas DataFrame. In addition to the selected columns, LanceDB also returns a vector @@ -253,8 +290,6 @@ class LanceQueryBuilder(ABC): class LanceVectorQueryBuilder(LanceQueryBuilder): """ - A builder for nearest neighbor queries for LanceDB. - Examples -------- >>> import lancedb @@ -310,7 +345,7 @@ class LanceVectorQueryBuilder(LanceQueryBuilder): Higher values will yield better recall (more likely to find vectors if they exist) at the expense of latency. - See discussion in [Querying an ANN Index][../querying-an-ann-index] for + See discussion in [Querying an ANN Index][querying-an-ann-index] for tuning advice. 
Parameters @@ -397,6 +432,8 @@ class LanceVectorQueryBuilder(LanceQueryBuilder): class LanceFtsQueryBuilder(LanceQueryBuilder): + """A builder for full text search for LanceDB.""" + def __init__(self, table: "lancedb.table.Table", query: str): super().__init__(table) self._query = query diff --git a/python/lancedb/remote/db.py b/python/lancedb/remote/db.py index f087ddc5..6b018062 100644 --- a/python/lancedb/remote/db.py +++ b/python/lancedb/remote/db.py @@ -104,7 +104,11 @@ class RemoteDBConnection(DBConnection): raise ValueError("Either data or schema must be provided.") if data is not None: data = _sanitize_data( - data, schema, on_bad_vectors=on_bad_vectors, fill_value=fill_value + data, + schema, + metadata=None, + on_bad_vectors=on_bad_vectors, + fill_value=fill_value, ) else: if schema is None: diff --git a/python/lancedb/table.py b/python/lancedb/table.py index a28b12f0..7d9717d9 100644 --- a/python/lancedb/table.py +++ b/python/lancedb/table.py @@ -149,13 +149,13 @@ class Table(ABC): @property @abstractmethod def schema(self) -> pa.Schema: - """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#) of - this Table + """The [Arrow Schema](https://arrow.apache.org/docs/python/api/datatypes.html#) + of this Table """ raise NotImplementedError - def to_pandas(self): + def to_pandas(self) -> "pd.DataFrame": """Return the table as a pandas DataFrame. Returns @@ -191,17 +191,18 @@ class Table(ABC): The distance metric to use when creating the index. Valid values are "L2", "cosine", or "dot". L2 is euclidean distance. - num_partitions: int + num_partitions: int, default 256 The number of IVF partitions to use when creating the index. Default is 256. - num_sub_vectors: int + num_sub_vectors: int, default 96 The number of PQ sub-vectors to use when creating the index. Default is 96. vector_column_name: str, default "vector" The vector column name to create the index. 
replace: bool, default True - If True, replace the existing index if it exists. - If False, raise an error if duplicate index exists. + - If True, replace the existing index if it exists. + + - If False, raise an error if duplicate index exists. accelerator: str, default None If set, use the given accelerator to create the index. Only support "cuda" for now. @@ -220,8 +221,14 @@ class Table(ABC): Parameters ---------- - data: list-of-dict, dict, pd.DataFrame - The data to insert into the table. + data: DATA + The data to insert into the table. Acceptable types are: + + - dict or list-of-dict + + - pandas.DataFrame + + - pyarrow.Table or pyarrow.RecordBatch mode: str The mode to use when writing the data. Valid values are "append" and "overwrite". @@ -242,31 +249,70 @@ class Table(ABC): query_type: str = "auto", ) -> LanceQueryBuilder: """Create a search query to find the nearest neighbors - of the given query vector. + of the given query vector. We currently support [vector search][search] + and [full-text search][experimental-full-text-search]. + + All query options are defined in [Query][lancedb.query.Query]. + + Examples + -------- + >>> import lancedb + >>> db = lancedb.connect("./.lancedb") + >>> data = [ + ... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]}, + ... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]}, + ... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]} + ... ] + >>> table = db.create_table("my_table", data) + >>> query = [0.4, 1.4, 2.4] + >>> (table.search(query, vector_column_name="vector") + ... .where("original_width > 1000", prefilter=True) + ... .select(["caption", "original_width"]) + ... .limit(2) + ... .to_pandas()) + caption original_width vector _distance + 0 foo 2000 [0.5, 3.4, 1.3] 5.220000 + 1 test 3000 [0.3, 6.2, 2.6] 23.089996 Parameters ---------- - query: str, list, np.ndarray, PIL.Image.Image, default None - The query to search for. 
If None then - the select/where/limit clauses are applied to filter + query: list/np.ndarray/str/PIL.Image.Image, default None + The targeted vector to search for. + + - *default None*. + Acceptable types are: list, np.ndarray, PIL.Image.Image + + - If None then the select/where/limit clauses are applied to filter the table - vector_column_name: str, default "vector" + vector_column_name: str The name of the vector column to search. - query_type: str, default "auto" - "vector", "fts", or "auto" - If "auto" then the query type is inferred from the query; - If `query` is a list/np.ndarray then the query type is "vector"; - If `query` is a PIL.Image.Image then either do vector search - or raise an error if no corresponding embedding function is found. - If `query` is a string, then the query type is "vector" if the + *default "vector"* + query_type: str + *default "auto"*. + Acceptable types are: "vector", "fts", or "auto" + + - If "auto" then the query type is inferred from the query; + + - If `query` is a list/np.ndarray then the query type is + "vector"; + + - If `query` is a PIL.Image.Image then either do vector search, + or raise an error if no corresponding embedding function is found. + + - If `query` is a string, then the query type is "vector" if the table has embedding functions else the query type is "fts" Returns ------- LanceQueryBuilder A query builder object representing the query. - Once executed, the query returns selected columns, the vector, - and also the "_distance" column which is the distance between the query + Once executed, the query returns + + - selected columns + + - the vector + + - and also the "_distance" column which is the distance between the query vector and the returned vector. """ raise NotImplementedError @@ -285,14 +331,19 @@ class Table(ABC): Parameters ---------- where: str - The SQL where clause to use when deleting rows. For example, 'x = 2' - or 'x IN (1, 2, 3)'. The filter must not be empty, or it will error. 
+ The SQL where clause to use when deleting rows. + + - For example, 'x = 2' or 'x IN (1, 2, 3)'. + + The filter must not be empty, or it will error. Examples -------- >>> import lancedb >>> data = [ - ... {"x": 1, "vector": [1, 2]}, {"x": 2, "vector": [3, 4]}, {"x": 3, "vector": [5, 6]} + ... {"x": 1, "vector": [1, 2]}, + ... {"x": 2, "vector": [3, 4]}, + ... {"x": 3, "vector": [5, 6]} ... ] >>> db = lancedb.connect("./.lancedb") >>> table = db.create_table("my_table", data) @@ -377,7 +428,8 @@ class LanceTable(Table): -------- >>> import lancedb >>> db = lancedb.connect("./.lancedb") - >>> table = db.create_table("my_table", [{"vector": [1.1, 0.9], "type": "vector"}]) + >>> table = db.create_table("my_table", + ... [{"vector": [1.1, 0.9], "type": "vector"}]) >>> table.version 2 >>> table.to_pandas() @@ -424,7 +476,8 @@ class LanceTable(Table): -------- >>> import lancedb >>> db = lancedb.connect("./.lancedb") - >>> table = db.create_table("my_table", [{"vector": [1.1, 0.9], "type": "vector"}]) + >>> table = db.create_table("my_table", [ + ... {"vector": [1.1, 0.9], "type": "vector"}]) >>> table.version 2 >>> table.to_pandas() @@ -669,14 +722,39 @@ class LanceTable(Table): query_type: str = "auto", ) -> LanceQueryBuilder: """Create a search query to find the nearest neighbors - of the given query vector. + of the given query vector. We currently support [vector search][search] + and [full-text search][experimental-full-text-search]. + + Examples + -------- + >>> import lancedb + >>> db = lancedb.connect("./.lancedb") + >>> data = [ + ... {"original_width": 100, "caption": "bar", "vector": [0.1, 2.3, 4.5]}, + ... {"original_width": 2000, "caption": "foo", "vector": [0.5, 3.4, 1.3]}, + ... {"original_width": 3000, "caption": "test", "vector": [0.3, 6.2, 2.6]} + ... ] + >>> table = db.create_table("my_table", data) + >>> query = [0.4, 1.4, 2.4] + >>> (table.search(query, vector_column_name="vector") + ... .where("original_width > 1000", prefilter=True) + ... 
.select(["caption", "original_width"]) + ... .limit(2) + ... .to_pandas()) + caption original_width vector _distance + 0 foo 2000 [0.5, 3.4, 1.3] 5.220000 + 1 test 3000 [0.3, 6.2, 2.6] 23.089996 Parameters ---------- - query: str, list, np.ndarray, a PIL Image or None - The query to search for. If None then - the select/where/limit clauses are applied to filter - the table + query: list/np.ndarray/str/PIL.Image.Image, default None + The targeted vector to search for. + + - *default None*. + Acceptable types are: list, np.ndarray, PIL.Image.Image + + - If None then the select/[where][sql]/limit clauses are applied + to filter the table vector_column_name: str, default "vector" The name of the vector column to search. query_type: str, default "auto" @@ -685,7 +763,7 @@ class LanceTable(Table): If `query` is a list/np.ndarray then the query type is "vector"; If `query` is a PIL.Image.Image then either do vector search or raise an error if no corresponding embedding function is found. - If the query is a string, then the query type is "vector" if the + If the `query` is a string, then the query type is "vector" if the table has embedding functions, else the query type is "fts" Returns @@ -720,7 +798,9 @@ class LanceTable(Table): -------- >>> import lancedb >>> data = [ - ... {"x": 1, "vector": [1, 2]}, {"x": 2, "vector": [3, 4]}, {"x": 3, "vector": [5, 6]} + ... {"x": 1, "vector": [1, 2]}, + ... {"x": 2, "vector": [3, 4]}, + ... {"x": 3, "vector": [5, 6]} ... ] >>> db = lancedb.connect("./.lancedb") >>> table = db.create_table("my_table", data) @@ -740,7 +820,8 @@ class LanceTable(Table): The data to insert into the table. At least one of `data` or `schema` must be provided. schema: pa.Schema or LanceModel, optional - The schema of the table. If not provided, the schema is inferred from the data. + The schema of the table. If not provided, + the schema is inferred from the data. At least one of `data` or `schema` must be provided. 
mode: str, default "create" The mode to use when writing the data. Valid values are @@ -811,7 +892,8 @@ class LanceTable(Table): file_info = fs.get_file_info(path) if file_info.type != pa.fs.FileType.Directory: raise FileNotFoundError( - f"Table {name} does not exist. Please first call db.create_table({name}, data)" + f"Table {name} does not exist. " + f"Please first call db.create_table({name}, data)" ) return tbl @@ -838,7 +920,9 @@ class LanceTable(Table): -------- >>> import lancedb >>> data = [ - ... {"x": 1, "vector": [1, 2]}, {"x": 2, "vector": [3, 4]}, {"x": 3, "vector": [5, 6]} + ... {"x": 1, "vector": [1, 2]}, + ... {"x": 2, "vector": [3, 4]}, + ... {"x": 3, "vector": [5, 6]} ... ] >>> db = lancedb.connect("./.lancedb") >>> table = db.create_table("my_table", data) @@ -1013,7 +1097,8 @@ def _sanitize_vector_column( # ChunkedArray is annoying to work with, so we combine chunks here vec_arr = data[vector_column_name].combine_chunks() if pa.types.is_list(data[vector_column_name].type): - # if it's a variable size list array we make sure the dimensions are all the same + # if it's a variable size list array, + # we make sure the dimensions are all the same has_jagged_ndims = len(vec_arr.values) % len(data) != 0 if has_jagged_ndims: data = _sanitize_jagged(