feat: telemetry, error tracking, CLI & config manager (#538)

Co-authored-by: Lance Release <lance-dev@lancedb.com> Co-authored-by: Rob Meng <rob.xu.meng@gmail.com> Co-authored-by: Will Jones <willjones127@gmail.com> Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com> Co-authored-by: rmeng <rob@lancedb.com> Co-authored-by: Chang She <chang@lancedb.com> Co-authored-by: Rok Mihevc <rok@mihevc.org>
2026-01-09 13:22:58 +00:00 · 2023-10-08 23:11:39 +05:30
parent a26c8f3316
commit a1377afcaa
14 changed files with 1057 additions and 1 deletions
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -96,6 +96,8 @@ nav:
    - Serverless Website Chatbot: examples/serverless_website_chatbot.md
    - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md
    - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md
+  - ⚙️ CLI & Config: cli_config.md
+
 - Basics: basic.md
 - Guides:
  - Tables: guides/tables.md
--- a/docs/src/cli_config.md
+++ b/docs/src/cli_config.md
@@ -0,0 +1,37 @@
+
+## LanceDB CLI
+Once LanceDB is installed, you can access the CLI using the `lancedb` command in your console:
+```
+lancedb
+```
+This lists all the available command-line options. To get usage help for a particular command, run:
+```
+lancedb {command} --help
+```
+
+## LanceDB config
+LanceDB uses a global config file to store certain settings. These settings are configurable using the LanceDB CLI.
+To view your config settings, you can use:
+```
+lancedb config
+```
+These config parameters can be changed using the CLI:
+```
+lancedb {config_name} --{argument}
+```
+
+## LanceDB Diagnostics
+When enabled, LanceDB will send anonymous events to help us improve LanceDB. These diagnostics are used only for error reporting and no data is collected. Errors & stats allow us to automate certain aspects of bug reporting, prioritization of fixes and feature requests.
+Diagnostics are enabled by default; you can opt out (or back in) at any time using the `lancedb diagnostics` command.
+Get usage help.
+```
+lancedb diagnostics --help
+```
+Disable diagnostics
+```
+lancedb diagnostics --disabled
+```
+Enable diagnostics
+```
+lancedb diagnostics --enabled
+```
--- a/python/lancedb/init.py
+++ b/python/lancedb/init.py
@@ -17,6 +17,7 @@ from typing import Optional
 from .db import URI, DBConnection, LanceDBConnection
 from .remote.db import RemoteDBConnection
 from .schema import vector
+from .utils import sentry_log

 __version__ = importlib.metadata.version("lancedb")

--- a/python/lancedb/cli/init.py
+++ b/python/lancedb/cli/init.py
@@ -0,0 +1,12 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
--- a/python/lancedb/cli/cli.py
+++ b/python/lancedb/cli/cli.py
@@ -0,0 +1,46 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import click
+
+from lancedb.utils import CONFIG
+
+
+# Root command group of the CLI; the subcommands defined below attach to it
+# through ``@cli.command``. The one-line docstring is kept as-is because it is
+# runtime text (presumably shown by click as the group's help -- confirm).
+@click.group()
+@click.version_option(help="LanceDB command line interface entry point")
+def cli():
+    "LanceDB command line interface"
+
+
+diagnostics_help = """
+Enable or disable LanceDB diagnostics. When enabled, LanceDB will send anonymous events to help us improve LanceDB.
+These diagnostics are used only for error reporting and no data is collected. You can find more about diagnosis on
+our docs: https://lancedb.github.io/lancedb/cli_config/
+"""
+
+
+@cli.command(help=diagnostics_help)
+@click.option("--enabled/--disabled", default=True)
+def diagnostics(enabled):
+    CONFIG.update({"diagnostics": True if enabled else False})
+    click.echo("LanceDB diagnostics is %s" % ("enabled" if enabled else "disabled"))
+
+
+@cli.command(help="Show current LanceDB configuration")
+def config():
+    # TODO: pretty print as table with colors and formatting
+    click.echo("Current LanceDB configuration:")
+    cfg = CONFIG.copy()
+    cfg.pop("uuid")  # Don't show uuid as it is not configurable
+    for item, amount in cfg.items():
+        click.echo("{} ({})".format(item, amount))
--- a/python/lancedb/table.py
+++ b/python/lancedb/table.py
@@ -33,6 +33,7 @@ from .embeddings.functions import EmbeddingFunctionConfig
 from .pydantic import LanceModel
 from .query import LanceQueryBuilder, Query
 from .util import fs_from_uri, safe_import_pandas
+from .utils.events import register_event

 pd = safe_import_pandas()

@@ -496,6 +497,7 @@ class LanceTable(Table):
            accelerator=accelerator,
        )
        self._reset_dataset()
+        register_event("create_index")

    def create_fts_index(self, field_names: Union[str, List[str]]):
        """Create a full-text search index on the table.
@@ -514,6 +516,7 @@ class LanceTable(Table):
            field_names = [field_names]
        index = create_index(self._get_fts_index_path(), field_names)
        populate_index(index, self, field_names)
+        register_event("create_fts_index")

    def _get_fts_index_path(self):
        return os.path.join(self._dataset_uri, "_indices", "tantivy")
@@ -566,6 +569,7 @@ class LanceTable(Table):
        )
        lance.write_dataset(data, self._dataset_uri, schema=self.schema, mode=mode)
        self._reset_dataset()
+        register_event("add")

    def merge(
        self,
@@ -629,6 +633,7 @@ class LanceTable(Table):
            other_table, left_on=left_on, right_on=right_on, schema=schema
        )
        self._reset_dataset()
+        register_event("merge")

    @cached_property
    def embedding_functions(self) -> dict:
@@ -679,6 +684,7 @@ class LanceTable(Table):
            and also the "_distance" column which is the distance between the query
            vector and the returned vector.
        """
+        register_event("search")
        return LanceQueryBuilder.create(
            self, query, query_type, vector_column_name=vector_column_name
        )
@@ -782,6 +788,7 @@ class LanceTable(Table):
        if data is not None:
            table.add(data)

+        register_event("create_table")
        return table

    @classmethod
@@ -847,6 +854,7 @@ class LanceTable(Table):
        self.delete(where)
        self.add(orig_data, mode="append")
        self._reset_dataset()
+        register_event("update")

    def _execute_query(self, query: Query) -> pa.Table:
        ds = self.to_lance()
--- a/python/lancedb/utils/init.py
+++ b/python/lancedb/utils/init.py
@@ -0,0 +1,15 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from .config import Config
+
+# Shared configuration object, constructed once when this package is imported.
+CONFIG = Config()
--- a/python/lancedb/utils/config.py
+++ b/python/lancedb/utils/config.py
@@ -0,0 +1,116 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import copy
+import hashlib
+import os
+import platform
+import uuid
+from pathlib import Path
+
+from .general import LOGGER, is_dir_writeable, yaml_load, yaml_save
+
+
+def get_user_config_dir(sub_dir="lancedb"):
+    """
+    Get the user config directory.
+
+    Args:
+        sub_dir (str): The name of the subdirectory to create.
+
+    Returns:
+        (Path): The path to the user config directory.
+    """
+    # Return the appropriate config directory for each operating system
+    if platform.system() == "Windows":
+        path = Path.home() / "AppData" / "Roaming" / sub_dir
+    elif platform.system() == "Darwin":
+        path = Path.home() / "Library" / "Application Support" / sub_dir
+    elif platform.system() == "Linux":
+        path = Path.home() / ".config" / sub_dir
+    else:
+        raise ValueError(f"Unsupported operating system: {platform.system()}")
+
+    # GCP and AWS lambda fix, only /tmp is writeable
+    if not is_dir_writeable(path.parent):
+        LOGGER.warning(
+            f"WARNING ⚠️ user config directory '{path}' is not writeable, defaulting to '/tmp' or CWD."
+            "Alternatively you can define a LANCEDB_CONFIG_DIR environment variable for this path."
+        )
+        path = (
+            Path("/tmp") / sub_dir
+            if is_dir_writeable("/tmp")
+            else Path().cwd() / sub_dir
+        )
+
+    # Create the subdirectory if it does not exist
+    path.mkdir(parents=True, exist_ok=True)
+
+    return path
+
+
+# A non-empty LANCEDB_CONFIG_DIR environment variable takes precedence; the
+# per-OS default is only computed (and created) when it is unset or empty.
+USER_CONFIG_DIR = Path(os.getenv("LANCEDB_CONFIG_DIR") or get_user_config_dir())
+CONFIG_FILE = USER_CONFIG_DIR / "config.yaml"
+
+
+class Config(dict):
+    """
+    Manages lancedb config stored in a YAML file.
+
+    Args:
+        file (str | Path): Path to the lancedb config YAML file. Default is USER_CONFIG_DIR / 'config.yaml'.
+    """
+
+    def __init__(self, file=CONFIG_FILE):
+        self.file = Path(file)
+        self.defaults = {  # Default global config values
+            "diagnostics": True,
+            "uuid": hashlib.sha256(str(uuid.getnode()).encode()).hexdigest(),
+        }
+
+        super().__init__(copy.deepcopy(self.defaults))
+
+        if not self.file.exists():
+            self.save()
+
+        self.load()
+        correct_keys = self.keys() == self.defaults.keys()
+        correct_types = all(
+            type(a) is type(b) for a, b in zip(self.values(), self.defaults.values())
+        )
+        if not (correct_keys and correct_types):
+            LOGGER.warning(
+                "WARNING ⚠️ LanceDB settings reset to default values. This may be due to a possible problem "
+                "with your settings or a recent package update. "
+                f"\nView settings & usage with 'lancedb settings' or at '{self.file}'"
+            )
+            self.reset()
+
+    def load(self):
+        """Loads settings from the YAML file."""
+        super().update(yaml_load(self.file))
+
+    def save(self):
+        """Saves the current settings to the YAML file."""
+        yaml_save(self.file, dict(self))
+
+    def update(self, *args, **kwargs):
+        """Updates a setting value in the current settings."""
+        super().update(*args, **kwargs)
+        self.save()
+
+    def reset(self):
+        """Resets the settings to default and saves them."""
+        self.clear()
+        self.update(self.defaults)
+        self.save()
--- a/python/lancedb/utils/events.py
+++ b/python/lancedb/utils/events.py
@@ -0,0 +1,161 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import datetime
+import importlib.metadata
+import platform
+import random
+import sys
+import time
+
+from lancedb.utils import CONFIG
+from lancedb.utils.general import TryExcept
+
+from .general import (
+    PLATFORMS,
+    get_git_origin_url,
+    is_git_dir,
+    is_github_actions_ci,
+    is_online,
+    is_pip_package,
+    is_pytest_running,
+    threaded_request,
+)
+
+
+class _Events:
+    """
+    A class for collecting anonymous event analytics. Event analytics are enabled when ``diagnostics=True`` in config and
+    disabled when ``diagnostics=False``.
+
+    You can enable or disable diagnostics by running ``lancedb diagnostics --enabled`` or ``lancedb diagnostics --disabled``.
+
+    Attributes
+    ----------
+    url : str
+        The URL to send anonymous events.
+    rate_limit : float
+        The rate limit in seconds for sending events.
+    metadata : dict
+        A dictionary containing metadata about the environment.
+    enabled : bool
+        A flag to enable or disable Events based on certain conditions.
+    """
+
+    _instance = None
+
+    url = "https://app.posthog.com/capture/"
+    headers = {"Content-Type": "application/json"}
+    api_key = "phc_oENDjGgHtmIDrV6puUiFem2RB4JA8gGWulfdulmMdZP"
+    # This api-key is write only and is safe to expose in the codebase.
+
+    def __init__(self):
+        """
+        Initializes the Events object with default values for events, rate_limit, and metadata.
+        """
+        self.events = []  # events list
+        self.max_events = 25  # max events to store in memory
+        self.rate_limit = 60.0  # rate limit (seconds)
+        self.time = 0.0
+
+        if is_git_dir():
+            install = "git"
+        elif is_pip_package():
+            install = "pip"
+        else:
+            install = "other"
+        self.metadata = {
+            "cli": sys.argv[0],
+            "install": install,
+            "python": ".".join(platform.python_version_tuple()[:2]),
+            "version": importlib.metadata.version("lancedb"),
+            "platforms": PLATFORMS,
+            "session_id": round(random.random() * 1e15),
+            # 'engagement_time_msec': 1000 # TODO: In future we might be interested in this metric
+        }
+
+        TESTS_RUNNING = is_pytest_running() or is_github_actions_ci()
+        ONLINE = is_online()
+        self.enabled = (
+            CONFIG["diagnostics"]
+            and not TESTS_RUNNING
+            and ONLINE
+            and (
+                is_pip_package()
+                or get_git_origin_url() == "https://github.com/lancedb/lancedb.git"
+            )
+        )
+
+    def __call__(self, event_name, params={}):
+        """
+        Attempts to add a new event to the events list and send events if the rate limit is reached.
+
+        Args
+        ----
+        event_name : str
+            The name of the event to be logged.
+        params : dict, optional
+            A dictionary of additional parameters to be logged with the event.
+        """
+        ### NOTE: We might need a way to tag a session with a label to check usage from a source. Setting label should be exposed to the user.
+        if not self.enabled:
+            return
+        if (
+            len(self.events) < self.max_events
+        ):  # Events list limited to 25 events (drop any events past this)
+            params.update(self.metadata)
+            self.events.append(
+                {
+                    "event": event_name,
+                    "properties": params,
+                    "timestamp": datetime.datetime.now(
+                        tz=datetime.timezone.utc
+                    ).isoformat(),
+                    "distinct_id": CONFIG["uuid"],
+                }
+            )
+
+        # Check rate limit
+        t = time.time()
+        if (t - self.time) < self.rate_limit:
+            return
+        # Time is over rate limiter, send now
+        data = {
+            "api_key": self.api_key,
+            "distinct_id": CONFIG["uuid"],  # posthog needs this to accepts the event
+            "batch": self.events,
+        }
+
+        # POST equivalent to requests.post(self.url, json=data).
+        # threaded request is used to avoid blocking, retries are disabled, and verbose is disabled
+        # to avoid any possible disruption in the console.
+        threaded_request(
+            method="post",
+            url=self.url,
+            headers=self.headers,
+            json=data,
+            retry=0,
+            verbose=False,
+        )
+
+        # Flush & Reset
+        self.events = []
+        self.time = t
+
+
+@TryExcept(verbose=False)
+def register_event(name: str, **kwargs):
+    if _Events._instance is None:
+        _Events._instance = _Events()
+
+    _Events._instance(name, **kwargs)
--- a/python/lancedb/utils/general.py
+++ b/python/lancedb/utils/general.py
@@ -0,0 +1,445 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import contextlib
+import importlib
+import logging.config
+import os
+import platform
+import subprocess
+import sys
+import threading
+import time
+from pathlib import Path
+from typing import Union
+
+import requests
+import yaml
+
+LOGGING_NAME = "lancedb"
+# True when LANCEDB_VERBOSE is unset or equals "true" (case-insensitive);
+# any other value turns verbose mode off.
+VERBOSE = (
+    str(os.getenv("LANCEDB_VERBOSE", True)).lower() == "true"
+)  # global verbose mode
+
+
+def set_logging(name=LOGGING_NAME, verbose=True):
+    """Sets up logging for the given name.
+
+    Parameters
+    ----------
+    name : str, optional
+        The name of the logger. Default is 'lancedb'.
+    verbose : bool, optional
+        Whether to enable verbose logging. Default is True.
+    """
+
+    rank = int(os.getenv("RANK", -1))  # rank in world for Multi-GPU trainings
+    level = logging.INFO if verbose and rank in {-1, 0} else logging.ERROR
+    logging.config.dictConfig(
+        {
+            "version": 1,
+            "disable_existing_loggers": False,
+            "formatters": {name: {"format": "%(message)s"}},
+            "handlers": {
+                name: {
+                    "class": "logging.StreamHandler",
+                    "formatter": name,
+                    "level": level,
+                }
+            },
+            "loggers": {name: {"level": level, "handlers": [name], "propagate": False}},
+        }
+    )
+
+
+# Configure logging once at import time, then expose the package logger.
+set_logging(LOGGING_NAME, verbose=VERBOSE)
+LOGGER = logging.getLogger(LOGGING_NAME)
+
+
+def is_pip_package(filepath: str = __name__) -> bool:
+    """Determines if the file at the given filepath is part of a pip package.
+
+    Parameters
+    ----------
+    filepath : str, optional
+        The filepath to check. Default is the current file.
+
+    Returns
+    -------
+    bool
+        True if the file is part of a pip package, False otherwise.
+    """
+    # Get the spec for the module
+    spec = importlib.util.find_spec(filepath)
+
+    # Return whether the spec is not None and the origin is not None (indicating it is a package)
+    return spec is not None and spec.origin is not None
+
+
+def is_pytest_running():
+    """Determines whether pytest is currently running or not.
+
+    Returns
+    -------
+    bool
+        True if pytest is running, False otherwise.
+    """
+    return (
+        ("PYTEST_CURRENT_TEST" in os.environ)
+        or ("pytest" in sys.modules)
+        or ("pytest" in Path(sys.argv[0]).stem)
+    )
+
+
+def is_github_actions_ci() -> bool:
+    """
+    Determine if the current environment is a GitHub Actions CI Python runner.
+
+    Returns
+    -------
+    bool
+        True if the current environment is a GitHub Actions CI Python runner, False otherwise.
+    """
+
+    return (
+        "GITHUB_ACTIONS" in os.environ
+        and "RUNNER_OS" in os.environ
+        and "RUNNER_TOOL_CACHE" in os.environ
+    )
+
+
+def is_git_dir():
+    """
+    Determines whether the current file is part of a git repository.
+    If the current file is not part of a git repository, returns None.
+
+    Returns
+    -------
+    bool
+        True if current file is part of a git repository.
+    """
+    return get_git_dir() is not None
+
+
+def is_online() -> bool:
+    """
+    Check internet connectivity by attempting to connect to a known online host.
+
+    Returns
+    -------
+    bool
+        True if connection is successful, False otherwise.
+    """
+    import socket
+
+    for host in "1.1.1.1", "8.8.8.8", "223.5.5.5":  # Cloudflare, Google, AliDNS:
+        try:
+            test_connection = socket.create_connection(address=(host, 53), timeout=2)
+        except (socket.timeout, socket.gaierror, OSError):
+            continue
+        else:
+            # If the connection was successful, close it to avoid a ResourceWarning
+            test_connection.close()
+            return True
+    return False
+
+
+def is_dir_writeable(dir_path: Union[str, Path]) -> bool:
+    """Check if a directory is writeable.
+
+    Parameters
+    ----------
+    dir_path : Union[str, Path]
+        The path to the directory.
+
+    Returns
+    -------
+    bool
+        True if the directory is writeable, False otherwise.
+    """
+    return os.access(str(dir_path), os.W_OK)
+
+
+def is_colab():
+    """Check if the current script is running inside a Google Colab notebook.
+
+    Returns
+    -------
+    bool
+        True if running inside a Colab notebook, False otherwise.
+    """
+    return "COLAB_RELEASE_TAG" in os.environ or "COLAB_BACKEND_VERSION" in os.environ
+
+
+def is_kaggle():
+    """Check if the current script is running inside a Kaggle kernel.
+
+    Returns
+    -------
+    bool
+        True if running inside a Kaggle kernel, False otherwise.
+    """
+    return (
+        os.environ.get("PWD") == "/kaggle/working"
+        and os.environ.get("KAGGLE_URL_BASE") == "https://www.kaggle.com"
+    )
+
+
+def is_jupyter():
+    """Check if the current script is running inside a Jupyter Notebook.
+
+    Returns
+    -------
+    bool
+        True if running inside a Jupyter Notebook, False otherwise.
+    """
+    with contextlib.suppress(Exception):
+        from IPython import get_ipython
+
+        return get_ipython() is not None
+    return False
+
+
+def is_docker() -> bool:
+    """Determine if the script is running inside a Docker container.
+
+    Returns
+    -------
+    bool
+        True if the script is running inside a Docker container, False otherwise.
+    """
+    file = Path("/proc/self/cgroup")
+    if file.exists():
+        with open(file) as f:
+            return "docker" in f.read()
+    else:
+        return False
+
+
+def get_git_dir():
+    """Determine whether the current file is part of a git repository and if so, returns the repository root directory.
+    If the current file is not part of a git repository, returns None.
+
+    Returns
+    -------
+    Path | None
+        Git root directory if found or None if not found.
+    """
+    for d in Path(__file__).parents:
+        if (d / ".git").is_dir():
+            return d
+
+
+def get_git_origin_url():
+    """Retrieve the origin URL of a git repository.
+
+    Returns
+    -------
+    str | None
+        The origin URL of the git repository or None if not git directory.
+    """
+    if is_git_dir():
+        with contextlib.suppress(subprocess.CalledProcessError):
+            origin = subprocess.check_output(
+                ["git", "config", "--get", "remote.origin.url"]
+            )
+            return origin.decode().strip()
+
+
+def yaml_save(file="data.yaml", data=None, header=""):
+    """Save YAML data to a file.
+
+    Parameters
+    ----------
+    file : str, optional
+        File name, by default 'data.yaml'.
+    data : dict, optional
+        Data to save in YAML format, by default None.
+    header : str, optional
+        YAML header to add, by default "".
+    """
+    if data is None:
+        data = {}
+    file = Path(file)
+    if not file.parent.exists():
+        # Create parent directories if they don't exist
+        file.parent.mkdir(parents=True, exist_ok=True)
+
+    # Convert Path objects to strings
+    for k, v in data.items():
+        if isinstance(v, Path):
+            data[k] = str(v)
+
+    # Dump data to file in YAML format
+    with open(file, "w", errors="ignore", encoding="utf-8") as f:
+        if header:
+            f.write(header)
+        yaml.safe_dump(data, f, sort_keys=False, allow_unicode=True)
+
+
+def yaml_load(file="data.yaml", append_filename=False):
+    """
+    Load YAML data from a file.
+
+    Parameters
+    ----------
+    file : str, optional
+        File name. Default is 'data.yaml'.
+    append_filename : bool, optional
+        Add the YAML filename to the YAML dictionary. Default is False.
+
+    Returns
+    -------
+    dict
+        YAML data and file name.
+    """
+    assert Path(file).suffix in (
+        ".yaml",
+        ".yml",
+    ), f"Attempting to load non-YAML file {file} with yaml_load()"
+    with open(file, errors="ignore", encoding="utf-8") as f:
+        s = f.read()  # string
+
+        # Add YAML filename to dict and return
+        data = (
+            yaml.safe_load(s) or {}
+        )  # always return a dict (yaml.safe_load() may return None for empty files)
+        if append_filename:
+            data["yaml_file"] = str(file)
+        return data
+
+
+def yaml_print(yaml_file: Union[str, Path, dict]) -> None:
+    """
+    Pretty prints a YAML file or a YAML-formatted dictionary.
+
+    Parameters
+    ----------
+    yaml_file : Union[str, Path, dict]
+        The file path of the YAML file or a YAML-formatted dictionary.
+
+    Returns
+    -------
+    None
+    """
+    yaml_dict = (
+        yaml_load(yaml_file) if isinstance(yaml_file, (str, Path)) else yaml_file
+    )
+    dump = yaml.dump(yaml_dict, sort_keys=False, allow_unicode=True)
+    LOGGER.info(f"Printing '{yaml_file}'\n\n{dump}")
+
+
+PLATFORMS = [platform.system()]
+if is_colab():
+    PLATFORMS.append("Colab")
+if is_kaggle():
+    PLATFORMS.append("Kaggle")
+if is_jupyter():
+    PLATFORMS.append("Jupyter")
+if is_docker():
+    PLATFORMS.append("Docker")
+
+PLATFORMS = "|".join(PLATFORMS)
+
+
+class TryExcept(contextlib.ContextDecorator):
+    """
+    TryExcept context manager.
+    Usage: @TryExcept() decorator or 'with TryExcept():' context manager.
+    """
+
+    def __init__(self, msg="", verbose=True):
+        """
+        Parameters
+        ----------
+        msg : str, optional
+            Custom message to display in case of exception, by default "".
+        verbose : bool, optional
+            Whether to display the message, by default True.
+        """
+        self.msg = msg
+        self.verbose = verbose
+
+    def __enter__(self):
+        pass
+
+    def __exit__(self, exc_type, value, traceback):
+        if self.verbose and value:
+            LOGGER.info(f"{self.msg}{': ' if self.msg else ''}{value}")
+        return True
+
+
+def threaded_request(
+    method, url, retry=3, timeout=30, thread=True, code=-1, verbose=True, **kwargs
+):
+    """
+    Makes an HTTP request using the 'requests' library, with exponential backoff retries up to a specified timeout.
+
+    Parameters
+    ----------
+    method : str
+        The HTTP method to use for the request. Choices are 'post' and 'get'.
+    url : str
+        The URL to make the request to.
+    retry : int, optional
+        Number of retries to attempt before giving up, by default 3.
+    timeout : int, optional
+        Timeout in seconds after which the function will give up retrying, by default 30.
+    thread : bool, optional
+        Whether to execute the request in a separate daemon thread, by default True.
+    code : int, optional
+        An identifier for the request, used for logging purposes, by default -1.
+    verbose : bool, optional
+        A flag to determine whether to print out to console or not, by default True.
+
+    Returns
+    -------
+    requests.Response
+        The HTTP response object. If the request is executed in a separate thread, returns the thread itself.
+    """
+    retry_codes = ()  # retry only these codes TODO: add codes if needed in future (500, 408)
+
+    @TryExcept(verbose=verbose)
+    def func(method, url, **kwargs):
+        """Make HTTP requests with retries and timeouts, with optional progress tracking."""
+        response = None
+        t0 = time.time()
+        for i in range(retry + 1):
+            if (time.time() - t0) > timeout:
+                break
+            response = requests.request(method, url, **kwargs)
+            if response.status_code < 300:  # good return codes in the 2xx range
+                break
+            try:
+                m = response.json().get("message", "No JSON message.")
+            except AttributeError:
+                m = "Unable to read JSON."
+            if i == 0:
+                if response.status_code in retry_codes:
+                    m += f" Retrying {retry}x for {timeout}s." if retry else ""
+                elif response.status_code == 429:  # rate limit
+                    m = f"Rate limit reached"
+                if verbose:
+                    LOGGER.warning(f"{response.status_code} #{code}")
+                if response.status_code not in retry_codes:
+                    return response
+            time.sleep(2**i)  # exponential standoff
+        return response
+
+    args = method, url
+    if thread:
+        return threading.Thread(
+            target=func, args=args, kwargs=kwargs, daemon=True
+        ).start()
+    else:
+        return func(*args, **kwargs)
--- a/python/lancedb/utils/sentry_log.py
+++ b/python/lancedb/utils/sentry_log.py
@@ -0,0 +1,112 @@
+#  Copyright 2023 LanceDB Developers
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import bdb
+import importlib.metadata
+import logging
+import sys
+from pathlib import Path
+
+from lancedb.utils import CONFIG
+
+from .general import (
+    PLATFORMS,
+    TryExcept,
+    is_git_dir,
+    is_github_actions_ci,
+    is_online,
+    is_pip_package,
+    is_pytest_running,
+)
+
+
+@TryExcept(verbose=False)
+def set_sentry():
+    """
+    Set up Sentry error reporting, but only when every precondition below holds.
+
+    Sentry is initialized only if ALL of these are true (otherwise nothing happens):
+        - CONFIG["diagnostics"] is truthy
+        - is_pytest_running() and is_github_actions_ci() are both falsy
+        - is_online() is truthy
+        - is_pip_package() is truthy
+        - the sentry_sdk package can be imported
+
+    The diagnostics flag is toggled by `lancedb diagnostics --enabled/--disabled`.
+
+    KeyboardInterrupt, FileNotFoundError and bdb.BdbQuit are registered as ignored
+    errors, every event goes through the nested before_send hook, the Sentry user
+    id is set to CONFIG["uuid"], and the "sentry_sdk" and "sentry_sdk.errors"
+    loggers are set to CRITICAL.
+    """
+
+    def before_send(event, hint):
+        """
+        Drop out-of-memory events and replace the tags of every other event.
+
+        Args:
+            event (dict): The event dictionary about to be sent to Sentry.
+            hint (dict): Additional information; may hold an "exc_info" triple.
+
+        Returns:
+            dict: The event with "tags" overwritten, or None to suppress the event.
+        """
+        if "exc_info" in hint:
+            _, exc_value, _ = hint["exc_info"]
+            if "out of memory" in str(exc_value).lower():
+                return None
+
+        # is_git_dir() is checked first, so "git" wins over "pip" when both apply.
+        install = "git" if is_git_dir() else "pip" if is_pip_package() else "other"
+
+        script = sys.argv[0]
+        event["tags"] = {
+            "sys_argv": script,
+            "sys_argv_name": Path(script).name,
+            "install": install,
+            "platforms": PLATFORMS,
+            "version": importlib.metadata.version("lancedb"),
+        }
+        return event
+
+    # The test/CI and online probes are evaluated before the config flag is consulted.
+    tests_running = is_pytest_running() or is_github_actions_ci()
+    online = is_online()
+    # NOTE: also requiring a non-git directory was considered, but may be too restrictive.
+    if not CONFIG["diagnostics"] or tests_running or not online or not is_pip_package():
+        return
+
+    # sentry_sdk is optional: without it there is nothing to configure.
+    try:
+        import sentry_sdk  # noqa
+    except ImportError:
+        return
+
+    sentry_sdk.init(
+        dsn="https://c63ef8c64e05d1aa1a96513361f3ca2f@o4505950840946688.ingest.sentry.io/4505950933614592",
+        debug=False,
+        include_local_variables=False,
+        traces_sample_rate=1.0,
+        environment="production",  # 'dev' or 'production'
+        before_send=before_send,
+        ignore_errors=[KeyboardInterrupt, FileNotFoundError, bdb.BdbQuit],
+    )
+    # Original note on this id: "SHA-256 anonymized UUID hash" (not verifiable from here).
+    sentry_sdk.set_user({"id": CONFIG["uuid"]})
+
+    # Raise sentry's own loggers to CRITICAL so they stay quiet.
+    for logger_name in ("sentry_sdk", "sentry_sdk.errors"):
+        logging.getLogger(logger_name).setLevel(logging.CRITICAL)
+
+
+set_sentry()  # module-level call: executed when this module is imported
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -10,7 +10,10 @@ dependencies = [
    "pydantic>=1.10",
    "attrs>=21.3.0",
    "semver>=3.0",
-    "cachetools"
+    "cachetools",
+    "pyyaml>=6.0",
+    "click>=8.1.7",
+    "requests>=2.31.0"
 ]
 description = "lancedb"
 authors = [{ name = "LanceDB Devs", email = "dev@lancedb.com" }]
@@ -50,6 +53,9 @@ docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
 clip = ["torch", "pillow", "open-clip"]
 embeddings = ["openai", "sentence-transformers", "torch", "pillow", "open-clip"]

+[project.scripts]
+lancedb = "lancedb.cli.cli:cli"
+
 [build-system]
 requires = ["setuptools", "wheel"]
 build-backend = "setuptools.build_meta"
--- a/python/tests/test_cli.py
+++ b/python/tests/test_cli.py
@@ -0,0 +1,35 @@
+from click.testing import CliRunner
+
+from lancedb.cli.cli import cli
+from lancedb.utils import CONFIG
+
+
+def test_entry():
+    """Invoking the bare CLI exits cleanly and its output mentions lancedb."""
+    outcome = CliRunner().invoke(cli)
+    assert outcome.exit_code == 0  # primary check: clean exit
+    assert "lancedb" in outcome.output.lower()  # loose, case-insensitive check
+
+
+def test_diagnostics():
+    """Disable, then enable, diagnostics through the CLI and check CONFIG each time."""
+    runner = CliRunner()
+    # --enabled goes last, so the flag is left switched on when the test ends.
+    for flag, expected in (("--disabled", False), ("--enabled", True)):
+        result = runner.invoke(cli, ["diagnostics", flag])
+        assert result.exit_code == 0  # Main check
+        # Equality, not identity, to keep the original comparison semantics.
+        assert CONFIG["diagnostics"] == expected
+
+
+def test_config():
+    """Check that `lancedb config` exits cleanly and prints every config key.
+
+    Only key names are matched (formatting may change); "uuid" is excluded.
+    """
+    outcome = CliRunner().invoke(cli, ["config"])
+    assert outcome.exit_code == 0  # Main check
+    expected_keys = CONFIG.copy()
+    expected_keys.pop("uuid")  # raises KeyError if "uuid" is absent
+    for key in expected_keys:
+        assert key in outcome.output
--- a/python/tests/test_telemetry.py
+++ b/python/tests/test_telemetry.py
@@ -0,0 +1,60 @@
+import json
+
+import pytest
+
+import lancedb
+from lancedb.utils.events import _Events
+
+
+@pytest.fixture(autouse=True)
+def request_log_path(tmp_path):  # path of a "request.json" file inside pytest's tmp_path
+    return tmp_path / "request.json"
+
+
+def mock_register_event(name: str, **kwargs):
+    """Record *name* through the shared _Events instance with reporting forced on."""
+    if _Events._instance is None:
+        _Events._instance = _Events()
+    events = _Events._instance
+    events.enabled, events.rate_limit = True, 0  # set enabled first, then rate_limit
+    events(name, **kwargs)
+
+
+def test_event_reporting(monkeypatch, request_log_path, tmp_path) -> None:
+    """Creating a table must emit a telemetry batch with the expected keys."""
+
+    def mock_request(**kwargs):
+        # Stand-in for threaded_request: persist the JSON payload instead of sending it.
+        with open(request_log_path, "w") as f:
+            json.dump(kwargs.get("json", {}), f)
+
+    # Force enable registering events and strip exception handling
+    monkeypatch.setattr(lancedb.table, "register_event", mock_register_event)
+    monkeypatch.setattr(lancedb.utils.events, "threaded_request", mock_request)
+
+    try:
+        db = lancedb.connect(tmp_path)
+        db.create_table(
+            "test",
+            data=[
+                {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
+                {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
+            ],
+            mode="overwrite",
+        )
+        assert request_log_path.exists()  # test if event was registered
+        with open(request_log_path, "r") as f:
+            json_data = json.load(f)
+
+        # TODO: don't hardcode these here. Instead create a module level json schema in lancedb.utils.events for better evolvability
+        batch_keys = ["api_key", "distinct_id", "batch"]
+        event_keys = ["event", "properties", "timestamp", "distinct_id"]
+        property_keys = ["cli", "install", "platforms", "version", "session_id"]
+        assert all(key in json_data for key in batch_keys)
+        event = json_data["batch"][0]  # only the first event of the batch is inspected
+        assert all(key in event for key in event_keys)
+        assert all(key in event["properties"] for key in property_keys)
+    finally:
+        # cleanup & reset, even on failure, so the force-enabled singleton never leaks
+        monkeypatch.undo()
+        _Events._instance = None