From a1377afcaa8b98c145b9bc2bc3f48c1cd8b82cc1 Mon Sep 17 00:00:00 2001 From: Ayush Chaurasia Date: Sun, 8 Oct 2023 23:11:39 +0530 Subject: [PATCH] feat: telemetry, error tracking, CLI & config manager (#538) Co-authored-by: Lance Release Co-authored-by: Rob Meng Co-authored-by: Will Jones Co-authored-by: Chang She <759245+changhiskhan@users.noreply.github.com> Co-authored-by: rmeng Co-authored-by: Chang She Co-authored-by: Rok Mihevc --- docs/mkdocs.yml | 2 + docs/src/cli_config.md | 37 +++ python/lancedb/__init__.py | 1 + python/lancedb/cli/__init__.py | 12 + python/lancedb/cli/cli.py | 46 +++ python/lancedb/table.py | 8 + python/lancedb/utils/__init__.py | 15 + python/lancedb/utils/config.py | 116 ++++++++ python/lancedb/utils/events.py | 161 +++++++++++ python/lancedb/utils/general.py | 445 +++++++++++++++++++++++++++++ python/lancedb/utils/sentry_log.py | 112 ++++++++ python/pyproject.toml | 8 +- python/tests/test_cli.py | 35 +++ python/tests/test_telemetry.py | 60 ++++ 14 files changed, 1057 insertions(+), 1 deletion(-) create mode 100644 docs/src/cli_config.md create mode 100644 python/lancedb/cli/__init__.py create mode 100644 python/lancedb/cli/cli.py create mode 100644 python/lancedb/utils/__init__.py create mode 100644 python/lancedb/utils/config.py create mode 100644 python/lancedb/utils/events.py create mode 100644 python/lancedb/utils/general.py create mode 100644 python/lancedb/utils/sentry_log.py create mode 100644 python/tests/test_cli.py create mode 100644 python/tests/test_telemetry.py diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index e6fb8b78..661ff602 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -96,6 +96,8 @@ nav: - Serverless Website Chatbot: examples/serverless_website_chatbot.md - YouTube Transcript Search: examples/youtube_transcript_bot_with_nodejs.md - TransformersJS Embedding Search: examples/transformerjs_embedding_search_nodejs.md + - ⚙️ CLI & Config: cli_config.md + - Basics: basic.md - Guides: - Tables: guides/tables.md 
diff --git a/docs/src/cli_config.md b/docs/src/cli_config.md new file mode 100644 index 00000000..d440ed7d --- /dev/null +++ b/docs/src/cli_config.md @@ -0,0 +1,37 @@ + +## LanceDB CLI +Once lanceDB is installed, you can access the CLI using the `lancedb` command on the console +``` +lancedb +``` +This lists out all the various command-line options available. You can get the usage or help for a particular command +``` +lancedb {command} --help +``` + +## LanceDB config +LanceDB uses a global config file to store certain settings. These settings are configurable using the lanceDB cli. +To view your config settings, you can use: +``` +lancedb config +``` +These config parameters can be tuned using the cli. +``` +lancedb {config_name} --{argument} +``` + +## LanceDB Diagnostics +When enabled, LanceDB will send anonymous events to help us improve LanceDB. These diagnostics are used only for error reporting and no data is collected. Error & stats allow us to automate certain aspects of bug reporting, prioritization of fixes and feature requests. +These diagnostics are enabled by default and can be disabled or re-enabled at any time using the `lancedb diagnostics` command. +Get usage help. 
+``` +lancedb diagnostics --help +``` +Disable diagnostics +``` +lancedb diagnostics --disabled +``` +Enable diagnostics +``` +lancedb diagnostics --enabled +``` \ No newline at end of file diff --git a/python/lancedb/__init__.py b/python/lancedb/__init__.py index 1641f099..28121944 100644 --- a/python/lancedb/__init__.py +++ b/python/lancedb/__init__.py @@ -17,6 +17,7 @@ from typing import Optional from .db import URI, DBConnection, LanceDBConnection from .remote.db import RemoteDBConnection from .schema import vector +from .utils import sentry_log __version__ = importlib.metadata.version("lancedb") diff --git a/python/lancedb/cli/__init__.py b/python/lancedb/cli/__init__.py new file mode 100644 index 00000000..905eebdf --- /dev/null +++ b/python/lancedb/cli/__init__.py @@ -0,0 +1,12 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/lancedb/cli/cli.py b/python/lancedb/cli/cli.py new file mode 100644 index 00000000..5f51148b --- /dev/null +++ b/python/lancedb/cli/cli.py @@ -0,0 +1,46 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@cli.command(help="Show current LanceDB configuration")
def config():
    """Echo every user-configurable setting as ``name (value)``, one per line."""
    # TODO: pretty print as table with colors and formatting
    click.echo("Current LanceDB configuration:")

    # Work on a copy so the live CONFIG object is left untouched.
    visible = CONFIG.copy()
    del visible["uuid"]  # Don't show uuid as it is not configurable

    for key, value in visible.items():
        click.echo(f"{key} ({value})")
LanceTable(Table): accelerator=accelerator, ) self._reset_dataset() + register_event("create_index") def create_fts_index(self, field_names: Union[str, List[str]]): """Create a full-text search index on the table. @@ -514,6 +516,7 @@ class LanceTable(Table): field_names = [field_names] index = create_index(self._get_fts_index_path(), field_names) populate_index(index, self, field_names) + register_event("create_fts_index") def _get_fts_index_path(self): return os.path.join(self._dataset_uri, "_indices", "tantivy") @@ -566,6 +569,7 @@ class LanceTable(Table): ) lance.write_dataset(data, self._dataset_uri, schema=self.schema, mode=mode) self._reset_dataset() + register_event("add") def merge( self, @@ -629,6 +633,7 @@ class LanceTable(Table): other_table, left_on=left_on, right_on=right_on, schema=schema ) self._reset_dataset() + register_event("merge") @cached_property def embedding_functions(self) -> dict: @@ -679,6 +684,7 @@ class LanceTable(Table): and also the "_distance" column which is the distance between the query vector and the returned vector. """ + register_event("search") return LanceQueryBuilder.create( self, query, query_type, vector_column_name=vector_column_name ) @@ -782,6 +788,7 @@ class LanceTable(Table): if data is not None: table.add(data) + register_event("create_table") return table @classmethod @@ -847,6 +854,7 @@ class LanceTable(Table): self.delete(where) self.add(orig_data, mode="append") self._reset_dataset() + register_event("update") def _execute_query(self, query: Query) -> pa.Table: ds = self.to_lance() diff --git a/python/lancedb/utils/__init__.py b/python/lancedb/utils/__init__.py new file mode 100644 index 00000000..d4d5c123 --- /dev/null +++ b/python/lancedb/utils/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def get_user_config_dir(sub_dir="lancedb"):
    """
    Get the user config directory, creating it if necessary.

    The base location depends on the operating system reported by
    ``platform.system()``. If the parent of that location is not writeable,
    ``/tmp/<sub_dir>`` is used when ``/tmp`` is writeable, otherwise
    ``<cwd>/<sub_dir>``.

    Args:
        sub_dir (str): The name of the subdirectory to create.

    Returns:
        (Path): The path to the user config directory.

    Raises:
        ValueError: If the operating system is not Windows, Darwin or Linux.
    """
    # Resolve the OS name once instead of re-querying it for every branch
    system = platform.system()
    home = Path.home()

    # Pick the appropriate config directory for each operating system
    if system == "Windows":
        path = home / "AppData" / "Roaming" / sub_dir
    elif system == "Darwin":
        path = home / "Library" / "Application Support" / sub_dir
    elif system == "Linux":
        path = home / ".config" / sub_dir
    else:
        raise ValueError(f"Unsupported operating system: {system}")

    # GCP and AWS lambda fix, only /tmp is writeable
    if not is_dir_writeable(path.parent):
        # The trailing space keeps the two sentences from running together
        # once the adjacent string literals are concatenated.
        LOGGER.warning(
            f"WARNING ⚠️ user config directory '{path}' is not writeable, defaulting to '/tmp' or CWD. "
            "Alternatively you can define a LANCEDB_CONFIG_DIR environment variable for this path."
        )
        path = (
            Path("/tmp") / sub_dir
            if is_dir_writeable("/tmp")
            else Path().cwd() / sub_dir
        )

    # Create the subdirectory if it does not exist
    path.mkdir(parents=True, exist_ok=True)

    return path
" + f"\nView settings & usage with 'lancedb settings' or at '{self.file}'" + ) + self.reset() + + def load(self): + """Loads settings from the YAML file.""" + super().update(yaml_load(self.file)) + + def save(self): + """Saves the current settings to the YAML file.""" + yaml_save(self.file, dict(self)) + + def update(self, *args, **kwargs): + """Updates a setting value in the current settings.""" + super().update(*args, **kwargs) + self.save() + + def reset(self): + """Resets the settings to default and saves them.""" + self.clear() + self.update(self.defaults) + self.save() diff --git a/python/lancedb/utils/events.py b/python/lancedb/utils/events.py new file mode 100644 index 00000000..cfa0be6c --- /dev/null +++ b/python/lancedb/utils/events.py @@ -0,0 +1,161 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import importlib.metadata +import platform +import random +import sys +import time + +from lancedb.utils import CONFIG +from lancedb.utils.general import TryExcept + +from .general import ( + PLATFORMS, + get_git_origin_url, + is_git_dir, + is_github_actions_ci, + is_online, + is_pip_package, + is_pytest_running, + threaded_request, +) + + +class _Events: + """ + A class for collecting anonymous event analytics. Event analytics are enabled when ``diagnostics=True`` in config and + disabled when ``diagnostics=False``. 
+ + You can enable or disable diagnostics by running ``lancedb diagnostics --enabled`` or ``lancedb diagnostics --disabled``. + + Attributes + ---------- + url : str + The URL to send anonymous events. + rate_limit : float + The rate limit in seconds for sending events. + metadata : dict + A dictionary containing metadata about the environment. + enabled : bool + A flag to enable or disable Events based on certain conditions. + """ + + _instance = None + + url = "https://app.posthog.com/capture/" + headers = {"Content-Type": "application/json"} + api_key = "phc_oENDjGgHtmIDrV6puUiFem2RB4JA8gGWulfdulmMdZP" + # This api-key is write only and is safe to expose in the codebase. + + def __init__(self): + """ + Initializes the Events object with default values for events, rate_limit, and metadata. + """ + self.events = [] # events list + self.max_events = 25 # max events to store in memory + self.rate_limit = 60.0 # rate limit (seconds) + self.time = 0.0 + + if is_git_dir(): + install = "git" + elif is_pip_package(): + install = "pip" + else: + install = "other" + self.metadata = { + "cli": sys.argv[0], + "install": install, + "python": ".".join(platform.python_version_tuple()[:2]), + "version": importlib.metadata.version("lancedb"), + "platforms": PLATFORMS, + "session_id": round(random.random() * 1e15), + # 'engagement_time_msec': 1000 # TODO: In future we might be interested in this metric + } + + TESTS_RUNNING = is_pytest_running() or is_github_actions_ci() + ONLINE = is_online() + self.enabled = ( + CONFIG["diagnostics"] + and not TESTS_RUNNING + and ONLINE + and ( + is_pip_package() + or get_git_origin_url() == "https://github.com/lancedb/lancedb.git" + ) + ) + + def __call__(self, event_name, params={}): + """ + Attempts to add a new event to the events list and send events if the rate limit is reached. + + Args + ---- + event_name : str + The name of the event to be logged. 
+ params : dict, optional + A dictionary of additional parameters to be logged with the event. + """ + ### NOTE: We might need a way to tag a session with a label to check usage from a source. Setting label should be exposed to the user. + if not self.enabled: + return + if ( + len(self.events) < self.max_events + ): # Events list limited to 25 events (drop any events past this) + params.update(self.metadata) + self.events.append( + { + "event": event_name, + "properties": params, + "timestamp": datetime.datetime.now( + tz=datetime.timezone.utc + ).isoformat(), + "distinct_id": CONFIG["uuid"], + } + ) + + # Check rate limit + t = time.time() + if (t - self.time) < self.rate_limit: + return + # Time is over rate limiter, send now + data = { + "api_key": self.api_key, + "distinct_id": CONFIG["uuid"], # posthog needs this to accepts the event + "batch": self.events, + } + + # POST equivalent to requests.post(self.url, json=data). + # threaded request is used to avoid blocking, retries are disabled, and verbose is disabled + # to avoid any possible disruption in the console. + threaded_request( + method="post", + url=self.url, + headers=self.headers, + json=data, + retry=0, + verbose=False, + ) + + # Flush & Reset + self.events = [] + self.time = t + + +@TryExcept(verbose=False) +def register_event(name: str, **kwargs): + if _Events._instance is None: + _Events._instance = _Events() + + _Events._instance(name, **kwargs) diff --git a/python/lancedb/utils/general.py b/python/lancedb/utils/general.py new file mode 100644 index 00000000..14141f8c --- /dev/null +++ b/python/lancedb/utils/general.py @@ -0,0 +1,445 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def is_pip_package(filepath: str = __name__) -> bool:
    """Report whether a module can be located by the import system.

    Parameters
    ----------
    filepath : str, optional
        Despite its name, this is a dotted *module name* handed to
        ``importlib.util.find_spec``. Default is the name of this module.

    Returns
    -------
    bool
        True if a module spec with a non-None origin is found, False
        otherwise (including when the lookup itself fails).
    """
    # ``import importlib`` alone does not guarantee that the ``util``
    # submodule is loaded, so import it explicitly before use.
    import importlib.util

    try:
        spec = importlib.util.find_spec(filepath)
    except (ImportError, ValueError):
        # find_spec raises for dotted names whose parent package is missing
        # and for modules whose __spec__ is unset; neither is locatable.
        return False

    # A usable spec must exist and record where the module came from
    return spec is not None and spec.origin is not None
def is_docker() -> bool:
    """Determine if the script is running inside a Docker container.

    The check looks for the substring ``"docker"`` in ``/proc/self/cgroup``.

    Returns
    -------
    bool
        True if the cgroup file mentions docker, False otherwise, including
        when the file is missing or cannot be read.
    """
    try:
        return "docker" in Path("/proc/self/cgroup").read_text()
    except OSError:
        # Missing file (non-Linux) or unreadable file: treat as "not Docker"
        # rather than raising, since this runs while the module is imported.
        return False
def yaml_save(file="data.yaml", data=None, header=""):
    """Save YAML data to a file.

    The ``data`` mapping passed in is left unmodified; ``Path`` values are
    converted to strings in a copy before dumping.

    Parameters
    ----------
    file : str, optional
        File name, by default 'data.yaml'.
    data : dict, optional
        Data to save in YAML format, by default None (an empty mapping).
    header : str, optional
        YAML header to add, by default "".
    """
    if data is None:
        data = {}
    file = Path(file)
    if not file.parent.exists():
        # Create parent directories if they don't exist
        file.parent.mkdir(parents=True, exist_ok=True)

    # Convert Path objects to strings in a new dict so the caller's mapping
    # is not changed as a side effect of saving.
    serializable = {k: str(v) if isinstance(v, Path) else v for k, v in data.items()}

    # Dump data to file in YAML format
    with open(file, "w", errors="ignore", encoding="utf-8") as f:
        if header:
            f.write(header)
        yaml.safe_dump(serializable, f, sort_keys=False, allow_unicode=True)
class TryExcept(contextlib.ContextDecorator):
    """
    TryExcept context manager.
    Usage: @TryExcept() decorator or 'with TryExcept():' context manager.

    Exceptions derived from ``Exception`` raised inside the managed block are
    suppressed (and optionally logged). ``KeyboardInterrupt``, ``SystemExit``
    and other non-``Exception`` ``BaseException`` types are allowed to
    propagate so that Ctrl-C and interpreter shutdown keep working.
    """

    def __init__(self, msg="", verbose=True):
        """
        Parameters
        ----------
        msg : str, optional
            Custom message to display in case of exception, by default "".
        verbose : bool, optional
            Whether to display the message, by default True.
        """
        self.msg = msg
        self.verbose = verbose

    def __enter__(self):
        pass

    def __exit__(self, exc_type, value, traceback):
        # No exception: nothing to suppress or log.
        if exc_type is None:
            return True
        # Never swallow interpreter-level signals such as Ctrl-C or sys.exit().
        if not issubclass(exc_type, Exception):
            return False
        if self.verbose and value:
            LOGGER.info(f"{self.msg}{': ' if self.msg else ''}{value}")
        return True
+ code : int, optional + An identifier for the request, used for logging purposes, by default -1. + verbose : bool, optional + A flag to determine whether to print out to console or not, by default True. + + Returns + ------- + requests.Response + The HTTP response object. If the request is executed in a separate thread, returns the thread itself. + """ + retry_codes = () # retry only these codes TODO: add codes if needed in future (500, 408) + + @TryExcept(verbose=verbose) + def func(method, url, **kwargs): + """Make HTTP requests with retries and timeouts, with optional progress tracking.""" + response = None + t0 = time.time() + for i in range(retry + 1): + if (time.time() - t0) > timeout: + break + response = requests.request(method, url, **kwargs) + if response.status_code < 300: # good return codes in the 2xx range + break + try: + m = response.json().get("message", "No JSON message.") + except AttributeError: + m = "Unable to read JSON." + if i == 0: + if response.status_code in retry_codes: + m += f" Retrying {retry}x for {timeout}s." if retry else "" + elif response.status_code == 429: # rate limit + m = f"Rate limit reached" + if verbose: + LOGGER.warning(f"{response.status_code} #{code}") + if response.status_code not in retry_codes: + return response + time.sleep(2**i) # exponential standoff + return response + + args = method, url + if thread: + return threading.Thread( + target=func, args=args, kwargs=kwargs, daemon=True + ).start() + else: + return func(*args, **kwargs) diff --git a/python/lancedb/utils/sentry_log.py b/python/lancedb/utils/sentry_log.py new file mode 100644 index 00000000..cdbc695c --- /dev/null +++ b/python/lancedb/utils/sentry_log.py @@ -0,0 +1,112 @@ +# Copyright 2023 LanceDB Developers +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
@TryExcept(verbose=False)
def set_sentry():
    """
    Initialize the Sentry SDK for error tracking and reporting. Only used if the sentry_sdk package is installed and
    ``diagnostics`` is truthy in the LanceDB config.

    Conditions required to send errors (ALL conditions must be met or no errors will be reported):
        - CONFIG["diagnostics"] is truthy
        - pytest is not running and the process is not a GitHub Actions CI runner
        - online environment (is_online() returns True)
        - is_pip_package() returns True
        - sentry_sdk package is installed

    A "not inside a git directory" requirement is present only as a commented-out condition and is
    not enforced.

    The function also configures Sentry SDK to ignore KeyboardInterrupt, FileNotFoundError and
    bdb.BdbQuit exceptions for now.

    Additionally, the function sets custom tags (via ``before_send``) and the user id for Sentry events,
    and raises the 'sentry_sdk' loggers to CRITICAL level.

    Because of the ``TryExcept(verbose=False)`` decorator, failures inside this function are not
    reported to the caller.
    """

    def before_send(event, hint):
        """
        Modify the event before sending it to Sentry based on specific exception types and messages.

        Events whose exception text contains "out of memory" (case-insensitive) are dropped.
        For all other events, ``event["tags"]`` is replaced with install/platform/version tags.

        Args:
            event (dict): The event dictionary containing information about the error.
            hint (dict): A dictionary containing additional information about the error.

        Returns:
            dict: The modified event or None if the event should not be sent to Sentry.
        """
        if "exc_info" in hint:
            exc_type, exc_value, tb = hint["exc_info"]
            if "out of memory" in str(exc_value).lower():
                return None

        if is_git_dir():
            install = "git"
        elif is_pip_package():
            install = "pip"
        else:
            install = "other"

        # NOTE: this assignment overwrites any tags already present on the event
        event["tags"] = {
            "sys_argv": sys.argv[0],
            "sys_argv_name": Path(sys.argv[0]).name,
            "install": install,
            "platforms": PLATFORMS,
            "version": importlib.metadata.version("lancedb"),
        }
        return event

    TESTS_RUNNING = is_pytest_running() or is_github_actions_ci()
    ONLINE = is_online()
    if CONFIG["diagnostics"] and not TESTS_RUNNING and ONLINE and is_pip_package():
        # and not is_git_dir(): # not running inside a git dir. Maybe too restrictive?

        # If sentry_sdk package is not installed then return and do not use Sentry
        try:
            import sentry_sdk  # noqa
        except ImportError:
            return

        sentry_sdk.init(
            dsn="https://c63ef8c64e05d1aa1a96513361f3ca2f@o4505950840946688.ingest.sentry.io/4505950933614592",
            debug=False,
            include_local_variables=False,
            traces_sample_rate=1.0,
            environment="production",  # 'dev' or 'production'
            before_send=before_send,
            ignore_errors=[KeyboardInterrupt, FileNotFoundError, bdb.BdbQuit],
        )
        sentry_sdk.set_user({"id": CONFIG["uuid"]})  # SHA-256 anonymized UUID hash

        # Disable all sentry logging
        for logger in "sentry_sdk", "sentry_sdk.errors":
            logging.getLogger(logger).setLevel(logging.CRITICAL)
["openai", "sentence-transformers", "torch", "pillow", "open-clip"] +[project.scripts] +lancedb = "lancedb.cli.cli:cli" + [build-system] requires = ["setuptools", "wheel"] build-backend = "setuptools.build_meta"
diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py
new file mode 100644
index 00000000..8181ce1f
--- /dev/null
+++ b/python/tests/test_cli.py
@@ -0,0 +1,35 @@
+from click.testing import CliRunner
+
+from lancedb.cli.cli import cli
+from lancedb.utils import CONFIG
+
+
+def test_entry():
+    """Invoking the bare CLI exits cleanly and mentions lancedb in its output."""
+    runner = CliRunner()
+    result = runner.invoke(cli)
+    assert result.exit_code == 0  # Main check
+    assert "lancedb" in result.output.lower()  # lazy check
+
+
+def test_diagnostics():
+    """The --disabled/--enabled flags flip the ``diagnostics`` config entry."""
+    runner = CliRunner()
+    result = runner.invoke(cli, ["diagnostics", "--disabled"])
+    assert result.exit_code == 0  # Main check
+    assert CONFIG["diagnostics"] is False
+
+    result = runner.invoke(cli, ["diagnostics", "--enabled"])
+    assert result.exit_code == 0  # Main check
+    assert CONFIG["diagnostics"] is True
+
+
+def test_config():
+    """Every config key except ``uuid`` shows up in ``lancedb config`` output."""
+    runner = CliRunner()
+    result = runner.invoke(cli, ["config"])
+    assert result.exit_code == 0  # Main check
+    cfg = CONFIG.copy()
+    cfg.pop("uuid")
+    for item in cfg:  # check for keys only as formatting is subject to change
+        assert item in result.output
diff --git a/python/tests/test_telemetry.py b/python/tests/test_telemetry.py
new file mode 100644
index 00000000..256d25c9
--- /dev/null
+++ b/python/tests/test_telemetry.py
@@ -0,0 +1,60 @@
+import json
+
+import pytest
+
+import lancedb
+from lancedb.utils.events import _Events
+
+
+@pytest.fixture(autouse=True)
+def request_log_path(tmp_path):
+    """Return the file that the mocked request writes its JSON payload to."""
+    return tmp_path / "request.json"
+
+
+def mock_register_event(name: str, **kwargs):
+    """Register an event with reporting force-enabled and no exception handling."""
+    if _Events._instance is None:
+        _Events._instance = _Events()
+
+    _Events._instance.enabled = True
+    _Events._instance.rate_limit = 0
+    _Events._instance(name, **kwargs)
+
+
+def test_event_reporting(monkeypatch, request_log_path, tmp_path) -> None:
+    """Creating a table must emit an event batch carrying the expected keys."""
+
+    def mock_request(**kwargs):
+        json_data = kwargs.get("json", {})
+        with open(request_log_path, "w", encoding="utf-8") as f:
+            json.dump(json_data, f)
+
+    # Fresh singleton; monkeypatch restores the old one even if an assert fails.
+    monkeypatch.setattr(_Events, "_instance", None)
+    monkeypatch.setattr(lancedb.table, "register_event", mock_register_event)
+    monkeypatch.setattr(lancedb.utils.events, "threaded_request", mock_request)
+
+    db = lancedb.connect(tmp_path)
+    db.create_table(
+        "test",
+        data=[
+            {"vector": [3.1, 4.1], "item": "foo", "price": 10.0},
+            {"vector": [5.9, 26.5], "item": "bar", "price": 20.0},
+        ],
+        mode="overwrite",
+    )
+
+    assert request_log_path.exists()  # test if event was registered
+
+    with open(request_log_path, "r", encoding="utf-8") as f:
+        json_data = json.load(f)
+
+    # TODO: don't hardcode these here. Instead create a module level json schema in lancedb.utils.events for better evolvability
+    batch_keys = ["api_key", "distinct_id", "batch"]
+    event_keys = ["event", "properties", "timestamp", "distinct_id"]
+    property_keys = ["cli", "install", "platforms", "version", "session_id"]
+
+    assert all(key in json_data for key in batch_keys)
+    assert all(key in json_data["batch"][0] for key in event_keys)
+    assert all(key in json_data["batch"][0]["properties"] for key in property_keys)