diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c09b47e70..95df620ca6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.22.0 + +### Breaking changes +- **Opt-out env semantics**: `DO_NOT_TRACK` and `SCARF_NO_ANALYTICS` now treat any non-empty value (after strip) as opt-out. Previously only the exact string `"true"` opted out. Values like `false`, `0`, or `no` now also disable telemetry. To avoid opting out, unset the variable or leave it empty. + +### Enhancements +- **Telemetry off by default**: The library-load analytics ping is disabled by default. Set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing to restore the previous behavior. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) takes precedence. +- Telemetry ping uses `requests.get(..., params=...)` for correct URL encoding and a single dev/non-dev code path. + ## 0.21.12 - **Add Check for complex documents**: Adds a check for complex documents to avoid pdfminer with a high ratio of vector objects diff --git a/README.md b/README.md index ff0aad2ab6..2b53faf0e5 100644 --- a/README.md +++ b/README.md @@ -268,4 +268,4 @@ Encountered a bug? Please create a new [GitHub issue](https://github.com/Unstruc ## :chart_with_upwards_trend: Analytics -This library includes a very lightweight analytics "ping" when the library is loaded, however you can opt out of this data collection by setting the environment variable `DO_NOT_TRACK=true` before executing any `unstructured` code. To learn more about how we collect and use this data, please read our [Privacy Policy](https://unstructured.io/privacy-policy). +Telemetry is **off by default**. To opt in, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` (or `=1`) before importing `unstructured`. To opt out, set `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` to any non-empty value (e.g. `true`, `1`, `yes`, `false`, `0`—any non-empty string opts out); opt-out takes precedence. Unset the variable or leave it empty if you do not want to opt out. 
See our [Privacy Policy](https://unstructured.io/privacy-policy). diff --git a/scripts/image/test-outbound-connectivity.sh b/scripts/image/test-outbound-connectivity.sh index 82eda50385..d2e33a37bc 100755 --- a/scripts/image/test-outbound-connectivity.sh +++ b/scripts/image/test-outbound-connectivity.sh @@ -48,7 +48,7 @@ fi SCENARIO="${1:-}" if [[ -z "$SCENARIO" ]]; then - echo "Usage: $0 [--cleanup] {baseline|missing-models|offline|offline-and-missing-models}" >&2 + echo "Usage: $0 [--cleanup] {baseline|missing-models|analytics-online-only|offline|offline-and-missing-models}" >&2 exit 1 fi @@ -61,12 +61,16 @@ fi # ---------- scenario‑specific settings -------------------------------- DO_NOT_TRACK="" +UNSTRUCTURED_TELEMETRY_ENABLED="" HF_HUB_OFFLINE="" REMOVE_CACHE=0 case "$SCENARIO" in baseline) ;; missing-models) REMOVE_CACHE=1 ;; -analytics-online-only) HF_HUB_OFFLINE=1 ;; +analytics-online-only) + UNSTRUCTURED_TELEMETRY_ENABLED=1 + HF_HUB_OFFLINE=1 + ;; offline) DO_NOT_TRACK=true HF_HUB_OFFLINE=1 @@ -89,6 +93,7 @@ CID=$(docker run -d --rm --name "sut_${SCENARIO}" \ --network "$NET" \ --cap-add NET_RAW --cap-add NET_ADMIN \ -e DO_NOT_TRACK="$DO_NOT_TRACK" \ + -e UNSTRUCTURED_TELEMETRY_ENABLED="$UNSTRUCTURED_TELEMETRY_ENABLED" \ -e HF_HUB_OFFLINE="$HF_HUB_OFFLINE" \ --entrypoint /bin/sh "$IMAGE" -c "sleep infinity") echo "Container: $CID (scenario $SCENARIO)" @@ -127,8 +132,8 @@ fi docker exec -i -e PYTHONUNBUFFERED=1 "$CID" python - < None: + """Run the analytics ping if enabled by env. 
# Upper bound (seconds) on the ``nvidia-smi`` GPU probe, so a hung driver tool
# cannot stall the best-effort telemetry path indefinitely.
_GPU_PROBE_TIMEOUT_SECONDS = 5.0


def _env_flag_value(name: str) -> str:
    """Return environment variable *name* with whitespace stripped ("" when unset)."""
    return (os.getenv(name) or "").strip()


def _telemetry_opt_out() -> bool:
    """Return True if telemetry is disabled via the environment.

    ``DO_NOT_TRACK`` and ``SCARF_NO_ANALYTICS`` follow the same rule: any
    non-empty value (after stripping whitespace) opts out. That includes
    values such as ``false``, ``0`` or ``no``; only an unset or empty variable
    does not opt out.
    """
    return any(_env_flag_value(name) for name in ("DO_NOT_TRACK", "SCARF_NO_ANALYTICS"))


def _telemetry_opt_in() -> bool:
    """Return True if telemetry is explicitly enabled via the environment.

    Only ``true`` and ``1`` (case-insensitive, whitespace stripped) in
    ``UNSTRUCTURED_TELEMETRY_ENABLED`` opt in.
    """
    return _env_flag_value("UNSTRUCTURED_TELEMETRY_ENABLED").lower() in ("true", "1")


def _command_succeeds(command: list[str], timeout: float) -> bool:
    """Return True if *command* runs and exits with status 0 within *timeout* seconds.

    A missing or non-executable binary (``OSError``), a non-zero exit status
    (``CalledProcessError``) and a timeout (``TimeoutExpired``) all yield
    False; the latter two are ``subprocess.SubprocessError`` subclasses.
    """
    try:
        subprocess.check_output(command, stderr=subprocess.DEVNULL, timeout=timeout)
    except (OSError, subprocess.SubprocessError):
        return False
    return True


def scarf_analytics() -> None:
    """Send a lightweight analytics ping. Off by default.

    The ping is sent only when ``UNSTRUCTURED_TELEMETRY_ENABLED`` is ``true``
    or ``1`` and neither ``DO_NOT_TRACK`` nor ``SCARF_NO_ANALYTICS`` holds a
    non-empty value; opt-out takes precedence over opt-in. The call is
    best-effort: any failure while sending the request is suppressed.
    """
    if _telemetry_opt_out() or not _telemetry_opt_in():
        return

    # GPU presence is inferred from a successful ``nvidia-smi`` run; the probe
    # is time-bounded so a wedged tool cannot block the caller.
    gpu_present = _command_succeeds(["nvidia-smi"], timeout=_GPU_PROBE_TIMEOUT_SECONDS)
    python_version = ".".join(platform.python_version().split(".")[:2])

    # Deliberately swallow every error: telemetry must never break the caller.
    with contextlib.suppress(Exception):
        requests.get(
            "https://packages.unstructured.io/python-telemetry",
            params={
                "version": __version__,
                "platform": platform.system(),
                "python": python_version,
                "arch": platform.machine(),
                # "True"/"False" (capitalized) matches the previously sent gpu value.
                "gpu": str(gpu_present),
                "dev": str("dev" in __version__).lower(),
            },
            timeout=10,
        )