From 9d939d8b727f56397feb94a3deebf7fd58d6f9ac Mon Sep 17 00:00:00 2001 From: Clayton Date: Sun, 8 Mar 2026 21:22:08 -0500 Subject: [PATCH 01/11] feat: make telemetry off by default --- CHANGELOG.md | 5 +++ README.md | 2 +- test_unstructured/test_utils.py | 45 +++++++++++++++++++++ unstructured/logger.py | 4 +- unstructured/utils.py | 72 +++++++++++++++++++-------------- 5 files changed, 94 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c09b47e70..634b747979 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.21.13 + +### Enhancements +- **Telemetry off by default (fixes #3940)**: Analytics/telemetry is now disabled by default. Set `UNSTRUCTURED_TELEMETRY_ENABLED=true` to opt in. Opt-out via `DO_NOT_TRACK=true` or `SCARF_NO_ANALYTICS=true` is still supported and takes precedence. + ## 0.21.12 - **Add Check for complex documents**: Adds a check for complex documents to avoid pdfminer with a high ratio of vector objects diff --git a/README.md b/README.md index ff0aad2ab6..dd51659379 100644 --- a/README.md +++ b/README.md @@ -268,4 +268,4 @@ Encountered a bug? Please create a new [GitHub issue](https://github.com/Unstruc ## :chart_with_upwards_trend: Analytics -This library includes a very lightweight analytics "ping" when the library is loaded, however you can opt out of this data collection by setting the environment variable `DO_NOT_TRACK=true` before executing any `unstructured` code. To learn more about how we collect and use this data, please read our [Privacy Policy](https://unstructured.io/privacy-policy). +Telemetry is **off by default**. To opt in to the lightweight analytics ping when the library is loaded, set the environment variable `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing `unstructured`. You can also opt out explicitly by setting `DO_NOT_TRACK=true` or `SCARF_NO_ANALYTICS=true`. To learn more about how we collect and use this data, please read our [Privacy Policy](https://unstructured.io/privacy-policy). diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py index abde59e22c..fd1af0273a 100644 --- a/test_unstructured/test_utils.py +++ b/test_unstructured/test_utils.py @@ -2,6 +2,7 @@ import json import os +from unittest.mock import patch import pytest @@ -383,3 +384,47 @@ def it_keeps_first_orphan_in_none_group_when_assign_orphans_is_true(self): assert list(result.keys()) == [None, "parent_A"] assert [e.text for e in result[None]] == ["First orphan"] assert [e.text for e in result["parent_A"]] == ["Title 1", "Orphan 2"] + + +class DescribeScarfAnalytics: + """Tests for scarf_analytics (telemetry off by default, opt-in only).""" + + def it_does_not_send_telemetry_by_default(self, monkeypatch): + monkeypatch.delenv("UNSTRUCTURED_TELEMETRY_ENABLED", raising=False) + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + with patch("unstructured.utils.requests.get") as mock_get: + utils.scarf_analytics() + mock_get.assert_not_called() + + def it_sends_telemetry_when_opt_in_is_set(self, monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + with patch("unstructured.utils.requests.get") as mock_get: + utils.scarf_analytics() + mock_get.assert_called_once() + call_url = mock_get.call_args[0][0] + assert "python-telemetry" in call_url and "version=" in call_url + + def it_accepts_opt_in_value_1(self, monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "1") + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + with patch("unstructured.utils.requests.get") as mock_get: + utils.scarf_analytics() + mock_get.assert_called_once() + + def it_does_not_send_when_do_not_track_is_set_even_if_opt_in(self, monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") + monkeypatch.setenv("DO_NOT_TRACK", "true") + with patch("unstructured.utils.requests.get") as mock_get: + utils.scarf_analytics() + mock_get.assert_not_called() + + def it_does_not_send_when_scarf_no_analytics_is_set_even_if_opt_in(self, monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") + monkeypatch.setenv("SCARF_NO_ANALYTICS", "true") + with patch("unstructured.utils.requests.get") as mock_get: + utils.scarf_analytics() + mock_get.assert_not_called() diff --git a/unstructured/logger.py b/unstructured/logger.py index 93ba622676..acac5b9daf 100644 --- a/unstructured/logger.py +++ b/unstructured/logger.py @@ -16,8 +16,8 @@ def detail(self, message, *args, **kws): self._log(DETAIL, message, args, **kws) -# Note(Trevor,Crag): to opt out of scarf analytics, set the environment variable: -# SCARF_NO_ANALYTICS=true. See the README for more info. +# Telemetry is off by default. To opt in, set UNSTRUCTURED_TELEMETRY_ENABLED=true. +# To opt out when enabled elsewhere, set SCARF_NO_ANALYTICS=true or DO_NOT_TRACK=true. See the README. scarf_analytics() # Add the custom log method to the logging.Logger class diff --git a/unstructured/utils.py b/unstructured/utils.py index 7674cab6fe..19ee326367 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -270,6 +270,10 @@ def only(it: Iterable[Any]) -> Any: def scarf_analytics(): + """Send a lightweight analytics ping. Off by default; set UNSTRUCTURED_TELEMETRY_ENABLED=true to opt in. + + Opt-out env vars (DO_NOT_TRACK, SCARF_NO_ANALYTICS) are always respected and take precedence. + """ try: subprocess.check_output("nvidia-smi") gpu_present = True @@ -278,38 +282,44 @@ def scarf_analytics(): python_version = ".".join(platform.python_version().split(".")[:2]) + # Telemetry is off by default. Only send when user explicitly opts in via UNSTRUCTURED_TELEMETRY_ENABLED. + # Opt-out env vars always take precedence. + opt_out = os.getenv("SCARF_NO_ANALYTICS") == "true" or os.getenv("DO_NOT_TRACK") == "true" + opt_in = os.getenv("UNSTRUCTURED_TELEMETRY_ENABLED", "").lower() in ("true", "1") + if opt_out or not opt_in: + return + try: - if os.getenv("SCARF_NO_ANALYTICS") != "true" and os.getenv("DO_NOT_TRACK") != "true": - if "dev" in __version__: - requests.get( - "https://packages.unstructured.io/python-telemetry?version=" - + __version__ - + "&platform=" - + platform.system() - + "&python" - + python_version - + "&arch=" - + platform.machine() - + "&gpu=" - + str(gpu_present) - + "&dev=true", - timeout=10, - ) - else: - requests.get( - "https://packages.unstructured.io/python-telemetry?version=" - + __version__ - + "&platform=" - + platform.system() - + "&python" - + python_version - + "&arch=" - + platform.machine() - + "&gpu=" - + str(gpu_present) - + "&dev=false", - timeout=10, - ) + if "dev" in __version__: + requests.get( + "https://packages.unstructured.io/python-telemetry?version=" + + __version__ + + "&platform=" + + platform.system() + + "&python" + + python_version + + "&arch=" + + platform.machine() + + "&gpu=" + + str(gpu_present) + + "&dev=true", + timeout=10, + ) + else: + requests.get( + "https://packages.unstructured.io/python-telemetry?version=" + + __version__ + + "&platform=" + + platform.system() + + "&python" + + python_version + + "&arch=" + + platform.machine() + + "&gpu=" + + str(gpu_present) + + "&dev=false", + timeout=10, + ) except Exception: pass From d2e202af7d41b40302f19d13084cc758f4776c49 Mon Sep 17 00:00:00 2001 From: Clayton Date: Sun, 8 Mar 2026 21:27:24 -0500 Subject: [PATCH 02/11] fix: lint --- test_unstructured/test_utils.py | 3 ++- unstructured/logger.py | 3 ++- unstructured/utils.py | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py index fd1af0273a..d6a5e33c98 100644 --- a/test_unstructured/test_utils.py +++ b/test_unstructured/test_utils.py @@ -405,7 +405,8 @@ def it_sends_telemetry_when_opt_in_is_set(self, monkeypatch): utils.scarf_analytics() mock_get.assert_called_once() call_url = mock_get.call_args[0][0] - assert "python-telemetry" in call_url and "version=" in call_url + assert "python-telemetry" in call_url + assert "version=" in call_url def it_accepts_opt_in_value_1(self, monkeypatch): monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "1") diff --git a/unstructured/logger.py b/unstructured/logger.py index acac5b9daf..fa2948908c 100644 --- a/unstructured/logger.py +++ b/unstructured/logger.py @@ -17,7 +17,8 @@ def detail(self, message, *args, **kws): # Telemetry is off by default. To opt in, set UNSTRUCTURED_TELEMETRY_ENABLED=true. -# To opt out when enabled elsewhere, set SCARF_NO_ANALYTICS=true or DO_NOT_TRACK=true. See the README. +# To opt out when enabled elsewhere, set SCARF_NO_ANALYTICS=true or DO_NOT_TRACK=true. +# See the README. scarf_analytics() # Add the custom log method to the logging.Logger class diff --git a/unstructured/utils.py b/unstructured/utils.py index 19ee326367..d7b75b1b2c 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -270,8 +270,9 @@ def only(it: Iterable[Any]) -> Any: def scarf_analytics(): - """Send a lightweight analytics ping. Off by default; set UNSTRUCTURED_TELEMETRY_ENABLED=true to opt in. + """Send a lightweight analytics ping. Off by default. + Set UNSTRUCTURED_TELEMETRY_ENABLED=true to opt in. Opt-out env vars (DO_NOT_TRACK, SCARF_NO_ANALYTICS) are always respected and take precedence. """ try: @@ -282,8 +283,8 @@ def scarf_analytics(): python_version = ".".join(platform.python_version().split(".")[:2]) - # Telemetry is off by default. Only send when user explicitly opts in via UNSTRUCTURED_TELEMETRY_ENABLED. - # Opt-out env vars always take precedence. + # Telemetry is off by default. Only send when user explicitly opts in via + # UNSTRUCTURED_TELEMETRY_ENABLED. Opt-out env vars always take precedence. opt_out = os.getenv("SCARF_NO_ANALYTICS") == "true" or os.getenv("DO_NOT_TRACK") == "true" opt_in = os.getenv("UNSTRUCTURED_TELEMETRY_ENABLED", "").lower() in ("true", "1") if opt_out or not opt_in: From 5e3e750c40166c8b08c3faa68134de8ce4b298ba Mon Sep 17 00:00:00 2001 From: Clayton Date: Sun, 8 Mar 2026 21:30:20 -0500 Subject: [PATCH 03/11] fix: version --- unstructured/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1a6f7540fc..8e21a799c3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.21.12" # pragma: no cover +__version__ = "0.21.13" # pragma: no cover From a1fbfc288f60286d6bdc62aabd0a9d230aeb3e06 Mon Sep 17 00:00:00 2001 From: Clayton Date: Sun, 8 Mar 2026 21:48:39 -0500 Subject: [PATCH 04/11] fix: test --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 634b747979..603fd51bb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ ## 0.21.13 ### Enhancements -- **Telemetry off by default (fixes #3940)**: Analytics/telemetry is now disabled by default. Set `UNSTRUCTURED_TELEMETRY_ENABLED=true` to opt in. Opt-out via `DO_NOT_TRACK=true` or `SCARF_NO_ANALYTICS=true` is still supported and takes precedence. +- **Telemetry off by default**: Analytics/telemetry is now disabled by default. Set `UNSTRUCTURED_TELEMETRY_ENABLED=true` to opt in. Opt-out via `DO_NOT_TRACK=true` or `SCARF_NO_ANALYTICS=true` is still supported and takes precedence. ## 0.21.12 - **Add Check for complex documents**: Adds a check for complex documents to avoid pdfminer with a high ratio of vector objects From 2c6cba11e686fb09d7ae186596d7f0a5636689cf Mon Sep 17 00:00:00 2001 From: Clayton Date: Tue, 10 Mar 2026 03:40:15 -0500 Subject: [PATCH 05/11] fix: update --- CHANGELOG.md | 7 +- test_unstructured/test_utils.py | 128 ++++++++++++++++++++++++++------ unstructured/utils.py | 68 +++++++---------- 3 files changed, 137 insertions(+), 66 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 603fd51bb1..dce0295344 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,10 @@ -## 0.21.13 +## 0.22.0 + +### Breaking changes / migration +- **Telemetry off by default (fixes #3940)**: Analytics/telemetry is now **disabled by default**. Upgrading will stop sending the library-load ping unless you opt in. To restore the previous behavior, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing `unstructured`. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) is unchanged and takes precedence. ### Enhancements -- **Telemetry off by default**: Analytics/telemetry is now disabled by default. Set `UNSTRUCTURED_TELEMETRY_ENABLED=true` to opt in. Opt-out via `DO_NOT_TRACK=true` or `SCARF_NO_ANALYTICS=true` is still supported and takes precedence. +- Telemetry ping now uses `requests.get(..., params=...)` for correct URL encoding and a single code path for dev/non-dev. ## 0.21.12 - **Add Check for complex documents**: Adds a check for complex documents to avoid pdfminer with a high ratio of vector objects diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py index d6a5e33c98..178b68bbb4 100644 --- a/test_unstructured/test_utils.py +++ b/test_unstructured/test_utils.py @@ -2,9 +2,11 @@ import json import os -from unittest.mock import patch +import platform +from unittest.mock import Mock import pytest +import requests from unstructured import utils from unstructured.documents.coordinates import PixelSpace @@ -393,39 +395,121 @@ def it_does_not_send_telemetry_by_default(self, monkeypatch): monkeypatch.delenv("UNSTRUCTURED_TELEMETRY_ENABLED", raising=False) monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) monkeypatch.delenv("DO_NOT_TRACK", raising=False) - with patch("unstructured.utils.requests.get") as mock_get: - utils.scarf_analytics() + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + utils.scarf_analytics() mock_get.assert_not_called() - def it_sends_telemetry_when_opt_in_is_set(self, monkeypatch): - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - monkeypatch.delenv("DO_NOT_TRACK", raising=False) - with patch("unstructured.utils.requests.get") as mock_get: - utils.scarf_analytics() - mock_get.assert_called_once() - call_url = mock_get.call_args[0][0] - assert "python-telemetry" in call_url - assert "version=" in call_url - - def it_accepts_opt_in_value_1(self, monkeypatch): - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "1") + @pytest.mark.parametrize("opt_in_value", ["true", "True", "TRUE", "1"]) + def it_sends_telemetry_when_opt_in_is_set(self, monkeypatch, opt_in_value): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", opt_in_value) monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) monkeypatch.delenv("DO_NOT_TRACK", raising=False) - with patch("unstructured.utils.requests.get") as mock_get: - utils.scarf_analytics() + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + utils.scarf_analytics() mock_get.assert_called_once() + call_args = mock_get.call_args + assert call_args[0][0] == "https://packages.unstructured.io/python-telemetry" + params = call_args[1]["params"] + assert set(params.keys()) == {"version", "platform", "python", "arch", "gpu", "dev"} + assert call_args[1]["timeout"] == 10 def it_does_not_send_when_do_not_track_is_set_even_if_opt_in(self, monkeypatch): monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") monkeypatch.setenv("DO_NOT_TRACK", "true") - with patch("unstructured.utils.requests.get") as mock_get: - utils.scarf_analytics() + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + utils.scarf_analytics() mock_get.assert_not_called() def it_does_not_send_when_scarf_no_analytics_is_set_even_if_opt_in(self, monkeypatch): monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") monkeypatch.setenv("SCARF_NO_ANALYTICS", "true") - with patch("unstructured.utils.requests.get") as mock_get: - utils.scarf_analytics() + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + utils.scarf_analytics() + mock_get.assert_not_called() + + def it_does_not_send_when_do_not_track_uppercase(self, monkeypatch): + """Opt-out honors any non-empty value (DNT standard); DO_NOT_TRACK=TRUE opts out.""" + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") + monkeypatch.setenv("DO_NOT_TRACK", "TRUE") + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + utils.scarf_analytics() + mock_get.assert_not_called() + + def it_does_not_send_when_do_not_track_is_1(self, monkeypatch): + """Opt-out: DO_NOT_TRACK=1 (non-empty) opts out per DNT standard.""" + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") + monkeypatch.setenv("DO_NOT_TRACK", "1") + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + utils.scarf_analytics() + mock_get.assert_not_called() + + def it_sends_with_dev_true_when_version_contains_dev(self, monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + monkeypatch.setattr("unstructured.utils.__version__", "1.2.3.dev0") + utils.scarf_analytics() + mock_get.assert_called_once() + params = mock_get.call_args[1]["params"] + assert params["dev"] == "true" + assert params["version"] == "1.2.3.dev0" + assert params["platform"] == platform.system() + assert params["arch"] == platform.machine() + assert mock_get.call_args[1]["timeout"] == 10 + + def it_sends_with_dev_false_when_version_does_not_contain_dev(self, monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + monkeypatch.setattr("unstructured.utils.__version__", "1.2.3") + utils.scarf_analytics() + mock_get.assert_called_once() + params = mock_get.call_args[1]["params"] + assert params["dev"] == "false" + assert params["version"] == "1.2.3" + assert mock_get.call_args[1]["timeout"] == 10 + + def it_handles_requests_exception_gracefully(self, monkeypatch): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + monkeypatch.setattr( + "unstructured.utils.requests.get", + Mock(side_effect=requests.RequestException("network error")), + ) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + utils.scarf_analytics() # does not raise + + @pytest.mark.parametrize("value", ["false", "0", "yes", "on", "FALSE", "NO"]) + def it_does_not_send_when_opt_in_is_explicit_false_or_other(self, monkeypatch, value): + """Only 'true' and '1' opt in; false/0/yes/on etc. must not enable telemetry.""" + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", value) + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + mock_get = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + utils.scarf_analytics() mock_get.assert_not_called() diff --git a/unstructured/utils.py b/unstructured/utils.py index d7b75b1b2c..099a48865c 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import contextlib import functools import importlib import inspect @@ -275,54 +276,37 @@ def scarf_analytics(): Set UNSTRUCTURED_TELEMETRY_ENABLED=true to opt in. Opt-out env vars (DO_NOT_TRACK, SCARF_NO_ANALYTICS) are always respected and take precedence. """ + opt_out = os.getenv("SCARF_NO_ANALYTICS", "").strip().lower() in ("true", "1") or bool( + (os.getenv("DO_NOT_TRACK") or "").strip() + ) + opt_in = (os.getenv("UNSTRUCTURED_TELEMETRY_ENABLED") or "").strip().lower() in ( + "true", + "1", + ) + if opt_out or not opt_in: + return + try: - subprocess.check_output("nvidia-smi") + subprocess.check_output(["nvidia-smi"], stderr=subprocess.DEVNULL) gpu_present = True - except Exception: + except (FileNotFoundError, subprocess.CalledProcessError): gpu_present = False python_version = ".".join(platform.python_version().split(".")[:2]) - # Telemetry is off by default. Only send when user explicitly opts in via - # UNSTRUCTURED_TELEMETRY_ENABLED. Opt-out env vars always take precedence. - opt_out = os.getenv("SCARF_NO_ANALYTICS") == "true" or os.getenv("DO_NOT_TRACK") == "true" - opt_in = os.getenv("UNSTRUCTURED_TELEMETRY_ENABLED", "").lower() in ("true", "1") - if opt_out or not opt_in: - return - - try: - if "dev" in __version__: - requests.get( - "https://packages.unstructured.io/python-telemetry?version=" - + __version__ - + "&platform=" - + platform.system() - + "&python" - + python_version - + "&arch=" - + platform.machine() - + "&gpu=" - + str(gpu_present) - + "&dev=true", - timeout=10, - ) - else: - requests.get( - "https://packages.unstructured.io/python-telemetry?version=" - + __version__ - + "&platform=" - + platform.system() - + "&python" - + python_version - + "&arch=" - + platform.machine() - + "&gpu=" - + str(gpu_present) - + "&dev=false", - timeout=10, - ) - except Exception: - pass + with contextlib.suppress(Exception): + requests.get( + "https://packages.unstructured.io/python-telemetry", + params={ + "version": __version__, + "platform": platform.system(), + "python": python_version, + "arch": platform.machine(), + "gpu": str(gpu_present), + "dev": str("dev" in __version__).lower(), + }, + timeout=10, + ) def ngrams(s: list[str], n: int) -> list[tuple[str, ...]]: From 83b9444bf849cc0b9abd2eaaa0c5634753dd832f Mon Sep 17 00:00:00 2001 From: Clayton Date: Tue, 10 Mar 2026 03:43:58 -0500 Subject: [PATCH 06/11] fix: change version --- unstructured/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 8e21a799c3..d4703816f2 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.21.13" # pragma: no cover +__version__ = "0.22.0" # pragma: no cover From fdfa03da5347830f34de246be489d56dd7c3206f Mon Sep 17 00:00:00 2001 From: Clayton Date: Tue, 10 Mar 2026 04:04:01 -0500 Subject: [PATCH 07/11] fix: update --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dce0295344..210acbf030 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,8 @@ ## 0.22.0 -### Breaking changes / migration -- **Telemetry off by default (fixes #3940)**: Analytics/telemetry is now **disabled by default**. Upgrading will stop sending the library-load ping unless you opt in. To restore the previous behavior, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing `unstructured`. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) is unchanged and takes precedence. ### Enhancements +- **Telemetry off by default**: Analytics/telemetry is now **disabled by default**. Upgrading will stop sending the library-load ping unless you opt in. To restore the previous behavior, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing `unstructured`. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) is unchanged and takes precedence. - Telemetry ping now uses `requests.get(..., params=...)` for correct URL encoding and a single code path for dev/non-dev. ## 0.21.12 From 012b47fdae9896ee2dcdf373de8867563b070314 Mon Sep 17 00:00:00 2001 From: Clayton Date: Fri, 13 Mar 2026 00:37:42 -0500 Subject: [PATCH 08/11] fix: update --- README.md | 2 +- test_unstructured/test_utils.py | 274 ++++++++++++++++++++------------ unstructured/__init__.py | 4 + unstructured/logger.py | 7 - unstructured/telemetry.py | 8 + unstructured/utils.py | 30 ++-- 6 files changed, 209 insertions(+), 116 deletions(-) create mode 100644 unstructured/telemetry.py diff --git a/README.md b/README.md index dd51659379..ccbfcab661 100644 --- a/README.md +++ b/README.md @@ -268,4 +268,4 @@ Encountered a bug? Please create a new [GitHub issue](https://github.com/Unstruc ## :chart_with_upwards_trend: Analytics -Telemetry is **off by default**. To opt in to the lightweight analytics ping when the library is loaded, set the environment variable `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing `unstructured`. You can also opt out explicitly by setting `DO_NOT_TRACK=true` or `SCARF_NO_ANALYTICS=true`. To learn more about how we collect and use this data, please read our [Privacy Policy](https://unstructured.io/privacy-policy). +Telemetry is **off by default**. To opt in, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` (or `=1`) before importing `unstructured`. To opt out, set `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` to any non-empty value (e.g. `true`, `1`, `yes`); opt-out takes precedence. See our [Privacy Policy](https://unstructured.io/privacy-policy). diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py index 178b68bbb4..c2cfca5c5d 100644 --- a/test_unstructured/test_utils.py +++ b/test_unstructured/test_utils.py @@ -3,6 +3,9 @@ import json import os import platform +import subprocess +import sys +from pathlib import Path from unittest.mock import Mock import pytest @@ -388,128 +391,201 @@ def it_keeps_first_orphan_in_none_group_when_assign_orphans_is_true(self): assert [e.text for e in result["parent_A"]] == ["Title 1", "Orphan 2"] +@pytest.fixture +def telemetry_mocks(monkeypatch): + """Clear telemetry env and patch requests.get + subprocess.check_output. + + Returns (mock_get, mock_subprocess). Use for both send and no-send tests so + we can assert network and subprocess side effects. + """ + monkeypatch.delenv("UNSTRUCTURED_TELEMETRY_ENABLED", raising=False) + monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + monkeypatch.delenv("DO_NOT_TRACK", raising=False) + mock_get = Mock() + mock_subprocess = Mock() + monkeypatch.setattr("unstructured.utils.requests.get", mock_get) + monkeypatch.setattr("unstructured.utils.subprocess.check_output", mock_subprocess) + return mock_get, mock_subprocess + + +def _apply_telemetry_env(monkeypatch, env_overrides): + """Set env vars from dict; keys are env var names, values are str or None (delenv).""" + for key, value in env_overrides.items(): + if value is None: + monkeypatch.delenv(key, raising=False) + else: + monkeypatch.setenv(key, value) + + class DescribeScarfAnalytics: """Tests for scarf_analytics (telemetry off by default, opt-in only).""" - def it_does_not_send_telemetry_by_default(self, monkeypatch): - monkeypatch.delenv("UNSTRUCTURED_TELEMETRY_ENABLED", raising=False) + def it_telemetry_opt_out_any_non_empty_for_both_vars(self, monkeypatch): + """Contract: DO_NOT_TRACK and SCARF_NO_ANALYTICS both opt out on any non-empty value.""" + monkeypatch.delenv("DO_NOT_TRACK", raising=False) monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) + assert utils._telemetry_opt_out() is False + monkeypatch.setenv("DO_NOT_TRACK", "yes") + assert utils._telemetry_opt_out() is True monkeypatch.delenv("DO_NOT_TRACK", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + monkeypatch.setenv("SCARF_NO_ANALYTICS", "on") + assert utils._telemetry_opt_out() is True + + def it_telemetry_opt_in_only_true_or_1(self, monkeypatch): + """Contract: only UNSTRUCTURED_TELEMETRY_ENABLED in ('true','1') opts in.""" + monkeypatch.delenv("UNSTRUCTURED_TELEMETRY_ENABLED", raising=False) + assert utils._telemetry_opt_in() is False + for val in ("true", "1", "True", "TRUE"): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", val) + assert utils._telemetry_opt_in() is True + for val in ("false", "0", "yes", ""): + monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", val) + assert utils._telemetry_opt_in() is False + + @pytest.mark.parametrize( + "env_overrides", + [ + {}, + {"DO_NOT_TRACK": "true"}, + {"DO_NOT_TRACK": "1"}, + {"DO_NOT_TRACK": "TRUE"}, + {"SCARF_NO_ANALYTICS": "true"}, + {"SCARF_NO_ANALYTICS": "yes"}, + {"SCARF_NO_ANALYTICS": "on"}, + {"SCARF_NO_ANALYTICS": "1"}, + {"SCARF_NO_ANALYTICS": "TRUE"}, + {"SCARF_NO_ANALYTICS": " true "}, + {"UNSTRUCTURED_TELEMETRY_ENABLED": "false"}, + {"UNSTRUCTURED_TELEMETRY_ENABLED": "0"}, + {"UNSTRUCTURED_TELEMETRY_ENABLED": "yes"}, + {"UNSTRUCTURED_TELEMETRY_ENABLED": "FALSE"}, + {"UNSTRUCTURED_TELEMETRY_ENABLED": "true", "DO_NOT_TRACK": "true"}, + {"UNSTRUCTURED_TELEMETRY_ENABLED": "true", "SCARF_NO_ANALYTICS": "on"}, + ], + ids=[ + "default_no_opt_in", + "DO_NOT_TRACK=true", + "DO_NOT_TRACK=1", + "DO_NOT_TRACK=TRUE", + "SCARF_NO_ANALYTICS=true", + "SCARF_NO_ANALYTICS=yes", + "SCARF_NO_ANALYTICS=on", + "SCARF_NO_ANALYTICS=1", + "SCARF_NO_ANALYTICS=TRUE", + "SCARF_NO_ANALYTICS=whitespace", + "opt_in=false", + "opt_in=0", + "opt_in=yes", + "opt_in=FALSE", + "opt_in_true_but_DO_NOT_TRACK", + "opt_in_true_but_SCARF_NO_ANALYTICS", + ], + ) + def it_does_not_send_telemetry_when_disabled_or_opted_out( + self, monkeypatch, telemetry_mocks, env_overrides + ): + """No network or subprocess when telemetry disabled or opt-out set.""" + mock_get, mock_subprocess = telemetry_mocks + _apply_telemetry_env(monkeypatch, env_overrides) utils.scarf_analytics() mock_get.assert_not_called() + mock_subprocess.assert_not_called() @pytest.mark.parametrize("opt_in_value", ["true", "True", "TRUE", "1"]) - def it_sends_telemetry_when_opt_in_is_set(self, monkeypatch, opt_in_value): - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", opt_in_value) - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - monkeypatch.delenv("DO_NOT_TRACK", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + def it_sends_telemetry_when_opt_in_is_set(self, monkeypatch, telemetry_mocks, opt_in_value): + mock_get, mock_subprocess = telemetry_mocks + _apply_telemetry_env(monkeypatch, {"UNSTRUCTURED_TELEMETRY_ENABLED": opt_in_value}) utils.scarf_analytics() mock_get.assert_called_once() + mock_subprocess.assert_called_once_with(["nvidia-smi"], stderr=subprocess.DEVNULL) call_args = mock_get.call_args assert call_args[0][0] == "https://packages.unstructured.io/python-telemetry" params = call_args[1]["params"] assert set(params.keys()) == {"version", "platform", "python", "arch", "gpu", "dev"} assert call_args[1]["timeout"] == 10 - def it_does_not_send_when_do_not_track_is_set_even_if_opt_in(self, monkeypatch): - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") - monkeypatch.setenv("DO_NOT_TRACK", "true") - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) - utils.scarf_analytics() - mock_get.assert_not_called() - - def it_does_not_send_when_scarf_no_analytics_is_set_even_if_opt_in(self, monkeypatch): - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") - monkeypatch.setenv("SCARF_NO_ANALYTICS", "true") - monkeypatch.delenv("DO_NOT_TRACK", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) - utils.scarf_analytics() - mock_get.assert_not_called() - - def it_does_not_send_when_do_not_track_uppercase(self, monkeypatch): - """Opt-out honors any non-empty value (DNT standard); DO_NOT_TRACK=TRUE opts out.""" - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") - monkeypatch.setenv("DO_NOT_TRACK", "TRUE") - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) - utils.scarf_analytics() - mock_get.assert_not_called() - - def it_does_not_send_when_do_not_track_is_1(self, monkeypatch): - """Opt-out: DO_NOT_TRACK=1 (non-empty) opts out per DNT standard.""" - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") - monkeypatch.setenv("DO_NOT_TRACK", "1") - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) - utils.scarf_analytics() - mock_get.assert_not_called() - - def it_sends_with_dev_true_when_version_contains_dev(self, monkeypatch): - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - monkeypatch.delenv("DO_NOT_TRACK", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) - monkeypatch.setattr("unstructured.utils.__version__", "1.2.3.dev0") + @pytest.mark.parametrize( + "version_val,expected_dev", + [("1.2.3.dev0", "true"), ("1.2.3", "false")], + ids=["dev_version", "release_version"], + ) + def it_sends_telemetry_with_correct_dev_param( + self, monkeypatch, telemetry_mocks, version_val, expected_dev + ): + mock_get, mock_subprocess = telemetry_mocks + _apply_telemetry_env(monkeypatch, {"UNSTRUCTURED_TELEMETRY_ENABLED": "true"}) + monkeypatch.setattr("unstructured.utils.__version__", version_val) utils.scarf_analytics() mock_get.assert_called_once() + mock_subprocess.assert_called_once() params = mock_get.call_args[1]["params"] - assert params["dev"] == "true" - assert params["version"] == "1.2.3.dev0" + assert params["dev"] == expected_dev + assert params["version"] == version_val assert params["platform"] == platform.system() assert params["arch"] == platform.machine() assert mock_get.call_args[1]["timeout"] == 10 - def it_sends_with_dev_false_when_version_does_not_contain_dev(self, monkeypatch): - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - monkeypatch.delenv("DO_NOT_TRACK", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) - monkeypatch.setattr("unstructured.utils.__version__", "1.2.3") - utils.scarf_analytics() + def it_handles_requests_exception_gracefully(self, monkeypatch, telemetry_mocks): + mock_get, mock_subprocess = telemetry_mocks + mock_get.side_effect = requests.RequestException("network error") + _apply_telemetry_env(monkeypatch, {"UNSTRUCTURED_TELEMETRY_ENABLED": "true"}) + utils.scarf_analytics() # does not raise mock_get.assert_called_once() - params = mock_get.call_args[1]["params"] - assert params["dev"] == "false" - assert params["version"] == "1.2.3" - assert mock_get.call_args[1]["timeout"] == 10 - - def it_handles_requests_exception_gracefully(self, monkeypatch): - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", "true") - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - monkeypatch.delenv("DO_NOT_TRACK", raising=False) - monkeypatch.setattr( - "unstructured.utils.requests.get", - Mock(side_effect=requests.RequestException("network error")), - ) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) + mock_subprocess.assert_called_once() + assert mock_get.call_args[0][0] == "https://packages.unstructured.io/python-telemetry" + assert "version" in mock_get.call_args[1]["params"] + + @pytest.mark.parametrize( + "exc", + [OSError(), PermissionError("nvidia-smi denied")], + ids=["OSError", "PermissionError"], + ) + def it_handles_nvidia_smi_failure_gracefully(self, monkeypatch, telemetry_mocks, exc): + """nvidia-smi probe failures must not propagate; telemetry still sends with gpu=False.""" + mock_get, mock_subprocess = telemetry_mocks + mock_subprocess.side_effect = exc + _apply_telemetry_env(monkeypatch, {"UNSTRUCTURED_TELEMETRY_ENABLED": "true"}) utils.scarf_analytics() # does not raise - - @pytest.mark.parametrize("value", ["false", "0", "yes", "on", "FALSE", "NO"]) - def it_does_not_send_when_opt_in_is_explicit_false_or_other(self, monkeypatch, value): - """Only 'true' and '1' opt in; false/0/yes/on etc. must not enable telemetry.""" - monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", value) - monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False) - monkeypatch.delenv("DO_NOT_TRACK", raising=False) - mock_get = Mock() - monkeypatch.setattr("unstructured.utils.requests.get", mock_get) - monkeypatch.setattr("unstructured.utils.subprocess.check_output", Mock()) - utils.scarf_analytics() - mock_get.assert_not_called() + mock_get.assert_called_once() + assert mock_get.call_args[1]["params"]["gpu"] == "False" + mock_subprocess.assert_called_once_with(["nvidia-smi"], stderr=subprocess.DEVNULL) + + def it_import_unstructured_succeeds_with_opt_out(self): + """Import path with opt-out env does not crash (integration-style).""" + project_root = Path(__file__).resolve().parent.parent + env = { + **os.environ, + "DO_NOT_TRACK": "1", + "UNSTRUCTURED_TELEMETRY_ENABLED": "", + "PYTHONPATH": str(project_root), + } + result = subprocess.run( + [sys.executable, "-c", "import unstructured; print('ok')"], + env=env, + cwd=project_root, + capture_output=True, + text=True, + timeout=30, + ) + assert result.returncode == 0, result.stderr or result.stdout + assert "ok" in result.stdout + + def it_import_unstructured_succeeds_with_opt_in(self): + """Import path with opt-in env does not crash (integration-style).""" + project_root = Path(__file__).resolve().parent.parent + env = { + **os.environ, + "UNSTRUCTURED_TELEMETRY_ENABLED": "true", + "DO_NOT_TRACK": "", + "PYTHONPATH": str(project_root), + } + result = subprocess.run( + [sys.executable, "-c", "import unstructured; print('ok')"], + env=env, + cwd=project_root, + capture_output=True, + text=True, + timeout=30, + ) + assert result.returncode == 0, result.stderr or result.stdout + assert "ok" in result.stdout diff --git a/unstructured/__init__.py b/unstructured/__init__.py index b8f3f32f1d..0c820eaf4a 100644 --- a/unstructured/__init__.py +++ b/unstructured/__init__.py @@ -1,4 +1,8 @@ from .partition.utils.config import env_config +from .telemetry import init_telemetry # init env_config env_config + +# Explicit startup boundary for telemetry (opt-in, best-effort) +init_telemetry() diff --git a/unstructured/logger.py b/unstructured/logger.py index fa2948908c..b1f7c11ede 100644 --- a/unstructured/logger.py +++ b/unstructured/logger.py @@ -1,7 +1,5 @@ import logging -from unstructured.utils import scarf_analytics - logger = logging.getLogger("unstructured") trace_logger = logging.getLogger("unstructured.trace") @@ -16,10 +14,5 @@ def detail(self, message, *args, **kws): self._log(DETAIL, message, args, **kws) -# Telemetry is off by default. To opt in, set UNSTRUCTURED_TELEMETRY_ENABLED=true. -# To opt out when enabled elsewhere, set SCARF_NO_ANALYTICS=true or DO_NOT_TRACK=true. -# See the README. -scarf_analytics() - # Add the custom log method to the logging.Logger class logging.Logger.detail = detail # type: ignore diff --git a/unstructured/telemetry.py b/unstructured/telemetry.py new file mode 100644 index 0000000000..d4c5969c25 --- /dev/null +++ b/unstructured/telemetry.py @@ -0,0 +1,8 @@ +"""Telemetry initializer. Called once at package startup from unstructured/__init__.py.""" + +from unstructured.utils import scarf_analytics + + +def init_telemetry() -> None: + """Run the analytics ping if enabled by env. Best-effort and non-fatal.""" + scarf_analytics() diff --git a/unstructured/utils.py b/unstructured/utils.py index 099a48865c..25a8b78018 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -270,26 +270,38 @@ def only(it: Iterable[Any]) -> Any: return out -def scarf_analytics(): - """Send a lightweight analytics ping. Off by default. +def _telemetry_opt_out() -> bool: + """True if telemetry should be disabled via env. - Set UNSTRUCTURED_TELEMETRY_ENABLED=true to opt in. - Opt-out env vars (DO_NOT_TRACK, SCARF_NO_ANALYTICS) are always respected and take precedence. + DO_NOT_TRACK and SCARF_NO_ANALYTICS both follow the same rule: any non-empty + value (after strip) opts out. See README/CHANGELOG for the public contract. """ - opt_out = os.getenv("SCARF_NO_ANALYTICS", "").strip().lower() in ("true", "1") or bool( - (os.getenv("DO_NOT_TRACK") or "").strip() + return bool((os.getenv("DO_NOT_TRACK") or "").strip()) or bool( + (os.getenv("SCARF_NO_ANALYTICS") or "").strip() ) - opt_in = (os.getenv("UNSTRUCTURED_TELEMETRY_ENABLED") or "").strip().lower() in ( + + +def _telemetry_opt_in() -> bool: + """True if telemetry is explicitly enabled via env. Only 'true' and '1' opt in.""" + return (os.getenv("UNSTRUCTURED_TELEMETRY_ENABLED") or "").strip().lower() in ( "true", "1", ) - if opt_out or not opt_in: + + +def scarf_analytics(): + """Send a lightweight analytics ping. Off by default. + + Set UNSTRUCTURED_TELEMETRY_ENABLED=true to opt in. + Opt-out env vars (DO_NOT_TRACK, SCARF_NO_ANALYTICS): any non-empty value opts out. + """ + if _telemetry_opt_out() or not _telemetry_opt_in(): return try: subprocess.check_output(["nvidia-smi"], stderr=subprocess.DEVNULL) gpu_present = True - except (FileNotFoundError, subprocess.CalledProcessError): + except (OSError, subprocess.CalledProcessError): gpu_present = False python_version = ".".join(platform.python_version().split(".")[:2]) From 35f5013d079a52eac1683bd6aa40dba19ea3fd35 Mon Sep 17 00:00:00 2001 From: Clayton Date: Fri, 13 Mar 2026 00:38:52 -0500 Subject: [PATCH 09/11] fix: test --- test_unstructured/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py index c2cfca5c5d..bc5925e2ac 100644 --- a/test_unstructured/test_utils.py +++ b/test_unstructured/test_utils.py @@ -505,7 +505,7 @@ def it_sends_telemetry_when_opt_in_is_set(self, monkeypatch, telemetry_mocks, op assert call_args[1]["timeout"] == 10 @pytest.mark.parametrize( - "version_val,expected_dev", + ("version_val", "expected_dev"), [("1.2.3.dev0", "true"), ("1.2.3", "false")], ids=["dev_version", "release_version"], ) From 96c84c07f3c55f08d8729e203a6d99eb4be2cab4 Mon Sep 17 00:00:00 2001 From: Clayton Date: Sun, 15 Mar 2026 22:31:02 -0500 Subject: [PATCH 10/11] fix: update --- CHANGELOG.md | 4 +- README.md | 2 +- scripts/image/test-outbound-connectivity.sh | 11 +- test_unstructured/test_telemetry.py | 257 ++++++++++++++++++++ test_unstructured/test_utils.py | 206 ---------------- 5 files changed, 269 insertions(+), 211 deletions(-) create mode 100644 test_unstructured/test_telemetry.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 210acbf030..b808b0778a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,10 @@ ## 0.22.0 +### Breaking changes +- **Opt-out env semantics**: `DO_NOT_TRACK` and `SCARF_NO_ANALYTICS` now treat **any non-empty value** (after strip) as opt-out. Previously only the exact string `"true"` opted out. So `DO_NOT_TRACK=false`, `DO_NOT_TRACK=0`, `SCARF_NO_ANALYTICS=0`, etc. will now disable telemetry. If you rely on `=false`/`=0` to mean “do not opt out”, remove the variable or set it only when you intend to opt out. ### Enhancements -- **Telemetry off by default**: Analytics/telemetry is now **disabled by default**. Upgrading will stop sending the library-load ping unless you opt in. To restore the previous behavior, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing `unstructured`. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) is unchanged and takes precedence. +- **Telemetry off by default**: Analytics/telemetry is now **disabled by default**. Upgrading will stop sending the library-load ping unless you opt in. To restore the previous behavior, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing `unstructured`. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) takes precedence. - Telemetry ping now uses `requests.get(..., params=...)` for correct URL encoding and a single code path for dev/non-dev. ## 0.21.12 diff --git a/README.md b/README.md index ccbfcab661..2b53faf0e5 100644 --- a/README.md +++ b/README.md @@ -268,4 +268,4 @@ Encountered a bug? Please create a new [GitHub issue](https://github.com/Unstruc ## :chart_with_upwards_trend: Analytics -Telemetry is **off by default**. To opt in, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` (or `=1`) before importing `unstructured`. To opt out, set `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` to any non-empty value (e.g. `true`, `1`, `yes`); opt-out takes precedence. See our [Privacy Policy](https://unstructured.io/privacy-policy). +Telemetry is **off by default**. To opt in, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` (or `=1`) before importing `unstructured`. To opt out, set `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` to any non-empty value (e.g. `true`, `1`, `yes`, `false`, `0`—any non-empty string opts out); opt-out takes precedence. Unset the variable or leave it empty if you do not want to opt out. See our [Privacy Policy](https://unstructured.io/privacy-policy). diff --git a/scripts/image/test-outbound-connectivity.sh b/scripts/image/test-outbound-connectivity.sh index 82eda50385..d2e33a37bc 100755 --- a/scripts/image/test-outbound-connectivity.sh +++ b/scripts/image/test-outbound-connectivity.sh @@ -48,7 +48,7 @@ fi SCENARIO="${1:-}" if [[ -z "$SCENARIO" ]]; then - echo "Usage: $0 [--cleanup] {baseline|missing-models|offline|offline-and-missing-models}" >&2 + echo "Usage: $0 [--cleanup] {baseline|missing-models|analytics-online-only|offline|offline-and-missing-models}" >&2 exit 1 fi @@ -61,12 +61,16 @@ fi # ---------- scenario‑specific settings -------------------------------- DO_NOT_TRACK="" +UNSTRUCTURED_TELEMETRY_ENABLED="" HF_HUB_OFFLINE="" REMOVE_CACHE=0 case "$SCENARIO" in baseline) ;; missing-models) REMOVE_CACHE=1 ;; -analytics-online-only) HF_HUB_OFFLINE=1 ;; +analytics-online-only) + UNSTRUCTURED_TELEMETRY_ENABLED=1 + HF_HUB_OFFLINE=1 + ;; offline) DO_NOT_TRACK=true HF_HUB_OFFLINE=1 @@ -89,6 +93,7 @@ CID=$(docker run -d --rm --name "sut_${SCENARIO}" \ --network "$NET" \ --cap-add NET_RAW --cap-add NET_ADMIN \ -e DO_NOT_TRACK="$DO_NOT_TRACK" \ + -e UNSTRUCTURED_TELEMETRY_ENABLED="$UNSTRUCTURED_TELEMETRY_ENABLED" \ -e HF_HUB_OFFLINE="$HF_HUB_OFFLINE" \ --entrypoint /bin/sh "$IMAGE" -c "sleep infinity") echo "Container: $CID (scenario $SCENARIO)" @@ -127,8 +132,8 @@ fi docker exec -i -e PYTHONUNBUFFERED=1 "$CID" python - < Date: Sun, 15 Mar 2026 23:04:47 -0500 Subject: [PATCH 11/11] fix: update --- CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b808b0778a..95df620ca6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,11 @@ ## 0.22.0 ### Breaking changes -- **Opt-out env semantics**: `DO_NOT_TRACK` and `SCARF_NO_ANALYTICS` now treat **any non-empty value** (after strip) as opt-out. Previously only the exact string `"true"` opted out. So `DO_NOT_TRACK=false`, `DO_NOT_TRACK=0`, `SCARF_NO_ANALYTICS=0`, etc. will now disable telemetry. If you rely on `=false`/`=0` to mean “do not opt out”, remove the variable or set it only when you intend to opt out. +- **Opt-out env semantics**: `DO_NOT_TRACK` and `SCARF_NO_ANALYTICS` now treat any non-empty value (after strip) as opt-out. Previously only the exact string `"true"` opted out. Values like `false`, `0`, or `no` now also disable telemetry. To avoid opting out, unset the variable or leave it empty. ### Enhancements -- **Telemetry off by default**: Analytics/telemetry is now **disabled by default**. Upgrading will stop sending the library-load ping unless you opt in. To restore the previous behavior, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing `unstructured`. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) takes precedence. -- Telemetry ping now uses `requests.get(..., params=...)` for correct URL encoding and a single code path for dev/non-dev. +- **Telemetry off by default**: The library-load analytics ping is disabled by default. Set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing to restore the previous behavior. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) takes precedence. +- Telemetry ping uses `requests.get(..., params=...)` for correct URL encoding and a single dev/non-dev code path. ## 0.21.12 - **Add Check for complex documents**: Adds a check for complex documents to avoid pdfminer with a high ratio of vector objects