Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.22.0

### Breaking changes
- **Opt-out env semantics**: `DO_NOT_TRACK` and `SCARF_NO_ANALYTICS` now treat any non-empty value (after strip) as opt-out. Previously only the exact string `"true"` opted out. Values like `false`, `0`, or `no` now also disable telemetry. To avoid opting out, unset the variable or leave it empty.

### Enhancements
- **Telemetry off by default**: The library-load analytics ping is disabled by default. Set `UNSTRUCTURED_TELEMETRY_ENABLED=true` before importing to restore the previous behavior. Opt-out via `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` (any non-empty value) takes precedence.
- Telemetry ping uses `requests.get(..., params=...)` for correct URL encoding and a single dev/non-dev code path.

## 0.21.12
- **Add Check for complex documents**: Adds a check for complex documents to avoid running pdfminer on documents with a high ratio of vector objects

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -268,4 +268,4 @@ Encountered a bug? Please create a new [GitHub issue](https://github.com/Unstruc

## :chart_with_upwards_trend: Analytics

This library includes a very lightweight analytics "ping" when the library is loaded, however you can opt out of this data collection by setting the environment variable `DO_NOT_TRACK=true` before executing any `unstructured` code. To learn more about how we collect and use this data, please read our [Privacy Policy](https://unstructured.io/privacy-policy).
Telemetry is **off by default**. To opt in, set `UNSTRUCTURED_TELEMETRY_ENABLED=true` (or `=1`) before importing `unstructured`. To opt out, set `DO_NOT_TRACK` or `SCARF_NO_ANALYTICS` to any non-empty value (e.g. `true`, `1`, or `yes`); opt-out takes precedence over opt-in. Note that *any* non-empty string opts out, including `false` and `0`, so unset the variable or leave it empty if you do not want to opt out. See our [Privacy Policy](https://unstructured.io/privacy-policy).
11 changes: 8 additions & 3 deletions scripts/image/test-outbound-connectivity.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ fi

SCENARIO="${1:-}"
if [[ -z "$SCENARIO" ]]; then
echo "Usage: $0 [--cleanup] {baseline|missing-models|offline|offline-and-missing-models}" >&2
echo "Usage: $0 [--cleanup] {baseline|missing-models|analytics-online-only|offline|offline-and-missing-models}" >&2
exit 1
fi

Expand All @@ -61,12 +61,16 @@ fi

# ---------- scenario‑specific settings --------------------------------
DO_NOT_TRACK=""
UNSTRUCTURED_TELEMETRY_ENABLED=""
HF_HUB_OFFLINE=""
REMOVE_CACHE=0
case "$SCENARIO" in
baseline) ;;
missing-models) REMOVE_CACHE=1 ;;
analytics-online-only) HF_HUB_OFFLINE=1 ;;
analytics-online-only)
UNSTRUCTURED_TELEMETRY_ENABLED=1
HF_HUB_OFFLINE=1
;;
offline)
DO_NOT_TRACK=true
HF_HUB_OFFLINE=1
Expand All @@ -89,6 +93,7 @@ CID=$(docker run -d --rm --name "sut_${SCENARIO}" \
--network "$NET" \
--cap-add NET_RAW --cap-add NET_ADMIN \
-e DO_NOT_TRACK="$DO_NOT_TRACK" \
-e UNSTRUCTURED_TELEMETRY_ENABLED="$UNSTRUCTURED_TELEMETRY_ENABLED" \
-e HF_HUB_OFFLINE="$HF_HUB_OFFLINE" \
--entrypoint /bin/sh "$IMAGE" -c "sleep infinity")
echo "Container: $CID (scenario $SCENARIO)"
Expand Down Expand Up @@ -127,8 +132,8 @@ fi

docker exec -i -e PYTHONUNBUFFERED=1 "$CID" python - <<PY |& tee "${PY_LOG_DIR}/${SCENARIO}.log"
import logging
# Telemetry runs at package init when UNSTRUCTURED_TELEMETRY_ENABLED is set (see analytics-online-only scenario).
from unstructured.partition.auto import partition
from unstructured.logger import logger # force analytics ping if not DO_NOT_TRACK
import urllib.request, time, os, sys

# Configure detailed logging
Expand Down
257 changes: 257 additions & 0 deletions test_unstructured/test_telemetry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
"""Hermetic telemetry tests: env is set to opt-out before importing unstructured.

This module must set DO_NOT_TRACK (or equivalent) before any import of unstructured
so that init_telemetry() runs with opt-out at import time and no real network/subprocess
occurs. Tests then use monkeypatch and mocks to assert behavior.
"""

from __future__ import annotations

# Set opt-out before any unstructured import so package init does not run telemetry.
# These statements execute when this module is imported and mutate the real process
# environment directly rather than through a fixture.
# NOTE(review): nothing in this module restores the previous values, so the change
# presumably stays visible to whatever runs later in the same process — confirm that
# this is acceptable for the rest of the test session.
import os

os.environ["DO_NOT_TRACK"] = "1"  # any non-empty value counts as opt-out
# Drop the opt-in flag and the second opt-out variable so only DO_NOT_TRACK is in play
# when `unstructured` is imported further down.
os.environ.pop("UNSTRUCTURED_TELEMETRY_ENABLED", None)
os.environ.pop("SCARF_NO_ANALYTICS", None)

import platform
import subprocess
import sys
from pathlib import Path
from unittest.mock import Mock

import pytest
import requests

from unstructured import utils


@pytest.fixture
def telemetry_mocks(monkeypatch):
    """Start from a clean telemetry environment with network and subprocess stubbed out.

    All three telemetry-related environment variables are removed, then
    ``requests.get`` and ``subprocess.check_output`` (as seen from
    ``unstructured.utils``) are replaced with fresh ``Mock`` objects.

    Returns:
        A ``(mock_get, mock_subprocess)`` tuple so tests can assert on both the
        network call and the subprocess probe.
    """
    # Same removal order as before: opt-in flag first, then both opt-out flags.
    for var_name in ("UNSTRUCTURED_TELEMETRY_ENABLED", "SCARF_NO_ANALYTICS", "DO_NOT_TRACK"):
        monkeypatch.delenv(var_name, raising=False)

    mock_get, mock_subprocess = Mock(), Mock()
    monkeypatch.setattr("unstructured.utils.requests.get", mock_get)
    monkeypatch.setattr("unstructured.utils.subprocess.check_output", mock_subprocess)
    return mock_get, mock_subprocess


def _apply_telemetry_env(monkeypatch, env_overrides):
"""Set env vars from dict; keys are env var names, values are str or None (delenv)."""
for key, value in env_overrides.items():
if value is None:
monkeypatch.delenv(key, raising=False)
else:
monkeypatch.setenv(key, value)


class DescribeScarfAnalytics:
    """Tests for scarf_analytics (telemetry off by default, opt-in only).

    Most tests call ``utils.scarf_analytics()`` directly with ``requests.get`` and
    ``subprocess.check_output`` replaced by mocks. The last two tests start a fresh
    interpreter to exercise the real ``import unstructured`` path.
    """

    def it_telemetry_opt_out_any_non_empty_for_both_vars(self, monkeypatch):
        """Contract: DO_NOT_TRACK and SCARF_NO_ANALYTICS both opt out on any non-empty value."""
        monkeypatch.delenv("DO_NOT_TRACK", raising=False)
        monkeypatch.delenv("SCARF_NO_ANALYTICS", raising=False)
        assert utils._telemetry_opt_out() is False
        monkeypatch.setenv("DO_NOT_TRACK", "yes")
        assert utils._telemetry_opt_out() is True
        # Clear the first variable so the second one is checked on its own.
        monkeypatch.delenv("DO_NOT_TRACK", raising=False)
        monkeypatch.setenv("SCARF_NO_ANALYTICS", "on")
        assert utils._telemetry_opt_out() is True

    def it_telemetry_opt_in_only_true_or_1(self, monkeypatch):
        """Contract: only 'true' (in any letter case) or '1' opts in.

        Unset, empty, and other truthy-looking values such as 'yes' do not opt in.
        """
        monkeypatch.delenv("UNSTRUCTURED_TELEMETRY_ENABLED", raising=False)
        assert utils._telemetry_opt_in() is False
        for val in ("true", "1", "True", "TRUE"):
            monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", val)
            assert utils._telemetry_opt_in() is True
        for val in ("false", "0", "yes", ""):
            monkeypatch.setenv("UNSTRUCTURED_TELEMETRY_ENABLED", val)
            assert utils._telemetry_opt_in() is False

    # Each case carries its own id so a case and its label cannot drift apart.
    @pytest.mark.parametrize(
        "env_overrides",
        [
            pytest.param({}, id="default_no_opt_in"),
            pytest.param({"DO_NOT_TRACK": "true"}, id="DO_NOT_TRACK=true"),
            pytest.param({"DO_NOT_TRACK": "1"}, id="DO_NOT_TRACK=1"),
            pytest.param({"DO_NOT_TRACK": "TRUE"}, id="DO_NOT_TRACK=TRUE"),
            pytest.param({"DO_NOT_TRACK": "false"}, id="DO_NOT_TRACK=false"),
            pytest.param({"DO_NOT_TRACK": "0"}, id="DO_NOT_TRACK=0"),
            pytest.param({"SCARF_NO_ANALYTICS": "true"}, id="SCARF_NO_ANALYTICS=true"),
            pytest.param({"SCARF_NO_ANALYTICS": "yes"}, id="SCARF_NO_ANALYTICS=yes"),
            pytest.param({"SCARF_NO_ANALYTICS": "on"}, id="SCARF_NO_ANALYTICS=on"),
            pytest.param({"SCARF_NO_ANALYTICS": "1"}, id="SCARF_NO_ANALYTICS=1"),
            pytest.param({"SCARF_NO_ANALYTICS": "TRUE"}, id="SCARF_NO_ANALYTICS=TRUE"),
            pytest.param({"SCARF_NO_ANALYTICS": "false"}, id="SCARF_NO_ANALYTICS=false"),
            pytest.param({"SCARF_NO_ANALYTICS": "0"}, id="SCARF_NO_ANALYTICS=0"),
            pytest.param({"SCARF_NO_ANALYTICS": " true "}, id="SCARF_NO_ANALYTICS=whitespace"),
            pytest.param({"UNSTRUCTURED_TELEMETRY_ENABLED": "false"}, id="opt_in=false"),
            pytest.param({"UNSTRUCTURED_TELEMETRY_ENABLED": "0"}, id="opt_in=0"),
            pytest.param({"UNSTRUCTURED_TELEMETRY_ENABLED": "yes"}, id="opt_in=yes"),
            pytest.param({"UNSTRUCTURED_TELEMETRY_ENABLED": "FALSE"}, id="opt_in=FALSE"),
            pytest.param(
                {"UNSTRUCTURED_TELEMETRY_ENABLED": "true", "DO_NOT_TRACK": "true"},
                id="opt_in_true_but_DO_NOT_TRACK",
            ),
            pytest.param(
                {"UNSTRUCTURED_TELEMETRY_ENABLED": "true", "SCARF_NO_ANALYTICS": "on"},
                id="opt_in_true_but_SCARF_NO_ANALYTICS",
            ),
        ],
    )
    def it_does_not_send_telemetry_when_disabled_or_opted_out(
        self, monkeypatch, telemetry_mocks, env_overrides
    ):
        """No network or subprocess when telemetry disabled or opt-out set."""
        mock_get, mock_subprocess = telemetry_mocks
        _apply_telemetry_env(monkeypatch, env_overrides)
        utils.scarf_analytics()
        mock_get.assert_not_called()
        mock_subprocess.assert_not_called()

    @pytest.mark.parametrize("opt_in_value", ["true", "True", "TRUE", "1"])
    def it_sends_telemetry_when_opt_in_is_set(self, monkeypatch, telemetry_mocks, opt_in_value):
        """With opt-in set, exactly one GET is issued with the expected URL, params and timeout."""
        mock_get, mock_subprocess = telemetry_mocks
        _apply_telemetry_env(monkeypatch, {"UNSTRUCTURED_TELEMETRY_ENABLED": opt_in_value})
        utils.scarf_analytics()
        mock_get.assert_called_once()
        mock_subprocess.assert_called_once_with(["nvidia-smi"], stderr=subprocess.DEVNULL)
        call_args = mock_get.call_args
        assert call_args[0][0] == "https://packages.unstructured.io/python-telemetry"
        params = call_args[1]["params"]
        assert set(params.keys()) == {"version", "platform", "python", "arch", "gpu", "dev"}
        assert call_args[1]["timeout"] == 10

    @pytest.mark.parametrize(
        ("version_val", "expected_dev"),
        [("1.2.3.dev0", "true"), ("1.2.3", "false")],
        ids=["dev_version", "release_version"],
    )
    def it_sends_telemetry_with_correct_dev_param(
        self, monkeypatch, telemetry_mocks, version_val, expected_dev
    ):
        """The ``dev`` param follows the patched ``__version__`` value for each case."""
        mock_get, mock_subprocess = telemetry_mocks
        _apply_telemetry_env(monkeypatch, {"UNSTRUCTURED_TELEMETRY_ENABLED": "true"})
        monkeypatch.setattr("unstructured.utils.__version__", version_val)
        utils.scarf_analytics()
        mock_get.assert_called_once()
        mock_subprocess.assert_called_once()
        params = mock_get.call_args[1]["params"]
        assert params["dev"] == expected_dev
        assert params["version"] == version_val
        assert params["platform"] == platform.system()
        assert params["arch"] == platform.machine()
        assert mock_get.call_args[1]["timeout"] == 10

    def it_handles_requests_exception_gracefully(self, monkeypatch, telemetry_mocks):
        """A ``RequestException`` from ``requests.get`` must not propagate to the caller."""
        mock_get, mock_subprocess = telemetry_mocks
        mock_get.side_effect = requests.RequestException("network error")
        _apply_telemetry_env(monkeypatch, {"UNSTRUCTURED_TELEMETRY_ENABLED": "true"})
        utils.scarf_analytics()  # does not raise
        mock_get.assert_called_once()
        mock_subprocess.assert_called_once()
        assert mock_get.call_args[0][0] == "https://packages.unstructured.io/python-telemetry"
        assert "version" in mock_get.call_args[1]["params"]

    @pytest.mark.parametrize(
        "exc",
        [
            OSError(),
            PermissionError("nvidia-smi denied"),
            subprocess.CalledProcessError(returncode=1, cmd=["nvidia-smi"]),
        ],
        ids=["OSError", "PermissionError", "CalledProcessError"],
    )
    def it_handles_nvidia_smi_failure_gracefully(self, monkeypatch, telemetry_mocks, exc):
        """nvidia-smi probe failures must not propagate; telemetry still sends with gpu=False."""
        mock_get, mock_subprocess = telemetry_mocks
        mock_subprocess.side_effect = exc
        _apply_telemetry_env(monkeypatch, {"UNSTRUCTURED_TELEMETRY_ENABLED": "true"})
        utils.scarf_analytics()  # does not raise
        mock_get.assert_called_once()
        assert mock_get.call_args[1]["params"]["gpu"] == "False"
        mock_subprocess.assert_called_once_with(["nvidia-smi"], stderr=subprocess.DEVNULL)

    def it_import_unstructured_succeeds_with_opt_out(self):
        """Import path with opt-out env does not crash (integration-style)."""
        project_root = Path(__file__).resolve().parent.parent
        env = dict(os.environ)
        env.update(
            {
                "DO_NOT_TRACK": "1",
                "SCARF_NO_ANALYTICS": "1",
                "UNSTRUCTURED_TELEMETRY_ENABLED": "",
                # Prepend rather than overwrite so entries the parent process relies on
                # remain importable in the child interpreter.
                "PYTHONPATH": os.pathsep.join(
                    filter(None, [str(project_root), os.environ.get("PYTHONPATH")])
                ),
            }
        )
        result = subprocess.run(
            [sys.executable, "-c", "import unstructured; print('ok')"],
            env=env,
            cwd=project_root,
            capture_output=True,
            text=True,
            timeout=30,
        )
        assert result.returncode == 0, result.stderr or result.stdout
        assert "ok" in result.stdout

    def it_import_unstructured_runs_telemetry_once_when_opt_in(self):
        """Import path with opt-in runs init_telemetry exactly once (patch then import)."""
        project_root = Path(__file__).resolve().parent.parent
        env = {
            k: v
            for k, v in os.environ.items()
            if k not in ("DO_NOT_TRACK", "SCARF_NO_ANALYTICS", "UNSTRUCTURED_TELEMETRY_ENABLED")
        }
        env.update(
            {
                "UNSTRUCTURED_TELEMETRY_ENABLED": "true",
                # Prepend rather than overwrite so entries the parent process relies on
                # remain importable in the child interpreter.
                "PYTHONPATH": os.pathsep.join(
                    filter(None, [str(project_root), os.environ.get("PYTHONPATH")])
                ),
            }
        )
        # The child patches requests.get / subprocess.check_output *before* importing
        # the package, then reports the call counts through its exit status.
        # sys.exit is used instead of the site-provided exit() helper, which is
        # intended for the interactive shell.
        script_lines = (
            "import sys",
            "from unittest.mock import Mock, patch",
            "m_get = Mock()",
            "m_subprocess = Mock()",
            "with patch('requests.get', m_get), patch('subprocess.check_output', m_subprocess):",
            "    import unstructured",
            "sys.exit(0 if (m_get.call_count == 1 and m_subprocess.call_count == 1) else 1)",
        )
        script = "\n".join(script_lines)
        result = subprocess.run(
            [sys.executable, "-c", script],
            env=env,
            cwd=project_root,
            capture_output=True,
            text=True,
            timeout=30,
        )
        assert result.returncode == 0, (
            "Import with opt-in should run telemetry exactly once (requests.get and "
            "subprocess.check_output each called once). "
            f"stderr={result.stderr!r} stdout={result.stdout!r}"
        )
4 changes: 4 additions & 0 deletions unstructured/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
from .partition.utils.config import env_config
from .telemetry import init_telemetry

# init env_config
# NOTE(review): the bare name below is an expression statement with no effect of its
# own; presumably it exists so the import above is not flagged as unused — confirm.
env_config

# Explicit startup boundary for telemetry (opt-in, best-effort)
# Executes as part of importing the package, so it must stay cheap and must not raise.
init_telemetry()
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.21.12" # pragma: no cover
__version__ = "0.22.0" # pragma: no cover
6 changes: 0 additions & 6 deletions unstructured/logger.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import logging

from unstructured.utils import scarf_analytics

logger = logging.getLogger("unstructured")
trace_logger = logging.getLogger("unstructured.trace")

Expand All @@ -16,9 +14,5 @@ def detail(self, message, *args, **kws):
self._log(DETAIL, message, args, **kws)


# Note(Trevor,Crag): to opt out of scarf analytics, set the environment variable:
# SCARF_NO_ANALYTICS=true. See the README for more info.
scarf_analytics()

# Add the custom log method to the logging.Logger class
logging.Logger.detail = detail # type: ignore
8 changes: 8 additions & 0 deletions unstructured/telemetry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Telemetry initializer. Called once at package startup from unstructured/__init__.py."""

from unstructured.utils import scarf_analytics


def init_telemetry() -> None:
    """Run the analytics ping if enabled by env. Best-effort and non-fatal.

    This is called once at package startup, so a telemetry failure must never
    prevent ``import unstructured`` from succeeding. Any ``Exception`` raised by
    ``scarf_analytics()`` is logged at DEBUG level on the ``"unstructured"``
    logger and then suppressed.
    """
    try:
        scarf_analytics()
    except Exception:  # import-time boundary: telemetry must not break the import
        import logging  # local import; this module has no top-level logging import

        logging.getLogger("unstructured").debug(
            "Telemetry initialization failed; continuing without it.", exc_info=True
        )
Loading
Loading