diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ba7bf80..a2ed4b5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -7,11 +7,24 @@ on: push: tags: - "v*" + workflow_dispatch: + inputs: + publish_target: + description: 'Publish target' + required: true + default: 'testpypi' + type: choice + options: + - testpypi + - pypi permissions: contents: read jobs: + # ------------------------------------------------------------------- + # Build the package + # ------------------------------------------------------------------- build: runs-on: ubuntu-latest steps: @@ -24,23 +37,61 @@ jobs: python-version: "3.12" - name: Install build tools - run: pip install build + run: pip install build twine - name: Build sdist and wheel run: python -m build + - name: Check package with twine + run: twine check dist/* + + - name: List build artifacts + run: ls -la dist/ + - uses: actions/upload-artifact@v4 with: name: dist path: dist/ + # ------------------------------------------------------------------- + # Publish to TestPyPI (manual trigger or pre-release tags) + # Uses Trusted Publishing (OIDC — no API tokens needed) + # Requires TestPyPI project to be configured for GitHub OIDC: + # https://test.pypi.org/manage/project/botanu/settings/publishing/ + # ------------------------------------------------------------------- + publish-testpypi: + needs: build + if: | + (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_target == 'testpypi') + || (github.event_name == 'push' && (contains(github.ref, '-alpha') || contains(github.ref, '-beta') || contains(github.ref, '-rc'))) + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/p/botanu + permissions: + id-token: write # required for OIDC trusted publishing + steps: + - uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + 
repository-url: https://test.pypi.org/legacy/ + skip-existing: true + # ------------------------------------------------------------------- # Publish to PyPI via Trusted Publishing (OIDC — no API tokens) # Requires PyPI project to be configured for GitHub OIDC: - # https://docs.pypi.org/trusted-publishers/ + # https://pypi.org/manage/project/botanu/settings/publishing/ # ------------------------------------------------------------------- publish-pypi: needs: build + if: | + github.event_name == 'workflow_dispatch' && github.event.inputs.publish_target == 'pypi' + || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && !contains(github.ref, '-')) runs-on: ubuntu-latest environment: name: pypi @@ -60,7 +111,8 @@ jobs: # Create GitHub Release with auto-generated notes # ------------------------------------------------------------------- github-release: - needs: publish-pypi + needs: [build, publish-pypi] + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: ubuntu-latest permissions: contents: write @@ -77,4 +129,9 @@ jobs: - name: Create GitHub Release env: GH_TOKEN: ${{ github.token }} - run: gh release create "${{ github.ref_name }}" dist/* --generate-notes + run: | + if [[ "${{ github.ref_name }}" == *"-"* ]]; then + gh release create "${{ github.ref_name }}" dist/* --generate-notes --prerelease + else + gh release create "${{ github.ref_name }}" dist/* --generate-notes + fi diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e2816b..9eed0fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,17 +7,69 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.0] - 2026-02-05 + ### Added -- Initial open-source release -- `enable()` / `disable()` bootstrap -- `@botanu_use_case` decorator with UUIDv7 run_id -- `emit_outcome()` and `set_business_context()` span helpers -- `RunContextEnricher` span processor -- LLM tracking with OTel GenAI semconv alignment -- Data tracking 
for database, storage, and messaging -- Resource detection for K8s, AWS, GCP, Azure, serverless -- Auto-instrumentation for 20+ libraries -- Optional extras: `[sdk]`, `[instruments]`, `[genai]`, `[carriers]`, `[all]` - -[Unreleased]: https://github.com/botanu-ai/botanu-sdk-python/compare/v0.0.0...HEAD +- Initial open-source release under Apache-2.0 license +- **Core SDK** + - `enable()` / `disable()` bootstrap functions for SDK initialization + - `@botanu_use_case` decorator with UUIDv7 run_id generation + - `@botanu_outcome` decorator for sub-function outcome tracking + - `emit_outcome()` helper for recording business outcomes + - `set_business_context()` for cost attribution dimensions + - `RunContextEnricher` span processor for automatic run_id propagation + +- **LLM Tracking** (aligned with OTel GenAI semantic conventions) + - `track_llm_call()` context manager for LLM/model operations + - `track_tool_call()` context manager for tool/function calls + - Token usage tracking (input, output, cached) + - Provider normalization for 15+ LLM providers + - Support for all GenAI operations (chat, embeddings, etc.) 
+ +- **Data Tracking** + - `track_db_operation()` for database operations + - `track_storage_operation()` for object storage (S3, GCS, Azure Blob) + - `track_messaging_operation()` for message queues (SQS, Kafka, Pub/Sub) + - System normalization for 30+ database/storage systems + +- **Context Propagation** + - W3C Baggage propagation for cross-service run_id correlation + - Lean mode (default) and full mode propagation options + - `RunContext` model with retry tracking and deadline support + +- **Resource Detection** + - Kubernetes (pod, namespace, container) + - AWS (EC2, ECS, Lambda, Fargate) + - GCP (GCE, Cloud Run, Cloud Functions) + - Azure (VM, Container Apps, Functions) + +- **Auto-Instrumentation Support** + - HTTP clients: requests, httpx, urllib3, aiohttp + - Web frameworks: FastAPI, Flask, Django, Starlette + - Databases: SQLAlchemy, psycopg2, asyncpg, pymongo, Redis + - Messaging: Celery, Kafka + - GenAI: OpenAI, Anthropic, Vertex AI, Google GenAI, LangChain + +- **Optional Extras** + - `[sdk]` - OTel SDK + OTLP exporter + - `[instruments]` - Common library instrumentation + - `[genai]` - GenAI provider instrumentation + - `[carriers]` - Cross-service propagation helpers + - `[all]` - Everything included + - `[dev]` - Development and testing tools + +- **Documentation** + - Comprehensive docs in `/docs` following LF format + - Getting started guides + - API reference + - Best practices and anti-patterns + +### Dependencies + +- Core: `opentelemetry-api >= 1.20.0` +- SDK extra: `opentelemetry-sdk`, `opentelemetry-exporter-otlp-proto-http` +- Python: `>= 3.9` + +[Unreleased]: https://github.com/botanu-ai/botanu-sdk-python/compare/v0.1.0...HEAD +[0.1.0]: https://github.com/botanu-ai/botanu-sdk-python/releases/tag/v0.1.0 diff --git a/README.md b/README.md index 314cc25..49ad7a3 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ 
[![CI](https://github.com/botanu-ai/botanu-sdk-python/actions/workflows/ci.yml/badge.svg)](https://github.com/botanu-ai/botanu-sdk-python/actions/workflows/ci.yml) [![PyPI version](https://badge.fury.io/py/botanu.svg)](https://pypi.org/project/botanu/) +[![Python versions](https://img.shields.io/pypi/pyversions/botanu.svg)](https://pypi.org/project/botanu/) [![OpenSSF Scorecard](https://api.scorecard.dev/projects/github.com/botanu-ai/botanu-sdk-python/badge)](https://scorecard.dev/viewer/?uri=github.com/botanu-ai/botanu-sdk-python) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) @@ -9,38 +10,147 @@ OpenTelemetry-native **run-level cost attribution** for AI workflows. ## Overview -Botanu adds **runs** on top of distributed tracing. A run represents a single business execution that may span multiple traces, retries, and services. By correlating all spans to a stable `run_id`, you get accurate cost attribution without sampling artifacts. +Botanu adds **runs** on top of distributed tracing. A run represents a single business transaction that may span multiple LLM calls, database queries, and services. By correlating all operations to a stable `run_id`, you get accurate cost attribution without sampling artifacts. 
+ +**Key features:** +- 🎯 **Run-level attribution** — Link all costs to business outcomes +- 🔗 **Cross-service correlation** — W3C Baggage propagation +- 📊 **OTel-native** — Works with any OpenTelemetry-compatible backend +- ⚡ **Minimal overhead** — < 0.5ms per request +- 🤖 **GenAI support** — OpenAI, Anthropic, Vertex AI, and more ## Quick Start ```python from botanu import enable, botanu_use_case, emit_outcome +# Initialize at startup enable(service_name="my-app") @botanu_use_case(name="Customer Support") async def handle_ticket(ticket_id: str): - result = await process_ticket(ticket_id) + # All operations inside get the same run_id + context = await fetch_context(ticket_id) + response = await generate_response(context) + + # Record the business outcome emit_outcome("success", value_type="tickets_resolved", value_amount=1) - return result + return response ``` ## Installation ```bash -pip install botanu # Core (opentelemetry-api only) -pip install botanu[sdk] # + OTel SDK + OTLP exporter -pip install botanu[all] # Everything including GenAI instrumentation +# Core SDK (opentelemetry-api only, ~50KB) +pip install botanu + +# With OTel SDK + OTLP exporter (for standalone use) +pip install "botanu[sdk]" + +# With GenAI provider instrumentation +pip install "botanu[genai]" + +# Everything included +pip install "botanu[all]" +``` + +### Extras + +| Extra | Description | +|-------|-------------| +| `sdk` | OpenTelemetry SDK + OTLP HTTP exporter | +| `instruments` | Auto-instrumentation for HTTP, databases, etc. | +| `genai` | GenAI provider instrumentation (OpenAI, Anthropic, etc.) 
| +| `carriers` | Cross-service propagation helpers (Celery, Kafka) | +| `all` | All of the above | +| `dev` | Development and testing tools | + +## LLM Tracking + +Track LLM calls with full cost attribution: + +```python +from botanu.tracking.llm import track_llm_call + +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await openai.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": "Hello"}] + ) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) +``` + +## Data Tracking + +Track database and storage operations: + +```python +from botanu.tracking.data import track_db_operation, track_storage_operation + +# Database +with track_db_operation(system="postgresql", operation="SELECT") as db: + result = await cursor.execute(query) + db.set_result(rows_returned=len(result)) + +# Storage +with track_storage_operation(system="s3", operation="PUT") as storage: + await s3.put_object(Bucket="bucket", Key="key", Body=data) + storage.set_result(bytes_written=len(data)) +``` + +## Architecture + +``` +┌──────────────────────────────────────────────────────────────┐ +│ Your Application │ +│ │ +│ @botanu_use_case track_llm_call() track_db_operation()│ +│ │ │ │ │ +│ └───────────────────┴────────────────────┘ │ +│ │ │ +│ Botanu SDK (thin) │ +│ - Generate run_id (UUIDv7) │ +│ - Set W3C Baggage │ +│ - Record spans │ +└─────────────────────────────┬─────────────────────────────────┘ + │ OTLP + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ OpenTelemetry Collector │ +│ │ +│ - PII redaction - Cost calculation │ +│ - Vendor normalization - Cardinality management │ +└──────────────────────────────────────────────────────────────┘ ``` ## Documentation -Full documentation is available at [docs.botanu.ai](https://docs.botanu.ai) and in the [`docs/`](./docs/) folder. 
+Full documentation is available at [docs.botanu.ai](https://docs.botanu.ai) and in the [`docs/`](./docs/) folder: + +- [Getting Started](./docs/getting-started/) +- [Concepts](./docs/concepts/) +- [Tracking Guides](./docs/tracking/) +- [Integration](./docs/integration/) +- [API Reference](./docs/api/) + +## Requirements + +- Python 3.9+ +- OpenTelemetry Collector (for production use) ## Contributing See [CONTRIBUTING.md](./CONTRIBUTING.md). This project uses [DCO](./DCO) sign-off. +```bash +git commit -s -m "Your commit message" +``` + ## License [Apache-2.0](./LICENSE) — see [NOTICE](./NOTICE) for attribution. + +This project is an [LF AI & Data Foundation](https://lfaidata.foundation/) project. diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000..d2454ea --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,199 @@ +# Release Process + +This document describes the release process for Botanu SDK. + +## Versioning + +Botanu SDK follows [Semantic Versioning](https://semver.org/): + +- **MAJOR** (1.0.0): Breaking changes to public API +- **MINOR** (0.2.0): New features, backwards compatible +- **PATCH** (0.1.1): Bug fixes, backwards compatible + +Pre-release versions use suffixes: +- `-alpha.N`: Early development, unstable +- `-beta.N`: Feature complete, testing +- `-rc.N`: Release candidate, final testing + +## Prerequisites + +Before releasing, ensure: + +1. All CI checks pass on `main` branch +2. CHANGELOG.md is updated with release notes +3. Documentation is up to date +4. Test coverage meets threshold (70%+) + +## Release Workflow + +### 1. Prepare the Release + +```bash +# Ensure you're on main with latest changes +git checkout main +git pull origin main + +# Update CHANGELOG.md +# - Move items from [Unreleased] to new version section +# - Add release date +# - Update comparison links at bottom + +# Commit changelog +git add CHANGELOG.md +git commit -s -m "docs: prepare release v0.1.0" +git push origin main +``` + +### 2. 
Create a Release Tag + +```bash +# For production release +git tag -a v0.1.0 -m "Release v0.1.0" + +# For pre-release +git tag -a v0.1.0-alpha.1 -m "Release v0.1.0-alpha.1" + +# Push tag +git push origin v0.1.0 +``` + +### 3. Automated Publishing + +When a tag is pushed: + +- **Pre-release tags** (`v*-alpha*`, `v*-beta*`, `v*-rc*`) → TestPyPI +- **Release tags** (`v*` without suffix) → PyPI + GitHub Release + +The workflow uses [Trusted Publishing (OIDC)](https://docs.pypi.org/trusted-publishers/) — no API tokens needed. + +### 4. Manual Publishing (if needed) + +You can manually trigger publishing from the Actions tab: + +1. Go to Actions → "Release to PyPI" +2. Click "Run workflow" +3. Select target: `testpypi` or `pypi` +4. Click "Run workflow" + +## TestPyPI Verification + +After publishing to TestPyPI, verify installation: + +```bash +# Create a test environment +python -m venv test-env +source test-env/bin/activate # or test-env\Scripts\activate on Windows + +# Install from TestPyPI +pip install --index-url https://test.pypi.org/simple/ \ + --extra-index-url https://pypi.org/simple/ \ + botanu + +# Verify import +python -c "import botanu; print(botanu.__version__)" + +# Run quick test +python -c " +from botanu import enable, botanu_use_case +enable(service_name='test') +print('Botanu SDK loaded successfully!') +" +``` + +## PyPI Trusted Publishing Setup + +### Initial Setup (One-time) + +1. **Create PyPI project** (if not exists): + - Go to https://pypi.org/manage/projects/ + - Create new project named `botanu` + +2. **Configure Trusted Publisher on PyPI**: + - Go to https://pypi.org/manage/project/botanu/settings/publishing/ + - Add new publisher: + - Owner: `botanu-ai` + - Repository: `botanu-sdk-python` + - Workflow: `release.yml` + - Environment: `pypi` + +3. **Configure Trusted Publisher on TestPyPI**: + - Go to https://test.pypi.org/manage/project/botanu/settings/publishing/ + - Add new publisher with same settings, environment: `testpypi` + +4. 
**Create GitHub Environments**: + - Go to repo Settings → Environments + - Create `pypi` environment (for production) + - Create `testpypi` environment (for testing) + - Optionally add protection rules (required reviewers, etc.) + +## Local Build Verification + +Before releasing, verify the build locally: + +```bash +# Install build tools +pip install build twine + +# Build the package +python -m build + +# Check the package +twine check dist/* + +# List contents +tar -tvf dist/botanu-*.tar.gz +unzip -l dist/botanu-*.whl + +# Test installation from local wheel +pip install dist/botanu-*.whl +python -c "import botanu; print(botanu.__version__)" +``` + +## Version Determination + +The version is determined by `hatch-vcs` from git tags: + +- Tagged commit: `0.1.0` +- Commits after tag: `0.1.1.dev3+g1234567` +- No tags: `0.0.0.dev0` + +To see what version will be used: + +```bash +pip install hatch-vcs +python -c "from setuptools_scm import get_version; print(get_version())" +``` + +## Rollback Procedure + +If a release has issues: + +1. **Yank from PyPI** (hides from install, but doesn't delete): + ```bash + # Via web UI: PyPI project → Release history → Yank + # Or via API (requires token) + ``` + +2. **Delete GitHub Release** (if needed): + ```bash + gh release delete v0.1.0 --yes + git push origin --delete v0.1.0 + ``` + +3. **Fix and re-release** with a new patch version (e.g., `v0.1.1`) + +## Release Checklist + +- [ ] All CI checks pass +- [ ] CHANGELOG.md updated +- [ ] Documentation updated +- [ ] Version tag follows semver +- [ ] Tag pushed to origin +- [ ] TestPyPI verification passed (for major releases) +- [ ] PyPI package visible +- [ ] GitHub Release created +- [ ] Announcement posted (if applicable) + +## Maintainers + +See [MAINTAINERS.md](./MAINTAINERS.md) for the list of release maintainers. 
diff --git a/docs/api/configuration.md b/docs/api/configuration.md new file mode 100644 index 0000000..7fac10e --- /dev/null +++ b/docs/api/configuration.md @@ -0,0 +1,422 @@ +# Configuration API Reference + +## BotanuConfig + +Dataclass for SDK configuration. + +```python +from botanu.sdk.config import BotanuConfig +``` + +### Fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `service_name` | `str` | `"unknown_service"` | Service name (from `OTEL_SERVICE_NAME`) | +| `service_version` | `str` | `None` | Service version (from `OTEL_SERVICE_VERSION`) | +| `service_namespace` | `str` | `None` | Service namespace (from `OTEL_SERVICE_NAMESPACE`) | +| `deployment_environment` | `str` | `"production"` | Environment (from `OTEL_DEPLOYMENT_ENVIRONMENT` or `BOTANU_ENVIRONMENT`) | +| `auto_detect_resources` | `bool` | `True` | Auto-detect cloud resources | +| `otlp_endpoint` | `str` | `"http://localhost:4318/v1/traces"` | OTLP endpoint | +| `otlp_headers` | `dict` | `None` | Custom headers for OTLP exporter | +| `max_export_batch_size` | `int` | `512` | Max spans per batch | +| `max_queue_size` | `int` | `2048` | Max spans in queue | +| `schedule_delay_millis` | `int` | `5000` | Delay between batch exports | +| `trace_sample_rate` | `float` | `1.0` | Sampling rate (1.0 = 100%) | +| `propagation_mode` | `str` | `"lean"` | `"lean"` or `"full"` | +| `auto_instrument_packages` | `list` | `[...]` | Packages to auto-instrument | + +### Constructor + +```python +config = BotanuConfig( + service_name="my-service", + deployment_environment="production", + otlp_endpoint="http://collector:4318/v1/traces", +) +``` + +### Class Methods + +#### from_yaml() + +Load configuration from a YAML file. 
+ +```python +@classmethod +def from_yaml(cls, path: Optional[str] = None) -> BotanuConfig +``` + +**Parameters:** +- `path`: Path to YAML config file + +**Raises:** +- `FileNotFoundError`: If config file doesn't exist +- `ValueError`: If YAML is malformed +- `ImportError`: If PyYAML is not installed + +**Example:** + +```python +config = BotanuConfig.from_yaml("config/botanu.yaml") +``` + +#### from_file_or_env() + +Load config from file if exists, otherwise use environment variables. + +```python +@classmethod +def from_file_or_env(cls, path: Optional[str] = None) -> BotanuConfig +``` + +**Search order:** +1. Explicit `path` argument +2. `BOTANU_CONFIG_FILE` environment variable +3. `./botanu.yaml` +4. `./botanu.yml` +5. `./config/botanu.yaml` +6. `./config/botanu.yml` +7. Falls back to environment-only config + +**Example:** + +```python +# Auto-discovers config file +config = BotanuConfig.from_file_or_env() + +# Explicit path +config = BotanuConfig.from_file_or_env("my-config.yaml") +``` + +### Instance Methods + +#### to_dict() + +Export configuration as dictionary. + +```python +def to_dict(self) -> Dict[str, Any] +``` + +**Example:** + +```python +config = BotanuConfig(service_name="my-service") +print(config.to_dict()) +# { +# "service": {"name": "my-service", ...}, +# "otlp": {"endpoint": "...", ...}, +# ... 
+# } +``` + +--- + +## YAML Configuration Format + +### Full Schema + +```yaml +service: + name: string # Service name + version: string # Service version + namespace: string # Service namespace + environment: string # Deployment environment + +resource: + auto_detect: boolean # Auto-detect cloud resources + +otlp: + endpoint: string # OTLP endpoint URL + headers: # Custom headers + header-name: value + +export: + batch_size: integer # Max spans per batch + queue_size: integer # Max spans in queue + delay_ms: integer # Delay between exports + +sampling: + rate: float # Sampling rate (0.0-1.0) + +propagation: + mode: string # "lean" or "full" + +auto_instrument_packages: # List of packages to instrument + - package_name +``` + +### Environment Variable Interpolation + +```yaml +service: + name: ${OTEL_SERVICE_NAME:-default-service} + environment: ${ENVIRONMENT} + +otlp: + endpoint: ${COLLECTOR_URL:-http://localhost:4318}/v1/traces + headers: + Authorization: Bearer ${API_TOKEN} +``` + +Syntax: +- `${VAR_NAME}` - Required variable +- `${VAR_NAME:-default}` - Variable with default value + +--- + +## enable() + +Bootstrap function to initialize the SDK. 
+ +```python +from botanu import enable + +enable( + service_name: Optional[str] = None, + otlp_endpoint: Optional[str] = None, + config: Optional[BotanuConfig] = None, + auto_instrument: bool = True, + auto_instrument_packages: Optional[List[str]] = None, + propagation_mode: Optional[str] = None, + **kwargs: Any, +) -> None +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `service_name` | `str` | From env | Service name | +| `otlp_endpoint` | `str` | From env | OTLP endpoint URL | +| `config` | `BotanuConfig` | `None` | Pre-built configuration | +| `auto_instrument` | `bool` | `True` | Enable auto-instrumentation | +| `auto_instrument_packages` | `list` | `None` | Override default packages | +| `propagation_mode` | `str` | `None` | `"lean"` or `"full"` | +| `**kwargs` | `Any` | `{}` | Additional config fields | + +### Behavior + +1. Creates/merges `BotanuConfig` +2. Configures `TracerProvider` with `RunContextEnricher` +3. Sets up OTLP exporter (if SDK extras installed) +4. Enables auto-instrumentation (if requested) +5. Configures W3C Baggage propagation + +### Examples + +#### Minimal + +```python +from botanu import enable + +enable(service_name="my-service") +``` + +#### With Config Object + +```python +from botanu import enable +from botanu.sdk.config import BotanuConfig + +config = BotanuConfig.from_yaml("config/botanu.yaml") +enable(config=config) +``` + +#### Custom Options + +```python +enable( + service_name="my-service", + otlp_endpoint="http://collector:4318/v1/traces", + auto_instrument_packages=["fastapi", "openai_v2"], + propagation_mode="full", +) +``` + +--- + +## disable() + +Disable the SDK and clean up resources. + +```python +from botanu import disable + +disable() -> None +``` + +### Behavior + +1. Flushes pending spans +2. Shuts down span processors +3. Disables instrumentation + +--- + +## is_enabled() + +Check if the SDK is currently enabled. 
+ +```python +from botanu import is_enabled + +is_enabled() -> bool +``` + +### Example + +```python +if not is_enabled(): + enable(service_name="my-service") +``` + +--- + +## Environment Variables + +### OpenTelemetry Standard + +| Variable | Description | Default | +|----------|-------------|---------| +| `OTEL_SERVICE_NAME` | Service name | `"unknown_service"` | +| `OTEL_SERVICE_VERSION` | Service version | None | +| `OTEL_SERVICE_NAMESPACE` | Service namespace | None | +| `OTEL_DEPLOYMENT_ENVIRONMENT` | Deployment environment | `"production"` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP base endpoint | `"http://localhost:4318"` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP traces endpoint (full URL) | None | +| `OTEL_EXPORTER_OTLP_HEADERS` | OTLP headers (key=value pairs) | None | + +### Botanu-Specific + +| Variable | Description | Default | +|----------|-------------|---------| +| `BOTANU_ENVIRONMENT` | Fallback for environment | `"production"` | +| `BOTANU_PROPAGATION_MODE` | `"lean"` or `"full"` | `"lean"` | +| `BOTANU_TRACE_SAMPLE_RATE` | Sampling rate (0.0-1.0) | `"1.0"` | +| `BOTANU_AUTO_DETECT_RESOURCES` | Auto-detect cloud resources | `"true"` | +| `BOTANU_CONFIG_FILE` | Path to YAML config file | None | + +--- + +## RunContext + +Model for run metadata. + +```python +from botanu.models.run_context import RunContext +``` + +### Class Methods + +#### create() + +Create a new run context. + +```python +@classmethod +def create( + cls, + use_case: str, + workflow: Optional[str] = None, + workflow_version: Optional[str] = None, + environment: Optional[str] = None, + tenant_id: Optional[str] = None, + parent_run_id: Optional[str] = None, + deadline_seconds: Optional[float] = None, +) -> RunContext +``` + +#### create_retry() + +Create a retry context from an original run. + +```python +@classmethod +def create_retry(cls, original: RunContext) -> RunContext +``` + +#### from_baggage() + +Reconstruct context from baggage dictionary. 
+ +```python +@classmethod +def from_baggage(cls, baggage: Dict[str, str]) -> Optional[RunContext] +``` + +### Instance Methods + +#### to_baggage_dict() + +Serialize to baggage format. + +```python +def to_baggage_dict(self, lean_mode: bool = True) -> Dict[str, str] +``` + +#### to_span_attributes() + +Serialize to span attributes. + +```python +def to_span_attributes(self) -> Dict[str, Any] +``` + +#### as_current() + +Context manager to set this as the current run. + +```python +def as_current(self) -> ContextManager +``` + +#### complete() + +Mark the run as complete. + +```python +def complete( + self, + status: RunStatus, + error_class: Optional[str] = None, +) -> None +``` + +### Fields + +| Field | Type | Description | +|-------|------|-------------| +| `run_id` | `str` | Unique UUIDv7 identifier | +| `root_run_id` | `str` | Root run ID (same as run_id for first attempt) | +| `use_case` | `str` | Business use case name | +| `workflow` | `str` | Workflow/function name | +| `workflow_version` | `str` | Version hash | +| `environment` | `str` | Deployment environment | +| `tenant_id` | `str` | Tenant identifier | +| `parent_run_id` | `str` | Parent run ID | +| `attempt` | `int` | Attempt number | +| `start_time` | `datetime` | Run start time | +| `outcome` | `RunOutcome` | Recorded outcome | + +--- + +## RunStatus + +Enum for run status. 
+ +```python +from botanu.models.run_context import RunStatus + +class RunStatus(Enum): + SUCCESS = "success" + FAILURE = "failure" + PARTIAL = "partial" +``` + +## See Also + +- [Configuration Guide](../getting-started/configuration.md) - Configuration how-to +- [Architecture](../concepts/architecture.md) - SDK design +- [Existing OTel Setup](../integration/existing-otel.md) - Integration patterns diff --git a/docs/api/decorators.md b/docs/api/decorators.md new file mode 100644 index 0000000..71e1b9a --- /dev/null +++ b/docs/api/decorators.md @@ -0,0 +1,208 @@ +# Decorators API Reference + +## @botanu_use_case + +The primary decorator for creating runs with automatic context propagation. + +```python +from botanu import botanu_use_case + +@botanu_use_case( + name: str, + workflow: Optional[str] = None, + *, + environment: Optional[str] = None, + tenant_id: Optional[str] = None, + auto_outcome_on_success: bool = True, + span_kind: SpanKind = SpanKind.SERVER, +) +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `name` | `str` | Required | Use case name (e.g., "Customer Support"). Low cardinality for grouping. | +| `workflow` | `str` | Function name | Workflow identifier. Defaults to the decorated function's qualified name. | +| `environment` | `str` | From env | Deployment environment (production, staging, etc.). | +| `tenant_id` | `str` | `None` | Tenant identifier for multi-tenant systems. | +| `auto_outcome_on_success` | `bool` | `True` | Automatically emit "success" outcome if function completes without exception. | +| `span_kind` | `SpanKind` | `SERVER` | OpenTelemetry span kind. | + +### Behavior + +1. **Generates UUIDv7 `run_id`** - Sortable, globally unique identifier +2. **Creates root span** - Named `botanu.run/{name}` +3. **Emits events** - `botanu.run.started` and `botanu.run.completed` +4. **Sets baggage** - Propagates context via W3C Baggage +5. 
**Records outcome** - On completion or exception + +### Examples + +#### Basic Usage + +```python +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id: str): + result = await process_ticket(ticket_id) + emit_outcome("success", value_type="tickets_resolved", value_amount=1) + return result +``` + +#### With All Parameters + +```python +@botanu_use_case( + name="Document Processing", + workflow="pdf_extraction", + environment="production", + tenant_id="acme-corp", + auto_outcome_on_success=False, + span_kind=SpanKind.CONSUMER, +) +async def process_document(doc_id: str): + ... +``` + +#### Sync Functions + +```python +@botanu_use_case("Batch Processing") +def process_batch(batch_id: str): + # Works with sync functions too + return process_items(batch_id) +``` + +### Span Attributes + +The decorator sets these span attributes: + +| Attribute | Source | +|-----------|--------| +| `botanu.run_id` | Generated UUIDv7 | +| `botanu.use_case` | `name` parameter | +| `botanu.workflow` | `workflow` parameter or function name | +| `botanu.workflow_version` | SHA256 hash of function source | +| `botanu.environment` | `environment` parameter or env var | +| `botanu.tenant_id` | `tenant_id` parameter (if provided) | +| `botanu.parent_run_id` | Parent run ID (if nested) | + +### Alias + +`use_case` is an alias for `botanu_use_case`: + +```python +from botanu import use_case + +@use_case("My Use Case") +async def my_function(): + ... +``` + +--- + +## @botanu_outcome + +Convenience decorator for sub-functions to emit outcomes based on success/failure. + +```python +from botanu import botanu_outcome + +@botanu_outcome( + success: Optional[str] = None, + partial: Optional[str] = None, + failed: Optional[str] = None, +) +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `success` | `str` | `None` | Custom label for success outcome (reserved for future use). 
| +| `partial` | `str` | `None` | Custom label for partial outcome (reserved for future use). | +| `failed` | `str` | `None` | Custom label for failed outcome (reserved for future use). | + +### Behavior + +- **Does NOT create a new run** - Works within an existing run +- **Emits "success"** if function completes without exception +- **Emits "failed"** with exception class name if exception raised +- **Skips emission** if outcome already set on current span + +### Example + +```python +from botanu import botanu_use_case, botanu_outcome + +@botanu_use_case("Data Pipeline") +async def run_pipeline(): + await extract_data() + await transform_data() + await load_data() + +@botanu_outcome() +async def extract_data(): + # Emits "success" on completion + return await fetch_from_source() + +@botanu_outcome() +async def transform_data(): + # Emits "failed" with reason if exception + return await apply_transformations() +``` + +--- + +## Function Signatures + +### Async Support + +Both decorators support async and sync functions: + +```python +# Async +@botanu_use_case("Async Use Case") +async def async_handler(): + await do_work() + +# Sync +@botanu_use_case("Sync Use Case") +def sync_handler(): + do_work() +``` + +### Return Values + +Decorated functions preserve their return values: + +```python +@botanu_use_case("Processing") +async def process(data) -> ProcessResult: + return ProcessResult(status="complete", items=100) + +result = await process(data) +assert isinstance(result, ProcessResult) +``` + +### Exception Handling + +Exceptions are recorded and re-raised: + +```python +@botanu_use_case("Risky Operation") +async def risky(): + raise ValueError("Something went wrong") + +try: + await risky() +except ValueError: + # Exception is re-raised after recording + pass +``` + +## See Also + +- [Quickstart](../getting-started/quickstart.md) - Getting started +- [Run Context](../concepts/run-context.md) - Understanding runs +- [Outcomes](../tracking/outcomes.md) - Recording 
outcomes diff --git a/docs/api/tracking.md b/docs/api/tracking.md new file mode 100644 index 0000000..dcd35f7 --- /dev/null +++ b/docs/api/tracking.md @@ -0,0 +1,511 @@ +# Tracking API Reference + +## LLM Tracking + +### track_llm_call() + +Context manager for tracking LLM/model calls. + +```python +from botanu.tracking.llm import track_llm_call + +with track_llm_call( + provider: str, + model: str, + operation: str = ModelOperation.CHAT, + client_request_id: Optional[str] = None, + **kwargs: Any, +) -> Generator[LLMTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `provider` | `str` | Required | LLM provider (openai, anthropic, etc.) | +| `model` | `str` | Required | Model name/ID (gpt-4, claude-3-opus, etc.) | +| `operation` | `str` | `"chat"` | Operation type (see ModelOperation) | +| `client_request_id` | `str` | `None` | Your tracking ID | +| `**kwargs` | `Any` | `{}` | Additional span attributes | + +#### Returns + +Yields an `LLMTracker` instance. + +#### Example + +```python +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await client.chat.completions.create(...) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) + tracker.set_request_id(response.id) +``` + +--- + +### LLMTracker + +Tracker object for recording LLM call details. + +#### Methods + +##### set_tokens() + +```python +def set_tokens( + input_tokens: int = 0, + output_tokens: int = 0, + cached_tokens: int = 0, + cache_read_tokens: int = 0, + cache_write_tokens: int = 0, +) -> LLMTracker +``` + +Records token usage. + +##### set_request_id() + +```python +def set_request_id( + provider_request_id: Optional[str] = None, + client_request_id: Optional[str] = None, +) -> LLMTracker +``` + +Records request IDs for billing reconciliation. 
+ +##### set_response_model() + +```python +def set_response_model(model: str) -> LLMTracker +``` + +Records the actual model used in response. + +##### set_finish_reason() + +```python +def set_finish_reason(reason: str) -> LLMTracker +``` + +Records the stop reason (stop, length, content_filter, etc.). + +##### set_streaming() + +```python +def set_streaming(is_streaming: bool = True) -> LLMTracker +``` + +Marks request as streaming. + +##### set_cache_hit() + +```python +def set_cache_hit(cache_hit: bool = True) -> LLMTracker +``` + +Marks as a cache hit. + +##### set_attempt() + +```python +def set_attempt(attempt_number: int) -> LLMTracker +``` + +Sets retry attempt number. + +##### set_request_params() + +```python +def set_request_params( + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, +) -> LLMTracker +``` + +Records request parameters. + +##### set_error() + +```python +def set_error(error: Exception) -> LLMTracker +``` + +Records an error. + +##### add_metadata() + +```python +def add_metadata(**kwargs: Any) -> LLMTracker +``` + +Adds custom span attributes. + +--- + +### track_tool_call() + +Context manager for tracking tool/function calls. + +```python +from botanu.tracking.llm import track_tool_call + +with track_tool_call( + tool_name: str, + tool_call_id: Optional[str] = None, + provider: Optional[str] = None, + **kwargs: Any, +) -> Generator[ToolTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `tool_name` | `str` | Required | Name of the tool/function | +| `tool_call_id` | `str` | `None` | Tool call ID from LLM response | +| `provider` | `str` | `None` | Tool provider if external | + +--- + +### ModelOperation + +Constants for operation types. 
+ +| Constant | Value | +|----------|-------| +| `CHAT` | `"chat"` | +| `TEXT_COMPLETION` | `"text_completion"` | +| `EMBEDDINGS` | `"embeddings"` | +| `GENERATE_CONTENT` | `"generate_content"` | +| `EXECUTE_TOOL` | `"execute_tool"` | +| `CREATE_AGENT` | `"create_agent"` | +| `INVOKE_AGENT` | `"invoke_agent"` | +| `RERANK` | `"rerank"` | +| `IMAGE_GENERATION` | `"image_generation"` | +| `SPEECH_TO_TEXT` | `"speech_to_text"` | +| `TEXT_TO_SPEECH` | `"text_to_speech"` | + +--- + +## Data Tracking + +### track_db_operation() + +Context manager for tracking database operations. + +```python +from botanu.tracking.data import track_db_operation + +with track_db_operation( + system: str, + operation: str, + database: Optional[str] = None, + **kwargs: Any, +) -> Generator[DBTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `system` | `str` | Required | Database system (postgresql, mongodb, etc.) | +| `operation` | `str` | Required | Operation type (SELECT, INSERT, etc.) 
| +| `database` | `str` | `None` | Database name | + +#### Example + +```python +with track_db_operation(system="postgresql", operation="SELECT") as db: + result = await cursor.execute(query) + db.set_result(rows_returned=len(result)) +``` + +--- + +### DBTracker + +#### Methods + +##### set_result() + +```python +def set_result( + rows_returned: int = 0, + rows_affected: int = 0, + bytes_read: int = 0, + bytes_written: int = 0, +) -> DBTracker +``` + +##### set_table() + +```python +def set_table(table_name: str, schema: Optional[str] = None) -> DBTracker +``` + +##### set_query_id() + +```python +def set_query_id(query_id: str) -> DBTracker +``` + +##### set_bytes_scanned() + +```python +def set_bytes_scanned(bytes_scanned: int) -> DBTracker +``` + +##### set_error() + +```python +def set_error(error: Exception) -> DBTracker +``` + +##### add_metadata() + +```python +def add_metadata(**kwargs: Any) -> DBTracker +``` + +--- + +### track_storage_operation() + +Context manager for tracking object storage operations. + +```python +from botanu.tracking.data import track_storage_operation + +with track_storage_operation( + system: str, + operation: str, + **kwargs: Any, +) -> Generator[StorageTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `system` | `str` | Required | Storage system (s3, gcs, azure_blob, etc.) | +| `operation` | `str` | Required | Operation type (GET, PUT, DELETE, etc.) 
| + +--- + +### StorageTracker + +#### Methods + +##### set_result() + +```python +def set_result( + objects_count: int = 0, + bytes_read: int = 0, + bytes_written: int = 0, +) -> StorageTracker +``` + +##### set_bucket() + +```python +def set_bucket(bucket: str) -> StorageTracker +``` + +##### set_error() + +```python +def set_error(error: Exception) -> StorageTracker +``` + +##### add_metadata() + +```python +def add_metadata(**kwargs: Any) -> StorageTracker +``` + +--- + +### track_messaging_operation() + +Context manager for tracking messaging operations. + +```python +from botanu.tracking.data import track_messaging_operation + +with track_messaging_operation( + system: str, + operation: str, + destination: str, + **kwargs: Any, +) -> Generator[MessagingTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `system` | `str` | Required | Messaging system (sqs, kafka, pubsub, etc.) | +| `operation` | `str` | Required | Operation type (publish, consume, etc.) | +| `destination` | `str` | Required | Queue/topic name | + +--- + +### MessagingTracker + +#### Methods + +##### set_result() + +```python +def set_result( + message_count: int = 0, + bytes_transferred: int = 0, +) -> MessagingTracker +``` + +##### set_error() + +```python +def set_error(error: Exception) -> MessagingTracker +``` + +##### add_metadata() + +```python +def add_metadata(**kwargs: Any) -> MessagingTracker +``` + +--- + +## Span Helpers + +### emit_outcome() + +Emit a business outcome for the current span. 
+ +```python +from botanu import emit_outcome + +emit_outcome( + status: str, + *, + value_type: Optional[str] = None, + value_amount: Optional[float] = None, + confidence: Optional[float] = None, + reason: Optional[str] = None, +) -> None +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `status` | `str` | Required | Outcome status ("success", "partial", "failed") | +| `value_type` | `str` | `None` | Type of business value achieved | +| `value_amount` | `float` | `None` | Quantified value amount | +| `confidence` | `float` | `None` | Confidence score (0.0-1.0) | +| `reason` | `str` | `None` | Reason for the outcome | + +#### Example + +```python +emit_outcome("success", value_type="tickets_resolved", value_amount=1) +emit_outcome("failed", reason="rate_limit_exceeded") +``` + +--- + +### set_business_context() + +Set business context attributes on the current span. + +```python +from botanu import set_business_context + +set_business_context( + *, + customer_id: Optional[str] = None, + team: Optional[str] = None, + cost_center: Optional[str] = None, + region: Optional[str] = None, +) -> None +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `customer_id` | `str` | `None` | Customer identifier | +| `team` | `str` | `None` | Team or department | +| `cost_center` | `str` | `None` | Cost center for financial tracking | +| `region` | `str` | `None` | Geographic region | + +--- + +## Context Helpers + +### get_run_id() + +Get the current run ID from baggage. + +```python +from botanu import get_run_id + +run_id = get_run_id() +``` + +### get_use_case() + +Get the current use case from baggage. + +```python +from botanu import get_use_case + +use_case = get_use_case() +``` + +### get_baggage() + +Get a baggage value by key. 
+ +```python +from botanu import get_baggage + +value = get_baggage("botanu.tenant_id") +``` + +### set_baggage() + +Set a baggage value. + +```python +from botanu import set_baggage + +set_baggage("botanu.custom_field", "my_value") +``` + +### get_current_span() + +Get the current active span. + +```python +from botanu import get_current_span + +span = get_current_span() +span.set_attribute("custom.attribute", "value") +``` + +## See Also + +- [LLM Tracking](../tracking/llm-tracking.md) - Detailed LLM tracking guide +- [Data Tracking](../tracking/data-tracking.md) - Data operation tracking +- [Outcomes](../tracking/outcomes.md) - Outcome recording diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md new file mode 100644 index 0000000..2d87ccb --- /dev/null +++ b/docs/concepts/architecture.md @@ -0,0 +1,265 @@ +# Architecture + +Botanu SDK follows a "thin SDK, smart collector" architecture. The SDK does minimal work in your application's hot path, delegating heavy processing to the OpenTelemetry Collector. + +## Design Principles + +### 1. Minimal Hot-Path Overhead + +The SDK only performs lightweight operations during request processing: +- Generate UUIDv7 `run_id` +- Read/write W3C Baggage +- Record token counts as span attributes + +**Target overhead**: < 0.5ms per request + +### 2. OTel-Native + +Built on OpenTelemetry primitives, not alongside them: +- Uses standard `TracerProvider` +- Standard `SpanProcessor` for enrichment +- Standard OTLP export +- W3C Baggage for propagation + +### 3. 
Collector-Side Processing + +Heavy operations happen in the OTel Collector: +- PII redaction +- Cost calculation from token counts +- Vendor normalization +- Cardinality management +- Aggregation and sampling + +## Component Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Your Application │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ @botanu_use_ │ │ track_llm_ │ │ track_db_ │ │ +│ │ case() │ │ call() │ │ operation() │ │ +│ └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ │ +│ └──────────────────────┼──────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌───────────────────────────────────────────────────────────────────────┐ │ +│ │ Botanu SDK Core │ │ +│ ├───────────────────────────────────────────────────────────────────────┤ │ +│ │ RunContext │ RunContextEnricher │ BotanuConfig │ │ +│ │ - generate_run_id() │ - on_start() │ - service_name │ │ +│ │ - to_baggage_dict() │ - reads baggage │ - otlp_endpoint │ │ +│ │ - to_span_attrs() │ - writes to spans │ - propagation_mode │ │ +│ └───────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌───────────────────────────────────────────────────────────────────────┐ │ +│ │ OpenTelemetry SDK │ │ +│ │ TracerProvider → BatchSpanProcessor → OTLPSpanExporter │ │ +│ └───────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ OTLP (HTTP or gRPC) + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ OpenTelemetry Collector │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ receivers: │ +│ otlp: │ +│ │ +│ processors: │ +│ transform: # Normalize vendor names │ +│ redaction: # Remove PII from gen_ai.content.* │ +│ attributes: # Cardinality limits │ +│ 
botanu/cost: # Calculate $ from tokens │ +│ │ +│ exporters: │ +│ clickhouse: # Or your preferred backend │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## SDK Components + +### BotanuConfig + +Central configuration for the SDK: + +```python +@dataclass +class BotanuConfig: + service_name: str + deployment_environment: str + otlp_endpoint: str + propagation_mode: str # "lean" or "full" + auto_instrument_packages: List[str] +``` + +### RunContext + +Holds run metadata and provides serialization: + +```python +@dataclass +class RunContext: + run_id: str + root_run_id: str + use_case: str + workflow: Optional[str] + attempt: int + # ... +``` + +### RunContextEnricher + +The only span processor in the SDK. Reads baggage, writes to spans: + +```python +class RunContextEnricher(SpanProcessor): + def on_start(self, span, parent_context): + for key in self._baggage_keys: + value = baggage.get_baggage(key, parent_context) + if value: + span.set_attribute(key, value) +``` + +### Tracking Helpers + +Context managers for manual instrumentation: + +- `track_llm_call()` - LLM/model operations +- `track_db_operation()` - Database operations +- `track_storage_operation()` - Object storage operations +- `track_messaging_operation()` - Message queue operations + +## Data Flow + +### 1. Run Initiation + +```python +@botanu_use_case("Customer Support") +def handle_ticket(): + pass +``` + +1. Generate UUIDv7 `run_id` +2. Create `RunContext` +3. Set baggage in current context +4. Start root span with run attributes + +### 2. Context Propagation + +```python +# Within the run +response = requests.get("https://api.example.com") +``` + +1. HTTP instrumentation reads current context +2. Baggage is injected into request headers +3. Downstream service extracts baggage +4. Context continues propagating + +### 3. Span Enrichment + +Every span (including auto-instrumented): + +1. `RunContextEnricher.on_start()` is called +2. 
Reads `botanu.run_id` from baggage +3. Writes to span attributes +4. Span is exported with run context + +### 4. Export and Processing + +1. `BatchSpanProcessor` batches spans +2. `OTLPSpanExporter` sends to collector +3. Collector processes (cost calc, PII redaction) +4. Spans written to backend + +## Why This Architecture? + +### SDK Stays Thin + +| Operation | Location | Reason | +|-----------|----------|--------| +| run_id generation | SDK | Must be synchronous | +| Baggage propagation | SDK | Process-local | +| Token counting | SDK | Available at call site | +| Cost calculation | Collector | Pricing tables change | +| PII redaction | Collector | Consistent policy | +| Aggregation | Collector | Reduces data volume | + +### No Vendor Lock-in + +- Standard OTel export format +- Any OTel-compatible backend works +- Collector processors are configurable + +### Minimal Dependencies + +Core SDK only requires `opentelemetry-api`: + +```toml +dependencies = [ + "opentelemetry-api >= 1.20.0", +] +``` + +Full SDK adds export capabilities: + +```toml +[project.optional-dependencies] +sdk = [ + "opentelemetry-sdk >= 1.20.0", + "opentelemetry-exporter-otlp-proto-http >= 1.20.0", +] +``` + +## Integration Points + +### Existing TracerProvider + +If you already have OTel configured: + +```python +from opentelemetry import trace +from botanu.processors.enricher import RunContextEnricher + +# Add our processor to your existing provider +provider = trace.get_tracer_provider() +provider.add_span_processor(RunContextEnricher()) +``` + +### Existing Instrumentation + +Botanu works alongside existing instrumentation: + +```python +# Your existing setup +from opentelemetry.instrumentation.requests import RequestsInstrumentor +RequestsInstrumentor().instrument() + +# Add Botanu +from botanu import init_botanu +init_botanu(service_name="my-service") + +# Both work together - requests are instrumented AND get run_id +``` + +## Performance Characteristics + +| Operation | Typical Latency 
| +|-----------|-----------------| +| `generate_run_id()` | < 0.01ms | +| `RunContextEnricher.on_start()` | < 0.05ms | +| `track_llm_call()` overhead | < 0.1ms | +| Baggage injection | < 0.01ms | + +Total SDK overhead per request: **< 0.5ms** + +## See Also + +- [Run Context](run-context.md) - RunContext model details +- [Context Propagation](context-propagation.md) - How context flows +- [Collector Configuration](../integration/collector.md) - Collector setup diff --git a/docs/concepts/context-propagation.md b/docs/concepts/context-propagation.md new file mode 100644 index 0000000..80bf319 --- /dev/null +++ b/docs/concepts/context-propagation.md @@ -0,0 +1,239 @@ +# Context Propagation + +Context propagation ensures that the `run_id` and other metadata flow through your entire application — across function calls, HTTP requests, message queues, and async workers. + +## How It Works + +Botanu uses **W3C Baggage** for context propagation, the same standard used by OpenTelemetry for distributed tracing. + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ HTTP Request Headers │ +├─────────────────────────────────────────────────────────────────┤ +│ traceparent: 00-{trace_id}-{span_id}-01 │ +│ baggage: botanu.run_id=019abc12...,botanu.use_case=Support │ +└─────────────────────────────────────────────────────────────────┘ +``` + +When you make an outbound HTTP request, the `botanu.run_id` travels in the `baggage` header alongside the trace context. 
+ +## Propagation Modes + +### Lean Mode (Default) + +Only propagates essential fields to minimize header size: +- `botanu.run_id` +- `botanu.use_case` + +```python +# Lean mode baggage (~100 bytes) +baggage: botanu.run_id=019abc12-def3-7890-abcd-1234567890ab,botanu.use_case=Customer%20Support +``` + +### Full Mode + +Propagates all context fields: +- `botanu.run_id` +- `botanu.use_case` +- `botanu.workflow` +- `botanu.environment` +- `botanu.tenant_id` +- `botanu.parent_run_id` + +```python +# Enable full mode +import os +os.environ["BOTANU_PROPAGATION_MODE"] = "full" +``` + +## In-Process Propagation + +Within a single process, context is propagated via Python's `contextvars`: + +```python +from botanu import botanu_use_case + +@botanu_use_case("Customer Support") +def handle_ticket(ticket_id: str): + # Context is set here + + fetch_context(ticket_id) # Inherits context + call_llm() # Inherits context + save_result() # Inherits context +``` + +The `RunContextEnricher` span processor automatically reads baggage and writes to span attributes: + +```python +class RunContextEnricher(SpanProcessor): + def on_start(self, span, parent_context): + for key in ["botanu.run_id", "botanu.use_case"]: + value = baggage.get_baggage(key, parent_context) + if value: + span.set_attribute(key, value) +``` + +This ensures **every span** — including auto-instrumented ones — gets the `run_id`. 
+ +## HTTP Propagation + +### Outbound Requests + +When using instrumented HTTP clients (`requests`, `httpx`, `urllib3`), baggage is automatically propagated: + +```python +import requests + +@botanu_use_case("Fetch Data") +def fetch_data(): + # Baggage is automatically added to headers + response = requests.get("https://api.example.com/data") +``` + +### Inbound Requests (Frameworks) + +For web frameworks (`FastAPI`, `Flask`, `Django`), use the middleware to extract context: + +```python +# FastAPI +from botanu.sdk.middleware import BotanuMiddleware + +app = FastAPI() +app.add_middleware(BotanuMiddleware) + +@app.post("/tickets") +def create_ticket(request: Request): + # RunContext is extracted from incoming baggage + # or created if not present + pass +``` + +## Message Queue Propagation + +For async messaging systems, you need to manually inject and extract context. + +### Injecting Context (Producer) + +```python +from botanu.sdk.context import get_current_run_context + +def publish_message(queue, payload): + ctx = get_current_run_context() + + message = { + "payload": payload, + "metadata": { + "baggage": ctx.to_baggage_dict() if ctx else {} + } + } + queue.publish(message) +``` + +### Extracting Context (Consumer) + +```python +from botanu.models.run_context import RunContext + +def process_message(message): + baggage = message.get("metadata", {}).get("baggage", {}) + ctx = RunContext.from_baggage(baggage) + + if ctx: + # Continue with existing context + with ctx.as_current(): + handle_message(message["payload"]) + else: + # Create new context + with RunContext.create(use_case="Message Processing").as_current(): + handle_message(message["payload"]) +``` + +## Cross-Service Propagation + +``` +┌──────────────┐ HTTP ┌──────────────┐ Kafka ┌──────────────┐ +│ Service A │ ────────────► │ Service B │ ────────────► │ Service C │ +│ │ baggage: │ │ message │ │ +│ run_id=X │ run_id=X │ run_id=X │ run_id=X │ run_id=X │ +└──────────────┘ └──────────────┘ 
└──────────────┘ +``` + +The same `run_id` flows through all services, enabling: +- End-to-end cost attribution +- Cross-service trace correlation +- Distributed debugging + +## Baggage Size Limits + +W3C Baggage has practical size limits. The SDK uses lean mode by default to stay well under these limits: + +| Mode | Typical Size | Recommendation | +|------|--------------|----------------| +| Lean | ~100 bytes | Use for most cases | +| Full | ~300 bytes | Use when you need all context downstream | + +## Propagation and Auto-Instrumentation + +The SDK works seamlessly with OTel auto-instrumentation: + +```python +from botanu import init_botanu + +init_botanu( + service_name="my-service", + auto_instrument=True, # Enable auto-instrumentation +) +``` + +Auto-instrumented libraries will automatically propagate baggage: +- `requests`, `httpx`, `urllib3` (HTTP clients) +- `fastapi`, `flask`, `django` (Web frameworks) +- `celery` (Task queues) +- `grpc` (gRPC) + +## Debugging Propagation + +### Check Current Context + +```python +from botanu.sdk.context import get_baggage, get_run_id + +run_id = get_run_id() +print(f"Current run_id: {run_id}") + +use_case = get_baggage("botanu.use_case") +print(f"Current use_case: {use_case}") +``` + +### Verify Header Propagation + +```python +# In your HTTP client +import httpx + +def debug_request(): + with httpx.Client() as client: + response = client.get( + "https://httpbin.org/headers", + ) + print(response.json()) + # Check for 'baggage' header in response +``` + +## Common Issues + +### Context Not Propagating + +1. **Missing initialization**: Ensure `init_botanu()` is called at startup +2. **Missing middleware**: Add `BotanuMiddleware` to your web framework +3. **Async context loss**: Use `contextvars`-aware async patterns + +### Duplicate run_ids + +1. **Multiple decorators**: Only use `@botanu_use_case` at the entry point +2. 
**Middleware + decorator**: Choose one, not both + +## See Also + +- [Run Context](run-context.md) - Understanding the RunContext model +- [Architecture](architecture.md) - Overall SDK architecture diff --git a/docs/concepts/run-context.md b/docs/concepts/run-context.md new file mode 100644 index 0000000..436be03 --- /dev/null +++ b/docs/concepts/run-context.md @@ -0,0 +1,188 @@ +# Run Context + +The Run Context is the core concept in Botanu SDK. It represents a single business transaction or workflow execution that you want to track for cost attribution. + +## What is a Run? + +A **run** is a logical unit of work that produces a business outcome. Examples: + +- Resolving a customer support ticket +- Processing a document +- Generating a report +- Handling a chatbot conversation + +A single run may involve: +- Multiple LLM calls (possibly to different providers) +- Database queries +- Storage operations +- External API calls +- Message queue operations + +## The run_id + +Every run is identified by a unique `run_id` — a UUIDv7 that is: + +- **Time-sortable**: IDs generated later sort after earlier ones +- **Globally unique**: No collisions across services +- **Propagated automatically**: Flows through your entire application via W3C Baggage + +```python +from botanu.models.run_context import generate_run_id + +run_id = generate_run_id() +# "019abc12-def3-7890-abcd-1234567890ab" +``` + +## RunContext Model + +The `RunContext` dataclass holds all metadata for a run: + +```python +from botanu.models.run_context import RunContext + +ctx = RunContext.create( + use_case="Customer Support", + workflow="handle_ticket", + environment="production", + tenant_id="tenant-123", +) + +print(ctx.run_id) # "019abc12-def3-7890-..." 
+print(ctx.root_run_id) # Same as run_id for top-level runs +print(ctx.attempt) # 1 (first attempt) +``` + +### Key Fields + +| Field | Description | +|-------|-------------| +| `run_id` | Unique identifier for this run (UUIDv7) | +| `root_run_id` | ID of the original run (for retries, same as `run_id` for first attempt) | +| `use_case` | Business use case name (e.g., "Customer Support") | +| `workflow` | Optional workflow/function name | +| `environment` | Deployment environment (production, staging, etc.) | +| `attempt` | Attempt number (1 for first, 2+ for retries) | +| `tenant_id` | Optional tenant identifier for multi-tenant systems | + +## Creating Runs + +### Using the Decorator (Recommended) + +```python +from botanu import botanu_use_case + +@botanu_use_case("Customer Support") +def handle_ticket(ticket_id: str): + # RunContext is automatically created and propagated + # All operations inside inherit the same run_id + pass +``` + +### Manual Creation + +```python +from botanu.models.run_context import RunContext + +ctx = RunContext.create( + use_case="Document Processing", + workflow="extract_entities", + tenant_id="acme-corp", +) + +# Use ctx.to_baggage_dict() to propagate via HTTP headers +# Use ctx.to_span_attributes() to add to spans +``` + +## Retry Handling + +When a run fails and is retried, use `create_retry()` to maintain lineage: + +```python +original = RunContext.create(use_case="Process Order") + +# First attempt fails... 
+ +retry = RunContext.create_retry(original) +print(retry.attempt) # 2 +print(retry.retry_of_run_id) # Original run_id +print(retry.root_run_id) # Same as original.run_id +print(retry.run_id) # New unique ID +``` + +This enables: +- Tracking total attempts for a business operation +- Correlating retries back to the original request +- Calculating aggregate cost across all attempts + +## Deadlines and Cancellation + +RunContext supports deadline and cancellation tracking: + +```python +ctx = RunContext.create( + use_case="Long Running Task", + deadline_seconds=30.0, # 30 second deadline +) + +# Check deadline +if ctx.is_past_deadline(): + raise TimeoutError("Deadline exceeded") + +# Check remaining time +remaining = ctx.remaining_time_seconds() + +# Request cancellation +ctx.request_cancellation(reason="user") +if ctx.is_cancelled(): + # Clean up and exit + pass +``` + +## Serialization + +### To Baggage (for HTTP propagation) + +```python +# Lean mode (default): only run_id and use_case +baggage = ctx.to_baggage_dict() +# {"botanu.run_id": "...", "botanu.use_case": "..."} + +# Full mode: all fields +baggage = ctx.to_baggage_dict(lean_mode=False) +# Includes workflow, environment, tenant_id, etc. +``` + +### To Span Attributes + +```python +attrs = ctx.to_span_attributes() +# {"botanu.run_id": "...", "botanu.use_case": "...", ...} +``` + +### From Baggage (receiving side) + +```python +ctx = RunContext.from_baggage(baggage_dict) +if ctx is None: + # Required fields missing, create new context + ctx = RunContext.create(use_case="Unknown") +``` + +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `BOTANU_ENVIRONMENT` | Default environment | `"production"` | +| `BOTANU_PROPAGATION_MODE` | `"lean"` or `"full"` | `"lean"` | + +## Best Practices + +1. **One run per business outcome**: Don't create runs for internal operations +2. **Use descriptive use_case names**: They appear in dashboards and queries +3. 
**Leverage tenant_id**: Essential for multi-tenant cost attribution +4. **Handle retries properly**: Always use `create_retry()` for retry attempts + +## See Also + +- [Context Propagation](context-propagation.md) - How context flows through your application +- [Outcomes](../tracking/outcomes.md) - Recording business outcomes diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md new file mode 100644 index 0000000..902387b --- /dev/null +++ b/docs/getting-started/configuration.md @@ -0,0 +1,288 @@ +# Configuration + +Botanu SDK can be configured through code, environment variables, or YAML files. + +## Configuration Precedence + +1. **Code arguments** (explicit values passed to `BotanuConfig`) +2. **Environment variables** (`BOTANU_*`, `OTEL_*`) +3. **YAML config file** (`botanu.yaml` or specified path) +4. **Built-in defaults** + +## Quick Configuration + +### Code-Based + +```python +from botanu import enable + +enable( + service_name="my-service", + otlp_endpoint="http://collector:4318/v1/traces", +) +``` + +### Environment Variables + +```bash +export OTEL_SERVICE_NAME=my-service +export OTEL_EXPORTER_OTLP_ENDPOINT=http://collector:4318 +export BOTANU_ENVIRONMENT=production +``` + +### YAML File + +```yaml +# botanu.yaml +service: + name: my-service + version: 1.0.0 + environment: production + +otlp: + endpoint: http://collector:4318/v1/traces + +propagation: + mode: lean +``` + +Load with: + +```python +from botanu.sdk.config import BotanuConfig + +config = BotanuConfig.from_yaml("botanu.yaml") +``` + +## Full Configuration Reference + +### BotanuConfig Fields + +```python +from dataclasses import dataclass + +@dataclass +class BotanuConfig: + # Service identification + service_name: str = None # OTEL_SERVICE_NAME + service_version: str = None # OTEL_SERVICE_VERSION + service_namespace: str = None # OTEL_SERVICE_NAMESPACE + deployment_environment: str = None # OTEL_DEPLOYMENT_ENVIRONMENT + + # Resource detection + 
auto_detect_resources: bool = True # BOTANU_AUTO_DETECT_RESOURCES + + # OTLP exporter + otlp_endpoint: str = None # OTEL_EXPORTER_OTLP_ENDPOINT + otlp_headers: dict = None # Custom headers for auth + + # Span export + max_export_batch_size: int = 512 + max_queue_size: int = 2048 + schedule_delay_millis: int = 5000 + + # Sampling (1.0 = 100%) + trace_sample_rate: float = 1.0 # BOTANU_TRACE_SAMPLE_RATE + + # Propagation mode + propagation_mode: str = "lean" # BOTANU_PROPAGATION_MODE + + # Auto-instrumentation + auto_instrument_packages: list = [...] +``` + +## Environment Variables + +### OpenTelemetry Standard Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `OTEL_SERVICE_NAME` | Service name | `unknown_service` | +| `OTEL_SERVICE_VERSION` | Service version | None | +| `OTEL_SERVICE_NAMESPACE` | Service namespace | None | +| `OTEL_DEPLOYMENT_ENVIRONMENT` | Environment name | `production` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP collector base URL | `http://localhost:4318` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP traces endpoint (full URL) | None | + +### Botanu-Specific Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `BOTANU_ENVIRONMENT` | Fallback for environment | `production` | +| `BOTANU_PROPAGATION_MODE` | `lean` or `full` | `lean` | +| `BOTANU_TRACE_SAMPLE_RATE` | Sampling rate (0.0-1.0) | `1.0` | +| `BOTANU_AUTO_DETECT_RESOURCES` | Auto-detect cloud resources | `true` | +| `BOTANU_CONFIG_FILE` | Path to YAML config | None | + +## YAML Configuration + +### Full Example + +```yaml +# botanu.yaml - Full configuration example +service: + name: ${OTEL_SERVICE_NAME:-my-service} + version: ${APP_VERSION:-1.0.0} + namespace: production + environment: ${ENVIRONMENT:-production} + +resource: + auto_detect: true + +otlp: + endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT:-http://localhost:4318}/v1/traces + headers: + Authorization: Bearer ${OTLP_AUTH_TOKEN} + +export: + batch_size: 512 
+ queue_size: 2048 + delay_ms: 5000 + +sampling: + rate: 1.0 + +propagation: + mode: lean + +auto_instrument_packages: + - requests + - httpx + - fastapi + - sqlalchemy + - openai_v2 +``` + +### Environment Variable Interpolation + +The YAML loader supports two interpolation patterns: + +```yaml +# Simple interpolation +endpoint: ${COLLECTOR_URL} + +# With default value +endpoint: ${COLLECTOR_URL:-http://localhost:4318} +``` + +### Loading Configuration + +```python +from botanu.sdk.config import BotanuConfig + +# Explicit path +config = BotanuConfig.from_yaml("config/botanu.yaml") + +# Auto-discover (searches botanu.yaml, config/botanu.yaml) +config = BotanuConfig.from_file_or_env() + +# Environment only +config = BotanuConfig() +``` + +## Propagation Modes + +### Lean Mode (Default) + +Propagates only essential fields to minimize header size: + +- `botanu.run_id` +- `botanu.use_case` + +Best for high-traffic systems where header size matters. + +### Full Mode + +Propagates all context fields: + +- `botanu.run_id` +- `botanu.use_case` +- `botanu.workflow` +- `botanu.environment` +- `botanu.tenant_id` +- `botanu.parent_run_id` + +Enable with: + +```bash +export BOTANU_PROPAGATION_MODE=full +``` + +Or: + +```python +enable(service_name="my-service", propagation_mode="full") +``` + +## Auto-Instrumentation + +### Default Packages + +By default, Botanu enables instrumentation for: + +```python +[ + # HTTP clients + "requests", "httpx", "urllib3", "aiohttp_client", + # Web frameworks + "fastapi", "flask", "django", "starlette", + # Databases + "sqlalchemy", "psycopg2", "asyncpg", "pymongo", "redis", + # Messaging + "celery", "kafka_python", + # gRPC + "grpc", + # GenAI + "openai_v2", "anthropic", "vertexai", "google_genai", "langchain", + # Runtime + "logging", +] +``` + +### Customizing Packages + +```python +from botanu import enable + +enable( + service_name="my-service", + auto_instrument_packages=["requests", "fastapi", "openai_v2"], +) +``` + +### Disabling 
Auto-Instrumentation + +```python +enable( + service_name="my-service", + auto_instrument_packages=[], # Empty list disables +) +``` + +## Sampling + +For cost attribution, **always use 100% sampling** (the default): + +```python +trace_sample_rate: float = 1.0 # Never miss a transaction +``` + +If you must sample, understand that cost calculations will be incomplete. + +## Exporting Configuration + +```python +config = BotanuConfig( + service_name="my-service", + deployment_environment="production", +) + +# Export as dictionary +print(config.to_dict()) +``` + +## See Also + +- [Architecture](../concepts/architecture.md) - SDK design principles +- [Collector Configuration](../integration/collector.md) - Collector setup +- [Existing OTel Setup](../integration/existing-otel.md) - Integration with existing OTel diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 0000000..f11f9b1 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,141 @@ +# Installation + +This guide covers installing Botanu SDK and its optional dependencies. + +## Requirements + +- Python 3.9 or later +- OpenTelemetry Collector (for span processing) + +## Basic Installation + +Install the core SDK with pip: + +```bash +pip install botanu +``` + +The core package has minimal dependencies: +- `opentelemetry-api >= 1.20.0` + +This is all you need if you already have OpenTelemetry configured in your application. 
+ +## Installation with Extras + +### Full SDK (Recommended for Standalone) + +If you don't have an existing OpenTelemetry setup: + +```bash +pip install "botanu[sdk]" +``` + +This adds: +- `opentelemetry-sdk` - The OTel SDK implementation +- `opentelemetry-exporter-otlp-proto-http` - OTLP HTTP exporter + +### Auto-Instrumentation + +For automatic instrumentation of common libraries: + +```bash +pip install "botanu[instruments]" +``` + +Includes instrumentation for: +- **HTTP clients**: requests, httpx, urllib3, aiohttp +- **Web frameworks**: FastAPI, Flask, Django, Starlette +- **Databases**: SQLAlchemy, psycopg2, asyncpg, pymongo, redis +- **Messaging**: Celery, Kafka +- **Other**: gRPC, logging + +### GenAI Instrumentation + +For automatic LLM provider instrumentation: + +```bash +pip install "botanu[genai]" +``` + +Includes instrumentation for: +- OpenAI +- Anthropic +- Google Vertex AI +- Google GenAI +- LangChain + +### Everything + +To install all optional dependencies: + +```bash +pip install "botanu[all]" +``` + +### Development + +For development and testing: + +```bash +pip install "botanu[dev]" +``` + +## Verify Installation + +```python +import botanu +print(botanu.__version__) +``` + +## Docker + +In a Dockerfile: + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# Install Botanu with SDK extras +RUN pip install "botanu[sdk]" + +COPY . . + +CMD ["python", "app.py"] +``` + +## Poetry + +```toml +[tool.poetry.dependencies] +botanu = { version = "^0.1.0", extras = ["sdk"] } +``` + +## pip-tools / requirements.txt + +```text +# requirements.in +botanu[sdk]>=0.1.0 +``` + +Generate with: +```bash +pip-compile requirements.in +``` + +## Collector Setup + +Botanu SDK sends traces to an OpenTelemetry Collector. You'll need one running to receive spans. + +Quick start with Docker: + +```bash +docker run -p 4318:4318 otel/opentelemetry-collector:latest +``` + +See [Collector Configuration](../integration/collector.md) for detailed setup. 
+ +## Next Steps + +- [Quickstart](quickstart.md) - Your first instrumented application +- [Configuration](configuration.md) - Customize SDK behavior diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md new file mode 100644 index 0000000..df9a510 --- /dev/null +++ b/docs/getting-started/quickstart.md @@ -0,0 +1,166 @@ +# Quickstart + +Get run-level cost attribution working in 5 minutes. + +## Prerequisites + +- Python 3.9+ +- Botanu SDK installed (`pip install "botanu[sdk]"`) +- OpenTelemetry Collector running (see [Collector Configuration](../integration/collector.md)) + +## Step 1: Enable the SDK + +At application startup, enable Botanu: + +```python +from botanu import enable + +enable(service_name="my-ai-service") +``` + +This: +- Configures OpenTelemetry with OTLP export +- Adds the `RunContextEnricher` span processor +- Enables W3C Baggage propagation + +## Step 2: Define a Use Case + +Wrap your entry point with `@botanu_use_case`: + +```python +from botanu import botanu_use_case, emit_outcome + +@botanu_use_case("Customer Support") +async def handle_support_ticket(ticket_id: str): + # Your business logic here + context = await fetch_ticket_context(ticket_id) + response = await generate_response(context) + await send_response(ticket_id, response) + + # Record the business outcome + emit_outcome("success", value_type="tickets_resolved", value_amount=1) + return response +``` + +Every operation inside this function (LLM calls, database queries, HTTP requests) will be automatically linked to the same `run_id`. 
+ +## Step 3: Track LLM Calls + +For manual LLM tracking (when auto-instrumentation isn't available): + +```python +from botanu.tracking.llm import track_llm_call + +@botanu_use_case("Document Analysis") +async def analyze_document(doc_id: str): + document = await fetch_document(doc_id) + + with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await openai.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": document}] + ) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) + tracker.set_request_id(response.id) + + emit_outcome("success", value_type="documents_analyzed", value_amount=1) + return response.choices[0].message.content +``` + +## Step 4: Track Data Operations + +Track database and storage operations for complete cost visibility: + +```python +from botanu.tracking.data import track_db_operation, track_storage_operation + +@botanu_use_case("Data Pipeline") +async def process_data(job_id: str): + # Track database reads + with track_db_operation(system="postgresql", operation="SELECT") as db: + rows = await fetch_records(job_id) + db.set_result(rows_returned=len(rows)) + + # Track storage writes + with track_storage_operation(system="s3", operation="PUT") as storage: + await upload_results(job_id, rows) + storage.set_result(bytes_written=len(rows) * 1024) + + emit_outcome("success", value_type="jobs_processed", value_amount=1) +``` + +## Complete Example + +```python +import asyncio +from botanu import enable, botanu_use_case, emit_outcome +from botanu.tracking.llm import track_llm_call +from botanu.tracking.data import track_db_operation + +# Initialize at startup +enable(service_name="support-bot") + +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id: str): + """Process a customer support ticket.""" + + # Fetch ticket from database (auto-tracked if using instrumented client) + with 
track_db_operation(system="postgresql", operation="SELECT") as db: + ticket = await db_client.fetch_ticket(ticket_id) + db.set_result(rows_returned=1) + + # Generate response with LLM + with track_llm_call(provider="openai", model="gpt-4") as llm: + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[ + {"role": "system", "content": "You are a helpful support agent."}, + {"role": "user", "content": ticket.description} + ] + ) + llm.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) + + # Save response (auto-tracked) + with track_db_operation(system="postgresql", operation="INSERT") as db: + await db_client.save_response(ticket_id, response.choices[0].message.content) + db.set_result(rows_affected=1) + + # Record business outcome + emit_outcome("success", value_type="tickets_resolved", value_amount=1) + + return response.choices[0].message.content + +# Run +asyncio.run(handle_ticket("TICKET-123")) +``` + +## What Gets Tracked + +After running, you'll have spans with: + +| Attribute | Value | Description | +|-----------|-------|-------------| +| `botanu.run_id` | `019abc12-...` | Unique run identifier (UUIDv7) | +| `botanu.use_case` | `Customer Support` | Business use case | +| `botanu.outcome` | `success` | Outcome status | +| `gen_ai.usage.input_tokens` | `150` | LLM input tokens | +| `gen_ai.usage.output_tokens` | `200` | LLM output tokens | +| `gen_ai.provider.name` | `openai` | LLM provider | +| `db.system` | `postgresql` | Database system | + +All spans share the same `run_id`, enabling: +- Total cost per business transaction +- Cost breakdown by component +- Cost-per-outcome analytics + +## Next Steps + +- [Configuration](configuration.md) - Environment variables and YAML config +- [LLM Tracking](../tracking/llm-tracking.md) - Detailed LLM instrumentation +- [Context Propagation](../concepts/context-propagation.md) - Cross-service tracing diff --git a/docs/index.md 
b/docs/index.md index ca76599..3c2ea2b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,29 +1,62 @@ # Botanu SDK Documentation -Botanu SDK is an OpenTelemetry-native library for run-level cost attribution in AI workflows. +Botanu SDK provides OpenTelemetry-native run-level cost attribution for AI workflows. -## Quick Start +## Overview -### Installation +Traditional observability tools trace individual requests. But AI workflows are different — a single business outcome (resolving a support ticket, processing an order) might span multiple LLM calls, retries, tool executions, and data operations across different vendors. -```bash -pip install botanu -``` +Botanu introduces **run-level attribution**: a unique `run_id` that follows your entire workflow, enabling you to answer "How much did this outcome cost?" -For full SDK capabilities with OTLP export: +## Documentation -```bash -pip install botanu[sdk] -``` +### Getting Started + +- [Installation](getting-started/installation.md) - Install and configure the SDK +- [Quick Start](getting-started/quickstart.md) - Get up and running in 5 minutes +- [Configuration](getting-started/configuration.md) - Configuration options and environment variables + +### Core Concepts + +- [Run Context](concepts/run-context.md) - Understanding `run_id` and context propagation +- [Context Propagation](concepts/context-propagation.md) - How context flows through your application +- [Architecture](concepts/architecture.md) - SDK design and component overview + +### Tracking + +- [LLM Tracking](tracking/llm-tracking.md) - Track AI model calls and token usage +- [Data Tracking](tracking/data-tracking.md) - Track database, storage, and messaging operations +- [Outcomes](tracking/outcomes.md) - Record business outcomes for ROI calculation + +### Integration + +- [Auto-Instrumentation](integration/auto-instrumentation.md) - Automatic instrumentation for common libraries +- [Existing OTel Setup](integration/existing-otel.md) - Integrate with 
existing OpenTelemetry deployments +- [Collector Configuration](integration/collector.md) - Configure the OpenTelemetry Collector + +### Patterns -### Basic Usage +- [Best Practices](patterns/best-practices.md) - Recommended patterns for production use +- [Anti-Patterns](patterns/anti-patterns.md) - Common mistakes to avoid + +### API Reference + +- [Decorators](api/decorators.md) - `@botanu_use_case` and related decorators +- [Tracking API](api/tracking.md) - Manual tracking context managers +- [Configuration API](api/configuration.md) - `BotanuConfig` and initialization + +## Quick Example ```python -from botanu import botanu_use_case +from botanu import enable, botanu_use_case from botanu.tracking.llm import track_llm_call +# Initialize once at startup +enable(service_name="support-agent") + @botanu_use_case("Customer Support") def handle_ticket(ticket_id: str): + # Every operation inside gets the same run_id with track_llm_call(provider="openai", model="gpt-4") as tracker: response = openai.chat.completions.create(...) tracker.set_tokens( @@ -31,25 +64,8 @@ def handle_ticket(ticket_id: str): output_tokens=response.usage.completion_tokens, ) return response - -# Every span within handle_ticket is tagged with botanu.run_id -result = handle_ticket("TICKET-123") ``` -## Features - -- **Run-level Attribution**: Track costs per business transaction, not just per request -- **OpenTelemetry Native**: Built on OTel standards for maximum interoperability -- **Minimal Overhead**: Lightweight SDK with heavy processing in the collector -- **Multi-provider Support**: Works with OpenAI, Anthropic, Bedrock, Vertex AI, and more - -## Documentation - -- [Configuration](configuration.md) -- [LLM Tracking](llm-tracking.md) -- [Data Tracking](data-tracking.md) -- [API Reference](api-reference.md) - ## License -Apache License 2.0 +Apache License 2.0. See [LICENSE](https://github.com/botanu-ai/botanu-sdk-python/blob/main/LICENSE). 
diff --git a/docs/integration/auto-instrumentation.md b/docs/integration/auto-instrumentation.md new file mode 100644 index 0000000..3d4f0b3 --- /dev/null +++ b/docs/integration/auto-instrumentation.md @@ -0,0 +1,303 @@ +# Auto-Instrumentation + +Automatically instrument common libraries for seamless tracing. + +## Overview + +Botanu leverages OpenTelemetry's auto-instrumentation ecosystem. When enabled, your HTTP clients, web frameworks, databases, and LLM providers are automatically traced without code changes. + +## Enabling Auto-Instrumentation + +```python +from botanu import enable + +enable( + service_name="my-service", + auto_instrument=True, # Default +) +``` + +Or with specific packages: + +```python +enable( + service_name="my-service", + auto_instrument_packages=["requests", "fastapi", "openai_v2"], +) +``` + +## Supported Libraries + +### HTTP Clients + +| Library | Package | Notes | +|---------|---------|-------| +| requests | `opentelemetry-instrumentation-requests` | Sync HTTP | +| httpx | `opentelemetry-instrumentation-httpx` | Sync/async HTTP | +| urllib3 | `opentelemetry-instrumentation-urllib3` | Low-level HTTP | +| aiohttp | `opentelemetry-instrumentation-aiohttp-client` | Async HTTP | + +### Web Frameworks + +| Framework | Package | Notes | +|-----------|---------|-------| +| FastAPI | `opentelemetry-instrumentation-fastapi` | ASGI framework | +| Flask | `opentelemetry-instrumentation-flask` | WSGI framework | +| Django | `opentelemetry-instrumentation-django` | Full-stack framework | +| Starlette | `opentelemetry-instrumentation-starlette` | ASGI toolkit | + +### Databases + +| Database | Package | Notes | +|----------|---------|-------| +| SQLAlchemy | `opentelemetry-instrumentation-sqlalchemy` | ORM/Core | +| psycopg2 | `opentelemetry-instrumentation-psycopg2` | PostgreSQL | +| asyncpg | `opentelemetry-instrumentation-asyncpg` | Async PostgreSQL | +| pymongo | `opentelemetry-instrumentation-pymongo` | MongoDB | +| redis | 
`opentelemetry-instrumentation-redis` | Redis | + +### Messaging + +| System | Package | Notes | +|--------|---------|-------| +| Celery | `opentelemetry-instrumentation-celery` | Task queue | +| kafka-python | `opentelemetry-instrumentation-kafka-python` | Kafka client | + +### GenAI / LLM Providers + +| Provider | Package | Notes | +|----------|---------|-------| +| OpenAI | `opentelemetry-instrumentation-openai-v2` | ChatGPT, GPT-4 | +| Anthropic | `opentelemetry-instrumentation-anthropic` | Claude | +| Vertex AI | `opentelemetry-instrumentation-vertexai` | Google Vertex | +| Google GenAI | `opentelemetry-instrumentation-google-genai` | Gemini | +| LangChain | `opentelemetry-instrumentation-langchain` | LangChain | + +### Other + +| Library | Package | Notes | +|---------|---------|-------| +| gRPC | `opentelemetry-instrumentation-grpc` | RPC framework | +| logging | `opentelemetry-instrumentation-logging` | Python logging | + +## Installation + +Install the instrumentation packages you need: + +```bash +# Full suite +pip install "botanu[instruments,genai]" + +# Or individual packages +pip install opentelemetry-instrumentation-fastapi +pip install opentelemetry-instrumentation-openai-v2 +``` + +## How It Works + +1. **At startup**, Botanu calls each instrumentor's `instrument()` method +2. **Instrumented libraries** automatically create spans for operations +3. **RunContextEnricher** adds `run_id` to every span via baggage +4. **All spans** are linked to the current run, enabling cost attribution + +```python +from botanu import enable, botanu_use_case + +enable(service_name="my-service") + +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id: str): + # requests.get() automatically creates a span with run_id + context = requests.get(f"https://api.example.com/tickets/{ticket_id}") + + # OpenAI call automatically creates a span with tokens, model, etc. 
+ response = await openai.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": context.text}] + ) + + return response +``` + +## Context Propagation + +Auto-instrumented HTTP clients automatically propagate context: + +```python +@botanu_use_case("Distributed Workflow") +async def orchestrate(): + # Baggage (run_id, use_case) is injected into request headers + response = requests.get("https://service-b.example.com/process") + # Service B extracts baggage and continues the trace +``` + +Headers injected: +``` +traceparent: 00-{trace_id}-{span_id}-01 +baggage: botanu.run_id=019abc12...,botanu.use_case=Distributed%20Workflow +``` + +## Customizing Instrumentation + +### Exclude Specific Endpoints + +```python +from opentelemetry.instrumentation.requests import RequestsInstrumentor + +# Exclude health checks from tracing +RequestsInstrumentor().instrument( + excluded_urls=["health", "metrics"] +) +``` + +### Add Request/Response Hooks + +```python +def request_hook(span, request): + span.set_attribute("http.request.custom_header", request.headers.get("X-Custom")) + +def response_hook(span, request, response): + span.set_attribute("http.response.custom_header", response.headers.get("X-Custom")) + +RequestsInstrumentor().instrument( + request_hook=request_hook, + response_hook=response_hook, +) +``` + +## GenAI Instrumentation Details + +### OpenAI + +Automatically captures: +- Model name and parameters +- Token usage (input, output, cached) +- Request/response IDs +- Streaming status +- Tool/function calls + +```python +# Automatically traced +response = await openai.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": "Hello"}] +) +``` + +Span attributes: +``` +gen_ai.operation.name: chat +gen_ai.provider.name: openai +gen_ai.request.model: gpt-4 +gen_ai.usage.input_tokens: 10 +gen_ai.usage.output_tokens: 25 +``` + +### Anthropic + +Automatically captures: +- Model and version +- Token usage with cache breakdown 
+- Stop reason + +```python +# Automatically traced +response = await anthropic.messages.create( + model="claude-3-opus-20240229", + messages=[{"role": "user", "content": "Hello"}] +) +``` + +### LangChain + +Traces the full chain execution: + +```python +# Each step is traced +chain = prompt | llm | parser +result = await chain.ainvoke({"input": "Hello"}) +``` + +## Combining with Manual Tracking + +Auto-instrumentation works alongside manual tracking: + +```python +from botanu import botanu_use_case, emit_outcome +from botanu.tracking.llm import track_llm_call + +@botanu_use_case("Hybrid Workflow") +async def hybrid_example(): + # Auto-instrumented HTTP call + data = requests.get("https://api.example.com/data") + + # Manual tracking for custom provider + with track_llm_call(provider="custom-llm", model="my-model") as tracker: + response = await custom_llm_call(data.json()) + tracker.set_tokens(input_tokens=100, output_tokens=200) + + # Auto-instrumented database call + await database.execute("INSERT INTO results VALUES (?)", response) + + emit_outcome("success") +``` + +## Disabling Auto-Instrumentation + +### Completely Disable + +```python +enable( + service_name="my-service", + auto_instrument_packages=[], # Empty list +) +``` + +### Disable Specific Libraries + +```python +enable( + service_name="my-service", + auto_instrument_packages=["fastapi", "openai_v2"], # Only these +) +``` + +## Troubleshooting + +### Spans Not Appearing + +1. Check the library is installed: + ```bash + pip list | grep opentelemetry-instrumentation + ``` + +2. Verify instrumentation is enabled: + ```python + from opentelemetry.instrumentation.requests import RequestsInstrumentor + print(RequestsInstrumentor().is_instrumented_by_opentelemetry) + ``` + +3. 
Ensure `enable()` is called before library imports: + ```python + from botanu import enable + enable(service_name="my-service") + + # Import after enable() + import requests + ``` + +### Context Not Propagating + +Check that baggage propagator is configured: + +```python +from opentelemetry import propagate +print(propagate.get_global_textmap()) +# Should include W3CBaggagePropagator +``` + +## See Also + +- [Existing OTel Setup](existing-otel.md) - Integration with existing OTel +- [Collector Configuration](collector.md) - Collector setup +- [Context Propagation](../concepts/context-propagation.md) - How context flows diff --git a/docs/integration/collector.md b/docs/integration/collector.md new file mode 100644 index 0000000..ed85df9 --- /dev/null +++ b/docs/integration/collector.md @@ -0,0 +1,422 @@ +# Collector Configuration + +Set up the OpenTelemetry Collector for cost attribution processing. + +## Overview + +Botanu follows a "thin SDK, smart collector" architecture. The SDK captures raw telemetry; the collector handles: + +- **PII redaction** - Remove sensitive data from prompts/responses +- **Cost calculation** - Convert tokens to dollars using pricing tables +- **Vendor normalization** - Standardize provider names +- **Cardinality management** - Limit high-cardinality attributes +- **Aggregation** - Pre-aggregate metrics for dashboards + +## Quick Start + +### Docker + +```bash +docker run -p 4318:4318 -p 4317:4317 \ + -v $(pwd)/otel-config.yaml:/etc/otelcol/config.yaml \ + otel/opentelemetry-collector-contrib:latest +``` + +### Docker Compose + +```yaml +services: + collector: + image: otel/opentelemetry-collector-contrib:latest + ports: + - "4318:4318" # OTLP HTTP + - "4317:4317" # OTLP gRPC + volumes: + - ./otel-config.yaml:/etc/otelcol/config.yaml +``` + +## Basic Configuration + +```yaml +# otel-config.yaml +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + +processors: + batch: + send_batch_size: 
1000 + timeout: 10s + +exporters: + debug: + verbosity: detailed + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug] +``` + +## Cost Attribution Configuration + +### Full Pipeline + +```yaml +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + +processors: + # Batch for efficiency + batch: + send_batch_size: 1000 + timeout: 10s + + # Normalize vendor names + transform/vendor: + trace_statements: + - context: span + statements: + # Normalize provider names to standard format + - set(attributes["botanu.vendor"], "openai") where attributes["gen_ai.provider.name"] == "openai" + - set(attributes["botanu.vendor"], "anthropic") where attributes["gen_ai.provider.name"] == "anthropic" + - set(attributes["botanu.vendor"], "azure.openai") where attributes["gen_ai.provider.name"] == "azure.openai" + - set(attributes["botanu.vendor"], "gcp.vertex_ai") where attributes["gen_ai.provider.name"] == "gcp.vertex_ai" + - set(attributes["botanu.vendor"], "aws.bedrock") where attributes["gen_ai.provider.name"] == "aws.bedrock" + + # Calculate costs from tokens + transform/cost: + trace_statements: + - context: span + statements: + # GPT-4 pricing (example: $30/$60 per 1M tokens) + - set(attributes["botanu.cost.input_usd"], + attributes["gen_ai.usage.input_tokens"] * 0.00003) + where attributes["gen_ai.request.model"] == "gpt-4" + - set(attributes["botanu.cost.output_usd"], + attributes["gen_ai.usage.output_tokens"] * 0.00006) + where attributes["gen_ai.request.model"] == "gpt-4" + + # GPT-4 Turbo pricing ($10/$30 per 1M tokens) + - set(attributes["botanu.cost.input_usd"], + attributes["gen_ai.usage.input_tokens"] * 0.00001) + where attributes["gen_ai.request.model"] == "gpt-4-turbo" + - set(attributes["botanu.cost.output_usd"], + attributes["gen_ai.usage.output_tokens"] * 0.00003) + where attributes["gen_ai.request.model"] == "gpt-4-turbo" + + # Claude 3 Opus pricing ($15/$75 per 1M tokens) + - 
set(attributes["botanu.cost.input_usd"], + attributes["gen_ai.usage.input_tokens"] * 0.000015) + where attributes["gen_ai.request.model"] == "claude-3-opus-20240229" + - set(attributes["botanu.cost.output_usd"], + attributes["gen_ai.usage.output_tokens"] * 0.000075) + where attributes["gen_ai.request.model"] == "claude-3-opus-20240229" + + # Calculate total + - set(attributes["botanu.cost.total_usd"], + attributes["botanu.cost.input_usd"] + attributes["botanu.cost.output_usd"]) + where attributes["botanu.cost.input_usd"] != nil + + # PII redaction for prompts/responses + redaction: + allow_all_keys: true + blocked_values: + # Email addresses + - "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b" + # Phone numbers + - "\\b\\d{3}[-.]?\\d{3}[-.]?\\d{4}\\b" + # SSN + - "\\b\\d{3}-\\d{2}-\\d{4}\\b" + # Credit card numbers + - "\\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13})\\b" + + # Cardinality limits + attributes: + actions: + - key: botanu.run_id + action: hash + # Keep first 16 chars of hash to reduce cardinality if needed + - key: gen_ai.content.prompt + action: delete + # Remove raw prompts (keep tokens for cost) + +exporters: + # ClickHouse for analytics + clickhouse: + endpoint: tcp://clickhouse:9000 + database: botanu + ttl: 90d + create_schema: true + + # Also send to your APM + otlp/apm: + endpoint: https://your-apm.example.com + headers: + Authorization: Bearer ${APM_TOKEN} + +service: + pipelines: + traces: + receivers: [otlp] + processors: + - batch + - transform/vendor + - transform/cost + - redaction + - attributes + exporters: [clickhouse, otlp/apm] +``` + +## PII Redaction + +### Using Redaction Processor + +```yaml +processors: + redaction: + allow_all_keys: true + blocked_values: + # Redact common PII patterns + - "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b" # Email + - "\\b\\d{3}[-.]?\\d{3}[-.]?\\d{4}\\b" # Phone + - "\\b\\d{3}-\\d{2}-\\d{4}\\b" # SSN + summary: debug # Log redaction summary +``` + +### Using 
Transform Processor + +```yaml +processors: + transform/pii: + trace_statements: + - context: span + statements: + # Remove prompt content entirely + - delete(attributes["gen_ai.content.prompt"]) + - delete(attributes["gen_ai.content.completion"]) + + # Or replace with placeholder + - replace_pattern(attributes["gen_ai.content.prompt"], + "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b", + "[REDACTED_EMAIL]") +``` + +## Pricing Tables + +Maintain pricing in the collector config: + +```yaml +processors: + transform/cost: + trace_statements: + - context: span + statements: + # OpenAI pricing (as of 2024) + # GPT-4 + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.00003) + where attributes["gen_ai.request.model"] == "gpt-4" or attributes["gen_ai.request.model"] == "gpt-4-0613" + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.00006) + where attributes["gen_ai.request.model"] == "gpt-4" or attributes["gen_ai.request.model"] == "gpt-4-0613" + + # GPT-4 Turbo + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.00001) + where IsMatch(attributes["gen_ai.request.model"], "gpt-4-turbo.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.00003) + where IsMatch(attributes["gen_ai.request.model"], "gpt-4-turbo.*") + + # GPT-4o + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.000005) + where IsMatch(attributes["gen_ai.request.model"], "gpt-4o.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.000015) + where IsMatch(attributes["gen_ai.request.model"], "gpt-4o.*") + + # GPT-3.5 Turbo + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.0000005) + where IsMatch(attributes["gen_ai.request.model"], "gpt-3.5-turbo.*") + - set(attributes["botanu.cost.output_usd"], 
attributes["gen_ai.usage.output_tokens"] * 0.0000015) + where IsMatch(attributes["gen_ai.request.model"], "gpt-3.5-turbo.*") + + # Claude 3 Opus + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.000015) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-opus.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.000075) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-opus.*") + + # Claude 3 Sonnet + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.000003) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-sonnet.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.000015) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-sonnet.*") + + # Claude 3 Haiku + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.00000025) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-haiku.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.00000125) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-haiku.*") + + # Total cost + - set(attributes["botanu.cost.total_usd"], + attributes["botanu.cost.input_usd"] + attributes["botanu.cost.output_usd"]) + where attributes["botanu.cost.input_usd"] != nil and attributes["botanu.cost.output_usd"] != nil +``` + +## Backend Exporters + +### ClickHouse + +```yaml +exporters: + clickhouse: + endpoint: tcp://clickhouse:9000 + database: botanu + username: default + password: ${CLICKHOUSE_PASSWORD} + ttl: 90d + create_schema: true + logs_table_name: otel_logs + traces_table_name: otel_traces + metrics_table_name: otel_metrics +``` + +### PostgreSQL (via OTLP) + +Use the collector to forward to a service that writes to PostgreSQL: + +```yaml +exporters: + otlp: + endpoint: http://postgres-writer:4317 +``` + +### Prometheus (Metrics) + +```yaml 
+exporters:
+  prometheus:
+    endpoint: 0.0.0.0:8889
+    namespace: botanu
+```
+
+### Grafana Tempo
+
+```yaml
+exporters:
+  otlp:
+    endpoint: tempo:4317
+    tls:
+      insecure: true
+```
+
+## Sampling
+
+For cost attribution, avoid sampling. If you must sample:
+
+```yaml
+processors:
+  probabilistic_sampler:
+    sampling_percentage: 100  # Keep 100% for cost attribution
+
+  # Or sample only non-LLM spans
+  tail_sampling:
+    decision_wait: 10s
+    policies:
+      # Always keep LLM calls
+      - name: always-sample-llm
+        type: string_attribute
+        string_attribute:
+          key: gen_ai.operation.name
+          values: [chat, text_completion, embeddings]
+
+      # Sample other spans at 10%
+      - name: sample-other
+        type: probabilistic
+        probabilistic:
+          sampling_percentage: 10
+```
+
+## High Availability
+
+### Load Balancing
+
+```yaml
+# collector-1.yaml
+receivers:
+  otlp:
+    protocols:
+      http:
+        endpoint: 0.0.0.0:4318
+
+exporters:
+  loadbalancing:
+    protocol:
+      otlp:
+        tls:
+          insecure: true
+    resolver:
+      dns:
+        hostname: collector-pool.svc.cluster.local
+        port: 4317
+```
+
+### Kubernetes Deployment
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: otel-collector
+spec:
+  replicas: 3
+  selector:
+    matchLabels:
+      app: otel-collector
+  template:
+    metadata:
+      labels:
+        app: otel-collector  # must match spec.selector.matchLabels
+    spec:
+      containers:
+      - name: collector
+        image: otel/opentelemetry-collector-contrib:latest  # pin a specific version in production
+        ports:
+        - containerPort: 4318
+        - containerPort: 4317
+        volumeMounts:
+        - name: config
+          mountPath: /etc/otelcol
+      volumes:
+      - name: config
+        configMap:
+          name: otel-collector-config
+```
+
+## Monitoring the Collector
+
+Enable internal telemetry:
+
+```yaml
+service:
+  telemetry:
+    logs:
+      level: info
+    metrics:
+      level: detailed
+      address: 0.0.0.0:8888
+```
+
+Access metrics at `http://collector:8888/metrics`. 
+ +## See Also + +- [Architecture](../concepts/architecture.md) - SDK architecture +- [Auto-Instrumentation](auto-instrumentation.md) - Library instrumentation +- [Best Practices](../patterns/best-practices.md) - Configuration patterns diff --git a/docs/integration/existing-otel.md b/docs/integration/existing-otel.md new file mode 100644 index 0000000..a008cdb --- /dev/null +++ b/docs/integration/existing-otel.md @@ -0,0 +1,295 @@ +# Existing OpenTelemetry Setup + +Integrate Botanu with your existing OpenTelemetry configuration. + +## Overview + +If you already have OpenTelemetry configured (via Datadog, Splunk, New Relic, or custom setup), Botanu integrates seamlessly. You only need to add the `RunContextEnricher` span processor. + +## Minimal Integration + +Add just the span processor to your existing provider: + +```python +from opentelemetry import trace +from botanu.processors.enricher import RunContextEnricher + +# Your existing TracerProvider +provider = trace.get_tracer_provider() + +# Add Botanu's enricher +provider.add_span_processor(RunContextEnricher()) +``` + +That's it. All spans will now receive `run_id` from baggage. 
+ +## With Existing Instrumentation + +Botanu works alongside any existing instrumentation: + +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.requests import RequestsInstrumentor + +from botanu.processors.enricher import RunContextEnricher + +# Your existing setup +provider = TracerProvider() +provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) +trace.set_tracer_provider(provider) + +# Your existing instrumentation +RequestsInstrumentor().instrument() + +# Add Botanu enricher (order doesn't matter) +provider.add_span_processor(RunContextEnricher()) +``` + +## With Datadog + +```python +from ddtrace import tracer +from ddtrace.opentelemetry import TracerProvider +from opentelemetry import trace + +from botanu.processors.enricher import RunContextEnricher + +# Datadog's TracerProvider +provider = TracerProvider() +trace.set_tracer_provider(provider) + +# Add Botanu enricher +provider.add_span_processor(RunContextEnricher()) +``` + +## With Splunk + +```python +from splunk_otel.tracing import start_tracing +from opentelemetry import trace + +from botanu.processors.enricher import RunContextEnricher + +# Start Splunk tracing +start_tracing() + +# Add Botanu enricher +provider = trace.get_tracer_provider() +provider.add_span_processor(RunContextEnricher()) +``` + +## With New Relic + +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + +from botanu.processors.enricher import RunContextEnricher + +# New Relic OTLP endpoint +provider = TracerProvider() +provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + 
endpoint="https://otlp.nr-data.net/v1/traces", + headers={"api-key": "YOUR_LICENSE_KEY"}, + ) + ) +) +trace.set_tracer_provider(provider) + +# Add Botanu enricher +provider.add_span_processor(RunContextEnricher()) +``` + +## With Jaeger + +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.jaeger.thrift import JaegerExporter + +from botanu.processors.enricher import RunContextEnricher + +# Jaeger setup +provider = TracerProvider() +provider.add_span_processor( + BatchSpanProcessor( + JaegerExporter( + agent_host_name="localhost", + agent_port=6831, + ) + ) +) +trace.set_tracer_provider(provider) + +# Add Botanu enricher +provider.add_span_processor(RunContextEnricher()) +``` + +## Multiple Exporters + +Send to both your APM and a cost-attribution backend: + +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + +from botanu.processors.enricher import RunContextEnricher + +provider = TracerProvider() + +# Your APM (e.g., Datadog) +provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter(endpoint="https://your-apm.example.com/v1/traces") + ) +) + +# Botanu collector for cost attribution +provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter(endpoint="http://botanu-collector:4318/v1/traces") + ) +) + +# Botanu enricher (adds run_id to all spans) +provider.add_span_processor(RunContextEnricher()) + +trace.set_tracer_provider(provider) +``` + +## How RunContextEnricher Works + +The enricher reads baggage and writes to span attributes: + +```python +class RunContextEnricher(SpanProcessor): + def on_start(self, span, parent_context): + # Read run_id from baggage + run_id = baggage.get_baggage("botanu.run_id", 
parent_context) + if run_id: + span.set_attribute("botanu.run_id", run_id) + + # Read use_case from baggage + use_case = baggage.get_baggage("botanu.use_case", parent_context) + if use_case: + span.set_attribute("botanu.use_case", use_case) +``` + +This means: +- Every span gets `run_id` if it exists in baggage +- Auto-instrumented spans are enriched automatically +- No code changes needed in your existing instrumentation + +## Using Botanu Decorators + +With the enricher in place, use Botanu decorators: + +```python +from botanu import botanu_use_case, emit_outcome + +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id: str): + # All spans created here (by any instrumentation) get run_id + context = requests.get(f"/api/tickets/{ticket_id}") + response = await openai_call(context) + await database.save(response) + + emit_outcome("success", value_type="tickets_resolved", value_amount=1) +``` + +## Without Botanu Bootstrap + +If you don't want to use `enable()`, manually set up propagation: + +```python +from opentelemetry import propagate +from opentelemetry.propagators.composite import CompositePropagator +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator +from opentelemetry.baggage.propagation import W3CBaggagePropagator + +# Ensure baggage propagation is enabled +propagate.set_global_textmap( + CompositePropagator([ + TraceContextTextMapPropagator(), + W3CBaggagePropagator(), + ]) +) +``` + +## Verifying Integration + +Check that run_id appears on spans: + +```python +from opentelemetry import trace, baggage, context + +# Set baggage (normally done by @botanu_use_case) +ctx = baggage.set_baggage("botanu.run_id", "test-123") +token = context.attach(ctx) + +try: + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span") as span: + # Check attribute was set + print(span.attributes.get("botanu.run_id")) # Should print "test-123" +finally: + context.detach(token) +``` + +## Processor 
Order + +Span processors are called in order. The enricher should be added after your span exporters: + +```python +# 1. Exporters (send spans to backends) +provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) + +# 2. Enrichers (modify spans before export) +provider.add_span_processor(RunContextEnricher()) +``` + +However, `RunContextEnricher` uses `on_start()`, so it runs before export regardless. + +## Troubleshooting + +### run_id Not Appearing + +1. Check enricher is added: + ```python + provider = trace.get_tracer_provider() + # Verify RunContextEnricher is in the list + ``` + +2. Check baggage is set: + ```python + from opentelemetry import baggage + print(baggage.get_baggage("botanu.run_id")) + ``` + +3. Ensure `@botanu_use_case` is used at entry points + +### Baggage Not Propagating + +Check propagators are configured: +```python +from opentelemetry import propagate +print(propagate.get_global_textmap()) +``` + +Should include `W3CBaggagePropagator`. + +## See Also + +- [Auto-Instrumentation](auto-instrumentation.md) - Library instrumentation +- [Collector Configuration](collector.md) - Collector setup +- [Architecture](../concepts/architecture.md) - SDK design diff --git a/docs/patterns/anti-patterns.md b/docs/patterns/anti-patterns.md new file mode 100644 index 0000000..04a66b1 --- /dev/null +++ b/docs/patterns/anti-patterns.md @@ -0,0 +1,510 @@ +# Anti-Patterns + +Common mistakes to avoid when using Botanu SDK. + +## Run Design Anti-Patterns + +### Creating Runs for Internal Operations + +**Don't** create runs for internal functions: + +```python +# BAD - Too many runs +@botanu_use_case("Fetch Context") # Don't do this +async def fetch_context(ticket_id): + return await db.query(...) + +@botanu_use_case("Generate Response") # Or this +async def generate_response(context): + return await llm.complete(...) 
+ +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id): + context = await fetch_context(ticket_id) + response = await generate_response(context) + return response +``` + +**Do** use a single run at the entry point: + +```python +# GOOD - One run for the business outcome +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id): + context = await fetch_context(ticket_id) # Not decorated + response = await generate_response(context) # Not decorated + emit_outcome("success", value_type="tickets_resolved", value_amount=1) + return response +``` + +### Nesting @botanu_use_case Decorators + +**Don't** nest use case decorators: + +```python +# BAD - Nested runs create confusion +@botanu_use_case("Outer") +async def outer(): + await inner() # Creates a second run + +@botanu_use_case("Inner") # Don't do this +async def inner(): + ... +``` + +**Do** use @botanu_use_case only at entry points: + +```python +# GOOD - Only entry point is decorated +@botanu_use_case("Main Workflow") +async def main(): + await step_one() # No decorator + await step_two() # No decorator +``` + +### Generic Use Case Names + +**Don't** use vague names: + +```python +# BAD - Meaningless in dashboards +@botanu_use_case("Process") +@botanu_use_case("Handle") +@botanu_use_case("Main") +@botanu_use_case("DoWork") +``` + +**Do** use descriptive business names: + +```python +# GOOD - Clear in reports +@botanu_use_case("Customer Support") +@botanu_use_case("Invoice Processing") +@botanu_use_case("Lead Qualification") +@botanu_use_case("Document Analysis") +``` + +## Outcome Anti-Patterns + +### Forgetting to Emit Outcomes + +**Don't** leave runs without outcomes: + +```python +# BAD - No outcome recorded +@botanu_use_case("Process Order") +async def process_order(order_id): + result = await process(order_id) + return result # Where's the outcome? 
+``` + +**Do** always emit an outcome: + +```python +# GOOD - Explicit outcome +@botanu_use_case("Process Order") +async def process_order(order_id): + try: + result = await process(order_id) + emit_outcome("success", value_type="orders_processed", value_amount=1) + return result + except Exception as e: + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +### Multiple Outcomes Per Run + +**Don't** emit multiple outcomes: + +```python +# BAD - Multiple outcomes are confusing +@botanu_use_case("Batch Processing") +async def process_batch(items): + for item in items: + await process(item) + emit_outcome("success", value_type="item_processed") # Don't do this +``` + +**Do** emit one summary outcome: + +```python +# GOOD - One outcome at the end +@botanu_use_case("Batch Processing") +async def process_batch(items): + processed = 0 + for item in items: + await process(item) + processed += 1 + emit_outcome("success", value_type="items_processed", value_amount=processed) +``` + +### Missing Failure Reasons + +**Don't** emit failures without reasons: + +```python +# BAD - No context for debugging +except Exception: + emit_outcome("failed") # Why did it fail? + raise +``` + +**Do** include the failure reason: + +```python +# GOOD - Reason helps debugging +except ValidationError: + emit_outcome("failed", reason="validation_error") + raise +except RateLimitError: + emit_outcome("failed", reason="rate_limit_exceeded") + raise +except Exception as e: + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +## LLM Tracking Anti-Patterns + +### Not Recording Tokens + +**Don't** skip token recording: + +```python +# BAD - No cost data +with track_llm_call(provider="openai", model="gpt-4"): + response = await client.chat.completions.create(...) 
+ # Token usage not recorded +``` + +**Do** always record tokens: + +```python +# GOOD - Tokens enable cost calculation +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await client.chat.completions.create(...) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) +``` + +### Ignoring Cached Tokens + +**Don't** forget cache tokens (they have different pricing): + +```python +# BAD - Missing cache data +tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, +) +``` + +**Do** include cache breakdown: + +```python +# GOOD - Full token breakdown +tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + cache_read_tokens=response.usage.cache_read_tokens, + cache_write_tokens=response.usage.cache_write_tokens, +) +``` + +### Wrong Provider Names + +**Don't** use inconsistent provider names: + +```python +# BAD - Inconsistent naming +track_llm_call(provider="OpenAI", ...) # Mixed case +track_llm_call(provider="open-ai", ...) # Wrong format +track_llm_call(provider="gpt", ...) # Model as provider +``` + +**Do** use standard provider names (auto-normalized): + +```python +# GOOD - Standard names (or let SDK normalize) +track_llm_call(provider="openai", ...) +track_llm_call(provider="anthropic", ...) +track_llm_call(provider="azure_openai", ...) +``` + +## Configuration Anti-Patterns + +### Sampling for Cost Attribution + +**Don't** sample spans: + +```python +# BAD - Missing cost data +enable( + service_name="my-service", + trace_sample_rate=0.1, # Only 10% of costs captured! 
+) +``` + +**Do** use 100% sampling: + +```python +# GOOD - Complete cost data +enable( + service_name="my-service", + trace_sample_rate=1.0, # Default - don't change +) +``` + +### Hardcoding Configuration + +**Don't** hardcode production values: + +```python +# BAD - Hardcoded +enable( + service_name="my-service", + otlp_endpoint="http://prod-collector.internal:4318", +) +``` + +**Do** use environment variables: + +```python +# GOOD - Environment-based +enable(service_name=os.environ["OTEL_SERVICE_NAME"]) + +# Or use YAML with interpolation +# botanu.yaml +# otlp: +# endpoint: ${COLLECTOR_ENDPOINT} +``` + +### Disabling Auto-Instrumentation Unnecessarily + +**Don't** disable auto-instrumentation without reason: + +```python +# BAD - Missing automatic tracing +enable( + service_name="my-service", + auto_instrument_packages=[], # Why? +) +``` + +**Do** keep defaults or be selective: + +```python +# GOOD - Default instrumentation +enable(service_name="my-service") + +# Or selective +enable( + service_name="my-service", + auto_instrument_packages=["fastapi", "openai_v2", "sqlalchemy"], +) +``` + +## Context Propagation Anti-Patterns + +### Losing Context in Async Code + +**Don't** spawn tasks without context: + +```python +# BAD - Context lost +@botanu_use_case("Parallel Processing") +async def process(): + # These tasks don't inherit context + await asyncio.gather( + task_one(), + task_two(), + ) +``` + +**Do** ensure context propagates: + +```python +# GOOD - Context flows through asyncio +@botanu_use_case("Parallel Processing") +async def process(): + # asyncio with contextvars works correctly + await asyncio.gather( + task_one(), # Inherits context + task_two(), # Inherits context + ) +``` + +### Not Extracting Context in Consumers + +**Don't** ignore incoming context: + +```python +# BAD - Context not extracted +def process_message(message): + # run_id from producer is lost + handle_payload(message["payload"]) +``` + +**Do** extract and use context: + +```python 
+# GOOD - Context continues +def process_message(message): + baggage = message.get("baggage", {}) + ctx = RunContext.from_baggage(baggage) + if ctx: + with ctx.as_current(): + handle_payload(message["payload"]) +``` + +## Data Tracking Anti-Patterns + +### Not Tracking Data Operations + +**Don't** ignore database/storage costs: + +```python +# BAD - Only LLM tracked +@botanu_use_case("Analysis") +async def analyze(): + data = await snowflake.query(expensive_query) # Not tracked! + with track_llm_call(...) as tracker: + result = await llm.complete(data) + tracker.set_tokens(...) +``` + +**Do** track all cost-generating operations: + +```python +# GOOD - Complete cost picture +@botanu_use_case("Analysis") +async def analyze(): + with track_db_operation(system="snowflake", operation="SELECT") as db: + data = await snowflake.query(expensive_query) + db.set_bytes_scanned(data.bytes_scanned) + + with track_llm_call(...) as tracker: + result = await llm.complete(data) + tracker.set_tokens(...) +``` + +### Missing Bytes for Pay-Per-Scan + +**Don't** forget bytes for warehouses: + +```python +# BAD - Missing cost driver +with track_db_operation(system="bigquery", operation="SELECT") as db: + result = await bq.query(sql) + db.set_result(rows_returned=len(result)) # Rows don't determine cost! +``` + +**Do** include bytes scanned: + +```python +# GOOD - Bytes scanned is the cost driver +with track_db_operation(system="bigquery", operation="SELECT") as db: + result = await bq.query(sql) + db.set_bytes_scanned(result.bytes_processed) + db.set_result(rows_returned=len(result)) +``` + +## Error Handling Anti-Patterns + +### Swallowing Errors + +**Don't** hide errors: + +```python +# BAD - Error hidden +with track_llm_call(...) as tracker: + try: + response = await llm.complete(...) + except Exception: + pass # Silently fails - no error recorded +``` + +**Do** record and propagate errors: + +```python +# GOOD - Error tracked and raised +with track_llm_call(...) 
as tracker: + try: + response = await llm.complete(...) + except Exception as e: + tracker.set_error(e) + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +### Ignoring Partial Successes + +**Don't** mark all-or-nothing: + +```python +# BAD - All items fail if one fails +@botanu_use_case("Batch") +async def process_batch(items): + for item in items: + await process(item) # If one fails, no outcome + emit_outcome("success", value_amount=len(items)) +``` + +**Do** track partial success: + +```python +# GOOD - Partial success recorded +@botanu_use_case("Batch") +async def process_batch(items): + processed = 0 + failed = 0 + for item in items: + try: + await process(item) + processed += 1 + except Exception: + failed += 1 + + if failed == 0: + emit_outcome("success", value_type="items_processed", value_amount=processed) + elif processed > 0: + emit_outcome("partial", value_type="items_processed", value_amount=processed, + reason=f"failed_{failed}_of_{len(items)}") + else: + emit_outcome("failed", reason="all_items_failed") +``` + +## Testing Anti-Patterns + +### Testing with Real Exporters + +**Don't** send telemetry during tests: + +```python +# BAD - Tests hit real collector +def test_workflow(): + enable(service_name="test") # Sends to real endpoint! 
+    await my_workflow()
+```
+
+**Do** use NoOp or in-memory exporters:
+
+```python
+# GOOD - Tests are isolated
+from opentelemetry.trace import NoOpTracerProvider
+
+def setup_test():
+    trace.set_tracer_provider(NoOpTracerProvider())
+
+async def test_workflow():
+    await my_workflow()  # No external calls
+```
+
+## See Also
+
+- [Best Practices](best-practices.md) - What to do
+- [Quickstart](../getting-started/quickstart.md) - Getting started guide
+- [Outcomes](../tracking/outcomes.md) - Outcome recording details
diff --git a/docs/patterns/best-practices.md b/docs/patterns/best-practices.md
new file mode 100644
index 0000000..50d80e6
--- /dev/null
+++ b/docs/patterns/best-practices.md
@@ -0,0 +1,428 @@
+# Best Practices
+
+Patterns for effective cost attribution with Botanu SDK.
+
+## Run Design
+
+### One Run Per Business Outcome
+
+A run should represent a complete business transaction:
+
+```python
+# GOOD - One run for one business outcome
+@botanu_use_case("Customer Support")
+async def resolve_ticket(ticket_id: str):
+    context = await fetch_context(ticket_id)
+    response = await generate_response(context)
+    await send_response(ticket_id, response)
+    emit_outcome("success", value_type="tickets_resolved", value_amount=1)
+```
+
+```python
+# BAD - Multiple runs for one outcome
+@botanu_use_case("Fetch Context")
+async def fetch_context(ticket_id: str):
+    ...
+
+@botanu_use_case("Generate Response")  # Don't do this
+async def generate_response(context):
+    ...
+```
+
+### Use Descriptive Use Case Names
+
+Use cases appear in dashboards and queries. 
Choose names carefully: + +```python +# GOOD - Clear, descriptive names +@botanu_use_case("Customer Support") +@botanu_use_case("Document Analysis") +@botanu_use_case("Lead Qualification") + +# BAD - Generic or technical names +@botanu_use_case("HandleRequest") +@botanu_use_case("Process") +@botanu_use_case("Main") +``` + +### Include Workflow Names + +Workflow names help distinguish different paths within a use case: + +```python +@botanu_use_case("Customer Support", workflow="ticket_resolution") +async def resolve_ticket(): + ... + +@botanu_use_case("Customer Support", workflow="escalation") +async def escalate_ticket(): + ... +``` + +## Outcome Recording + +### Always Record Outcomes + +Every run should have an explicit outcome: + +```python +@botanu_use_case("Data Processing") +async def process_data(data_id: str): + try: + result = await process(data_id) + emit_outcome("success", value_type="records_processed", value_amount=result.count) + return result + except ValidationError: + emit_outcome("failed", reason="validation_error") + raise + except TimeoutError: + emit_outcome("failed", reason="timeout") + raise +``` + +### Quantify Value When Possible + +Include value amounts for better ROI analysis: + +```python +# GOOD - Quantified outcomes +emit_outcome("success", value_type="emails_sent", value_amount=50) +emit_outcome("success", value_type="revenue_generated", value_amount=1299.99) +emit_outcome("success", value_type="documents_processed", value_amount=10) + +# LESS USEFUL - No quantity +emit_outcome("success") +``` + +### Use Consistent Value Types + +Standardize your value types across the organization: + +```python +# Define standard value types +class ValueTypes: + TICKETS_RESOLVED = "tickets_resolved" + DOCUMENTS_PROCESSED = "documents_processed" + LEADS_QUALIFIED = "leads_qualified" + EMAILS_SENT = "emails_sent" + REVENUE_GENERATED = "revenue_generated" + +# Use consistently +emit_outcome("success", value_type=ValueTypes.TICKETS_RESOLVED, 
value_amount=1) +``` + +### Include Reasons for Failures + +Always explain why something failed: + +```python +emit_outcome("failed", reason="rate_limit_exceeded") +emit_outcome("failed", reason="invalid_input") +emit_outcome("failed", reason="model_unavailable") +emit_outcome("failed", reason="context_too_long") +``` + +## LLM Tracking + +### Always Record Token Usage + +Tokens are the primary cost driver for LLMs: + +```python +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await client.chat.completions.create(...) + # Always set tokens + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) +``` + +### Record Provider Request IDs + +Request IDs enable reconciliation with provider invoices: + +```python +tracker.set_request_id( + provider_request_id=response.id, # From provider + client_request_id=uuid.uuid4().hex, # Your internal ID +) +``` + +### Track Retries + +Record attempt numbers for accurate cost per success: + +```python +for attempt in range(max_retries): + with track_llm_call(provider="openai", model="gpt-4") as tracker: + tracker.set_attempt(attempt + 1) + try: + response = await client.chat.completions.create(...) + break + except RateLimitError: + if attempt == max_retries - 1: + raise + await asyncio.sleep(backoff) +``` + +### Use Correct Operation Types + +Specify the operation type for accurate categorization: + +```python +from botanu.tracking.llm import track_llm_call, ModelOperation + +# Chat completion +with track_llm_call(provider="openai", model="gpt-4", operation=ModelOperation.CHAT): + ... + +# Embeddings +with track_llm_call(provider="openai", model="text-embedding-3-small", operation=ModelOperation.EMBEDDINGS): + ... 
+``` + +## Data Tracking + +### Track All Cost-Generating Operations + +Include databases, storage, and messaging: + +```python +@botanu_use_case("ETL Pipeline") +async def run_etl(): + # Track warehouse query (billed by bytes scanned) + with track_db_operation(system="snowflake", operation="SELECT") as db: + db.set_bytes_scanned(result.bytes_scanned) + db.set_query_id(result.query_id) + + # Track storage operations (billed by requests + data) + with track_storage_operation(system="s3", operation="PUT") as storage: + storage.set_result(bytes_written=len(data)) + + # Track messaging (billed by message count) + with track_messaging_operation(system="sqs", operation="publish", destination="queue") as msg: + msg.set_result(message_count=batch_size) +``` + +### Include Bytes for Pay-Per-Scan Services + +For data warehouses billed by data scanned: + +```python +with track_db_operation(system="bigquery", operation="SELECT") as db: + result = await bq_client.query(sql) + db.set_bytes_scanned(result.total_bytes_processed) + db.set_result(rows_returned=result.num_rows) +``` + +## Context Propagation + +### Use Middleware for Web Services + +Extract context from incoming requests: + +```python +from fastapi import FastAPI +from botanu.sdk.middleware import BotanuMiddleware + +app = FastAPI() +app.add_middleware(BotanuMiddleware) +``` + +### Propagate Context in Message Queues + +Inject and extract context manually for async messaging: + +```python +# Producer +def publish_message(payload): + ctx = get_current_run_context() + message = { + "payload": payload, + "baggage": ctx.to_baggage_dict() if ctx else {} + } + queue.publish(message) + +# Consumer +def process_message(message): + baggage = message.get("baggage", {}) + ctx = RunContext.from_baggage(baggage) + with ctx.as_current(): + handle_payload(message["payload"]) +``` + +### Use Lean Mode for High-Traffic Systems + +Default lean mode minimizes header overhead: + +```python +# Lean mode: ~100 bytes of baggage +# 
Propagates: run_id, use_case + +# Full mode: ~300 bytes of baggage +# Propagates: run_id, use_case, workflow, environment, tenant_id, parent_run_id +``` + +## Configuration + +### Use Environment Variables in Production + +Keep configuration out of code: + +```bash +export OTEL_SERVICE_NAME=my-service +export OTEL_EXPORTER_OTLP_ENDPOINT=http://collector:4318 +export BOTANU_ENVIRONMENT=production +``` + +### Never Sample for Cost Attribution + +Always use 100% sampling for accurate cost data: + +```python +# GOOD +trace_sample_rate: float = 1.0 + +# BAD - Missing cost data +trace_sample_rate: float = 0.1 # Only 10% of costs captured +``` + +### Use YAML for Complex Configuration + +For multi-environment setups: + +```yaml +# config/production.yaml +service: + name: ${OTEL_SERVICE_NAME} + environment: production + +otlp: + endpoint: ${COLLECTOR_ENDPOINT} + +propagation: + mode: lean +``` + +## Multi-Tenant Systems + +### Always Include Tenant ID + +For accurate per-tenant cost attribution: + +```python +@botanu_use_case("Customer Support", tenant_id=request.tenant_id) +async def handle_ticket(request): + ... +``` + +### Use Business Context + +Add additional attribution dimensions: + +```python +set_business_context( + customer_id=request.customer_id, + team="engineering", + cost_center="R&D", + region="us-west-2", +) +``` + +## Error Handling + +### Record Errors Explicitly + +Don't lose error context: + +```python +with track_llm_call(provider="openai", model="gpt-4") as tracker: + try: + response = await client.chat.completions.create(...) 
+    except openai.APIError as e:
+        tracker.set_error(e)  # Records error type and message
+        raise
+```
+
+### Emit Outcomes for Errors
+
+Even failed runs should have outcomes:
+
+```python
+@botanu_use_case("Data Processing")
+async def process(data_id):
+    try:
+        await process_data(data_id)
+        emit_outcome("success", value_type="items_processed", value_amount=1)
+    except ValidationError:
+        emit_outcome("failed", reason="validation_error")
+        raise
+    except Exception as e:
+        emit_outcome("failed", reason=type(e).__name__)
+        raise
+```
+
+## Performance
+
+### Use Async Tracking
+
+For async applications, ensure tracking is non-blocking:
+
+```python
+# The SDK uses span events, not separate API calls
+# This is already non-blocking
+with track_llm_call(provider="openai", model="gpt-4") as tracker:
+    response = await async_llm_call()
+    tracker.set_tokens(...)  # Immediate, non-blocking
+```
+
+### Batch Database Tracking
+
+For batch operations, track at batch level:
+
+```python
+# GOOD - Batch tracking
+with track_db_operation(system="postgresql", operation="INSERT") as db:
+    await cursor.executemany(insert_sql, batch_of_1000_rows)
+    db.set_result(rows_affected=1000)
+
+# LESS EFFICIENT - Per-row tracking
+for row in batch_of_1000_rows:
+    with track_db_operation(system="postgresql", operation="INSERT") as db:
+        await cursor.execute(insert_sql, row)
+        db.set_result(rows_affected=1)
+```
+
+## Testing
+
+### Mock Tracing in Tests
+
+Use the NoOp tracer for unit tests:
+
+```python
+from opentelemetry import trace
+from opentelemetry.trace import NoOpTracerProvider
+
+def setup_test_tracing():
+    trace.set_tracer_provider(NoOpTracerProvider())
+```
+
+### Test Outcome Recording
+
+Verify outcomes are emitted correctly:
+
+```python
+from unittest.mock import patch
+
+async def test_successful_outcome():
+    with patch("botanu.sdk.span_helpers.emit_outcome") as mock_emit:
+        result = await handle_ticket("123")
+        mock_emit.assert_called_with("success", value_type="tickets_resolved", 
value_amount=1) +``` + +## See Also + +- [Anti-Patterns](anti-patterns.md) - What to avoid +- [Architecture](../concepts/architecture.md) - SDK design principles +- [Configuration](../getting-started/configuration.md) - Configuration options diff --git a/docs/tracking/data-tracking.md b/docs/tracking/data-tracking.md new file mode 100644 index 0000000..9c066a8 --- /dev/null +++ b/docs/tracking/data-tracking.md @@ -0,0 +1,412 @@ +# Data Tracking + +Track database, storage, and messaging operations for complete cost visibility. + +## Overview + +Data operations often contribute significantly to AI workflow costs. Botanu provides tracking for: + +- **Databases** - SQL, NoSQL, data warehouses +- **Object Storage** - S3, GCS, Azure Blob +- **Messaging** - SQS, Kafka, Pub/Sub + +## Database Tracking + +### Basic Usage + +```python +from botanu.tracking.data import track_db_operation + +with track_db_operation(system="postgresql", operation="SELECT") as db: + result = await cursor.execute("SELECT * FROM users WHERE active = true") + db.set_result(rows_returned=len(result)) +``` + +### DBTracker Methods + +#### set_result() + +Record query results: + +```python +db.set_result( + rows_returned=100, # For SELECT queries + rows_affected=5, # For INSERT/UPDATE/DELETE + bytes_read=10240, # Data read + bytes_written=2048, # Data written +) +``` + +#### set_table() + +Record table information: + +```python +db.set_table("users", schema="public") +``` + +#### set_query_id() + +For data warehouses with query IDs: + +```python +db.set_query_id("01abc-def-...") +``` + +#### set_bytes_scanned() + +For pay-per-query warehouses: + +```python +db.set_bytes_scanned(1073741824) # 1 GB +``` + +#### set_error() + +Record errors (automatically called on exceptions): + +```python +db.set_error(exception) +``` + +#### add_metadata() + +Add custom attributes: + +```python +db.add_metadata( + query_type="aggregation", + cache_hit=True, +) +``` + +### Database Operations + +Use `DBOperation` 
constants: + +```python +from botanu.tracking.data import track_db_operation, DBOperation + +with track_db_operation(system="postgresql", operation=DBOperation.SELECT): + ... + +with track_db_operation(system="postgresql", operation=DBOperation.INSERT): + ... +``` + +Available operations: + +| Constant | Description | +|----------|-------------| +| `SELECT` | Read queries | +| `INSERT` | Insert data | +| `UPDATE` | Update data | +| `DELETE` | Delete data | +| `UPSERT` | Insert or update | +| `MERGE` | Merge operations | +| `CREATE` | Create tables/indexes | +| `DROP` | Drop objects | +| `ALTER` | Alter schema | +| `INDEX` | Index operations | +| `TRANSACTION` | Transaction control | +| `BATCH` | Batch operations | + +### System Normalization + +Database systems are automatically normalized: + +| Input | Normalized | +|-------|------------| +| `postgresql`, `postgres`, `pg` | `postgresql` | +| `mysql` | `mysql` | +| `mongodb`, `mongo` | `mongodb` | +| `dynamodb` | `dynamodb` | +| `redis` | `redis` | +| `elasticsearch` | `elasticsearch` | +| `snowflake` | `snowflake` | +| `bigquery` | `bigquery` | +| `redshift` | `redshift` | + +## Storage Tracking + +### Basic Usage + +```python +from botanu.tracking.data import track_storage_operation + +with track_storage_operation(system="s3", operation="PUT") as storage: + await s3_client.put_object(Bucket="my-bucket", Key="file.txt", Body=data) + storage.set_result(bytes_written=len(data)) +``` + +### StorageTracker Methods + +#### set_result() + +Record operation results: + +```python +storage.set_result( + objects_count=10, # Number of objects + bytes_read=1048576, # Data downloaded + bytes_written=2097152, # Data uploaded +) +``` + +#### set_bucket() + +Record bucket name: + +```python +storage.set_bucket("my-data-bucket") +``` + +#### set_error() + +Record errors: + +```python +storage.set_error(exception) +``` + +#### add_metadata() + +Add custom attributes: + +```python +storage.add_metadata( + storage_class="GLACIER", + 
encryption="AES256", +) +``` + +### Storage Operations + +| Constant | Description | +|----------|-------------| +| `GET` | Download object | +| `PUT` | Upload object | +| `DELETE` | Delete object | +| `LIST` | List objects | +| `HEAD` | Get metadata | +| `COPY` | Copy object | +| `MULTIPART_UPLOAD` | Multipart upload | + +### System Normalization + +| Input | Normalized | +|-------|------------| +| `s3`, `aws_s3` | `s3` | +| `gcs`, `google_cloud_storage` | `gcs` | +| `blob`, `azure_blob` | `azure_blob` | +| `minio` | `minio` | + +## Messaging Tracking + +### Basic Usage + +```python +from botanu.tracking.data import track_messaging_operation + +with track_messaging_operation(system="sqs", operation="publish", destination="my-queue") as msg: + await sqs_client.send_message(QueueUrl=queue_url, MessageBody=message) + msg.set_result(message_count=1, bytes_transferred=len(message)) +``` + +### MessagingTracker Methods + +#### set_result() + +Record operation results: + +```python +msg.set_result( + message_count=10, + bytes_transferred=4096, +) +``` + +#### set_error() + +Record errors: + +```python +msg.set_error(exception) +``` + +#### add_metadata() + +Add custom attributes: + +```python +msg.add_metadata( + message_group_id="group-1", + deduplication_id="dedup-123", +) +``` + +### Messaging Operations + +| Constant | Description | +|----------|-------------| +| `publish` | Send message | +| `consume` | Receive and process message | +| `receive` | Receive message | +| `send` | Send message (alias for publish) | +| `subscribe` | Subscribe to topic | + +### System Normalization + +| Input | Normalized | +|-------|------------| +| `sqs`, `aws_sqs` | `sqs` | +| `sns` | `sns` | +| `kinesis` | `kinesis` | +| `pubsub`, `google_pubsub` | `pubsub` | +| `kafka` | `kafka` | +| `rabbitmq` | `rabbitmq` | +| `celery` | `celery` | + +## Standalone Helpers + +### set_data_metrics() + +Set data metrics on the current span: + +```python +from botanu.tracking.data import 
set_data_metrics + +set_data_metrics( + rows_returned=100, + rows_affected=5, + bytes_read=10240, + bytes_written=2048, + objects_count=10, +) +``` + +### set_warehouse_metrics() + +For data warehouse queries: + +```python +from botanu.tracking.data import set_warehouse_metrics + +set_warehouse_metrics( + query_id="01abc-def-...", + bytes_scanned=1073741824, + rows_returned=1000, + partitions_scanned=5, +) +``` + +## Example: Complete Data Pipeline + +```python +from botanu import botanu_use_case, emit_outcome +from botanu.tracking.data import ( + track_db_operation, + track_storage_operation, + track_messaging_operation, + DBOperation, +) +from botanu.tracking.llm import track_llm_call + +@botanu_use_case("ETL Pipeline") +async def process_batch(batch_id: str): + """Complete ETL pipeline with cost tracking.""" + + # 1. Read from data warehouse + with track_db_operation(system="snowflake", operation=DBOperation.SELECT) as db: + db.set_query_id(batch_id) + rows = await snowflake_client.execute( + "SELECT * FROM raw_data WHERE batch_id = %s", + batch_id + ) + db.set_result(rows_returned=len(rows)) + db.set_bytes_scanned(rows.bytes_scanned) + + # 2. Process with LLM + processed = [] + for row in rows: + with track_llm_call(provider="openai", model="gpt-4") as llm: + result = await analyze_row(row) + llm.set_tokens(input_tokens=result.input_tokens, output_tokens=result.output_tokens) + processed.append(result) + + # 3. Write to storage + with track_storage_operation(system="s3", operation="PUT") as storage: + storage.set_bucket("processed-data") + await s3_client.put_object( + Bucket="processed-data", + Key=f"batch/{batch_id}.json", + Body=json.dumps(processed) + ) + storage.set_result(bytes_written=len(json.dumps(processed))) + + # 4. 
Write to database + with track_db_operation(system="postgresql", operation=DBOperation.INSERT) as db: + await pg_client.executemany( + "INSERT INTO processed_data VALUES (%s, %s, %s)", + [(r.id, r.result, r.score) for r in processed] + ) + db.set_result(rows_affected=len(processed)) + + # 5. Publish completion event + with track_messaging_operation(system="sqs", operation="publish", destination="batch-complete") as msg: + await sqs_client.send_message( + QueueUrl=queue_url, + MessageBody=json.dumps({"batch_id": batch_id, "count": len(processed)}) + ) + msg.set_result(message_count=1) + + emit_outcome("success", value_type="batches_processed", value_amount=1) + return processed +``` + +## Span Attributes + +### Database Spans + +| Attribute | Description | +|-----------|-------------| +| `db.system` | Database system (normalized) | +| `db.operation` | Operation type | +| `db.name` | Database name | +| `db.collection.name` | Table/collection name | +| `botanu.vendor` | Vendor for cost attribution | +| `botanu.data.rows_returned` | Rows returned | +| `botanu.data.rows_affected` | Rows modified | +| `botanu.data.bytes_read` | Bytes read | +| `botanu.data.bytes_written` | Bytes written | +| `botanu.warehouse.query_id` | Warehouse query ID | +| `botanu.warehouse.bytes_scanned` | Bytes scanned | + +### Storage Spans + +| Attribute | Description | +|-----------|-------------| +| `botanu.storage.system` | Storage system | +| `botanu.storage.operation` | Operation type | +| `botanu.storage.bucket` | Bucket name | +| `botanu.vendor` | Vendor for cost attribution | +| `botanu.data.objects_count` | Objects processed | +| `botanu.data.bytes_read` | Bytes downloaded | +| `botanu.data.bytes_written` | Bytes uploaded | + +### Messaging Spans + +| Attribute | Description | +|-----------|-------------| +| `messaging.system` | Messaging system | +| `messaging.operation` | Operation type | +| `messaging.destination.name` | Queue/topic name | +| `botanu.vendor` | Vendor for cost 
attribution | +| `botanu.messaging.message_count` | Messages processed | +| `botanu.messaging.bytes_transferred` | Bytes transferred | + +## See Also + +- [LLM Tracking](llm-tracking.md) - AI model tracking +- [Outcomes](outcomes.md) - Recording business outcomes +- [Best Practices](../patterns/best-practices.md) - Tracking best practices diff --git a/docs/tracking/llm-tracking.md b/docs/tracking/llm-tracking.md new file mode 100644 index 0000000..138cd7f --- /dev/null +++ b/docs/tracking/llm-tracking.md @@ -0,0 +1,332 @@ +# LLM Tracking + +Track AI model usage for accurate cost attribution across providers. + +## Overview + +Botanu provides LLM tracking that aligns with [OpenTelemetry GenAI Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/). This ensures compatibility with standard observability tooling while enabling detailed cost analysis. + +## Basic Usage + +### Context Manager (Recommended) + +```python +from botanu.tracking.llm import track_llm_call + +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await openai.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": "Hello"}] + ) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) + tracker.set_request_id(response.id) +``` + +### What Gets Recorded + +| Attribute | Example | Description | +|-----------|---------|-------------| +| `gen_ai.operation.name` | `chat` | Type of operation | +| `gen_ai.provider.name` | `openai` | Normalized provider name | +| `gen_ai.request.model` | `gpt-4` | Requested model | +| `gen_ai.response.model` | `gpt-4-0613` | Actual model used | +| `gen_ai.usage.input_tokens` | `150` | Input/prompt tokens | +| `gen_ai.usage.output_tokens` | `200` | Output/completion tokens | +| `gen_ai.response.id` | `chatcmpl-...` | Provider request ID | + +## LLMTracker Methods + +### set_tokens() + +Record token usage from the response: + +```python 
+tracker.set_tokens( + input_tokens=150, + output_tokens=200, + cached_tokens=50, # For providers with caching + cache_read_tokens=50, # Anthropic-style cache read + cache_write_tokens=100, # Anthropic-style cache write +) +``` + +### set_request_id() + +Record provider and client request IDs for billing reconciliation: + +```python +tracker.set_request_id( + provider_request_id=response.id, # From provider response + client_request_id="my-client-123", # Your tracking ID +) +``` + +### set_response_model() + +When the response uses a different model than requested: + +```python +tracker.set_response_model("gpt-4-0613") +``` + +### set_request_params() + +Record request parameters for analysis: + +```python +tracker.set_request_params( + temperature=0.7, + top_p=0.9, + max_tokens=1000, + stop_sequences=["END"], + frequency_penalty=0.5, + presence_penalty=0.3, +) +``` + +### set_streaming() + +Mark as a streaming request: + +```python +tracker.set_streaming(True) +``` + +### set_cache_hit() + +Mark as a cache hit (for semantic caching): + +```python +tracker.set_cache_hit(True) +``` + +### set_attempt() + +Track retry attempts: + +```python +tracker.set_attempt(2) # Second attempt +``` + +### set_finish_reason() + +Record the stop reason: + +```python +tracker.set_finish_reason("stop") # or "length", "content_filter", etc. +``` + +### set_error() + +Record errors (automatically called on exceptions): + +```python +try: + response = await client.chat(...) +except openai.RateLimitError as e: + tracker.set_error(e) + raise +``` + +### add_metadata() + +Add custom attributes: + +```python +tracker.add_metadata( + prompt_version="v2.1", + experiment_id="exp-123", +) +``` + +## Operation Types + +Use `ModelOperation` constants for the `operation` parameter: + +```python +from botanu.tracking.llm import track_llm_call, ModelOperation + +# Chat completion +with track_llm_call(provider="openai", model="gpt-4", operation=ModelOperation.CHAT): + ... 
+ +# Embeddings +with track_llm_call(provider="openai", model="text-embedding-3-small", operation=ModelOperation.EMBEDDINGS): + ... + +# Text completion (legacy) +with track_llm_call(provider="openai", model="davinci", operation=ModelOperation.TEXT_COMPLETION): + ... +``` + +Available operations: + +| Constant | Value | Use Case | +|----------|-------|----------| +| `CHAT` | `chat` | Chat completions (default) | +| `TEXT_COMPLETION` | `text_completion` | Legacy completions | +| `EMBEDDINGS` | `embeddings` | Embedding generation | +| `GENERATE_CONTENT` | `generate_content` | Generic content generation | +| `EXECUTE_TOOL` | `execute_tool` | Tool/function execution | +| `CREATE_AGENT` | `create_agent` | Agent creation | +| `INVOKE_AGENT` | `invoke_agent` | Agent invocation | +| `RERANK` | `rerank` | Reranking | +| `IMAGE_GENERATION` | `image_generation` | Image generation | +| `SPEECH_TO_TEXT` | `speech_to_text` | Transcription | +| `TEXT_TO_SPEECH` | `text_to_speech` | Speech synthesis | + +## Provider Normalization + +Provider names are automatically normalized: + +| Input | Normalized | +|-------|------------| +| `openai`, `OpenAI` | `openai` | +| `azure_openai`, `azure-openai` | `azure.openai` | +| `anthropic`, `claude` | `anthropic` | +| `bedrock`, `aws_bedrock` | `aws.bedrock` | +| `vertex`, `vertexai`, `gemini` | `gcp.vertex_ai` | +| `cohere` | `cohere` | +| `mistral`, `mistralai` | `mistral` | +| `together`, `togetherai` | `together` | +| `groq` | `groq` | + +## Tool/Function Tracking + +Track tool calls triggered by LLMs: + +```python +from botanu.tracking.llm import track_tool_call + +with track_tool_call(tool_name="search_database", tool_call_id="call_abc123") as tool: + results = await search_database(query) + tool.set_result( + success=True, + items_returned=len(results), + bytes_processed=1024, + ) +``` + +### ToolTracker Methods + +```python +# Set execution result +tool.set_result( + success=True, + items_returned=10, + bytes_processed=2048, +) + +# 
Set tool call ID from LLM response +tool.set_tool_call_id("call_abc123") + +# Record error +tool.set_error(exception) + +# Add custom metadata +tool.add_metadata(query_type="semantic") +``` + +## Standalone Helpers + +For cases where you can't use context managers: + +### set_llm_attributes() + +```python +from botanu.tracking.llm import set_llm_attributes + +set_llm_attributes( + provider="openai", + model="gpt-4", + operation="chat", + input_tokens=150, + output_tokens=200, + streaming=True, + provider_request_id="chatcmpl-...", +) +``` + +### set_token_usage() + +```python +from botanu.tracking.llm import set_token_usage + +set_token_usage( + input_tokens=150, + output_tokens=200, + cached_tokens=50, +) +``` + +## Decorator for Auto-Instrumentation + +For wrapping existing client methods: + +```python +from botanu.tracking.llm import llm_instrumented + +class MyOpenAIClient: + @llm_instrumented(provider="openai", tokens_from_response=True) + def chat(self, model: str, messages: list): + return openai.chat.completions.create(model=model, messages=messages) +``` + +## Metrics + +The SDK automatically records these metrics: + +| Metric | Type | Description | +|--------|------|-------------| +| `gen_ai.client.token.usage` | Histogram | Token counts by type | +| `gen_ai.client.operation.duration` | Histogram | Operation duration in seconds | +| `botanu.gen_ai.attempts` | Counter | Request attempts (including retries) | + +## Example: Multi-Provider Workflow + +```python +from botanu import botanu_use_case, emit_outcome +from botanu.tracking.llm import track_llm_call + +@botanu_use_case("Document Analysis") +async def analyze_with_fallback(document: str): + """Try Claude first, fall back to GPT-4.""" + + try: + with track_llm_call(provider="anthropic", model="claude-3-opus") as tracker: + tracker.set_attempt(1) + response = await anthropic_client.messages.create( + model="claude-3-opus-20240229", + messages=[{"role": "user", "content": document}] + ) + 
tracker.set_tokens( + input_tokens=response.usage.input_tokens, + output_tokens=response.usage.output_tokens, + ) + emit_outcome("success", value_type="analyses_completed", value_amount=1) + return response.content[0].text + + except anthropic.RateLimitError: + # Fallback to OpenAI + with track_llm_call(provider="openai", model="gpt-4") as tracker: + tracker.set_attempt(2) + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": document}] + ) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) + emit_outcome("success", value_type="analyses_completed", value_amount=1) + return response.choices[0].message.content +``` + +## See Also + +- [Auto-Instrumentation](../integration/auto-instrumentation.md) - Automatic LLM tracking +- [Data Tracking](data-tracking.md) - Database and storage tracking +- [Outcomes](outcomes.md) - Recording business outcomes diff --git a/docs/tracking/outcomes.md b/docs/tracking/outcomes.md new file mode 100644 index 0000000..0e974ae --- /dev/null +++ b/docs/tracking/outcomes.md @@ -0,0 +1,363 @@ +# Outcomes + +Record business outcomes to enable cost-per-outcome analysis. + +## Overview + +Outcomes connect infrastructure costs to business value. By recording what was achieved (tickets resolved, documents processed, leads qualified), you can calculate the true ROI of your AI workflows. + +## Basic Usage + +```python +from botanu import botanu_use_case, emit_outcome + +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id: str): + # ... process ticket ... 
+ + # Record the business outcome + emit_outcome("success", value_type="tickets_resolved", value_amount=1) +``` + +## emit_outcome() Parameters + +```python +emit_outcome( + status: str, # Required: "success", "partial", "failed" + value_type: str = None, # What was achieved + value_amount: float = None, # How much + confidence: float = None, # Confidence score (0.0-1.0) + reason: str = None, # Why (especially for failures) +) +``` + +### status + +The outcome status: + +| Status | Description | Use Case | +|--------|-------------|----------| +| `success` | Fully achieved goal | Ticket resolved, document processed | +| `partial` | Partially achieved | 3 of 5 items processed | +| `failed` | Did not achieve goal | Error, timeout, rejection | + +### value_type + +A descriptive label for what was achieved: + +```python +emit_outcome("success", value_type="tickets_resolved", value_amount=1) +emit_outcome("success", value_type="documents_processed", value_amount=5) +emit_outcome("success", value_type="leads_qualified", value_amount=1) +emit_outcome("success", value_type="revenue_generated", value_amount=499.99) +``` + +### value_amount + +The quantified value: + +```python +# Count +emit_outcome("success", value_type="emails_sent", value_amount=100) + +# Revenue +emit_outcome("success", value_type="order_value", value_amount=1299.99) + +# Score +emit_outcome("success", value_type="satisfaction_score", value_amount=4.5) +``` + +### confidence + +For probabilistic outcomes: + +```python +emit_outcome( + "success", + value_type="intent_classified", + value_amount=1, + confidence=0.92, +) +``` + +### reason + +Explain the outcome (especially for failures): + +```python +emit_outcome("failed", reason="rate_limit_exceeded") +emit_outcome("failed", reason="invalid_input") +emit_outcome("partial", reason="timeout_partial_results", value_amount=3) +``` + +## Outcome Patterns + +### Success with Value + +```python +@botanu_use_case("Order Processing") +async def 
process_order(order_id: str): + order = await fetch_order(order_id) + await fulfill_order(order) + + emit_outcome( + "success", + value_type="orders_fulfilled", + value_amount=1, + ) +``` + +### Success with Revenue + +```python +@botanu_use_case("Sales Bot") +async def handle_inquiry(inquiry_id: str): + result = await process_sale(inquiry_id) + + if result.sale_completed: + emit_outcome( + "success", + value_type="revenue_generated", + value_amount=result.order_total, + ) + else: + emit_outcome( + "partial", + value_type="leads_qualified", + value_amount=1, + ) +``` + +### Partial Success + +```python +@botanu_use_case("Batch Processing") +async def process_batch(items: list): + processed = 0 + for item in items: + try: + await process_item(item) + processed += 1 + except Exception: + continue + + if processed == len(items): + emit_outcome("success", value_type="items_processed", value_amount=processed) + elif processed > 0: + emit_outcome( + "partial", + value_type="items_processed", + value_amount=processed, + reason=f"processed_{processed}_of_{len(items)}", + ) + else: + emit_outcome("failed", reason="no_items_processed") +``` + +### Failure with Reason + +```python +@botanu_use_case("Document Analysis") +async def analyze_document(doc_id: str): + try: + document = await fetch_document(doc_id) + if not document: + emit_outcome("failed", reason="document_not_found") + return None + + result = await analyze(document) + emit_outcome("success", value_type="documents_analyzed", value_amount=1) + return result + + except RateLimitError: + emit_outcome("failed", reason="rate_limit_exceeded") + raise + except TimeoutError: + emit_outcome("failed", reason="analysis_timeout") + raise +``` + +### Classification with Confidence + +```python +@botanu_use_case("Intent Classification") +async def classify_intent(message: str): + result = await classifier.predict(message) + + emit_outcome( + "success", + value_type="intents_classified", + value_amount=1, + 
confidence=result.confidence, + ) + + return result.intent +``` + +## Automatic Outcomes + +The `@botanu_use_case` decorator automatically emits outcomes: + +```python +@botanu_use_case("My Use Case", auto_outcome_on_success=True) # Default +async def my_function(): + # If no exception and no explicit emit_outcome, emits "success" + return result +``` + +If an exception is raised, it automatically emits `"failed"` with the exception class as the reason. + +To disable: + +```python +@botanu_use_case("My Use Case", auto_outcome_on_success=False) +async def my_function(): + # Must call emit_outcome explicitly + emit_outcome("success") +``` + +## @botanu_outcome Decorator + +For sub-functions within a use case: + +```python +from botanu import botanu_use_case, botanu_outcome + +@botanu_use_case("Data Pipeline") +async def run_pipeline(): + await step_one() + await step_two() + +@botanu_outcome() +async def step_one(): + # Emits "success" on completion, "failed" on exception + await process_data() + +@botanu_outcome(success="data_extracted", failed="extraction_failed") +async def step_two(): + # Custom outcome labels + await extract_data() +``` + +## Span Attributes + +Outcomes are recorded as span attributes: + +| Attribute | Description | +|-----------|-------------| +| `botanu.outcome` | Status (success/partial/failed) | +| `botanu.outcome.value_type` | What was achieved | +| `botanu.outcome.value_amount` | Quantified value | +| `botanu.outcome.confidence` | Confidence score | +| `botanu.outcome.reason` | Reason for outcome | + +## Span Events + +An event is also emitted for timeline visibility: + +```python +# Event: botanu.outcome_emitted +# Attributes: +# status: "success" +# value_type: "tickets_resolved" +# value_amount: 1 +``` + +## Cost-Per-Outcome Analysis + +With outcomes recorded, you can calculate: + +```sql +-- Cost per successful ticket resolution +SELECT + AVG(total_cost) as avg_cost_per_resolution +FROM runs +WHERE use_case = 'Customer Support' + AND 
outcome_status = 'success' + AND outcome_value_type = 'tickets_resolved'; + +-- ROI by use case +SELECT + use_case, + SUM(outcome_value_amount * value_per_unit) as total_value, + SUM(total_cost) as total_cost, + (SUM(outcome_value_amount * value_per_unit) - SUM(total_cost)) / SUM(total_cost) as roi +FROM runs +GROUP BY use_case; +``` + +## Best Practices + +### 1. Always Record Outcomes + +Every use case should emit an outcome: + +```python +@botanu_use_case("My Use Case") +async def my_function(): + try: + result = await do_work() + emit_outcome("success", value_type="items_processed", value_amount=result.count) + return result + except Exception as e: + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +### 2. Use Consistent Value Types + +Define standard value types for your organization: + +```python +# Good - consistent naming +emit_outcome("success", value_type="tickets_resolved", value_amount=1) +emit_outcome("success", value_type="documents_processed", value_amount=1) + +# Bad - inconsistent +emit_outcome("success", value_type="ticket_done", value_amount=1) +emit_outcome("success", value_type="doc processed", value_amount=1) +``` + +### 3. Quantify When Possible + +Include amounts for better analysis: + +```python +# Good - quantified +emit_outcome("success", value_type="emails_sent", value_amount=50) + +# Less useful - no amount +emit_outcome("success") +``` + +### 4. Include Reasons for Failures + +Always explain why something failed: + +```python +emit_outcome("failed", reason="api_rate_limit") +emit_outcome("failed", reason="invalid_input_format") +emit_outcome("failed", reason="model_unavailable") +``` + +### 5. 
One Outcome Per Run + +Emit only one outcome per use case execution: + +```python +@botanu_use_case("Process Items") +async def process_items(items): + successful = 0 + for item in items: + if await process(item): + successful += 1 + + # One outcome at the end + emit_outcome("success", value_type="items_processed", value_amount=successful) +``` + +## See Also + +- [Run Context](../concepts/run-context.md) - Understanding runs +- [LLM Tracking](llm-tracking.md) - Tracking LLM costs +- [Best Practices](../patterns/best-practices.md) - More patterns