From cae6a49fca4dcdcf53f5fd308753ee408aac2db1 Mon Sep 17 00:00:00 2001 From: Christian Heimes Date: Wed, 27 May 2026 13:41:48 +0200 Subject: [PATCH] feat(auth): add URL-dispatched session auth for GitHub and GitLab Add `SessionAuth` class that dispatches authentication by (scheme, hostname). Lazy callbacks resolve credentials from netrc or environment variables on first request and cache the result. `create_session()` returns the session and auth handler so callers can register additional hosts. Move authentication documentation to docs/how-tos/authentication.md and update http-retry.md to reference the new guide. Co-Authored-By: Claude Signed-off-by: Christian Heimes --- README.md | 34 ++------ docs/how-tos/authentication.md | 64 ++++++++++++++ docs/how-tos/index.rst | 1 + docs/http-retry.md | 46 +++++------ src/fromager/request_session.py | 142 ++++++++++++++++++++++++++++++-- src/fromager/resolver.py | 6 -- tests/test_request_session.py | 131 +++++++++++++++++++++++++++++ 7 files changed, 356 insertions(+), 68 deletions(-) create mode 100644 docs/how-tos/authentication.md create mode 100644 tests/test_request_session.py diff --git a/README.md b/README.md index 968ba496f..672c86dac 100644 --- a/README.md +++ b/README.md @@ -29,35 +29,11 @@ Fromager can also build wheels in collections, rather than individually. Managin This approach makes Fromager especially useful in Python-heavy domains like AI, where reproducibility and compatibility across complex dependency trees are essential. -## Using private registries - -Fromager uses the [requests](https://requests.readthedocs.io) library and `pip` -at different points for talking to package registries. Both support -authenticating to remote servers in various ways. The simplest way to integrate -the authentication with fromager is to have a -[netrc](https://docs.python.org/3/library/netrc.html) file with a valid entry -for the host. The file will be read from `~/.netrc` by default. 
Another location -can be specified by setting the `NETRC` environment variable. - -For example, to use a gitlab package registry, use a [personal -access -token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#create-a-personal-access-token) -as documented in [this -issue](https://gitlab.com/gitlab-org/gitlab/-/issues/350582): - -```plaintext -machine gitlab.com login oauth2 password $token -``` - -## Determining versions via GitHub tags - -In some cases, the builder might have to use tags on GitHub to determine the version of a project instead of looking at -pypi.org. To avoid rate limit or to access private GitHub repository, a [personal access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens) can be passed to fromager by setting -the following environment variable: - -```shell -GITHUB_TOKEN= -``` +## Authentication + +Fromager automatically authenticates to GitHub and GitLab APIs using +credentials from netrc or environment variables. See the +[authentication guide](docs/how-tos/authentication.md) for details. ## Additional docs diff --git a/docs/how-tos/authentication.md b/docs/how-tos/authentication.md new file mode 100644 index 000000000..e196cea7e --- /dev/null +++ b/docs/how-tos/authentication.md @@ -0,0 +1,64 @@ +# Authentication + +Fromager automatically authenticates to GitHub and GitLab APIs using +credentials from netrc or environment variables. Credentials are +resolved lazily on the first request to each host. + +Authentication is recommended to avoid [API rate limits](https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api) (especially +for GitHub) and required to access private repositories or registries. + +## Credential lookup order + +For each host, fromager checks the following sources in order and uses +the first match: + +**GitHub** (`GITHUB_API_URL`, default `https://api.github.com`): + +1. 
[netrc](https://docs.python.org/3/library/netrc.html) entry for + the host -- the password is used as the token +2. `GITHUB_TOKEN` environment variable + +**GitLab** (`CI_SERVER_URL`, default `https://gitlab.com`): + +1. netrc entry for the host -- if the login is `gitlab-ci-token` a + CI job token header is used, otherwise a private token header +2. `CI_JOB_TOKEN` environment variable +3. `GITLAB_PRIVATE_TOKEN` environment variable + +## netrc + +The [requests](https://requests.readthedocs.io) library, `pip`, and +`git` all read credentials from `~/.netrc`. Another location can be +specified by setting the `NETRC` environment variable. Note that +`git` uses libcurl for HTTPS transport and libcurl only supports the +`NETRC` variable since [8.16.0](https://curl.se/ch/8.16.0.html) +(2025-09-10). Older versions only read `$HOME/.netrc`. + +For example, to authenticate to a GitLab package registry with a +[personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html#create-a-personal-access-token): + +```text +machine gitlab.com login pat password $token +``` + +To authenticate to the GitHub API with a +[personal access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens): + +```text +machine api.github.com login pat password $token +``` + +## Environment variables + +To authenticate via environment variables instead of netrc: + +```shell +# GitHub personal access token (avoids API rate limits) +export GITHUB_TOKEN=<token> + +# GitLab CI job token (set automatically in CI pipelines) +export CI_JOB_TOKEN=<token> + +# GitLab personal/project access token +export GITLAB_PRIVATE_TOKEN=<token> +``` diff --git a/docs/how-tos/index.rst b/docs/how-tos/index.rst index f2cc23969..be6d9c62e 100644 --- a/docs/how-tos/index.rst +++ b/docs/how-tos/index.rst @@ -21,6 +21,7 @@ Essential guides for initial setup and first builds. .. 
toctree:: :maxdepth: 1 + authentication containers bootstrap-constraints diff --git a/docs/http-retry.md b/docs/http-retry.md index c01003647..3325c1f6b 100644 --- a/docs/http-retry.md +++ b/docs/http-retry.md @@ -20,7 +20,7 @@ The retry system provides: - **GitHub API rate limit handling** with proper reset time detection -- **GitHub authentication** automatically applied for GitHub API requests via `GITHUB_TOKEN` environment variable +- **Automatic authentication** for GitHub and GitLab APIs (see {doc}`how-tos/authentication`) - **Temporary file handling** to prevent partial downloads @@ -37,11 +37,11 @@ export FROMAGER_HTTP_BACKOFF_FACTOR=2.0 # Request timeout in seconds (default: 120) export FROMAGER_HTTP_TIMEOUT=180 - -# Token for GitHub API authentication (prevents rate limiting) -export GITHUB_TOKEN=your_github_token_here ``` +Authentication credentials (`GITHUB_TOKEN`, `GITLAB_PRIVATE_TOKEN`, +etc.) are documented in {doc}`how-tos/authentication`. + ## Error Types Handled The retry mechanism specifically handles these error conditions: @@ -73,35 +73,27 @@ The retry functionality is automatically enabled for all HTTP operations in From ### For Plugin Developers -If you're writing plugins that need HTTP functionality, you can use the retry session: +If you're writing plugins that need HTTP functionality, use the +shared session from `request_session`. 
It includes retry handling +and automatic authentication for GitHub and GitLab: ```python -from fromager.http_retry import get_retry_session - -# Get a session with retry capabilities -session = get_retry_session() +from fromager.request_session import session # Use it like a normal requests session -response = session.get("https://example.com/api/data") +response = session.get("https://pkg.test/api/data") response.raise_for_status() ``` -For more advanced retry configuration: +To register authentication for additional hosts: ```python -from fromager.http_retry import create_retry_session - -# Custom retry configuration -retry_config = { - "total": 5, - "backoff_factor": 2.0, - "status_forcelist": [429, 502, 503, 504], -} - -session = create_retry_session( - retry_config=retry_config, - timeout=60.0 -) +from fromager.request_session import session_auth + +def _resolve_my_auth(scheme: str, hostname: str) -> dict[str, str]: + return {"Authorization": "Bearer my-token"} + +session_auth.add("https://my-registry.test", _resolve_my_auth) ``` ### Decorating Functions with Retry Logic @@ -128,7 +120,7 @@ The retry system logs important events: - **WARNING**: When retries are attempted with backoff times - **ERROR**: When all retry attempts are exhausted -- **DEBUG**: Detailed retry configuration and GitHub token status +- **DEBUG**: Detailed retry configuration and authentication resolution Example log output: @@ -151,13 +143,13 @@ INFO saved /path/to/package.tar.gz If you're seeing many retries, consider: -- Setting `GITHUB_TOKEN` for GitHub API calls (automatically applied to GitHub requests) +- Configuring authentication credentials (see {doc}`how-tos/authentication`) to avoid API rate limits - Increasing timeout values for slow connections - Checking network connectivity and DNS resolution ### API Rate Limiting -- Use `GITHUB_TOKEN` for GitHub repositories +- Configure credentials via netrc or environment variables (see {doc}`how-tos/authentication`) - Consider using a 
local package mirror for PyPI - Monitor API usage if using private registries diff --git a/src/fromager/request_session.py b/src/fromager/request_session.py index 54a5c6919..e1abd1665 100644 --- a/src/fromager/request_session.py +++ b/src/fromager/request_session.py @@ -1,6 +1,16 @@ +from __future__ import annotations + +import logging import os +from typing import TYPE_CHECKING +from urllib.parse import urlparse + +import requests.auth +from requests.utils import get_netrc_auth -from .http_retry import create_retry_session +from .http_retry import RetryHTTPAdapter + +logger = logging.getLogger(__name__) # Enhanced retry configuration for fromager FROMAGER_RETRY_CONFIG = { @@ -11,8 +21,128 @@ "raise_on_status": False, } -# Create a session with enhanced retry capabilities -session = create_retry_session( - retry_config=FROMAGER_RETRY_CONFIG, - timeout=float(os.environ.get("FROMAGER_HTTP_TIMEOUT", "120.0")), -) +GITHUB_API_URL = os.environ.get("GITHUB_API_URL", "https://api.github.com") + +GITLAB_CI_SERVER_URL = os.environ.get("CI_SERVER_URL", "https://gitlab.com") +GITLAB_JOB_TOKEN_NAME = "gitlab-ci-token" + + +if TYPE_CHECKING: + from collections.abc import Callable + + _AuthCallback = Callable[[str, str], dict[str, str]] + + +class SessionAuth(requests.auth.AuthBase): + """Authentication handler that dispatches by ``(scheme, hostname)``. + + The requests library only supports a single ``session.auth`` handler + and does not provide per-host authentication on mounted adapters. + This class fills that gap by mapping ``(scheme, hostname)`` keys to + auth resolver callbacks. On the first request to a given host the + callback is invoked and the result is cached. 
+ """ + + def __init__(self) -> None: + self._callbacks: dict[tuple[str, str], _AuthCallback] = {} + self._cache: dict[tuple[str, str], dict[str, str]] = {} + + def add(self, url: str, callback: _AuthCallback) -> None: + """Register a resolver *callback* for the scheme and hostname of *url*.""" + parsed = urlparse(url) + scheme = parsed.scheme + hostname = parsed.hostname or "" + if scheme not in {"http", "https"}: + raise ValueError(f"Unsupported scheme {scheme!r} in URL {url!r}") + if not hostname: + raise ValueError(f"Missing hostname in URL {url!r}") + key = (scheme, hostname) + self._cache.pop(key, None) + self._callbacks[key] = callback + + def get(self, url: str) -> dict[str, str]: + """Resolve and return the auth headers for *url*. + + Invokes the registered callback on first access and caches the + result. Returns an empty dict when no callback is registered. + """ + parsed = urlparse(url) + key = (parsed.scheme, parsed.hostname or "") + if key not in self._cache: + callback = self._callbacks.get(key) + self._cache[key] = callback(*key) if callback else {} + return dict(self._cache[key]) + + def __call__(self, r: requests.PreparedRequest) -> requests.PreparedRequest: + auth_header = self.get(r.url or "") + if auth_header: + r.headers.update(auth_header) + return r + + +def _resolve_github_auth(scheme: str, hostname: str) -> dict[str, str]: + """Resolve GitHub auth header from netrc or environment.""" + url = f"{scheme}://{hostname}" + netrc_auth = get_netrc_auth(url) + if netrc_auth is not None: + _login, password = netrc_auth + logger.debug("GitHub auth: using netrc credentials for %s", url) + return {"Authorization": f"token {password}"} + + token = os.environ.get("GITHUB_TOKEN") + if token: + logger.debug("GitHub auth: using GITHUB_TOKEN environment variable") + return {"Authorization": f"token {token}"} + return {} + + +def _resolve_gitlab_auth(scheme: str, hostname: str) -> dict[str, str]: + """Resolve GitLab auth header from netrc or environment.""" 
+ url = f"{scheme}://{hostname}" + netrc_auth = get_netrc_auth(url) + if netrc_auth is not None: + login, password = netrc_auth + header = "JOB-TOKEN" if login == GITLAB_JOB_TOKEN_NAME else "PRIVATE-TOKEN" + logger.debug("GitLab auth: using netrc credentials for %s (%s)", url, header) + return {header: password} + + token = os.environ.get("CI_JOB_TOKEN") + if token: + logger.debug("GitLab auth: using CI_JOB_TOKEN environment variable") + return {"JOB-TOKEN": token} + + token = os.environ.get("GITLAB_PRIVATE_TOKEN") + if token: + logger.debug("GitLab auth: using GITLAB_PRIVATE_TOKEN environment variable") + return {"PRIVATE-TOKEN": token} + return {} + + +def create_session() -> tuple[requests.Session, SessionAuth]: + """Create a requests session with retry and authentication. + + Mounts a `RetryHTTPAdapter` on ``http://`` and ``https://``. + Registers lazy auth callbacks for GitHub and GitLab on a + `SessionAuth` handler keyed by ``(scheme, hostname)``. + + Returns the session and its `SessionAuth` so callers can + register additional auth callbacks via ``auth.add()``. 
+ """ + adapter = RetryHTTPAdapter( + retry_config=FROMAGER_RETRY_CONFIG, + timeout=float(os.environ.get("FROMAGER_HTTP_TIMEOUT", "120.0")), + ) + + s = requests.Session() + s.mount("http://", adapter) + s.mount("https://", adapter) + + auth = SessionAuth() + auth.add(GITHUB_API_URL, _resolve_github_auth) + auth.add(GITLAB_CI_SERVER_URL, _resolve_gitlab_auth) + s.auth = auth + + return s, auth + + +session, session_auth = create_session() diff --git a/src/fromager/resolver.py b/src/fromager/resolver.py index 0ac959f82..9ec2fa179 100644 --- a/src/fromager/resolver.py +++ b/src/fromager/resolver.py @@ -1134,12 +1134,6 @@ def _find_tags( identifier: str, ) -> Iterable[Candidate]: headers = {"accept": "application/vnd.github+json"} - - # Add GitHub authentication if available - github_token = os.environ.get("GITHUB_TOKEN") - if github_token: - headers["Authorization"] = f"token {github_token}" - nexturl = self.api_url.format(self=self) while nexturl: resp = session.get(nexturl, headers=headers) diff --git a/tests/test_request_session.py b/tests/test_request_session.py new file mode 100644 index 000000000..692edc9b5 --- /dev/null +++ b/tests/test_request_session.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +import os +from collections.abc import Generator, MutableMapping +from unittest.mock import MagicMock, patch + +import pytest +import requests + +from fromager.request_session import ( + SessionAuth, + _resolve_github_auth, + _resolve_gitlab_auth, + create_session, +) + + +@pytest.fixture +def mock_environ() -> Generator[MutableMapping[str, str]]: + """Patch os.environ with an empty dict and return it.""" + with patch.dict(os.environ, {}, clear=True): + yield os.environ + + +@pytest.fixture +def mock_netrc() -> Generator[MagicMock]: + """Patch get_netrc_auth to return None and return the mock.""" + with patch("fromager.request_session.get_netrc_auth", return_value=None) as m: + yield m + + +def _make_request(url: str) -> requests.PreparedRequest: + 
return requests.Request("GET", url).prepare() + + +def test_session_auth() -> None: + """Dispatch, caching, cache invalidation, and scheme separation.""" + call_count = 0 + + def counting(scheme: str, hostname: str) -> dict[str, str]: + nonlocal call_count + call_count += 1 + return {"X-Token": "val"} + + auth = SessionAuth() + auth.add("https://api.test", counting) + + # get() resolves and caches, second call uses cache + assert auth.get("https://api.test/path") == {"X-Token": "val"} + assert auth.get("https://api.test/other") == {"X-Token": "val"} + assert call_count == 1 + + # __call__ uses the same cache + r = _make_request("https://api.test/path") + auth(r) + assert r.headers["X-Token"] == "val" + assert call_count == 1 + + # No match -> empty dict from get(), no header from __call__ + assert auth.get("https://other.test/") == {} + r2 = _make_request("https://other.test/") + auth(r2) + assert "X-Token" not in r2.headers + + # Re-add invalidates cache + auth.add("https://api.test", lambda s, h: {"X-Token": "new"}) + assert auth.get("https://api.test/") == {"X-Token": "new"} + + # http vs https are separate + auth.add("http://api.test", lambda s, h: {"X-Token": "http"}) + assert auth.get("http://api.test/") == {"X-Token": "http"} + assert auth.get("https://api.test/") == {"X-Token": "new"} + + +def test_session_auth_add_validation() -> None: + auth = SessionAuth() + with pytest.raises(ValueError, match="Unsupported scheme"): + auth.add("ftp://host.test", lambda s, h: {}) + with pytest.raises(ValueError, match="Missing hostname"): + auth.add("https://", lambda s, h: {}) + + +def test_resolve_github_auth( + mock_environ: MutableMapping[str, str], mock_netrc: MagicMock +) -> None: + """Netrc > GITHUB_TOKEN > empty.""" + assert _resolve_github_auth("https", "api.github.com") == {} + + mock_environ["GITHUB_TOKEN"] = "env-token" + assert _resolve_github_auth("https", "api.github.com") == { + "Authorization": "token env-token" + } + + mock_netrc.return_value = ("user", 
"netrc-token") + assert _resolve_github_auth("https", "api.github.com") == { + "Authorization": "token netrc-token" + } + + +def test_resolve_gitlab_auth( + mock_environ: MutableMapping[str, str], mock_netrc: MagicMock +) -> None: + """Netrc > CI_JOB_TOKEN > GITLAB_PRIVATE_TOKEN > empty.""" + assert _resolve_gitlab_auth("https", "gitlab.com") == {} + + mock_environ["GITLAB_PRIVATE_TOKEN"] = "priv" + assert _resolve_gitlab_auth("https", "gitlab.com") == {"PRIVATE-TOKEN": "priv"} + + mock_environ["CI_JOB_TOKEN"] = "ci" + assert _resolve_gitlab_auth("https", "gitlab.com") == {"JOB-TOKEN": "ci"} + + # Netrc with regular user -> PRIVATE-TOKEN + mock_netrc.return_value = ("myuser", "netrc-token") + assert _resolve_gitlab_auth("https", "gitlab.com") == { + "PRIVATE-TOKEN": "netrc-token" + } + + # Netrc with gitlab-ci-token login -> JOB-TOKEN + mock_netrc.return_value = ("gitlab-ci-token", "job-secret") + assert _resolve_gitlab_auth("https", "gitlab.com") == {"JOB-TOKEN": "job-secret"} + + +def test_create_session( + mock_environ: MutableMapping[str, str], mock_netrc: MagicMock +) -> None: + s, auth = create_session() + + assert isinstance(s, requests.Session) + assert s.auth is auth + assert ("https", "api.github.com") in auth._callbacks + assert ("https", "gitlab.com") in auth._callbacks