diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index f34dcb1..ec7f2ca 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -25,22 +25,22 @@ jobs: echo "tag=$TAG" >> $GITHUB_OUTPUT # Mirror what publish_to_pypi.yaml does so the docker image and the - # PyPI wheel never disagree about their own version. - - name: Sync setup.py / __init__.py version to tag + # PyPI wheel never disagree about their own version. The version lives in a + # single source file (src/vfbquery/_version.py); setup.py reads it at build + # time and __init__.py imports it, so only _version.py is bumped here. + - name: Sync version to tag id: version run: | if [[ "$GITHUB_REF" == refs/tags/v* ]]; then VERSION=${GITHUB_REF#refs/tags/v} echo "Tag build detected: syncing __version__ to $VERSION" - sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$VERSION\"/" setup.py - sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$VERSION\"/" src/vfbquery/__init__.py + sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$VERSION\"/" src/vfbquery/_version.py else - VERSION=$(grep '^__version__' setup.py | sed 's/.*"\(.*\)".*/\1/') + VERSION=$(grep '^__version__' src/vfbquery/_version.py | sed 's/.*"\(.*\)".*/\1/') echo "Branch / dev build: using committed version $VERSION" fi echo "version=$VERSION" >> $GITHUB_OUTPUT - echo " setup.py: $(grep ^__version__ setup.py)" - echo " __init__.py: $(grep ^__version__ src/vfbquery/__init__.py)" + echo " _version.py: $(grep ^__version__ src/vfbquery/_version.py)" - name: Build test Docker image run: docker build --no-cache . 
--file Dockerfile --tag test-image diff --git a/.github/workflows/performance-test.yml b/.github/workflows/performance-test.yml index 6ddc3cf..3311c96 100644 --- a/.github/workflows/performance-test.yml +++ b/.github/workflows/performance-test.yml @@ -60,6 +60,12 @@ jobs: # IMPORTANT: the retry OVERWRITES performance_test_output.log so the # downstream "Fail job on test failures" step grades on the second # attempt's output only. + env: + # Read-only on PRs so a PR check never writes/purges the shared prod + # cache; writable on push-to-main and scheduled runs so those refresh + # and warm it (e.g. after a minor/major release). See + # solr_caching_readonly(). + VFBQUERY_CACHE_READONLY: ${{ github.event_name == 'pull_request' && 'true' || 'false' }} run: | set +e echo "=== Performance test attempt 1/2 (parallel) ===" @@ -80,6 +86,9 @@ jobs: if: always() env: VFBQUERY_CACHE_ENABLED: 'true' + # Read-only on PRs (never write/purge the shared prod cache); writable + # on push-to-main and scheduled runs so those refresh/warm it. + VFBQUERY_CACHE_READONLY: ${{ github.event_name == 'pull_request' && 'true' || 'false' }} MPLBACKEND: 'Agg' VISPY_GL_LIB: 'osmesa' VISPY_USE_EGL: '0' @@ -107,7 +116,11 @@ jobs: - name: Run Connectivity Tests if: always() env: - VFBQUERY_CACHE_ENABLED: 'true' + # Disable the result cache so the connectivity integration tests + # validate the LIVE query against the database, rather than reading + # (possibly stale) entries from the shared production cache or writing + # this run's results back into it. See solr_caching_disabled(). 
+ VFBQUERY_CACHE_ENABLED: 'false' MPLBACKEND: 'Agg' VISPY_GL_LIB: 'osmesa' VISPY_USE_EGL: '0' diff --git a/.github/workflows/publish_to_pypi.yaml b/.github/workflows/publish_to_pypi.yaml index 51d543d..3956fd3 100644 --- a/.github/workflows/publish_to_pypi.yaml +++ b/.github/workflows/publish_to_pypi.yaml @@ -10,6 +10,7 @@ jobs: runs-on: ubuntu-latest permissions: id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + contents: write # allows the post-publish step to commit the version bump back to main steps: - uses: actions/checkout@v4 with: @@ -43,22 +44,18 @@ jobs: # Set environment variables for the build echo "VERSION=$VERSION" >> $GITHUB_ENV - # Update version in setup.py - echo "Updating version in setup.py to $VERSION" - sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$VERSION\"/" setup.py - - # Update version in package __init__.py - echo "Updating version in src/vfbquery/__init__.py to $VERSION" - sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$VERSION\"/" src/vfbquery/__init__.py - - echo "Updated setup.py version:" - grep "__version__" setup.py - echo "Updated package version:" - grep "__version__" src/vfbquery/__init__.py + # Single source of truth: bump only _version.py. setup.py reads it at + # build time and vfbquery.__init__ imports it at runtime, so the wheel + # metadata, vfbquery.__version__ and the cache version stamp all follow. 
+ echo "Updating version in src/vfbquery/_version.py to $VERSION" + sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$VERSION\"/" src/vfbquery/_version.py + + echo "Updated version:" + grep "__version__" src/vfbquery/_version.py else # Not running from a tag, show current version - echo "Not running from a tag, using existing version from setup.py" - grep "__version__" setup.py + echo "Not running from a tag, using existing version from _version.py" + grep "__version__" src/vfbquery/_version.py fi - name: Build distributions @@ -110,6 +107,33 @@ jobs: - name: Publish distribution 📦 to PyPI uses: pypa/gh-action-pypi-publish@v1.12.2 + + - name: Commit version bump back to main + # Runs only after a successful publish from a release tag. The build above + # runs from a detached tag checkout, so here we switch to the live main + # branch, re-apply the released version to the single source file + # (_version.py) and push. [skip ci] keeps this housekeeping commit from + # retriggering the test/perf workflows. + if: success() && startsWith(github.ref, 'refs/tags/v') + run: | + set -e + if [[ -z "$VERSION" ]]; then + echo "VERSION not set; nothing to commit." + exit 0 + fi + git config user.email "action@github.com" + git config user.name "GitHub Action" + git fetch origin main + git reset --hard origin/main + sed -i "s/__version__ = \"[^\"]*\"/__version__ = \"$VERSION\"/" src/vfbquery/_version.py + git add src/vfbquery/_version.py + if git diff --staged --quiet; then + echo "main already at version $VERSION; nothing to commit." + else + git commit -m "Bump version to $VERSION [skip ci]" + git push origin HEAD:main + fi + # - name: Publish package to TestPyPI # uses: pypa/gh-action-pypi-publish@release/v1 # with: diff --git a/CACHING.md b/CACHING.md index c4d3b9d..aff4bb6 100644 --- a/CACHING.md +++ b/CACHING.md @@ -111,6 +111,35 @@ Note: data reads in `vfb_queries.py` (term_info, painted domains, ontology label lookups, etc.) 
still go to `solr.virtualflybrain.org` — only the result *cache* moved. The two are independent. +## Cache versioning and invalidation + +Every cache entry is stamped with the VFBquery package version (major.minor) that +wrote it, so results from an old code version aren't served after an upgrade. + +The **running** version is resolved (in `solr_result_cache.py`) as: + +1. the `VFBQUERY_VERSION` environment variable if set, otherwise +2. the installed package version (`importlib.metadata.version('vfbquery')`), + +normalized to **major.minor**. That value comes from the single source of truth, +`src/vfbquery/_version.py` (see [RELEASING.md](RELEASING.md)). + +On read, if an entry's stamp differs from the running version, invalidation is +**monotonic** — it only discards entries written by an *older* version: + +- **Older (or unversioned) entry** → invalidated, deleted, and recomputed by the + current code. +- **Newer entry** (seen by a stale/older install, or by an older deploy running + alongside a newer one) → treated as a miss but **not deleted**. An older client + must never purge a fresher entry; the previous `!=` check did, which let + downgrades wipe live entries and made concurrent versions thrash each other. + +Consequences for the major.minor namespace: + +- **Patch bumps** (`1.20.0 → 1.20.3`) share the cache — no invalidation. +- **Minor/major bumps** (`1.20 → 1.21`) invalidate older entries on read, so a + release that changes query output naturally refreshes the cache. + ## Runtime Configuration Control caching behavior: @@ -133,6 +162,54 @@ Disable caching globally if needed: export VFBQUERY_CACHE_ENABLED=false ``` +When disabled, the cache layer is **fully bypassed** — every query runs live +against Neo4j/Owlery/Solr with **no read, no write, no version-invalidation, and +no contact with the cache server** (`solr_caching_disabled()` in +`solr_result_cache.py`; mirrored in `vfb_connectivity.query_connectivity`). 
+ +This is how the **integration tests** run in CI. The test steps that assert on +query *results* (`test_neuron_neuron_connectivity`, `test_neuron_region_connectivity`, +`test_vfb_connectivity`, the unit tests in `python-test.yml`, and `examples.yml`) +set `VFBQUERY_CACHE_ENABLED=false` so they: + +- validate the **live** query for the branch under test, not a (possibly stale) + cached result, and +- never write a PR/branch's output back into the **shared production cache**. + +The performance workflow's perf-timing steps keep caching enabled on purpose +(they measure warm-cache latency); only the result-asserting steps disable it. + +#### Read-only mode + +```bash +export VFBQUERY_CACHE_READONLY=true +``` + +Read-only mode still **reads** the cache (warm results are served), but +suppresses every **mutation** — no writes, no force-refresh clears, and no +version/expiry purges (`solr_caching_readonly()`, gating `cache_result`, +`clear_cache_entry` and `_clear_expired_cache_document`). + +This is used by the **performance-test workflow's perf-timing steps**, but only +on **pull requests** — `VFBQUERY_CACHE_READONLY` is set from +`github.event_name == 'pull_request'`. So: + +- **On PRs** the perf steps read warm entries for representative timings but + never write or purge. Combined with `VFBQUERY_CACHE_ENABLED=false` on the + result-asserting steps, **no PR run can modify the production cache**. +- **On push-to-`main` and scheduled runs** those perf steps are *writable*, so + they refresh/warm the cache under the current `main` version. + +That post-merge + daily-scheduled warming (plus lazy refresh by production +traffic) is what keeps the cache populated for the version on `main`, including +after a release bumps it. There's no dedicated release-triggered warm. 
+ +Caveat: a PR that bumps the **minor/major** version reads cold in read-only mode +(its version's entries don't exist yet — see [Cache versioning and invalidation](#cache-versioning-and-invalidation) above); +same-version PRs read the already-warm production entries. If you'd rather PR +checks read *and* write a cache without touching production, point them at a +separate collection with `VFBQUERY_SOLR_URL` instead. + ## Performance Benefits VFBquery SOLR caching provides significant performance improvements: diff --git a/README.md b/README.md index dc24b49..a545265 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,11 @@ result2 = vfb.get_term_info('FBbt_00003748') # 54,000x faster! similar = vfb.get_similar_neurons('VFB_jrchk00s') # Fast after first run ``` +📚 See [CACHING.md](CACHING.md) for cache configuration, the `VFBQUERY_CACHE_ENABLED` +bypass (used by the tests), and version-based invalidation; and +[RELEASING.md](RELEASING.md) for how the single-source version (`_version.py`) is +bumped from the release tag. + To get term info for a term: get_term_info(ID) diff --git a/RELEASING.md b/RELEASING.md new file mode 100644 index 0000000..48209d3 --- /dev/null +++ b/RELEASING.md @@ -0,0 +1,76 @@ +# Releasing VFBquery + +## Version: single source of truth + +The package version lives in exactly one place: + +``` +src/vfbquery/_version.py -> __version__ = "X.Y.Z" +``` + +Everything else derives from it, so the fields can never drift apart: + +- **`setup.py`** reads `_version.py` at build time (via `exec`, without importing + the package), so the wheel/sdist metadata matches. +- **`vfbquery/__init__.py`** does `from ._version import __version__`, so + `vfbquery.__version__` (and `ha_api.py`'s version reporting) matches. +- **The SOLR result cache** stamps entries with this version (major.minor) and + uses it for invalidation — see [CACHING.md](CACHING.md#cache-versioning-and-invalidation). + +Do **not** hard-code the version anywhere else. + +## Cutting a release + +1. 
Create a **GitHub Release** with a tag of the form `vX.Y.Z` (e.g. `v1.21.0`). + +That's it — the `Publish 🐍 📦 to PyPI` workflow +(`.github/workflows/publish_to_pypi.yaml`) does the rest: + +1. Checks out the tag, extracts `X.Y.Z` from `refs/tags/vX.Y.Z`, and writes it + into `_version.py` (`sed`). +2. Builds the sdist/wheel (version comes from `_version.py`) and verifies the + metadata matches the tag. +3. **Publishes to PyPI** via trusted publishing. +4. **Commits the bump back to `main`** — switches from the detached tag checkout + to live `main`, re-applies `X.Y.Z` to `_version.py`, and pushes + `Bump version to X.Y.Z [skip ci]`. + +So after a release, **`main` reflects the released version** too — you don't have +to bump it by hand. + +## Cache warming after a release + +A minor/major bump invalidates the previous version's cache entries +(see [CACHING.md](CACHING.md#cache-versioning-and-invalidation)), so they're +refilled with the new version's output. That happens two ways, with no dedicated +release-triggered step: + +- **Lazily**, by the deployed production service as it serves traffic (the + primary path — each query refreshes on first read). +- **By the `performance-test` workflow on `main`** — its perf steps are writable + on push-to-`main` and scheduled (daily) runs (read-only only on PRs), so they + recompute and re-cache the perf-test query set under the current `main` + version. The daily schedule guarantees the new version's entries are warmed + within a day of a release, so later PR runs read a warm cache. + +### Notes & guarantees + +- The commit-back step runs **only after a successful publish** and only for + `refs/tags/v*` (`if: success() && startsWith(github.ref, 'refs/tags/v')`). +- It's a **no-op if `main` is already at that version** (guarded by + `git diff --staged --quiet`), so you can also bump `_version.py` in a PR before + tagging and the workflow won't create an empty commit. 
+- The push needs `contents: write`, which is declared in the workflow's job + `permissions` alongside the `id-token: write` used for PyPI. +- `[skip ci]` keeps the housekeeping commit from retriggering the test/perf + workflows. + +### Choosing the version bump + +Because the cache namespace is keyed on **major.minor** +(see [CACHING.md](CACHING.md#cache-versioning-and-invalidation)): + +- Bump the **patch** for changes that don't alter query *output* — cached results + stay valid (no invalidation). +- Bump **minor/major** when query output changes — older cache entries are then + invalidated on read, so users get refreshed results. diff --git a/setup.py b/setup.py index d414125..168d633 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,12 @@ here = path.abspath(path.dirname(__file__)) -__version__ = "1.20.0" +# Single source of truth: read __version__ from src/vfbquery/_version.py without +# importing the package (which would pull in runtime dependencies at build time). +_version_ns = {} +with open(path.join(here, "src", "vfbquery", "_version.py")) as _vf: + exec(_vf.read(), _version_ns) +__version__ = _version_ns["__version__"] # Get the long description from the README file with open(path.join(here, 'README.md')) as f: diff --git a/src/vfbquery/__init__.py b/src/vfbquery/__init__.py index 066d69a..ffbee01 100644 --- a/src/vfbquery/__init__.py +++ b/src/vfbquery/__init__.py @@ -98,5 +98,5 @@ def clear_solr_cache(query_type: str, term_id: str) -> bool: except ImportError: __solr_caching_available__ = False -# Version information -__version__ = "1.12.1" +# Version information (single source of truth — see _version.py) +from ._version import __version__ diff --git a/src/vfbquery/_version.py b/src/vfbquery/_version.py new file mode 100644 index 0000000..6a281d3 --- /dev/null +++ b/src/vfbquery/_version.py @@ -0,0 +1,9 @@ +"""Single source of truth for the VFBquery package version. 
+ +Both ``setup.py`` (read at build time) and ``vfbquery.__init__`` (imported at +runtime) take ``__version__`` from here, and the release workflow bumps only +this file, so the packaging metadata, ``vfbquery.__version__`` and the SOLR +cache's version stamp can never drift apart. +""" + +__version__ = "1.20.0" diff --git a/src/vfbquery/solr_result_cache.py b/src/vfbquery/solr_result_cache.py index 04a2d2f..ff66b21 100644 --- a/src/vfbquery/solr_result_cache.py +++ b/src/vfbquery/solr_result_cache.py @@ -139,6 +139,18 @@ def _normalize_version(self, version: Optional[str]) -> Optional[str]: minor = match.group(2) or '0' return f"{major}.{minor}" + def _version_tuple(self, normalized_version: Optional[str]): + """Return a numeric ``(major, minor)`` tuple for a normalized version + string, or ``None`` if it can't be parsed. Used for ordered comparison + (string compare is wrong: ``'1.8' > '1.20'`` lexicographically).""" + if not normalized_version: + return None + try: + major, minor = normalized_version.split('.') + return (int(major), int(minor)) + except (ValueError, AttributeError): + return None + def _get_package_version(self) -> Optional[str]: """Return the VFBquery package version for cache validation.""" if hasattr(self, '_package_version') and self._package_version is not None: @@ -257,15 +269,36 @@ def get_cached_result(self, query_type: str, term_id: str, **params) -> Optional # Parse the cached metadata and result cached_data = json.loads(cached_field) - # Check package version before anything else so stale cache is rejected early + # Check package version before anything else so stale cache is rejected early. + # Only invalidate when the cached entry is OLDER than the current code + # (compared numerically by major.minor) or carries no parseable version, + # so newer code repopulates stale entries. 
A NEWER cached entry — seen by + # a stale/older install, or by an older deploy running concurrently — is + # treated as a miss but NOT deleted: an older client must never purge a + # fresher entry (the previous `!=` check did, causing cross-version + # cache thrashing and letting downgrades wipe live entries). current_version = self._get_cache_package_version() cached_version = self._normalize_version(cached_data.get("package_version") or cached_data.get("version")) if current_version and cached_version != current_version: - logger.info( - f"Cache invalidated for {query_type}({term_id}) because package major.minor version changed " - f"(cached={cached_version}, current={current_version})" + cached_t = self._version_tuple(cached_version) + current_t = self._version_tuple(current_version) + cached_is_older = ( + current_t is None # current unparseable -> be conservative + or cached_t is None # legacy/unversioned entry + or cached_t < current_t ) - self._clear_expired_cache_document(cache_doc_id) + if cached_is_older: + logger.info( + f"Cache invalidated for {query_type}({term_id}): cached version " + f"{cached_version} older than current {current_version}" + ) + self._clear_expired_cache_document(cache_doc_id) + else: + logger.info( + f"Cache miss for {query_type}({term_id}): cached version " + f"{cached_version} newer than current {current_version}; " + f"not serving or deleting it" + ) return None # Check expiration (3-month max age) @@ -360,6 +393,10 @@ def cache_result(self, query_type: str, term_id: str, result: Any, **params) -> Returns: True if successfully cached, False otherwise """ + if solr_caching_readonly(): + # Read-only mode: never write to the shared cache (e.g. PR checks). 
+ return False + if not result: logger.debug("Empty result, not caching") return False @@ -420,6 +457,9 @@ def cache_result(self, query_type: str, term_id: str, result: Any, **params) -> def _clear_expired_cache_document(self, cache_doc_id: str): """Delete expired cache document from SOLR""" + if solr_caching_readonly(): + # Read-only mode: never delete/purge entries (e.g. PR checks). + return if not self._solr_available(): return try: @@ -444,6 +484,9 @@ def clear_cache_entry(self, query_type: str, term_id: str) -> bool: Returns: True if successfully cleared, False otherwise """ + if solr_caching_readonly(): + # Read-only mode: never delete/purge entries (e.g. PR checks). + return False if not self._solr_available(): return False try: @@ -762,15 +805,38 @@ def get_solr_cache() -> SolrResultCache: _solr_cache = SolrResultCache() return _solr_cache +def solr_caching_disabled() -> bool: + """True when the SOLR result cache is disabled via VFBQUERY_CACHE_ENABLED. + + Matches the check in __init__.py. When disabled, the cache layer is fully + bypassed — no read, no write, no contact with the cache server — so callers + (notably the test suite) exercise the live query and never mutate the + shared production cache. Evaluated per-call so tests can toggle it via env. + """ + return os.getenv('VFBQUERY_CACHE_ENABLED', 'true').lower() in ('false', '0', 'no', 'off') + + +def solr_caching_readonly() -> bool: + """True when the SOLR result cache is read-only via VFBQUERY_CACHE_READONLY. + + In read-only mode entries may still be *read* (so warm results are served — + e.g. the perf tests keep their fast timings), but all *mutations* are + suppressed: no writes, no force-refresh clears, and no version/expiry purges. + This lets PR checks read the shared production cache without ever modifying + it. Evaluated per-call so CI can toggle it via env. 
+ """ + return os.getenv('VFBQUERY_CACHE_READONLY', 'false').lower() in ('true', '1', 'yes', 'on') + + def with_solr_cache(query_type: str): """ Decorator to add SOLR caching to query functions - + Usage: @with_solr_cache('term_info') def get_term_info(short_form, force_refresh=False, **kwargs): # ... existing implementation - + The decorated function can accept a 'force_refresh' parameter to bypass cache. """ def decorator(func): @@ -778,7 +844,14 @@ def decorator(func): def wrapper(*args, **kwargs): # Check if force_refresh is requested (pop it before passing to function) force_refresh = kwargs.pop('force_refresh', False) - + + # Fully bypass the cache when disabled (VFBQUERY_CACHE_ENABLED=false): + # run the live query directly, never reading stale data nor writing + # to the shared production cache. This is what the test suite relies + # on so a PR's queries are validated live without poisoning the cache. + if solr_caching_disabled(): + return func(*args, **kwargs) + # Check if limit is applied - only cache full results (limit=-1) limit = kwargs.get('limit', -1) should_cache = (limit == -1) # Only cache when getting all results (limit=-1) diff --git a/src/vfbquery/vfb_connectivity.py b/src/vfbquery/vfb_connectivity.py index 334fccc..2efc8a0 100644 --- a/src/vfbquery/vfb_connectivity.py +++ b/src/vfbquery/vfb_connectivity.py @@ -131,8 +131,16 @@ def query_connectivity(upstream_type=None, downstream_type=None, weight=5, if upstream_type is None and downstream_type is None: raise ValueError("At least one of upstream_type or downstream_type must be specified") + # Fully bypass the cache when disabled (VFBQUERY_CACHE_ENABLED=false): run + # the live query directly without reading stale data or writing to the + # shared production cache. Mirrors the @with_solr_cache decorator's bypass. 
+ from .solr_result_cache import get_solr_cache, solr_caching_disabled + if solr_caching_disabled(): + return _query_connectivity_uncached( + upstream_type, downstream_type, weight, group_by_class, exclude_dbs + ) + # Persistent SOLR cache (composite key) sitting behind the in-memory cache. - from .solr_result_cache import get_solr_cache cache = get_solr_cache() cache_key = _connectivity_cache_key( upstream_type, downstream_type, weight, group_by_class, exclude_dbs