Merge pull request #45 from VirtualFlyBrain/feature/cache-all-queries

Robbie1977 · web-flow · commit 3a2bfc83fb91 · 2026-06-08T15:27:26.000+01:00
Cache all handler-reachable queries (v1.19.0)
diff --git a/CACHING.md b/CACHING.md
@@ -26,6 +26,91 @@ VFBquery uses a single-layer caching approach with SOLR:
 3. **Cache persistence**: Survives Python restarts and server reboots
 4. **Automatic expiration**: 3-month TTL matches VFB_connect behavior
 
+## Cache coverage (v1.19.0)
+
+As of v1.19.0 every query-result function reachable from the HA API handlers
+(`ha_api.py`) is served by the persistent SOLR cache, except a small set that
+are deliberately excluded (see below). Coverage is verified by a static sweep
+that traces each handler entry point through the `QUERY_TYPE_MAP` dispatch and
+the FlyBase/connectivity/hierarchy handlers — see `coverage_sweep.py`.
+
+Caching is applied in one of two layers, both of which the handler path goes
+through (`handler -> vfbquery.<fn> (patched to *_cached) -> _original`):
+
+- `@with_solr_cache('<bucket>')` on the original in `vfb_queries.py` (most
+  hierarchy / neuron-in-region / connectivity / image queries), or
+- `@with_solr_cache('<bucket>')` on the `*_cached` wrapper in
+  `cached_functions.py` (term_info, similarity, transcriptomics, datasets).
+
+A function counts as cached if either layer carries the decorator; do not add
+the decorator at both layers for the same function (double round-trips).
+
+New buckets added in v1.19.0: `cluster_expression`, `expression_cluster`,
+`scrnaseq_dataset_data`, `individual_neuron_inputs`, `similar_morphology`,
+`similar_morphology_part_of`, `similar_morphology_part_of_exp`,
+`similar_morphology_nb`, `similar_morphology_nb_exp`, `dataset_images`,
+`all_aligned_images`, `all_datasets`, `transgene_expression_here`,
+`related_anatomy`. The five genuinely new buckets (`dataset_images`,
+`all_aligned_images`, `all_datasets`, `transgene_expression_here`,
+`related_anatomy`) are also listed in the wrapper's `expensive_query_types`
+and `dataframe_query_types`, so a limited request computes the full result
+once, caches it, and serves later limited requests by slicing the cached full
+result.
+
+### Cross-dataset connectivity (`query_connectivity`)
+
+`query_connectivity` takes five parameters (`upstream_type`,
+`downstream_type`, `weight`, `group_by_class`, `exclude_dbs`), so the default
+single-id `@with_solr_cache` key does not fit. It is persisted directly in
+`vfb_connectivity.py` under a composite key
+(`query_connectivity:{upstream}:{downstream}:{weight}:{group_by_class}:{exclude_dbs}`,
+hashed for a Solr-safe document id). The in-memory `ResultCache` and request
+coalescer in `ha_api.py` sit in front; this SOLR layer sits behind so a cold
+miss survives restarts and reaches the other containers. Graph
+post-processing (`post_fn`) stays in the handler and is never part of the
+cached payload. `force_refresh=true` on `/query_connectivity` drops both the
+in-memory entry and the SOLR document and recomputes.
+
+### Deliberately not cached
+
+- `get_similar_morphology_userdata` — keyed on a per-session user upload id;
+  the result is user/session-specific, so it is left to the in-memory L1
+  cache only.
+- `get_flybase_stocks`, `get_flybase_combo_pubs`, `find_stocks`,
+  `find_combo_publications` — backed by the FlyBase RDBMS, not Neo4j/Owlery;
+  out of scope for this offload.
+- `resolve_entity`, `resolve_combination` — thin resolvers over the already
+  cached `term_info`.
+- `list_connectome_datasets` — tiny static list; L1 cache is sufficient.
+- `get_hierarchy` — delegates its heavy work to the SOLR-cached
+  `get_parts_of` / `get_subclasses_of` and relies on Owlery's own
+  server-side cache, with the handler holding an in-memory composite-key
+  entry; persistent composite caching is a sensible follow-up but was left
+  out to keep this change focused.
+
+### Cache server
+
+The cache reads and writes `cache_url`, which defaults to the dedicated
+query-cache Solr:
+
+```
+http://vfbquerycache.virtualflybrain.org:80/solr/vfb_json
+```
+
+(`SolrResultCache.DEFAULT_CACHE_URL`). This is a separate, lightly-loaded host
+from the ontology Solr (`solr.virtualflybrain.org`); it is reached on port 80
+because the Solr native port is firewalled externally. Override with the
+`VFBQUERY_SOLR_URL` environment variable (e.g. to point at a staging core for
+testing):
+
+```bash
+export VFBQUERY_SOLR_URL=http://localhost:8983/solr/vfb_json
+```
+
+Note: data reads in `vfb_queries.py` (term_info, painted domains, ontology
+label lookups, etc.) still go to `solr.virtualflybrain.org` — only the result
+*cache* moved. The two are independent.
+
 ## Runtime Configuration
 
 Control caching behavior:
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 here = path.abspath(path.dirname(__file__))
 
-__version__ = "1.18.0"
+__version__ = "1.19.0"
 
 # Get the long description from the README file
 with open(path.join(here, 'README.md')) as f:
diff --git a/src/vfbquery/cached_functions.py b/src/vfbquery/cached_functions.py
@@ -138,6 +138,7 @@ def get_similar_neurons_cached(neuron, similarity_score='NBLAST_score', return_d
     """
     return _original_get_similar_neurons(neuron=neuron, similarity_score=similarity_score, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('similar_morphology')
 def get_similar_morphology_cached(neuron_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_similar_morphology with SOLR caching.
@@ -153,6 +154,7 @@ def get_similar_morphology_cached(neuron_short_form: str, return_dataframe=True,
     """
     return _original_get_similar_morphology(neuron_short_form=neuron_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('similar_morphology_part_of')
 def get_similar_morphology_part_of_cached(neuron_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_similar_morphology_part_of with SOLR caching.
@@ -168,6 +170,7 @@ def get_similar_morphology_part_of_cached(neuron_short_form: str, return_datafra
     """
     return _original_get_similar_morphology_part_of(neuron_short_form=neuron_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('similar_morphology_part_of_exp')
 def get_similar_morphology_part_of_exp_cached(expression_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_similar_morphology_part_of_exp with SOLR caching.
@@ -183,6 +186,7 @@ def get_similar_morphology_part_of_exp_cached(expression_short_form: str, return
     """
     return _original_get_similar_morphology_part_of_exp(expression_short_form=expression_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('similar_morphology_nb')
 def get_similar_morphology_nb_cached(neuron_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_similar_morphology_nb with SOLR caching.
@@ -197,6 +201,7 @@ def get_similar_morphology_nb_cached(neuron_short_form: str, return_dataframe=Tr
     """
     return _original_get_similar_morphology_nb(neuron_short_form=neuron_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('similar_morphology_nb_exp')
 def get_similar_morphology_nb_exp_cached(expression_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_similar_morphology_nb_exp with SOLR caching.
@@ -211,6 +216,9 @@ def get_similar_morphology_nb_exp_cached(expression_short_form: str, return_data
     """
     return _original_get_similar_morphology_nb_exp(expression_short_form=expression_short_form, return_dataframe=return_dataframe, limit=limit)
 
+# Deliberately not @with_solr_cache: the key is a per-session user upload id,
+# so the result is user/session-specific and not safe to share via the
+# persistent cache. Left to recompute (and to the in-memory L1 cache only).
 def get_similar_morphology_userdata_cached(upload_id: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_similar_morphology_userdata with SOLR caching.
@@ -295,6 +303,7 @@ def get_templates_cached(limit: int = -1, return_dataframe: bool = False, force_
     """
     return _original_get_templates(limit=limit, return_dataframe=return_dataframe, force_refresh=force_refresh)
 
+@with_solr_cache('related_anatomy')
 def get_related_anatomy_cached(template_short_form: str, limit: int = -1, return_dataframe: bool = False, force_refresh: bool = False):
     """
     Enhanced get_related_anatomy with SOLR caching.
@@ -348,6 +357,7 @@ def get_template_roi_tree_cached(template_short_form: str, return_dataframe: boo
     """
     return _original_get_template_roi_tree(template_short_form=template_short_form, return_dataframe=return_dataframe)
 
+@with_solr_cache('dataset_images')
 def get_dataset_images_cached(dataset_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_dataset_images with SOLR caching.
@@ -362,6 +372,7 @@ def get_dataset_images_cached(dataset_short_form: str, return_dataframe=True, li
     """
     return _original_get_dataset_images(dataset_short_form=dataset_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('all_aligned_images')
 def get_all_aligned_images_cached(template_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_all_aligned_images with SOLR caching.
@@ -391,6 +402,7 @@ def get_aligned_datasets_cached(template_short_form: str, return_dataframe=True,
     """
     return _original_get_aligned_datasets(template_short_form=template_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('all_datasets')
 def get_all_datasets_cached(return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_all_datasets with SOLR caching.
@@ -404,10 +416,16 @@ def get_all_datasets_cached(return_dataframe=True, limit: int = -1, force_refres
     """
     return _original_get_all_datasets(return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('individual_neuron_inputs')
 def get_individual_neuron_inputs_cached(neuron_short_form: str, return_dataframe=True, limit: int = -1, summary_mode: bool = False, force_refresh: bool = False):
     """
     Enhanced get_individual_neuron_inputs with SOLR caching.
 
+    Note: the SOLR cache keys on the neuron id (and return_dataframe). The
+    REST path always calls with summary_mode=False, so the default key is
+    safe there; a non-default summary_mode is not part of the cache key, so
+    direct library callers that vary it should pass force_refresh.
+
     Args:
         neuron_short_form: Neuron short form
         return_dataframe: Whether to return DataFrame or list of dicts
@@ -484,6 +502,7 @@ def get_anatomy_scrnaseq_cached(anatomy_short_form: str, return_dataframe=True,
     """
     return _original_get_anatomy_scrnaseq(anatomy_short_form=anatomy_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('cluster_expression')
 def get_cluster_expression_cached(cluster_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_cluster_expression with SOLR caching.
@@ -498,6 +517,7 @@ def get_cluster_expression_cached(cluster_short_form: str, return_dataframe=True
     """
     return _original_get_cluster_expression(cluster_short_form=cluster_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('expression_cluster')
 def get_expression_cluster_cached(gene_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_expression_cluster with SOLR caching.
@@ -512,6 +532,7 @@ def get_expression_cluster_cached(gene_short_form: str, return_dataframe=True, l
     """
     return _original_get_expression_cluster(gene_short_form=gene_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('scrnaseq_dataset_data')
 def get_scrnaseq_dataset_data_cached(dataset_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_scrnaseq_dataset_data with SOLR caching.
@@ -526,6 +547,7 @@ def get_scrnaseq_dataset_data_cached(dataset_short_form: str, return_dataframe=T
     """
     return _original_get_scrnaseq_dataset_data(dataset_short_form=dataset_short_form, return_dataframe=return_dataframe, limit=limit)
 
+@with_solr_cache('transgene_expression_here')
 def get_transgene_expression_here_cached(anatomy_short_form: str, return_dataframe=True, limit: int = -1, force_refresh: bool = False):
     """
     Enhanced get_transgene_expression_here with SOLR caching.
diff --git a/src/vfbquery/ha_api.py b/src/vfbquery/ha_api.py
@@ -425,14 +425,15 @@ def _run_list_connectome_datasets():
 
 
 def _run_query_connectivity(upstream_type, downstream_type, weight,
-                            group_by_class, exclude_dbs):
+                            group_by_class, exclude_dbs, force_refresh=False):
     """Execute query_connectivity in a worker process."""
     return _vfb.query_connectivity(
         upstream_type=upstream_type,
         downstream_type=downstream_type,
         weight=weight,
         group_by_class=group_by_class,
         exclude_dbs=exclude_dbs,
+        force_refresh=force_refresh,
     )
 
 
@@ -962,6 +963,7 @@ async def handle_query_connectivity(request):
     else:
         exclude_dbs = ["hb", "fafb"]
     include_graph = request.query.get("include_graph", "false").lower() in ("true", "1", "yes")
+    force_refresh = request.query.get("force_refresh", "false").lower() in ("true", "1", "yes")
 
     post_fn = None
     if include_graph:
@@ -978,9 +980,13 @@ def post_fn(result):
             return result
 
     key = f"query_connectivity:{upstream}:{downstream}:{weight}:{group_by_class}:{exclude_dbs}"
+    # force_refresh=true drops the in-memory L1 entry so the recomputed result
+    # replaces it; the SOLR layer is invalidated inside query_connectivity.
+    if force_refresh:
+        request.app["result_cache"].invalidate(key)
     return await _dispatch_to_pool(
         request, key, _run_query_connectivity,
-        upstream, downstream, weight, group_by_class, exclude_dbs,
+        upstream, downstream, weight, group_by_class, exclude_dbs, force_refresh,
         post_fn=post_fn,
     )
 
diff --git a/src/vfbquery/solr_result_cache.py b/src/vfbquery/solr_result_cache.py
@@ -60,18 +60,26 @@ class SolrResultCache:
     will periodically probe Solr and re-enable itself when the service recovers.
     """
     
-    def __init__(self, 
-                 cache_url: str = "https://solr.virtualflybrain.org/solr/vfb_json",
+    # Dedicated query-cache Solr, reachable on port 80 (the ontology Solr's
+    # native port is firewalled externally, so the cache must use this host).
+    DEFAULT_CACHE_URL = "http://vfbquerycache.virtualflybrain.org:80/solr/vfb_json"
+
+    def __init__(self,
+                 cache_url: str = None,
                  ttl_hours: int = 2160,  # 3 months like VFB_connect
                  max_result_size_mb: int = 10):
         """
         Initialize SOLR result cache
-        
+
         Args:
-            cache_url: SOLR collection URL for caching
+            cache_url: SOLR collection URL for caching. Defaults to the
+                VFBQUERY_SOLR_URL env var if set, otherwise the dedicated
+                query-cache Solr (DEFAULT_CACHE_URL).
             ttl_hours: Time-to-live for cache entries in hours
             max_result_size_mb: Maximum result size to cache in MB
         """
+        if cache_url is None:
+            cache_url = os.getenv('VFBQUERY_SOLR_URL', self.DEFAULT_CACHE_URL)
         self.cache_url = cache_url
         self.ttl_hours = ttl_hours
         self.max_result_size_mb = max_result_size_mb
@@ -777,14 +785,19 @@ def wrapper(*args, **kwargs):
             
             # For expensive queries, we still only cache full results, but we handle limited requests
             # by slicing from cached full results
-            expensive_query_types = ['similar_neurons', 'similar_morphology', 'similar_morphology_part_of', 
-                                   'similar_morphology_part_of_exp', 'similar_morphology_nb', 
+            expensive_query_types = ['similar_neurons', 'similar_morphology', 'similar_morphology_part_of',
+                                   'similar_morphology_part_of_exp', 'similar_morphology_nb',
                                    'similar_morphology_nb_exp', 'similar_morphology_userdata',
-                                   'neurons_part_here', 'neurons_synaptic', 
+                                   'neurons_part_here', 'neurons_synaptic',
                                    'neurons_presynaptic', 'neurons_postsynaptic',
                                    'expression_overlaps_here', 'anatomy_scrnaseq', 'aligned_datasets', 'terms_for_pub',
                                    'individual_neuron_inputs', 'cluster_expression', 'expression_cluster', 'scrnaseq_dataset_data',
-                                   'painted_domains', 'downstream_class_connectivity_query', 'upstream_class_connectivity_query']
+                                   'painted_domains', 'downstream_class_connectivity_query', 'upstream_class_connectivity_query',
+                                   # New buckets (v1.19.0): large, limit-sliced results — listing them here
+                                   # means a limited request computes the full result once, caches it, and
+                                   # serves later limited requests by slicing the cached full result.
+                                   'dataset_images', 'all_aligned_images', 'all_datasets',
+                                   'transgene_expression_here', 'related_anatomy']
             
             # For neuron_neuron_connectivity_query, only cache when all parameters are defaults
             if query_type == 'neuron_neuron_connectivity_query':
@@ -795,11 +808,16 @@ def wrapper(*args, **kwargs):
             # Extract term_id from first argument or kwargs
             term_id = args[0] if args else kwargs.get('short_form') or kwargs.get('term_id')
             
-            # For functions like get_templates that don't have a term_id, use query_type as cache key
+            # For functions that don't have a term_id, use a fixed cache key
+            # tied to the query_type (the result is a single global list).
             if not term_id:
                 if query_type == 'templates':
                     # Use a fixed cache key for templates since it doesn't take a term_id
                     term_id = 'all_templates'
+                elif query_type == 'all_datasets':
+                    # get_all_datasets has no id argument; the result is the full
+                    # dataset catalogue, so a single fixed key is correct.
+                    term_id = 'all_datasets'
                 else:
                     logger.warning(f"No term_id found for caching {query_type}")
                     return func(*args, **kwargs)
@@ -823,7 +841,10 @@ def wrapper(*args, **kwargs):
                                    'images_that_develop_from', 'expression_pattern_fragments', 'expression_overlaps_here',
                                    'anatomy_scrnaseq', 'aligned_datasets', 'terms_for_pub', 'individual_neuron_inputs',
                                    'cluster_expression', 'expression_cluster', 'scrnaseq_dataset_data', 'painted_domains',
-                                   'downstream_class_connectivity_query', 'upstream_class_connectivity_query']
+                                   'downstream_class_connectivity_query', 'upstream_class_connectivity_query',
+                                   # New buckets (v1.19.0) — see expensive_query_types above.
+                                   'dataset_images', 'all_aligned_images', 'all_datasets',
+                                   'transgene_expression_here', 'related_anatomy']
             if query_type in dataframe_query_types:
                 return_dataframe = kwargs.get('return_dataframe', True)  # Default is True
                 cache_term_id = f"{cache_term_id}_dataframe_{return_dataframe}"
diff --git a/src/vfbquery/vfb_connectivity.py b/src/vfbquery/vfb_connectivity.py