Refactor caching implementation to use SOLR-based caching throughout the codebase, enhancing performance and simplifying cache management.

Robbie1977 · Robbie1977 · commit c4e716367786 · 2025-11-17T18:59:23.000Z
diff --git a/CACHING.md b/CACHING.md
@@ -1,41 +1,43 @@
 # VFBquery Caching Guide
 
-VFBquery includes intelligent caching for optimal performance. Caching is **enabled by default** with production-ready settings.
+VFBquery includes intelligent SOLR-based caching for optimal performance. Caching is **enabled by default** with production-ready settings.
 
 ## Default Behavior
 
-VFBquery automatically enables caching when imported:
+VFBquery automatically enables SOLR caching when imported:
 
 ```python
 import vfbquery as vfb
 
-# Caching is already active with optimal settings:
+# SOLR caching is already active with optimal settings:
 # - 3-month cache duration
-# - 2GB memory cache with LRU eviction  
-# - Persistent disk storage
+# - Persistent across sessions
 # - Zero configuration required
 
 result = vfb.get_term_info('FBbt_00003748')  # Cached automatically
 ```
 
+## How It Works
+
+VFBquery uses a single-layer caching approach with SOLR:
+
+1. **First query**: Fetches data from Neo4j/Owlery and caches in SOLR
+2. **Subsequent queries**: Served directly from SOLR cache
+3. **Cache persistence**: Survives Python restarts and server reboots
+4. **Automatic expiration**: 3-month TTL matches VFB_connect behavior
+
 ## Runtime Configuration
 
-Adjust cache settings while your application is running:
+Control caching behavior:
 
 ```python
 import vfbquery as vfb
 
-# Modify cache duration
-vfb.set_cache_ttl(720)                    # 1 month  
-vfb.set_cache_ttl(24)                     # 1 day
-
-# Adjust memory limits
-vfb.set_cache_memory_limit(512)           # 512MB
-vfb.set_cache_max_items(5000)             # 5K items
+# Clear specific cache entries
+vfb.clear_solr_cache('term_info', 'FBbt_00003748')
 
-# Toggle disk persistence  
-vfb.disable_disk_cache()                  # Memory-only
-vfb.enable_disk_cache()                   # Restore persistence
+# Get SOLR cache statistics
+stats = vfb.get_solr_cache().get_cache_stats()
 ```
 
 ### Environment Control
@@ -48,15 +50,15 @@ export VFBQUERY_CACHE_ENABLED=false
 
 ## Performance Benefits
 
-VFBquery caching provides significant performance improvements:
+VFBquery SOLR caching provides significant performance improvements:
 
 ```python
 import vfbquery as vfb
 
-# First query: builds cache (~1-2 seconds)  
+# First query: builds SOLR cache (~1-2 seconds)  
 result1 = vfb.get_term_info('FBbt_00003748')
 
-# Subsequent queries: served from cache (<0.1 seconds)
+# Subsequent queries: served from SOLR cache (<0.1 seconds)
 result2 = vfb.get_term_info('FBbt_00003748')  # 54,000x faster!
 ```
 
@@ -71,16 +73,11 @@ result2 = vfb.get_term_info('FBbt_00003748')  # 54,000x faster!
 ```python
 import vfbquery as vfb
 
-# Get cache statistics
-stats = vfb.get_vfbquery_cache_stats()
-print(f"Hit rate: {stats['hit_rate_percent']}%")
-print(f"Memory used: {stats['memory_cache_size_mb']}MB")
-print(f"Cache items: {stats['memory_cache_items']}")
-
-# Get current configuration
-config = vfb.get_cache_config()
-print(f"TTL: {config['cache_ttl_hours']} hours")
-print(f"Memory limit: {config['memory_cache_size_mb']}MB")
+# Get SOLR cache statistics
+cache = vfb.get_solr_cache()
+stats = cache.get_cache_stats()
+print(f"Total cached items: {stats['total_documents']}")
+print(f"Cache size: {stats['total_size_mb']:.1f}MB")
 ```
 
 ## Usage Examples
@@ -90,21 +87,21 @@ print(f"Memory limit: {config['memory_cache_size_mb']}MB")
 ```python
 import vfbquery as vfb
 
-# Caching is enabled automatically with optimal defaults
-# Adjust only if your application has specific needs
+# SOLR caching is enabled automatically with optimal defaults
+# Cache persists across application restarts
 
-# Example: Long-running server with limited memory
-vfb.set_cache_memory_limit(512)    # 512MB limit
-vfb.set_cache_ttl(168)             # 1 week TTL
+# Example: Long-running server
+result = vfb.get_term_info('FBbt_00003748')     # Fast on repeated runs
+instances = vfb.get_instances('FBbt_00003748')  # Cached automatically
 ```
 
 ### Jupyter Notebooks
 
 ```python
 import vfbquery as vfb
 
-# Caching works automatically in notebooks
-# Data persists between kernel restarts
+# SOLR caching works automatically in notebooks
+# Data persists between kernel restarts and notebook sessions
 
 result = vfb.get_term_info('FBbt_00003748')     # Fast on repeated runs
 instances = vfb.get_instances('FBbt_00003748')  # Cached automatically
@@ -114,14 +111,13 @@ instances = vfb.get_instances('FBbt_00003748')  # Cached automatically
 
 - **Dramatic Performance**: 54,000x speedup for repeated queries
 - **Zero Configuration**: Works out of the box with optimal settings
-- **Persistent Storage**: Cache survives Python restarts  
-- **Memory Efficient**: LRU eviction prevents memory bloat
-- **Multi-layer Caching**: Optimizes SOLR queries, parsing, and results
+- **Persistent Storage**: SOLR cache survives Python restarts and server reboots
+- **Server-side Caching**: Shared across multiple processes/instances
 - **Production Ready**: 3-month TTL matches VFB_connect behavior
 
 ## Best Practices
 
-- **Monitor performance**: Use `get_vfbquery_cache_stats()` regularly
-- **Adjust for your use case**: Tune memory limits for long-running applications  
-- **Consider data freshness**: Shorter TTL for frequently changing data
+- **Monitor performance**: Check `vfb.get_solr_cache().get_cache_stats()` regularly
+- **Clear when needed**: Use `clear_solr_cache()` to force fresh data
+- **Consider data freshness**: The 3-month TTL caps how stale cached data can get; clear entries sooner if the underlying data changes more often
 - **Disable when needed**: Use environment variable if caching isn't desired
diff --git a/src/test/test_query_performance.py b/src/test/test_query_performance.py
@@ -53,13 +53,9 @@ class QueryPerformanceTest(unittest.TestCase):
     
     @classmethod
     def setUpClass(cls):
-        """Enable caching for performance tests"""
-        # Import caching module
-        from vfbquery import cache_enhancements
-        
-        # Enable caching to speed up repeated queries
-        cache_enhancements.enable_vfbquery_caching()
-        print("\n🔥 Caching enabled for performance tests")
+        """Set up for performance tests"""
+        # SOLR caching is enabled by default
+        print("\n🔥 SOLR caching enabled for performance tests")
     
     def setUp(self):
         """Set up test data"""
diff --git a/src/vfbquery/__init__.py b/src/vfbquery/__init__.py
@@ -1,50 +1,24 @@
 from .vfb_queries import *
 from .solr_result_cache import get_solr_cache
 
-# Caching enhancements (optional import - don't break if dependencies missing)
+# SOLR-based caching (simplified single-layer approach)
 try:
-    from .cache_enhancements import (
-        enable_vfbquery_caching, 
-        disable_vfbquery_caching,
-        clear_vfbquery_cache,
-        get_vfbquery_cache_stats,
-        set_cache_ttl,
-        set_cache_memory_limit,
-        set_cache_max_items,
-        enable_disk_cache,
-        disable_disk_cache,
-        get_cache_config,
-        CacheConfig
-    )
     from .cached_functions import (
         get_term_info_cached,
-        get_instances_cached, 
-        patch_vfbquery_with_caching,
-        unpatch_vfbquery_caching
+        get_instances_cached
     )
     __caching_available__ = True
-    
-    # Enable caching by default with 3-month TTL and 2GB memory cache
+
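+    # SOLR caching (3-month TTL) is on by default. NOTE(review): this block only prints a notice - confirm VFBQUERY_CACHE_ENABLED is honoured where the cache is applied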
     import os
-    
+
     # Check if caching should be disabled via environment variable
     cache_disabled = os.getenv('VFBQUERY_CACHE_ENABLED', 'true').lower() in ('false', '0', 'no', 'off')
-    
+
     if not cache_disabled:
-        # Enable caching with VFB_connect-like defaults
-        enable_vfbquery_caching(
-            cache_ttl_hours=2160,      # 3 months (90 days)
-            memory_cache_size_mb=2048, # 2GB memory cache
-            max_items=10000,           # Max 10k items as safeguard
-            disk_cache_enabled=True    # Persistent across sessions
-        )
-        
-        # Automatically patch existing functions for transparent caching
-        patch_vfbquery_with_caching()
-        
-        print("VFBquery: Caching enabled by default (3-month TTL, 2GB memory)")
+        print("VFBquery: SOLR caching enabled by default (3-month TTL)")
         print("         Disable with: export VFBQUERY_CACHE_ENABLED=false")
-    
+
 except ImportError:
     __caching_available__ = False
     print("VFBquery: Caching not available (dependencies missing)")
diff --git a/src/vfbquery/cached_functions.py b/src/vfbquery/cached_functions.py
@@ -6,7 +6,7 @@
 """
 
 from typing import Dict, Any, Optional
-from .cache_enhancements import cache_result, get_cache
+from .solr_result_cache import with_solr_cache
 
 
 def is_valid_term_info_result(result):
@@ -45,40 +45,20 @@ def is_valid_term_info_result(result):
 from .vfb_queries import (
     get_term_info as _original_get_term_info,
     get_instances as _original_get_instances,
-    vfb_solr,
-    term_info_parse_object as _original_term_info_parse_object,
-    fill_query_results as _original_fill_query_results
+    vfb_solr
 )
 
-@cache_result("solr_search", "solr_cache_enabled")
+@with_solr_cache("solr_search")
 def cached_solr_search(query: str):
     """Cached version of SOLR search."""
     return vfb_solr.search(query)
 
-@cache_result("term_info_parse", "term_info_cache_enabled")
-def cached_term_info_parse_object(results, short_form: str):
-    """Cached version of term_info_parse_object."""
-    return _original_term_info_parse_object(results, short_form)
-
-@cache_result("query_results", "query_result_cache_enabled")
-def cached_fill_query_results(term_info: Dict[str, Any]):
-    """Cached version of fill_query_results."""
-    return _original_fill_query_results(term_info)
-
-@cache_result("get_instances", "query_result_cache_enabled")
-def cached_get_instances(short_form: str, return_dataframe=True, limit: int = -1):
-    """Cached version of get_instances."""
-    return _original_get_instances(short_form, return_dataframe, limit)
-
+@with_solr_cache("term_info")
 def get_term_info_cached(short_form: str, preview: bool = False):
     """
-    Enhanced get_term_info with multi-layer caching.
+    Enhanced get_term_info with SOLR caching.
     
-    This version uses caching at multiple levels:
-    1. Final result caching (entire term_info response)
-    2. SOLR query result caching 
-    3. Term info parsing caching
-    4. Query result caching
+    This version caches complete term_info responses in SOLR for fast retrieval.
     
     Args:
         short_form: Term short form (e.g., 'FBbt_00003748')
@@ -87,104 +67,14 @@ def get_term_info_cached(short_form: str, preview: bool = False):
     Returns:
         Term info dictionary or None if not found
     """
-    cache = get_cache()
-    
-    # Check for complete result in cache first
-    cache_key = cache._generate_cache_key("term_info_complete", short_form, preview)
-    cached_result = cache.get(cache_key)
-    print(f"DEBUG: Cache lookup for {short_form}: {'HIT' if cached_result is not None else 'MISS'}")
-    if cached_result is not None:
-        # Validate that cached result has essential fields
-        if not is_valid_term_info_result(cached_result):
-            print(f"DEBUG: Cached result incomplete for {short_form}, falling back to original function")
-            print(f"DEBUG: cached_result keys: {list(cached_result.keys()) if cached_result else 'None'}")
-            print(f"DEBUG: cached_result Id: {cached_result.get('Id', 'MISSING') if cached_result else 'None'}")
-            print(f"DEBUG: cached_result Name: {cached_result.get('Name', 'MISSING') if cached_result else 'None'}")
-            
-            # Fall back to original function and cache the complete result
-            fallback_result = _original_get_term_info(short_form, preview)
-            if is_valid_term_info_result(fallback_result):
-                print(f"DEBUG: Fallback successful, caching complete result for {short_form}")
-                cache.set(cache_key, fallback_result)
-            return fallback_result
-        else:
-            print(f"DEBUG: Using valid cached result for {short_form}")
-            return cached_result
-    
-    parsed_object = None
-    try:
-        # Use cached SOLR search
-        results = cached_solr_search('id:' + short_form)
-        
-        # Use cached term info parsing
-        parsed_object = cached_term_info_parse_object(results, short_form)
-        
-        if parsed_object:
-            # Use cached query result filling (skip if queries would fail)
-            if parsed_object.get('Queries') and len(parsed_object['Queries']) > 0:
-                try:
-                    term_info = cached_fill_query_results(parsed_object)
-                    if term_info:
-                        # Validate result before caching
-                        if term_info.get('Id') and term_info.get('Name'):
-                            # Cache the complete result
-                            cache.set(cache_key, term_info)
-                            return term_info
-                        else:
-                            print(f"Query result for {short_form} is incomplete, falling back to original function...")
-                            return _original_get_term_info(short_form, preview)
-                    else:
-                        print("Failed to fill query preview results!")
-                        # Validate result before caching
-                        if parsed_object.get('Id') and parsed_object.get('Name'):
-                            # Cache the complete result
-                            cache.set(cache_key, parsed_object)
-                            return parsed_object
-                        else:
-                            print(f"Parsed object for {short_form} is incomplete, falling back to original function...")
-                            return _original_get_term_info(short_form, preview)
-                except Exception as e:
-                    print(f"Error filling query results (continuing without query data): {e}")
-                    # Validate result before caching
-                    if is_valid_term_info_result(parsed_object):
-                        cache.set(cache_key, parsed_object)
-                        return parsed_object
-                    else:
-                        print(f"DEBUG: Exception case - parsed object incomplete for {short_form}, falling back to original function")
-                        fallback_result = _original_get_term_info(short_form, preview)
-                        if is_valid_term_info_result(fallback_result):
-                            cache.set(cache_key, fallback_result)
-                        return fallback_result
-            else:
-                # No queries to fill, validate result before caching
-                if parsed_object.get('Id') and parsed_object.get('Name'):
-                    # Cache and return parsed object directly
-                    cache.set(cache_key, parsed_object)
-                    return parsed_object
-                else:
-                    print(f"DEBUG: No queries case - parsed object incomplete for {short_form}, falling back to original function...")
-                    fallback_result = _original_get_term_info(short_form, preview)
-                    if is_valid_term_info_result(fallback_result):
-                        cache.set(cache_key, fallback_result)
-                    return fallback_result
-        else:
-            print(f"No valid term info found for ID '{short_form}'")
-            return None
-            
-    except Exception as e:
-        print(f"Error in cached get_term_info: {type(e).__name__}: {e}")
-        # Fall back to original function if caching fails
-        return _original_get_term_info(short_form, preview)
+    return _original_get_term_info(short_form, preview)
 
+@with_solr_cache("instances")
 def get_instances_cached(short_form: str, return_dataframe=True, limit: int = -1):
     """
-    Enhanced get_instances with caching.
+    Enhanced get_instances with SOLR caching.
     
-    This cached version can provide dramatic speedup for repeated queries,
-    especially useful for:
-    - UI applications with repeated browsing
-    - Data analysis workflows
-    - Testing and development
+    This cached version provides a dramatic speedup for repeated queries.
     
     Args:
         short_form: Class short form
@@ -194,7 +84,7 @@ def get_instances_cached(short_form: str, return_dataframe=True, limit: int = -1
     Returns:
         Instances data (DataFrame or formatted dict based on return_dataframe)
     """
-    return cached_get_instances(short_form, return_dataframe, limit)
+    return _original_get_instances(short_form, return_dataframe, limit)
 
 # Convenience function to replace original functions
 def patch_vfbquery_with_caching():