Merge pull request #215 from MorpheusAIs/dev

nomadicrogue · web-flow · commit 2d35de3f916f · 2026-03-06T10:56:00.000-05:00
Refactor pricing and rate limits to use JSON config files - TEST
diff --git a/Dockerfile b/Dockerfile
@@ -58,6 +58,7 @@ COPY --from=builder /usr/local/bin /usr/local/bin
 # Copy application code
 COPY ./src ./src
 COPY ./alembic ./alembic
+COPY ./models ./models
 COPY alembic.ini .
 
 # Create logs directory and initial models.json before changing ownership
diff --git a/docker-compose.local.yml b/docker-compose.local.yml
@@ -53,6 +53,8 @@ services:
       REDIS_URL: redis://redis-local:6379/0
       # Local database
       DATABASE_URL: postgresql+asyncpg://morpheus_local:local_dev_password@db-local:5432/morpheus_local_db
+
+      ENVIRONMENT: ${ENVIRONMENT:-test}
       
       # Bypass Cognito for local testing (controlled by .env.local)
       BILLING_ADMIN_SECRET: ${BILLING_ADMIN_SECRET}
@@ -110,12 +112,8 @@ services:
       HOLD_RECONCILIATION_INTERVAL_SECONDS: ${HOLD_RECONCILIATION_INTERVAL_SECONDS:-600}
       HOLD_MAX_PENDING_SECONDS: ${HOLD_MAX_PENDING_SECONDS:-3600}
       
-      # Rate limiting settings
+      # Rate limiting (on/off toggle; limits configured in models/{prod|test}_rate_limit.json)
       RATE_LIMIT_ENABLED: ${RATE_LIMIT_ENABLED:-true}
-      RATE_LIMIT_DEFAULT_RPM: ${RATE_LIMIT_DEFAULT_RPM:-60}
-      RATE_LIMIT_DEFAULT_TPM: ${RATE_LIMIT_DEFAULT_TPM:-100000}
-      RATE_LIMIT_WINDOW_SECONDS: ${RATE_LIMIT_WINDOW_SECONDS:-60}
-      RATE_LIMIT_MODEL_GROUPS: ${RATE_LIMIT_MODEL_GROUPS:-}
       
       # Web3/SIWE settings (ERC-4361 Sign-In with Ethereum)
       WEB3_PROVIDER_URL: ${WEB3_PROVIDER_URL:-}  # Optional: enables EIP-1271 smart contract wallet verification
@@ -133,6 +131,7 @@ services:
       - ./alembic.ini:/app/alembic.ini
       - ./tests:/app/tests
       - ./scripts:/app/scripts
+      - ./models:/app/models
     
     # Use startup script that handles migrations and verification
     command: ./scripts/start_local_dev.sh
diff --git a/env.example b/env.example
@@ -223,23 +223,9 @@ HOLD_MAX_PENDING_SECONDS=3600
 # =============================================================================
 # Enable/disable rate limiting globally
 RATE_LIMIT_ENABLED=true
-
-# Default rate limits (applied if no model-specific limits match)
-# Requests per minute (RPM)
-RATE_LIMIT_DEFAULT_RPM=60
-# Tokens per minute (TPM) - input + output combined
-RATE_LIMIT_DEFAULT_TPM=100000
-
-# Rate limit window in seconds (default: 60 for per-minute limits)
-RATE_LIMIT_WINDOW_SECONDS=60
-
-# Model group rate limits (optional, JSON format)
-# Override default limits for specific model groups
-# Format: [{"name": "group_name", "rpm": 30, "tpm": 50000, "models": ["model1", "model2*"], "priority": 100}]
-# Higher priority groups are matched first
-# Example:
-# RATE_LIMIT_MODEL_GROUPS=[{"name":"premium","rpm":30,"tpm":50000,"models":["gpt-4*","claude-3-opus*"],"priority":100},{"name":"fast","rpm":120,"tpm":200000,"models":["llama-3.1-8b*"],"priority":25}]
-RATE_LIMIT_MODEL_GROUPS=
+# Rate limit defaults and model groups are configured in models/{prod|test}_rate_limit.json
+# Model pricing is configured in models/{prod|test}_model_price.json
+# ENVIRONMENT (see above) selects the file set: production/staging use prod_*, all else test_*
 
 # =============================================================================
 # SIGNUP BONUS
diff --git a/models/prod_model_price.json b/models/prod_model_price.json
@@ -0,0 +1,36 @@
+{
+  "default_input_price_per_million": "0.50",
+  "default_output_price_per_million": "2.00",
+  "models": {
+    "glm-5": { "input": "1.00", "output": "3.20" },
+    "glm-4.7": { "input": "0.50", "output": "2.25" },
+    "glm-4.7-thinking": { "input": "0.45", "output": "2.00" },
+    "glm-4.7-flash": { "input": "0.10", "output": "0.50" },
+
+    "kimi-k2.5": { "input": "0.60", "output": "3.00" },
+    "kimi-k2-thinking": { "input": "0.60", "output": "3.00" },
+
+    "qwen3-235b": { "input": "0.40", "output": "3.00" },
+    "qwen-3-235b": { "input": "0.40", "output": "3.00" },
+    "qwen3-coder-480b-a35b-instruct": { "input": "0.70", "output": "2.80" },
+    "qwen3-coder-480b-a35b": { "input": "0.70", "output": "2.80" },
+    "qwen3-next-80b": { "input": "0.15", "output": "1.50" },
+
+    "minimax-m2.5": { "input": "0.30", "output": "1.20" },
+
+    "gpt-oss-120b": { "input": "0.07", "output": "0.28" },
+
+    "hermes-3-llama-3.1-405b": { "input": "1.00", "output": "3.00" },
+    "llama-3.3-70b": { "input": "0.70", "output": "2.50" },
+    "llama-3-3-70b": { "input": "0.70", "output": "2.50" },
+    "llama-3.2-3b": { "input": "0.10", "output": "0.50" },
+    "llama-3-2-3b": { "input": "0.10", "output": "0.50" },
+
+    "mistral-31-24b": { "input": "0.50", "output": "2.00" },
+    "mistral-small-24b": { "input": "0.50", "output": "2.00" },
+
+    "venice-uncensored": { "input": "0.20", "output": "0.90" },
+
+    "text-embedding-bge-m3": { "input": "0.10", "output": "0.50" }
+  }
+}
diff --git a/models/prod_rate_limit.json b/models/prod_rate_limit.json
@@ -0,0 +1,76 @@
+{
+  "default_rpm": 60,
+  "default_tpm": 100000,
+  "window_seconds": 60,
+  "model_groups": [
+    {
+      "name": "embedding",
+      "rpm": 500,
+      "tpm": 0,
+      "models": ["text-embedding-bge-m3"],
+      "priority": 10,
+      "description": "Embedding models with high throughput (no token limit)"
+    },
+    {
+      "name": "S",
+      "rpm": 500,
+      "tpm": 1000000,
+      "models": [
+        "llama-3.2-3b",
+        "llama-3.2-3b:web",
+        "qwen3-4b",
+        "qwen3-4b:web"
+      ],
+      "priority": 25,
+      "description": "Small models with high throughput limits"
+    },
+    {
+      "name": "M",
+      "rpm": 50,
+      "tpm": 750000,
+      "models": [
+        "llama-3.3-70b",
+        "llama-3.3-70b:web",
+        "mistral-31-24b",
+        "mistral-31-24b:web",
+        "qwen3-next-80b",
+        "qwen3-next-80b:web",
+        "venice-uncensored",
+        "venice-uncensored:web"
+      ],
+      "priority": 50,
+      "description": "Medium models with moderate limits"
+    },
+    {
+      "name": "L",
+      "rpm": 20,
+      "tpm": 500000,
+      "models": [
+        "glm-5",
+        "glm-5:web",
+        "glm-4.7",
+        "glm-4.7:web",
+        "glm-4.7-thinking",
+        "glm-4.7-thinking:web",
+        "glm-4.7-flash",
+        "glm-4.7-flash:web",
+        "kimi-k2.5",
+        "kimi-k2.5:web",
+        "kimi-k2-thinking",
+        "kimi-k2-thinking:web",
+        "qwen3-235b",
+        "qwen3-235b:web",
+        "qwen3-coder-480b-a35b-instruct",
+        "qwen3-coder-480b-a35b-instruct:web",
+        "minimax-m2.5",
+        "minimax-m2.5:web",
+        "gpt-oss-120b",
+        "gpt-oss-120b:web",
+        "hermes-3-llama-3.1-405b",
+        "hermes-3-llama-3.1-405b:web"
+      ],
+      "priority": 100,
+      "description": "Large models with conservative limits"
+    }
+  ]
+}
diff --git a/models/test_model_price.json b/models/test_model_price.json
@@ -0,0 +1,36 @@
+{
+  "default_input_price_per_million": "0.50",
+  "default_output_price_per_million": "2.00",
+  "models": {
+    "glm-5": { "input": "1.00", "output": "3.20" },
+    "glm-4.7": { "input": "0.50", "output": "2.25" },
+    "glm-4.7-thinking": { "input": "0.45", "output": "2.00" },
+    "glm-4.7-flash": { "input": "0.10", "output": "0.50" },
+
+    "kimi-k2.5": { "input": "0.60", "output": "3.00" },
+    "kimi-k2-thinking": { "input": "0.60", "output": "3.00" },
+
+    "qwen3-235b": { "input": "0.40", "output": "3.00" },
+    "qwen-3-235b": { "input": "0.40", "output": "3.00" },
+    "qwen3-coder-480b-a35b-instruct": { "input": "0.70", "output": "2.80" },
+    "qwen3-coder-480b-a35b": { "input": "0.70", "output": "2.80" },
+    "qwen3-next-80b": { "input": "0.15", "output": "1.50" },
+
+    "minimax-m2.5": { "input": "0.30", "output": "1.20" },
+
+    "gpt-oss-120b": { "input": "0.07", "output": "0.28" },
+
+    "hermes-3-llama-3.1-405b": { "input": "1.00", "output": "3.00" },
+    "llama-3.3-70b": { "input": "0.70", "output": "2.50" },
+    "llama-3-3-70b": { "input": "0.70", "output": "2.50" },
+    "llama-3.2-3b": { "input": "0.10", "output": "0.50" },
+    "llama-3-2-3b": { "input": "0.10", "output": "0.50" },
+
+    "mistral-31-24b": { "input": "0.50", "output": "2.00" },
+    "mistral-small-24b": { "input": "0.50", "output": "2.00" },
+
+    "venice-uncensored": { "input": "0.20", "output": "0.90" },
+
+    "text-embedding-bge-m3": { "input": "0.10", "output": "0.50" }
+  }
+}
diff --git a/models/test_rate_limit.json b/models/test_rate_limit.json
@@ -0,0 +1,76 @@
+{
+  "default_rpm": 60,
+  "default_tpm": 100000,
+  "window_seconds": 60,
+  "model_groups": [
+    {
+      "name": "embedding",
+      "rpm": 500,
+      "tpm": 0,
+      "models": ["text-embedding-bge-m3"],
+      "priority": 10,
+      "description": "Embedding models with high throughput (no token limit)"
+    },
+    {
+      "name": "S",
+      "rpm": 500,
+      "tpm": 1000000,
+      "models": [
+        "llama-3.2-3b",
+        "llama-3.2-3b:web",
+        "qwen3-4b",
+        "qwen3-4b:web"
+      ],
+      "priority": 25,
+      "description": "Small models with high throughput limits"
+    },
+    {
+      "name": "M",
+      "rpm": 50,
+      "tpm": 750000,
+      "models": [
+        "llama-3.3-70b",
+        "llama-3.3-70b:web",
+        "mistral-31-24b",
+        "mistral-31-24b:web",
+        "qwen3-next-80b",
+        "qwen3-next-80b:web",
+        "venice-uncensored",
+        "venice-uncensored:web"
+      ],
+      "priority": 50,
+      "description": "Medium models with moderate limits"
+    },
+    {
+      "name": "L",
+      "rpm": 20,
+      "tpm": 500000,
+      "models": [
+        "glm-5",
+        "glm-5:web",
+        "glm-4.7",
+        "glm-4.7:web",
+        "glm-4.7-thinking",
+        "glm-4.7-thinking:web",
+        "glm-4.7-flash",
+        "glm-4.7-flash:web",
+        "kimi-k2.5",
+        "kimi-k2.5:web",
+        "kimi-k2-thinking",
+        "kimi-k2-thinking:web",
+        "qwen3-235b",
+        "qwen3-235b:web",
+        "qwen3-coder-480b-a35b-instruct",
+        "qwen3-coder-480b-a35b-instruct:web",
+        "minimax-m2.5",
+        "minimax-m2.5:web",
+        "gpt-oss-120b",
+        "gpt-oss-120b:web",
+        "hermes-3-llama-3.1-405b",
+        "hermes-3-llama-3.1-405b:web"
+      ],
+      "priority": 100,
+      "description": "Large models with conservative limits"
+    }
+  ]
+}
diff --git a/src/core/config.py b/src/core/config.py
@@ -265,19 +265,8 @@ def assemble_cors_origins(cls, v: Union[str, List[str]]) -> Union[List[str], str
     # Rate Limiting Settings
     # Enable/disable rate limiting globally
     RATE_LIMIT_ENABLED: bool = Field(default=os.getenv("RATE_LIMIT_ENABLED", "true").lower() == "true")
-    
-    # Default rate limits (applied if no model-specific limits match)
-    # Requests per minute (RPM)
-    RATE_LIMIT_DEFAULT_RPM: int = Field(default=int(os.getenv("RATE_LIMIT_DEFAULT_RPM", "60")))
-    # Tokens per minute (TPM) - input + output combined
-    RATE_LIMIT_DEFAULT_TPM: int = Field(default=int(os.getenv("RATE_LIMIT_DEFAULT_TPM", "100000")))
-    
-    # Rate limit window in seconds (default: 60 for per-minute limits)
-    RATE_LIMIT_WINDOW_SECONDS: int = Field(default=int(os.getenv("RATE_LIMIT_WINDOW_SECONDS", "60")))
-    
-    # Model group rate limits (JSON format)
-    # Format: {"group_name": {"rpm": 30, "tpm": 50000, "models": ["model1", "model2"]}}
-    RATE_LIMIT_MODEL_GROUPS: str = Field(default=os.getenv("RATE_LIMIT_MODEL_GROUPS", ""))
+    # Rate limit defaults and model groups are loaded from models/{prod|test}_rate_limit.json
+    # Model pricing is loaded from models/{prod|test}_model_price.json (see core/config_loader.py)
     
 
 
diff --git a/src/core/config_loader.py b/src/core/config_loader.py
@@ -0,0 +1,85 @@
+"""
+Environment-aware JSON config loader for model pricing and rate limits.
+
+Resolves the ENVIRONMENT variable to select the appropriate JSON files
+from the models/ directory at the project root.
+"""
+
+import json
+import os
+from pathlib import Path
+from typing import Any, Dict
+
+# parents[2] of src/core/config_loader.py is the project root.
+_MODELS_DIR = Path(__file__).resolve().parents[2] / "models"
+
+# Accepted ENVIRONMENT values -> filename prefix. Staging aliases use the "prod" files.
+_ENV_PREFIX_MAP: Dict[str, str] = {
+    "production": "prod",
+    "prod": "prod",
+    "prd": "prod",
+    "staging": "prod",
+    "stg": "prod",
+    "stage": "prod",
+    "development": "test",
+    "dev": "test",
+    "test": "test",
+    "tst": "test",
+}
+
+# Parsed file contents keyed by filename; filled lazily by _load_json().
+_cache: Dict[str, Any] = {}
+
+
+def _get_prefix() -> str:
+    """Return the config filename prefix ("prod" or "test") for ENVIRONMENT.
+
+    ENVIRONMENT is read on every call, defaults to "development", and is
+    matched case-insensitively with surrounding whitespace ignored. Values
+    not listed in _ENV_PREFIX_MAP fall back to "test".
+    """
+    env = os.getenv("ENVIRONMENT", "development").lower().strip()
+    return _ENV_PREFIX_MAP.get(env, "test")
+
+
+def _load_json(filename: str) -> Dict[str, Any]:
+    """Load a JSON file from the models/ directory, caching the parsed result.
+
+    Args:
+        filename: File name relative to _MODELS_DIR.
+
+    Returns:
+        The parsed JSON document. The same object is returned on later calls
+        until clear_cache() is called, so callers should not mutate it.
+
+    Raises:
+        FileNotFoundError: If the file does not exist under _MODELS_DIR.
+        json.JSONDecodeError: If the file content is not valid JSON.
+    """
+    if filename in _cache:
+        return _cache[filename]
+
+    filepath = _MODELS_DIR / filename
+    # JSON is UTF-8 by specification; don't depend on the platform's default encoding.
+    with open(filepath, "r", encoding="utf-8") as f:
+        data = json.load(f)
+
+    _cache[filename] = data
+    return data
+
+
+def load_model_prices() -> Dict[str, Any]:
+    """Load the environment-appropriate model pricing config."""
+    prefix = _get_prefix()
+    return _load_json(f"{prefix}_model_price.json")
+
+
+def load_rate_limits() -> Dict[str, Any]:
+    """Load the environment-appropriate rate limit config."""
+    prefix = _get_prefix()
+    return _load_json(f"{prefix}_rate_limit.json")
+
+
+def clear_cache() -> None:
+    """Clear cached config data (useful for testing or hot-reload)."""
+    _cache.clear()
diff --git a/src/main.py b/src/main.py
@@ -413,10 +413,11 @@ async def startup_event():
     if settings.RATE_LIMIT_ENABLED:
         try:
             await rate_limit_service.initialize()
+            rules_info = rate_limit_service.get_rules_info()
             logger.info("Rate limiting service initialized",
                        enabled=True,
-                       default_rpm=settings.RATE_LIMIT_DEFAULT_RPM,
-                       default_tpm=settings.RATE_LIMIT_DEFAULT_TPM,
+                       default_rpm=rules_info.get("default", {}).get("rpm"),
+                       default_tpm=rules_info.get("default", {}).get("tpm"),
                        event_type="rate_limit_init_success")
         except Exception as e:
             logger.error("Failed to initialize rate limiting service",
diff --git a/src/services/pricing/hardcoded_provider.py b/src/services/pricing/hardcoded_provider.py
diff --git a/src/services/rate_limiting/rules_service.py b/src/services/rate_limiting/rules_service.py