Merge pull request #235 from MorpheusAIs/test

nomadicrogue · web-flow · commit 8726f291aaf7 · 2026-05-21T09:27:07.000-05:00
Test -> main
diff --git a/models/prod_model_price.json b/models/prod_model_price.json
@@ -2,38 +2,78 @@
   "default_input_price_per_million": "0.50",
   "default_output_price_per_million": "2.00",
   "models": {
+    "glm-5.1": { "input": "1.50", "output": "5.00" },
+    "glm-5.1:web": { "input": "1.50", "output": "5.00" },
+    "GLM-5.1": { "input": "1.50", "output": "5.00" },
+    "glm-5.1-non-thinking": { "input": "1.50", "output": "5.00" },
+    "glm-5.1-non-thinking:web": { "input": "1.50", "output": "5.00" },
+
     "glm-5": { "input": "1.00", "output": "3.20" },
+    "glm-5:web": { "input": "1.00", "output": "3.20" },
+    "GLM-5": { "input": "1.00", "output": "3.20" },
     "glm-4.7": { "input": "0.50", "output": "2.25" },
+    "glm-4.7:web": { "input": "0.50", "output": "2.25" },
     "glm-4.7-thinking": { "input": "0.45", "output": "2.00" },
+    "glm-4.7-thinking:web": { "input": "0.45", "output": "2.00" },
     "glm-4.7-flash": { "input": "0.10", "output": "0.50" },
+    "glm-4.7-flash:web": { "input": "0.10", "output": "0.50" },
 
+    "kimi-k2.6": { "input": "0.50", "output": "3.25" },
+    "kimi-k2.6:web": { "input": "0.50", "output": "3.25" },
     "kimi-k2.5": { "input": "0.60", "output": "3.00" },
+    "kimi-k2.5:web": { "input": "0.60", "output": "3.00" },
+    "Kimi-K2.5": { "input": "0.60", "output": "3.00" },
     "kimi-k2-thinking": { "input": "0.60", "output": "3.00" },
 
     "gemma-4-31b": { "input": "0.15", "output": "0.40" },
+    "Gemma-4-31b": { "input": "0.15", "output": "0.40" },
+    "Gemma-4-31b:web": { "input": "0.15", "output": "0.40" },
     "gemma-4-26b-a4b": { "input": "0.15", "output": "0.40" },
+    "Gemma-4-26b-a4b": { "input": "0.15", "output": "0.40" },
+    "Gemma-4-26b-a4b:web": { "input": "0.15", "output": "0.40" },
 
     "qwen3-235b": { "input": "0.40", "output": "3.00" },
+    "qwen3-235b:web": { "input": "0.40", "output": "3.00" },
     "qwen-3-235b": { "input": "0.40", "output": "3.00" },
     "qwen3-coder-480b-a35b-instruct": { "input": "0.70", "output": "2.80" },
+    "qwen3-coder-480b-a35b-instruct:web": { "input": "0.70", "output": "2.80" },
     "qwen3-coder-480b-a35b": { "input": "0.70", "output": "2.80" },
     "qwen3-next-80b": { "input": "0.15", "output": "1.50" },
+    "qwen3-next-80b:web": { "input": "0.15", "output": "1.50" },
     "qwen35-35b-a3b": { "input": "0.30", "output": "1.25" },
+    "qwen35-35b-a3b:web": { "input": "0.30", "output": "1.25" },
     "qwen35-9b": { "input": "0.05", "output": "0.15" },
+    "qwen35-9b:web": { "input": "0.05", "output": "0.15" },
 
     "arcee-trinity-large-thinking": { "input": "0.30", "output": "1.00" },
+    "Arcee-Trinity-Large-Thinking": { "input": "0.30", "output": "1.00" },
+    "Arcee-Trinity-Large-Thinking:web": { "input": "0.30", "output": "1.00" },
 
     "minimax-m2.5": { "input": "0.30", "output": "1.20" },
+    "MiniMax-M2.5": { "input": "0.30", "output": "1.20" },
+    "MiniMax-M2.5:web": { "input": "0.30", "output": "1.20" },
+    "MiniMax-M2.7": { "input": "0.35", "output": "1.50" },
+    "MiniMax-M2.7:web": { "input": "0.35", "output": "1.50" },
+
+    "deepseek-v4-pro": { "input": "1.60", "output": "3.50" },
+    "deepseek-v4-pro:web": { "input": "1.60", "output": "3.50" },
+    "deepseek-v4-flash": { "input": "0.15", "output": "0.30" },
+    "deepseek-v4-flash:web": { "input": "0.15", "output": "0.30" },
 
     "gpt-oss-120b": { "input": "0.07", "output": "0.28" },
+    "gpt-oss-120b:web": { "input": "0.07", "output": "0.28" },
 
     "hermes-3-llama-3.1-405b": { "input": "1.00", "output": "3.00" },
+    "hermes-3-llama-3.1-405b:web": { "input": "1.00", "output": "3.00" },
     "llama-3.3-70b": { "input": "0.70", "output": "2.50" },
+    "llama-3.3-70b:web": { "input": "0.70", "output": "2.50" },
     "llama-3-3-70b": { "input": "0.70", "output": "2.50" },
     "llama-3.2-3b": { "input": "0.10", "output": "0.50" },
+    "llama-3.2-3b:web": { "input": "0.10", "output": "0.50" },
     "llama-3-2-3b": { "input": "0.10", "output": "0.50" },
 
     "mistral-31-24b": { "input": "0.50", "output": "2.00" },
+    "mistral-31-24b:web": { "input": "0.50", "output": "2.00" },
     "mistral-small-24b": { "input": "0.50", "output": "2.00" },
 
     "venice-uncensored": { "input": "0.20", "output": "0.90" },
diff --git a/models/prod_rate_limit.json b/models/prod_rate_limit.json
@@ -38,7 +38,9 @@
         "qwen35-9b",
         "qwen35-9b:web",
         "venice-uncensored",
-        "venice-uncensored:web"
+        "venice-uncensored:web",
+        "deepseek-v4-flash",
+        "deepseek-v4-flash:web"
       ],
       "priority": 50,
       "description": "Medium models with moderate limits"
@@ -50,6 +52,12 @@
       "models": [
         "glm-5",
         "glm-5:web",
+        "GLM-5",
+        "glm-5.1",
+        "glm-5.1:web",
+        "GLM-5.1",
+        "glm-5.1-non-thinking",
+        "glm-5.1-non-thinking:web",
         "glm-4.7",
         "glm-4.7:web",
         "glm-4.7-thinking",
@@ -58,20 +66,35 @@
         "glm-4.7-flash:web",
         "kimi-k2.5",
         "kimi-k2.5:web",
+        "Kimi-K2.5",
+        "kimi-k2.6",
+        "kimi-k2.6:web",
         "kimi-k2-thinking",
         "kimi-k2-thinking:web",
         "gemma-4-31b",
         "gemma-4-31b:web",
+        "Gemma-4-31b",
+        "Gemma-4-31b:web",
         "gemma-4-26b-a4b",
         "gemma-4-26b-a4b:web",
+        "Gemma-4-26b-a4b",
+        "Gemma-4-26b-a4b:web",
         "arcee-trinity-large-thinking",
         "arcee-trinity-large-thinking:web",
+        "Arcee-Trinity-Large-Thinking",
+        "Arcee-Trinity-Large-Thinking:web",
         "qwen3-235b",
         "qwen3-235b:web",
         "qwen3-coder-480b-a35b-instruct",
         "qwen3-coder-480b-a35b-instruct:web",
         "minimax-m2.5",
         "minimax-m2.5:web",
+        "MiniMax-M2.5",
+        "MiniMax-M2.5:web",
+        "MiniMax-M2.7",
+        "MiniMax-M2.7:web",
+        "deepseek-v4-pro",
+        "deepseek-v4-pro:web",
         "gpt-oss-120b",
         "gpt-oss-120b:web",
         "hermes-3-llama-3.1-405b",
diff --git a/src/api/v1/billing/index.py b/src/api/v1/billing/index.py
@@ -10,7 +10,7 @@
 
 from ....db.database import get_db_session
 from ....db.models import User, LedgerEntryType
-from ....dependencies import get_current_user, get_api_key_user
+from ....dependencies import get_current_user, get_user_jwt_or_api_key
 from ....services.billing_service import billing_service
 from ....crud import credits as credits_crud
 from ....schemas.billing import (
@@ -40,10 +40,12 @@
 async def get_balance(
     request: Request,
     db: AsyncSession = Depends(get_db_session),
-    current_user: User = Depends(get_current_user),
+    current_user: User = Depends(get_user_jwt_or_api_key),
 ):
     """
     Get current credit balance for the authenticated user.
+
+    Authenticate with either a Cognito JWT or an ``sk-…`` API key.
     
     Returns:
     - paid: Paid bucket balance (posted, holds, available)
@@ -132,11 +134,13 @@ async def list_transactions(
     from_date: Optional[datetime] = Query(default=None, alias="from"),
     to_date: Optional[datetime] = Query(default=None, alias="to"),
     db: AsyncSession = Depends(get_db_session),
-    current_user: User = Depends(get_current_user),
+    current_user: User = Depends(get_user_jwt_or_api_key),
 ):
     """
     Get paginated list of credit transactions (ledger entries).
-    
+
+    Authenticate with either a Cognito JWT or an ``sk-…`` API key.
+
     Parameters:
     - limit: Maximum number of items to return (1-∞)
     - offset: Number of items to skip
@@ -226,11 +230,13 @@ async def get_monthly_spending(
     year: int = Query(default=None, description="Year for spending data (defaults to current year)"),
     mode: SpendingModeEnum = Query(default=SpendingModeEnum.gross),
     db: AsyncSession = Depends(get_db_session),
-    current_user: User = Depends(get_current_user),
+    current_user: User = Depends(get_user_jwt_or_api_key),
 ):
     """
     Get monthly spending metrics for a year.
-    
+
+    Authenticate with either a Cognito JWT or an ``sk-…`` API key.
+
     Parameters:
     - year: Year to get spending for (defaults to current year)
     - mode: 
@@ -300,11 +306,13 @@ async def list_usage(
     to_date: Optional[datetime] = Query(default=None, alias="to"),
     model: Optional[str] = Query(default=None),
     db: AsyncSession = Depends(get_db_session),
-    current_user: User = Depends(get_current_user),
+    current_user: User = Depends(get_user_jwt_or_api_key),
 ):
     """
     Get paginated list of usage entries (posted usage charges only).
-    
+
+    Authenticate with either a Cognito JWT or an ``sk-…`` API key.
+
     Parameters:
     - limit: Maximum number of items to return (1-∞)
     - offset: Number of items to skip
@@ -374,11 +382,13 @@ async def list_usage_for_month(
     limit: int = Query(default=50, ge=1),
     offset: int = Query(default=0, ge=0),
     db: AsyncSession = Depends(get_db_session),
-    current_user: User = Depends(get_current_user),
+    current_user: User = Depends(get_user_jwt_or_api_key),
 ):
     """
     Get paginated list of usage entries for a specific month.
-    
+
+    Authenticate with either a Cognito JWT or an ``sk-…`` API key.
+
     Parameters:
     - year: Year
     - month: Month (1-12)
diff --git a/src/dependencies.py b/src/dependencies.py
@@ -553,11 +553,62 @@ async def get_current_api_key(
     return auth.api_key
 
 
+# ---------------------------------------------------------------------------
+# Union auth: accept either Cognito JWT or sk-… API key
+# ---------------------------------------------------------------------------
+
+async def get_user_jwt_or_api_key(
+    db: AsyncSession = Depends(get_db_session),
+    token: Optional[HTTPAuthorizationCredentials] = Depends(oauth2_scheme_optional),
+    api_key_str: Optional[str] = Security(api_key_header),
+) -> User:
+    """
+    Authenticate via either a Cognito JWT or an ``sk-…`` API key and return
+    the associated :class:`User`.
+
+    Both schemes are carried in the ``Authorization`` header; the scheme is
+    selected purely by the credential prefix:
+
+    * value starts with ``sk-`` → treated as API key, delegated to
+      :func:`get_api_key_auth`
+    * anything else → treated as a Cognito JWT, delegated to
+      :func:`get_current_user`
+
+    Intended for read-only endpoints (e.g. billing GETs) that should be
+    reachable from both the dashboard (JWT) and programmatic clients (key).
+    """
+    # Local testing bypass mirrors the underlying dependencies.
+    from src.core.local_testing import is_local_testing_mode, get_or_create_test_user
+    if is_local_testing_mode():
+        return await get_or_create_test_user(db)
+
+    # Both HTTPBearer and APIKeyHeader read the same Authorization header.
+    # Prefer the HTTPBearer-parsed credential (already stripped of "Bearer ");
+    # fall back to the raw header for clients that omit the scheme prefix.
+    raw = token.credentials if token else (api_key_str or "")
+    if raw.startswith("Bearer "):
+        raw = raw[7:]
+
+    if not raw:
+        raise HTTPException(
+            status_code=status.HTTP_401_UNAUTHORIZED,
+            detail="Not authenticated",
+            headers={"WWW-Authenticate": "Bearer"},
+        )
+
+    if raw.startswith("sk-"):
+        auth = await get_api_key_auth(api_key_str=raw)
+        return auth.user
+
+    return await get_current_user(db=db, token=token)
+
+
 # ---------------------------------------------------------------------------
 # Type aliases for commonly used dependency chains
 # ---------------------------------------------------------------------------
 CurrentUser = Annotated[User, Depends(get_current_user)]
 APIKeyUser = Annotated[User, Depends(get_api_key_user)]
 CurrentAPIKey = Annotated[APIKey, Depends(get_current_api_key)]
 APIKeyAuthentication = Annotated[APIKeyAuth, Depends(get_api_key_auth)]
+JwtOrApiKeyUser = Annotated[User, Depends(get_user_jwt_or_api_key)]
 DBSession = Annotated[AsyncSession, Depends(get_db_session)]
diff --git a/src/services/proxy_router_service.py b/src/services/proxy_router_service.py
@@ -103,6 +103,20 @@ def get_http_status_code(self) -> int:
         return error_type_mapping.get(self.error_type, 500)
 
 
+# Error substrings from the proxy router that indicate permanent failures
+# where retrying the same request will never succeed.
+NON_RETRIABLE_ERROR_PATTERNS = [
+    "llm tee verification failed",
+    "p-node tee attestation failed"
+]
+
+
+def _is_non_retriable_error(response_text: str) -> bool:
+    """Return True if the response body contains a known non-retriable error."""
+    text_lower = response_text.lower()
+    return any(pattern in text_lower for pattern in NON_RETRIABLE_ERROR_PATTERNS)
+
+
 
 
 async def _execute_request(
@@ -214,21 +228,35 @@ async def _execute_request(
                           error = response.text,
                           event_type="proxy_http_error")
             
+            error_type = "http_error"
+            if status_code >= 500:
+                error_type = "server_error"
+            elif status_code >= 400:
+                error_type = "client_error"
+            
+            non_retriable = _is_non_retriable_error(response.text)
+            if non_retriable:
+                req_logger.error(
+                    "Non-retriable error detected, skipping remaining retries",
+                    status_code=status_code,
+                    url=e.response.url,
+                    method=method,
+                    error=response.text,
+                    event_type="proxy_non_retriable_error")
+                raise ProxyRouterServiceError(
+                    sanitize_error_message(f"HTTP {status_code}: {response.text}"),
+                    status_code=status_code,
+                    error_type=error_type
+                )
+            
             if attempt == max_retries - 1:
-                # If this was the last attempt, raise with status code info
                 req_logger.error("Proxy router request failed after all retries",
                             max_retries=max_retries,
                             url=e.response.url,
                             method=method,
                             error=response.text,
                             status_code=status_code,
                             event_type="proxy_request_failed")
-                error_type = "http_error"
-                if status_code >= 500:
-                    error_type = "server_error"
-                elif status_code >= 400:
-                    error_type = "client_error"
-                
                 raise ProxyRouterServiceError(
                     sanitize_error_message(f"HTTP {status_code}: {response.text}"),
                     status_code=status_code,