Skip to content

Commit 2d35de3

Browse files
authored
Merge pull request #215 from MorpheusAIs/dev
Refactor pricing and rate limits to use JSON config files - TEST
2 parents a6f91e7 + dad40e5 commit 2d35de3

12 files changed

Lines changed: 391 additions & 330 deletions

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ COPY --from=builder /usr/local/bin /usr/local/bin
5858
# Copy application code
5959
COPY ./src ./src
6060
COPY ./alembic ./alembic
61+
COPY ./models ./models
6162
COPY alembic.ini .
6263

6364
# Create logs directory and initial models.json before changing ownership

docker-compose.local.yml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ services:
5353
REDIS_URL: redis://redis-local:6379/0
5454
# Local database
5555
DATABASE_URL: postgresql+asyncpg://morpheus_local:local_dev_password@db-local:5432/morpheus_local_db
56+
57+
ENVIRONMENT: ${ENVIRONMENT:-test}
5658

5759
# Bypass Cognito for local testing (controlled by .env.local)
5860
BILLING_ADMIN_SECRET: ${BILLING_ADMIN_SECRET}
@@ -110,12 +112,8 @@ services:
110112
HOLD_RECONCILIATION_INTERVAL_SECONDS: ${HOLD_RECONCILIATION_INTERVAL_SECONDS:-600}
111113
HOLD_MAX_PENDING_SECONDS: ${HOLD_MAX_PENDING_SECONDS:-3600}
112114

113-
# Rate limiting settings
115+
# Rate limiting (on/off toggle; limits configured in models/{env}_rate_limit.json)
114116
RATE_LIMIT_ENABLED: ${RATE_LIMIT_ENABLED:-true}
115-
RATE_LIMIT_DEFAULT_RPM: ${RATE_LIMIT_DEFAULT_RPM:-60}
116-
RATE_LIMIT_DEFAULT_TPM: ${RATE_LIMIT_DEFAULT_TPM:-100000}
117-
RATE_LIMIT_WINDOW_SECONDS: ${RATE_LIMIT_WINDOW_SECONDS:-60}
118-
RATE_LIMIT_MODEL_GROUPS: ${RATE_LIMIT_MODEL_GROUPS:-}
119117

120118
# Web3/SIWE settings (ERC-4361 Sign-In with Ethereum)
121119
WEB3_PROVIDER_URL: ${WEB3_PROVIDER_URL:-} # Optional: enables EIP-1271 smart contract wallet verification
@@ -133,6 +131,7 @@ services:
133131
- ./alembic.ini:/app/alembic.ini
134132
- ./tests:/app/tests
135133
- ./scripts:/app/scripts
134+
- ./models:/app/models
136135

137136
# Use startup script that handles migrations and verification
138137
command: ./scripts/start_local_dev.sh

env.example

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -223,23 +223,9 @@ HOLD_MAX_PENDING_SECONDS=3600
223223
# =============================================================================
224224
# Enable/disable rate limiting globally
225225
RATE_LIMIT_ENABLED=true
226-
227-
# Default rate limits (applied if no model-specific limits match)
228-
# Requests per minute (RPM)
229-
RATE_LIMIT_DEFAULT_RPM=60
230-
# Tokens per minute (TPM) - input + output combined
231-
RATE_LIMIT_DEFAULT_TPM=100000
232-
233-
# Rate limit window in seconds (default: 60 for per-minute limits)
234-
RATE_LIMIT_WINDOW_SECONDS=60
235-
236-
# Model group rate limits (optional, JSON format)
237-
# Override default limits for specific model groups
238-
# Format: [{"name": "group_name", "rpm": 30, "tpm": 50000, "models": ["model1", "model2*"], "priority": 100}]
239-
# Higher priority groups are matched first
240-
# Example:
241-
# RATE_LIMIT_MODEL_GROUPS=[{"name":"premium","rpm":30,"tpm":50000,"models":["gpt-4*","claude-3-opus*"],"priority":100},{"name":"fast","rpm":120,"tpm":200000,"models":["llama-3.1-8b*"],"priority":25}]
242-
RATE_LIMIT_MODEL_GROUPS=
226+
# Rate limit defaults and model groups are configured in models/{env}_rate_limit.json
227+
# Model pricing is configured in models/{env}_model_price.json
228+
# The ENVIRONMENT variable determines which file set is used (see above): production/staging values load prod_*.json, any other value loads test_*.json
243229

244230
# =============================================================================
245231
# SIGNUP BONUS

models/prod_model_price.json

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"default_input_price_per_million": "0.50",
3+
"default_output_price_per_million": "2.00",
4+
"models": {
5+
"glm-5": { "input": "1.00", "output": "3.20" },
6+
"glm-4.7": { "input": "0.50", "output": "2.25" },
7+
"glm-4.7-thinking": { "input": "0.45", "output": "2.00" },
8+
"glm-4.7-flash": { "input": "0.10", "output": "0.50" },
9+
10+
"kimi-k2.5": { "input": "0.60", "output": "3.00" },
11+
"kimi-k2-thinking": { "input": "0.60", "output": "3.00" },
12+
13+
"qwen3-235b": { "input": "0.40", "output": "3.00" },
14+
"qwen-3-235b": { "input": "0.40", "output": "3.00" },
15+
"qwen3-coder-480b-a35b-instruct": { "input": "0.70", "output": "2.80" },
16+
"qwen3-coder-480b-a35b": { "input": "0.70", "output": "2.80" },
17+
"qwen3-next-80b": { "input": "0.15", "output": "1.50" },
18+
19+
"minimax-m2.5": { "input": "0.30", "output": "1.20" },
20+
21+
"gpt-oss-120b": { "input": "0.07", "output": "0.28" },
22+
23+
"hermes-3-llama-3.1-405b": { "input": "1.00", "output": "3.00" },
24+
"llama-3.3-70b": { "input": "0.70", "output": "2.50" },
25+
"llama-3-3-70b": { "input": "0.70", "output": "2.50" },
26+
"llama-3.2-3b": { "input": "0.10", "output": "0.50" },
27+
"llama-3-2-3b": { "input": "0.10", "output": "0.50" },
28+
29+
"mistral-31-24b": { "input": "0.50", "output": "2.00" },
30+
"mistral-small-24b": { "input": "0.50", "output": "2.00" },
31+
32+
"venice-uncensored": { "input": "0.20", "output": "0.90" },
33+
34+
"text-embedding-bge-m3": { "input": "0.10", "output": "0.50" }
35+
}
36+
}

models/prod_rate_limit.json

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
{
2+
"default_rpm": 60,
3+
"default_tpm": 100000,
4+
"window_seconds": 60,
5+
"model_groups": [
6+
{
7+
"name": "embedding",
8+
"rpm": 500,
9+
"tpm": 0,
10+
"models": ["text-embedding-bge-m3"],
11+
"priority": 10,
12+
"description": "Embedding models with high throughput (no token limit)"
13+
},
14+
{
15+
"name": "S",
16+
"rpm": 500,
17+
"tpm": 1000000,
18+
"models": [
19+
"llama-3.2-3b",
20+
"llama-3.2-3b:web",
21+
"qwen3-4b",
22+
"qwen3-4b:web"
23+
],
24+
"priority": 25,
25+
"description": "Small models with high throughput limits"
26+
},
27+
{
28+
"name": "M",
29+
"rpm": 50,
30+
"tpm": 750000,
31+
"models": [
32+
"llama-3.3-70b",
33+
"llama-3.3-70b:web",
34+
"mistral-31-24b",
35+
"mistral-31-24b:web",
36+
"qwen3-next-80b",
37+
"qwen3-next-80b:web",
38+
"venice-uncensored",
39+
"venice-uncensored:web"
40+
],
41+
"priority": 50,
42+
"description": "Medium models with moderate limits"
43+
},
44+
{
45+
"name": "L",
46+
"rpm": 20,
47+
"tpm": 500000,
48+
"models": [
49+
"glm-5",
50+
"glm-5:web",
51+
"glm-4.7",
52+
"glm-4.7:web",
53+
"glm-4.7-thinking",
54+
"glm-4.7-thinking:web",
55+
"glm-4.7-flash",
56+
"glm-4.7-flash:web",
57+
"kimi-k2.5",
58+
"kimi-k2.5:web",
59+
"kimi-k2-thinking",
60+
"kimi-k2-thinking:web",
61+
"qwen3-235b",
62+
"qwen3-235b:web",
63+
"qwen3-coder-480b-a35b-instruct",
64+
"qwen3-coder-480b-a35b-instruct:web",
65+
"minimax-m2.5",
66+
"minimax-m2.5:web",
67+
"gpt-oss-120b",
68+
"gpt-oss-120b:web",
69+
"hermes-3-llama-3.1-405b",
70+
"hermes-3-llama-3.1-405b:web"
71+
],
72+
"priority": 100,
73+
"description": "Large models with conservative limits"
74+
}
75+
]
76+
}

models/test_model_price.json

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"default_input_price_per_million": "0.50",
3+
"default_output_price_per_million": "2.00",
4+
"models": {
5+
"glm-5": { "input": "1.00", "output": "3.20" },
6+
"glm-4.7": { "input": "0.50", "output": "2.25" },
7+
"glm-4.7-thinking": { "input": "0.45", "output": "2.00" },
8+
"glm-4.7-flash": { "input": "0.10", "output": "0.50" },
9+
10+
"kimi-k2.5": { "input": "0.60", "output": "3.00" },
11+
"kimi-k2-thinking": { "input": "0.60", "output": "3.00" },
12+
13+
"qwen3-235b": { "input": "0.40", "output": "3.00" },
14+
"qwen-3-235b": { "input": "0.40", "output": "3.00" },
15+
"qwen3-coder-480b-a35b-instruct": { "input": "0.70", "output": "2.80" },
16+
"qwen3-coder-480b-a35b": { "input": "0.70", "output": "2.80" },
17+
"qwen3-next-80b": { "input": "0.15", "output": "1.50" },
18+
19+
"minimax-m2.5": { "input": "0.30", "output": "1.20" },
20+
21+
"gpt-oss-120b": { "input": "0.07", "output": "0.28" },
22+
23+
"hermes-3-llama-3.1-405b": { "input": "1.00", "output": "3.00" },
24+
"llama-3.3-70b": { "input": "0.70", "output": "2.50" },
25+
"llama-3-3-70b": { "input": "0.70", "output": "2.50" },
26+
"llama-3.2-3b": { "input": "0.10", "output": "0.50" },
27+
"llama-3-2-3b": { "input": "0.10", "output": "0.50" },
28+
29+
"mistral-31-24b": { "input": "0.50", "output": "2.00" },
30+
"mistral-small-24b": { "input": "0.50", "output": "2.00" },
31+
32+
"venice-uncensored": { "input": "0.20", "output": "0.90" },
33+
34+
"text-embedding-bge-m3": { "input": "0.10", "output": "0.50" }
35+
}
36+
}

models/test_rate_limit.json

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
{
2+
"default_rpm": 60,
3+
"default_tpm": 100000,
4+
"window_seconds": 60,
5+
"model_groups": [
6+
{
7+
"name": "embedding",
8+
"rpm": 500,
9+
"tpm": 0,
10+
"models": ["text-embedding-bge-m3"],
11+
"priority": 10,
12+
"description": "Embedding models with high throughput (no token limit)"
13+
},
14+
{
15+
"name": "S",
16+
"rpm": 500,
17+
"tpm": 1000000,
18+
"models": [
19+
"llama-3.2-3b",
20+
"llama-3.2-3b:web",
21+
"qwen3-4b",
22+
"qwen3-4b:web"
23+
],
24+
"priority": 25,
25+
"description": "Small models with high throughput limits"
26+
},
27+
{
28+
"name": "M",
29+
"rpm": 50,
30+
"tpm": 750000,
31+
"models": [
32+
"llama-3.3-70b",
33+
"llama-3.3-70b:web",
34+
"mistral-31-24b",
35+
"mistral-31-24b:web",
36+
"qwen3-next-80b",
37+
"qwen3-next-80b:web",
38+
"venice-uncensored",
39+
"venice-uncensored:web"
40+
],
41+
"priority": 50,
42+
"description": "Medium models with moderate limits"
43+
},
44+
{
45+
"name": "L",
46+
"rpm": 20,
47+
"tpm": 500000,
48+
"models": [
49+
"glm-5",
50+
"glm-5:web",
51+
"glm-4.7",
52+
"glm-4.7:web",
53+
"glm-4.7-thinking",
54+
"glm-4.7-thinking:web",
55+
"glm-4.7-flash",
56+
"glm-4.7-flash:web",
57+
"kimi-k2.5",
58+
"kimi-k2.5:web",
59+
"kimi-k2-thinking",
60+
"kimi-k2-thinking:web",
61+
"qwen3-235b",
62+
"qwen3-235b:web",
63+
"qwen3-coder-480b-a35b-instruct",
64+
"qwen3-coder-480b-a35b-instruct:web",
65+
"minimax-m2.5",
66+
"minimax-m2.5:web",
67+
"gpt-oss-120b",
68+
"gpt-oss-120b:web",
69+
"hermes-3-llama-3.1-405b",
70+
"hermes-3-llama-3.1-405b:web"
71+
],
72+
"priority": 100,
73+
"description": "Large models with conservative limits"
74+
}
75+
]
76+
}

src/core/config.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -265,19 +265,8 @@ def assemble_cors_origins(cls, v: Union[str, List[str]]) -> Union[List[str], str
265265
# Rate Limiting Settings
266266
# Enable/disable rate limiting globally
267267
RATE_LIMIT_ENABLED: bool = Field(default=os.getenv("RATE_LIMIT_ENABLED", "true").lower() == "true")
268-
269-
# Default rate limits (applied if no model-specific limits match)
270-
# Requests per minute (RPM)
271-
RATE_LIMIT_DEFAULT_RPM: int = Field(default=int(os.getenv("RATE_LIMIT_DEFAULT_RPM", "60")))
272-
# Tokens per minute (TPM) - input + output combined
273-
RATE_LIMIT_DEFAULT_TPM: int = Field(default=int(os.getenv("RATE_LIMIT_DEFAULT_TPM", "100000")))
274-
275-
# Rate limit window in seconds (default: 60 for per-minute limits)
276-
RATE_LIMIT_WINDOW_SECONDS: int = Field(default=int(os.getenv("RATE_LIMIT_WINDOW_SECONDS", "60")))
277-
278-
# Model group rate limits (JSON format)
279-
# Format: {"group_name": {"rpm": 30, "tpm": 50000, "models": ["model1", "model2"]}}
280-
RATE_LIMIT_MODEL_GROUPS: str = Field(default=os.getenv("RATE_LIMIT_MODEL_GROUPS", ""))
268+
# Rate limit defaults and model groups are loaded from models/{env}_rate_limit.json
269+
# Model pricing is loaded from models/{env}_model_price.json
281270

282271

283272

src/core/config_loader.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""
2+
Environment-aware JSON config loader for model pricing and rate limits.
3+
4+
Resolves the ENVIRONMENT variable to select the appropriate JSON files
5+
from the models/ directory at the project root.
6+
"""
7+
8+
import json
9+
import os
10+
from pathlib import Path
11+
from typing import Any, Dict
12+
13+
# Location of the per-environment JSON configs: the "models" directory at the
# project root, i.e. two levels above the directory containing this file.
_MODELS_DIR = Path(__file__).resolve().parents[2].joinpath("models")

# Accepted ENVIRONMENT spellings mapped to the file-name prefix they select.
_ENV_PREFIX_MAP: Dict[str, str] = {
    # Production-like environments; staging shares the production files.
    **dict.fromkeys(("production", "prod", "prd", "staging", "stg", "stage"), "prod"),
    # Development and test environments.
    **dict.fromkeys(("development", "dev", "test", "tst"), "test"),
}

# Parsed JSON documents keyed by file name, filled lazily on first load.
_cache: Dict[str, Any] = {}
29+
30+
31+
def _get_prefix() -> str:
    """Return the config file prefix selected by the ENVIRONMENT variable.

    The value is lower-cased and stripped of surrounding whitespace before
    the lookup. A missing variable is treated as "development"; any value
    that is not a known alias falls back to "test".
    """
    raw_value = os.getenv("ENVIRONMENT", "development")
    normalised = raw_value.lower().strip()
    if normalised in _ENV_PREFIX_MAP:
        return _ENV_PREFIX_MAP[normalised]
    return "test"
34+
35+
36+
def _load_json(filename: str) -> Dict[str, Any]:
    """Load a JSON file from the models directory, caching the parsed result.

    Args:
        filename: Name of the file inside ``_MODELS_DIR``.

    Returns:
        The parsed JSON document. The same object is stored in the module
        cache and returned on later calls, so callers should treat it as
        read-only.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if filename in _cache:
        return _cache[filename]

    filepath = _MODELS_DIR / filename
    # Decode explicitly as UTF-8 so the result does not depend on the
    # process locale (the platform default is used when encoding is omitted).
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    _cache[filename] = data
    return data
46+
47+
48+
def load_model_prices() -> Dict[str, Any]:
    """Return the model pricing config for the current environment.

    The file name is ``<prefix>_model_price.json``, where the prefix is
    derived from the ENVIRONMENT variable.
    """
    filename = f"{_get_prefix()}_model_price.json"
    return _load_json(filename)
52+
53+
54+
def load_rate_limits() -> Dict[str, Any]:
    """Return the rate limit config for the current environment.

    The file name is ``<prefix>_rate_limit.json``, where the prefix is
    derived from the ENVIRONMENT variable.
    """
    filename = f"{_get_prefix()}_rate_limit.json"
    return _load_json(filename)
58+
59+
60+
def clear_cache() -> None:
    """Drop every cached config so the next load re-reads its file.

    Intended for tests and hot-reload scenarios.
    """
    _cache.clear()

src/main.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -413,10 +413,11 @@ async def startup_event():
413413
if settings.RATE_LIMIT_ENABLED:
414414
try:
415415
await rate_limit_service.initialize()
416+
rules_info = rate_limit_service.get_rules_info()
416417
logger.info("Rate limiting service initialized",
417418
enabled=True,
418-
default_rpm=settings.RATE_LIMIT_DEFAULT_RPM,
419-
default_tpm=settings.RATE_LIMIT_DEFAULT_TPM,
419+
default_rpm=rules_info.get("default", {}).get("rpm"),
420+
default_tpm=rules_info.get("default", {}).get("tpm"),
420421
event_type="rate_limit_init_success")
421422
except Exception as e:
422423
logger.error("Failed to initialize rate limiting service",

0 commit comments

Comments
 (0)