mlsysops-eu
diff --git a/‎checkpoints/trained_agent.pth‎
43.6 KB b/‎checkpoints/trained_agent.pth‎
43.6 KB
diff --git a/‎data/demo/cs_latency_dict.pkl‎
864 Bytes b/‎data/demo/cs_latency_dict.pkl‎
864 Bytes
diff --git a/‎data/demo/green_trace.pkl‎
1.9 KB b/‎data/demo/green_trace.pkl‎
1.9 KB
diff --git a/‎data/demo/invocations.pkl‎
8.46 MB b/‎data/demo/invocations.pkl‎
8.46 MB
diff --git a/‎data/demo/network_latency_map.pkl‎
92 Bytes b/‎data/demo/network_latency_map.pkl‎
92 Bytes
diff --git a/‎lace_rl/__init__.py‎ b/‎lace_rl/__init__.py‎
diff --git a/‎lace_rl/agent/__init__.py‎ b/‎lace_rl/agent/__init__.py‎
diff --git a/‎lace_rl/agent/dqn_agent.py‎
Lines changed: 100 additions & 0 deletions b/‎lace_rl/agent/dqn_agent.py‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎lace_rl/sim/__init__.py‎ b/‎lace_rl/sim/__init__.py‎
diff --git a/‎lace_rl/sim/trace_simulator.py‎
Lines changed: 284 additions & 0 deletions b/‎lace_rl/sim/trace_simulator.py‎
Lines changed: 284 additions & 0 deletions
@@ -0,0 +1,100 @@
+# DQN implementation with two-region support and updated ServerlessEnv integration
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import numpy as np
+import random
+from collections import deque
+
+class DQN(nn.Module):
+    """Fully connected Q-network: state_dim -> 128 -> 64 -> action_dim, ReLU between layers."""
+
+    def __init__(self, state_dim, action_dim):
+        super().__init__()
+        widths = (state_dim, 128, 64)
+        layers = []
+        for fan_in, fan_out in zip(widths, widths[1:]):
+            layers.append(nn.Linear(fan_in, fan_out))
+            layers.append(nn.ReLU())
+        layers.append(nn.Linear(widths[-1], action_dim))
+        # The attribute name and layer positions (0, 2, 4) are kept so that
+        # state_dict keys remain "net.<index>.weight" / "net.<index>.bias".
+        self.net = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Return one Q-value per action for each row of ``x``."""
+        q_values = self.net(x)
+        return q_values
+
+class DQNAgent:
+    """Epsilon-greedy DQN agent with a replay buffer and a periodically synced target network.
+
+    Args:
+        state_dim: Length of the observation vector fed to the Q-network.
+        action_dim: Number of discrete actions.
+        lr: Adam learning rate.
+        gamma: Discount factor used in the bootstrap target.
+        epsilon: Initial exploration probability.
+        epsilon_decay: Multiplicative factor applied by ``decay_epsilon``.
+        epsilon_min: Floor that ``decay_epsilon`` never goes below.
+        buffer_size: Maximum number of transitions kept in the replay buffer.
+        batch_size: Number of transitions sampled per ``train`` call.
+        target_update_freq: Copy the online weights into the target network
+            every this many ``train`` steps (must be >= 1).
+
+    Raises:
+        ValueError: If ``target_update_freq`` is smaller than 1.
+    """
+
+    def __init__(
+            self,
+            state_dim,
+            action_dim,
+            lr=1e-3,
+            gamma=0.99,
+            epsilon=1.0,
+            epsilon_decay=0.995,
+            epsilon_min=0.05,
+            buffer_size=10000,
+            batch_size=64,
+            target_update_freq=10
+            ):
+        if target_update_freq < 1:
+            raise ValueError("target_update_freq must be >= 1")
+
+        self.state_dim = state_dim
+        self.action_dim = action_dim
+        self.gamma = gamma
+        self.epsilon = epsilon
+        self.epsilon_decay = epsilon_decay
+        self.epsilon_min = epsilon_min
+        self.batch_size = batch_size
+        self.target_update_freq = target_update_freq
+
+        self.q_network = DQN(state_dim, action_dim)
+        self.target_network = DQN(state_dim, action_dim)
+        # Start both networks from identical weights.
+        self.target_network.load_state_dict(self.q_network.state_dict())
+        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
+        # Bounded FIFO: the oldest transitions are dropped once the buffer is full.
+        self.memory = deque(maxlen=buffer_size)
+        self.update_counter = 0
+
+    def select_action(self, state):
+        """Return an action index: random with probability ``epsilon``, else greedy."""
+        if random.random() < self.epsilon:
+            return random.randint(0, self.action_dim - 1)
+        # Add a batch dimension of 1 before the forward pass.
+        state = torch.tensor(np.array(state), dtype=torch.float32).unsqueeze(0)
+        with torch.no_grad():
+            q_values = self.q_network(state)
+        return int(torch.argmax(q_values).item())
+
+    def store_transition(self, state, action, reward, next_state, done):
+        """Append one ``(state, action, reward, next_state, done)`` tuple to the replay buffer."""
+        self.memory.append((state, action, reward, next_state, done))
+
+    def train(self):
+        """Run one optimisation step on a random minibatch.
+
+        Returns:
+            The MSE loss of the step as a float, or ``None`` when the buffer
+            holds fewer than ``batch_size`` transitions and no step was taken.
+        """
+        if len(self.memory) < self.batch_size:
+            return None
+
+        batch = random.sample(self.memory, self.batch_size)
+        states, actions, rewards, next_states, dones = zip(*batch)
+
+        states = torch.tensor(np.array(states), dtype=torch.float32)
+        # Column vectors so they line up with gather() / the (batch, 1) targets.
+        actions = torch.tensor(np.array(actions), dtype=torch.int64).unsqueeze(1)
+        rewards = torch.tensor(np.array(rewards), dtype=torch.float32).unsqueeze(1)
+        next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
+        dones = torch.tensor(np.array(dones), dtype=torch.float32).unsqueeze(1)
+
+        # Q(s, a) of the actions that were actually taken.
+        q_values = self.q_network(states).gather(1, actions)
+        with torch.no_grad():
+            max_next_q = self.target_network(next_states).max(1)[0].unsqueeze(1)
+            # (1 - done) removes the bootstrap term for terminal transitions.
+            target_q = rewards + (1 - dones) * self.gamma * max_next_q
+
+        loss = F.mse_loss(q_values, target_q)
+        self.optimizer.zero_grad()
+        loss.backward()
+        self.optimizer.step()
+
+        # Hard target-network update every ``target_update_freq`` steps.
+        self.update_counter += 1
+        if self.update_counter % self.target_update_freq == 0:
+            self.target_network.load_state_dict(self.q_network.state_dict())
+
+        return float(loss.item())
+
+    def decay_epsilon(self):
+        """Multiply ``epsilon`` by ``epsilon_decay`` without dropping below ``epsilon_min``.
+
+        An epsilon that is already at or below the floor (for example set to 0
+        for greedy evaluation) is left untouched.
+        """
+        if self.epsilon > self.epsilon_min:
+            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
+
+    def save(self, path):
+        """Write the online network's ``state_dict`` to ``path``."""
+        torch.save(self.q_network.state_dict(), path)
+
+    def load(self, path):
+        """Load online-network weights from ``path`` and mirror them into the target network."""
+        # map_location="cpu" lets a checkpoint saved from a GPU be loaded on a
+        # CPU-only host; load_state_dict then copies into the existing parameters.
+        state_dict = torch.load(path, map_location="cpu")
+        self.q_network.load_state_dict(state_dict)
+        self.target_network.load_state_dict(self.q_network.state_dict())
@@ -0,0 +1,284 @@
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+
+@dataclass
+class FunctionInvocation:
+    """One function invocation record replayed by TraceDrivenSimulator."""
+    # Invocation time. The simulator floors it to a multiple of 3600 to look up
+    # the green-energy trace (presumably seconds since some epoch - confirm).
+    timestamp: int
+    # Key under which the simulator tracks the last-seen container state.
+    pod_id: str
+    # Region used to select the green-energy trace; `run` also uses it as the
+    # destination region for the network-latency lookup.
+    region_id: str
+    # Execution time; contributes to both latency and execution energy.
+    exec_time_s: float
+    # Resources passed to EnergyEstimator for execution and keep-alive energy.
+    cpu_cores: float
+    mem_MB: float
+    # Extra latency charged only when the invocation is classified as a cold start.
+    cold_start_latency_s: float = 0.0
+    # Source region for the network-latency lookup.
+    user_region: str = 'local'
+
+    # Not read by any code visible in this file.
+    # NOTE(review): annotated as dict but defaults to None.
+    metadata: dict = None
+
+@dataclass
+class GreenEnergy:
+    """Carbon-intensity sample for one region at one point in time."""
+    # The simulator looks samples up by an invocation timestamp floored to a
+    # multiple of 3600, so traces are presumably keyed hourly - confirm.
+    timestamp: int
+    region_id: str
+    # green_ratio: float
+    # Multiplied by energy in kWh to obtain emissions; results are reported
+    # under keys ending in "_g", so presumably gCO2 per kWh - confirm.
+    carbon_intensity: float
+
+class EnergyEstimator:
+    """Linear power model (DRAM term + CPU term) with a Joule-to-carbon conversion."""
+
+    def __init__(self):
+        # Simulate m5-series EC2 instance
+        self.N_CPU = 32
+        self.TOTAL_DRAM_MB = 16384
+        self.LAMBDA_IDLE = 0.2
+
+        self.J_DRAM_per_MB = 0.00037
+        self.J_CPU_per_mCore = 0.3
+        # self.J_CPU_per_mCore = 0.188
+
+    def estimate_keep_alive_energy(self, mem_MB, cpu_cores, duration_s, carbon_intensity):
+        """Return ``(energy, carbon)`` for keeping a container idle for ``duration_s``.
+
+        The CPU term is scaled by ``LAMBDA_IDLE`` relative to the execution model.
+        """
+        cpu_share = cpu_cores / self.N_CPU
+        dram_term = self.J_DRAM_per_MB * mem_MB
+        cpu_term = self.J_CPU_per_mCore * self.LAMBDA_IDLE * cpu_share
+        joules = (dram_term + cpu_term) * duration_s
+        return joules, self.estimate_carbon_emission(joules, carbon_intensity)
+
+    def estimate_exec_energy(self, exec_time_s, cpu_cores, mem_MB):
+        """Return the energy of running for ``exec_time_s`` with the given resources."""
+        cpu_share = cpu_cores / self.N_CPU
+        dram_term = self.J_DRAM_per_MB * mem_MB
+        cpu_term = self.J_CPU_per_mCore * cpu_share
+        return (dram_term + cpu_term) * exec_time_s
+
+    def estimate_carbon_emission(self, energy_joule, carbon_intensity):
+        """Convert Joules to kWh (1 kWh = 3.6e6 J) and multiply by ``carbon_intensity``."""
+        joules_per_kwh = 3.6 * 1e6
+        return energy_joule / joules_per_kwh * carbon_intensity
+
+class TraceDrivenSimulator:
+    """Replay a list of invocations and accumulate latency, energy and carbon metrics.
+
+    Three entry points share the same per-invocation accounting:
+
+    * ``run`` applies ``fixed_keep_alive_s`` to every invocation.
+    * ``run_with_agent`` asks ``agent`` for a keep-alive value per invocation.
+    * ``run_with_agent_with_action_log`` does the same and also returns the
+      list of chosen keep-alive values.
+    """
+
+    def __init__(self, green_trace: Dict[str, Dict[int, GreenEnergy]], network_latency_map=None, fixed_keep_alive_s=60.0):
+        """
+        Args:
+            green_trace: ``region_id -> {timestamp floored to 3600 -> GreenEnergy}``.
+            network_latency_map: Optional ``(user_region, dest_region) -> latency``.
+                Missing pairs count as 0.0. The value is doubled when charged,
+                so it is presumably a one-way latency - confirm.
+            fixed_keep_alive_s: Keep-alive window used by ``run``.
+        """
+        self.green_trace = green_trace
+        self.fixed_keep_alive_s = fixed_keep_alive_s
+        self.network_latency_map = network_latency_map or {}
+        self.energy_estimator = EnergyEstimator()
+        self.reset_metrics()
+
+    def reset_metrics(self):
+        """Clear all counters and the per-pod container state."""
+        # pod_id -> (last invocation timestamp, mem_MB, cpu_cores)
+        self.container_state: Dict[str, Tuple[int, float, float]] = {}
+        self.total_energy = 0.0
+        self.total_carbon = 0.0
+        self.keep_alive_energy = 0.0
+        self.execution_energy = 0.0
+        self.keep_alive_carbon = 0.0
+        self.execution_carbon = 0.0
+        self.cold_start_count = 0
+        self.total_latency = 0.0
+        self.total_network_latency = 0.0
+        self.total_invocations = 0
+        self.total_idle_time: Dict[str, float] = {}
+        self.total_idle_energy: Dict[str, float] = {}
+
+    def _lookup_green_energy(self, inv):
+        """Return the GreenEnergy entry for the invocation's region and hour, or None."""
+        hour_start = int(inv.timestamp // 3600 * 3600)
+        return self.green_trace.get(inv.region_id, {}).get(hour_start)
+
+    def _account_invocation(self, inv, green_ene, keep_alive_s, dest_region, exec_latency_s):
+        """Update every metric for one invocation and record the pod's new state.
+
+        Args:
+            inv: The invocation being replayed.
+            green_ene: Carbon-intensity sample that applies to ``inv``.
+            keep_alive_s: Keep-alive window used for the cold-start test and
+                as the cap on billed idle time.
+            dest_region: Destination side of the network-latency lookup.
+            exec_latency_s: Execution-time contribution to the latency total.
+        """
+        self.total_invocations += 1
+
+        previous = self.container_state.get(inv.pod_id)
+        # Cold when the pod was never seen, or its last use is older than the window.
+        is_cold = previous is None or inv.timestamp > previous[0] + keep_alive_s
+
+        net_latency = 2 * self.network_latency_map.get((inv.user_region, dest_region), 0.0)
+        if is_cold:
+            self.cold_start_count += 1
+            self.total_latency += exec_latency_s + inv.cold_start_latency_s + net_latency
+        else:
+            self.total_latency += exec_latency_s + net_latency
+        self.total_network_latency += net_latency
+
+        e_energy = self.energy_estimator.estimate_exec_energy(inv.exec_time_s, inv.cpu_cores, inv.mem_MB)
+        e_carbon = self.energy_estimator.estimate_carbon_emission(e_energy, green_ene.carbon_intensity)
+        self.execution_energy += e_energy
+        self.execution_carbon += e_carbon
+
+        if previous is not None:
+            last_ts, mem_MB, cpu_cores = previous
+            idle_time = inv.timestamp - last_ts
+            if idle_time > 0:
+                # Idle time is billed with the previous invocation's resources,
+                # capped at the keep-alive window.
+                recorded_idle_time = min(idle_time, keep_alive_s)
+                k_energy, k_carbon = self.energy_estimator.estimate_keep_alive_energy(
+                    mem_MB, cpu_cores, recorded_idle_time, green_ene.carbon_intensity
+                )
+                self.keep_alive_energy += k_energy
+                self.keep_alive_carbon += k_carbon
+                self.total_idle_time[inv.pod_id] = self.total_idle_time.get(inv.pod_id, 0.0) + recorded_idle_time
+                self.total_idle_energy[inv.pod_id] = self.total_idle_energy.get(inv.pod_id, 0.0) + k_energy
+
+        self.container_state[inv.pod_id] = (
+            inv.timestamp,
+            inv.mem_MB,
+            inv.cpu_cores
+        )
+
+    def _finalize(self):
+        """Compute the energy/carbon totals and return the metrics dictionary."""
+        self.total_energy = self.execution_energy + self.keep_alive_energy
+        self.total_carbon = self.execution_carbon + self.keep_alive_carbon
+
+        return {
+            'total_energy_J': self.total_energy,
+            'total_carbon_g': self.total_carbon,
+            'cold_starts': self.cold_start_count,
+            'avg_latency_s': self.total_latency / self.total_invocations if self.total_invocations else 0.0,
+            'total_network_latency_s': self.total_network_latency,
+            'invocation_count': self.total_invocations,
+            'keep_alive_energy_J': self.keep_alive_energy,
+            'execution_energy_J': self.execution_energy,
+            'keep_alive_carbon_g': self.keep_alive_carbon,
+            'execution_carbon_g': self.execution_carbon,
+            'idle_time_by_function': self.total_idle_time,
+            'idle_energy_by_function': self.total_idle_energy
+        }
+
+    def _run_with_policy(self, invocations, agent, obs_fn, action_log=None):
+        """Shared loop of the agent-driven runs; appends choices to ``action_log`` if given."""
+        self.reset_metrics()
+        for inv in invocations:
+            green_ene = self._lookup_green_energy(inv)
+            if green_ene is None:
+                continue
+
+            obs = obs_fn(inv)
+            action = agent.select_action(obs)
+            # dest_region, keep_alive_s = agent.action_lookup[action]
+            _, keep_alive_s = agent.action_lookup[action]
+            if action_log is not None:
+                action_log.append(keep_alive_s)
+
+            self._account_invocation(
+                inv,
+                green_ene,
+                keep_alive_s,
+                dest_region=inv.user_region,  # Hold for future cross-region usage
+                exec_latency_s=inv.exec_time_s,
+            )
+        return self._finalize()
+
+    def run(self, invocations: List[FunctionInvocation]):
+        """Replay ``invocations`` with the fixed keep-alive window and return the metrics dict.
+
+        Invocations without a matching green-trace entry are reported on stdout
+        and skipped.
+
+        NOTE(review): unlike the agent-driven runs, this method does not call
+        ``reset_metrics``, so repeated calls accumulate; it also charges
+        ``exec_time_s / 0.9`` as execution latency and uses ``inv.region_id``
+        as the destination region. Kept as in the original - confirm intent.
+        """
+        for inv in invocations:
+            green_ene = self._lookup_green_energy(inv)
+            if green_ene is None:
+                print("No energy data provided.")
+                continue
+
+            self._account_invocation(
+                inv,
+                green_ene,
+                self.fixed_keep_alive_s,
+                dest_region=inv.region_id,
+                exec_latency_s=inv.exec_time_s / 0.9,
+            )
+        return self._finalize()
+
+    def run_with_agent(self, invocations: List[FunctionInvocation], agent, obs_fn):
+        """Reset metrics, replay ``invocations`` with agent-chosen keep-alive values, return the metrics dict.
+
+        ``agent`` must provide ``select_action(obs)`` and an ``action_lookup``
+        indexable by the returned action that yields ``(_, keep_alive_s)`` pairs.
+        ``obs_fn(inv)`` builds the observation for one invocation.
+        """
+        return self._run_with_policy(invocations, agent, obs_fn)
+
+    def run_with_agent_with_action_log(self, invocations: List[FunctionInvocation], agent, obs_fn):
+        """Same as ``run_with_agent`` but returns ``(metrics, actions)``.
+
+        ``actions`` holds the keep-alive value chosen for every invocation that
+        was not skipped, in replay order.
+        """
+        actions = []
+        metrics = self._run_with_policy(invocations, agent, obs_fn, action_log=actions)
+        return metrics, actions