77from sqlalchemy import select
88from sqlalchemy .ext .asyncio import AsyncSession
99
10+ from dstack ._internal .core .models .instances import Resources
1011from dstack ._internal .core .models .metrics import JobMetrics , Metric
1112from dstack ._internal .server .models import JobMetricsPoint , JobModel
13+ from dstack ._internal .server .services .jobs import get_job_provisioning_data , get_job_runtime_data
14+ from dstack ._internal .utils .common import get_or_error
1215from dstack ._internal .utils .logging import get_logger
1316
1417logger = get_logger (__name__ )
@@ -47,17 +50,34 @@ async def get_job_metrics(
4750 # we need at least 2 points to calculate cpu_usage_percent
4851 if len (points ) < 2 :
4952 return JobMetrics (metrics = [])
50- return _calculate_job_metrics (points )
53+ return _calculate_job_metrics (job_model , points )
5154
5255
53- def _calculate_job_metrics (points : Sequence [JobMetricsPoint ]) -> JobMetrics :
56+ def _calculate_job_metrics (job_model : JobModel , points : Sequence [JobMetricsPoint ]) -> JobMetrics :
5457 timestamps : list [datetime ] = []
5558 cpu_usage_points : list [int ] = []
5659 memory_usage_points : list [int ] = []
5760 memory_working_set_points : list [int ] = []
5861 gpus_memory_usage_points : defaultdict [int , list [int ]] = defaultdict (list )
5962 gpus_util_points : defaultdict [int , list [int ]] = defaultdict (list )
6063
64+ cpus_detected_num : Optional [int ] = None
65+ memory_total : Optional [int ] = None
66+ gpu_memory_total : Optional [int ] = None
67+ resources : Optional [Resources ] = None
68+ jrd = get_job_runtime_data (job_model )
69+ if jrd is not None and jrd .offer is not None :
70+ resources = jrd .offer .instance .resources
71+ else :
72+ jpd = get_job_provisioning_data (job_model )
73+ if jpd is not None :
74+ resources = jpd .instance_type .resources
75+ if resources is not None :
76+ cpus_detected_num = resources .cpus
77+ memory_total = resources .memory_mib * 1024 * 1024
78+ if len (resources .gpus ) > 0 :
79+ gpu_memory_total = resources .gpus [0 ].memory_mib * 1024 * 1024
80+
6181 gpus_detected_num : Optional [int ] = None
6282 gpus_detected_num_mismatch : bool = False
6383 for point , prev_point in zip (points , points [1 :]):
@@ -93,25 +113,23 @@ def _calculate_job_metrics(points: Sequence[JobMetricsPoint]) -> JobMetrics:
93113 values = memory_working_set_points ,
94114 ),
95115 ]
116+ if cpus_detected_num is not None :
117+ metrics .append (_make_constant_metric ("cpus_detected_num" , timestamps , cpus_detected_num ))
118+ if memory_total is not None :
119+ metrics .append (_make_constant_metric ("memory_total_bytes" , timestamps , memory_total ))
96120 if gpus_detected_num_mismatch :
97121 # If number of GPUs changed in the time window, skip GPU metrics altogether, otherwise
98122 # results can be unpredictable (e.g, one GPU takes place of another, as they are
99123 # identified by an array index only).
100124 logger .warning ("gpus_detected_num mismatch, skipping GPU metrics" )
101125 else :
102126 metrics .append (
103- # As gpus_detected_num expected to be constant, we add only two points — the latest
104- # and the earliest in the batch
105- Metric (
106- name = "gpus_detected_num" ,
107- timestamps = [timestamps [0 ], timestamps [- 1 ]]
108- if len (timestamps ) > 1
109- else [timestamps [0 ]],
110- values = [gpus_detected_num , gpus_detected_num ]
111- if len (timestamps ) > 1
112- else [gpus_detected_num ],
113- )
127+ _make_constant_metric ("gpus_detected_num" , timestamps , get_or_error (gpus_detected_num ))
114128 )
129+ if gpu_memory_total is not None :
130+ metrics .append (
131+ _make_constant_metric ("gpu_memory_total_bytes" , timestamps , gpu_memory_total )
132+ )
115133 for index , gpu_memory_usage_points in gpus_memory_usage_points .items ():
116134 metrics .append (
117135 Metric (
@@ -131,6 +149,14 @@ def _calculate_job_metrics(points: Sequence[JobMetricsPoint]) -> JobMetrics:
131149 return JobMetrics (metrics = metrics )
132150
133151
152+ def _make_constant_metric (name : str , timestamps : list [datetime ], value : float ) -> Metric :
153+ return Metric (
154+ name = name ,
155+ timestamps = timestamps ,
156+ values = [value ] * len (timestamps ),
157+ )
158+
159+
134160def _get_cpu_usage (last_point : JobMetricsPoint , prev_point : JobMetricsPoint ) -> int :
135161 window = last_point .timestamp_micro - prev_point .timestamp_micro
136162 if window == 0 :
0 commit comments