|
209 | 209 | ], |
210 | 210 | "type": "timeseries" |
211 | 211 | }, |
212 | | - { |
213 | | - "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
214 | | - "fieldConfig": { |
215 | | - "defaults": { |
216 | | - "color": { "mode": "palette-classic" }, |
217 | | - "custom": { "fillOpacity": 10, "lineWidth": 2 } |
218 | | - } |
219 | | - }, |
220 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 23 }, |
221 | | - "id": 21, |
222 | | - "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, |
223 | | - "title": "Allocated Sample Slots per Partition", |
224 | | - "targets": [ |
225 | | - { "expr": "tq_partition_allocated_samples", "legendFormat": "{{ partition_id }}" } |
226 | | - ], |
227 | | - "type": "timeseries" |
228 | | - }, |
229 | 212 | { |
230 | 213 | "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
231 | 214 | "fieldConfig": { |
|
349 | 332 | { "expr": "tq_storage_memory_rss_bytes", "legendFormat": "{{ storage_unit_id }}" } |
350 | 333 | ], |
351 | 334 | "type": "timeseries" |
352 | | - }, |
353 | | - { |
354 | | - "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
355 | | - "fieldConfig": { |
356 | | - "defaults": { |
357 | | - "color": { "mode": "palette-classic" }, |
358 | | - "custom": { "fillOpacity": 10, "lineWidth": 2 }, |
359 | | - "unit": "bytes" |
360 | | - } |
361 | | - }, |
362 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 54 }, |
363 | | - "id": 34, |
364 | | - "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, |
365 | | - "title": "Storage Data Memory (Estimated)", |
366 | | - "targets": [ |
367 | | - { "expr": "tq_storage_data_memory_bytes", "legendFormat": "{{ storage_unit_id }}" } |
368 | | - ], |
369 | | - "type": "timeseries" |
370 | | - }, |
371 | | - { |
372 | | - "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
373 | | - "fieldConfig": { |
374 | | - "defaults": { |
375 | | - "color": { "mode": "palette-classic" }, |
376 | | - "custom": { "fillOpacity": 20, "lineWidth": 2, "stacking": { "mode": "none" } }, |
377 | | - "unit": "ops" |
378 | | - } |
379 | | - }, |
380 | | - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 62 }, |
381 | | - "id": 35, |
382 | | - "options": { "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, |
383 | | - "title": "Storage Put/Get/Clear Rate (per second)", |
384 | | - "targets": [ |
385 | | - { "expr": "rate(tq_storage_op_total{op_type=\"PUT_DATA\"}[$__rate_interval])", "legendFormat": "PUT {{ storage_unit_id }}" }, |
386 | | - { "expr": "rate(tq_storage_op_total{op_type=\"GET_DATA\"}[$__rate_interval])", "legendFormat": "GET {{ storage_unit_id }}" }, |
387 | | - { "expr": "rate(tq_storage_op_total{op_type=\"CLEAR_DATA\"}[$__rate_interval])", "legendFormat": "CLEAR {{ storage_unit_id }}" } |
388 | | - ], |
389 | | - "type": "timeseries" |
390 | | - }, |
391 | | - { |
392 | | - "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
393 | | - "fieldConfig": { |
394 | | - "defaults": { |
395 | | - "color": { "mode": "palette-classic" }, |
396 | | - "custom": { "fillOpacity": 10, "lineWidth": 2 } |
397 | | - } |
398 | | - }, |
399 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 62 }, |
400 | | - "id": 36, |
401 | | - "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, |
402 | | - "title": "Storage Cumulative Operations", |
403 | | - "targets": [ |
404 | | - { "expr": "tq_storage_op_total", "legendFormat": "{{ op_type }} {{ storage_unit_id }}" } |
405 | | - ], |
406 | | - "type": "timeseries" |
407 | | - }, |
408 | | - { |
409 | | - "collapsed": false, |
410 | | - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 70 }, |
411 | | - "id": 104, |
412 | | - "title": "Data Lifecycle (Leak Detection)", |
413 | | - "type": "row" |
414 | | - }, |
415 | | - { |
416 | | - "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
417 | | - "fieldConfig": { |
418 | | - "defaults": { |
419 | | - "color": { "mode": "palette-classic" }, |
420 | | - "custom": { "fillOpacity": 10, "lineWidth": 2 }, |
421 | | - "unit": "short" |
422 | | - } |
423 | | - }, |
424 | | - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 71 }, |
425 | | - "id": 40, |
426 | | - "options": { "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, |
427 | | - "title": "Sample Put / Get / Clear Rate (per second)", |
428 | | - "targets": [ |
429 | | - { "expr": "sum(rate(tq_storage_samples_total{op_type=\"PUT_DATA\"}[$__rate_interval]))", "legendFormat": "put" }, |
430 | | - { "expr": "sum(rate(tq_storage_samples_total{op_type=\"GET_DATA\"}[$__rate_interval]))", "legendFormat": "get" }, |
431 | | - { "expr": "sum(rate(tq_storage_samples_total{op_type=\"CLEAR_DATA\"}[$__rate_interval]))", "legendFormat": "clear" } |
432 | | - ], |
433 | | - "type": "timeseries" |
434 | | - }, |
435 | | - { |
436 | | - "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
437 | | - "description": "put - cleared across all storage units. A steadily rising line indicates samples are being written but never cleared — potential leak. Should stay bounded in healthy RL training.", |
438 | | - "fieldConfig": { |
439 | | - "defaults": { |
440 | | - "color": { "mode": "thresholds" }, |
441 | | - "thresholds": { "steps": [ |
442 | | - { "color": "green", "value": null }, |
443 | | - { "color": "yellow", "value": 5000 }, |
444 | | - { "color": "red", "value": 10000 } |
445 | | - ]}, |
446 | | - "custom": { "fillOpacity": 15, "lineWidth": 2 }, |
447 | | - "unit": "short" |
448 | | - } |
449 | | - }, |
450 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 71 }, |
451 | | - "id": 41, |
452 | | - "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, |
453 | | - "title": "Samples In-Flight (put - cleared) [Leak Indicator]", |
454 | | - "targets": [ |
455 | | - { "expr": "sum(tq_storage_samples_total{op_type=\"PUT_DATA\"}) - sum(tq_storage_samples_total{op_type=\"CLEAR_DATA\"}) or vector(0)", "legendFormat": "in-flight samples" } |
456 | | - ], |
457 | | - "type": "timeseries" |
458 | | - }, |
459 | | - { |
460 | | - "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
461 | | - "fieldConfig": { |
462 | | - "defaults": { |
463 | | - "color": { "mode": "palette-classic" }, |
464 | | - "custom": { "fillOpacity": 10, "lineWidth": 2 }, |
465 | | - "unit": "short" |
466 | | - } |
467 | | - }, |
468 | | - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 79 }, |
469 | | - "id": 42, |
470 | | - "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, |
471 | | - "title": "Cumulative Samples Put vs Cleared (per Storage Unit)", |
472 | | - "targets": [ |
473 | | - { "expr": "tq_storage_samples_total{op_type=\"PUT_DATA\"}", "legendFormat": "put {{ storage_unit_id }}" }, |
474 | | - { "expr": "tq_storage_samples_total{op_type=\"CLEAR_DATA\"}", "legendFormat": "cleared {{ storage_unit_id }}" } |
475 | | - ], |
476 | | - "type": "timeseries" |
477 | | - }, |
478 | | - { |
479 | | - "datasource": { "type": "prometheus", "uid": "${datasource}" }, |
480 | | - "description": "Ratio of cleared samples to put samples. Healthy value approaches 1.0 over time. A value stuck well below 1.0 means samples accumulate without being reclaimed.", |
481 | | - "fieldConfig": { |
482 | | - "defaults": { |
483 | | - "color": { "mode": "continuous-GrYlRd" }, |
484 | | - "custom": { "fillOpacity": 15, "lineWidth": 2 }, |
485 | | - "unit": "percentunit", |
486 | | - "min": 0, |
487 | | - "max": 1 |
488 | | - } |
489 | | - }, |
490 | | - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 79 }, |
491 | | - "id": 43, |
492 | | - "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom" }, "tooltip": { "mode": "multi" } }, |
493 | | - "title": "Clear / Put Ratio (per Storage Unit) [Reclaim Health]", |
494 | | - "targets": [ |
495 | | - { "expr": "tq_storage_samples_total{op_type=\"CLEAR_DATA\"} / on(storage_unit_id) clamp_min(tq_storage_samples_total{op_type=\"PUT_DATA\"}, 1)", "legendFormat": "{{ storage_unit_id }}" } |
496 | | - ], |
497 | | - "type": "timeseries" |
498 | 335 | } |
499 | 336 | ], |
500 | 337 | "refresh": "10s", |
|
0 commit comments