@@ -501,6 +501,92 @@ async def validate_train_composition(train_ids: list[int]) -> str:
501501 return "\n " .join (out )
502502
503503
504+ def _percentiles (vals : list [float ], ps = (0 , 5 , 10 , 25 , 50 , 75 , 90 , 95 , 100 )) -> dict :
505+ """Nearest-rank percentiles of a value list (no numpy in the server env)."""
506+ s = sorted (vals )
507+ n = len (s )
508+ out = {}
509+ for p in ps :
510+ if n == 1 :
511+ out [p ] = s [0 ]
512+ continue
513+ k = (n - 1 ) * (p / 100.0 )
514+ lo , hi = int (k ), min (int (k ) + 1 , n - 1 )
515+ out [p ] = s [lo ] + (s [hi ] - s [lo ]) * (k - lo )
516+ return out
517+
518+
@mcp.tool()
async def grid_job_bands(train_ids: list[int], check_composition: bool = True) -> str:
    """Per-JOB grid throughput distribution (percentile bands) across trains over time.

    For each train, fetches its per-run grid results (train.jsp jobResults) and
    builds percentile bands over the *individual jobs'* throughput_per_core — the
    distribution behind the grid-statistics "jobs per CPU time" histogram — NOT
    the single train-average throughput, which collapses that spread to one
    number. Use this to watch a job-performance distribution shift over time
    (e.g. an optimization landing) rather than chasing a noisy mean.

    By default runs validate_train_composition first and keeps only the trains
    that share the reference composition (set check_composition=False to skip the
    guard and band every train as given). Returns a per-train percentile table
    (p0/p10/p50/p90/p100 KB/s/core, job count) ordered by date, plus a fenced
    ```jsonl block (one {date,train,n,tpc:[...]} per train) ready to feed a
    band/fan-chart plotting script.

    Trains that were dropped by the composition guard, whose fetch failed, or
    that had no job with a positive throughput_per_core are listed explicitly
    instead of being silently omitted.
    """
    if not train_ids:
        return "No train ids given."

    if check_composition and len(train_ids) > 1:
        _groups, ref, matched, _failed = await _match_compositions(train_ids)
        if ref is None:
            return "Could not resolve composition for any train: " + \
                ", ".join(map(str, train_ids))
        # NOTE(review): this also catches any train _match_compositions could
        # not resolve at all, not only genuine mismatches — confirm whether
        # those deserve a separate label.
        dropped = [t for t in train_ids if t not in matched]
        keep = matched
    else:
        keep, dropped = list(train_ids), []

    async def fetch(tid: int):
        """Return (tid, date, per-job throughputs, error message or None)."""
        try:
            t = await _get("trains/train.jsp", {"train_id": tid})
            t = t[0] if isinstance(t, list) else t
            jr = t.get("jobResults") or []
            # Keep only jobs that report a strictly positive throughput.
            tpc = [j["throughput_per_core"] for j in jr
                   if (j.get("throughput_per_core") or 0) > 0]
            created = t.get("created")
            # "created" is treated as epoch milliseconds — TODO confirm.
            date = (datetime.datetime.fromtimestamp(
                created / 1000, datetime.timezone.utc).strftime("%Y-%m-%d")
                if created else "?")
            return tid, date, tpc, None
        except Exception as e:
            # Best effort per train: one bad train must not sink the whole
            # report, but the reason is kept so it can be shown to the caller.
            return tid, None, [], f"{type(e).__name__}: {e}"

    rows = await asyncio.gather(*(fetch(t) for t in keep))

    good = []    # (tid, date, tpc) for trains with usable data
    errors = []  # (tid, message) for trains whose fetch/parsing raised
    empty = []   # tids fetched fine but without any positive throughput
    for tid, date, tpc, err in rows:
        if err is not None:
            errors.append((tid, err))
        elif tpc:
            good.append((tid, date, tpc))
        else:
            empty.append(tid)
    good.sort(key=lambda r: (r[1], r[0]))

    notes = []
    if dropped:
        notes.append(f"Dropped (composition mismatch): {', '.join(map(str, dropped))}")
    if errors:
        notes.append("Fetch failed: "
                     + "; ".join(f"{tid} ({msg})" for tid, msg in errors))
    if empty:
        notes.append("No per-job throughput > 0: " + ", ".join(map(str, empty)))

    if not good:
        head = "No usable per-job throughput for: " + ", ".join(map(str, keep))
        return "\n".join([head, *notes])

    out = ["Per-job grid throughput bands (KB/s/core), over individual jobs "
           "(not train average):\n"]
    if notes:
        # Trailing newline leaves one blank line between notes and the table.
        out.append("\n".join(notes) + "\n")
    out.append(f"{'date':<11}{'train':>8}{'jobs':>6}"
               f"{'p0':>8}{'p10':>8}{'p50':>8}{'p90':>8}{'p100':>8}")
    out.append("-" * 65)  # 11 + 8 + 6 + 5 * 8 columns
    jsonl = []
    for tid, date, tpc in good:
        pc = _percentiles(tpc)
        # Scale to KB/s/core; assumes throughput_per_core is in B/s — TODO confirm.
        k = {p: pc[p] / 1e3 for p in pc}
        out.append(f"{date:<11}{tid:>8}{len(tpc):>6}"
                   f"{k[0]:>8.0f}{k[10]:>8.0f}{k[50]:>8.0f}{k[90]:>8.0f}{k[100]:>8.0f}")
        # The raw (unscaled) per-job values go into the JSONL payload.
        jsonl.append(json.dumps({"date": date, "train": tid,
                                 "n": len(tpc), "tpc": tpc}))
    out.append("\nData (write to a .jsonl and feed the band plot):")
    out.append("```jsonl")
    out.extend(jsonl)
    out.append("```")
    return "\n".join(out)
588+
589+
504590# ---------------------------------------------------------------------------
505591# Analysis / wagon browsing
506592#
0 commit comments