11#!/usr/bin/env python3
22"""MCP server for TMLL CLI - exposes all CLI commands as MCP tools."""
33
4+ import base64
5+ import contextlib
6+ import functools
7+ import io
8+ import json
49import subprocess
510import sys
11+ import traceback as _tb
612import urllib .request
713from pathlib import Path
814from typing import Optional
915
16+ import matplotlib
17+ matplotlib .use ("Agg" )
18+ import matplotlib .pyplot as plt
19+ import pandas as pd
20+
1021from mcp .server .fastmcp import FastMCP
22+ from mcp .types import ImageContent , TextContent
1123
1224mcp = FastMCP ("tmll-cli-mcp-server" )
1325
1729DEFAULT_PORT = 8080
1830
1931
32+ # ---------------------------------------------------------------------------
33+ # Debug helpers
34+ # ---------------------------------------------------------------------------
35+
def _log(msg: str) -> None:
    """Emit one tagged debug line on stderr.

    stderr is used so the message stays clear of the MCP stdio transport.
    The stream is flushed on every call.
    """
    line = f"[tmll-mcp-debug] {msg}"
    print(line, file=sys.stderr, flush=True)
39+
40+
@contextlib.contextmanager
def _protect_stdout():
    """Send ``sys.stdout`` to ``sys.stderr`` while the ``with`` body runs.

    Stray ``print()`` calls made inside the body therefore cannot corrupt the
    MCP stdio transport. The previous ``sys.stdout`` object is put back when
    the body exits, whether normally or through an exception.
    """
    # redirect_stdout performs the same save / swap / restore sequence.
    with contextlib.redirect_stdout(sys.stderr):
        yield
50+
51+
def _safe_tool(fn):
    """Wrap a tool function with stdout protection and call tracing.

    The returned wrapper logs the call and its arguments, runs ``fn`` with
    stdout redirected to stderr, logs a truncated ``repr`` of the result on
    success, and logs the exception plus traceback before re-raising on
    failure. The result and any exception pass through unchanged.
    """
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        tool_name = fn.__name__
        _log(f">>> TOOL CALL {tool_name} args={args!r} kwargs={kwargs!r}")
        with _protect_stdout():
            try:
                outcome = fn(*args, **kwargs)
                # Only the first 300 characters of the repr are logged.
                _log(f"<<< TOOL OK {tool_name} result_preview={repr(outcome)[:300]}")
                return outcome
            except Exception as err:
                trace_text = _tb.format_exc()
                _log(f"!!! TOOL ERROR {tool_name} {type(err).__name__}: {err}\n{trace_text}")
                raise
    return wrapper
69+
70+
71+ # ---------------------------------------------------------------------------
72+ # Server health
73+ # ---------------------------------------------------------------------------
74+
def _server_is_running(host: str = DEFAULT_HOST, port: int = DEFAULT_PORT) -> bool:
    """Check if the trace server is reachable.

    Issues a GET against ``/tsp/api/health`` with a 3 second timeout.

    Args:
        host: Host name or address of the trace server.
        port: TCP port of the trace server.

    Returns:
        True when the request completes without raising, False otherwise.
        The failure reason is written to the debug log.
    """
    url = f"http://{host}:{port}/tsp/api/health"
    try:
        # Use the response as a context manager so the underlying socket is
        # closed right away instead of lingering until garbage collection.
        with urllib.request.urlopen(url, timeout=3):
            return True
    except Exception as exc:
        # Best-effort probe: any failure simply means "not reachable".
        _log(f"Server health check failed ({url}): {exc}")
        return False
2784
2885
@@ -45,14 +102,46 @@ def ensure_server(host: str = DEFAULT_HOST, port: int = DEFAULT_PORT) -> str:
45102 return "Trace server was launched but is not yet responding. It may need more time to start."
46103
47104
105+ # ---------------------------------------------------------------------------
106+ # CLI runner
107+ # ---------------------------------------------------------------------------
108+
def run_cli(*args: str) -> str:
    """Run a tmll_cli.py command and return output.

    The CLI is started with the current interpreter and ``--log-stderr``,
    with stdout/stderr captured as text and a 120 second timeout.

    Args:
        *args: Arguments appended after ``--log-stderr`` on the command line.

    Returns:
        The CLI's stdout with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the CLI times out, cannot be launched, or exits with
            a non-zero status. The message carries the command line and the
            captured output; the original exception, when there is one, is
            attached as ``__cause__``.
    """
    cmd = [sys.executable, CLI_PATH, "--log-stderr", *args]
    # Render the command once; str() keeps this safe if an element is a
    # path-like object rather than a plain string.
    cmd_text = " ".join(str(part) for part in cmd)
    _log(f"run_cli: executing {cmd_text}")
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired as exc:
        msg = (
            f"CLI timed out after 120s\n"
            f"  command: {cmd_text}\n"
            f"  partial stdout: {exc.stdout!r}\n"
            f"  partial stderr: {exc.stderr!r}"
        )
        _log(f"run_cli TIMEOUT: {msg}")
        raise RuntimeError(msg) from exc
    except Exception as exc:
        msg = (
            f"Failed to launch CLI: {type(exc).__name__}: {exc}\n"
            f"  command: {cmd_text}"
        )
        _log(f"run_cli LAUNCH ERROR: {msg}")
        raise RuntimeError(msg) from exc

    _log(f"run_cli: exit_code={result.returncode} stdout_len={len(result.stdout)} stderr_len={len(result.stderr)}")
    stderr_text = result.stderr.strip()
    if stderr_text:
        _log(f"run_cli stderr:\n{stderr_text}")

    if result.returncode != 0:
        msg = (
            f"CLI exited with code {result.returncode}\n"
            f"  command: {cmd_text}\n"
            f"  stdout: {result.stdout.strip()}\n"
            f"  stderr: {stderr_text}"
        )
        _log(f"run_cli FAILED: {msg}")
        raise RuntimeError(msg)
    return result.stdout.strip()
57146
58147
@@ -81,33 +170,42 @@ def _global_args(host: Optional[str], port: Optional[int]) -> list[str]:
81170 return args
82171
83172
173+ # ---------------------------------------------------------------------------
174+ # Tools
175+ # ---------------------------------------------------------------------------
176+
@mcp.tool()
@_safe_tool
def create_experiment(traces: list[str], experiment_name: str, host: Optional[str] = None, port: Optional[int] = None) -> str:
    """Create a trace experiment from LTTng trace files or directories."""
    # Order on the command line: global host/port arguments, the "create"
    # sub-command, every trace path, then "-n <experiment_name>".
    cli_args = [*_global_args(host, port), "create", *traces, "-n", experiment_name]
    return run_cli(*cli_args)
88182
89183
@mcp.tool()
@_safe_tool
def list_experiments() -> str:
    """List all open experiments."""
    # Delegates to the CLI "list" sub-command and hands back its stdout.
    listing = run_cli("list")
    return listing
94189
95190
@mcp.tool()
@_safe_tool
def list_outputs(experiment_id: str, keywords: Optional[list[str]] = None) -> str:
    """List available outputs for an experiment."""
    # Keywords are passed to build_args as given (possibly None) under "-k".
    option_spec = {"keywords": ("-k", keywords)}
    keyword_flags = build_args(option_spec)
    return run_cli("list-outputs", experiment_id, *keyword_flags)
101197
102198
@mcp.tool()
@_safe_tool
def fetch_data(experiment_id: str, keywords: Optional[list[str]] = None, output_file: Optional[str] = None) -> str:
    """Fetch data from experiment outputs."""
    # Missing or empty keywords fall back to ["cpu usage"].
    option_spec = {
        "keywords": ("-k", keywords or ["cpu usage"]),
        "output_file": ("-o", output_file),
    }
    return run_cli("fetch-data", experiment_id, *build_args(option_spec))
108205
109206
@mcp.tool()
@_safe_tool
def delete_experiment(experiment_id: str) -> str:
    """Delete an experiment."""
    # Delegates to the CLI "delete" sub-command for the given experiment id.
    cli_output = run_cli("delete", experiment_id)
    return cli_output
@@ -121,6 +219,7 @@ def detect_anomalies(experiment_id: str, keywords: Optional[list[str]] = None, m
121219
122220
123221@mcp .tool ()
222+ @_safe_tool
124223def detect_memory_leak (experiment_id : str , keywords : Optional [list [str ]] = None ) -> str :
125224 """Detect memory leaks in trace data."""
126225 args = build_args ({"keywords" : ("-k" , keywords or ["memory" ])})
@@ -135,6 +234,7 @@ def detect_changepoints(experiment_id: str, keywords: Optional[list[str]] = None
135234
136235
137236@mcp .tool ()
237+ @_safe_tool
138238def analyze_correlation (experiment_id : str , keywords : Optional [list [str ]] = None , method : Optional [str ] = None ) -> str :
139239 """Analyze correlation between outputs for root cause analysis (pearson, kendall, spearman)."""
140240 args = build_args ({"keywords" : ("-k" , keywords or ["cpu" , "memory" ]), "method" : ("-m" , method or "pearson" )})
@@ -157,11 +257,109 @@ def detect_idle_resources(experiment_id: str, keywords: Optional[list[str]] = No
157257
158258
@mcp.tool()
@_safe_tool
def plan_capacity(experiment_id: str, keywords: Optional[list[str]] = None, horizon: Optional[int] = None) -> str:
    """Perform capacity planning with predictive models."""
    # Falsy inputs fall back to the defaults: ["cpu usage"] and a horizon of 100.
    option_spec = {
        "keywords": ("-k", keywords or ["cpu usage"]),
        "horizon": ("-H", horizon or 100),
    }
    return run_cli("capacity", experiment_id, *build_args(option_spec))
164265
165266
@mcp.tool()
@_safe_tool
def plot_xy_with_anomalies(
    experiment_id: str,
    keywords: Optional[list[str]] = None,
    method: Optional[str] = None,
    host: Optional[str] = None,
    port: Optional[int] = None,
    resample_freq: Optional[str] = None,
) -> list[TextContent | ImageContent]:
    """Fetch XY data from an experiment, run anomaly detection, and return an annotated plot image with a text summary."""
    # Parameters:
    #   experiment_id  id of the experiment to look up on the trace server
    #   keywords       output keywords; defaults to ["cpu usage"]
    #   method         anomaly detection method; defaults to "iforest"
    #   host / port    trace server address; default to DEFAULT_HOST / DEFAULT_PORT
    #   resample_freq  forwarded to AnomalyDetection only when provided
    # Returns a list whose first item is the text summary, followed by one PNG
    # image per dataframe; early exits return a single text item instead.

    # TMLL imports stay function-local, so they are resolved only when this
    # tool is invoked.
    from tmll.tmll_client import TMLLClient
    from tmll.common.models.experiment import Experiment
    from tmll.ml.modules.anomaly_detection.anomaly_detection_module import AnomalyDetection

    h = host or DEFAULT_HOST
    p = port or DEFAULT_PORT
    keywords = keywords or ["cpu usage"]
    method = method or "iforest"

    client = TMLLClient(h, p)

    resp = client.tsp_client.fetch_experiment(experiment_id)
    if resp.status_code != 200:
        return [TextContent(type="text", text=f"Experiment {experiment_id} not found (status={resp.status_code}).")]
    experiment = Experiment.from_tsp_experiment(resp.model)
    experiment.assign_outputs(client._fetch_outputs(experiment))

    outputs = experiment.find_outputs(keyword=keywords, type=["xy"])
    if not outputs:
        return [TextContent(type="text", text="No XY outputs found matching keywords.")]

    ad_kwargs = {}
    if resample_freq:
        ad_kwargs["resample_freq"] = resample_freq
    ad = AnomalyDetection(client, experiment, outputs, **ad_kwargs)
    result = ad.find_anomalies(method=method)
    if not result or not result.anomalies:
        return [TextContent(type="text", text="Anomaly detection returned no results.")]

    colors = plt.colormaps.get_cmap("tab10")
    contents: list[TextContent | ImageContent] = []
    total_anomalies = 0

    for idx, (name, dataframe) in enumerate(ad.dataframes.items()):
        anomaly_df = result.anomalies.get(name, pd.DataFrame())
        periods = result.anomaly_periods.get(name, [])

        fig, ax = plt.subplots(figsize=(14, 4), dpi=120)
        try:
            # Wrap the colour index so series beyond the palette size cycle
            # through the colours instead of all getting the last one.
            ax.plot(dataframe.index, dataframe.iloc[:, 0], color=colors(idx % colors.N), linewidth=1.2, label=name)

            # Label only the first span so the legend shows a single entry.
            for i, (start, end) in enumerate(periods):
                ax.axvspan(start, end, color="red", alpha=0.2, label="Anomaly Period" if i == 0 else None)

            if not anomaly_df.empty:
                # Prefer the "*_is_anomaly" flag columns; otherwise treat any
                # truthy cell in the row as an anomaly marker.
                is_anomaly_cols = anomaly_df.filter(regex="_is_anomaly$")
                if not is_anomaly_cols.empty:
                    is_anomaly = is_anomaly_cols.any(axis=1)
                else:
                    is_anomaly = anomaly_df.any(axis=1)
                n_anomaly_points = int(is_anomaly.sum())
                total_anomalies += n_anomaly_points

                # Scatter points not already inside a shaded period
                for point in anomaly_df[is_anomaly].index:
                    if any(s <= point <= e for s, e in periods):
                        continue
                    if point in dataframe.index:
                        ax.scatter(point, dataframe.loc[point].values[0], color="red", s=40, zorder=5)

            ax.set_title(f"Anomaly Detection: {name} ({method})")
            ax.set_xlabel("Time")
            ax.set_ylabel(name)
            ax.legend(loc="upper right", fontsize=8)
            fig.tight_layout()

            buf = io.BytesIO()
            fig.savefig(buf, format="png")
        finally:
            # pyplot keeps every open figure referenced; always release it,
            # including when plotting or saving raised.
            plt.close(fig)

        encoded_png = base64.b64encode(buf.getvalue()).decode()
        contents.append(ImageContent(type="image", data=encoded_png, mimeType="image/png"))

    period_summary = [
        f"  {name}: {start} → {end}"
        for name, periods in result.anomaly_periods.items()
        for start, end in periods
    ]

    summary = f"Found {total_anomalies} anomalies across {len(ad.dataframes)} outputs using '{method}'."
    if period_summary:
        summary += "\n\nAnomaly periods:\n" + "\n".join(period_summary)

    # The text summary goes first, ahead of the images.
    contents.insert(0, TextContent(type="text", text=summary))
    return contents
360+
361+
362+
if __name__ == "__main__":
    # Announce start-up on stderr (CLI script path and interpreter in use),
    # then hand control to the MCP server.
    _log(f"MCP server starting — CLI_PATH={CLI_PATH} python={sys.executable}")
    mcp.run()
0 commit comments