Skip to content

Commit 2e9e76d

Browse files
fix: rewrite processing time queries to use customEvents timestamps
The trace-based queries weren't working because Python Azure Monitor SDK stores log templates and args separately. New approach: - Step Completion Time: uses LLM_Agent_Token_Usage event timestamps to compute time from doc start to each step's completion - OpenAI API Call Durations: uses dependencies table for HTTP call times - Per-Document Step Timeline: shows when each step completed per document - Total Processing Time: first-to-last step duration per document All queries now use customEvents/dependencies tables which reliably capture data from both ContentProcessor and ContentProcessorWorkflow. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent afb1772 commit 2e9e76d

2 files changed

Lines changed: 82 additions & 57 deletions

File tree

infra/dashboards/token-usage-queries.kql

Lines changed: 60 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -293,65 +293,76 @@ customEvents
293293
by process_id, Step = agent
294294
| order by process_id, TotalTokens desc
295295

296-
// 18. Processing time per pipeline step
297-
traces
296+
// 18. Step completion time (seconds from document start to step completion)
297+
customEvents
298+
| where name == 'LLM_Agent_Token_Usage'
298299
| where timestamp > ago(7d)
299-
| where message has "Pipeline stage completed"
300-
| parse message with * "process_id=" process_id ", document=" document ", stage=" stage ", elapsed=" elapsed
301-
| where isnotempty(stage)
302-
| extend elapsed_parts = split(elapsed, ":")
303-
| extend hours = toint(elapsed_parts[0])
304-
| extend mins = toint(elapsed_parts[1])
305-
| extend sec_parts = split(tostring(elapsed_parts[2]), ".")
306-
| extend secs = toint(sec_parts[0])
307-
| extend ms = toint(sec_parts[1])
308-
| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0
300+
| extend agent = tostring(customDimensions['agent_name'])
301+
| extend process_id = tostring(customDimensions['process_id'])
302+
| join kind=inner (
303+
customEvents
304+
| where name == 'LLM_Agent_Token_Usage'
305+
| where timestamp > ago(7d)
306+
| extend process_id = tostring(customDimensions['process_id'])
307+
| summarize DocStartTime = min(timestamp) by process_id
308+
) on process_id
309+
| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)
309310
| summarize
310-
AvgSeconds = round(avg(elapsed_seconds), 2),
311-
P50Seconds = round(percentile(elapsed_seconds, 50), 2),
312-
P90Seconds = round(percentile(elapsed_seconds, 90), 2),
313-
MaxSeconds = round(max(elapsed_seconds), 2),
311+
AvgCompletionTime = round(avg(StepDurationSeconds), 2),
312+
P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2),
313+
P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2),
314+
MaxCompletionTime = round(max(StepDurationSeconds), 2),
314315
Invocations = count()
315-
by Step = stage
316-
| order by AvgSeconds desc
316+
by Step = agent
317+
| order by AvgCompletionTime desc
317318

318-
// 19. Total document processing time (end-to-end workflow)
319-
traces
319+
// 19. OpenAI API call durations from dependencies table
320+
dependencies
320321
| where timestamp > ago(7d)
321-
| where message has "Workflow elapsed time"
322-
| parse message with * "Workflow elapsed time: " elapsed " (start=" * ", end=" *
323-
| where isnotempty(elapsed)
324-
| extend elapsed_parts = split(elapsed, ":")
325-
| extend hours = toint(elapsed_parts[0])
326-
| extend mins = toint(elapsed_parts[1])
327-
| extend sec_parts = split(tostring(elapsed_parts[2]), ".")
328-
| extend secs = toint(sec_parts[0])
329-
| extend ms = toint(sec_parts[1])
330-
| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0
322+
| where target has "openai" or name has "chat" or type == "HTTP" or name has "openai"
323+
| where success == true
324+
| extend durationSeconds = round(duration / 1000.0, 2)
331325
| summarize
332-
DocumentsProcessed = count(),
333-
AvgSeconds = round(avg(elapsed_seconds), 2),
334-
P50Seconds = round(percentile(elapsed_seconds, 50), 2),
335-
P90Seconds = round(percentile(elapsed_seconds, 90), 2),
336-
P95Seconds = round(percentile(elapsed_seconds, 95), 2),
337-
MaxSeconds = round(max(elapsed_seconds), 2)
326+
TotalCalls = count(),
327+
AvgSeconds = round(avg(durationSeconds), 2),
328+
P50Seconds = round(percentile(durationSeconds, 50), 2),
329+
P90Seconds = round(percentile(durationSeconds, 90), 2),
330+
MaxSeconds = round(max(durationSeconds), 2)
331+
by OperationName = name
332+
| order by TotalCalls desc
333+
| take 10
338334

339-
// 20. Per-document processing time breakdown by step
340-
traces
335+
// 20. Per-document step timeline
336+
customEvents
337+
| where name == 'LLM_Agent_Token_Usage'
341338
| where timestamp > ago(7d)
342-
| where message has "Pipeline stage completed"
343-
| parse message with * "process_id=" process_id ", document=" document ", stage=" stage ", elapsed=" elapsed
344-
| where isnotempty(stage)
345-
| extend elapsed_parts = split(elapsed, ":")
346-
| extend hours = toint(elapsed_parts[0])
347-
| extend mins = toint(elapsed_parts[1])
348-
| extend sec_parts = split(tostring(elapsed_parts[2]), ".")
349-
| extend secs = toint(sec_parts[0])
350-
| extend ms = toint(sec_parts[1])
351-
| extend elapsed_seconds = round(hours * 3600.0 + mins * 60.0 + secs + ms / 100.0, 2)
352-
| project timestamp, process_id, document, Step=stage, ElapsedSeconds=elapsed_seconds
339+
| extend agent = tostring(customDimensions['agent_name'])
340+
| extend process_id = tostring(customDimensions['process_id'])
341+
| join kind=inner (
342+
customEvents
343+
| where name == 'LLM_Agent_Token_Usage'
344+
| where timestamp > ago(7d)
345+
| extend process_id = tostring(customDimensions['process_id'])
346+
| summarize DocStartTime = min(timestamp) by process_id
347+
) on process_id
348+
| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)
349+
| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt
353350
| order by process_id, timestamp asc
354351

352+
// 21. Total document processing time (first to last step)
353+
customEvents
354+
| where name == 'LLM_Agent_Token_Usage'
355+
| where timestamp > ago(7d)
356+
| extend process_id = tostring(customDimensions['process_id'])
357+
| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id
358+
| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2)
359+
| summarize
360+
DocumentsProcessed = count(),
361+
AvgSeconds = round(avg(TotalSeconds), 2),
362+
P50Seconds = round(percentile(TotalSeconds, 50), 2),
363+
P90Seconds = round(percentile(TotalSeconds, 90), 2),
364+
MaxSeconds = round(max(TotalSeconds), 2)
365+
355366
// 21. Daily processing volume with token costs
356367
customEvents
357368
| where name == 'LLM_Agent_Token_Usage'

infra/dashboards/token-usage-workbook.json

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,9 @@
228228
"type": 3,
229229
"content": {
230230
"version": "KqlItem/1.0",
231-
"query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Pipeline stage completed\"\r\n| parse message with * \"process_id=\" process_id \", document=\" document \", stage=\" stage \", elapsed=\" elapsed\r\n| where isnotempty(stage)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0\r\n| summarize\r\n AvgSeconds = round(avg(elapsed_seconds), 2),\r\n P50Seconds = round(percentile(elapsed_seconds, 50), 2),\r\n P90Seconds = round(percentile(elapsed_seconds, 90), 2),\r\n MaxSeconds = round(max(elapsed_seconds), 2),\r\n Invocations = count()\r\n by Step = stage\r\n| order by AvgSeconds desc",
231+
"query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepDurationSeconds = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| summarize\r\n AvgCompletionTime = round(avg(StepDurationSeconds), 2),\r\n P50CompletionTime = round(percentile(StepDurationSeconds, 50), 2),\r\n P90CompletionTime = round(percentile(StepDurationSeconds, 90), 2),\r\n MaxCompletionTime = round(max(StepDurationSeconds), 2),\r\n Invocations = count()\r\n by Step = agent\r\n| order by AvgCompletionTime desc",
232232
"size": 0,
233-
"title": "Processing Time by Pipeline Step",
233+
"title": "Step Completion Time (seconds from doc start)",
234234
"timeContextFromParameter": "TimeRange",
235235
"queryType": 0,
236236
"resourceType": "microsoft.insights/components",
@@ -242,30 +242,44 @@
242242
"type": 3,
243243
"content": {
244244
"version": "KqlItem/1.0",
245-
"query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Workflow elapsed time\"\r\n| parse message with * \"Workflow elapsed time: \" elapsed \" (start=\" * \", end=\" *\r\n| where isnotempty(elapsed)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = hours * 3600.0 + mins * 60.0 + secs + ms / 100.0\r\n| summarize\r\n DocumentsProcessed = count(),\r\n AvgSeconds = round(avg(elapsed_seconds), 2),\r\n P50Seconds = round(percentile(elapsed_seconds, 50), 2),\r\n P90Seconds = round(percentile(elapsed_seconds, 90), 2),\r\n P95Seconds = round(percentile(elapsed_seconds, 95), 2),\r\n MaxSeconds = round(max(elapsed_seconds), 2)",
246-
"size": 3,
247-
"title": "Total Document Processing Time (End-to-End)",
245+
"query": "dependencies\r\n| where timestamp > ago(7d)\r\n| where target has \"openai\" or name has \"chat\" or type == \"HTTP\" or name has \"openai\"\r\n| where success == true\r\n| extend durationSeconds = round(duration / 1000.0, 2)\r\n| summarize\r\n TotalCalls = count(),\r\n AvgSeconds = round(avg(durationSeconds), 2),\r\n P50Seconds = round(percentile(durationSeconds, 50), 2),\r\n P90Seconds = round(percentile(durationSeconds, 90), 2),\r\n MaxSeconds = round(max(durationSeconds), 2)\r\n by OperationName = name\r\n| order by TotalCalls desc\r\n| take 10",
246+
"size": 0,
247+
"title": "OpenAI API Call Durations",
248248
"timeContextFromParameter": "TimeRange",
249249
"queryType": 0,
250250
"resourceType": "microsoft.insights/components",
251251
"visualization": "table"
252252
},
253-
"name": "total-processing-time"
253+
"name": "openai-call-durations"
254254
},
255255
{
256256
"type": 3,
257257
"content": {
258258
"version": "KqlItem/1.0",
259-
"query": "traces\r\n| where timestamp > ago(7d)\r\n| where message has \"Pipeline stage completed\"\r\n| parse message with * \"process_id=\" process_id \", document=\" document \", stage=\" stage \", elapsed=\" elapsed\r\n| where isnotempty(stage)\r\n| extend elapsed_parts = split(elapsed, \":\")\r\n| extend hours = toint(elapsed_parts[0])\r\n| extend mins = toint(elapsed_parts[1])\r\n| extend sec_parts = split(tostring(elapsed_parts[2]), \".\")\r\n| extend secs = toint(sec_parts[0])\r\n| extend ms = toint(sec_parts[1])\r\n| extend elapsed_seconds = round(hours * 3600.0 + mins * 60.0 + secs + ms / 100.0, 2)\r\n| project timestamp, process_id, document, Step=stage, ElapsedSeconds=elapsed_seconds\r\n| order by process_id, timestamp asc",
259+
"query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend agent = tostring(customDimensions['agent_name'])\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| join kind=inner (\r\n customEvents\r\n | where name == 'LLM_Agent_Token_Usage'\r\n | where timestamp > ago(7d)\r\n | extend process_id = tostring(customDimensions['process_id'])\r\n | summarize DocStartTime = min(timestamp) by process_id\r\n) on process_id\r\n| extend StepCompletedAt = round(datetime_diff('millisecond', timestamp, DocStartTime) / 1000.0, 2)\r\n| project timestamp, process_id, Step=agent, StepCompletedAtSeconds=StepCompletedAt\r\n| order by process_id, timestamp asc",
260260
"size": 0,
261-
"title": "Per-Document Processing Time Breakdown",
261+
"title": "Per-Document Step Timeline",
262262
"timeContextFromParameter": "TimeRange",
263263
"queryType": 0,
264264
"resourceType": "microsoft.insights/components",
265265
"visualization": "table"
266266
},
267267
"name": "per-document-time-breakdown"
268268
},
269+
{
270+
"type": 3,
271+
"content": {
272+
"version": "KqlItem/1.0",
273+
"query": "customEvents\r\n| where name == 'LLM_Agent_Token_Usage'\r\n| where timestamp > ago(7d)\r\n| extend process_id = tostring(customDimensions['process_id'])\r\n| summarize StartTime = min(timestamp), EndTime = max(timestamp) by process_id\r\n| extend TotalSeconds = round(datetime_diff('millisecond', EndTime, StartTime) / 1000.0, 2)\r\n| summarize\r\n DocumentsProcessed = count(),\r\n AvgSeconds = round(avg(TotalSeconds), 2),\r\n P50Seconds = round(percentile(TotalSeconds, 50), 2),\r\n P90Seconds = round(percentile(TotalSeconds, 90), 2),\r\n MaxSeconds = round(max(TotalSeconds), 2)",
274+
"size": 3,
275+
"title": "Total Document Processing Time (First to Last Step)",
276+
"timeContextFromParameter": "TimeRange",
277+
"queryType": 0,
278+
"resourceType": "microsoft.insights/components",
279+
"visualization": "table"
280+
},
281+
"name": "total-processing-time"
282+
},
269283
{
270284
"type": 1,
271285
"content": {

0 commit comments

Comments
 (0)