Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/test-all-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -397,3 +397,16 @@ jobs:
BRANCH: ${{ github.ref_name }}
RUN_ID: ${{ github.run_id }}
run: npm run upload:token-usage

# Upload per-run tool usage (one row per tool call) to the Azure Table that
# powers the dashboard's per-run tool review. Runs for both scheduled and manual runs.
# Note: The managed identity must have Storage Table Data Contributor on the storage account.
- name: Upload tool usage to table
# always() keeps this step eligible to run regardless of the outcome of earlier steps;
# the step is skipped when the REPORT_STORAGE_ACCOUNT variable is empty.
if: always() && vars.REPORT_STORAGE_ACCOUNT != ''
env:
TOOL_USAGE_STORAGE_ACCOUNT: ${{ vars.REPORT_STORAGE_ACCOUNT }}
# Falls back to 'integrationtoolusage' when the TOOL_USAGE_TABLE variable is unset or empty.
TOOL_USAGE_TABLE_NAME: ${{ vars.TOOL_USAGE_TABLE || 'integrationtoolusage' }}
# The skill comes from the job matrix entry for this run.
SKILL: ${{ matrix.skill }}
BRANCH: ${{ github.ref_name }}
RUN_ID: ${{ github.run_id }}
run: npm run upload:tool-usage
13 changes: 13 additions & 0 deletions .github/workflows/test-azure-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,16 @@ jobs:
BRANCH: ${{ github.ref_name }}
RUN_ID: ${{ github.run_id }}
run: npm run upload:token-usage

# Upload per-run tool usage (one row per tool call) to the Azure Table that
# powers the dashboard's per-run tool review. Runs for both scheduled and manual runs.
# Note: The managed identity must have Storage Table Data Contributor on the storage account.
- name: Upload tool usage to table
# always() keeps this step eligible to run regardless of the outcome of earlier steps;
# the step is skipped when the REPORT_STORAGE_ACCOUNT variable is empty.
if: always() && vars.REPORT_STORAGE_ACCOUNT != ''
env:
TOOL_USAGE_STORAGE_ACCOUNT: ${{ vars.REPORT_STORAGE_ACCOUNT }}
# Falls back to 'integrationtoolusage' when the TOOL_USAGE_TABLE variable is unset or empty.
TOOL_USAGE_TABLE_NAME: ${{ vars.TOOL_USAGE_TABLE || 'integrationtoolusage' }}
# This workflow only exercises one skill, so the name is fixed.
SKILL: azure-deploy
BRANCH: ${{ github.ref_name }}
RUN_ID: ${{ github.run_id }}
run: npm run upload:tool-usage
5 changes: 4 additions & 1 deletion dashboard/api/src/functions/getData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@ import { logRequestIdentity } from "../requestIdentity";
* 5. ${DATE}/${RUN_ID}/{skill-name}/{arbitrary-test-case-name}/agent-metadata-{datetime}{optional-dedupe-suffix}.md
* 6. ${DATE}/${RUN_ID}/{skill-name}/{arbitrary-test-case-name}/agent-metadata.json
* 7. ${DATE}/${RUN_ID}/{skill-name}/{arbitrary-test-case-name}/token-usage.json
* 8. ${DATE}/${RUN_ID}/{skill-name}/{arbitrary-test-case-name}/tool-usage-{datetime}{optional-dedupe-suffix}.json
*
* The test-run-{datetime}-{skill-name}-SKILL-REPORT.md is unique per skill. It is a summarized version of the result of all test runs in its job.
* The test-consolidated-report.md is unique per test case. It is a summarized version of the result of all agent runs for its test case.
* The agent-metadata-{datetime}{optional-dedupe-suffix}.md captures the details of each agent run for its test case.
* token-usage.json and agent-metadata.json should not be exposed for now.
* The tool-usage-{datetime}{optional-dedupe-suffix}.json captures the ordered tool calls of each agent run, named to match its agent-metadata-*.md report.
* token-usage.json, agent-metadata.json, and tool-usage-*.json should not be exposed for now.
*
* For azure-deploy skill:
* 1. ${DATE}/${RUN_ID}/{skill-name}/{test-group}/test-run-{datetime}-{skill-name}-SKILL-REPORT.md
Expand All @@ -29,6 +31,7 @@ import { logRequestIdentity } from "../requestIdentity";
* 5. ${DATE}/${RUN_ID}/{skill-name}/{test-group}/{arbitrary-test-case-name}/agent-metadata-{datetime}{optional-dedupe-suffix}.md
* 6. ${DATE}/${RUN_ID}/{skill-name}/{test-group}/{arbitrary-test-case-name}/agent-metadata.json
* 7. ${DATE}/${RUN_ID}/{skill-name}/{test-group}/{arbitrary-test-case-name}/token-usage.json
* 8. ${DATE}/${RUN_ID}/{skill-name}/{test-group}/{arbitrary-test-case-name}/tool-usage-{datetime}{optional-dedupe-suffix}.json
*
* All ${DATE} are in the format of yyyy-mm-dd.
*/
Expand Down
132 changes: 132 additions & 0 deletions dashboard/api/src/functions/getToolUsage.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import { app, HttpRequest, HttpResponseInit, InvocationContext } from "@azure/functions";
import { TableClient } from "@azure/data-tables";
import { AzureCliCredential, ManagedIdentityCredential } from "@azure/identity";
import { logRequestIdentity } from "../requestIdentity";

const STORAGE_ACCOUNT_NAME = process.env.STORAGE_ACCOUNT_NAME;
const TOOL_USAGE_TABLE_NAME = process.env.TOOL_USAGE_TABLE_NAME;

/**
 * Create a TableClient bound to the tool-usage table.
 *
 * The storage account and table names come from the module-level
 * STORAGE_ACCOUNT_NAME and TOOL_USAGE_TABLE_NAME constants. When
 * AZURE_FUNCTIONS_ENVIRONMENT is "Development" the Azure CLI credential is used;
 * otherwise a managed identity credential is used, scoped to AZURE_CLIENT_ID
 * when that variable is set.
 *
 * @returns A client pointed at `https://<account>.table.core.windows.net`.
 * @throws Error when STORAGE_ACCOUNT_NAME or TOOL_USAGE_TABLE_NAME is not set.
 */
function getToolUsageTableClient(): TableClient {
  if (!STORAGE_ACCOUNT_NAME) {
    throw new Error("STORAGE_ACCOUNT_NAME environment variable is not set");
  }
  if (!TOOL_USAGE_TABLE_NAME) {
    throw new Error("TOOL_USAGE_TABLE_NAME environment variable is not set");
  }

  const clientId = process.env.AZURE_CLIENT_ID;
  const isDevEnvironment = process.env.AZURE_FUNCTIONS_ENVIRONMENT === "Development";

  let credential: AzureCliCredential | ManagedIdentityCredential;
  if (isDevEnvironment) {
    credential = new AzureCliCredential();
  } else if (clientId) {
    credential = new ManagedIdentityCredential(clientId);
  } else {
    // No client ID configured: construct the credential without one instead of
    // asserting a value that is not there.
    credential = new ManagedIdentityCredential();
  }

  return new TableClient(
    `https://${STORAGE_ACCOUNT_NAME}.table.core.windows.net`,
    TOOL_USAGE_TABLE_NAME,
    credential
  );
}

/**
 * Prepare a value for embedding between single quotes in an OData filter
 * expression: every single quote in the input is doubled.
 */
function odataLiteral(value: string): string {
  const segments = value.split("'");
  return segments.join("''");
}

/**
 * Assemble the OData filter string for a tool-usage query.
 *
 * Each supplied (non-empty) filter becomes an equality clause on the matching
 * table column; clauses are combined with "and". Note that the `test` filter
 * targets the `testName` column.
 *
 * @returns The combined filter, or undefined when no filter value was supplied.
 */
export function buildToolUsageFilter(filters: {
  skill?: string;
  test?: string;
  branch?: string;
  runId?: string;
  runToken?: string;
  runDate?: string;
}): string | undefined {
  // Column name paired with the caller-supplied value, in clause order.
  const columnValues: Array<[string, string | undefined]> = [
    ["skill", filters.skill],
    ["testName", filters.test],
    ["branch", filters.branch],
    ["runId", filters.runId],
    ["runToken", filters.runToken],
    ["runDate", filters.runDate],
  ];

  const parts: string[] = [];
  for (const [column, raw] of columnValues) {
    if (!raw) {
      continue;
    }
    parts.push(`${column} eq '${odataLiteral(raw)}'`);
  }

  if (parts.length === 0) {
    return undefined;
  }
  return parts.join(" and ");
}

/**
 * Returns integration-test tool usage rows from the table.
 * GET /api/tool-usage
 * Query params: skill (optional), test (optional), branch (optional),
 * runId (optional), runToken (optional), runDate (optional)
 *
 * Each row represents a single tool call in one run. Full tool arguments are not
 * stored here — they live in the per-run blob and are fetched on demand.
 *
 * Responses:
 * - 400 with an `error` message when no filter parameter is supplied.
 * - 200 with a JSON array of projected rows on success.
 * - 500 with a generic `error` message when the table query fails; the
 *   underlying error is logged through `context.error`, not returned.
 */
async function getToolUsage(request: HttpRequest, context: InvocationContext): Promise<HttpResponseInit> {
  logRequestIdentity(request, context, "getToolUsage");

  // Build a JSON response with the given status code.
  const jsonResponse = (status: number, payload: unknown): HttpResponseInit => ({
    status,
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });

  // Missing and empty query parameters are both treated as "not provided".
  const queryParam = (name: string): string | undefined => request.query.get(name) || undefined;

  const filter = buildToolUsageFilter({
    skill: queryParam("skill"),
    test: queryParam("test"),
    branch: queryParam("branch"),
    runId: queryParam("runId"),
    runToken: queryParam("runToken"),
    runDate: queryParam("runDate"),
  });

  // Require at least one filter. An unfiltered scan of the one-row-per-tool-call
  // table can be very large and risks timeouts / excessive storage reads.
  if (!filter) {
    return jsonResponse(400, {
      error: "At least one filter is required: skill, test, branch, runId, runToken, or runDate.",
    });
  }

  try {
    const tableClient = getToolUsageTableClient();
    const listOptions = { queryOptions: { filter } };
    const entities: Record<string, unknown>[] = [];

    // Project each entity onto the fields exposed by this endpoint so that
    // other entity properties are not included in the response.
    for await (const entity of tableClient.listEntities(listOptions)) {
      entities.push({
        skill: entity.skill,
        testName: entity.testName,
        branch: entity.branch,
        runId: entity.runId,
        runDate: entity.runDate,
        runTimestamp: entity.runTimestamp,
        runToken: entity.runToken,
        reportFile: entity.reportFile,
        sessionId: entity.sessionId,
        model: entity.model,
        order: entity.order,
        toolName: entity.toolName,
        toolCallId: entity.toolCallId,
        successState: entity.successState,
        durationMs: entity.durationMs,
        outputBytes: entity.outputBytes,
      });
    }

    return jsonResponse(200, entities);
  } catch (err: unknown) {
    // Narrow before reading `.message`; anything that is not an Error is logged as-is.
    const detail = err instanceof Error ? err.message : err;
    context.error("Error querying tool usage:", detail);
    return jsonResponse(500, { error: "Failed to query tool usage" });
  }
}

// Register getToolUsage as an HTTP-triggered function that answers GET requests
// on the "tool-usage" route.
app.http("getToolUsage", {
  route: "tool-usage",
  methods: ["GET"],
  authLevel: "anonymous",
  handler: getToolUsage,
});
5 changes: 5 additions & 0 deletions dashboard/infra/main.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ param msbenchReportsContainerName string = 'msbench-reports'
@description('Name of the Azure Table that stores integration-test token usage history.')
param tokenUsageTableName string = 'integrationtokenusage'

@description('Name of the Azure Table that stores integration-test per-run tool usage history.')
param toolUsageTableName string = 'integrationtoolusage'

@description('Principal (object) ID of the user-assigned managed identity used by the integration test pipeline to write token usage rows (skillcitestidentity).')
param ciTestIdentityPrincipalId string = '531282f7-49cb-4149-af74-6c84a5270e87'

Expand Down Expand Up @@ -76,6 +79,7 @@ module storage './modules/storage.bicep' = {
environmentName: environmentName
principalId: identity.outputs.identityPrincipalId
tokenUsageTableName: tokenUsageTableName
toolUsageTableName: toolUsageTableName
ciTestIdentityPrincipalId: ciTestIdentityPrincipalId
}
}
Expand All @@ -99,6 +103,7 @@ module functionApp './modules/function-app.bicep' = {
userAssignedIdentityClientId: identity.outputs.identityClientId
storageAccountName: storage.outputs.storageAccountName
tokenUsageTableName: storage.outputs.tokenUsageTableName
toolUsageTableName: storage.outputs.toolUsageTableName
msbenchStorageAccountName: msbenchStorageAccountName
msbenchEvalTableName: msbenchEvalTableName
msbenchReportsContainerName: msbenchReportsContainerName
Expand Down
4 changes: 4 additions & 0 deletions dashboard/infra/modules/function-app.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ param storageAccountName string
@description('Name of the Azure Table that stores integration-test token usage history.')
param tokenUsageTableName string

@description('Name of the Azure Table that stores integration-test per-run tool usage history.')
param toolUsageTableName string

@description('Application Insights connection string for monitoring.')
param appInsightsConnectionString string

Expand Down Expand Up @@ -118,6 +121,7 @@ resource functionApp 'Microsoft.Web/sites@2024-04-01' = {
{ name: 'AZURE_CLIENT_ID', value: userAssignedIdentityClientId }
{ name: 'STORAGE_ACCOUNT_NAME', value: storageAccountName }
{ name: 'TOKEN_USAGE_TABLE_NAME', value: tokenUsageTableName }
{ name: 'TOOL_USAGE_TABLE_NAME', value: toolUsageTableName }
{ name: 'MSBENCH_STORAGE_ACCOUNT', value: msbenchStorageAccountName }
{ name: 'MSBENCH_REPORTS_CONTAINER', value: msbenchReportsContainerName }
{ name: 'MSBENCH_EVAL_TABLE_NAME', value: msbenchEvalTableName }
Expand Down
13 changes: 11 additions & 2 deletions dashboard/infra/modules/storage.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ param principalId string
@description('Name of the Azure Table that stores integration-test token usage history.')
param tokenUsageTableName string = 'integrationtokenusage'

@description('Name of the Azure Table that stores integration-test per-run tool usage history.')
param toolUsageTableName string = 'integrationtoolusage'

@description('Principal (object) ID of the user-assigned managed identity used by the integration test pipeline to write token usage rows (skillcitestidentity in the skillcitest resource group, GithubCopilotForAzure-Testing subscription).')
param ciTestIdentityPrincipalId string = '531282f7-49cb-4149-af74-6c84a5270e87'

Expand Down Expand Up @@ -97,6 +100,11 @@ resource tokenUsageTable 'Microsoft.Storage/storageAccounts/tableServices/tables
name: tokenUsageTableName
}

// Table for integration-test per-run tool usage history. It is created under the
// storage account's table service and named by the toolUsageTableName parameter.
resource toolUsageTable 'Microsoft.Storage/storageAccounts/tableServices/tables@2023-05-01' = {
parent: tableServices
name: toolUsageTableName
}

resource storageBlobDataReaderRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = {
name: guid(storageAccount.id, principalId, storageBlobDataReaderRoleId)
scope: storageAccount
Expand All @@ -107,7 +115,7 @@ resource storageBlobDataReaderRole 'Microsoft.Authorization/roleAssignments@2022
}
}

// Allows the dashboard Function App identity to read token-usage entities from the table.
// Allows the dashboard Function App identity to read token-usage and tool-usage entities from the tables.
resource storageTableDataReaderRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = {
name: guid(storageAccount.id, principalId, storageTableDataReaderRoleId)
scope: storageAccount
Expand All @@ -118,7 +126,7 @@ resource storageTableDataReaderRole 'Microsoft.Authorization/roleAssignments@202
}
}

// Allows the integration test pipeline identity to write token-usage entities to the table.
// Allows the integration test pipeline identity to write token-usage and tool-usage entities to the tables.
resource storageTableDataContributorRole 'Microsoft.Authorization/roleAssignments@2022-04-01' = {
name: guid(storageAccount.id, ciTestIdentityPrincipalId, storageTableDataContributorRoleId)
scope: storageAccount
Expand All @@ -131,3 +139,4 @@ resource storageTableDataContributorRole 'Microsoft.Authorization/roleAssignment

output storageAccountName string = storageAccount.name
output tokenUsageTableName string = tokenUsageTable.name
output toolUsageTableName string = toolUsageTable.name
Loading
Loading