Skip to content

Commit 539425c

Browse files
add instrumentation for tempo and pyroscope
1 parent 28d3194 commit 539425c

4 files changed

Lines changed: 152 additions & 61 deletions

File tree

Dockerfile

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS base
1+
FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS base
22
USER $APP_UID
33
WORKDIR /app
44
EXPOSE 8080
@@ -17,7 +17,31 @@ FROM build AS publish
1717
ARG BUILD_CONFIGURATION=Release
1818
RUN dotnet publish "GithubActionsOrchestrator.csproj" -c $BUILD_CONFIGURATION -o /app/publish /p:UseAppHost=false
1919

20+
# Download Pyroscope native profiler
21+
FROM base AS pyroscope-dl
22+
USER root
23+
ARG TARGETARCH
24+
RUN apt-get update && apt-get install -y --no-install-recommends wget && \
25+
mkdir -p /pyroscope && \
26+
PYRO_ARCH=$(if [ "$TARGETARCH" = "arm64" ]; then echo "aarch64"; else echo "x86_64"; fi) && \
27+
wget -qO- "https://github.com/grafana/pyroscope-dotnet/releases/download/v0.14.1-pyroscope/pyroscope.0.14.1-glibc-${PYRO_ARCH}.tar.gz" | \
28+
tar xz -C /pyroscope && \
29+
apt-get remove -y wget && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
30+
2031
FROM base AS final
2132
WORKDIR /app
2233
COPY --from=publish /app/publish .
34+
COPY --from=pyroscope-dl /pyroscope /pyroscope
35+
36+
# Pyroscope CLR profiler
37+
ENV CORECLR_ENABLE_PROFILING=1
38+
ENV CORECLR_PROFILER={BD1A650D-AC5D-4896-B64F-D6FA25D6B26A}
39+
ENV CORECLR_PROFILER_PATH=/pyroscope/Pyroscope.Profiler.Native.so
40+
ENV LD_PRELOAD=/pyroscope/Pyroscope.Linux.ApiWrapper.x64.so
41+
ENV LD_LIBRARY_PATH=/pyroscope
42+
ENV DOTNET_EnableDiagnostics=1
43+
ENV DOTNET_EnableDiagnostics_IPC=0
44+
ENV DOTNET_EnableDiagnostics_Debugger=0
45+
ENV DOTNET_EnableDiagnostics_Profiler=1
46+
2347
ENTRYPOINT ["dotnet", "GithubActionsOrchestrator.dll"]

GithubActionsOrchestrator.csproj

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@
2323
<PackageReference Include="Serilog" Version="4.3.1" />
2424
<PackageReference Include="Serilog.AspNetCore" Version="10.0.0" />
2525
<PackageReference Include="Swashbuckle.AspNetCore" Version="10.1.2" />
26+
<PackageReference Include="OpenTelemetry.Extensions.Hosting" Version="1.*" />
27+
<PackageReference Include="OpenTelemetry.Instrumentation.AspNetCore" Version="1.*" />
28+
<PackageReference Include="OpenTelemetry.Instrumentation.Http" Version="1.*" />
29+
<PackageReference Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.*" />
30+
<PackageReference Include="Pyroscope" Version="0.*" />
31+
<PackageReference Include="Pyroscope.OpenTelemetry" Version="0.*" />
2632
</ItemGroup>
2733

2834
<ItemGroup>

PoolManager.cs

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
using System.Diagnostics;
12
using System.Reflection.Metadata.Ecma335;
23
using GithubActionsOrchestrator.CloudControllers;
4+
35
using GithubActionsOrchestrator.Database;
46
using GithubActionsOrchestrator.GitHub;
57
using GithubActionsOrchestrator.Models;
@@ -257,8 +259,10 @@ private async Task CleanupDatabase()
257259

258260
private async Task CheckForStuckRunners(List<GithubTargetConfiguration> targetConfig)
259261
{
262+
using var activity = Program.OrchestratorActivitySource.StartActivity("maintenance.check_stuck_runners");
263+
260264
// Check the database for runners that have been in the "created" state for longer than the cutoff computed below (currently 10 minutes).
261-
265+
262266
await using var db = new ActionsRunnerContext();
263267
var cutoffTime = DateTime.UtcNow - TimeSpan.FromMinutes(10);
264268

@@ -275,9 +279,11 @@ private async Task CheckForStuckRunners(List<GithubTargetConfiguration> targetCo
275279
.Select(r => new { r.RunnerId, r.CloudServerId, r.Hostname, r.Cloud })
276280
.ToListAsync();
277281

282+
activity?.SetTag("stuck_runners.count", stuckRunners.Count);
283+
278284
if (stuckRunners.Count == 0)
279285
return;
280-
286+
281287
// Process stuck runners and create lifecycle entries
282288
var lifecycleEntries = new List<RunnerLifecycle>();
283289

@@ -506,6 +512,8 @@ private async Task StartPoolRunners(List<GithubTargetConfiguration> targetConfig
506512

507513
private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfig)
508514
{
515+
using var activity = Program.OrchestratorActivitySource.StartActivity("maintenance.check_stuck_jobs");
516+
509517
await using var db = new ActionsRunnerContext();
510518
var stuckTime = DateTime.UtcNow - TimeSpan.FromMinutes(10);
511519

@@ -514,6 +522,8 @@ private async Task CheckForStuckJobs(List<GithubTargetConfiguration> targetConfi
514522
.Where(x => (x.State == JobState.Queued || x.State == JobState.Throttled) && x.RunnerId == null && x.QueueTime < stuckTime)
515523
.ToListAsync();
516524

525+
activity?.SetTag("stuck_jobs.count", stuckJobs.Count);
526+
517527
foreach (var stuckJob in stuckJobs)
518528
{
519529
var owner = targetConfig.FirstOrDefault(x => x.Name == stuckJob.Owner);
@@ -893,9 +903,13 @@ private async Task CleanUpRunners(List<GithubTargetConfiguration> targetConfigs)
893903

894904
private async Task<bool> DeleteRunner(DeleteRunnerTask rt)
895905
{
906+
using var activity = Program.OrchestratorActivitySource.StartActivity("runner.delete");
907+
activity?.SetTag("runner.db_id", rt.RunnerDbId);
908+
activity?.SetTag("runner.server_id", rt.ServerId);
909+
896910
await using var db = new ActionsRunnerContext();
897911
var runner = await db.Runners.Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.RunnerId == rt.RunnerDbId);
898-
912+
899913
try
900914
{
901915
ICloudController cc = _cc.FirstOrDefault(x => x.CloudIdentifier == runner.Cloud);
@@ -917,6 +931,8 @@ private async Task<bool> DeleteRunner(DeleteRunnerTask rt)
917931
}
918932
catch (Exception ex)
919933
{
934+
activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
935+
activity?.AddException(ex);
920936
SentrySdk.CaptureException(ex, scope =>
921937
{
922938
scope.SetTag("server-id", rt.ServerId.ToString());
@@ -952,6 +968,10 @@ private async Task<bool> DeleteRunner(DeleteRunnerTask rt)
952968

953969
private async Task<bool> CreateRunner(CreateRunnerTask rt)
954970
{
971+
using var activity = Program.OrchestratorActivitySource.StartActivity("runner.create");
972+
activity?.SetTag("runner.db_id", rt.RunnerDbId);
973+
activity?.SetTag("runner.repo", rt.RepoName);
974+
955975
await using var db = new ActionsRunnerContext();
956976
var runner = await db.Runners.Include(x => x.Lifecycle).FirstOrDefaultAsync(x => x.RunnerId == rt.RunnerDbId);
957977

@@ -1054,6 +1074,8 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
10541074
{
10551075
newRunner = await cc.CreateNewRunner(runner.Arch, runner.Size, runnerToken, targetName, runner.IsCustom, runner.Profile);
10561076
_logger.LogInformation($"New Runner {newRunner.Name} [{runner.Size} on {runner.Arch}] entering pool for {targetName}.");
1077+
activity?.SetTag("runner.hostname", newRunner.Name);
1078+
activity?.SetTag("runner.cloud", cc.CloudIdentifier);
10571079
MachineCreatedCount.Labels(runner.Owner, runner.Size).Inc();
10581080

10591081
runner.Hostname = newRunner.Name;
@@ -1092,6 +1114,8 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
10921114
}
10931115
catch (Exception ex)
10941116
{
1117+
activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
1118+
activity?.AddException(ex);
10951119
SentrySdk.CaptureException(ex, scope =>
10961120
{
10971121
scope.SetTag("runner-size", runner.Size);

Program.cs

Lines changed: 94 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
using System.Diagnostics;
12
using System.Security.Cryptography;
23
using System.Text;
34
using System.Text.Json;
@@ -8,6 +9,8 @@
89
using GithubActionsOrchestrator.Models;
910
using Microsoft.AspNetCore.Mvc;
1011
using Microsoft.EntityFrameworkCore;
12+
using OpenTelemetry.Resources;
13+
using OpenTelemetry.Trace;
1114
using Prometheus;
1215
using Serilog;
1316
using Serilog.Events;
@@ -18,6 +21,9 @@ public class Program
1821
{
1922
public static AutoScalerConfiguration Config = new();
2023

24+
internal const string ServiceName = "github-actions-orchestrator";
25+
internal static readonly ActivitySource OrchestratorActivitySource = new(ServiceName);
26+
2127
private static readonly Counter ProcessedJobCount = Metrics
2228
.CreateCounter("github_autoscaler_jobs_processed", "Number of processed jobs", labelNames: ["org", "size"]);
2329

@@ -95,6 +101,18 @@ public static void Main(string[] args)
95101

96102
WebApplicationBuilder builder = WebApplication.CreateBuilder(args);
97103
builder.Services.AddSerilog();
104+
builder.Services.AddOpenTelemetry()
105+
.ConfigureResource(resource => resource.AddService(serviceName: ServiceName))
106+
.WithTracing(tracing =>
107+
{
108+
tracing
109+
.AddSource(ServiceName)
110+
.AddSource("Npgsql")
111+
.AddAspNetCoreInstrumentation()
112+
.AddHttpClientInstrumentation()
113+
.AddOtlpExporter()
114+
.AddProcessor(new Pyroscope.OpenTelemetry.PyroscopeSpanProcessor());
115+
});
98116
builder.Services.AddSingleton<RunnerQueue>();
99117
builder.Services.AddHostedService<PoolManager>();
100118

@@ -209,6 +227,8 @@ public static bool LoadConfiguration()
209227

210228
private static async Task<IResult> GithubWebhookHandler(HttpRequest request, [FromServices] HetznerCloudController cloud, [FromServices] ILogger<Program> logger, [FromServices] RunnerQueue poolMgr)
211229
{
230+
using var activity = OrchestratorActivitySource.StartActivity("webhook.github");
231+
212232
// Verify webhook HMAC
213233
request.EnableBuffering();
214234
string requestBody;
@@ -227,6 +247,7 @@ private static async Task<IResult> GithubWebhookHandler(HttpRequest request, [Fr
227247
}
228248
catch (Exception ex)
229249
{
250+
activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
230251
logger.LogWarning($"Webhook signature verification failed: {ex.Message}");
231252
return Results.StatusCode(401);
232253
}
@@ -246,6 +267,8 @@ private static async Task<IResult> GithubWebhookHandler(HttpRequest request, [Fr
246267
return Results.StatusCode(201);
247268
}
248269

270+
activity?.SetTag("github.action", action);
271+
249272
if (!json.RootElement.TryGetProperty("workflow_job", out JsonElement workflowJson))
250273
{
251274
logger.LogDebug("Received a non-workflowJob request. Ignoring.");
@@ -274,6 +297,9 @@ private static async Task<IResult> GithubWebhookHandler(HttpRequest request, [Fr
274297
string orgName = Config.TargetConfigs.FirstOrDefault(x => x.Target == TargetType.Organization && x.Name.ToLower() == orgNameRequest.ToLower())?.Name ?? orgNameRequest;
275298
string repoName = Config.TargetConfigs.FirstOrDefault(x => x.Target == TargetType.Repository && x.Name.ToLower() == repoNameRequest.ToLower())?.Name ?? repoNameRequest;
276299

300+
activity?.SetTag("github.job_id", jobId);
301+
activity?.SetTag("github.repo", repoName);
302+
activity?.SetTag("github.org", orgName);
277303

278304
// Check whether it's an org or a repo
279305
if (String.IsNullOrEmpty(orgName))
@@ -292,81 +318,90 @@ private static async Task<IResult> GithubWebhookHandler(HttpRequest request, [Fr
292318
return Results.StatusCode(201);
293319
}
294320

295-
321+
296322
await using var db = new ActionsRunnerContext();
297323

298324
try
299325
{
300326
switch (action)
301327
{
302328
case "queued":
303-
await JobQueued(logger, repoName, labels, orgName, poolMgr, isRepo ? TargetType.Repository : TargetType.Organization, jobId, jobUrl);
329+
using (OrchestratorActivitySource.StartActivity("webhook.github.queued"))
330+
{
331+
await JobQueued(logger, repoName, labels, orgName, poolMgr, isRepo ? TargetType.Repository : TargetType.Organization, jobId, jobUrl);
332+
}
304333
break;
305334
case "in_progress":
306-
var dbWorkflow = await db.Jobs.FirstOrDefaultAsync(x => x.GithubJobId == jobId);
307-
if (dbWorkflow == null)
335+
using (OrchestratorActivitySource.StartActivity("webhook.github.in_progress"))
308336
{
309-
logger.LogWarning("Processing job on manually created runner");
310-
Job progressJob = new()
337+
var dbWorkflow = await db.Jobs.FirstOrDefaultAsync(x => x.GithubJobId == jobId);
338+
if (dbWorkflow == null)
311339
{
312-
GithubJobId = jobId,
313-
Repository = repoName,
314-
Owner = isRepo ? repoName : orgName,
315-
State = JobState.InProgress,
316-
InProgressTime = DateTime.UtcNow,
317-
JobUrl = jobUrl,
318-
Orphan = true
319-
};
320-
await db.Jobs.AddAsync(progressJob);
321-
}
322-
else
323-
{
324-
dbWorkflow.State = JobState.InProgress;
325-
dbWorkflow.QueueTime = DateTime.UtcNow;
340+
logger.LogWarning("Processing job on manually created runner");
341+
Job progressJob = new()
342+
{
343+
GithubJobId = jobId,
344+
Repository = repoName,
345+
Owner = isRepo ? repoName : orgName,
346+
State = JobState.InProgress,
347+
InProgressTime = DateTime.UtcNow,
348+
JobUrl = jobUrl,
349+
Orphan = true
350+
};
351+
await db.Jobs.AddAsync(progressJob);
352+
}
353+
else
354+
{
355+
dbWorkflow.State = JobState.InProgress;
356+
dbWorkflow.QueueTime = DateTime.UtcNow;
357+
}
358+
await db.SaveChangesAsync();
359+
await JobInProgress(workflowJson, logger, jobId, repoName, orgName);
326360
}
327-
await db.SaveChangesAsync();
328-
await JobInProgress(workflowJson, logger, jobId, repoName, orgName);
329361
break;
330362
case "completed":
331-
string conclusion = String.Empty;
332-
if (json.RootElement.TryGetProperty("conclusion", out JsonElement conclusionJson))
363+
using (OrchestratorActivitySource.StartActivity("webhook.github.completed"))
333364
{
334-
conclusion = conclusionJson.GetString() ?? string.Empty;
335-
}
365+
string conclusion = String.Empty;
366+
if (json.RootElement.TryGetProperty("conclusion", out JsonElement conclusionJson))
367+
{
368+
conclusion = conclusionJson.GetString() ?? string.Empty;
369+
}
336370

337-
var dbWorkflowComplete = await db.Jobs.FirstOrDefaultAsync(x => x.GithubJobId == jobId);
338-
if (dbWorkflowComplete == null)
339-
{
340-
logger.LogWarning($"Completed webhook for unknown job {jobId} in {repoName}. Creating record.");
341-
dbWorkflowComplete = new Job
371+
var dbWorkflowComplete = await db.Jobs.FirstOrDefaultAsync(x => x.GithubJobId == jobId);
372+
if (dbWorkflowComplete == null)
342373
{
343-
GithubJobId = jobId,
344-
Repository = repoName,
345-
Owner = isRepo ? repoName : orgName,
346-
State = JobState.Completed,
347-
CompleteTime = DateTime.UtcNow,
348-
Orphan = true
349-
};
350-
await db.Jobs.AddAsync(dbWorkflowComplete);
351-
await db.SaveChangesAsync();
352-
return Results.StatusCode(201);
353-
}
354-
dbWorkflowComplete.CompleteTime = DateTime.UtcNow;
355-
bool wasCancelled = false;
356-
switch (conclusion)
357-
{
358-
case "cancelled":
359-
dbWorkflowComplete.State = JobState.Cancelled;
360-
await db.SaveChangesAsync();
361-
wasCancelled = true;
362-
break;
363-
default:
364-
dbWorkflowComplete.State = JobState.Completed;
374+
logger.LogWarning($"Completed webhook for unknown job {jobId} in {repoName}. Creating record.");
375+
dbWorkflowComplete = new Job
376+
{
377+
GithubJobId = jobId,
378+
Repository = repoName,
379+
Owner = isRepo ? repoName : orgName,
380+
State = JobState.Completed,
381+
CompleteTime = DateTime.UtcNow,
382+
Orphan = true
383+
};
384+
await db.Jobs.AddAsync(dbWorkflowComplete);
365385
await db.SaveChangesAsync();
366-
break;
367-
}
386+
return Results.StatusCode(201);
387+
}
388+
dbWorkflowComplete.CompleteTime = DateTime.UtcNow;
389+
bool wasCancelled = false;
390+
switch (conclusion)
391+
{
392+
case "cancelled":
393+
dbWorkflowComplete.State = JobState.Cancelled;
394+
await db.SaveChangesAsync();
395+
wasCancelled = true;
396+
break;
397+
default:
398+
dbWorkflowComplete.State = JobState.Completed;
399+
await db.SaveChangesAsync();
400+
break;
401+
}
368402

369-
await JobCompleted(logger, jobId, poolMgr, repoName, orgName, workflowJson, wasCancelled);
403+
await JobCompleted(logger, jobId, poolMgr, repoName, orgName, workflowJson, wasCancelled);
404+
}
370405
break;
371406
default:
372407
logger.LogWarning("Unknown action. Ignoring");
@@ -375,12 +410,14 @@ private static async Task<IResult> GithubWebhookHandler(HttpRequest request, [Fr
375410
}
376411
catch (Exception ex)
377412
{
413+
activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
414+
activity?.AddException(ex);
378415
// This should mark the webhook as failed so that the timer redelivers it after a while
379416
Log.Error($"Failed to process {action} webhook: {ex.Message}");
380417
return Results.StatusCode(500);
381418
}
382419

383-
// All was well
420+
// All was well
384421
return Results.StatusCode(201);
385422
}
386423

0 commit comments

Comments
 (0)