Skip to content

Commit 5950721

Browse files
add better exception logging
1 parent d4f779d commit 5950721

3 files changed

Lines changed: 82 additions & 35 deletions

File tree

CloudControllers/ProxmoxCloudController.cs

Lines changed: 1 addition & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -53,22 +53,13 @@ public async Task<Machine> CreateNewRunner(string arch, string size, string runn
5353
// Get next available VMID
5454

5555
await _semaphore.WaitAsync();
56-
5756
string macaddress = string.Empty;
58-
59-
60-
6157
int newVmId;
58+
6259
try
6360
{
64-
6561
// Select node
6662
var resources = (await client.Cluster.Resources.GetAsync("vm")).ToList();
67-
/*var vmCountByNode = resources
68-
.Where(x => x.Name.StartsWith(Program.Config.RunnerPrefix))
69-
.GroupBy(x => x.Node)
70-
.Select(x => new { Node = x.Key, Count = x.Count()});*/
71-
7263
var availableNodes = resources.Select(x => x.Node).Distinct().ToList();
7364

7465
var vmCountByNode = availableNodes

ExceptionExtensions.cs

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,33 @@
1+
// Create an extension method for formatting exception details
2+
3+
using System.Text;
4+
5+
/// <summary>
/// Extension methods for turning exceptions into log-friendly text.
/// </summary>
public static class ExceptionExtensions
{
    /// <summary>
    /// Formats an exception and every exception reachable through its
    /// <see cref="Exception.InnerException"/> chain as multi-line text.
    /// </summary>
    /// <remarks>
    /// Each level contributes a <c>Type:</c>, <c>Message:</c> and <c>Stack Trace:</c> line.
    /// Levels after the first are preceded by a blank line and an
    /// <c>Inner Exception Level N:</c> header.
    /// </remarks>
    /// <param name="ex">The outermost exception. A null reference yields an empty string.</param>
    /// <returns>The formatted details, one line per field, each terminated by a line break.</returns>
    public static string GetFullExceptionDetails(this Exception ex)
    {
        var sb = new StringBuilder();
        var currentEx = ex;
        var level = 0;

        while (currentEx != null)
        {
            if (level > 0)
            {
                sb.AppendLine($"Inner Exception Level {level}:");
            }

            // The runtime type is often the only way to tell apart exceptions
            // whose messages are generic or empty.
            sb.AppendLine($"Type: {currentEx.GetType().FullName}");
            sb.AppendLine($"Message: {currentEx.Message}");
            // StackTrace is null for an exception that was never thrown;
            // interpolation renders that as an empty string.
            sb.AppendLine($"Stack Trace: {currentEx.StackTrace}");

            currentEx = currentEx.InnerException;
            level++;

            // Blank separator line between levels, but not after the last one.
            if (currentEx != null)
            {
                sb.AppendLine();
            }
        }

        return sb.ToString();
    }
}

PoolManager.cs

Lines changed: 48 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -188,9 +188,9 @@ private async Task ProcessStats(List<GithubTargetConfiguration> targetConfig)
188188
{
189189
CspRunnerCount.Labels(cc.CloudIdentifier).Set(await cc.GetServerCountFromCsp());
190190
}
191-
catch
191+
catch(Exception ex)
192192
{
193-
_logger.LogWarning($"Unable to get runner count from CSP {cc.CloudIdentifier}");
193+
_logger.LogWarning($"Unable to get runner count from CSP {cc.CloudIdentifier}: {ex.GetFullExceptionDetails()}");
194194
}
195195
}
196196

@@ -723,7 +723,7 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
723723
}
724724

725725
var cc = _cc.First(x => x.CloudIdentifier == selectedProvider.Cloud);
726-
726+
727727
try
728728
{
729729
string targetName = rt.TargetType switch
@@ -732,34 +732,57 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
732732
TargetType.Organization => runner.Owner,
733733
_ => throw new ArgumentOutOfRangeException()
734734
};
735-
736-
737-
Machine newRunner = await cc.CreateNewRunner(runner.Arch, runner.Size, rt.RunnerToken, targetName, runner.IsCustom, runner.Profile);
738-
_logger.LogInformation($"New Runner {newRunner.Name} [{runner.Size} on {runner.Arch}] entering pool for {targetName}.");
739-
MachineCreatedCount.Labels(runner.Owner, runner.Size).Inc();
740-
741-
runner.Hostname = newRunner.Name;
742-
runner.IsOnline = true;
743-
runner.CloudServerId = newRunner.Id;
744-
runner.IPv4 = newRunner.Ipv4;
745-
runner.Cloud = cc.CloudIdentifier;
746-
runner.ProvisionId = newRunner.ProvisionId;
747-
runner.ProvisionPayload = newRunner.ProvisionPayload;
748-
749-
750-
runner.Lifecycle.Add(new RunnerLifecycle
735+
736+
737+
Machine newRunner;
738+
int retryAttempt = 0;
739+
const int maxRetries = 1;
740+
const int retryDelayMs = 1000; // 1 second delay between retries
741+
742+
while (retryAttempt <= maxRetries)
751743
{
752-
Status = RunnerStatus.Created,
753-
EventTimeUtc = DateTime.UtcNow,
754-
Event = $"New Runner {newRunner.Name} [{runner.Size} on {runner.Arch}] entering pool for {targetName}."
755-
});
744+
745+
try
746+
{
747+
newRunner = await cc.CreateNewRunner(runner.Arch, runner.Size, rt.RunnerToken, targetName, runner.IsCustom, runner.Profile);
748+
_logger.LogInformation($"New Runner {newRunner.Name} [{runner.Size} on {runner.Arch}] entering pool for {targetName}.");
749+
MachineCreatedCount.Labels(runner.Owner, runner.Size).Inc();
750+
751+
runner.Hostname = newRunner.Name;
752+
runner.IsOnline = true;
753+
runner.CloudServerId = newRunner.Id;
754+
runner.IPv4 = newRunner.Ipv4;
755+
runner.Cloud = cc.CloudIdentifier;
756+
runner.ProvisionId = newRunner.ProvisionId;
757+
runner.ProvisionPayload = newRunner.ProvisionPayload;
758+
runner.Lifecycle.Add(new RunnerLifecycle
759+
{
760+
Status = RunnerStatus.Created,
761+
EventTimeUtc = DateTime.UtcNow,
762+
Event = $"New Runner {newRunner.Name} [{runner.Size} on {runner.Arch}] entering pool for {targetName}."
763+
});
764+
break;
765+
}
766+
catch (Exception ex)
767+
{
768+
if (retryAttempt == maxRetries)
769+
{
770+
_logger.LogError(ex, $"Failed to create runner after {maxRetries + 1} attempts");
771+
throw; // Re-throw the exception after all retries are exhausted
772+
}
773+
774+
_logger.LogWarning(ex, $"Failed to create runner (attempt {retryAttempt + 1}/{maxRetries + 1}). Retrying...");
775+
await Task.Delay(retryDelayMs);
776+
retryAttempt++;
777+
}
778+
}
756779
await db.SaveChangesAsync();
757780

758781
return true;
759782
}
760783
catch (Exception ex)
761784
{
762-
_logger.LogError($"Unable to create runner [{runner.Size} on {runner.Arch} | Retry: {rt.RetryCount}]: {ex.Message}");
785+
_logger.LogError($"Unable to create runner [{runner.Size} on {runner.Arch} | Retry: {rt.RetryCount}]: {ex.GetFullExceptionDetails()}");
763786
runner.Lifecycle.Add(new RunnerLifecycle
764787
{
765788
Status = RunnerStatus.Failure,
@@ -789,7 +812,7 @@ private async Task<bool> CreateRunner(CreateRunnerTask rt)
789812
{
790813
Cloud = cc.CloudIdentifier,
791814
Size = runner.Size,
792-
UnbanTime = DateTime.UtcNow + TimeSpan.FromMinutes(30)
815+
UnbanTime = DateTime.UtcNow + TimeSpan.FromMinutes(10)
793816
});
794817

795818
return false;

0 commit comments

Comments
 (0)