diff --git a/tracer/src/Datadog.Trace/AgentProcessManager.cs b/tracer/src/Datadog.Trace/AgentProcessManager.cs index 70978ce7fb9a..00b7f69a869c 100644 --- a/tracer/src/Datadog.Trace/AgentProcessManager.cs +++ b/tracer/src/Datadog.Trace/AgentProcessManager.cs @@ -7,6 +7,7 @@ using System.Collections.Generic; using System.Diagnostics; using System.IO; +using System.Text; using System.Threading.Tasks; using Datadog.Trace.Configuration; using Datadog.Trace.Configuration.Telemetry; @@ -45,6 +46,8 @@ internal static class AgentProcessManager private static readonly IDatadogLogger Log = DatadogLogging.GetLoggerFor(typeof(AgentProcessManager)); + private static readonly Lazy JobObject = new Lazy(JobObjectInterop.TryCreate); + internal enum ProcessState { NeverChecked, @@ -102,6 +105,19 @@ public static void Initialize() if (Processes.Count > 0) { + Log.Information( + "[aas-repro] init iis_worker_pid={Pid} app_domain={AppDomain} process_name={ProcessName} trace_pipe={TracePipe} stats_pipe={StatsPipe} trace_agent_path={TraceAgentPath}", + new object[] + { +#pragma warning disable CA1837 + System.Diagnostics.Process.GetCurrentProcess().Id, +#pragma warning restore CA1837 + DomainMetadata.Instance.AppDomainName, + DomainMetadata.Instance.ProcessName, + TraceAgentMetadata.PipeName ?? "", + DogStatsDMetadata.PipeName ?? "", + TraceAgentMetadata.ProcessPath ?? "" + }); Log.Debug("Starting {Count} child processes from process {ProcessName}, AppDomain {AppDomain}.", Processes.Count, DomainMetadata.Instance.ProcessName, DomainMetadata.Instance.AppDomainName); StartProcesses(azureAppServiceSettings); } @@ -161,6 +177,9 @@ private static Task StartProcessWithKeepAlive(ProcessMetadata metadata) { if (metadata.SequentialFailures >= MaxFailures) { + Log.Error( + "[aas-repro] keep_alive_terminating process={Process} reason=max_failures sequential_failures={Count} max={Max} last_exit_code={LastExitCode}", + new object[] { path, metadata.SequentialFailures, MaxFailures, metadata.LastExitCode?.ToString() ?? "" }); Log.Error("Maximum retries ({ErrorCount}) reached starting {Process}.", path, MaxFailures); metadata.ProcessState = ProcessState.Faulted; return; @@ -172,56 +191,81 @@ private static Task StartProcessWithKeepAlive(ProcessMetadata metadata) { // This means we have never tried from this domain - if (metadata.ProcessIsHealthy()) - { - // This means one of two things: - // - Another domain has already started this process - // - The previous instance is in the process of shutting down - // It is possible that if a pipe is bound it may yet be cleaned up from a previous shutdown + var pipeAtEntry = metadata.NamedPipeIsBoundDiagnostic(); + var pidsAtEntry = metadata.GetMatchingProcessesDiagnostic(); + Log.Information( + "[aas-repro] never_checked process={Process} pipe_bound={PipeBound} matching_pids={Pids}", + path, + pipeAtEntry, + pidsAtEntry); + if (pipeAtEntry || pidsAtEntry.Length > 0) + { // Assume healthy to start, but look for problems metadata.ProcessState = ProcessState.Healthy; - // Check on a delay to be sure we keep the process available after a shutdown var attempts = 7; var delay = 50d; + var initialAttempts = attempts; while (--attempts > 0) { await Task.Delay((int)delay).ConfigureAwait(false); - if (metadata.ProcessIsHealthy()) + var pipeNow = metadata.NamedPipeIsBoundDiagnostic(); + var pidsNow = metadata.GetMatchingProcessesDiagnostic(); + Log.Information( + "[aas-repro] never_checked_retry process={Process} attempt={Attempt} delay_ms={Delay} pipe_bound={PipeBound} matching_pids={Pids}", + new object[] { path, initialAttempts - attempts, (int)delay, pipeNow, pidsNow }); + + if (pipeNow || pidsNow.Length > 0) { - // Should result in a max delay of ~3.28 seconds before giving up delay = delay * 1.75d; continue; } - // The previous instance is gone, time to start the process + Log.Information("[aas-repro] recovering process={Process} reason=previous_instance_gone", path); Log.Information("Recovering from previous {Process} shutdown. Ready for start.", path); metadata.ProcessState = ProcessState.ReadyToStart; break; } + + if (metadata.ProcessState == ProcessState.Healthy) + { + // BAD PATH: 7 retries exhausted, pipe-or-process still present. + // No new datadog-trace-agent.exe will be started for this IIS Worker Process. + Log.Warning( + "[aas-repro] never_checked_exhausted process={Process} state=Healthy_no_new_agent pipe_bound={PipeBound} matching_pids={Pids}", + path, + metadata.NamedPipeIsBoundDiagnostic(), + metadata.GetMatchingProcessesDiagnostic()); + } } else { - // If no instance exists, just get ready to start + Log.Information("[aas-repro] never_checked_clear process={Process} ready_to_start=true", path); metadata.ProcessState = ProcessState.ReadyToStart; } } else if (metadata.ProcessState == ProcessState.Healthy || metadata.ProcessState == ProcessState.Faulted) { - // This means we have tried to start from this domain before - metadata.ProcessState = metadata.ProcessIsHealthy() ? ProcessState.Healthy : ProcessState.ReadyToStart; + var pipeBound = metadata.NamedPipeIsBoundDiagnostic(); + var pids = metadata.GetMatchingProcessesDiagnostic(); + var healthyNow = pipeBound || pids.Length > 0; + Log.Information( + "[aas-repro] steady_poll process={Process} prev_state={Prev} pipe_bound={PipeBound} matching_pids={Pids} next_state={Next}", + new object[] { path, metadata.ProcessState, pipeBound, pids, healthyNow ? "Healthy" : "ReadyToStart" }); + metadata.ProcessState = healthyNow ? ProcessState.Healthy : ProcessState.ReadyToStart; } if (metadata.ProcessState == ProcessState.ReadyToStart) { + Log.Information("[aas-repro] attempting_start process={Process}", path); Log.Information("Attempting to start {Process}.", path); var startInfo = new ProcessStartInfo { - FileName = path, UseShellExecute = false // Force consistency in behavior between Framework and Core + FileName = path, UseShellExecute = false }; if (!string.IsNullOrWhiteSpace(metadata.ProcessArguments)) @@ -230,21 +274,34 @@ private static Task StartProcessWithKeepAlive(ProcessMetadata metadata) } metadata.Process = Process.Start(startInfo); + var spawnedPid = metadata.Process?.Id ?? -1; + Log.Information("[aas-repro] spawned process={Process} child_pid={ChildPid}", new object[] { path, spawnedPid }); + + if (metadata.Process is { } proc && !proc.HasExited) + { + JobObjectInterop.AssignAndLog(path, proc); + } + var timeout = 2000; while (timeout > 0) { - // Loop and watch for evidence that the process is up and running if (metadata.ProcessIsHealthy()) { metadata.SequentialFailures = 0; metadata.ProcessState = ProcessState.Healthy; + Log.Information("[aas-repro] start_success process={Process} child_pid={ChildPid} after_ms={Elapsed}", new object[] { path, spawnedPid, 2000 - timeout }); Log.Information("Successfully started {Process}.", path); break; } if (metadata.Process == null || metadata.Process.HasExited) { + var exitCode = metadata.Process?.ExitCode; + metadata.LastExitCode = exitCode; + Log.Error( + "[aas-repro] start_failed process={Process} child_pid={ChildPid} exit_code={ExitCode} after_ms={Elapsed}", + new object[] { path, spawnedPid, exitCode?.ToString() ?? "", 2000 - timeout }); Log.Error("Failed to start {Process}.", path); metadata.ProcessState = ProcessState.Faulted; break; @@ -265,7 +322,9 @@ private static Task StartProcessWithKeepAlive(ProcessMetadata metadata) if (metadata.ProcessState == ProcessState.Faulted) { metadata.SequentialFailures++; - // Quicker retry in these cases + Log.Warning( + "[aas-repro] sequential_failure process={Process} count={Count} max={Max} last_exit_code={LastExitCode}", + new object[] { path, metadata.SequentialFailures, MaxFailures, metadata.LastExitCode?.ToString() ?? "" }); await Task.Delay(ExceptionRetryInterval).ConfigureAwait(false); } else @@ -282,6 +341,9 @@ private static Task StartProcessWithKeepAlive(ProcessMetadata metadata) } finally { + Log.Warning( + "[aas-repro] keep_alive_dropped process={Process} state={State} sequential_failures={Count} last_exit_code={LastExitCode}", + new object[] { path, metadata.ProcessState, metadata.SequentialFailures, metadata.LastExitCode?.ToString() ?? "" }); Log.Warning("Keep alive is dropping for {Process}.", path); metadata.IsBeingManaged = false; } @@ -307,6 +369,8 @@ internal sealed class ProcessMetadata public int SequentialFailures { get; set; } + public int? LastExitCode { get; set; } + public ProcessState ProcessState { get; set; } = ProcessState.NeverChecked; public string ProcessPath @@ -329,6 +393,60 @@ public bool ProcessIsHealthy() return NamedPipeIsBound() || ProgramIsRunning(); } + // Always logs the pipe-check outcome (true or false), unlike NamedPipeIsBound which only logs false. + internal bool NamedPipeIsBoundDiagnostic() + { + var namedPipe = $"\\\\.\\pipe\\{PipeName}"; + var bound = File.Exists(namedPipe); + Log.Information("[aas-repro] pipe_check process={Process} pipe={Pipe} bound={Bound}", ProcessPath, namedPipe, bound); + return bound; + } + + // Returns a comma-separated "pid@HH:mm:ss.fff" string for all matching processes. + // Returns empty string when no processes match. + internal string GetMatchingProcessesDiagnostic() + { + try + { + if (string.IsNullOrWhiteSpace(ProcessPath)) + { + return string.Empty; + } + + var fileName = Path.GetFileNameWithoutExtension(ProcessPath); + var processesByName = Process.GetProcessesByName(fileName); + + if (processesByName.Length == 0) + { + Log.Information("[aas-repro] process_check process={Process} name={Name} count=0", ProcessPath, fileName); + return string.Empty; + } + + var sb = new StringBuilder(); + for (var i = 0; i < processesByName.Length; i++) + { + if (i > 0) { sb.Append(','); } + try + { + sb.Append(processesByName[i].Id).Append('@').Append(processesByName[i].StartTime.ToString("HH:mm:ss.fff")); + } + catch + { + sb.Append(processesByName[i].Id).Append("@"); + } + } + + var pidString = sb.ToString(); + Log.Information("[aas-repro] process_check process={Process} name={Name} count={Count} pids={Pids}", new object[] { ProcessPath, fileName, processesByName.Length, pidString }); + return pidString; + } + catch (Exception ex) + { + Log.Warning(ex, "[aas-repro] process_check_error process={Process}", ProcessPath); + return string.Empty; + } + } + private bool ProgramIsRunning() { try @@ -379,5 +497,152 @@ private bool NamedPipeIsBound() return namedPipeIsBound; } } + + private static class JobObjectInterop + { + private const uint JobObjectLimitKillOnJobClose = 0x2000; + private const int JobObjectExtendedLimitInformationClass = 9; + + [System.Runtime.InteropServices.DllImport("kernel32.dll", CharSet = System.Runtime.InteropServices.CharSet.Unicode, SetLastError = true)] + private static extern IntPtr CreateJobObjectW(IntPtr lpJobAttributes, string lpName); + + [System.Runtime.InteropServices.DllImport("kernel32.dll", SetLastError = true)] + [return: System.Runtime.InteropServices.MarshalAs(System.Runtime.InteropServices.UnmanagedType.Bool)] + private static extern bool SetInformationJobObject(IntPtr hJob, int jobObjectInfoClass, ref JobObjectExtendedLimitInformation lpJobObjectInfo, int cbJobObjectInfoLength); + + [System.Runtime.InteropServices.DllImport("kernel32.dll", SetLastError = true)] + [return: System.Runtime.InteropServices.MarshalAs(System.Runtime.InteropServices.UnmanagedType.Bool)] + private static extern bool AssignProcessToJobObject(IntPtr hJob, IntPtr hProcess); + + [System.Runtime.InteropServices.DllImport("kernel32.dll", SetLastError = true)] + [return: System.Runtime.InteropServices.MarshalAs(System.Runtime.InteropServices.UnmanagedType.Bool)] + private static extern bool CloseHandle(IntPtr hObject); + + public static IntPtr TryCreate() + { + if (Environment.OSVersion.Platform != PlatformID.Win32NT) + { + return IntPtr.Zero; + } + + try + { + var handle = CreateJobObjectW(IntPtr.Zero, null); + if (handle == IntPtr.Zero) + { + var err = System.Runtime.InteropServices.Marshal.GetLastWin32Error(); + Log.Warning("[aas-repro] job_object_create_failed step=create win32_error={Err}", new object[] { err }); + return IntPtr.Zero; + } + + var info = default(JobObjectExtendedLimitInformation); + info.BasicLimitInformation.LimitFlags = JobObjectLimitKillOnJobClose; + + var size = System.Runtime.InteropServices.Marshal.SizeOf(typeof(JobObjectExtendedLimitInformation)); + if (!SetInformationJobObject(handle, JobObjectExtendedLimitInformationClass, ref info, size)) + { + var err = System.Runtime.InteropServices.Marshal.GetLastWin32Error(); + Log.Warning("[aas-repro] job_object_create_failed step=set_info win32_error={Err}", new object[] { err }); + CloseHandle(handle); + return IntPtr.Zero; + } + + Log.Information("[aas-repro] job_object_create handle={Handle} kill_on_close=true", new object[] { handle.ToInt64().ToString("X") }); + return handle; + } + catch (Exception ex) + { + Log.Warning(ex, "[aas-repro] job_object_create_failed step=exception"); + return IntPtr.Zero; + } + } + + public static void AssignAndLog(string path, Process process) + { + var pid = -1; + try + { + pid = process.Id; + } + catch + { + // process may already be exiting + } + + IntPtr jobHandle; + try + { + jobHandle = JobObject.Value; + } + catch (Exception ex) + { + Log.Warning(ex, "[aas-repro] job_object_assign process={Process} child_pid={Pid} result=failed reason=lazy_init_threw", new object[] { path, pid }); + return; + } + + if (jobHandle == IntPtr.Zero) + { + Log.Warning("[aas-repro] job_object_assign process={Process} child_pid={Pid} result=skipped reason=no_job_handle", new object[] { path, pid }); + return; + } + + IntPtr processHandle; + try + { + processHandle = process.Handle; + } + catch (Exception ex) + { + Log.Warning(ex, "[aas-repro] job_object_assign process={Process} child_pid={Pid} result=failed reason=handle_unavailable", new object[] { path, pid }); + return; + } + + if (AssignProcessToJobObject(jobHandle, processHandle)) + { + Log.Information("[aas-repro] job_object_assign process={Process} child_pid={Pid} result=success", new object[] { path, pid }); + } + else + { + var err = System.Runtime.InteropServices.Marshal.GetLastWin32Error(); + Log.Warning("[aas-repro] job_object_assign process={Process} child_pid={Pid} result=failed win32_error={Err}", new object[] { path, pid, err }); + } + } + + [System.Runtime.InteropServices.StructLayout(System.Runtime.InteropServices.LayoutKind.Sequential)] + private struct IoCounters + { + public ulong ReadOperationCount; + public ulong WriteOperationCount; + public ulong OtherOperationCount; + public ulong ReadTransferCount; + public ulong WriteTransferCount; + public ulong OtherTransferCount; + } + + [System.Runtime.InteropServices.StructLayout(System.Runtime.InteropServices.LayoutKind.Sequential)] + private struct JobObjectBasicLimitInformation + { + public long PerProcessUserTimeLimit; + public long PerJobUserTimeLimit; + public uint LimitFlags; + public UIntPtr MinimumWorkingSetSize; + public UIntPtr MaximumWorkingSetSize; + public uint ActiveProcessLimit; + public UIntPtr Affinity; + public uint PriorityClass; + public uint SchedulingClass; + } + + [System.Runtime.InteropServices.StructLayout(System.Runtime.InteropServices.LayoutKind.Sequential)] + private struct JobObjectExtendedLimitInformation + { + public JobObjectBasicLimitInformation BasicLimitInformation; + public IoCounters IoInfo; + public UIntPtr ProcessMemoryLimit; + public UIntPtr JobMemoryLimit; + public UIntPtr PeakProcessMemoryUsed; + public UIntPtr PeakJobMemoryUsed; + } + } } }