| title | Task 01: Infrastructure Health Validation |
|---|---|
| sidebar_label | Task 01: Infrastructure Health |
| sidebar_position | 1 |
| description | Comprehensive infrastructure health validation including Test-Cluster, Health Service, and Arc connectivity |
DOCUMENT CATEGORY: Runbook | SCOPE: Infrastructure health validation | PURPOSE: Validate cluster infrastructure health before performance testing | MASTER REFERENCE: Microsoft Learn - Health Service
Status: Active
This step performs comprehensive infrastructure validation before proceeding to performance testing. All validation results are captured in a report for the customer handover package.
- [ ] Azure Local cluster deployed and accessible
- [ ] Administrative credentials available
- [ ] PowerShell 5.1+ or PowerShell 7
Validation results are saved to:
\\<ClusterName>\ClusterStorage$\Collect\validation-reports\01-infrastructure-health-report-YYYYMMDD.txt
# Initialize variables shared by every validation step below.
$ClusterName = (Get-Cluster).Name
$DateStamp = Get-Date -Format "yyyyMMdd"
$ReportPath = "C:\ClusterStorage\Collect\validation-reports"
$ReportFile = "$ReportPath\01-infrastructure-health-report-$DateStamp.txt"

# Create the report directory if it does not exist.
# Out-Null discards the directory object New-Item emits so it does not clutter the console.
if (-not (Test-Path $ReportPath)) {
    New-Item -Path $ReportPath -ItemType Directory -Force | Out-Null
}

# Initialize report: Out-File (without -Append) starts the file with the header;
# every later step appends to it with Add-Content.
$ReportHeader = @"
================================================================================
INFRASTRUCTURE HEALTH VALIDATION REPORT
================================================================================
Cluster: $ClusterName
Date: $(Get-Date -Format "yyyy-MM-dd HH:mm:ss")
Generated By: $(whoami)
================================================================================
"@
$ReportHeader | Out-File -FilePath $ReportFile -Encoding UTF8

# Heading for the cluster node status section of the report.
"CLUSTER NODE STATUS" | Add-Content $ReportFile
"="*40 | Add-Content $ReportFile
# Record the state of every cluster node in the report.
$Nodes = Get-ClusterNode | Select-Object Name, State, DrainStatus, DynamicWeight
$Nodes | Format-Table -AutoSize | Out-String | Add-Content $ReportFile

# Check for any nodes not in "Up" state
$FailedNodes = $Nodes | Where-Object { $_.State -ne "Up" }
if ($FailedNodes) {
    "WARNING: Nodes not in Up state:" | Add-Content $ReportFile
    $FailedNodes | Format-Table | Out-String | Add-Content $ReportFile
} else {
    "All nodes are Up and healthy." | Add-Content $ReportFile
}

# Heading for the Test-Cluster section of the report.
"`n" + "="*40 | Add-Content $ReportFile
"TEST-CLUSTER VALIDATION" | Add-Content $ReportFile
"="*40 | Add-Content $ReportFile
# Run comprehensive cluster validation
$TestClusterReport = "$ReportPath\TestClusterReport-$DateStamp.html"

# Run Test-Cluster (storage tests skipped as cluster is operational).
# "4>&1" merges the verbose stream into the output stream so the progress
# messages are captured in the text report as well.
Test-Cluster -Include "Cluster Configuration","Inventory","Network","System Configuration" `
    -ReportName $TestClusterReport `
    -Verbose 4>&1 | Out-String | Add-Content $ReportFile

# NOTE(review): Test-Cluster may append its own extension to -ReportName;
# confirm the file on disk matches the path logged below.
"Test-Cluster HTML report saved to: $TestClusterReport" | Add-Content $ReportFile

# Heading for the cluster resources section of the report.
"`n" + "="*40 | Add-Content $ReportFile
"CLUSTER RESOURCES" | Add-Content $ReportFile
"="*40 | Add-Content $ReportFile
# Check for any resources not Online
$OfflineResources = Get-ClusterResource | Where-Object { $_.State -ne "Online" }
if ($OfflineResources) {
    "WARNING: Resources not Online:" | Add-Content $ReportFile
    $OfflineResources |
        Format-Table Name, State, OwnerGroup, ResourceType -AutoSize |
        Out-String |
        Add-Content $ReportFile
} else {
    "All cluster resources are Online." | Add-Content $ReportFile
}

# List critical resources (filtered by resource type)
"Critical Cluster Resources:" | Add-Content $ReportFile
Get-ClusterResource |
    Where-Object { $_.ResourceType -in "IP Address","Network Name","Cluster Shared Volume","Virtual Machine" } |
    Format-Table Name, State, OwnerGroup, OwnerNode -AutoSize |
    Out-String |
    Add-Content $ReportFile

# Quorum configuration
"`nCLUSTER QUORUM" | Add-Content $ReportFile
Get-ClusterQuorum | Format-List | Out-String | Add-Content $ReportFile

# Heading for the storage health section of the report.
"`n" + "="*40 | Add-Content $ReportFile
"STORAGE HEALTH" | Add-Content $ReportFile
"="*40 | Add-Content $ReportFile
# Storage pools (the primordial pool is excluded)
"Storage Pools:" | Add-Content $ReportFile
Get-StoragePool |
    Where-Object { $_.IsPrimordial -eq $false } |
    Format-Table FriendlyName, HealthStatus, OperationalStatus, Size, AllocatedSize -AutoSize |
    Out-String |
    Add-Content $ReportFile

# Virtual disks
"Virtual Disks:" | Add-Content $ReportFile
Get-VirtualDisk |
    Format-Table FriendlyName, HealthStatus, OperationalStatus, Size, FootprintOnPool, ResiliencySettingName -AutoSize |
    Out-String |
    Add-Content $ReportFile

# Physical disks: only a count per health status is reported
"Physical Disks (Summary):" | Add-Content $ReportFile
Get-PhysicalDisk |
    Group-Object HealthStatus |
    Format-Table Name, Count -AutoSize |
    Out-String |
    Add-Content $ReportFile

# Health Service faults
"`nHEALTH SERVICE FAULTS" | Add-Content $ReportFile
$HealthFaults = Get-HealthFault
if ($HealthFaults) {
    "Active Health Faults:" | Add-Content $ReportFile
    $HealthFaults |
        Format-Table FaultType, FaultingObjectDescription, Reason -AutoSize -Wrap |
        Out-String |
        Add-Content $ReportFile
} else {
    "No active health faults detected." | Add-Content $ReportFile
}

# Heading for the Cluster Shared Volumes listing.
"`nCLUSTER SHARED VOLUMES" | Add-Content $ReportFile
# Build one summary row per Cluster Shared Volume and append the table to the report.
Get-ClusterSharedVolume | ForEach-Object {
    $CSV = $_
    $Partition = $CSV.SharedVolumeInfo.Partition
    $SizeBytes = $Partition.Size
    $FreeBytes = $Partition.FreeSpace

    # Guard against a missing or zero partition size: dividing by it would throw
    # and the volume would silently disappear from the table.
    $UsedPercent = if ($SizeBytes) {
        [math]::Round((1 - ($FreeBytes / $SizeBytes)) * 100, 1)
    } else {
        $null
    }

    [PSCustomObject]@{
        Name        = $CSV.Name
        State       = $CSV.State
        OwnerNode   = $CSV.OwnerNode.Name
        Path        = $CSV.SharedVolumeInfo.FriendlyVolumeName
        SizeGB      = [math]::Round($SizeBytes / 1GB, 2)
        FreeGB      = [math]::Round($FreeBytes / 1GB, 2)
        UsedPercent = $UsedPercent
    }
} | Format-Table -AutoSize | Out-String | Add-Content $ReportFile

# Heading for the Azure Arc connectivity section of the report.
"`n" + "="*40 | Add-Content $ReportFile
"AZURE ARC CONNECTIVITY" | Add-Content $ReportFile
"="*40 | Add-Content $ReportFile
# Query the Azure Connected Machine agent on every node and record its status.
$Nodes = (Get-ClusterNode).Name
foreach ($Node in $Nodes) {
    "Node: $Node" | Add-Content $ReportFile
    $ArcStatus = Invoke-Command -ComputerName $Node -ScriptBlock {
        & "$env:ProgramFiles\AzureConnectedMachineAgent\azcmagent.exe" show 2>&1
    }

    # Parse key values. For each label, take the first matching line and keep
    # everything after the FIRST colon. This preserves values that themselves
    # contain colons (timestamps) and behaves the same on Windows PowerShell 5.1
    # and PowerShell 7. A missing line yields "Unknown" instead of a
    # null-method-call error.
    $ArcFields = @{}
    foreach ($Label in "Agent version", "Agent Status", "Last Heartbeat") {
        $MatchedLine = $ArcStatus | Select-String $Label | Select-Object -First 1
        if ($MatchedLine -and $MatchedLine.Line -match '^[^:]*:\s*(.*)$') {
            $ArcFields[$Label] = $Matches[1].Trim()
        } else {
            $ArcFields[$Label] = "Unknown"
        }
    }
    $AgentVersion = $ArcFields["Agent version"]
    $ConnStatus = $ArcFields["Agent Status"]
    $LastHeartbeat = $ArcFields["Last Heartbeat"]

    " Agent Version: $AgentVersion" | Add-Content $ReportFile
    " Connection Status: $ConnStatus" | Add-Content $ReportFile
    " Last Heartbeat: $LastHeartbeat" | Add-Content $ReportFile
    "" | Add-Content $ReportFile
}

# Azure Local registration status
"AZURE LOCAL REGISTRATION" | Add-Content $ReportFile
$AzureLocalReg = Get-AzureStackHCI
$AzureLocalReg |
    Format-List ClusterStatus, RegistrationStatus, ConnectionStatus, LastConnected |
    Out-String |
    Add-Content $ReportFile

# Heading for the event log review section of the report.
"`n" + "="*40 | Add-Content $ReportFile
"EVENT LOG REVIEW (CRITICAL/ERROR - LAST 24 HOURS)" | Add-Content $ReportFile
"="*40 | Add-Content $ReportFile
# Collect up to 50 Critical/Error events from the last 24 hours.
# Get-WinEvent is called without -ComputerName, so only the machine running
# this script is queried.
$StartTime = (Get-Date).AddHours(-24)
$CriticalEvents = Get-WinEvent -FilterHashtable @{
    LogName   = 'System','Application'
    Level     = 1,2 # Critical and Error
    StartTime = $StartTime
} -MaxEvents 50 -ErrorAction SilentlyContinue

if ($CriticalEvents) {
    # Top 10 event sources by number of events
    $CriticalEvents |
        Group-Object ProviderName |
        Sort-Object Count -Descending |
        Select-Object -First 10 |
        Format-Table @{N='Source';E={$_.Name}}, Count -AutoSize |
        Out-String |
        Add-Content $ReportFile

    "Recent Critical/Error Events:" | Add-Content $ReportFile
    $CriticalEvents |
        Select-Object -First 20 TimeCreated, ProviderName, Id, Message |
        Format-Table -Wrap |
        Out-String |
        Add-Content $ReportFile
} else {
    "No critical or error events in the last 24 hours." | Add-Content $ReportFile
}

# Number of nodes currently Up, used by the summary.
# @() forces an array so .Count is reliable for zero or one result.
$NodeCount = @(Get-ClusterNode | Where-Object State -eq "Up").Count
# Gather the remaining counters for the summary.
# @() forces each result into an array so .Count is well defined even when a
# cmdlet returns nothing or a single object.
$TotalNodes = @(Get-ClusterNode).Count
$HealthFaultCount = @(Get-HealthFault).Count
$OfflineResourceCount = @(Get-ClusterResource | Where-Object State -ne "Online").Count
$ArcStatus = (Get-AzureStackHCI).ConnectionStatus

# Overall status is PASS only when all nodes are Up, there are no health faults
# and no resources are offline; anything else is flagged for review.
$Summary = @"
================================================================================
INFRASTRUCTURE HEALTH SUMMARY
================================================================================
Validation Category Status
------------------------------- --------
Cluster Nodes $NodeCount of $TotalNodes Up
Cluster Resources $(if($OfflineResourceCount -eq 0){"All Online"}else{"$OfflineResourceCount Offline"})
Storage Health Faults $(if($HealthFaultCount -eq 0){"None"}else{"$HealthFaultCount Active"})
Azure Arc Connection $ArcStatus
Test-Cluster See HTML Report
OVERALL STATUS: $(if($NodeCount -eq $TotalNodes -and $HealthFaultCount -eq 0 -and $OfflineResourceCount -eq 0){"PASS"}else{"REVIEW REQUIRED"})
================================================================================
Report saved to: $ReportFile
================================================================================
"@
$Summary | Add-Content $ReportFile
Write-Host $Summary

| Category | Requirement | Status |
|---|---|---|
| Nodes | All nodes in "Up" state | ☐ |
| Nodes | No nodes in drain status | ☐ |
| Resources | All cluster resources online | ☐ |
| Quorum | Quorum established | ☐ |
| Storage | All storage pools healthy | ☐ |
| Storage | All virtual disks healthy | ☐ |
| Storage | No active health faults | ☐ |
| CSV | All CSVs online | ☐ |
| Arc | All nodes connected | ☐ |
| Events | No critical events (last 24h) | ☐ |
# Check physical connectivity and restart cluster service.
# Restart-Service has no -ComputerName parameter, so both commands run on the
# node through Invoke-Command.
Invoke-Command -ComputerName <NodeName> -ScriptBlock { Get-Service -Name ClusSvc }
Invoke-Command -ComputerName <NodeName> -ScriptBlock { Restart-Service -Name ClusSvc }

# Check for failed physical disks
Get-PhysicalDisk | Where-Object HealthStatus -ne "Healthy"

# Repair virtual disk
Repair-VirtualDisk -FriendlyName <VirtualDiskName>

# Reconnect Arc agent
azcmagent connect --resource-group <RG> --tenant-id <TenantID> --subscription-id <SubID>

Proceed to Task 2: VMFleet Storage Testing once infrastructure validation passes.