diff --git a/cmd/gpu-kubelet-plugin/allocatable.go b/cmd/gpu-kubelet-plugin/allocatable.go
index 3c37542b6..fce548ac9 100644
--- a/cmd/gpu-kubelet-plugin/allocatable.go
+++ b/cmd/gpu-kubelet-plugin/allocatable.go
@@ -50,6 +50,11 @@ type AllocatableDevice struct {
 	MigDynamic *MigSpec
 	MigStatic  *MigDeviceInfo
 	Vfio       *VfioDeviceInfo
+
+	// taints holds DRA device taints set by the health monitor. Published
+	// as part of the device's ResourceSlice entry so the scheduler and
+	// kubelet honour them per KEP-5055.
+	taints []resourceapi.DeviceTaint
 }
 
 func (d AllocatableDevice) Type() string {
@@ -145,22 +150,6 @@ func (d *AllocatableDevice) GetGPUPCIBusID() string {
 	panic("unexpected type for AllocatableDevice")
 }
 
-func (d *AllocatableDevice) IsHealthy() bool {
-	switch d.Type() {
-	case GpuDeviceType:
-		return d.Gpu.health == Healthy
-	case MigStaticDeviceType:
-		// TODO: review -- what about the parent?
-		return d.MigStatic.health == Healthy
-	case MigDynamicDeviceType:
-		// TODOMIG: For now, pretend health -- this device maybe hasn't
-		// manifested yet. Or has it? We could adopt the health status of the
-		// parent, but that's also not meaningful I think.
-		return true
-	}
-	panic("unexpected type for AllocatableDevice")
-}
-
 func (d AllocatableDevices) GetGPUs() []*AllocatableDevice {
 	var devices []*AllocatableDevice
 	for _, device := range d {
@@ -324,3 +313,36 @@ func (d *PerGPUAllocatableDevices) RemoveSiblingDevices(device *AllocatableDevic
 		}
 	}
 }
+
+// Taints returns a copy of the device's taints to prevent data races
+// when being read concurrently by the ResourceSlice builder.
+func (d *AllocatableDevice) Taints() []resourceapi.DeviceTaint {
+	return slices.Clone(d.taints)
+}
+
+// AddOrUpdateTaint adds a new taint or updates an existing one with the same
+// key. The value and effect are always updated to the latest event received.
+// Meaning, if a device receives multiple events for the same taint dimension
+// (e.g., XID 48 followed by XID 63), the value is overwritten and only the most recent event data is retained.
+// Returns true if the taint set was modified.
+func (d *AllocatableDevice) AddOrUpdateTaint(taint *resourceapi.DeviceTaint) bool {
+	for i, existing := range d.taints {
+		if existing.Key == taint.Key {
+
+			// 1. If nothing actually changed, exit early to avoid API calls
+			if existing.Value == taint.Value && existing.Effect == taint.Effect {
+				return false
+			}
+
+			// 2. Otherwise, update the fields and the timestamp
+			d.taints[i].Value = taint.Value
+			d.taints[i].Effect = taint.Effect
+			d.taints[i].TimeAdded = nil // reset timestamp for the API server
+			return true
+		}
+	}
+
+	// 3. Key doesn't exist yet, append the dereferenced struct
+	d.taints = append(d.taints, *taint)
+	return true
+}
diff --git a/cmd/gpu-kubelet-plugin/device_health.go b/cmd/gpu-kubelet-plugin/device_health.go
index 3f9a1a01e..76b13f666 100644
--- a/cmd/gpu-kubelet-plugin/device_health.go
+++ b/cmd/gpu-kubelet-plugin/device_health.go
@@ -24,6 +24,7 @@ import (
 	"sync"
 
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	resourceapi "k8s.io/api/resource/v1"
 	"k8s.io/klog/v2"
 )
 
@@ -31,6 +32,77 @@ const (
 	FullGPUInstanceID uint32 = 0xFFFFFFFF
 )
 
+const (
+	TaintKeyXID         = DriverName + "/xid"
+	TaintKeyGPULost     = DriverName + "/gpu-lost"
+	TaintKeyUnmonitored = DriverName + "/unmonitored"
+)
+
+// TODO(issue #1001):Remove this hardcoded constant and switch to the upstream
+// resourceapi.DeviceTaintEffectNone once the k8s.io/api dependency is bumped
+// to a version that includes it (KEP-5055 beta).
+// DeviceTaintEffectNone is an informational effect that does not affect
+// scheduling or eviction.
+const DeviceTaintEffectNone resourceapi.DeviceTaintEffect = "None"
+
+// DeviceHealthEventType classifies the category of health event detected by
+// the NVML health monitor.
+type DeviceHealthEventType string
+
+const (
+	HealthEventXID         DeviceHealthEventType = "xid"
+	HealthEventGPULost     DeviceHealthEventType = "gpu-lost"
+	HealthEventUnmonitored DeviceHealthEventType = "unmonitored"
+)
+
+// DeviceHealthEvent carries a typed health notification from the NVML health
+// monitor to the driver's event handler, enabling the driver to set the
+// appropriate DRA device taint per the Option A schema (KEP-5055).
+// Devices is a batch: for GPU_LOST and unmonitored events where all affected devices
+// are aggregated into a single event so the consumer applies one ResourceSlice
+// update instead of N.
+type DeviceHealthEvent struct {
+	Devices   []*AllocatableDevice
+	EventType DeviceHealthEventType
+	// inspired by NVML Event type and only meaningful for xid errors.
+	// may have to create a custom type based on future device-api
+	EventData uint64
+}
+
+// healthEventToTaint maps a DeviceHealthEvent to the corresponding DRA
+// DeviceTaint using the Option A taint key schema: one key per health
+// dimension under the gpu.nvidia.com domain.
+func healthEventToTaint(monitor deviceHealthMonitor, event *DeviceHealthEvent) *resourceapi.DeviceTaint {
+	switch event.EventType {
+	case HealthEventXID:
+		effect := resourceapi.DeviceTaintEffectNoSchedule
+		if monitor != nil && monitor.IsEventNonFatal(event) {
+			effect = DeviceTaintEffectNone
+		}
+		return &resourceapi.DeviceTaint{
+			Key:    TaintKeyXID,
+			Value:  strconv.FormatUint(event.EventData, 10),
+			Effect: effect,
+		}
+	case HealthEventGPULost:
+		return &resourceapi.DeviceTaint{
+			Key:    TaintKeyGPULost,
+			Effect: resourceapi.DeviceTaintEffectNoSchedule,
+		}
+	case HealthEventUnmonitored:
+		return &resourceapi.DeviceTaint{
+			Key:    TaintKeyUnmonitored,
+			Effect: DeviceTaintEffectNone,
+		}
+	default:
+		klog.Errorf("Unknown health event type %q, defaulting to unmonitored taint", event.EventType)
+		return &resourceapi.DeviceTaint{
+			Key:    TaintKeyUnmonitored,
+			Effect: DeviceTaintEffectNone,
+		}
+	}
+}
+
 // For a MIG device the placement is defined by the 3-tuple <parent UUID, GI, CI>.
 // For a full device the returned 3-tuple is the device's uuid and (FullGPUInstanceID) 0xFFFFFFFF for the other two elements.
 type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice
@@ -38,7 +110,7 @@ type devicePlacementMap map[string]map[uint32]map[uint32]*AllocatableDevice
 type nvmlDeviceHealthMonitor struct {
 	nvmllib           nvml.Interface
 	eventSet          nvml.EventSet
-	unhealthy         chan *AllocatableDevice
+	unhealthy         chan *DeviceHealthEvent
 	deviceByPlacement devicePlacementMap
 	skippedXids       map[uint64]bool
 	wg                sync.WaitGroup
@@ -61,7 +133,7 @@ func newNvmlDeviceHealthMonitor(config *Config, perGPUAllocatable *PerGPUAllocat
 	all := perGPUAllocatable.GetAllDevices()
 	m := &nvmlDeviceHealthMonitor{
 		nvmllib:           nvdevlib.nvmllib,
-		unhealthy:         make(chan *AllocatableDevice, len(all)),
+		unhealthy:         make(chan *DeviceHealthEvent, len(all)),
 		deviceByPlacement: getDevicePlacementMap(all),
 		skippedXids:       xidsToSkip(config.flags.additionalXidsToIgnore),
 	}
@@ -106,25 +178,25 @@ func (m *nvmlDeviceHealthMonitor) registerEventsForDevices() {
 	for parentUUID, giMap := range m.deviceByPlacement {
 		gpu, ret := m.nvmllib.DeviceGetHandleByUUID(parentUUID)
 		if ret != nvml.SUCCESS {
-			klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking it as unhealthy", parentUUID, ret)
-			m.markAllMigDevicesUnhealthy(giMap)
+			klog.Warningf("Unable to get device handle from UUID[%s]: %v; marking devices as unmonitored", parentUUID, ret)
+			m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
 			continue
 		}
 
 		supportedEvents, ret := gpu.GetSupportedEventTypes()
 		if ret != nvml.SUCCESS {
-			klog.Warningf("unable to determine the supported events for %s: %v; marking it as unhealthy", parentUUID, ret)
-			m.markAllMigDevicesUnhealthy(giMap)
+			klog.Warningf("unable to determine the supported events for %s: %v; marking devices as unmonitored", parentUUID, ret)
+			m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
 			continue
 		}
 
 		ret = gpu.RegisterEvents(eventMask&supportedEvents, m.eventSet)
 		if ret == nvml.ERROR_NOT_SUPPORTED {
 			klog.Warningf("Device %v is too old to support healthchecking.", parentUUID)
-		}
-		if ret != nvml.SUCCESS {
-			klog.Warningf("unable to register events for %s: %v; marking it as unhealthy", parentUUID, ret)
-			m.markAllMigDevicesUnhealthy(giMap)
+			m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
+		} else if ret != nvml.SUCCESS {
+			klog.Warningf("unable to register events for %s: %v; marking devices as unmonitored", parentUUID, ret)
+			m.sendHealthEventForDevices(giMap, HealthEventUnmonitored)
 		}
 	}
 }
@@ -162,8 +234,8 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
 			// Ref doc: [https://docs.nvidia.com/deploy/nvml-api/group__nvmlEvents.html#group__nvmlEvents_1g9714b0ca9a34c7a7780f87fee16b205c].
 			if ret != nvml.SUCCESS {
 				if ret == nvml.ERROR_GPU_IS_LOST {
-					klog.Warningf("GPU is lost error: %v; Marking all devices as unhealthy", ret)
-					m.markAllDevicesUnhealthy()
+					klog.Warningf("GPU is lost error: %v; Tainting all devices with %s", ret, TaintKeyGPULost)
+					m.sendHealthEventForAllDevices(HealthEventGPULost)
 					continue
 				}
 				klog.V(6).Infof("Error waiting for NVML event: %v. Retrying...", ret)
@@ -180,19 +252,14 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
 				continue
 			}
 
-			if m.skippedXids[xid] {
-				klog.V(6).Infof("Skipping XID event: Data=%d, Type=%d, GI=%d, CI=%d", xid, eType, gi, ci)
-				continue
-			}
-
 			klog.V(4).Infof("Processing event XID=%d event", xid)
 			// this seems an extreme action.
 			// should we just log the error and proceed anyway.
 			// TODO: look into how to properly handle this error.
 			eventUUID, ret := event.Device.GetUUID()
 			if ret != nvml.SUCCESS {
-				klog.Warningf("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", event, ret)
-				m.markAllDevicesUnhealthy()
+				klog.Warningf("Failed to determine uuid for event %v: %v; Tainting all devices with %s", event, ret, TaintKeyGPULost)
+				m.sendHealthEventForAllDevices(HealthEventGPULost)
 				continue
 			}
 			affectedDevice := m.deviceByPlacement.get(eventUUID, gi, ci)
@@ -201,41 +268,65 @@ func (m *nvmlDeviceHealthMonitor) run(ctx context.Context) {
 				continue
 			}
 
-			klog.V(4).Infof("Sending unhealthy notification for device %s due to event type:%v and event data:%d", affectedDevice.UUID(), eType, xid)
-			m.unhealthy <- affectedDevice
+			klog.V(4).Infof("Sending XID=%d health event for device %s", xid, affectedDevice.UUID())
+			m.unhealthy <- &DeviceHealthEvent{
+				Devices:   []*AllocatableDevice{affectedDevice},
+				EventType: HealthEventXID,
+				EventData: xid,
+			}
 		}
 	}
 }
 
-func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *AllocatableDevice {
+func (m *nvmlDeviceHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent {
 	return m.unhealthy
 }
 
-func (m *nvmlDeviceHealthMonitor) markAllDevicesUnhealthy() {
+// sendHealthEventForAllDevices aggregates every device across all GPUs into a
+// single batched DeviceHealthEvent so the consumer makes one ResourceSlice
+// update.
+func (m *nvmlDeviceHealthMonitor) sendHealthEventForAllDevices(eventType DeviceHealthEventType) {
+	var devices []*AllocatableDevice
 	for _, giMap := range m.deviceByPlacement {
-		m.markAllMigDevicesUnhealthy(giMap)
+		devices = append(devices, flattenMIGDeviceMap(giMap)...)
 	}
+	m.sendBatchedHealthEvent(devices, eventType)
+}
+
+// sendHealthEventForDevices aggregates all devices under a single parent GPU
+// into one batched DeviceHealthEvent.
+func (m *nvmlDeviceHealthMonitor) sendHealthEventForDevices(giMap map[uint32]map[uint32]*AllocatableDevice, eventType DeviceHealthEventType) {
+	m.sendBatchedHealthEvent(flattenMIGDeviceMap(giMap), eventType)
 }
 
-// markAllMigDevicesUnhealthy is a helper function to mark every mig device under a parent as unhealthy.
-func (m *nvmlDeviceHealthMonitor) markAllMigDevicesUnhealthy(giMap map[uint32]map[uint32]*AllocatableDevice) {
+// flattenMIGDeviceMap flattens a GI→CI device map into a slice.
+func flattenMIGDeviceMap(giMap map[uint32]map[uint32]*AllocatableDevice) []*AllocatableDevice {
+	var devices []*AllocatableDevice
 	for _, ciMap := range giMap {
 		for _, dev := range ciMap {
-			// Non-blocking send to avoid deadlocks if channel is full.
-			select {
-			case m.unhealthy <- dev:
-				klog.V(6).Infof("Marked device %s as unhealthy", dev.UUID())
-			// TODO: The non-blocking send protects the health-monitor goroutine from deadlocks,
-			// but dropping an unhealthy notification means the device's health transition may
-			// never reach the consumer. Consider follow-up improvements:
-			//   - increase the channel buffer beyond len(allocatable) to reduce backpressure;
-			//   - introduce a special "all devices unhealthy" message when bulk updates occur;
-			//   - or revisit whether blocking briefly here is acceptable.
-			default:
-				klog.Errorf("Unhealthy channel full. Dropping unhealthy notification for device %s", dev.UUID())
-			}
+			devices = append(devices, dev)
 		}
 	}
+	return devices
+}
+
+// sendBatchedHealthEvent sends a single DeviceHealthEvent containing all
+// affected devices. Uses a non-blocking send to protect the monitor goroutine
+// from deadlocks when the channel is full.
+func (m *nvmlDeviceHealthMonitor) sendBatchedHealthEvent(devices []*AllocatableDevice, eventType DeviceHealthEventType) {
+	if len(devices) == 0 {
+		return
+	}
+	event := &DeviceHealthEvent{
+		Devices:   devices,
+		EventType: eventType,
+	}
+	select {
+	case m.unhealthy <- event:
+		klog.V(6).Infof("Sent batched %s health event for %d device(s)", eventType, len(devices))
+	default:
+		klog.Errorf("Health event channel full; dropping batched %s event for %d device(s)", eventType, len(devices))
+	}
 }
 
 // The purpose of this function is to allow for a O(1) lookup of
@@ -268,7 +359,7 @@ func getDevicePlacementMap(allocatable AllocatableDevices) devicePlacementMap {
 				continue
 			}
 			giID = d.MigStatic.gIInfo.Id
-			ciID = d.MigStatic.gIInfo.Id
+			ciID = d.MigStatic.cIInfo.Id
 
 		default:
 			// This may be a problem; and should be logged
@@ -304,8 +395,8 @@ func (p devicePlacementMap) get(uuid string, gi, ci uint32) *AllocatableDevice {
 }
 
 // getAdditionalXids returns a list of additional Xids to skip from the specified string.
-// The input is treaded as a comma-separated string and all valid uint64 values are considered as Xid values.
-// Invalid values nare ignored.
+// The input is treated as a comma-separated string and all valid uint64 values are considered as Xid values.
+// Invalid values are ignored.
 // TODO: add list of EXPLICIT XIDs from [https://github.com/NVIDIA/k8s-device-plugin/pull/1443].
 func getAdditionalXids(input string) []uint64 {
 	if input == "" {
@@ -353,3 +444,13 @@ func xidsToSkip(additionalXids string) map[uint64]bool {
 	}
 	return skippedXids
 }
+
+// IsEventNonFatal evaluates whether a hardware event is considered an application-level
+// warning (None) rather than a critical hardware failure (NoSchedule).
+// Currently, it only checks for XID events.
+func (m *nvmlDeviceHealthMonitor) IsEventNonFatal(event *DeviceHealthEvent) bool {
+	if event.EventType == HealthEventXID {
+		return m.skippedXids[event.EventData]
+	}
+	return false
+}
diff --git a/cmd/gpu-kubelet-plugin/device_health_test.go b/cmd/gpu-kubelet-plugin/device_health_test.go
new file mode 100644
index 000000000..c71443f73
--- /dev/null
+++ b/cmd/gpu-kubelet-plugin/device_health_test.go
@@ -0,0 +1,286 @@
+/*
+Copyright The Kubernetes Authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package main
+
+import (
+	"context"
+	"testing"
+
+	resourceapi "k8s.io/api/resource/v1"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// mockHealthMonitor implements deviceHealthMonitor for testing healthEventToTaint.
+type mockHealthMonitor struct {
+	nonFatalXids map[uint64]bool
+}
+
+func (m *mockHealthMonitor) Start(context.Context) error          { return nil }
+func (m *mockHealthMonitor) Stop()                                {}
+func (m *mockHealthMonitor) Unhealthy() <-chan *DeviceHealthEvent { return nil }
+func (m *mockHealthMonitor) IsEventNonFatal(e *DeviceHealthEvent) bool {
+	if e.EventType == HealthEventXID {
+		return m.nonFatalXids[e.EventData]
+	}
+	return false
+}
+
+func TestAddOrUpdateTaint_NewTaint(t *testing.T) {
+	dev := &AllocatableDevice{}
+	taint := &resourceapi.DeviceTaint{
+		Key:    TaintKeyXID,
+		Value:  "48",
+		Effect: resourceapi.DeviceTaintEffectNoSchedule,
+	}
+
+	changed := dev.AddOrUpdateTaint(taint)
+
+	require.True(t, changed)
+	require.Len(t, dev.Taints(), 1)
+	assert.Equal(t, TaintKeyXID, dev.Taints()[0].Key)
+	assert.Equal(t, "48", dev.Taints()[0].Value)
+	assert.Equal(t, resourceapi.DeviceTaintEffectNoSchedule, dev.Taints()[0].Effect)
+}
+
+func TestAddOrUpdateTaint_DuplicateNoChange(t *testing.T) {
+	dev := &AllocatableDevice{}
+	taint := &resourceapi.DeviceTaint{
+		Key:    TaintKeyGPULost,
+		Effect: resourceapi.DeviceTaintEffectNoSchedule,
+	}
+
+	dev.AddOrUpdateTaint(taint)
+	changed := dev.AddOrUpdateTaint(taint)
+
+	assert.False(t, changed, "identical taint should not count as a change")
+	assert.Len(t, dev.Taints(), 1)
+}
+
+func TestAddOrUpdateTaint_UpdateValue(t *testing.T) {
+	dev := &AllocatableDevice{}
+	dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{
+		Key:    TaintKeyXID,
+		Value:  "48",
+		Effect: resourceapi.DeviceTaintEffectNoSchedule,
+	})
+
+	changed := dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{
+		Key:    TaintKeyXID,
+		Value:  "63",
+		Effect: resourceapi.DeviceTaintEffectNoSchedule,
+	})
+
+	require.True(t, changed)
+	require.Len(t, dev.Taints(), 1)
+	assert.Equal(t, "63", dev.Taints()[0].Value, "value should be overwritten to latest XID")
+}
+
+func TestAddOrUpdateTaint_UpdateEffect(t *testing.T) {
+	dev := &AllocatableDevice{}
+	dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{
+		Key:    TaintKeyXID,
+		Value:  "48",
+		Effect: DeviceTaintEffectNone,
+	})
+
+	changed := dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{
+		Key:    TaintKeyXID,
+		Value:  "48",
+		Effect: resourceapi.DeviceTaintEffectNoSchedule,
+	})
+
+	require.True(t, changed)
+	assert.Equal(t, resourceapi.DeviceTaintEffectNoSchedule, dev.Taints()[0].Effect)
+}
+
+func TestAddOrUpdateTaint_DifferentKeysAppended(t *testing.T) {
+	dev := &AllocatableDevice{}
+	dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{
+		Key:    TaintKeyXID,
+		Value:  "48",
+		Effect: resourceapi.DeviceTaintEffectNoSchedule,
+	})
+	dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{
+		Key:    TaintKeyGPULost,
+		Effect: resourceapi.DeviceTaintEffectNoSchedule,
+	})
+
+	taints := dev.Taints()
+	require.Len(t, taints, 2)
+	assert.Equal(t, TaintKeyXID, taints[0].Key)
+	assert.Equal(t, TaintKeyGPULost, taints[1].Key)
+}
+
+func TestAddOrUpdateTaint_TimeAddedResetOnChange(t *testing.T) {
+	dev := &AllocatableDevice{}
+	dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{
+		Key:    TaintKeyXID,
+		Value:  "48",
+		Effect: DeviceTaintEffectNone,
+	})
+
+	dev.AddOrUpdateTaint(&resourceapi.DeviceTaint{
+		Key:    TaintKeyXID,
+		Value:  "63",
+		Effect: resourceapi.DeviceTaintEffectNoSchedule,
+	})
+
+	assert.Nil(t, dev.Taints()[0].TimeAdded, "TimeAdded should be nil so the API server sets a fresh timestamp")
+}
+
+func TestHealthEventToTaint(t *testing.T) {
+	monitor := &mockHealthMonitor{
+		nonFatalXids: map[uint64]bool{13: true, 31: true},
+	}
+
+	tests := []struct {
+		name           string
+		event          *DeviceHealthEvent
+		monitor        deviceHealthMonitor
+		expectedKey    string
+		expectedValue  string
+		expectedEffect resourceapi.DeviceTaintEffect
+	}{
+		{
+			name: "fatal XID",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventXID,
+				EventData: 48,
+			},
+			monitor:        monitor,
+			expectedKey:    TaintKeyXID,
+			expectedValue:  "48",
+			expectedEffect: resourceapi.DeviceTaintEffectNoSchedule,
+		},
+		{
+			name: "non-fatal XID (skipped)",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventXID,
+				EventData: 13,
+			},
+			monitor:        monitor,
+			expectedKey:    TaintKeyXID,
+			expectedValue:  "13",
+			expectedEffect: DeviceTaintEffectNone,
+		},
+		{
+			name: "XID with nil monitor defaults to fatal",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventXID,
+				EventData: 13,
+			},
+			monitor:        nil,
+			expectedKey:    TaintKeyXID,
+			expectedValue:  "13",
+			expectedEffect: resourceapi.DeviceTaintEffectNoSchedule,
+		},
+		{
+			name: "GPU lost",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventGPULost,
+			},
+			monitor:        monitor,
+			expectedKey:    TaintKeyGPULost,
+			expectedValue:  "",
+			expectedEffect: resourceapi.DeviceTaintEffectNoSchedule,
+		},
+		{
+			name: "unmonitored",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventUnmonitored,
+			},
+			monitor:        monitor,
+			expectedKey:    TaintKeyUnmonitored,
+			expectedValue:  "",
+			expectedEffect: DeviceTaintEffectNone,
+		},
+		{
+			name: "unknown event type defaults to unmonitored",
+			event: &DeviceHealthEvent{
+				EventType: DeviceHealthEventType("bogus"),
+			},
+			monitor:        monitor,
+			expectedKey:    TaintKeyUnmonitored,
+			expectedValue:  "",
+			expectedEffect: DeviceTaintEffectNone,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			taint := healthEventToTaint(tc.monitor, tc.event)
+			assert.Equal(t, tc.expectedKey, taint.Key)
+			assert.Equal(t, tc.expectedValue, taint.Value)
+			assert.Equal(t, tc.expectedEffect, taint.Effect)
+		})
+	}
+}
+
+func TestIsEventNonFatal(t *testing.T) {
+	m := &nvmlDeviceHealthMonitor{
+		skippedXids: map[uint64]bool{
+			13: true,
+			31: true,
+			43: true,
+		},
+	}
+
+	tests := []struct {
+		name     string
+		event    *DeviceHealthEvent
+		expected bool
+	}{
+		{
+			name: "skipped XID is non-fatal",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventXID,
+				EventData: 13,
+			},
+			expected: true,
+		},
+		{
+			name: "non-skipped XID is fatal",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventXID,
+				EventData: 48,
+			},
+			expected: false,
+		},
+		{
+			name: "GPU_LOST is always fatal",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventGPULost,
+			},
+			expected: false,
+		},
+		{
+			name: "unmonitored is not an XID event",
+			event: &DeviceHealthEvent{
+				EventType: HealthEventUnmonitored,
+			},
+			expected: false,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			assert.Equal(t, tc.expected, m.IsEventNonFatal(tc.event))
+		})
+	}
+}
diff --git a/cmd/gpu-kubelet-plugin/device_state.go b/cmd/gpu-kubelet-plugin/device_state.go
index a7f73b5f5..ff5d2254f 100644
--- a/cmd/gpu-kubelet-plugin/device_state.go
+++ b/cmd/gpu-kubelet-plugin/device_state.go
@@ -654,12 +654,6 @@ func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.Res
 		if device == nil {
 			return nil, fmt.Errorf("allocatable not found for device %q", result.Device)
 		}
-		// only proceed with config mapping if device is healthy.
-		if featuregates.Enabled(featuregates.NVMLDeviceHealthCheck) {
-			if !device.IsHealthy() {
-				return nil, fmt.Errorf("device %q is not healthy", result.Device)
-			}
-		}
 		for _, c := range slices.Backward(configs) {
 			if slices.Contains(c.Requests, result.Request) {
 				if _, ok := c.Config.(*configapi.GpuConfig); ok && device.Type() != GpuDeviceType {
@@ -1090,31 +1084,6 @@ func GetOpaqueDeviceConfigs(
 	return resultConfigs, nil
 }
 
-// TODO: this needs a code comment.
-func (s *DeviceState) UpdateDeviceHealthStatus(d *AllocatableDevice, hs HealthStatus) {
-	s.Lock()
-	defer s.Unlock()
-
-	switch d.Type() {
-	case GpuDeviceType:
-		d.Gpu.health = hs
-	case MigDynamicDeviceType:
-		// Here we do not have access to a concrete MIG device. The
-		// 'allocatable' MIG device is an abstract representation to a specific
-		// MIG device.
-		klog.Warningf("UpdateDeviceHealthStatus() called for abstract, dynamic MIG device %s: %s", d.MigDynamic.CanonicalName(), hs)
-	case MigStaticDeviceType:
-		// Does it make sense to update the health for a MIG device? Do we
-		// receive health events that are specific to individual MIG devices? If
-		// yes: does this allow for making conclusions about the parent device?
-		d.MigStatic.health = hs
-	default:
-		klog.V(6).Infof("Cannot update health status for unknown device type: %s", d.Type())
-		return
-	}
-	klog.V(4).Infof("Updated device: %s health status to %s", d.UUID(), hs)
-}
-
 // requestedNonAdminDevices returns the set of device names requested by the claim,
 // excluding admin-access allocations.
 func (s *DeviceState) requestedNonAdminDevices(claim *resourceapi.ResourceClaim) map[string]struct{} {
@@ -1232,3 +1201,11 @@ func syncPreparedDevicesGaugeFromCheckpoint(nodeName string, cp *Checkpoint) {
 		}
 	}
 }
+
+// AddDeviceTaint adds or updates a DRA device taint on the given device under
+// the DeviceState lock. Returns true if the taint set was actually modified.
+func (s *DeviceState) AddDeviceTaint(d *AllocatableDevice, taint *resourceapi.DeviceTaint) bool {
+	s.Lock()
+	defer s.Unlock()
+	return d.AddOrUpdateTaint(taint)
+}
diff --git a/cmd/gpu-kubelet-plugin/deviceinfo.go b/cmd/gpu-kubelet-plugin/deviceinfo.go
index 6089ea4cf..33f5b662f 100644
--- a/cmd/gpu-kubelet-plugin/deviceinfo.go
+++ b/cmd/gpu-kubelet-plugin/deviceinfo.go
@@ -27,15 +27,6 @@ import (
 	"k8s.io/utils/ptr"
 )
 
-// Defined similarly as https://pkg.go.dev/k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1#Healthy.
-type HealthStatus string
-
-const (
-	Healthy HealthStatus = "Healthy"
-	// With NVMLDeviceHealthCheck, Unhealthy means that there are critcal xid errors on the device.
-	Unhealthy HealthStatus = "Unhealthy"
-)
-
 // Represents a specific, full, physical GPU device.
 type GpuInfo struct {
 	UUID                  string `json:"uuid"`
@@ -54,7 +45,6 @@ type GpuInfo struct {
 	pcieRootAttr          *deviceattribute.DeviceAttribute
 	migProfiles           []*MigProfileInfo
 	addressingMode        *string
-	health                HealthStatus
 
 	// The following properties that can only be known after inspecting MIG
 	// profiles.
@@ -88,7 +78,6 @@ type MigDeviceInfo struct {
 	parent        *GpuInfo
 	giProfileInfo *nvml.GpuInstanceProfileInfo
 	ciProfileInfo *nvml.ComputeInstanceProfileInfo
-	health        HealthStatus
 }
 
 type VfioDeviceInfo struct {
diff --git a/cmd/gpu-kubelet-plugin/driver.go b/cmd/gpu-kubelet-plugin/driver.go
index 8c8cf5bc0..35eb5d0e8 100644
--- a/cmd/gpu-kubelet-plugin/driver.go
+++ b/cmd/gpu-kubelet-plugin/driver.go
@@ -47,7 +47,9 @@ const DriverPrepUprepFlockFileName = "pu.lock"
 type deviceHealthMonitor interface {
 	Start(context.Context) error
 	Stop()
-	Unhealthy() <-chan *AllocatableDevice
+	Unhealthy() <-chan *DeviceHealthEvent
+	// Allows the driver to query the HealthMonitor's health policy
+	IsEventNonFatal(event *DeviceHealthEvent) bool
 }
 
 type driver struct {
@@ -462,47 +464,38 @@ func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
 		case <-ctx.Done():
 			klog.V(6).Info("Stop processing device health notifications")
 			return
-		case device, ok := <-d.deviceHealthMonitor.Unhealthy():
+		case event, ok := <-d.deviceHealthMonitor.Unhealthy():
 			if !ok {
 				// NVML based deviceHealthMonitor is expected to close only during driver Shutdown.
 				klog.V(6).Info("Health monitor channel closed")
 				return
 			}
-			uuid := device.UUID()
 
-			klog.Warningf("Received unhealthy notification for device: %s", uuid)
-
-			if !device.IsHealthy() {
-				klog.V(6).Infof("Device: %s is aleady marked unhealthy. Skip republishing ResourceSlice", uuid)
+			taint := healthEventToTaint(d.deviceHealthMonitor, event)
+			modified := false
+			for _, dev := range event.Devices {
+				klog.Warningf("Received %s health event for device %s", event.EventType, dev.UUID())
+				if d.state.AddDeviceTaint(dev, taint) {
+					modified = true
+				}
+			}
+			if !modified {
 				continue
 			}
 
-			// Mark device as unhealthy.
-			d.state.UpdateDeviceHealthStatus(device, Unhealthy)
-
-			// Republish resource slice with only healthy devices
-			// There is no remediation loop right now meaning if the unhealthy device is fixed,
-			// driver needs to be restarted to publish the ResourceSlice with all devices
 			var resourceSlice resourceslice.Slice
 			for _, devices := range d.state.perGPUAllocatable.allocatablesMap {
 				for _, dev := range devices {
-					uuid := dev.UUID()
-					if dev.IsHealthy() {
-						klog.V(6).Infof("Device: %s is healthy, added to ResoureSlice", uuid)
-						resourceSlice.Devices = append(resourceSlice.Devices, dev.GetDevice())
-					} else {
-						klog.Warningf("Device: %s is unhealthy, will be removed from ResoureSlice", uuid)
+					d := dev.GetDevice()
+
+					taints := dev.Taints()
+					if len(taints) > 0 {
+						d.Taints = taints
 					}
+					resourceSlice.Devices = append(resourceSlice.Devices, d)
 				}
 			}
 
-			klog.V(4).Info("Rebulishing resourceslice with healthy devices")
-			resources := resourceslice.DriverResources{
-				Pools: map[string]resourceslice.Pool{
-					nodeName: {Slices: []resourceslice.Slice{resourceSlice}},
-				},
-			}
-
 			// NOTE: We only log an error on publish failure and do not retry.
 			// If this publish fails, our in-memory health update succeeds but the
 			// ResourceSlice in the API server remains stale and still advertises the
@@ -514,10 +507,25 @@ func (d *driver) deviceHealthEvents(ctx context.Context, nodeName string) {
 			// This is a temporary compromise while device taints/tolerations (KEP-5055)
 			// are available as a Beta feature. An interim improvement could be adding
 			// a retry/backoff or switch to patch updates instead of full republish.
+			klog.V(4).Infof("Republishing ResourceSlice: %d device(s) tainted with %s=%q (effect=%s)",
+				len(event.Devices), taint.Key, taint.Value, taint.Effect)
+
+			resources := resourceslice.DriverResources{
+				Pools: map[string]resourceslice.Pool{
+					nodeName: {Slices: []resourceslice.Slice{resourceSlice}},
+				},
+			}
+
+			// NOTE: GPU_LOST and unmonitored events are already batched at the
+			// sender (all affected devices arrive in a single DeviceHealthEvent).
+			// XID events are still per-device and may cause repeated publishes.
+			// TODO: Add receiver-side event aggregation before PublishResources.
+			// Evaluate two strategies:
+			// 1. Channel drain: non-blocking pull of all pending events (Pro: zero latency; Con: susceptible to NVML lag).
+			// 2. Timer debounce: e.g., 50ms window (Pro: standard K8s API protection; Con: slight delay).
+			// This also needs to be handle properly in the recovery path.
 			if err := d.pluginhelper.PublishResources(ctx, resources); err != nil {
-				klog.Errorf("Failed to publish resources after device health status update: %v", err)
-			} else {
-				klog.V(4).Info("Successfully republished resources without unhealthy device")
+				klog.Errorf("Failed to publish resources after taint update: %v", err)
 			}
 		}
 	}
diff --git a/cmd/gpu-kubelet-plugin/nvlib.go b/cmd/gpu-kubelet-plugin/nvlib.go
index fa620c3e6..3e18005cf 100644
--- a/cmd/gpu-kubelet-plugin/nvlib.go
+++ b/cmd/gpu-kubelet-plugin/nvlib.go
@@ -570,7 +570,6 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error)
 		pciBusIDAttr:          pciBusIDAttr,
 		pcieRootAttr:          pcieRootAttr,
 		migProfiles:           migProfiles,
-		health:                Healthy,
 		addressingMode:        addressingMode,
 	}
 
@@ -749,7 +748,6 @@ func (l deviceLib) getMigDevices(gpuInfo *GpuInfo) (map[string]*MigDeviceInfo, e
 			gIInfo:         &giInfo,
 			ciProfileInfo:  ciProfileInfo,
 			cIInfo:         &ciInfo,
-			health:         Healthy,
 		}
 		return nil
 	})