Skip to content

Commit 90ea327

Browse files
richiejp and claude
authored
fix(intel): VRAM detection (#9944)
* fix(gpu-detect): clinfo --json fallback for Intel discrete VRAM ghw returns 0 VRAM for any i915-driven Intel GPU because the kernel driver doesn't expose VRAM through the sysfs paths ghw checks (no mem_info_vram_total — that's an amdgpu interface). xpu-smi, the canonical Intel tool, isn't in the oneAPI base image (it lives in a separate xpumanager package). The capability gate added in 19c92c7 ("default to CPU if there is less than 4GB of GPU available") then demotes the host to CPU even on a 16 GB Arc A770. clinfo ships with the OpenCL ICD loader and is present in the oneAPI base image, so plug it in as the last-resort Intel VRAM source: xpu-smi -> intel_gpu_top -> clinfo --json The parser drops UMA devices via HOST_UNIFIED_MEMORY=true so an iGPU sibling can't double-count system RAM, and dedups by PCI BDF when multiple ICDs enumerate the same physical device (POCL caps reported GLOBAL_MEM_SIZE at 4 GiB; the largest non-capped value wins). Subprocess is wrapped in a 2s timeout and memoised with sync.OnceValue — GPU hardware is static for the process lifetime. The Intel branch also short-circuits when ghw saw no Intel vendor, so NVIDIA-only hosts don't pay the spawn cost. Verified end-to-end on Intel Arc A770: ghw -> 0, clinfo path reports 16,225,243,136 bytes (15.11 GiB), capability gate now passes naturally without LOCALAI_FORCE_META_BACKEND_CAPABILITY=intel. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Signed-off-by: Richard Palethorpe <io@richiejp.com> * feat(gpu-detect): live VRAM usage from DRM fdinfo The clinfo fallback reports total VRAM correctly but leaves UsedVRAM at 0 because OpenCL has no portable live-memory property — the UI ends up showing 0% utilisation even when llama-cpp is actually holding gigabytes in device memory. Fill that gap with the standardised Linux DRM fdinfo interface (Documentation/gpu/drm-usage-stats.rst, kernel ≥5.19). 
Walking /proc/<pid>/fdinfo for any fd that points at /dev/dri/render* yields drm-total-<region> / drm-resident-<region> keys; aggregate per render-node, resolve the render node to a PCI BDF via /sys/class/drm/<name>/device, and merge the result into the matching GPUMemoryInfo by BDF. Region naming is driver-defined — i915 uses "local0" for device-local VRAM, amdgpu and xe use "vram0" — so a prefix-match on local/vram covers all three DRM drivers that LocalAI cares about. system/gtt/ stolen regions are deliberately excluded since they're host RAM mirrors and would double-count against system RAM. GPUMemoryInfo gains an optional BDF field (`bdf,omitempty` in JSON) so future vendor-specific detectors can plug into the same matcher. Empty BDF skips the merge — non-PCI devices and detection paths that don't surface PCI location keep their existing behaviour. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6a80e23 commit 90ea327

6 files changed

Lines changed: 759 additions & 6 deletions

File tree

pkg/xsysinfo/clinfo.go

Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
package xsysinfo
2+
3+
import (
4+
"bytes"
5+
"context"
6+
"encoding/json"
7+
"fmt"
8+
"os/exec"
9+
"strings"
10+
"sync"
11+
"time"
12+
13+
"github.com/mudler/xlog"
14+
)
15+
16+
const (
	// clDeviceTypeGPU is the entry looked for in a device's
	// CL_DEVICE_TYPE "type" array to decide that it is a GPU.
	clDeviceTypeGPU = "CL_DEVICE_TYPE_GPU"

	// clinfoTimeout is the deadline placed on the `clinfo --json`
	// subprocess.
	clinfoTimeout = 2 * time.Second
)
20+
21+
// clinfoOutput is the subset of `clinfo --json` we read. clinfo emits
22+
// one entry under "devices" per platform, in the same order as
23+
// "platforms"; live devices are under "online".
24+
type clinfoOutput struct {
25+
Devices []struct {
26+
Online []clinfoDevice `json:"online"`
27+
} `json:"devices"`
28+
}
29+
30+
type clinfoDevice struct {
31+
Name string `json:"CL_DEVICE_NAME"`
32+
Vendor string `json:"CL_DEVICE_VENDOR"`
33+
VendorID uint32 `json:"CL_DEVICE_VENDOR_ID"`
34+
Type clinfoTypeProp `json:"CL_DEVICE_TYPE"`
35+
HostUnifiedMemory bool `json:"CL_DEVICE_HOST_UNIFIED_MEMORY"`
36+
GlobalMemSize uint64 `json:"CL_DEVICE_GLOBAL_MEM_SIZE"`
37+
PCIBusInfoKHR string `json:"CL_DEVICE_PCI_BUS_INFO_KHR"`
38+
PCIDomainNV int `json:"CL_DEVICE_PCI_DOMAIN_ID_NV"`
39+
PCIBusNV int `json:"CL_DEVICE_PCI_BUS_ID_NV"`
40+
PCISlotNV int `json:"CL_DEVICE_PCI_SLOT_ID_NV"`
41+
}
42+
43+
// clinfoTypeProp is the decoded CL_DEVICE_TYPE property. GPU detection
// is done on the symbolic names in Type rather than on the Raw bitfield
// so that a future CL_DEVICE_TYPE_CUSTOM value cannot be mistaken for
// a GPU.
type clinfoTypeProp struct {
	// Raw is the numeric value from the "raw" key.
	Raw uint32 `json:"raw"`
	// Type is the list of symbolic type names from the "type" key.
	Type []string `json:"type"`
}
50+
51+
func (t clinfoTypeProp) isGPU() bool {
52+
for _, s := range t.Type {
53+
if s == clDeviceTypeGPU {
54+
return true
55+
}
56+
}
57+
return false
58+
}
59+
60+
// clinfoOnce caches the result for the process lifetime. GPU hardware
61+
// doesn't change between calls and the subprocess is ~150 ms.
62+
var clinfoOnce = sync.OnceValue(runCLInfo)
63+
64+
func runCLInfo() []GPUMemoryInfo {
65+
if _, err := exec.LookPath("clinfo"); err != nil {
66+
return nil
67+
}
68+
ctx, cancel := context.WithTimeout(context.Background(), clinfoTimeout)
69+
defer cancel()
70+
cmd := exec.CommandContext(ctx, "clinfo", "--json")
71+
var stdout, stderr bytes.Buffer
72+
cmd.Stdout = &stdout
73+
cmd.Stderr = &stderr
74+
if err := cmd.Run(); err != nil {
75+
xlog.Debug("clinfo failed", "error", err, "stderr", stderr.String())
76+
return nil
77+
}
78+
return parseCLInfoJSON(stdout.Bytes())
79+
}
80+
81+
// getCLInfoGPUMemory is a best-effort fallback for hosts where the
82+
// vendor's own management binary (nvidia-smi / xpu-smi / rocm-smi)
83+
// isn't installed but the OpenCL ICD is. Live used/free aren't exposed
84+
// via standard CL_ properties; we synthesise them by attributing
85+
// per-process VRAM allocations from the kernel DRM fdinfo interface
86+
// to each clinfo-reported GPU via the shared PCI BDF.
87+
func getCLInfoGPUMemory() []GPUMemoryInfo {
88+
gpus := clinfoOnce()
89+
if len(gpus) == 0 {
90+
return nil
91+
}
92+
usage := drmFdInfoUsageByBDF()
93+
for i := range gpus {
94+
gpus[i] = applyDRMUsage(gpus[i], usage[gpus[i].BDF])
95+
}
96+
return gpus
97+
}
98+
99+
// applyDRMUsage stamps live VRAM accounting onto a GPUMemoryInfo
100+
// whose TotalVRAM came from a static source (e.g. clinfo). Caller
101+
// already populated TotalVRAM and FreeVRAM=TotalVRAM as defaults; if
102+
// DRM accounting reports usage, we trust it and rederive free/percent.
103+
func applyDRMUsage(g GPUMemoryInfo, used uint64) GPUMemoryInfo {
104+
if used == 0 || g.TotalVRAM == 0 {
105+
return g
106+
}
107+
if used > g.TotalVRAM {
108+
// Process-private DRM total can momentarily exceed device
109+
// VRAM (over-commit via host memory mirror). Clamp so the UI
110+
// doesn't display absurd percentages.
111+
used = g.TotalVRAM
112+
}
113+
g.UsedVRAM = used
114+
g.FreeVRAM = g.TotalVRAM - used
115+
g.UsagePercent = float64(used) / float64(g.TotalVRAM) * 100
116+
return g
117+
}
118+
119+
// parseCLInfoJSON returns one GPUMemoryInfo per discrete GPU. UMA
120+
// devices (iGPU/APU) are dropped because their "VRAM" is system RAM
121+
// and would double-count against the capability gate. When the same
122+
// physical device is enumerated by multiple ICDs (Intel OpenCL + POCL,
123+
// for example), the BDF dedup keeps the largest reported size — some
124+
// ICDs cap at 4 GiB for legacy alloc-size compatibility.
125+
func parseCLInfoJSON(raw []byte) []GPUMemoryInfo {
126+
var out clinfoOutput
127+
if err := json.Unmarshal(raw, &out); err != nil {
128+
xlog.Debug("clinfo: failed to parse --json output", "error", err)
129+
return nil
130+
}
131+
132+
byBDF := map[string]GPUMemoryInfo{}
133+
var noBDF []GPUMemoryInfo
134+
135+
for _, plat := range out.Devices {
136+
for _, d := range plat.Online {
137+
if !d.Type.isGPU() || d.HostUnifiedMemory || d.GlobalMemSize == 0 {
138+
continue
139+
}
140+
bdf := clinfoBDF(d)
141+
info := GPUMemoryInfo{
142+
Name: strings.TrimSpace(d.Name),
143+
Vendor: clinfoVendor(d.VendorID, d.Vendor),
144+
BDF: bdf,
145+
TotalVRAM: d.GlobalMemSize,
146+
FreeVRAM: d.GlobalMemSize,
147+
}
148+
if bdf == "" {
149+
noBDF = append(noBDF, info)
150+
continue
151+
}
152+
if existing, ok := byBDF[bdf]; !ok || info.TotalVRAM > existing.TotalVRAM {
153+
byBDF[bdf] = info
154+
}
155+
}
156+
}
157+
158+
all := make([]GPUMemoryInfo, 0, len(byBDF)+len(noBDF))
159+
for _, g := range byBDF {
160+
all = append(all, g)
161+
}
162+
all = append(all, noBDF...)
163+
for i := range all {
164+
all[i].Index = i
165+
}
166+
return all
167+
}
168+
169+
func clinfoVendor(vendorID uint32, name string) string {
170+
switch vendorID {
171+
case 0x10de:
172+
return VendorNVIDIA
173+
case 0x1002, 0x1022: // 0x1022 is the AMD CPU vendor ID, also reported by some APU OpenCL devices.
174+
return VendorAMD
175+
case 0x8086:
176+
return VendorIntel
177+
case 0x106B:
178+
return VendorApple
179+
}
180+
n := strings.ToLower(name)
181+
switch {
182+
case strings.Contains(n, "nvidia"):
183+
return VendorNVIDIA
184+
case strings.Contains(n, "advanced micro devices"), strings.Contains(n, "amd"):
185+
return VendorAMD
186+
case strings.Contains(n, "intel"):
187+
return VendorIntel
188+
case strings.Contains(n, "apple"):
189+
return VendorApple
190+
}
191+
return VendorUnknown
192+
}
193+
194+
// clinfoBDF returns the device's canonical `dddd:bb:dd.f` PCI address,
195+
// or "" when no PCI location is reported. The KHR form is `"PCI-E,
196+
// 0000:01:00.0"` on NVIDIA and bare `"0000:01:00.0"` on most others.
197+
func clinfoBDF(d clinfoDevice) string {
198+
if d.PCIBusInfoKHR != "" {
199+
s := d.PCIBusInfoKHR
200+
if i := strings.LastIndex(s, " "); i >= 0 {
201+
s = s[i+1:]
202+
}
203+
if c := strings.Count(s, ":"); c == 1 || c == 2 {
204+
return normalizeBDF(s)
205+
}
206+
}
207+
// NVIDIA pre-KHR per-axis fields. An all-zero result is
208+
// indistinguishable from "fields absent", but no GPU sits at
209+
// 0000:00:00.0 so the false negative is harmless.
210+
if d.PCIBusNV != 0 || d.PCISlotNV != 0 || d.PCIDomainNV != 0 {
211+
return fmt.Sprintf("%04x:%02x:%02x.0", d.PCIDomainNV, d.PCIBusNV, d.PCISlotNV)
212+
}
213+
return ""
214+
}
215+
216+
// normalizeBDF lower-cases a PCI address and, when it contains exactly
// one colon (i.e. has no domain part), prefixes the default domain
// "0000:". Any other input is only lower-cased.
func normalizeBDF(s string) string {
	bdf := strings.ToLower(s)
	if strings.Count(bdf, ":") == 1 {
		bdf = "0000:" + bdf
	}
	return bdf
}

0 commit comments

Comments
 (0)