Skip to content

Commit 60ebfb5

Browse files
committed
refactor: enhance ppu detect
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent 08ca836 commit 60ebfb5

File tree

10 files changed

+53
-9
lines changed

10 files changed

+53
-9
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ jobs:
6464
- name: Checkout
6565
uses: actions/checkout@v4
6666
with:
67+
lfs: true
6768
fetch-depth: 1
6869
persist-credentials: false
6970
- name: Setup UV

.github/workflows/pack.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ jobs:
3939
- name: Checkout
4040
uses: actions/checkout@v4
4141
with:
42+
lfs: true
4243
fetch-depth: 0
4344
fetch-tags: true
4445
persist-credentials: false

.github/workflows/release.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ jobs:
4646
- name: Checkout
4747
uses: actions/checkout@v4
4848
with:
49+
lfs: true
4950
fetch-depth: 0
5051
fetch-tags: true
5152
persist-credentials: false

.gitignore

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,6 @@
44
.DS_Store
55
*.swp
66

7-
# C extensions
8-
*.so
9-
107
# Python-generated files
118
_version.py
129
_version_appendix.py

gpustack_runtime/detector/iluvatar.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def is_supported() -> bool:
5454
pci_devs = IluvatarDetector.detect_pci_devices()
5555
if not pci_devs and not envs.GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK:
5656
logger.debug("No Iluvatar PCI devices found")
57+
return supported
5758

5859
try:
5960
pyixml.nvmlInit()

gpustack_runtime/detector/pyhgml/__init__.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2411,11 +2411,19 @@ def _LoadHgmlLibrary():
24112411
# Linux path
24122412
locs = [
24132413
"libhgml.so",
2414-
str(Path(__file__).resolve().parent / "libhgml.so"),
2414+
[
2415+
str(Path(__file__).resolve().parent / "libuki.so"),
2416+
str(Path(__file__).resolve().parent / "libhgml.so"),
2417+
],
24152418
]
24162419
for loc in locs:
24172420
try:
2418-
hgmlLib = CDLL(loc)
2421+
if isinstance(loc, str):
2422+
hgmlLib = CDLL(loc)
2423+
else:
2424+
for pre_loc in loc[:-1]:
2425+
CDLL(pre_loc)
2426+
hgmlLib = CDLL(loc[-1])
24192427
break
24202428
except OSError:
24212429
pass
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:04fcc6541a73acc5f5b52bdb5dd76af10d10862f30f8da362575fcb11c538a82
3+
size 2101640
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:104eafd6f218613e0548330c9bdad27c0ab0af020f143fb8ffa2ad3f9d654a9b
3+
size 702352

gpustack_runtime/detector/thead.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ def is_supported() -> bool:
5454
pci_devs = THeadDetector.detect_pci_devices()
5555
if not pci_devs and not envs.GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK:
5656
logger.debug("No T-Head PCI devices found")
57+
return supported
5758

5859
try:
5960
pyhgml.hgmlInit()
@@ -98,10 +99,23 @@ def detect(self) -> Devices | None:
9899

99100
sys_driver_ver = pyhgml.hgmlSystemGetDriverVersion()
100101

101-
sys_runtime_ver_original = pyhgml.hgmlSystemGetHGMLVersion()
102-
sys_runtime_ver = get_brief_version(
103-
sys_runtime_ver_original,
104-
)
102+
sys_runtime_ver_original = None
103+
sys_runtime_ver = None
104+
with contextlib.suppress(pyhgml.HGMLError):
105+
sys_runtime_ver_original = pyhgml.hgmlSystemGetHggcDriverVersion()
106+
sys_runtime_ver_original = ".".join(
107+
map(
108+
str,
109+
[
110+
sys_runtime_ver_original // 1000,
111+
(sys_runtime_ver_original % 1000) // 10,
112+
(sys_runtime_ver_original % 10),
113+
],
114+
),
115+
)
116+
sys_runtime_ver = get_brief_version(
117+
sys_runtime_ver_original,
118+
)
105119

106120
dev_count = pyhgml.hgmlDeviceGetCount()
107121
for dev_idx in range(dev_count):

pack/Dockerfile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,21 @@ ENV NVIDIA_DISABLE_REQUIRE="true" \
487487
NVIDIA_VISIBLE_DEVICES="all" \
488488
NVIDIA_DRIVER_CAPABILITIES="compute,utility"
489489

490+
## Activate detection of all T-Head devices,
491+
## works with (default) T-Head container runtime and privileged mode.
492+
## See https://help.aliyun.com/document_detail/2996754.html.
493+
## Runs:
494+
## - With container runtime installed:
495+
## [TODO, TBD]
496+
## - Without container runtime installed:
497+
## + Allowing privileged, try with:
498+
## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /usr/local/PPU_SDK:/usr/local/PPU_SDK:ro ...
499+
## + Disallowing privileged, try with:
500+
## docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --security-opt seccomp=unconfined -v /dev:/dev:ro -v /usr/local/PPU_SDK:/usr/local/PPU_SDK:ro ...
501+
ENV PPU_HOME="/usr/local/PPU_SDK" \
502+
LD_LIBRARY_PATH="/usr/local/PPU_SDK/CUDA_SDK/lib64:/usr/local/PPU_SDK/lib:${LD_LIBRARY_PATH}" \
503+
GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/usr/local/PPU_SDK;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
504+
490505
## Activate GPUStack runtime mirrored deployment mode,
491506
## if getting an error like, "Found multiple Containers with the same hostname ...",
492507
## please use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact container name.

0 commit comments

Comments
 (0)