File tree Expand file tree Collapse file tree 10 files changed +53
-9
lines changed
gpustack_runtime/detector Expand file tree Collapse file tree 10 files changed +53
-9
lines changed Original file line number Diff line number Diff line change 6464 - name : Checkout
6565 uses : actions/checkout@v4
6666 with :
67+ lfs : true
6768 fetch-depth : 1
6869 persist-credentials : false
6970 - name : Setup UV
Original file line number Diff line number Diff line change 3939 - name : Checkout
4040 uses : actions/checkout@v4
4141 with :
42+ lfs : true
4243 fetch-depth : 0
4344 fetch-tags : true
4445 persist-credentials : false
Original file line number Diff line number Diff line change 4646 - name : Checkout
4747 uses : actions/checkout@v4
4848 with :
49+ lfs : true
4950 fetch-depth : 0
5051 fetch-tags : true
5152 persist-credentials : false
Original file line number Diff line number Diff line change 44.DS_Store
55* .swp
66
7- # C extensions
8- * .so
9-
107# Python-generated files
118_version.py
129_version_appendix.py
Original file line number Diff line number Diff line change @@ -54,6 +54,7 @@ def is_supported() -> bool:
5454 pci_devs = IluvatarDetector .detect_pci_devices ()
5555 if not pci_devs and not envs .GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK :
5656 logger .debug ("No Iluvatar PCI devices found" )
57+ return supported
5758
5859 try :
5960 pyixml .nvmlInit ()
Original file line number Diff line number Diff line change @@ -2411,11 +2411,19 @@ def _LoadHgmlLibrary():
24112411 # Linux path
24122412 locs = [
24132413 "libhgml.so" ,
2414- str (Path (__file__ ).resolve ().parent / "libhgml.so" ),
2414+ [
2415+ str (Path (__file__ ).resolve ().parent / "libuki.so" ),
2416+ str (Path (__file__ ).resolve ().parent / "libhgml.so" ),
2417+ ],
24152418 ]
24162419 for loc in locs :
24172420 try :
2418- hgmlLib = CDLL (loc )
2421+ if isinstance (loc , str ):
2422+ hgmlLib = CDLL (loc )
2423+ else :
2424+ for pre_loc in loc [:- 1 ]:
2425+ CDLL (pre_loc )
2426+ hgmlLib = CDLL (loc [- 1 ])
24192427 break
24202428 except OSError :
24212429 pass
Original file line number Diff line number Diff line change 1+ version https://git-lfs.github.com/spec/v1
2+ oid sha256:04fcc6541a73acc5f5b52bdb5dd76af10d10862f30f8da362575fcb11c538a82
3+ size 2101640
Original file line number Diff line number Diff line change 1+ version https://git-lfs.github.com/spec/v1
2+ oid sha256:104eafd6f218613e0548330c9bdad27c0ab0af020f143fb8ffa2ad3f9d654a9b
3+ size 702352
Original file line number Diff line number Diff line change @@ -54,6 +54,7 @@ def is_supported() -> bool:
5454 pci_devs = THeadDetector .detect_pci_devices ()
5555 if not pci_devs and not envs .GPUSTACK_RUNTIME_DETECT_NO_PCI_CHECK :
5656 logger .debug ("No T-Head PCI devices found" )
57+ return supported
5758
5859 try :
5960 pyhgml .hgmlInit ()
@@ -98,10 +99,23 @@ def detect(self) -> Devices | None:
9899
99100 sys_driver_ver = pyhgml .hgmlSystemGetDriverVersion ()
100101
101- sys_runtime_ver_original = pyhgml .hgmlSystemGetHGMLVersion ()
102- sys_runtime_ver = get_brief_version (
103- sys_runtime_ver_original ,
104- )
102+ sys_runtime_ver_original = None
103+ sys_runtime_ver = None
104+ with contextlib .suppress (pyhgml .HGMLError ):
105+ sys_runtime_ver_original = pyhgml .hgmlSystemGetHggcDriverVersion ()
106+ sys_runtime_ver_original = "." .join (
107+ map (
108+ str ,
109+ [
110+ sys_runtime_ver_original // 1000 ,
111+ (sys_runtime_ver_original % 1000 ) // 10 ,
112+ (sys_runtime_ver_original % 10 ),
113+ ],
114+ ),
115+ )
116+ sys_runtime_ver = get_brief_version (
117+ sys_runtime_ver_original ,
118+ )
105119
106120 dev_count = pyhgml .hgmlDeviceGetCount ()
107121 for dev_idx in range (dev_count ):
Original file line number Diff line number Diff line change @@ -487,6 +487,21 @@ ENV NVIDIA_DISABLE_REQUIRE="true" \
487487 NVIDIA_VISIBLE_DEVICES="all" \
488488 NVIDIA_DRIVER_CAPABILITIES="compute,utility"
489489
490+ # # Activate detection of all T-Head devices;
491+ # # works with the (default) T-Head container runtime and with privileged mode.
492+ # # See https://help.aliyun.com/document_detail/2996754.html.
493+ # # Runs:
494+ # # - With container runtime installed:
495+ # # [TODO, TBD]
496+ # # - Without container runtime installed:
497+ # # + If privileged mode is allowed, try with:
498+ # # docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --privileged -v /usr/local/PPU_SDK:/usr/local/PPU_SDK:ro ...
499+ # # + If privileged mode is not allowed, try with:
500+ # # docker run --rm -it -v /var/run/docker.sock:/var/run/docker.sock --security-opt seccomp=unconfined -v /dev:/dev:ro -v /usr/local/PPU_SDK:/usr/local/PPU_SDK:ro ...
501+ ENV PPU_HOME="/usr/local/PPU_SDK" \
502+ LD_LIBRARY_PATH="/usr/local/PPU_SDK/CUDA_SDK/lib64:/usr/local/PPU_SDK/lib:${LD_LIBRARY_PATH}" \
503+ GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES="/usr/local/PPU_SDK;${GPUSTACK_RUNTIME_DEPLOY_MIRRORED_DEPLOYMENT_IGNORE_VOLUMES}"
504+
490505# # Activate GPUStack runtime mirrored deployment mode,
491506# # if getting an error like, "Found multiple Containers with the same hostname ...",
492507# # please use `--env GPUSTACK_RUNTIME_DEPLOY_MIRRORED_NAME=...` to specify the exact container name.
You can’t perform that action at this time.
0 commit comments