-
Notifications
You must be signed in to change notification settings - Fork 274
Expand file tree
/
Copy path_program.pyx
More file actions
1177 lines (1036 loc) · 48.1 KB
/
_program.pyx
File metadata and controls
1177 lines (1036 loc) · 48.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
"""Compilation machinery for CUDA programs.
This module provides :class:`Program` for compiling source code into
:class:`~cuda.core.ObjectCode`, with :class:`ProgramOptions` for configuration.
"""
from __future__ import annotations
from dataclasses import dataclass
import threading
from warnings import warn
from cuda.bindings import nvrtc
from cuda.pathfinder._optional_cuda_import import _optional_cuda_import
from libcpp.vector cimport vector
from ._resource_handles cimport (
as_cu,
as_py,
create_nvrtc_program_handle,
create_nvvm_program_handle,
)
from cuda.bindings cimport cynvrtc, cynvvm
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN_NVRTC, HANDLE_RETURN_NVVM
from cuda.core._device import Device
from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions
from cuda.core._module import ObjectCode
from cuda.core._utils.clear_error_support import assert_type
from cuda.core._utils.cuda_utils import (
CUDAError,
_handle_boolean_option,
check_or_create_options,
handle_return,
is_nested_sequence,
is_sequence,
)
from cuda.core._utils.version import binding_version, driver_version
__all__ = ["Program", "ProgramOptions"]
ProgramHandleT = nvrtc.nvrtcProgram | int | LinkerHandleT
"""Type alias for program handle types across different backends.
The ``int`` type covers NVVM handles, which don't have a wrapper class.
"""
# =============================================================================
# Principal Class
# =============================================================================
cdef class Program:
"""Represent a compilation machinery to process programs into
:class:`~cuda.core.ObjectCode`.
This object provides a unified interface to multiple underlying
compiler libraries. Compilation support is enabled for a wide
range of code types and compilation types.
Parameters
----------
code : str | bytes | bytearray
The source code to compile. For C++ and PTX, must be a string.
For NVVM IR, can be str, bytes, or bytearray.
code_type : str
The type of source code. Must be one of ``"c++"``, ``"ptx"``, or ``"nvvm"``.
options : :class:`ProgramOptions`, optional
Options to customize the compilation process.
"""
def __init__(self, code: str | bytes | bytearray, code_type: str, options: ProgramOptions | None = None):
Program_init(self, code, code_type, options)
def close(self):
"""Destroy this program."""
if self._linker:
self._linker.close()
# Reset handles - the C++ shared_ptr destructor handles cleanup
self._h_nvrtc.reset()
self._h_nvvm.reset()
def compile(
self, target_type: str, name_expressions: tuple | list = (), logs = None
) -> ObjectCode:
"""Compile the program to the specified target type.
Parameters
----------
target_type : str
The compilation target. Must be one of ``"ptx"``, ``"cubin"``, or ``"ltoir"``.
name_expressions : tuple | list, optional
Sequence of name expressions to make accessible in the compiled code.
Used for template instantiation and similar cases.
logs : object, optional
Object with a ``write`` method to receive compilation logs.
Returns
-------
:class:`~cuda.core.ObjectCode`
The compiled object code.
"""
return Program_compile(self, target_type, name_expressions, logs)
@property
def pch_status(self) -> str | None:
"""PCH creation outcome from the most recent :meth:`compile` call.
Possible values:
* ``"created"`` — PCH file was written successfully.
* ``"not_attempted"`` — PCH creation was not attempted (e.g. the
compiler decided not to, or automatic PCH processing skipped it).
* ``"failed"`` — an error prevented PCH creation.
* ``None`` — PCH was not requested, the program has not been
compiled yet, the backend is not NVRTC (e.g. PTX or NVVM),
or the NVRTC bindings are too old to report status.
When ``create_pch`` is set in :class:`ProgramOptions` and the PCH
heap is too small, :meth:`compile` automatically resizes the heap
and retries, so ``"created"`` should be the common outcome.
.. note::
PCH is only supported for ``code_type="c++"`` programs that
use the NVRTC backend. For PTX and NVVM programs this property
always returns ``None``.
"""
return self._pch_status
@property
def backend(self) -> str:
"""Return this Program instance's underlying backend."""
return self._backend
@property
def handle(self) -> ProgramHandleT:
"""Return the underlying handle object.
.. note::
The type of the returned object depends on the backend.
.. caution::
This handle is a Python object. To get the memory address of the underlying C
handle, call ``int(Program.handle)``.
"""
if self._backend == "NVRTC":
return as_py(self._h_nvrtc)
elif self._backend == "NVVM":
return as_py(self._h_nvvm) # returns int (NVVM uses raw integers)
else:
return self._linker.handle
def __repr__(self) -> str:
return f"<Program backend='{self._backend}'>"
# =============================================================================
# Other Public Classes
# =============================================================================
@dataclass
class ProgramOptions:
"""Customizable options for configuring :class:`Program`.
Attributes
----------
name : str, optional
Name of the program. If the compilation succeeds, the name is passed down to the generated `ObjectCode`.
arch : str, optional
Pass the SM architecture value, such as ``sm_<CC>`` (for generating CUBIN) or
``compute_<CC>`` (for generating PTX). If not provided, the current device's architecture
will be used.
relocatable_device_code : bool, optional
Enable (disable) the generation of relocatable device code.
Default: False
extensible_whole_program : bool, optional
Do extensible whole program compilation of device code.
Default: False
debug : bool, optional
Generate debug information. If --dopt is not specified, then turns off all optimizations.
Default: False
lineinfo: bool, optional
Generate line-number information.
Default: False
device_code_optimize : bool, optional
Enable device code optimization. When specified along with '-G', enables limited debug information generation
for optimized device code.
Default: None
ptxas_options : Union[str, list[str]], optional
Specify one or more options directly to ptxas, the PTX optimizing assembler. Options should be strings.
For example ["-v", "-O2"].
Default: None
max_register_count : int, optional
Specify the maximum amount of registers that GPU functions can use.
Default: None
ftz : bool, optional
When performing single-precision floating-point operations, flush denormal values to zero or preserve denormal
values.
Default: False
prec_sqrt : bool, optional
For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.
Default: True
prec_div : bool, optional
For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster
approximation.
Default: True
fma : bool, optional
Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point
multiply-add operations.
Default: True
use_fast_math : bool, optional
Make use of fast math operations.
Default: False
extra_device_vectorization : bool, optional
Enables more aggressive device code vectorization in the NVVM optimizer.
Default: False
link_time_optimization : bool, optional
Generate intermediate code for later link-time optimization.
Default: False
gen_opt_lto : bool, optional
Run the optimizer passes before generating the LTO IR.
Default: False
define_macro : Union[str, tuple[str, str], list[Union[str, tuple[str, str]]]], optional
Predefine a macro. Can be either a string, in which case that macro will be set to 1, a 2 element tuple of
strings, in which case the first element is defined as the second, or a list of strings or tuples.
Default: None
undefine_macro : Union[str, list[str]], optional
Cancel any previous definition of a macro, or list of macros.
Default: None
include_path : Union[str, list[str]], optional
Add the directory or directories to the list of directories to be searched for headers.
Default: None
pre_include : Union[str, list[str]], optional
Preinclude one or more headers during preprocessing. Can be either a string or a list of strings.
Default: None
no_source_include : bool, optional
Disable the default behavior of adding the directory of each input source to the include path.
Default: False
std : str, optional
Set language dialect to C++03, C++11, C++14, C++17 or C++20.
Default: c++17
builtin_move_forward : bool, optional
Provide builtin definitions of std::move and std::forward.
Default: True
builtin_initializer_list : bool, optional
Provide builtin definitions of std::initializer_list class and member functions.
Default: True
disable_warnings : bool, optional
Inhibit all warning messages.
Default: False
restrict : bool, optional
Programmer assertion that all kernel pointer parameters are restrict pointers.
Default: False
device_as_default_execution_space : bool, optional
Treat entities with no execution space annotation as __device__ entities.
Default: False
device_int128 : bool, optional
Allow the __int128 type in device code.
Default: False
optimization_info : str, optional
Provide optimization reports for the specified kind of optimization.
Default: None
no_display_error_number : bool, optional
Disable the display of a diagnostic number for warning messages.
Default: False
diag_error : Union[int, list[int]], optional
Emit error for a specified diagnostic message number or comma separated list of numbers.
Default: None
diag_suppress : Union[int, list[int]], optional
Suppress a specified diagnostic message number or comma separated list of numbers.
Default: None
diag_warn : Union[int, list[int]], optional
Emit warning for a specified diagnostic message number or comma separated lis of numbers.
Default: None
brief_diagnostics : bool, optional
Disable or enable showing source line and column info in a diagnostic.
Default: False
time : str, optional
Generate a CSV table with the time taken by each compilation phase.
Default: None
split_compile : int, optional
Perform compiler optimizations in parallel.
Default: 1
fdevice_syntax_only : bool, optional
Ends device compilation after front-end syntax checking.
Default: False
minimal : bool, optional
Omit certain language features to reduce compile time for small programs.
Default: False
no_cache : bool, optional
Disable compiler caching.
Default: False
fdevice_time_trace : str, optional
Generate time trace JSON for profiling compilation (NVRTC only).
Default: None
device_float128 : bool, optional
Allow __float128 type in device code (NVRTC only).
Default: False
frandom_seed : str, optional
Set random seed for randomized optimizations (NVRTC only).
Default: None
ofast_compile : str, optional
Fast compilation mode: "0", "min", "mid", or "max" (NVRTC only).
Default: None
pch : bool, optional
Use default precompiled header (NVRTC only, CUDA 12.8+).
Default: False
create_pch : str, optional
Create precompiled header file (NVRTC only, CUDA 12.8+).
Default: None
use_pch : str, optional
Use specific precompiled header file (NVRTC only, CUDA 12.8+).
Default: None
pch_dir : str, optional
PCH directory location (NVRTC only, CUDA 12.8+).
Default: None
pch_verbose : bool, optional
Verbose PCH output (NVRTC only, CUDA 12.8+).
Default: False
pch_messages : bool, optional
Control PCH diagnostic messages (NVRTC only, CUDA 12.8+).
Default: False
instantiate_templates_in_pch : bool, optional
Control template instantiation in PCH (NVRTC only, CUDA 12.8+).
Default: False
extra_sources : list of 2-tuples or tuple of 2-tuples, optional
Additional NVVM IR modules to compile together with the main program, specified as
``((name1, source1), (name2, source2), ...)``. Each name is a string identifier used
in diagnostic messages. Each source can be a string (textual LLVM IR) or bytes/bytearray
(LLVM bitcode). Only supported for the NVVM backend.
Default: None
use_libdevice : bool, optional
Load NVIDIA's `libdevice <https://docs.nvidia.com/cuda/libdevice-users-guide/>`_
math builtins library. Only supported for the NVVM backend.
Default: False
"""
name: str | None = "default_program"
arch: str | None = None
relocatable_device_code: bool | None = None
extensible_whole_program: bool | None = None
debug: bool | None = None
lineinfo: bool | None = None
device_code_optimize: bool | None = None
ptxas_options: str | list[str] | tuple[str] | None = None
max_register_count: int | None = None
ftz: bool | None = None
prec_sqrt: bool | None = None
prec_div: bool | None = None
fma: bool | None = None
use_fast_math: bool | None = None
extra_device_vectorization: bool | None = None
link_time_optimization: bool | None = None
gen_opt_lto: bool | None = None
define_macro: str | tuple[str, str] | list[str | tuple[str, str]] | tuple[str | tuple[str, str], ...] | None = None
undefine_macro: str | list[str] | tuple[str] | None = None
include_path: str | list[str] | tuple[str] | None = None
pre_include: str | list[str] | tuple[str] | None = None
no_source_include: bool | None = None
std: str | None = None
builtin_move_forward: bool | None = None
builtin_initializer_list: bool | None = None
disable_warnings: bool | None = None
restrict: bool | None = None
device_as_default_execution_space: bool | None = None
device_int128: bool | None = None
optimization_info: str | None = None
no_display_error_number: bool | None = None
diag_error: int | list[int] | tuple[int] | None = None
diag_suppress: int | list[int] | tuple[int] | None = None
diag_warn: int | list[int] | tuple[int] | None = None
brief_diagnostics: bool | None = None
time: str | None = None
split_compile: int | None = None
fdevice_syntax_only: bool | None = None
minimal: bool | None = None
no_cache: bool | None = None
fdevice_time_trace: str | None = None
device_float128: bool | None = None
frandom_seed: str | None = None
ofast_compile: str | None = None
pch: bool | None = None
create_pch: str | None = None
use_pch: str | None = None
pch_dir: str | None = None
pch_verbose: bool | None = None
pch_messages: bool | None = None
instantiate_templates_in_pch: bool | None = None
extra_sources: list[tuple[str, str | bytes | bytearray]] | tuple[tuple[str, str | bytes | bytearray], ...] | None = None
use_libdevice: bool | None = None # For libdevice execution
numba_debug: bool | None = None # Custom option for Numba debugging
def __post_init__(self):
self._name = self.name.encode()
# Set arch to default if not provided
if self.arch is None:
self.arch = f"sm_{Device().arch}"
if self.extra_sources is not None:
if not is_sequence(self.extra_sources):
raise TypeError(
"extra_sources must be a sequence of 2-tuples: ((name1, source1), (name2, source2), ...)"
)
for i, module in enumerate(self.extra_sources):
if not isinstance(module, tuple) or len(module) != 2:
raise TypeError(
f"Each extra module must be a 2-tuple (name, source)"
f", got {type(module).__name__} at index {i}"
)
module_name, module_source = module
if not isinstance(module_name, str):
raise TypeError(f"Module name at index {i} must be a string, got {type(module_name).__name__}")
if not isinstance(module_source, (str, bytes, bytearray)):
raise TypeError(
f"Module source at index {i} must be str (textual LLVM IR), bytes (textual LLVM IR or bitcode), "
f"or bytearray, got {type(module_source).__name__}"
)
if len(module_source) == 0:
raise ValueError(f"Module source for '{module_name}' (index {i}) cannot be empty")
def _prepare_nvrtc_options(self) -> list[bytes]:
return _prepare_nvrtc_options_impl(self)
def _prepare_nvvm_options(self, as_bytes: bool = True) -> list[bytes] | list[str]:
return _prepare_nvvm_options_impl(self, as_bytes)
def as_bytes(self, backend: str, target_type: str | None = None) -> list[bytes]:
"""Convert program options to bytes format for the specified backend.
This method transforms the program options into a format suitable for the
specified compiler backend. Different backends may use different option names
and formats even for the same conceptual options.
Parameters
----------
backend : str
The compiler backend to prepare options for. Must be either "nvrtc" or "nvvm".
target_type : str, optional
The compilation target type (e.g., "ptx", "cubin", "ltoir"). Some backends
require additional options based on the target type.
Returns
-------
list[bytes]
List of option strings encoded as bytes.
Raises
------
ValueError
If an unknown backend is specified.
CUDAError
If an option incompatible with the specified backend is set.
Examples
--------
>>> options = ProgramOptions(arch="sm_80", debug=True)
>>> nvrtc_options = options.as_bytes("nvrtc")
"""
backend = backend.lower()
if backend == "nvrtc":
return self._prepare_nvrtc_options()
elif backend == "nvvm":
options = self._prepare_nvvm_options(as_bytes=True)
if target_type == "ltoir" and b"-gen-lto" not in options:
options.append(b"-gen-lto")
return options
else:
raise ValueError(f"Unknown backend '{backend}'. Must be one of: 'nvrtc', 'nvvm'")
def __repr__(self):
return f"ProgramOptions(name={self.name!r}, arch={self.arch!r})"
def _prepare_extra_sources_bytes(self) -> list[tuple[bytes, bytes]] | None:
"""Convert extra_sources to bytes format for NVVM."""
if self.extra_sources is None:
return None
result = []
for module_name, module_source in self.extra_sources:
name_bytes = module_name.encode("utf-8")
if isinstance(module_source, str):
source_bytes = module_source.encode("utf-8")
elif isinstance(module_source, bytearray):
source_bytes = bytes(module_source)
else:
source_bytes = module_source
result.append((name_bytes, source_bytes))
return result
# =============================================================================
# Private Classes and Helper Functions
# =============================================================================
# Module-level state for NVVM lazy loading
_nvvm_module = None
_nvvm_import_attempted = False
def _get_nvvm_module():
"""Get the NVVM module, importing it lazily with availability checks."""
global _nvvm_module, _nvvm_import_attempted
if _nvvm_import_attempted:
if _nvvm_module is None:
raise RuntimeError("NVVM module is not available (previous import attempt failed)")
return _nvvm_module
_nvvm_import_attempted = True
try:
version = binding_version()
if version < (12, 9, 0):
raise RuntimeError(
f"NVVM bindings require cuda-bindings >= 12.9.0, but found {'.'.join(map(str, version))}. "
"Please update cuda-bindings to use NVVM features."
)
nvvm = _optional_cuda_import(
"cuda.bindings.nvvm",
probe_function=lambda module: module.version(), # probe triggers libnvvm load
)
if nvvm is None:
raise RuntimeError(
"NVVM support is unavailable: cuda.bindings.nvvm is missing or libnvvm cannot be loaded."
)
_nvvm_module = nvvm
return _nvvm_module
except RuntimeError:
_nvvm_module = None
raise
def _find_libdevice_path():
"""Find libdevice*.bc for NVVM compilation using cuda.pathfinder."""
from cuda.pathfinder import find_bitcode_lib
return find_bitcode_lib("device")
cdef inline bint _process_define_macro_inner(list options, object macro) except? -1:
"""Process a single define macro, returning True if successful."""
if isinstance(macro, str):
options.append(f"--define-macro={macro}")
return True
if isinstance(macro, tuple):
if len(macro) != 2 or any(not isinstance(val, str) for val in macro):
raise RuntimeError(f"Expected define_macro tuple[str, str], got {macro}")
options.append(f"--define-macro={macro[0]}={macro[1]}")
return True
return False
cdef inline void _process_define_macro(list options, object macro) except *:
"""Process define_macro option which can be str, tuple, or list thereof."""
union_type = "Union[str, tuple[str, str]]"
if _process_define_macro_inner(options, macro):
return
if is_nested_sequence(macro):
for seq_macro in macro:
if not _process_define_macro_inner(options, seq_macro):
raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}")
return
raise RuntimeError(f"Expected define_macro {union_type}, list[{union_type}], got {macro}")
cpdef bint _can_load_generated_ptx() except? -1:
"""Check if the driver can load PTX generated by the current NVRTC version."""
drv = driver_version()
nvrtc_major, nvrtc_minor = handle_return(nvrtc.nvrtcVersion())
return (nvrtc_major, nvrtc_minor, 0) <= drv
cdef inline object _translate_program_options(object options):
"""Translate ProgramOptions to LinkerOptions for PTX compilation."""
return LinkerOptions(
name=options.name,
arch=options.arch,
max_register_count=options.max_register_count,
time=options.time,
link_time_optimization=options.link_time_optimization,
debug=options.debug,
lineinfo=options.lineinfo,
ftz=options.ftz,
prec_div=options.prec_div,
prec_sqrt=options.prec_sqrt,
fma=options.fma,
split_compile=options.split_compile,
ptxas_options=options.ptxas_options,
no_cache=options.no_cache,
)
cdef inline int Program_init(Program self, object code, str code_type, object options) except -1:
"""Initialize a Program instance."""
cdef cynvrtc.nvrtcProgram nvrtc_prog
cdef cynvvm.nvvmProgram nvvm_prog
cdef bytes code_bytes
cdef const char* code_ptr
cdef const char* name_ptr
cdef size_t code_len
cdef bytes module_bytes
cdef const char* module_ptr
cdef size_t module_len
self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
code_type = code_type.lower()
self._compile_lock = threading.Lock()
self._use_libdevice = False
self._libdevice_added = False
self._pch_status = None
if code_type == "c++":
assert_type(code, str)
if options.extra_sources is not None:
raise ValueError("extra_sources is not supported by the NVRTC backend (C++ code_type)")
# TODO: support pre-loaded headers & include names
code_bytes = code.encode()
code_ptr = <const char*>code_bytes
name_ptr = <const char*>options._name
with nogil:
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
&nvrtc_prog, code_ptr, name_ptr, 0, NULL, NULL))
self._h_nvrtc = create_nvrtc_program_handle(nvrtc_prog)
self._nvrtc_code = code_bytes
self._backend = "NVRTC"
self._linker = None
elif code_type == "ptx":
assert_type(code, str)
if options.extra_sources is not None:
raise ValueError("extra_sources is not supported by the PTX backend.")
self._linker = Linker(
ObjectCode._init(code.encode(), code_type), options=_translate_program_options(options)
)
self._backend = self._linker.backend()
elif code_type == "nvvm":
_get_nvvm_module() # Validate NVVM availability
if isinstance(code, str):
code = code.encode("utf-8")
elif not isinstance(code, (bytes, bytearray)):
raise TypeError("NVVM IR code must be provided as str, bytes, or bytearray")
code_ptr = <const char*>(<bytes>code)
name_ptr = <const char*>options._name
code_len = len(code)
with nogil:
HANDLE_RETURN_NVVM(NULL, cynvvm.nvvmCreateProgram(&nvvm_prog))
self._h_nvvm = create_nvvm_program_handle(nvvm_prog) # RAII from here
with nogil:
HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(nvvm_prog, code_ptr, code_len, name_ptr))
# Add extra modules if provided
if options.extra_sources is not None:
extra_sources_bytes = options._prepare_extra_sources_bytes()
for module_name_bytes, module_bytes in extra_sources_bytes:
module_ptr = <const char*>module_bytes
module_len = len(module_bytes)
module_name_ptr = <const char*>module_name_bytes
with nogil:
HANDLE_RETURN_NVVM(nvvm_prog, cynvvm.nvvmAddModuleToProgram(
nvvm_prog, module_ptr, module_len, module_name_ptr))
# Store use_libdevice flag
if options.use_libdevice:
self._use_libdevice = True
self._backend = "NVVM"
self._linker = None
else:
supported_code_types = ("c++", "ptx", "nvvm")
assert code_type not in supported_code_types, f"{code_type=}"
if options.use_libdevice:
raise ValueError("use_libdevice is only supported by the NVVM backend")
raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
return 0
cdef object _nvrtc_compile_and_extract(
cynvrtc.nvrtcProgram prog, str target_type, object name_expressions,
object logs, list options_list, str name,
):
"""Run nvrtcCompileProgram on *prog* and extract the output.
This is the inner compile+extract loop, factored out so the PCH
auto-retry path can call it on a fresh program handle.
"""
cdef size_t output_size = 0
cdef size_t logsize = 0
cdef vector[const char*] options_vec
cdef char* data_ptr = NULL
cdef bytes name_bytes
cdef const char* name_ptr = NULL
cdef const char* lowered_name = NULL
cdef dict symbol_mapping = {}
# Add name expressions before compilation
if name_expressions:
for n in name_expressions:
name_bytes = n.encode() if isinstance(n, str) else n
name_ptr = <const char*>name_bytes
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcAddNameExpression(prog, name_ptr))
# Build options array
options_vec.resize(len(options_list))
for i in range(len(options_list)):
options_vec[i] = <const char*>(<bytes>options_list[i])
# Compile
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
# Get compiled output based on target type
if target_type == "ptx":
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTXSize(prog, &output_size))
data = bytearray(output_size)
data_ptr = <char*>(<bytearray>data)
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPTX(prog, data_ptr))
elif target_type == "cubin":
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBINSize(prog, &output_size))
data = bytearray(output_size)
data_ptr = <char*>(<bytearray>data)
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetCUBIN(prog, data_ptr))
else: # ltoir
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIRSize(prog, &output_size))
data = bytearray(output_size)
data_ptr = <char*>(<bytearray>data)
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLTOIR(prog, data_ptr))
# Get lowered names after compilation
if name_expressions:
for n in name_expressions:
name_bytes = n.encode() if isinstance(n, str) else n
name_ptr = <const char*>name_bytes
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetLoweredName(prog, name_ptr, &lowered_name))
symbol_mapping[n] = lowered_name if lowered_name != NULL else None
# Get compilation log if requested
if logs is not None:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLogSize(prog, &logsize))
if logsize > 1:
log = bytearray(logsize)
data_ptr = <char*>(<bytearray>log)
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetProgramLog(prog, data_ptr))
logs.write(log.decode("utf-8", errors="backslashreplace"))
return ObjectCode._init(bytes(data), target_type, symbol_mapping=symbol_mapping, name=name)
cdef int _nvrtc_pch_apis_cached = -1 # -1 = unchecked
cdef bint _has_nvrtc_pch_apis():
global _nvrtc_pch_apis_cached
if _nvrtc_pch_apis_cached < 0:
_nvrtc_pch_apis_cached = hasattr(nvrtc, "nvrtcGetPCHCreateStatus")
return _nvrtc_pch_apis_cached
cdef str _PCH_STATUS_CREATED = "created"
cdef str _PCH_STATUS_NOT_ATTEMPTED = "not_attempted"
cdef str _PCH_STATUS_FAILED = "failed"
cdef str _read_pch_status(cynvrtc.nvrtcProgram prog):
"""Query nvrtcGetPCHCreateStatus and translate to a high-level string."""
cdef cynvrtc.nvrtcResult err
with nogil:
err = cynvrtc.nvrtcGetPCHCreateStatus(prog)
if err == cynvrtc.nvrtcResult.NVRTC_SUCCESS:
return _PCH_STATUS_CREATED
if err == cynvrtc.nvrtcResult.NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED:
return None # sentinel: caller should auto-retry
if err == cynvrtc.nvrtcResult.NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED:
return _PCH_STATUS_NOT_ATTEMPTED
return _PCH_STATUS_FAILED
cdef object Program_compile_nvrtc(Program self, str target_type, object name_expressions, object logs):
"""Compile using NVRTC backend and return ObjectCode."""
cdef cynvrtc.nvrtcProgram prog = as_cu(self._h_nvrtc)
cdef list options_list = self._options.as_bytes("nvrtc", target_type)
result = _nvrtc_compile_and_extract(
prog, target_type, name_expressions, logs, options_list, self._options.name,
)
cdef bint pch_creation_possible = self._options.create_pch or self._options.pch
if not pch_creation_possible or not _has_nvrtc_pch_apis():
self._pch_status = None
return result
try:
status = _read_pch_status(prog)
except RuntimeError as e:
raise RuntimeError(
"PCH was requested but the runtime libnvrtc does not support "
"PCH APIs. Update to CUDA toolkit 12.8 or newer."
) from e
if status is not None:
self._pch_status = status
return result
# Heap exhausted — auto-resize and retry with a fresh program
cdef size_t required = 0
with nogil:
HANDLE_RETURN_NVRTC(prog, cynvrtc.nvrtcGetPCHHeapSizeRequired(prog, &required))
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcSetPCHHeapSize(required))
cdef cynvrtc.nvrtcProgram retry_prog
cdef const char* code_ptr = <const char*>self._nvrtc_code
cdef const char* name_ptr = <const char*>self._options._name
with nogil:
HANDLE_RETURN_NVRTC(NULL, cynvrtc.nvrtcCreateProgram(
&retry_prog, code_ptr, name_ptr, 0, NULL, NULL))
self._h_nvrtc = create_nvrtc_program_handle(retry_prog)
result = _nvrtc_compile_and_extract(
retry_prog, target_type, name_expressions, logs, options_list, self._options.name,
)
status = _read_pch_status(retry_prog)
self._pch_status = status if status is not None else _PCH_STATUS_FAILED
return result
cdef object Program_compile_nvvm(Program self, str target_type, object logs):
"""Compile using NVVM backend and return ObjectCode."""
cdef cynvvm.nvvmProgram prog = as_cu(self._h_nvvm)
cdef size_t output_size = 0
cdef size_t logsize = 0
cdef vector[const char*] options_vec
cdef char* data_ptr = NULL
cdef bytes libdevice_bytes
cdef const char* libdevice_ptr
cdef size_t libdevice_len
# Build options array
options_list = self._options.as_bytes("nvvm", target_type)
options_vec.resize(len(options_list))
for i in range(len(options_list)):
options_vec[i] = <const char*>(<bytes>options_list[i])
# Serialize NVVM program mutation/use per Program instance.
with self._compile_lock:
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmVerifyProgram(prog, <int>options_vec.size(), options_vec.data()))
# Load libdevice if requested - following numba-cuda.
if self._use_libdevice and not self._libdevice_added:
libdevice_path = _find_libdevice_path()
with open(libdevice_path, "rb") as f:
libdevice_bytes = f.read()
libdevice_ptr = <const char*>libdevice_bytes
libdevice_len = len(libdevice_bytes)
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmLazyAddModuleToProgram(
prog, libdevice_ptr, libdevice_len, NULL))
self._libdevice_added = True
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmCompileProgram(prog, <int>options_vec.size(), options_vec.data()))
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResultSize(prog, &output_size))
data = bytearray(output_size)
data_ptr = <char*>(<bytearray>data)
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetCompiledResult(prog, data_ptr))
# Get compilation log if requested
if logs is not None:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLogSize(prog, &logsize))
if logsize > 1:
log = bytearray(logsize)
data_ptr = <char*>(<bytearray>log)
with nogil:
HANDLE_RETURN_NVVM(prog, cynvvm.nvvmGetProgramLog(prog, data_ptr))
logs.write(log.decode("utf-8", errors="backslashreplace"))
return ObjectCode._init(bytes(data), target_type, name=self._options.name)
# Supported target types per backend
cdef dict SUPPORTED_TARGETS = {
"NVRTC": ("ptx", "cubin", "ltoir"),
"NVVM": ("ptx", "ltoir"),
"nvJitLink": ("cubin", "ptx"),
"driver": ("cubin", "ptx"),
}
cdef object Program_compile(Program self, str target_type, object name_expressions, object logs):
"""Compile the program to the specified target type."""
# Validate target_type for this backend
supported = SUPPORTED_TARGETS.get(self._backend)
if supported is None:
raise ValueError(f'Unknown backend="{self._backend}"')
if target_type not in supported:
raise ValueError(
f'Unsupported target_type="{target_type}" for {self._backend} '
f'(supported: {", ".join(repr(t) for t in supported)})'
)
if self._backend == "NVRTC":
if target_type == "ptx" and not _can_load_generated_ptx():
warn(
"The CUDA driver version is older than the backend version. "
"The generated ptx will not be loadable by the current driver.",
stacklevel=2,
category=RuntimeWarning,
)
return Program_compile_nvrtc(self, target_type, name_expressions, logs)
elif self._backend == "NVVM":
return Program_compile_nvvm(self, target_type, logs)
else:
return self._linker.link(target_type)
cdef inline list _prepare_nvrtc_options_impl(object opts):
"""Build NVRTC-specific compiler options."""
options = [f"-arch={opts.arch}"]
if opts.relocatable_device_code is not None:
options.append(f"--relocatable-device-code={_handle_boolean_option(opts.relocatable_device_code)}")
if opts.extensible_whole_program is not None and opts.extensible_whole_program:
options.append("--extensible-whole-program")
if opts.debug is not None and opts.debug:
options.append("--device-debug")
if opts.lineinfo is not None and opts.lineinfo:
options.append("--generate-line-info")
if opts.device_code_optimize is not None and opts.device_code_optimize:
options.append("--dopt=on")
if opts.ptxas_options is not None:
opt_name = "--ptxas-options"
if isinstance(opts.ptxas_options, str):
options.append(f"{opt_name}={opts.ptxas_options}")
elif is_sequence(opts.ptxas_options):
for opt_value in opts.ptxas_options:
options.append(f"{opt_name}={opt_value}")
if opts.max_register_count is not None:
options.append(f"--maxrregcount={opts.max_register_count}")
if opts.ftz is not None:
options.append(f"--ftz={_handle_boolean_option(opts.ftz)}")
if opts.prec_sqrt is not None:
options.append(f"--prec-sqrt={_handle_boolean_option(opts.prec_sqrt)}")
if opts.prec_div is not None:
options.append(f"--prec-div={_handle_boolean_option(opts.prec_div)}")
if opts.fma is not None:
options.append(f"--fmad={_handle_boolean_option(opts.fma)}")
if opts.use_fast_math is not None and opts.use_fast_math:
options.append("--use_fast_math")
if opts.extra_device_vectorization is not None and opts.extra_device_vectorization:
options.append("--extra-device-vectorization")
if opts.link_time_optimization is not None and opts.link_time_optimization:
options.append("--dlink-time-opt")
if opts.gen_opt_lto is not None and opts.gen_opt_lto:
options.append("--gen-opt-lto")
if opts.define_macro is not None:
_process_define_macro(options, opts.define_macro)
if opts.undefine_macro is not None:
if isinstance(opts.undefine_macro, str):
options.append(f"--undefine-macro={opts.undefine_macro}")
elif is_sequence(opts.undefine_macro):
for macro in opts.undefine_macro:
options.append(f"--undefine-macro={macro}")
if opts.include_path is not None:
if isinstance(opts.include_path, str):
options.append(f"--include-path={opts.include_path}")
elif is_sequence(opts.include_path):
for path in opts.include_path:
options.append(f"--include-path={path}")
if opts.pre_include is not None:
if isinstance(opts.pre_include, str):
options.append(f"--pre-include={opts.pre_include}")
elif is_sequence(opts.pre_include):
for header in opts.pre_include:
options.append(f"--pre-include={header}")