cuda-python/cuda_core/cuda/core/_module.pyx at 4d1ddb4bff40518750fccbc72864ab7dfbab7603 · mdboom/cuda-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from libc.stddef cimport size_t

from collections import namedtuple

from cuda.core._device import Device
from cuda.core._launch_config cimport LaunchConfig
from cuda.core._launch_config import LaunchConfig
from cuda.core._stream cimport Stream
from cuda.core._program import ObjectCodeFormat
from cuda.core._resource_handles cimport (
    LibraryHandle,
    KernelHandle,
    create_library_handle_from_file,
    create_library_handle_from_data,
    create_kernel_handle,
    create_kernel_handle_ref,
    get_kernel_library,
    get_last_error,
    as_cu,
    as_py,
    as_intptr,
)
from cuda.core._stream import Stream
from cuda.core._utils.clear_error_support import (
    assert_type_str_or_bytes_like,
    raise_code_path_meant_to_be_unreachable,
)
from cuda.core._utils.cuda_utils cimport HANDLE_RETURN
from cuda.core._utils.version cimport cy_driver_version
from cuda.core._utils.cuda_utils import driver
from cuda.bindings cimport cydriver

__all__ = ["Kernel", "ObjectCode"]


cdef class KernelAttributes:
    """Read-only, per-device view over the attributes of a kernel.

    :attr:`Kernel.attributes` hands out a view that is not pinned to a
    device: each property access resolves the current device at that
    moment. Indexing a view, ``kernel.attributes[device]``, produces a view
    pinned to ``device`` (a :class:`Device` or an :class:`int` device
    ordinal). Views derived from one another share a single cache, so a
    value fetched through one view is visible through the others.
    """

    def __init__(self, *args, **kwargs):
        raise RuntimeError("KernelAttributes cannot be instantiated directly. Please use Kernel APIs.")

    @staticmethod
    cdef KernelAttributes _init(KernelHandle h_kernel):
        # Factory used instead of __init__ (which always raises): builds the
        # unpinned view with a fresh, empty cache.
        cdef KernelAttributes attrs = KernelAttributes.__new__(KernelAttributes)
        attrs._cache = {}
        attrs._device_id = -1  # negative => resolve the device on every access
        attrs._h_kernel = h_kernel
        return attrs

    cdef KernelAttributes _view_for_device(self, int device_id):
        # Sibling view pinned to ``device_id``; the cache dict is shared, not copied.
        cdef KernelAttributes pinned = KernelAttributes.__new__(KernelAttributes)
        pinned._cache = self._cache
        pinned._device_id = device_id
        pinned._h_kernel = self._h_kernel
        return pinned

    cdef inline int _effective_device_id(self) except? -1:
        # An unpinned view (negative id) defers to whatever ``Device()`` reports.
        if self._device_id < 0:
            return Device().device_id
        return self._device_id

    cdef int _get_cached_attribute(self, int device_id, cydriver.CUfunction_attribute attribute) except? -1:
        """Return ``attribute`` for ``device_id``, asking the driver only on a cache miss."""
        cdef int value
        cdef tuple key = (device_id, <int>attribute)
        # The key tuple doubles as the "missing" marker, so a miss is detected by identity.
        hit = self._cache.get(key, key)
        if hit is key:
            with nogil:
                HANDLE_RETURN(cydriver.cuKernelGetAttribute(
                    &value, attribute, as_cu(self._h_kernel), device_id
                ))
            self._cache[key] = value
            return value
        return hit

    def __getitem__(self, device) -> KernelAttributes:
        """Return a view of these attributes pinned to one device.

        Parameters
        ----------
        device : Device or int
            The device to query, given as a :class:`Device` or as a device
            ordinal (:class:`int`).

        Returns
        -------
        KernelAttributes
            A view bound to ``device`` that shares its cache with this view.
        """
        cdef int device_id = Device(device).device_id
        return self._view_for_device(device_id)

    @property
    def max_threads_per_block(self) -> int:
        """int : Maximum number of threads per block (read-only)."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK)

    @property
    def shared_size_bytes(self) -> int:
        """int : Bytes of statically-allocated shared memory this function requires (read-only)."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)

    @property
    def const_size_bytes(self) -> int:
        """int : Bytes of user-allocated constant memory this function requires (read-only)."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)

    @property
    def local_size_bytes(self) -> int:
        """int : Bytes of local memory used by each thread of this function (read-only)."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)

    @property
    def num_regs(self) -> int:
        """int : Number of registers used by each thread of this function (read-only)."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_NUM_REGS)

    @property
    def ptx_version(self) -> int:
        """int : PTX virtual architecture version the function was compiled for (read-only)."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_PTX_VERSION)

    @property
    def binary_version(self) -> int:
        """int : Binary architecture version the function was compiled for (read-only)."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_BINARY_VERSION)

    @property
    def cache_mode_ca(self) -> bool:
        """bool : Whether the function was compiled with the user-specified option
        "-Xptxas --dlcm=ca" (read-only)."""
        device_id = self._effective_device_id()
        flag = self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA)
        return bool(flag)

    @property
    def max_dynamic_shared_size_bytes(self) -> int:
        """int : Maximum bytes of dynamically-allocated shared memory usable by this function."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(
            device_id, cydriver.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
        )

    @property
    def preferred_shared_memory_carveout(self) -> int:
        """int : Shared memory carveout preference, as a percentage of the total shared memory."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(
            device_id, cydriver.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
        )

    @property
    def cluster_size_must_be_set(self) -> bool:
        """bool : Whether the kernel must be launched with a valid cluster size (read-only)."""
        device_id = self._effective_device_id()
        flag = self._get_cached_attribute(
            device_id, cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET
        )
        return bool(flag)

    @property
    def required_cluster_width(self) -> int:
        """int : Required cluster width, in blocks."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH)

    @property
    def required_cluster_height(self) -> int:
        """int : Required cluster height, in blocks."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT)

    @property
    def required_cluster_depth(self) -> int:
        """int : Required cluster depth, in blocks."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(device_id, cydriver.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH)

    @property
    def non_portable_cluster_size_allowed(self) -> bool:
        """bool : Whether the function may be launched with a non-portable cluster size."""
        device_id = self._effective_device_id()
        flag = self._get_cached_attribute(
            device_id, cydriver.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED
        )
        return bool(flag)

    @property
    def cluster_scheduling_policy_preference(self) -> int:
        """int : Block scheduling policy of the function."""
        device_id = self._effective_device_id()
        return self._get_cached_attribute(
            device_id, cydriver.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
        )


# Result type of KernelOccupancy.max_potential_block_size. The namedtuple
# typename matches the module-level name it is bound to, so ``__name__``, the
# repr, and by-name lookups all agree with what the docs and annotations call
# this type.
MaxPotentialBlockSizeOccupancyResult = namedtuple(
    "MaxPotentialBlockSizeOccupancyResult", ("min_grid_size", "max_block_size")
)


cdef class KernelOccupancy:
    """Occupancy queries for a kernel.

    The methods report occupancy metrics that help determine optimal launch
    parameters such as block size, grid size, and shared memory usage.
    """

    def __init__(self, *args, **kwargs):
        raise RuntimeError("KernelOccupancy cannot be instantiated directly. Please use Kernel APIs.")

    @staticmethod
    cdef KernelOccupancy _init(KernelHandle h_kernel):
        # Factory used instead of __init__ (which always raises).
        cdef KernelOccupancy occ = KernelOccupancy.__new__(KernelOccupancy)
        occ._h_kernel = h_kernel
        return occ

    def max_active_blocks_per_multiprocessor(self, block_size: int, dynamic_shared_memory_size: int) -> int:
        """Maximum number of active blocks per multiprocessor for this kernel.

        Parameters
        ----------
            block_size: int
                Block size parameter used to launch this kernel.
            dynamic_shared_memory_size: int
                Dynamic shared memory, in bytes, needed by a block.
                Use `0` if the block does not need shared memory.

        Returns
        -------
        int
            The maximum number of active blocks per multiprocessor.

        Note
        ----
            The ratio of (maximum active blocks per multiprocessor times the
            block size) to the maximum number of threads per multiprocessor is
            known as theoretical multiprocessor utilization (occupancy).

        """
        cdef int active_blocks
        cdef int threads_per_block = block_size
        cdef size_t smem_bytes = dynamic_shared_memory_size
        cdef cydriver.CUfunction func = <cydriver.CUfunction>as_cu(self._h_kernel)
        with nogil:
            HANDLE_RETURN(cydriver.cuOccupancyMaxActiveBlocksPerMultiprocessor(
                &active_blocks, func, threads_per_block, smem_bytes
            ))
        return active_blocks

    def max_potential_block_size(
        self, dynamic_shared_memory_needed: int | driver.CUoccupancyB2DSize, block_size_limit: int
    ) -> MaxPotentialBlockSizeOccupancyResult:
        """MaxPotentialBlockSizeOccupancyResult: Suggested launch configuration for reasonable occupancy.

        Reports the minimum grid size needed to achieve the maximum occupancy
        together with the maximum block size that can achieve it.

        Parameters
        ----------
            dynamic_shared_memory_needed: Union[int, driver.CUoccupancyB2DSize]
                Dynamic shared memory, in bytes, needed by a block.
                Use `0` if the block does not need shared memory. Pass a
                C-callable represented by :obj:`~driver.CUoccupancyB2DSize`
                when the amount of dynamic shared memory needed varies
                depending on the block size.
            block_size_limit: int
                Known upper limit on the kernel block size. Use `0` to indicate
                the maximum block size permitted by the device / kernel instead

        Returns
        -------
        :obj:`~MaxPotentialBlockSizeOccupancyResult`
            An object with `min_grid_size` and `max_block_size` attributes encoding
            the suggested launch configuration.

        Raises
        ------
        TypeError
            If ``dynamic_shared_memory_needed`` is neither an ``int`` nor a
            ``CUoccupancyB2DSize``.

        Note
        ----
            Please be advised that use of a C-callable that requires the Python
            Global Interpreter Lock may lead to deadlocks.

        """
        cdef int grid_size, block_size
        cdef cydriver.CUfunction func = <cydriver.CUfunction>as_cu(self._h_kernel)
        cdef cydriver.CUoccupancyB2DSize smem_callback
        cdef size_t smem_bytes
        cdef int limit = block_size_limit

        fixed_size = isinstance(dynamic_shared_memory_needed, int)
        if not fixed_size and not isinstance(dynamic_shared_memory_needed, driver.CUoccupancyB2DSize):
            raise TypeError(
                "dynamic_shared_memory_needed expected to have type int, or CUoccupancyB2DSize, "
                f"got {type(dynamic_shared_memory_needed)}"
            )

        if fixed_size:
            smem_bytes = dynamic_shared_memory_needed
            with nogil:
                HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialBlockSize(
                    &grid_size, &block_size, func, NULL, smem_bytes, limit
                ))
        else:
            # The callback may need the GIL, so this call is made without releasing it.
            smem_callback = <cydriver.CUoccupancyB2DSize><size_t>dynamic_shared_memory_needed.getPtr()
            HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialBlockSize(
                &grid_size, &block_size, func, smem_callback, 0, limit
            ))
        return MaxPotentialBlockSizeOccupancyResult(grid_size, block_size)

    def available_dynamic_shared_memory_per_block(self, num_blocks_per_multiprocessor: int, block_size: int) -> int:
        """Dynamic shared memory available per block for a given launch configuration.

        Parameters
        ----------
            num_blocks_per_multiprocessor: int
                Number of blocks to be concurrently executing on a multiprocessor.
            block_size: int
                Block size parameter used to launch this kernel.

        Returns
        -------
        int
            Dynamic shared memory available per block, in bytes, for the given
            launch configuration.
        """
        cdef size_t available_bytes
        cdef int blocks_per_sm = num_blocks_per_multiprocessor
        cdef int threads_per_block = block_size
        cdef cydriver.CUfunction func = <cydriver.CUfunction>as_cu(self._h_kernel)
        with nogil:
            HANDLE_RETURN(cydriver.cuOccupancyAvailableDynamicSMemPerBlock(
                &available_bytes, func, blocks_per_sm, threads_per_block
            ))
        return available_bytes

    def max_potential_cluster_size(self, config: LaunchConfig, stream: Stream | None = None) -> int:
        """Maximum potential cluster size for this kernel and launch configuration.

        Parameters
        ----------
            config: :obj:`~_launch_config.LaunchConfig`
                Kernel launch configuration. Cluster dimensions in the configuration are ignored.
            stream: :obj:`~Stream`, optional
                The stream on which this kernel is to be launched.

        Returns
        -------
        int
            The maximum cluster size that can be launched for this kernel and launch configuration.
        """
        cdef cydriver.CUlaunchConfig native_cfg = (<LaunchConfig>config)._to_native_launch_config()
        if stream is not None:
            # Only an explicitly supplied stream overrides the one in the native config.
            native_cfg.hStream = as_cu((<Stream>stream)._h_stream)
        cdef int max_size
        cdef cydriver.CUfunction func = <cydriver.CUfunction>as_cu(self._h_kernel)
        with nogil:
            HANDLE_RETURN(cydriver.cuOccupancyMaxPotentialClusterSize(&max_size, func, &native_cfg))
        return max_size

    def max_active_clusters(self, config: LaunchConfig, stream: Stream | None = None) -> int:
        """Maximum number of clusters that could execute concurrently on the target device.

        Parameters
        ----------
            config: :obj:`~_launch_config.LaunchConfig`
                Kernel launch configuration.
            stream: :obj:`~Stream`, optional
                The stream on which this kernel is to be launched.

        Returns
        -------
        int
            The maximum number of clusters that could co-exist on the target device.
        """
        cdef cydriver.CUlaunchConfig native_cfg = (<LaunchConfig>config)._to_native_launch_config()
        if stream is not None:
            # Only an explicitly supplied stream overrides the one in the native config.
            native_cfg.hStream = as_cu((<Stream>stream)._h_stream)
        cdef int active_clusters
        cdef cydriver.CUfunction func = <cydriver.CUfunction>as_cu(self._h_kernel)
        with nogil:
            HANDLE_RETURN(cydriver.cuOccupancyMaxActiveClusters(&active_clusters, func, &native_cfg))
        return active_clusters


# (offset, size) record describing one kernel parameter; see Kernel.arguments_info.
ParamInfo = namedtuple("ParamInfo", "offset size")


cdef class Kernel:
    """Represent a compiled kernel that has been loaded onto the device.

    Kernel instances can be executed by passing them directly to the
    :func:`~launch` function.

    Directly creating a :obj:`~_module.Kernel` is not supported; instances
    should instead be created through a :obj:`~_module.ObjectCode` object.

    """

    def __init__(self, *args, **kwargs):
        raise RuntimeError("Kernel objects cannot be instantiated directly. Please use ObjectCode APIs.")

    @staticmethod
    cdef Kernel _from_handle(KernelHandle h_kernel):
        # Internal factory; the attribute and occupancy helpers are built
        # lazily on first access, so they start out as None.
        cdef Kernel kernel = Kernel.__new__(Kernel)
        kernel._occupancy = None
        kernel._attributes = None
        kernel._h_kernel = h_kernel
        return kernel

    @property
    def attributes(self) -> KernelAttributes:
        """Get the read-only attributes of this kernel."""
        if self._attributes is not None:
            return self._attributes
        self._attributes = KernelAttributes._init(self._h_kernel)
        return self._attributes

    cdef tuple _get_arguments_info(self, bint param_info=False):
        # Returns ``(argument_count, infos)``. ``infos`` is filled with one
        # ParamInfo per argument only when ``param_info`` is true; otherwise it
        # stays empty and only the count is meaningful.
        version = cy_driver_version()
        if version < (12, 4, 0):
            raise NotImplementedError(
                "Driver version 12.4 or newer is required for this function. "
                f"Using driver version {'.'.join(map(str, version))}"
            )
        cdef size_t count = 0
        cdef list infos = []
        cdef cydriver.CUkernel cu_kernel = as_cu(self._h_kernel)
        cdef size_t offset, size
        cdef cydriver.CUresult status
        # Probe parameter indices 0, 1, 2, ... until the driver reports an error.
        while True:
            with nogil:
                status = cydriver.cuKernelGetParamInfo(cu_kernel, count, &offset, &size)
            if status != cydriver.CUDA_SUCCESS:
                # CUDA_ERROR_INVALID_VALUE is taken as "index past the last
                # parameter"; any other error is surfaced to the caller.
                if status != cydriver.CUDA_ERROR_INVALID_VALUE:
                    HANDLE_RETURN(status)
                break
            if param_info:
                infos.append(ParamInfo(offset=offset, size=size))
            count += 1
        return count, infos

    @property
    def num_arguments(self) -> int:
        """int : The number of arguments of this function"""
        return self._get_arguments_info()[0]

    @property
    def arguments_info(self) -> list[ParamInfo]:
        """list[ParamInfo]: (offset, size) for each argument of this function"""
        return self._get_arguments_info(param_info=True)[1]

    @property
    def occupancy(self) -> KernelOccupancy:
        """Get the occupancy information for launching this kernel."""
        if self._occupancy is not None:
            return self._occupancy
        self._occupancy = KernelOccupancy._init(self._h_kernel)
        return self._occupancy

    @property
    def handle(self):
        """Return the underlying kernel handle object.

        .. caution::

            This handle is a Python object. To get the memory address of the underlying C
            handle, call ``int(Kernel.handle)``.
        """
        return as_py(self._h_kernel)

    @property
    def _handle(self):
        # Private alias that simply forwards to the public ``handle`` property.
        return self.handle

    @staticmethod
    def from_handle(handle, mod: ObjectCode = None) -> Kernel:
        """Creates a new :obj:`Kernel` object from a kernel handle.

        Parameters
        ----------
        handle : int
            Kernel handle representing the address of a foreign
            kernel object (CUkernel).
        mod : :obj:`ObjectCode`, optional
            The ObjectCode object associated with this kernel. Provides
            library lifetime for foreign kernels not created by
            cuda.core.

        Raises
        ------
        TypeError
            If ``handle`` is not an integer.
        """
        if not isinstance(handle, int):
            raise TypeError(f"handle must be an integer, got {type(handle).__name__}")

        cdef cydriver.CUkernel raw_kernel = <cydriver.CUkernel><void*><size_t>handle
        cdef KernelHandle h_kernel = create_kernel_handle_ref(raw_kernel)
        if not h_kernel:
            HANDLE_RETURN(get_last_error())

        cdef LibraryHandle h_known_lib = get_kernel_library(h_kernel)
        cdef LibraryHandle h_given_lib

        if mod is not None:
            h_given_lib = (<ObjectCode>mod)._h_library
            # The comparison is only meaningful when both library handles are set.
            if h_known_lib and h_given_lib:
                if as_cu(h_known_lib) != as_cu(h_given_lib):
                    import warnings
                    warnings.warn(
                        "The library from the provided ObjectCode does not match "
                        "the library associated with this kernel.",
                        stacklevel=2,
                    )

        cdef Kernel kernel = Kernel._from_handle(h_kernel)
        if mod is not None:
            # With no library already associated with the kernel, hold a
            # reference to ``mod`` on the new Kernel object.
            if not h_known_lib:
                kernel._keepalive = mod
        return kernel

    def __eq__(self, other) -> bool:
        if isinstance(other, Kernel):
            return as_intptr(self._h_kernel) == as_intptr((<Kernel>other)._h_kernel)
        return NotImplemented

    def __hash__(self) -> int:
        # Hash on the same integer that __eq__ compares.
        return hash(as_intptr(self._h_kernel))

    def __repr__(self) -> str:
        address = as_intptr(self._h_kernel)
        return f"<Kernel handle={address:#x}>"


# Union of the Python types a code payload may take. ObjectCode._lazy_load_module
# loads a ``str`` as a file path and ``bytes``/``bytearray`` as in-memory data.
CodeTypeT = bytes | bytearray | str

# All ObjectCodeFormat members; ObjectCode._init asserts its ``code_type`` is one of these.
cdef tuple _supported_code_type = tuple(ObjectCodeFormat.__members__.values())

cdef class ObjectCode:
    """Represent a compiled program to be loaded onto the device.

    This object provides a unified interface for different types of
    compiled programs that will be loaded onto the device.

    Note
    ----
    This class has no default constructor. If you already have a cubin that you would
    like to load, use the :meth:`from_cubin` alternative constructor. Constructing directly
    from all other possible code types should be avoided in favor of compilation through
    :class:`~cuda.core.Program`
    """

    def __init__(self, *args, **kwargs):
        # Direct construction is always rejected; instances come from the
        # alternative constructors named in the message.
        message = (
            "ObjectCode objects cannot be instantiated directly. "
            "Please use ObjectCode APIs (from_cubin, from_ptx) or Program APIs (compile)."
        )
        raise RuntimeError(message)

    @classmethod
    def _init(cls, module, code_type, *, name: str = "", symbol_mapping: dict | None = None):
        # Shared backend of the ``from_*`` constructors. Only records the
        # inputs; the library itself is loaded later by _lazy_load_module.
        assert code_type in _supported_code_type, f"{code_type=} is not supported"
        cdef ObjectCode code = ObjectCode.__new__(ObjectCode)

        # Empty handle for now; _lazy_load_module assigns the real one.
        code._h_library = LibraryHandle()

        code._module = module
        code._code_type = str(code_type)
        code._name = name or ""
        code._sym_map = symbol_mapping if symbol_mapping is not None else {}

        return code

    @staticmethod
    def _reduce_helper(module, code_type, name, symbol_mapping):
        # Reconstruction callable returned by __reduce__; takes positional
        # arguments and forwards them to the keyword-based _init.
        return ObjectCode._init(module, code_type, symbol_mapping=symbol_mapping, name=name or "")

    def __reduce__(self):
        # Pickle as (callable, args); the loaded library handle is not part of the state.
        state = (self._module, self._code_type, self._name, self._sym_map)
        return ObjectCode._reduce_helper, state

    @staticmethod
    def from_cubin(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode:
        """Build an :class:`ObjectCode` from an existing cubin.

        Parameters
        ----------
        module : Union[bytes, str]
            The cubin to load: either a bytes object holding it in memory,
            or a string with the path of the cubin file on disk.
        name : Optional[str]
            A human-readable identifier for this code object.
        symbol_mapping : Optional[dict]
            Maps unmangled symbol names (keys) to the mangled names used
            when retrieving them (defaults to no mappings).
        """
        return ObjectCode._init(module, ObjectCodeFormat.CUBIN, symbol_mapping=symbol_mapping, name=name)

    @staticmethod
    def from_ptx(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode:
        """Build an :class:`ObjectCode` from existing PTX.

        Parameters
        ----------
        module : Union[bytes, str]
            The PTX code to load: either a bytes object holding it in
            memory, or a string with the path of the PTX file on disk.
        name : Optional[str]
            A human-readable identifier for this code object.
        symbol_mapping : Optional[dict]
            Maps unmangled symbol names (keys) to the mangled names used
            when retrieving them (defaults to no mappings).
        """
        return ObjectCode._init(module, ObjectCodeFormat.PTX, symbol_mapping=symbol_mapping, name=name)

    @staticmethod
    def from_ltoir(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode:
        """Build an :class:`ObjectCode` from existing LTOIR.

        Parameters
        ----------
        module : Union[bytes, str]
            The LTOIR code to load: either a bytes object holding it in
            memory, or a string with the path of the LTOIR file on disk.
        name : Optional[str]
            A human-readable identifier for this code object.
        symbol_mapping : Optional[dict]
            Maps unmangled symbol names (keys) to the mangled names used
            when retrieving them (defaults to no mappings).
        """
        return ObjectCode._init(module, ObjectCodeFormat.LTOIR, symbol_mapping=symbol_mapping, name=name)

    @staticmethod
    def from_fatbin(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode:
        """Build an :class:`ObjectCode` from an existing fatbin.

        Parameters
        ----------
        module : Union[bytes, str]
            The fatbin to load: either a bytes object holding it in memory,
            or a string with the path of the fatbin file on disk.
        name : Optional[str]
            A human-readable identifier for this code object.
        symbol_mapping : Optional[dict]
            Maps unmangled symbol names (keys) to the mangled names used
            when retrieving them (defaults to no mappings).
        """
        return ObjectCode._init(module, ObjectCodeFormat.FATBIN, symbol_mapping=symbol_mapping, name=name)

    @staticmethod
    def from_object(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode:
        """Build an :class:`ObjectCode` from existing object code.

        Parameters
        ----------
        module : Union[bytes, str]
            The object code to load: either a bytes object holding it in
            memory, or a string with the path of the object file on disk.
        name : Optional[str]
            A human-readable identifier for this code object.
        symbol_mapping : Optional[dict]
            Maps unmangled symbol names (keys) to the mangled names used
            when retrieving them (defaults to no mappings).
        """
        return ObjectCode._init(module, ObjectCodeFormat.OBJECT, symbol_mapping=symbol_mapping, name=name)

    @staticmethod
    def from_library(module: bytes | str, *, name: str = "", symbol_mapping: dict | None = None) -> ObjectCode:
        """Build an :class:`ObjectCode` from an existing library.

        Parameters
        ----------
        module : Union[bytes, str]
            The library to load: either a bytes object holding it in memory,
            or a string with the path of the library file on disk.
        name : Optional[str]
            A human-readable identifier for this code object.
        symbol_mapping : Optional[dict]
            Maps unmangled symbol names (keys) to the mangled names used
            when retrieving them (defaults to no mappings).
        """
        return ObjectCode._init(module, ObjectCodeFormat.LIBRARY, symbol_mapping=symbol_mapping, name=name)

    # TODO: do we want to unload in a finalizer? Probably not.

    cdef int _lazy_load_module(self) except -1:
        # Create the library handle on first use and cache it in
        # ``self._h_library``. Returns 0 when a handle is available; errors
        # are propagated as Python exceptions (signalled to C callers by -1).
        cdef bytes encoded_path

        # Nothing to do when a handle has already been created.
        if self._h_library:
            return 0

        source = self._module
        assert_type_str_or_bytes_like(source)

        if isinstance(source, str):
            # A string is a path to the code on disk. ``encoded_path`` keeps
            # the encoded bytes alive while the C pointer is in use.
            encoded_path = source.encode()
            self._h_library = create_library_handle_from_file(<const char*>encoded_path)
        elif isinstance(source, (bytes, bytearray)):
            # Bytes-like input is the in-memory image itself.
            self._h_library = create_library_handle_from_data(<const void*><char*>source)
        else:
            raise_code_path_meant_to_be_unreachable()
            return -1

        # A null handle means creation failed; hand the recorded error to
        # HANDLE_RETURN (presumably raises for a non-success code).
        if not self._h_library:
            HANDLE_RETURN(get_last_error())
        return 0

    def get_kernel(self, name) -> Kernel:
        """Return the :obj:`~_module.Kernel` of a specified name from this object code.

        The library is loaded on first use. ``name`` is first looked up in the
        symbol mapping; when no entry exists it is used as given. In both
        cases a ``str`` name is encoded to bytes before it is passed on.

        Parameters
        ----------
        name : str | bytes
            Name of the kernel to retrieve.

        Returns
        -------
        :obj:`~_module.Kernel`
            Newly created kernel object.

        Raises
        ------
        RuntimeError
            If the code type of this object is not one of ``CUBIN``, ``PTX``
            or ``FATBIN``.

        """
        self._lazy_load_module()
        supported_code_types = (ObjectCodeFormat.CUBIN, ObjectCodeFormat.PTX, ObjectCodeFormat.FATBIN)
        if self._code_type not in supported_code_types:
            raise RuntimeError(f'Unsupported code type "{self._code_type}" ({supported_code_types=})')
        try:
            name = self._sym_map[name]
        except KeyError:
            # No mapping registered for this symbol: use the name as given.
            pass
        # Encode after the lookup so that a ``str`` value coming from the
        # symbol mapping is handled the same way as a ``str`` passed in
        # directly; the C-string cast below needs a bytes object.
        if isinstance(name, str):
            name = name.encode()

        cdef KernelHandle h_kernel = create_kernel_handle(self._h_library, <const char*>name)
        if not h_kernel:
            HANDLE_RETURN(get_last_error())
        return Kernel._from_handle(h_kernel)

    @property
    def code(self) -> CodeTypeT:
        """Return the underlying code object.

        This is the stored ``_module`` value, returned unchanged. The loader
        in this class treats a ``str`` as a file path and ``bytes`` /
        ``bytearray`` as the in-memory code.
        """
        return self._module

    @property
    def name(self) -> str:
        """Return a human-readable name of this code object.

        The stored ``_name`` value is returned unchanged.
        """
        return self._name

    @property
    def code_type(self) -> str:
        """Return the type of the underlying code object.

        This is the value that ``get_kernel`` checks against the supported
        ``ObjectCodeFormat`` members.
        """
        # NOTE(review): annotated as ``str`` but compared with ObjectCodeFormat
        # members elsewhere in this class -- confirm the stored type.
        return self._code_type

    @property
    def symbol_mapping(self) -> dict:
        """Return a copy of the symbol mapping dictionary.

        A new ``dict`` is built on every access, so adding or removing
        entries in the returned dictionary does not change the mapping
        consulted by ``get_kernel``.
        """
        # Shallow copy: the keys and values themselves are shared.
        return dict(self._sym_map)

    @property
    def handle(self):
        """Return the underlying handle object.

        Accessing this property loads the library first if it has not been
        loaded yet, so any loading error surfaces here.

        .. caution::

            This handle is a Python object. To get the memory address of the underlying C
            handle, call ``int(ObjectCode.handle)``.
        """
        # Make sure the library handle exists before wrapping it.
        self._lazy_load_module()
        return as_py(self._h_library)

    def __eq__(self, other) -> bool:
        # Two ObjectCode instances are equal when their library handles have
        # the same integer value; any other operand type is left to Python's
        # fallback by returning NotImplemented.
        cdef ObjectCode rhs
        if isinstance(other, ObjectCode):
            rhs = <ObjectCode>other
            # Both sides must be loaded before their handles can be compared.
            self._lazy_load_module()
            rhs._lazy_load_module()
            return as_intptr(self._h_library) == as_intptr(rhs._h_library)
        return NotImplemented

    def __hash__(self) -> int:
        # The hash is derived from the integer value of the library handle,
        # the same value that __eq__ compares.
        # Trigger lazy load to get the handle
        self._lazy_load_module()
        return hash(as_intptr(self._h_library))

    def __repr__(self) -> str:
        # Trigger lazy load to get the handle
        # (so building the repr may load the library as a side effect).
        self._lazy_load_module()
        # The handle's integer value is rendered in hex with a ``0x`` prefix,
        # followed by the code type in single quotes.
        return f"<ObjectCode handle={as_intptr(self._h_library):#x} code_type='{self._code_type}'>"