@@ -97,8 +97,18 @@ def cuda_array_interface_dict(data: _CudaArrayLikeArg) -> CudaArrayInf:
9797 if array_hasobject (data ):
9898 raise ValueError ("Input data contains `object` dtype. Expecting numeric data." )
9999 ainf = data .__cuda_array_interface__
100- if "mask" in ainf :
101- ainf ["mask" ] = ainf ["mask" ].__cuda_array_interface__ # type: ignore[union-attr]
100+ if "mask" in ainf and ainf ["mask" ] is not None :
101+ mask_ainf = ainf ["mask" ].__cuda_array_interface__ # type: ignore[union-attr]
102+ # Normalize the validity mask to XGBoost's expected layout (`|t1` bit field of
103+ # length `n_samples`).
104+ typestr = mask_ainf ["typestr" ]
105+ n_samples = ainf ["shape" ][0 ]
106+ if typestr [1 ] in ("u" , "i" , "t" ) and typestr [2 :] == "1" and n_samples :
107+ mask_ainf = dict (mask_ainf )
108+ mask_ainf ["typestr" ] = "|t1"
109+ mask_ainf ["shape" ] = (n_samples ,)
110+ mask_ainf .pop ("strides" , None )
111+ ainf ["mask" ] = mask_ainf
102112 return ainf
103113
104114
@@ -509,79 +519,43 @@ def check_cudf_meta(data: _CudaArrayLikeArg, field: str) -> None:
509519 raise ValueError (f"Missing value is not allowed for: { field } " )
510520
511521
512- class ArrowSchema (ctypes .Structure ):
513- """The Schema type from arrow C array."""
514-
515- _fields_ = [
516- ("format" , ctypes .c_char_p ),
517- ("name" , ctypes .c_char_p ),
518- ("metadata" , ctypes .c_char_p ),
519- ("flags" , ctypes .c_int64 ),
520- ("n_children" , ctypes .c_int64 ),
521- ("children" , ctypes .POINTER (ctypes .c_void_p )),
522- ("dictionary" , ctypes .c_void_p ),
523- ("release" , ctypes .c_void_p ),
524- ("private_data" , ctypes .c_void_p ),
525- ]
526-
527-
528- class ArrowArray (ctypes .Structure ):
529- """The Array type from arrow C array."""
530-
531-
532- ArrowArray ._fields_ = [ # pylint: disable=protected-access
533- ("length" , ctypes .c_int64 ),
534- ("null_count" , ctypes .c_int64 ),
535- ("offset" , ctypes .c_int64 ),
536- ("n_buffers" , ctypes .c_int64 ),
537- ("n_children" , ctypes .c_int64 ),
538- ("buffers" , ctypes .POINTER (ctypes .c_void_p )),
539- ("children" , ctypes .POINTER (ctypes .POINTER (ArrowArray ))),
540- ("dictionary" , ctypes .POINTER (ArrowArray )),
541- ("release" , ctypes .c_void_p ),
542- ("private_data" , ctypes .c_void_p ),
543- ]
544-
545-
546- class ArrowDeviceArray (ctypes .Structure ):
547- """The Array type from arrow C device array."""
548-
549- _fields_ = [
550- ("array" , ArrowArray ),
551- ("device_id" , ctypes .c_int64 ),
552- ("device_type" , ctypes .c_int32 ),
553- ("sync_event" , ctypes .c_void_p ),
554- ("reserved" , ctypes .c_int64 * 3 ),
555- ]
556-
557-
558- PyCapsule_GetName = ctypes .pythonapi .PyCapsule_GetName
559- PyCapsule_GetName .restype = ctypes .c_char_p
560- PyCapsule_GetName .argtypes = [ctypes .py_object ]
561-
562-
563- PyCapsule_GetPointer = ctypes .pythonapi .PyCapsule_GetPointer
564- PyCapsule_GetPointer .restype = ctypes .c_void_p
565- PyCapsule_GetPointer .argtypes = [ctypes .py_object , ctypes .c_char_p ]
566-
567-
568- def wait_event (event_hdl : int ) -> None :
569- """Wait for CUDA event exported by arrow."""
570- # cuda-python is a dependency of cuDF.
571- from cuda .bindings import runtime as cudart
522+ def _cudf_str_cat_inf (cats : DfCatAccessor ) -> Tuple [CudaStringArray , Tuple ]:
523+ """String category index path for :py:func:`cudf_cat_inf`."""
524+ import pylibcudf as plc # pylint: disable=import-outside-toplevel
572525
573- event = ctypes .cast (event_hdl , ctypes .POINTER (ctypes .c_int64 ))
574- (status ,) = cudart .cudaStreamWaitEvent (
526+ # pylint: disable=protected-access
527+ plc_col = cats ._column .to_pylibcudf ()
528+ if plc_col .type ().id () != plc .TypeId .STRING :
529+ raise TypeError (
530+ "Unexpected type for category index. It's neither numeric nor string."
531+ )
532+ # Categories should not have missing values nor a non-zero logical offset.
533+ assert plc_col .null_count () == 0
534+ assert plc_col .offset () == 0
535+
536+ off_child = plc_col .children ()[0 ] # offsets
537+ assert off_child .type ().id () == plc .TypeId .INT32 , "Expected INT32 string offsets."
538+
539+ # String category index in arrow format
540+ jdata : CudaArrayInf = _arrow_buf_inf (
541+ plc_col .data ().__cuda_array_interface__ ["data" ][0 ],
542+ "|i1" ,
543+ 0 ,
575544 STREAM_PER_THREAD ,
576- event .contents .value ,
577- cudart .cudaEventWaitDefault ,
578545 )
579- if status != cudart .cudaError_t .cudaSuccess :
580- _ , msg = cudart .cudaGetErrorString (status )
581- raise ValueError (msg )
546+ joffset : CudaArrayInf = _arrow_buf_inf (
547+ off_child .data ().__cuda_array_interface__ ["data" ][0 ],
548+ "<i4" ,
549+ off_child .size (),
550+ STREAM_PER_THREAD ,
551+ )
552+ jnames : CudaStringArray = {"offsets" : joffset , "values" : jdata }
553+ # Keep `plc_col` alive: it owns the GPU buffers pointed to by `jdata` and
554+ # `joffset`.
555+ return jnames , (plc_col ,)
582556
583557
584- def cudf_cat_inf ( # pylint: disable=too-many-locals
558+ def cudf_cat_inf (
585559 cats : DfCatAccessor , codes : "pd.Series"
586560) -> Tuple [Union [CudaArrayInf , CudaStringArray ], ArrayInf , Tuple ]:
587561 """Obtain the cuda array interface for cuDF categories."""
@@ -594,55 +568,9 @@ def cudf_cat_inf( # pylint: disable=too-many-locals
594568 codes_ainf = cuda_array_interface_dict (codes )
595569 return cats_ainf , codes_ainf , (cats , codes )
596570
597- # pylint: disable=protected-access
598- arrow_col = cats ._column .to_pylibcudf (mode = "read" )
599- # Tuple[types.CapsuleType, types.CapsuleType]
600- schema , array = arrow_col .__arrow_c_device_array__ ()
601-
602- array_ptr = PyCapsule_GetPointer (array , PyCapsule_GetName (array ))
603- schema_ptr = PyCapsule_GetPointer (schema , PyCapsule_GetName (schema ))
604-
605- # Cast to arrow array
606- arrow_device_array = ctypes .cast (
607- array_ptr , ctypes .POINTER (ArrowDeviceArray )
608- ).contents
609- wait_event (arrow_device_array .sync_event )
610- assert arrow_device_array .device_type == 2 # 2 is CUDA
611-
612- arrow_array = arrow_device_array .array
613- mask , offset , data = (
614- arrow_array .buffers [0 ],
615- arrow_array .buffers [1 ],
616- arrow_array .buffers [2 ],
617- )
618- # Categories should not have missing values.
619- assert mask is None
620- assert arrow_array .n_children == 0
621- assert arrow_array .n_buffers == 3
622- assert arrow_array .offset == 0
623-
624- # Cast to ArrowSchema
625- arrow_schema = ctypes .cast (schema_ptr , ctypes .POINTER (ArrowSchema )).contents
626- assert arrow_schema .format in (b"u" , b"U" , b"vu" ) # utf8, large utf8
627- if arrow_schema .format in (b"u" , b"vu" ):
628- joffset : CudaArrayInf = _arrow_buf_inf (
629- offset , "<i4" , arrow_array .length + 1 , STREAM_PER_THREAD
630- )
631- elif arrow_schema .format == b"U" :
632- raise TypeError ("Large string for category index (names) is not supported." )
633- else :
634- raise TypeError (
635- "Unexpected type for category index. It's neither numeric nor string."
636- )
637- # 0 size for unknown
638- jdata : CudaArrayInf = _arrow_buf_inf (data , "|i1" , 0 , STREAM_PER_THREAD )
639- jnames : CudaStringArray = {
640- "offsets" : joffset ,
641- "values" : jdata ,
642- }
643-
571+ jnames , buf = _cudf_str_cat_inf (cats )
644572 jcodes = cuda_array_interface_dict (codes )
645- return jnames , jcodes , ( arrow_col ,)
573+ return jnames , jcodes , buf
646574
647575
648576class Categories :
0 commit comments