@@ -243,6 +243,18 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
243243 // offload *has* a return type, but somehow works without mentioning the place
244244 return IntrinsicResult :: WroteIntoPlace ;
245245 }
246+ sym:: preload => {
247+ if tcx. sess . opts . unstable_opts . offload . is_empty ( ) {
248+ let _ = tcx. dcx ( ) . emit_almost_fatal ( OffloadWithoutEnable ) ;
249+ }
250+
251+ if tcx. sess . lto ( ) != rustc_session:: config:: Lto :: Fat {
252+ let _ = tcx. dcx ( ) . emit_almost_fatal ( OffloadWithoutFatLTO ) ;
253+ }
254+
255+ codegen_offload_preload ( self , tcx, instance, args) ;
256+ return IntrinsicResult :: WroteIntoPlace ;
257+ }
246258 sym:: is_val_statically_known => {
247259 if let OperandValue :: Immediate ( imm) = args[ 0 ] . val {
248260 self . call_intrinsic (
@@ -1905,6 +1917,60 @@ fn codegen_autodiff<'ll, 'tcx>(
19051917 ) ;
19061918}
19071919
1920+ // For each PreLoad *call*, we now use some of our previous declared globals to move data to the gpu.
1921+ // For now, we only handle the data transfer part of it. Consecutive calls become a no-op on the
1922+ // LLVM side.
1923+ //
1924+ // Current steps:
1925+ // 0. Alloca some variables for the following steps
1926+ // 1. set insert point before PreLoad call.
1927+ // 2. generate all the GEPS and stores, to be used in 3)
1928+ // 3. generate __tgt_target_data_begin calls to move data to the GPU
1929+ //
1930+ // unchanged: keep kernel call. Later move the kernel to the GPU
1931+ //
1932+ // 4. set insert point after kernel call.
1933+ // 5. generate all the GEPS and stores, to be used in 6)
1934+ // 6. generate __tgt_target_data_end calls to move data from the GPU
1935+ fn codegen_offload_preload < ' ll , ' tcx > (
1936+ bx : & mut Builder < ' _ , ' ll , ' tcx > ,
1937+ tcx : TyCtxt < ' tcx > ,
1938+ instance : ty:: Instance < ' tcx > ,
1939+ args : & [ OperandRef < ' tcx , & ' ll Value > ] ,
1940+ ) {
1941+ let cx = bx. cx ;
1942+ //let fn_args = instance.args;
1943+
1944+ register_offload ( cx) ;
1945+
1946+ //let OperandValue::Ref(place) = args[0].val else {
1947+ // bug!("expected array operand by reference");
1948+ //};
1949+ //let arg_ty = place.layout.llvm_type(bx.cx);
1950+ ////let workgroup_val = bx.load(arr_ty, place.llval, four);
1951+
1952+ //let a = OffloadMetadata::from_ty(tcx, arg_ty);
1953+
1954+ let arg = & args[ 0 ] ;
1955+
1956+ let arg_ty = arg. layout . ty ;
1957+
1958+ let ty:: Ref ( _, pointee_ty, _) = * arg_ty. kind ( ) else {
1959+ bug ! ( "expected preload argument to be a reference, got {arg_ty:?}" ) ;
1960+ } ;
1961+
1962+ let meta = OffloadMetadata :: from_ty ( tcx, pointee_ty) ;
1963+
1964+ let offload_globals_ref = cx. offload_globals . borrow ( ) ;
1965+ let offload_globals = match offload_globals_ref. as_ref ( ) {
1966+ Some ( globals) => globals,
1967+ None => {
1968+ // Offload is not initialized, cannot continue
1969+ return ;
1970+ }
1971+ } ;
1972+ }
1973+
19081974// Generates the LLVM code to offload a Rust function to a target device (e.g., GPU).
19091975// For each kernel call, it generates the necessary globals (including metadata such as
19101976// size and pass mode), manages memory mapping to and from the device, handles all
0 commit comments