@@ -1134,16 +1134,60 @@ impl<'ll, 'tcx, 'a> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     // Atomic Operations
     fn atomic_cmpxchg(
         &mut self,
-        _dst: &'ll Value,
-        _cmp: &'ll Value,
-        _src: &'ll Value,
-        _order: AtomicOrdering,
-        _failure_order: AtomicOrdering,
-        _weak: bool,
+        dst: &'ll Value,
+        cmp: &'ll Value,
+        src: &'ll Value,
+        order: AtomicOrdering,
+        failure_order: AtomicOrdering,
+        weak: bool,
     ) -> (&'ll Value, &'ll Value) {
-        // allowed but only for some things and with restrictions
-        // https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html#cmpxchg-instruction
-        self.fatal("atomic cmpxchg is not supported")
+        // LLVM verifier rejects cases where the `failure_order` is stronger than `order`.
+        match (order, failure_order) {
+            (AtomicOrdering::SeqCst, _) => (),
+            (_, AtomicOrdering::Relaxed) => (),
+            (AtomicOrdering::Release, AtomicOrdering::Release)
+            | (AtomicOrdering::Release, AtomicOrdering::Acquire)
+            | (AtomicOrdering::Acquire, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::AcqRel, AtomicOrdering::Acquire) => (),
+            (AtomicOrdering::Relaxed, _)
+            | (_, AtomicOrdering::Release | AtomicOrdering::AcqRel | AtomicOrdering::SeqCst) => {
+                // Invalid cmpxchg - failure order stronger than order!
+                self.abort();
+                return (self.const_undef(self.val_ty(cmp)), self.const_undef(self.type_i1()));
+            }
+        };
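+        // (`abort` traps at runtime, so the `undef` results returned in the invalid arm
+        // above are never actually observed.)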
+        let res = self.atomic_op(
+            dst,
+            |builder, dst| {
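+                // Pointer is in a supported (global/shared) space: emit a real atomic cmpxchg.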
+                unsafe {
+                    llvm::LLVMRustBuildAtomicCmpXchg(
+                        builder.llbuilder,
+                        dst,
+                        cmp,
+                        src,
+                        crate::llvm::AtomicOrdering::from_generic(order),
+                        crate::llvm::AtomicOrdering::from_generic(failure_order),
+                        weak as u32,
+                    )
+                }
+            },
+            |builder, dst| {
+                // Local space is only accessible to the current thread.
+                // So, there are no synchronization issues, and we can emulate it using a simple load / compare / store.
+                let load: &'ll Value = unsafe { llvm::LLVMBuildLoad(builder.llbuilder, dst, UNNAMED) };
+                let compare = builder.icmp(IntPredicate::IntEQ, load, cmp);
+                // We can do something smart & branchless here:
+                // We select either the current value (if the comparison fails), or a new value.
+                // We then *unconditionally* write that back to local memory (which is very, very cheap).
+                let value = builder.select(compare, src, load);
+                unsafe { llvm::LLVMBuildStore(builder.llbuilder, value, dst) };
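+                // Pack (loaded value, success flag) into the same `{ T, i1 }` aggregate
+                // that a real `cmpxchg` instruction produces.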
+                let res_type = builder.type_struct(&[builder.val_ty(cmp), builder.type_ix(1)], false);
+                let res = builder.const_undef(res_type);
+                let res = builder.insert_value(res, load, 0);
+                let res = builder.insert_value(res, compare, 1);
+                res
+            },
+        );
+        let val = self.extract_value(res, 0);
+        let success = self.extract_value(res, 1);
+        (val, success)
     }
     fn atomic_rmw(
         &mut self,
@@ -1609,3 +1653,96 @@ impl<'a, 'll, 'tcx> Builder<'a, 'll, 'tcx> {
         }
     }
 }
+impl<'ll, 'tcx, 'a> Builder<'a, 'll, 'tcx> {
+    fn atomic_op(
+        &mut self,
+        dst: &'ll Value,
+        atomic_supported: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+        emulate_local: impl FnOnce(&mut Builder<'a, 'll, 'tcx>, &'ll Value) -> &'ll Value,
+    ) -> &'ll Value {
+        // (FractalFir) Atomics in CUDA have some limitations, and we have to work around them.
+        // For example, they are restricted in what address space they operate on.
+        // CUDA has 4 address spaces (and a generic one, which is a union of all of those).
+        // An atomic instruction can soundly operate on:
+        // 1. The global address space
+        // 2. The shared (cluster) address space.
+        // It can't operate on:
+        // 1. The const address space (atomics on consts are UB anyway)
+        // 2. The thread address space (which should only be accessible to 1 thread anyway?)
+        // So, we do the following (see the CFG sketch below):
+        // 1. Check if the pointer is in one of the address spaces atomics support.
+        //    a) if so, we perform an atomic operation
+        // 2. Check if the pointer is in the thread-local address space. If it is, we use non-atomic ops here,
+        //    **ASSUMING** only the current thread can access thread-local memory. (FIXME: is this sound?)
+        // 3. If the pointer is not in a supported address space, and is not thread-local, then we bail, and trap.
+
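+        // A rough sketch of the control flow emitted here (block names as created below):
+        //
+        //   entry --(global or shared)--> atomic_space_supported ----------> atomic_op_done
+        //     |                                                                   ^
+        //     v                                                                   |
+        //   atomic_space_unsupported --(local)--> atomic_local_space ------------+
+        //     |
+        //     v (neither: very likely UB)
+        //   atomic_space_ub (abort + unreachable)
+        //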
+        // We check if the `dst` pointer is in the `global` address space.
+        let (isspacep_global_ty, isspacep_global_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.global");
+        let isspacep_global = self.call(
+            isspacep_global_ty,
+            None,
+            None,
+            isspacep_global_fn,
+            &[dst],
+            None,
+            None,
+        );
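+        // (The `llvm.nvvm.isspacep.*` intrinsics return an i1 that is true iff the
+        // generic pointer maps into that particular address space.)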
+        // We check if the `dst` pointer is in the `shared` address space.
+        let (isspacep_shared_ty, isspacep_shared_fn) =
+            self.get_intrinsic("llvm.nvvm.isspacep.shared");
+        let isspacep_shared = self.call(
+            isspacep_shared_ty,
+            None,
+            None,
+            isspacep_shared_fn,
+            &[dst],
+            None,
+            None,
+        );
+        // Combine those to check if we are in a supported address space.
+        let atomic_supported_addrspace = self.or(isspacep_shared, isspacep_global);
+        // We create 2 blocks here: one we branch to if the atomic is in the right address space, and one we branch to otherwise.
+        let supported_bb = self.append_sibling_block("atomic_space_supported");
+        let unsupported_bb = self.append_sibling_block("atomic_space_unsupported");
+        self.cond_br(atomic_supported_addrspace, supported_bb, unsupported_bb);
+        // We also create a "merge" block we will jump to after the atomic ops finish.
+        let merge_bb = self.append_sibling_block("atomic_op_done");
+        // Execute atomic op if supported, then jump to merge.
+        self.switch_to_block(supported_bb);
+        let supported_res = atomic_supported(self, dst);
+        self.br(merge_bb);
+        // Check if the pointer is in the thread space. If so, we can emulate it.
+        self.switch_to_block(unsupported_bb);
+        let (isspacep_local_ty, isspacep_local_fn) = self.get_intrinsic("llvm.nvvm.isspacep.local");
+        let isspacep_local = self.call(
+            isspacep_local_ty,
+            None,
+            None,
+            isspacep_local_fn,
+            &[dst],
+            None,
+            None,
+        );
+        let local_bb = self.append_sibling_block("atomic_local_space");
+        let atomic_ub_bb = self.append_sibling_block("atomic_space_ub");
+        self.cond_br(isspacep_local, local_bb, atomic_ub_bb);
+        // The pointer is in the thread (local) space.
+        self.switch_to_block(local_bb);
+        let local_res = emulate_local(self, dst);
+        self.br(merge_bb);
+        // The pointer is neither in a supported address space, nor the local space.
+        // This is very likely UB. So, we trap here.
+        // TODO: should we print some kind of a message here? NVVM supports printf.
+        self.switch_to_block(atomic_ub_bb);
+        self.abort();
+        self.unreachable();
+        // The atomic impl has finished, and we can now switch to the merge_bb.
+        self.switch_to_block(merge_bb);
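+        // Merge the results of the two live paths; the UB block trapped,
+        // so it contributes no incoming value to the phi.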
+        self.phi(
+            self.val_ty(local_res),
+            &[supported_res, local_res],
+            &[supported_bb, local_bb],
+        )
+    }
+}