@@ -993,6 +993,162 @@ mod tests {
993993 assert_eq ! ( vals[ 1 ] , 1. ) ;
994994 assert_eq ! ( vals[ 2 ] , 2. ) ;
995995 }
996+
/// Shared body for a wide store/load round-trip test.
///
/// Fills a flat `[$ty; $n]` array with ascending lane values, transmutes it
/// into the multi-register vector type `$vec`, writes it back out through the
/// `$store` intrinsic, reloads it with the `$load` intrinsic, and asserts the
/// lanes survived the round trip unchanged.
macro_rules! wide_store_load_roundtrip {
    ($ty:ty, $n:expr, $vec:ty, $store:expr, $load:expr) => {
        let expected: [$ty; $n] = crate::array::from_fn(|i| i as $ty);
        let v: $vec = transmute(expected);
        let mut buf = [0 as $ty; $n];
        $store(buf.as_mut_ptr().cast(), v);
        let reloaded: $vec = $load(buf.as_ptr().cast());
        let actual: [$ty; $n] = transmute(reloaded);
        assert_eq!(actual, expected);
    };
}
1008+
/// Expands each `name(args…);` entry into a round-trip test gated on the
/// `neon,fp16` target features. Excluded on `arm64ec` (fp16 tests are not
/// built for that target — see the cfg below).
macro_rules! wide_store_load_roundtrip_fp16 {
    ($($name:ident $args:tt);* $(;)?) => {
        $(
            #[simd_test(enable = "neon,fp16")]
            #[cfg(not(target_arch = "arm64ec"))]
            unsafe fn $name() {
                wide_store_load_roundtrip! $args;
            }
        )*
    };
}
1020+
1021+ wide_store_load_roundtrip_fp16 ! {
1022+ test_vld1_f16_x2( f16, 8 , float16x4x2_t, vst1_f16_x2, vld1_f16_x2) ;
1023+ test_vld1_f16_x3( f16, 12 , float16x4x3_t, vst1_f16_x3, vld1_f16_x3) ;
1024+ test_vld1_f16_x4( f16, 16 , float16x4x4_t, vst1_f16_x4, vld1_f16_x4) ;
1025+
1026+ test_vld1q_f16_x2( f16, 16 , float16x8x2_t, vst1q_f16_x2, vld1q_f16_x2) ;
1027+ test_vld1q_f16_x3( f16, 24 , float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3) ;
1028+ test_vld1q_f16_x4( f16, 32 , float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4) ;
1029+ }
1030+
/// Expands each `name(args…);` entry into a round-trip test gated on the
/// `neon,aes` target features (required by the `p64` polynomial intrinsics).
macro_rules! wide_store_load_roundtrip_aes {
    ($($name:ident $args:tt);* $(;)?) => {
        $(
            #[simd_test(enable = "neon,aes")]
            unsafe fn $name() {
                wide_store_load_roundtrip! $args;
            }
        )*
    };
}
1041+
1042+ wide_store_load_roundtrip_aes ! {
1043+ test_vld1_p64_x2( p64, 2 , poly64x1x2_t, vst1_p64_x2, vld1_p64_x2) ;
1044+ test_vld1_p64_x3( p64, 3 , poly64x1x3_t, vst1_p64_x3, vld1_p64_x3) ;
1045+ test_vld1_p64_x4( p64, 4 , poly64x1x4_t, vst1_p64_x4, vld1_p64_x4) ;
1046+
1047+ test_vld1q_p64_x2( p64, 4 , poly64x2x2_t, vst1q_p64_x2, vld1q_p64_x2) ;
1048+ test_vld1q_p64_x3( p64, 6 , poly64x2x3_t, vst1q_p64_x3, vld1q_p64_x3) ;
1049+ test_vld1q_p64_x4( p64, 8 , poly64x2x4_t, vst1q_p64_x4, vld1q_p64_x4) ;
1050+ }
1051+
/// Expands each `name(args…);` entry into a round-trip test gated on the
/// baseline `neon` target feature.
macro_rules! wide_store_load_roundtrip_neon {
    ($($name:ident $args:tt);* $(;)?) => {
        $(
            #[simd_test(enable = "neon")]
            unsafe fn $name() {
                wide_store_load_roundtrip! $args;
            }
        )*
    };
}
1062+
1063+ wide_store_load_roundtrip_neon ! {
1064+ test_vld1_f32_x2( f32 , 4 , float32x2x2_t, vst1_f32_x2, vld1_f32_x2) ;
1065+ test_vld1_f32_x3( f32 , 6 , float32x2x3_t, vst1_f32_x3, vld1_f32_x3) ;
1066+ test_vld1_f32_x4( f32 , 8 , float32x2x4_t, vst1_f32_x4, vld1_f32_x4) ;
1067+
1068+ test_vld1q_f32_x2( f32 , 8 , float32x4x2_t, vst1q_f32_x2, vld1q_f32_x2) ;
1069+ test_vld1q_f32_x3( f32 , 12 , float32x4x3_t, vst1q_f32_x3, vld1q_f32_x3) ;
1070+ test_vld1q_f32_x4( f32 , 16 , float32x4x4_t, vst1q_f32_x4, vld1q_f32_x4) ;
1071+
1072+ test_vld1_s8_x2( i8 , 16 , int8x8x2_t, vst1_s8_x2, vld1_s8_x2) ;
1073+ test_vld1_s8_x3( i8 , 24 , int8x8x3_t, vst1_s8_x3, vld1_s8_x3) ;
1074+ test_vld1_s8_x4( i8 , 32 , int8x8x4_t, vst1_s8_x4, vld1_s8_x4) ;
1075+
1076+ test_vld1q_s8_x2( i8 , 32 , int8x16x2_t, vst1q_s8_x2, vld1q_s8_x2) ;
1077+ test_vld1q_s8_x3( i8 , 48 , int8x16x3_t, vst1q_s8_x3, vld1q_s8_x3) ;
1078+ test_vld1q_s8_x4( i8 , 64 , int8x16x4_t, vst1q_s8_x4, vld1q_s8_x4) ;
1079+
1080+ test_vld1_s16_x2( i16 , 8 , int16x4x2_t, vst1_s16_x2, vld1_s16_x2) ;
1081+ test_vld1_s16_x3( i16 , 12 , int16x4x3_t, vst1_s16_x3, vld1_s16_x3) ;
1082+ test_vld1_s16_x4( i16 , 16 , int16x4x4_t, vst1_s16_x4, vld1_s16_x4) ;
1083+
1084+ test_vld1q_s16_x2( i16 , 16 , int16x8x2_t, vst1q_s16_x2, vld1q_s16_x2) ;
1085+ test_vld1q_s16_x3( i16 , 24 , int16x8x3_t, vst1q_s16_x3, vld1q_s16_x3) ;
1086+ test_vld1q_s16_x4( i16 , 32 , int16x8x4_t, vst1q_s16_x4, vld1q_s16_x4) ;
1087+
1088+ test_vld1_s32_x2( i32 , 4 , int32x2x2_t, vst1_s32_x2, vld1_s32_x2) ;
1089+ test_vld1_s32_x3( i32 , 6 , int32x2x3_t, vst1_s32_x3, vld1_s32_x3) ;
1090+ test_vld1_s32_x4( i32 , 8 , int32x2x4_t, vst1_s32_x4, vld1_s32_x4) ;
1091+
1092+ test_vld1q_s32_x2( i32 , 8 , int32x4x2_t, vst1q_s32_x2, vld1q_s32_x2) ;
1093+ test_vld1q_s32_x3( i32 , 12 , int32x4x3_t, vst1q_s32_x3, vld1q_s32_x3) ;
1094+ test_vld1q_s32_x4( i32 , 16 , int32x4x4_t, vst1q_s32_x4, vld1q_s32_x4) ;
1095+
1096+ test_vld1_s64_x2( i64 , 2 , int64x1x2_t, vst1_s64_x2, vld1_s64_x2) ;
1097+ test_vld1_s64_x3( i64 , 3 , int64x1x3_t, vst1_s64_x3, vld1_s64_x3) ;
1098+ test_vld1_s64_x4( i64 , 4 , int64x1x4_t, vst1_s64_x4, vld1_s64_x4) ;
1099+
1100+ test_vld1q_s64_x2( i64 , 4 , int64x2x2_t, vst1q_s64_x2, vld1q_s64_x2) ;
1101+ test_vld1q_s64_x3( i64 , 6 , int64x2x3_t, vst1q_s64_x3, vld1q_s64_x3) ;
1102+ test_vld1q_s64_x4( i64 , 8 , int64x2x4_t, vst1q_s64_x4, vld1q_s64_x4) ;
1103+
1104+ test_vld1_u8_x2( u8 , 16 , uint8x8x2_t, vst1_u8_x2, vld1_u8_x2) ;
1105+ test_vld1_u8_x3( u8 , 24 , uint8x8x3_t, vst1_u8_x3, vld1_u8_x3) ;
1106+ test_vld1_u8_x4( u8 , 32 , uint8x8x4_t, vst1_u8_x4, vld1_u8_x4) ;
1107+
1108+ test_vld1q_u8_x2( u8 , 32 , uint8x16x2_t, vst1q_u8_x2, vld1q_u8_x2) ;
1109+ test_vld1q_u8_x3( u8 , 48 , uint8x16x3_t, vst1q_u8_x3, vld1q_u8_x3) ;
1110+ test_vld1q_u8_x4( u8 , 64 , uint8x16x4_t, vst1q_u8_x4, vld1q_u8_x4) ;
1111+
1112+ test_vld1_u16_x2( u16 , 8 , uint16x4x2_t, vst1_u16_x2, vld1_u16_x2) ;
1113+ test_vld1_u16_x3( u16 , 12 , uint16x4x3_t, vst1_u16_x3, vld1_u16_x3) ;
1114+ test_vld1_u16_x4( u16 , 16 , uint16x4x4_t, vst1_u16_x4, vld1_u16_x4) ;
1115+
1116+ test_vld1q_u16_x2( u16 , 16 , uint16x8x2_t, vst1q_u16_x2, vld1q_u16_x2) ;
1117+ test_vld1q_u16_x3( u16 , 24 , uint16x8x3_t, vst1q_u16_x3, vld1q_u16_x3) ;
1118+ test_vld1q_u16_x4( u16 , 32 , uint16x8x4_t, vst1q_u16_x4, vld1q_u16_x4) ;
1119+
1120+ test_vld1_u32_x2( u32 , 4 , uint32x2x2_t, vst1_u32_x2, vld1_u32_x2) ;
1121+ test_vld1_u32_x3( u32 , 6 , uint32x2x3_t, vst1_u32_x3, vld1_u32_x3) ;
1122+ test_vld1_u32_x4( u32 , 8 , uint32x2x4_t, vst1_u32_x4, vld1_u32_x4) ;
1123+
1124+ test_vld1q_u32_x2( u32 , 8 , uint32x4x2_t, vst1q_u32_x2, vld1q_u32_x2) ;
1125+ test_vld1q_u32_x3( u32 , 12 , uint32x4x3_t, vst1q_u32_x3, vld1q_u32_x3) ;
1126+ test_vld1q_u32_x4( u32 , 16 , uint32x4x4_t, vst1q_u32_x4, vld1q_u32_x4) ;
1127+
1128+ test_vld1_u64_x2( u64 , 2 , uint64x1x2_t, vst1_u64_x2, vld1_u64_x2) ;
1129+ test_vld1_u64_x3( u64 , 3 , uint64x1x3_t, vst1_u64_x3, vld1_u64_x3) ;
1130+ test_vld1_u64_x4( u64 , 4 , uint64x1x4_t, vst1_u64_x4, vld1_u64_x4) ;
1131+
1132+ test_vld1q_u64_x2( u64 , 4 , uint64x2x2_t, vst1q_u64_x2, vld1q_u64_x2) ;
1133+ test_vld1q_u64_x3( u64 , 6 , uint64x2x3_t, vst1q_u64_x3, vld1q_u64_x3) ;
1134+ test_vld1q_u64_x4( u64 , 8 , uint64x2x4_t, vst1q_u64_x4, vld1q_u64_x4) ;
1135+
1136+ test_vld1_p8_x2( p8, 16 , poly8x8x2_t, vst1_p8_x2, vld1_p8_x2) ;
1137+ test_vld1_p8_x3( p8, 24 , poly8x8x3_t, vst1_p8_x3, vld1_p8_x3) ;
1138+ test_vld1_p8_x4( p8, 32 , poly8x8x4_t, vst1_p8_x4, vld1_p8_x4) ;
1139+
1140+ test_vld1q_p8_x2( p8, 32 , poly8x16x2_t, vst1q_p8_x2, vld1q_p8_x2) ;
1141+ test_vld1q_p8_x3( p8, 48 , poly8x16x3_t, vst1q_p8_x3, vld1q_p8_x3) ;
1142+ test_vld1q_p8_x4( p8, 64 , poly8x16x4_t, vst1q_p8_x4, vld1q_p8_x4) ;
1143+
1144+ test_vld1_p16_x2( p16, 8 , poly16x4x2_t, vst1_p16_x2, vld1_p16_x2) ;
1145+ test_vld1_p16_x3( p16, 12 , poly16x4x3_t, vst1_p16_x3, vld1_p16_x3) ;
1146+ test_vld1_p16_x4( p16, 16 , poly16x4x4_t, vst1_p16_x4, vld1_p16_x4) ;
1147+
1148+ test_vld1q_p16_x2( p16, 16 , poly16x8x2_t, vst1q_p16_x2, vld1q_p16_x2) ;
1149+ test_vld1q_p16_x3( p16, 24 , poly16x8x3_t, vst1q_p16_x3, vld1q_p16_x3) ;
1150+ test_vld1q_p16_x4( p16, 32 , poly16x8x4_t, vst1q_p16_x4, vld1q_p16_x4) ;
1151+ }
9961152}
9971153
9981154#[ cfg( test) ]
0 commit comments