LoongArch64 FP16 hardware support

heiher · Alexhuszagh · commit cbbeeeea98e0 · 2025-09-21T11:21:13.000-05:00
LoongArch is a RISC instruction set architecture and is currently a Tier 2 (with host tools) target [^1] in upstream Rust. This patch introduces FP16 conversion functions based on the LoongArch SIMD extension (the `lsx` target feature) to improve performance.

Benchmarks:

```
HalfFloatSliceExt::convert_from_f32_slice/constants
    time:   [10.816 ns 10.823 ns 10.831 ns]
    change: [-63.769% -63.728% -63.693%] (p = 0.00 < 0.05)
    Performance has improved.
HalfFloatSliceExt::convert_from_f32_slice/large
    time:   [137.68 ns 137.77 ns 137.88 ns]
    change: [-94.847% -94.841% -94.834%] (p = 0.00 < 0.05)
    Performance has improved.
HalfFloatSliceExt::convert_from_f64_slice/constants
    time:   [12.656 ns 12.669 ns 12.684 ns]
    change: [-78.455% -78.418% -78.367%] (p = 0.00 < 0.05)
    Performance has improved.
HalfFloatSliceExt::convert_from_f64_slice/large
    time:   [544.15 ns 544.49 ns 544.91 ns]
    change: [-89.799% -89.791% -89.781%] (p = 0.00 < 0.05)
    Performance has improved.
HalfFloatSliceExt::convert_to_f32_slice/constants
    time:   [6.0412 ns 6.0442 ns 6.0482 ns]
    change: [-74.100% -74.068% -74.042%] (p = 0.00 < 0.05)
    Performance has improved.
HalfFloatSliceExt::convert_to_f32_slice/large
    time:   [512.78 ns 513.08 ns 513.45 ns]
    change: [-77.628% -77.526% -77.422%] (p = 0.00 < 0.05)
    Performance has improved.
HalfFloatSliceExt::convert_to_f64_slice/constants
    time:   [10.779 ns 10.784 ns 10.792 ns]
    change: [-49.028% -48.922% -48.813%] (p = 0.00 < 0.05)
    Performance has improved.
HalfFloatSliceExt::convert_to_f64_slice/large
    time:   [923.19 ns 923.77 ns 924.50 ns]
    change: [-80.876% -80.862% -80.849%] (p = 0.00 < 0.05)
    Performance has improved.
```

[^1]: https://doc.rust-lang.org/stable/rustc/platform-support/loongarch-linux.html
diff --git a/README.md b/README.md
@@ -28,6 +28,7 @@ for specific CPU features which avoids the runtime overhead and works in a `no_s
 | ------------ | ------------------ | ----- |
 | `x86`/`x86_64` | `f16c` | This supports conversion to/from `f16` only (including vector SIMD) and does not support any `bf16` or arithmetic operations. |
 | `aarch64` | `fp16` | This supports all operations on `f16` only. |
+| `loongarch64` | `lsx` | This supports conversion to/from `f16` only (including vector SIMD) and does not support any `bf16` or arithmetic operations. |
 
 ### More Documentation
 
diff --git a/src/binary16/arch.rs b/src/binary16/arch.rs
@@ -9,10 +9,14 @@ mod x86;
 #[cfg(target_arch = "aarch64")]
 mod aarch64;
 
+#[cfg(target_arch = "loongarch64")]
+mod loongarch64;
+
 macro_rules! convert_fn {
-    (
-        if x86_feature("f16c") { $f16c:expr }else if aarch64_feature("fp16") { $aarch64:expr }else { $fallback:expr }
-    ) => {
+    (if x86_feature("f16c") { $f16c:expr }
+    else if aarch64_feature("fp16") { $aarch64:expr }
+    else if loongarch64_feature("lsx") { $loongarch64:expr }
+    else { $fallback:expr }) => {
         cfg_if::cfg_if! {
             // Use intrinsics directly when a compile target or using no_std
             if #[cfg(all(
@@ -29,6 +33,12 @@ macro_rules! convert_fn {
             ))] {
                 $aarch64
             }
+            else if #[cfg(all(
+                target_arch = "loongarch64",
+                target_feature = "lsx"
+            ))] {
+                $loongarch64
+            }
 
             // Use CPU feature detection if using std
             else if #[cfg(all(
@@ -55,6 +65,17 @@ macro_rules! convert_fn {
                     $fallback
                 }
             }
+            else if #[cfg(all(
+                feature = "std",
+                target_arch = "loongarch64",
+            ))] {
+                use std::arch::is_loongarch_feature_detected;
+                if is_loongarch_feature_detected!("lsx") {
+                    $loongarch64
+                } else {
+                    $fallback
+                }
+            }
 
             // Fallback to software
             else {
@@ -71,6 +92,8 @@ pub(crate) fn f32_to_f16(f: f32) -> u16 {
             unsafe { x86::f32_to_f16_x86_f16c(f) }
         } else if aarch64_feature("fp16") {
             unsafe { aarch64::f32_to_f16_fp16(f) }
+        } else if loongarch64_feature("lsx") {
+            unsafe { loongarch64::f32_to_f16_lsx(f) }
         } else {
             f32_to_f16_fallback(f)
         }
@@ -84,6 +107,8 @@ pub(crate) fn f64_to_f16(f: f64) -> u16 {
             unsafe { x86::f64_to_f16_x86_f16c(f) }
         } else if aarch64_feature("fp16") {
             unsafe { aarch64::f64_to_f16_fp16(f) }
+        } else if loongarch64_feature("lsx") {
+            f64_to_f16_fallback(f)
         } else {
             f64_to_f16_fallback(f)
         }
@@ -97,6 +122,8 @@ pub(crate) fn f16_to_f32(i: u16) -> f32 {
             unsafe { x86::f16_to_f32_x86_f16c(i) }
         } else if aarch64_feature("fp16") {
             unsafe { aarch64::f16_to_f32_fp16(i) }
+        } else if loongarch64_feature("lsx") {
+            unsafe { loongarch64::f16_to_f32_lsx(i) }
         } else {
             f16_to_f32_fallback(i)
         }
@@ -110,6 +137,8 @@ pub(crate) fn f16_to_f64(i: u16) -> f64 {
             unsafe { x86::f16_to_f64_x86_f16c(i) }
         } else if aarch64_feature("fp16") {
             unsafe { aarch64::f16_to_f64_fp16(i) }
+        } else if loongarch64_feature("lsx") {
+            unsafe { loongarch64::f16_to_f32_lsx(i) as f64 }
         } else {
             f16_to_f64_fallback(i)
         }
@@ -123,6 +152,8 @@ pub(crate) fn f32x4_to_f16x4(f: &[f32; 4]) -> [u16; 4] {
             unsafe { x86::f32x4_to_f16x4_x86_f16c(f) }
         } else if aarch64_feature("fp16") {
             unsafe { aarch64::f32x4_to_f16x4_fp16(f) }
+        } else if loongarch64_feature("lsx") {
+            unsafe { loongarch64::f32x4_to_f16x4_lsx(f) }
         } else {
             f32x4_to_f16x4_fallback(f)
         }
@@ -136,6 +167,8 @@ pub(crate) fn f16x4_to_f32x4(i: &[u16; 4]) -> [f32; 4] {
             unsafe { x86::f16x4_to_f32x4_x86_f16c(i) }
         } else if aarch64_feature("fp16") {
             unsafe { aarch64::f16x4_to_f32x4_fp16(i) }
+        } else if loongarch64_feature("lsx") {
+            unsafe { loongarch64::f16x4_to_f32x4_lsx(i) }
         } else {
             f16x4_to_f32x4_fallback(i)
         }
@@ -149,6 +182,8 @@ pub(crate) fn f64x4_to_f16x4(f: &[f64; 4]) -> [u16; 4] {
             unsafe { x86::f64x4_to_f16x4_x86_f16c(f) }
         } else if aarch64_feature("fp16") {
             unsafe { aarch64::f64x4_to_f16x4_fp16(f) }
+        } else if loongarch64_feature("lsx") {
+            unsafe { loongarch64::f64x4_to_f16x4_lsx(f) }
         } else {
             f64x4_to_f16x4_fallback(f)
         }
@@ -162,6 +197,8 @@ pub(crate) fn f16x4_to_f64x4(i: &[u16; 4]) -> [f64; 4] {
             unsafe { x86::f16x4_to_f64x4_x86_f16c(i) }
         } else if aarch64_feature("fp16") {
             unsafe { aarch64::f16x4_to_f64x4_fp16(i) }
+        } else if loongarch64_feature("lsx") {
+            unsafe { loongarch64::f16x4_to_f64x4_lsx(i) }
         } else {
             f16x4_to_f64x4_fallback(i)
         }
@@ -180,6 +217,13 @@ pub(crate) fn f32x8_to_f16x8(f: &[f32; 8]) -> [u16; 8] {
                     aarch64::f32x4_to_f16x4_fp16);
                 result
             }
+        } else if loongarch64_feature("lsx") {
+            {
+                let mut result = [0u16; 8];
+                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
+                    loongarch64::f32x4_to_f16x4_lsx);
+                result
+            }
         } else {
             f32x8_to_f16x8_fallback(f)
         }
@@ -198,6 +242,13 @@ pub(crate) fn f16x8_to_f32x8(i: &[u16; 8]) -> [f32; 8] {
                     aarch64::f16x4_to_f32x4_fp16);
                 result
             }
+        } else if loongarch64_feature("lsx") {
+            {
+                let mut result = [0f32; 8];
+                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
+                    loongarch64::f16x4_to_f32x4_lsx);
+                result
+            }
         } else {
             f16x8_to_f32x8_fallback(i)
         }
@@ -216,6 +267,13 @@ pub(crate) fn f64x8_to_f16x8(f: &[f64; 8]) -> [u16; 8] {
                     aarch64::f64x4_to_f16x4_fp16);
                 result
             }
+        } else if loongarch64_feature("lsx") {
+            {
+                let mut result = [0u16; 8];
+                convert_chunked_slice_4(f.as_slice(), result.as_mut_slice(),
+                    loongarch64::f64x4_to_f16x4_lsx);
+                result
+            }
         } else {
             f64x8_to_f16x8_fallback(f)
         }
@@ -234,6 +292,13 @@ pub(crate) fn f16x8_to_f64x8(i: &[u16; 8]) -> [f64; 8] {
                     aarch64::f16x4_to_f64x4_fp16);
                 result
             }
+        } else if loongarch64_feature("lsx") {
+            {
+                let mut result = [0f64; 8];
+                convert_chunked_slice_4(i.as_slice(), result.as_mut_slice(),
+                    loongarch64::f16x4_to_f64x4_lsx);
+                result
+            }
         } else {
             f16x8_to_f64x8_fallback(i)
         }
@@ -248,6 +313,8 @@ pub(crate) fn f32_to_f16_slice(src: &[f32], dst: &mut [u16]) {
                 x86::f32x4_to_f16x4_x86_f16c)
         } else if aarch64_feature("fp16") {
             convert_chunked_slice_4(src, dst, aarch64::f32x4_to_f16x4_fp16)
+        } else if loongarch64_feature("lsx") {
+            convert_chunked_slice_4(src, dst, loongarch64::f32x4_to_f16x4_lsx)
         } else {
             slice_fallback(src, dst, f32_to_f16_fallback)
         }
@@ -262,6 +329,8 @@ pub(crate) fn f16_to_f32_slice(src: &[u16], dst: &mut [f32]) {
                 x86::f16x4_to_f32x4_x86_f16c)
         } else if aarch64_feature("fp16") {
             convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f32x4_fp16)
+        } else if loongarch64_feature("lsx") {
+            convert_chunked_slice_4(src, dst, loongarch64::f16x4_to_f32x4_lsx)
         } else {
             slice_fallback(src, dst, f16_to_f32_fallback)
         }
@@ -276,6 +345,8 @@ pub(crate) fn f64_to_f16_slice(src: &[f64], dst: &mut [u16]) {
                 x86::f64x4_to_f16x4_x86_f16c)
         } else if aarch64_feature("fp16") {
             convert_chunked_slice_4(src, dst, aarch64::f64x4_to_f16x4_fp16)
+        } else if loongarch64_feature("lsx") {
+            convert_chunked_slice_4(src, dst, loongarch64::f64x4_to_f16x4_lsx)
         } else {
             slice_fallback(src, dst, f64_to_f16_fallback)
         }
@@ -290,6 +361,8 @@ pub(crate) fn f16_to_f64_slice(src: &[u16], dst: &mut [f64]) {
                 x86::f16x4_to_f64x4_x86_f16c)
         } else if aarch64_feature("fp16") {
             convert_chunked_slice_4(src, dst, aarch64::f16x4_to_f64x4_fp16)
+        } else if loongarch64_feature("lsx") {
+            convert_chunked_slice_4(src, dst, loongarch64::f16x4_to_f64x4_lsx)
         } else {
             slice_fallback(src, dst, f16_to_f64_fallback)
         }
diff --git a/src/binary16/arch/loongarch64.rs b/src/binary16/arch/loongarch64.rs
@@ -0,0 +1,63 @@
+use core::{mem::MaybeUninit, ptr};
+
+#[cfg(target_arch = "loongarch64")]
+use core::arch::loongarch64::{lsx_vfcvt_h_s, lsx_vfcvtl_s_h, m128, m128i};
+
+/////////////// loongarch64 lsx/lasx ////////////////
+
+/// Widens a single binary16 bit pattern to `f32` using LSX.
+///
+/// The input is stored as the first 16-bit element of an otherwise zeroed
+/// 128-bit vector, the vector is converted with `lsx_vfcvtl_s_h`, and the
+/// first `f32` element of the result is returned.
+///
+/// # Safety
+///
+/// The caller must ensure the `lsx` target feature is available on the CPU
+/// executing this function.
+#[inline]
+#[target_feature(enable = "lsx")]
+pub(super) unsafe fn f16_to_f32_lsx(i: u16) -> f32 {
+    // Start from an all-zero vector so the elements we never write are
+    // still initialized before `assume_init`.
+    let mut half_lanes = MaybeUninit::<m128i>::zeroed();
+    ptr::write(half_lanes.as_mut_ptr().cast::<u16>(), i);
+    let widened: m128 = lsx_vfcvtl_s_h(half_lanes.assume_init());
+    // The scalar answer is the first `f32` element of the converted vector.
+    ptr::read(ptr::addr_of!(widened).cast::<f32>())
+}
+
+/// Narrows a single `f32` to a binary16 bit pattern using LSX.
+///
+/// The input is stored as the first `f32` element of an otherwise zeroed
+/// 128-bit vector. That vector is passed as both operands of
+/// `lsx_vfcvt_h_s`, and the first 16-bit element of the result is returned.
+///
+/// # Safety
+///
+/// The caller must ensure the `lsx` target feature is available on the CPU
+/// executing this function.
+#[inline]
+#[target_feature(enable = "lsx")]
+pub(super) unsafe fn f32_to_f16_lsx(f: f32) -> u16 {
+    // Zero the whole vector first so the unused elements are initialized.
+    let mut single_lanes = MaybeUninit::<m128>::zeroed();
+    ptr::write(single_lanes.as_mut_ptr().cast::<f32>(), f);
+    // The same vector feeds both operands of the narrowing conversion.
+    let input: m128 = single_lanes.assume_init();
+    let narrowed: m128i = lsx_vfcvt_h_s(input, input);
+    ptr::read(ptr::addr_of!(narrowed).cast::<u16>())
+}
+
+/// Widens four binary16 bit patterns to four `f32` values using LSX.
+///
+/// The four inputs are copied to the start of an otherwise zeroed 128-bit
+/// vector, converted with `lsx_vfcvtl_s_h`, and the result vector is read
+/// back as `[f32; 4]`.
+///
+/// # Safety
+///
+/// The caller must ensure the `lsx` target feature is available on the CPU
+/// executing this function.
+#[inline]
+#[target_feature(enable = "lsx")]
+pub(super) unsafe fn f16x4_to_f32x4_lsx(v: &[u16; 4]) -> [f32; 4] {
+    // Four `u16` values occupy only the first 8 bytes; zeroing beforehand
+    // keeps the remainder of the vector initialized.
+    let mut half_lanes = MaybeUninit::<m128i>::zeroed();
+    half_lanes.as_mut_ptr().cast::<[u16; 4]>().write(*v);
+    let widened: m128 = lsx_vfcvtl_s_h(half_lanes.assume_init());
+    ptr::read(ptr::addr_of!(widened).cast::<[f32; 4]>())
+}
+
+/// Narrows four `f32` values to four binary16 bit patterns using LSX.
+///
+/// The four inputs are copied into a 128-bit vector, that vector is passed
+/// as both operands of `lsx_vfcvt_h_s`, and the first four 16-bit elements
+/// of the result are returned.
+///
+/// # Safety
+///
+/// The caller must ensure the `lsx` target feature is available on the CPU
+/// executing this function.
+#[inline]
+#[target_feature(enable = "lsx")]
+pub(super) unsafe fn f32x4_to_f16x4_lsx(v: &[f32; 4]) -> [u16; 4] {
+    // No zeroing here: the four `f32` values written below span 16 bytes,
+    // which should cover the entire 128-bit vector before `assume_init`.
+    let mut single_lanes = MaybeUninit::<m128>::uninit();
+    single_lanes.as_mut_ptr().cast::<[f32; 4]>().write(*v);
+    // The same vector feeds both operands of the narrowing conversion.
+    let input: m128 = single_lanes.assume_init();
+    let narrowed: m128i = lsx_vfcvt_h_s(input, input);
+    ptr::read(ptr::addr_of!(narrowed).cast::<[u16; 4]>())
+}
+
+/// Widens four binary16 bit patterns to four `f64` values.
+///
+/// The inputs are first converted to `f32` by `f16x4_to_f32x4_lsx`, then
+/// each element is widened to `f64` with an ordinary scalar conversion.
+///
+/// # Safety
+///
+/// The caller must ensure the `lsx` target feature is available on the CPU
+/// executing this function.
+#[inline]
+#[target_feature(enable = "lsx")]
+pub(super) unsafe fn f16x4_to_f64x4_lsx(v: &[u16; 4]) -> [f64; 4] {
+    // The f32 -> f64 step is a plain per-element conversion; vectorizing it
+    // is left to the compiler for now.
+    f16x4_to_f32x4_lsx(v).map(f64::from)
+}
+
+/// Narrows four `f64` values to four binary16 bit patterns.
+///
+/// Each element is first narrowed to `f32` with an ordinary scalar cast, and
+/// the resulting array is then converted by `f32x4_to_f16x4_lsx`.
+///
+/// # Safety
+///
+/// The caller must ensure the `lsx` target feature is available on the CPU
+/// executing this function.
+#[inline]
+#[target_feature(enable = "lsx")]
+pub(super) unsafe fn f64x4_to_f16x4_lsx(v: &[f64; 4]) -> [u16; 4] {
+    // The f64 -> f32 step is a plain per-element cast; vectorizing it is
+    // left to the compiler for now.
+    let narrowed: [f32; 4] = v.map(|wide| wide as f32);
+    f32x4_to_f16x4_lsx(&narrowed)
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -49,6 +49,7 @@
 //! | ------------ | ------------------ | ----- |
 //! | `x86`/`x86_64` | `f16c` | This supports conversion to/from [`struct@f16`] only (including vector SIMD) and does not support any [`struct@bf16`] or arithmetic operations. |
 //! | `aarch64` | `fp16` | This supports all operations on [`struct@f16`] only. |
+//! | `loongarch64` | `lsx` | This supports conversion to/from [`struct@f16`] only (including vector SIMD) and does not support any [`struct@bf16`] or arithmetic operations. |
 //!
 //! # Cargo Features
 //!