Merge pull request #14 from itzmeanjan/13-reduce-memory-allocation-during-rlnc-encoding

itzmeanjan · web-flow · commit 4e6cc4110445 · 2025-08-02T17:17:30.000+05:30
Address "Reduce memory allocation during RLNC Encoding"
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "rlnc"
-version = "0.8.0"
+version = "0.8.1"
 edition = "2024"
 resolver = "3"
 rust-version = "1.85.0"
diff --git a/README.md b/README.md
diff --git a/src/common/simd.rs b/src/common/simd.rs
@@ -18,6 +18,15 @@ use super::{
     simd_mul_table::{GF256_SIMD_MUL_TABLE_HIGH, GF256_SIMD_MUL_TABLE_LOW},
 };
 
+/// Given a byte array of arbitrary length, this function can be used to multiply each
+/// byte element with a single specific scalar, over GF(2^8), mutating the input vector.
+///
+/// In case this function runs on `x86_64` with `avx2` or `ssse3` features, it can use
+/// lookup-table assisted SIMD multiplication, inspired from https://github.com/ceph/gf-complete/blob/a6862d10c9db467148f20eef2c6445ac9afd94d8/src/gf_w8.c#L1029-L1037.
+///
+/// You have to build with `RUSTFLAGS="-C target-cpu=native"` flag to enjoy full benefits of compiler optimization.
+///
+/// I originally discovered this technique in https://www.snia.org/sites/default/files/files2/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf.
 pub fn gf256_inplace_mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
     if vec.is_empty() {
         return;
@@ -97,33 +106,15 @@ pub fn gf256_inplace_mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
     });
 }
 
-/// Given a byte array of arbitrary length, this function can be used to multiply each
-/// byte element with a single specific scalar, over GF(2^8), returning resulting vector.
-///
-/// In case this function runs on `x86_64` with `avx2` or `ssse3` features, it can use
-/// lookup-table assisted SIMD multiplication, inspired from https://github.com/ceph/gf-complete/blob/a6862d10c9db467148f20eef2c6445ac9afd94d8/src/gf_w8.c#L1029-L1037.
-///
-/// You have to build with `RUSTFLAGS="-C target-cpu=native -C target-feature=+avx2,+ssse3"`flag
-/// to enjoy full benefits of compiler optimization.
-///
-/// I originally discovered this technique in https://www.snia.org/sites/default/files/files2/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf.
-#[cfg(not(feature = "parallel"))]
-pub fn gf256_mul_vec_by_scalar(vec: &[u8], scalar: u8) -> Vec<u8> {
-    let mut result = vec.to_vec();
-    gf256_inplace_mul_vec_by_scalar(&mut result, scalar);
-
-    result
-}
-
 /// Given two byte arrays of equal length, this routine performs element-wise
 /// addition over GF(2^8), mutating one of the operand vectors.
 ///
 /// Note, addition over GF(2^8) is nothing but XOR-ing two operands. If this function
 /// runs on `x86_64` with `avx2` or `ssse3` features, it can perform fast SIMD addition
 /// using vector intrinsics.
 ///
-/// You have to compile with `RUSTFLAGS="-C target-cpu=native -C target-feature=+avx2,+ssse3"`
-/// flag to hint the compiler so that it generates best code.
+/// You have to compile with `RUSTFLAGS="-C target-cpu=native"` flag to hint the compiler
+/// so that it generates best code.
 pub fn gf256_inplace_add_vectors(vec_dst: &mut [u8], vec_src: &[u8]) {
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     if is_x86_feature_detected!("avx2") {
@@ -180,7 +171,18 @@ pub fn gf256_inplace_add_vectors(vec_dst: &mut [u8], vec_src: &[u8]) {
     });
 }
 
-pub fn gf256_inplace_mul_vec_by_scalar_then_add_into_vec(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
+/// Given a byte array `mul_vec` of arbitrary length, this function multiplies each byte
+/// element by a single specific scalar, over GF(2^8), and then adds each scaled value
+/// to the corresponding element of the sink vector `add_into_vec`.
+///
+/// In case this function runs on `x86_64` with `avx2` or `ssse3` features, it can use
+/// lookup-table assisted SIMD multiplication, inspired from https://github.com/ceph/gf-complete/blob/a6862d10c9db467148f20eef2c6445ac9afd94d8/src/gf_w8.c#L1029-L1037.
+///
+/// You have to build with `RUSTFLAGS="-C target-cpu=native"` flag to enjoy full benefits of compiler optimization.
+///
+/// This function can be thought of as an optimization over first applying
+/// `gf256_inplace_mul_vec_by_scalar` and then applying `gf256_inplace_add_vectors`.
+pub fn gf256_mul_vec_by_scalar_then_add_into_vec(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
     if add_into_vec.is_empty() {
         return;
     }
diff --git a/src/full/decoder_matrix.rs b/src/full/decoder_matrix.rs
@@ -2,7 +2,7 @@ use crate::{
     RLNCError,
     common::{
         gf256::Gf256,
-        simd::{gf256_inplace_mul_vec_by_scalar, gf256_inplace_mul_vec_by_scalar_then_add_into_vec},
+        simd::{gf256_inplace_mul_vec_by_scalar, gf256_mul_vec_by_scalar_then_add_into_vec},
     },
 };
 use std::ops::{Index, IndexMut};
@@ -155,7 +155,7 @@ impl DecoderMatrix {
                 let i_th_row = &left[(i_th_row_starts_at + i)..];
                 let j_th_row = &mut right[(j_th_row_starts_at - i_th_row_ends_at + i)..(j_th_row_ends_at - i_th_row_ends_at)];
 
-                gf256_inplace_mul_vec_by_scalar_then_add_into_vec(j_th_row, i_th_row, quotient);
+                gf256_mul_vec_by_scalar_then_add_into_vec(j_th_row, i_th_row, quotient);
             }
         }
 
@@ -191,7 +191,7 @@ impl DecoderMatrix {
                 let j_th_row = &mut left[(j_th_row_starts_at + i)..];
                 let i_th_row = &right[(i_th_row_starts_at - j_th_row_ends_at + i)..(i_th_row_ends_at - j_th_row_ends_at)];
 
-                gf256_inplace_mul_vec_by_scalar_then_add_into_vec(j_th_row, i_th_row, quotient);
+                gf256_mul_vec_by_scalar_then_add_into_vec(j_th_row, i_th_row, quotient);
             }
 
             if self[(i, i)] == Gf256::one() {
diff --git a/src/full/encoder.rs b/src/full/encoder.rs
@@ -2,11 +2,12 @@ use super::consts::BOUNDARY_MARKER;
 use crate::RLNCError;
 use rand::Rng;
 
-#[cfg(not(feature = "parallel"))]
-use crate::common::simd::{gf256_inplace_add_vectors, gf256_mul_vec_by_scalar};
-
-#[cfg(feature = "parallel")]
+#[cfg(all(feature = "parallel", not(any(target_arch = "x86", target_arch = "x86_64"))))]
 use crate::common::gf256::Gf256;
+#[cfg(not(feature = "parallel"))]
+use crate::common::simd::gf256_mul_vec_by_scalar_then_add_into_vec;
+#[cfg(all(feature = "parallel", any(target_arch = "x86", target_arch = "x86_64")))]
+use crate::common::simd::{gf256_inplace_add_vectors, gf256_inplace_mul_vec_by_scalar};
 #[cfg(feature = "parallel")]
 use rayon::prelude::*;
 
@@ -123,8 +124,7 @@ impl Encoder {
         self.data
             .chunks_exact(self.piece_byte_len)
             .zip(coding_vector)
-            .map(|(piece, &random_symbol)| gf256_mul_vec_by_scalar(piece, random_symbol))
-            .for_each(|cur| gf256_inplace_add_vectors(coded_piece, &cur));
+            .for_each(|(piece, &random_symbol)| gf256_mul_vec_by_scalar_then_add_into_vec(coded_piece, piece, random_symbol));
 
         Ok(full_coded_piece)
     }
@@ -147,12 +147,29 @@ impl Encoder {
             .data
             .par_chunks_exact(self.piece_byte_len)
             .zip(coding_vector)
-            .map(|(piece, &random_symbol)| piece.iter().map(move |&symbol| (Gf256::new(symbol) * Gf256::new(random_symbol)).get()))
+            .map(|(piece, &random_symbol)| {
+                #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+                {
+                    let mut scalar_x_piece = piece.to_vec();
+                    gf256_inplace_mul_vec_by_scalar(&mut scalar_x_piece, random_symbol);
+
+                    scalar_x_piece
+                }
+
+                #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+                {
+                    piece.iter().map(move |&symbol| (Gf256::new(symbol) * Gf256::new(random_symbol)).get())
+                }
+            })
             .fold(
                 || vec![0u8; self.piece_byte_len],
                 |mut acc, cur| {
+                    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+                    gf256_inplace_add_vectors(&mut acc, &cur);
+
+                    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
                     acc.iter_mut().zip(cur).for_each(|(a, b)| {
-                        *a = (Gf256::new(*a) + Gf256::new(b)).get();
+                        *a ^= b;
                     });
 
                     acc
@@ -161,6 +178,10 @@ impl Encoder {
             .reduce(
                 || vec![0u8; self.piece_byte_len],
                 |mut acc, cur| {
+                    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+                    gf256_inplace_add_vectors(&mut acc, &cur);
+
+                    #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
                     acc.iter_mut().zip(cur).for_each(|(a, b)| {
                         *a ^= b;
                     });
diff --git a/src/lib.rs b/src/lib.rs
@@ -93,9 +93,9 @@
 //!
 //! ```toml
 //! [dependencies]
-//! rlnc = "=0.8.0"                                      # On x86 target, it offers AVX2 and SSSE3 optimization for fast encoding/ recoding.
+//! rlnc = "=0.8.1"                                      # On x86 target, it offers AVX2 and SSSE3 optimization for fast encoding/ recoding.
 //! # or
-//! rlnc = { version = "=0.8.0", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding/ recoding.
+//! rlnc = { version = "=0.8.1", features = ["parallel"] } # Uses `rayon`-based data-parallelism for fast encoding/ recoding.
 //!
 //! rand = { version = "=0.9.1" } # Required for random number generation
 //! ```