Skip to content

Commit 4e6cc41

Browse files
authored
Merge pull request #14 from itzmeanjan/13-reduce-memory-allocation-during-rlnc-encoding
Address "Reduce memory allocation during RLNC Encoding"
2 parents d7557e9 + d7df66b commit 4e6cc41

7 files changed

Lines changed: 358 additions & 330 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "rlnc"
3-
version = "0.8.0"
3+
version = "0.8.1"
44
edition = "2024"
55
resolver = "3"
66
rust-version = "1.85.0"

README.md

Lines changed: 299 additions & 294 deletions
Large diffs are not rendered by default.

src/common/simd.rs

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,15 @@ use super::{
1818
simd_mul_table::{GF256_SIMD_MUL_TABLE_HIGH, GF256_SIMD_MUL_TABLE_LOW},
1919
};
2020

21+
/// Given a byte array of arbitrary length, this function can be used to multiply each
22+
/// byte element with a single specific scalar, over GF(2^8), mutating the input vector.
23+
///
24+
/// In case this function runs on `x86_64` with `avx2` or `ssse3` features, it can use
25+
/// lookup-table assisted SIMD multiplication, inspired from https://github.com/ceph/gf-complete/blob/a6862d10c9db467148f20eef2c6445ac9afd94d8/src/gf_w8.c#L1029-L1037.
26+
///
27+
/// You have to build with `RUSTFLAGS="-C target-cpu=native"` flag to enjoy full benefits of compiler optimization.
28+
///
29+
/// I originally discovered this technique in https://www.snia.org/sites/default/files/files2/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf.
2130
pub fn gf256_inplace_mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
2231
if vec.is_empty() {
2332
return;
@@ -97,33 +106,15 @@ pub fn gf256_inplace_mul_vec_by_scalar(vec: &mut [u8], scalar: u8) {
97106
});
98107
}
99108

100-
/// Given a byte array of arbitrary length, this function can be used to multiply each
101-
/// byte element with a single specific scalar, over GF(2^8), returning resulting vector.
102-
///
103-
/// In case this function runs on `x86_64` with `avx2` or `ssse3` features, it can use
104-
/// lookup-table assisted SIMD multiplication, inspired from https://github.com/ceph/gf-complete/blob/a6862d10c9db467148f20eef2c6445ac9afd94d8/src/gf_w8.c#L1029-L1037.
105-
///
106-
/// You have to build with `RUSTFLAGS="-C target-cpu=native -C target-feature=+avx2,+ssse3"`flag
107-
/// to enjoy full benefits of compiler optimization.
108-
///
109-
/// I originally discovered this technique in https://www.snia.org/sites/default/files/files2/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf.
110-
#[cfg(not(feature = "parallel"))]
111-
pub fn gf256_mul_vec_by_scalar(vec: &[u8], scalar: u8) -> Vec<u8> {
112-
let mut result = vec.to_vec();
113-
gf256_inplace_mul_vec_by_scalar(&mut result, scalar);
114-
115-
result
116-
}
117-
118109
/// Given two byte arrays of equal length, this routine performs element-wise
119110
/// addition over GF(2^8), mutating one of the operand vectors.
120111
///
121112
/// Note, addition over GF(2^8) is nothing but XOR-ing two operands. If this function
122113
/// runs on `x86_64` with `avx2` or `ssse3` features, it can perform fast SIMD addition
123114
/// using vector intrinsics.
124115
///
125-
/// You have to compile with `RUSTFLAGS="-C target-cpu=native -C target-feature=+avx2,+ssse3"`
126-
/// flag to hint the compiler so that it generates best code.
116+
/// You have to compile with `RUSTFLAGS="-C target-cpu=native"` flag to hint the compiler
117+
/// so that it generates best code.
127118
pub fn gf256_inplace_add_vectors(vec_dst: &mut [u8], vec_src: &[u8]) {
128119
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
129120
if is_x86_feature_detected!("avx2") {
@@ -180,7 +171,18 @@ pub fn gf256_inplace_add_vectors(vec_dst: &mut [u8], vec_src: &[u8]) {
180171
});
181172
}
182173

183-
pub fn gf256_inplace_mul_vec_by_scalar_then_add_into_vec(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
174+
/// Given a byte array `mul_vec` of arbitrary length, this function can be used to multiply each
175+
/// byte element with a single specific scalar, over GF(2^8), and then add each scaled value
176+
/// to the corresponding value in the sink vector `add_into_vec`.
177+
///
178+
/// In case this function runs on `x86_64` with `avx2` or `ssse3` features, it can use
179+
/// lookup-table assisted SIMD multiplication, inspired from https://github.com/ceph/gf-complete/blob/a6862d10c9db467148f20eef2c6445ac9afd94d8/src/gf_w8.c#L1029-L1037.
180+
///
181+
/// You have to build with `RUSTFLAGS="-C target-cpu=native"` flag to enjoy full benefits of compiler optimization.
182+
///
183+
/// This function can be thought of as an optimization over first applying `gf256_inplace_mul_vec_by_scalar`
184+
/// and then applying `gf256_inplace_add_vectors`.
185+
pub fn gf256_mul_vec_by_scalar_then_add_into_vec(add_into_vec: &mut [u8], mul_vec: &[u8], scalar: u8) {
184186
if add_into_vec.is_empty() {
185187
return;
186188
}

src/full/decoder_matrix.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use crate::{
22
RLNCError,
33
common::{
44
gf256::Gf256,
5-
simd::{gf256_inplace_mul_vec_by_scalar, gf256_inplace_mul_vec_by_scalar_then_add_into_vec},
5+
simd::{gf256_inplace_mul_vec_by_scalar, gf256_mul_vec_by_scalar_then_add_into_vec},
66
},
77
};
88
use std::ops::{Index, IndexMut};
@@ -155,7 +155,7 @@ impl DecoderMatrix {
155155
let i_th_row = &left[(i_th_row_starts_at + i)..];
156156
let j_th_row = &mut right[(j_th_row_starts_at - i_th_row_ends_at + i)..(j_th_row_ends_at - i_th_row_ends_at)];
157157

158-
gf256_inplace_mul_vec_by_scalar_then_add_into_vec(j_th_row, i_th_row, quotient);
158+
gf256_mul_vec_by_scalar_then_add_into_vec(j_th_row, i_th_row, quotient);
159159
}
160160
}
161161

@@ -191,7 +191,7 @@ impl DecoderMatrix {
191191
let j_th_row = &mut left[(j_th_row_starts_at + i)..];
192192
let i_th_row = &right[(i_th_row_starts_at - j_th_row_ends_at + i)..(i_th_row_ends_at - j_th_row_ends_at)];
193193

194-
gf256_inplace_mul_vec_by_scalar_then_add_into_vec(j_th_row, i_th_row, quotient);
194+
gf256_mul_vec_by_scalar_then_add_into_vec(j_th_row, i_th_row, quotient);
195195
}
196196

197197
if self[(i, i)] == Gf256::one() {

src/full/encoder.rs

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@ use super::consts::BOUNDARY_MARKER;
22
use crate::RLNCError;
33
use rand::Rng;
44

5-
#[cfg(not(feature = "parallel"))]
6-
use crate::common::simd::{gf256_inplace_add_vectors, gf256_mul_vec_by_scalar};
7-
8-
#[cfg(feature = "parallel")]
5+
#[cfg(all(feature = "parallel", not(any(target_arch = "x86", target_arch = "x86_64"))))]
96
use crate::common::gf256::Gf256;
7+
#[cfg(not(feature = "parallel"))]
8+
use crate::common::simd::gf256_mul_vec_by_scalar_then_add_into_vec;
9+
#[cfg(all(feature = "parallel", any(target_arch = "x86", target_arch = "x86_64")))]
10+
use crate::common::simd::{gf256_inplace_add_vectors, gf256_inplace_mul_vec_by_scalar};
1011
#[cfg(feature = "parallel")]
1112
use rayon::prelude::*;
1213

@@ -123,8 +124,7 @@ impl Encoder {
123124
self.data
124125
.chunks_exact(self.piece_byte_len)
125126
.zip(coding_vector)
126-
.map(|(piece, &random_symbol)| gf256_mul_vec_by_scalar(piece, random_symbol))
127-
.for_each(|cur| gf256_inplace_add_vectors(coded_piece, &cur));
127+
.for_each(|(piece, &random_symbol)| gf256_mul_vec_by_scalar_then_add_into_vec(coded_piece, piece, random_symbol));
128128

129129
Ok(full_coded_piece)
130130
}
@@ -147,12 +147,29 @@ impl Encoder {
147147
.data
148148
.par_chunks_exact(self.piece_byte_len)
149149
.zip(coding_vector)
150-
.map(|(piece, &random_symbol)| piece.iter().map(move |&symbol| (Gf256::new(symbol) * Gf256::new(random_symbol)).get()))
150+
.map(|(piece, &random_symbol)| {
151+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
152+
{
153+
let mut scalar_x_piece = piece.to_vec();
154+
gf256_inplace_mul_vec_by_scalar(&mut scalar_x_piece, random_symbol);
155+
156+
scalar_x_piece
157+
}
158+
159+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
160+
{
161+
piece.iter().map(move |&symbol| (Gf256::new(symbol) * Gf256::new(random_symbol)).get())
162+
}
163+
})
151164
.fold(
152165
|| vec![0u8; self.piece_byte_len],
153166
|mut acc, cur| {
167+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
168+
gf256_inplace_add_vectors(&mut acc, &cur);
169+
170+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
154171
acc.iter_mut().zip(cur).for_each(|(a, b)| {
155-
*a = (Gf256::new(*a) + Gf256::new(b)).get();
172+
*a ^= b;
156173
});
157174

158175
acc
@@ -161,6 +178,10 @@ impl Encoder {
161178
.reduce(
162179
|| vec![0u8; self.piece_byte_len],
163180
|mut acc, cur| {
181+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
182+
gf256_inplace_add_vectors(&mut acc, &cur);
183+
184+
#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
164185
acc.iter_mut().zip(cur).for_each(|(a, b)| {
165186
*a ^= b;
166187
});

src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,9 @@
9393
//!
9494
//! ```toml
9595
//! [dependencies]
96-
//! rlnc = "=0.8.0" # On x86 target, it offers AVX2 and SSSE3 optimization for fast encoding/ recoding.
96+
//! rlnc = "=0.8.1" # On x86 target, it offers AVX2 and SSSE3 optimization for fast encoding/ recoding.
9797
//! # or
98-
//! rlnc = { version = "=0.8.0", features = "parallel" } # Uses `rayon`-based data-parallelism for fast encoding/ recoding.
98+
//! rlnc = { version = "=0.8.1", features = ["parallel"] } # Uses `rayon`-based data-parallelism for fast encoding/ recoding.
9999
//!
100100
//! rand = { version = "=0.9.1" } # Required for random number generation
101101
//! ```

0 commit comments

Comments
 (0)