Skip to content

Commit 94d00bd

Browse files
blyxxyzoech3
authored and committed
shuf: feature: Add --random-seed option
This adds a new option to get reproducible output from a seed. This was already possible with --random-source, but doing that properly was tricky and had poor performance.

Adding this option implies a commitment to keep using the exact same algorithms in the future. For that reason we only use third-party libraries for well-known algorithms and implement our own distributions on top of that.

-----

As a teenager on King's Day I once used `shuf` for divination. People paid €0.50 to enter a cramped tent and sat down next to me behind an old netbook. I would ask their name and their sun sign and pipe this information into `shuf --random-source=/dev/stdin`, which selected pseudo-random dictionary words and `tee`d them into `espeak`. If someone's name was too short `shuf` crashed with an end of file error. --random-seed would have worked better.
1 parent bf167f9 commit 94d00bd

10 files changed

Lines changed: 250 additions & 28 deletions

File tree

.vscode/cspell.dictionaries/jargon.wordlist.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ fileio
5555
filesystem
5656
filesystems
5757
flamegraph
58+
footgun
5859
freeram
5960
fsxattr
6061
fullblock

.vscode/cspell.dictionaries/people.wordlist.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ Boden Garman
3737
Chirag B Jadwani
3838
Chirag
3939
Jadwani
40+
Daniel Lemire
41+
Daniel
42+
Lemire
4043
Derek Chiang
4144
Derek
4245
Chiang

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@ phf_codegen = "0.13.1"
356356
platform-info = "2.0.3"
357357
procfs = "0.18"
358358
rand = { version = "0.9.0", features = ["small_rng"] }
359+
rand_chacha = { version = "0.9.0" }
359360
rand_core = "0.9.0"
360361
rayon = "1.10"
361362
regex = "1.10.4"

src/uu/shuf/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ path = "src/shuf.rs"
2121
clap = { workspace = true }
2222
itoa = { workspace = true }
2323
rand = { workspace = true }
24+
rand_chacha = { workspace = true }
2425
rand_core = { workspace = true }
26+
sha3 = { workspace = true }
2527
uucore = { workspace = true }
2628
fluent = { workspace = true }
2729

src/uu/shuf/locales/en-US.ftl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ shuf-help-echo = treat each ARG as an input line
1010
shuf-help-input-range = treat each number LO through HI as an input line
1111
shuf-help-head-count = output at most COUNT lines
1212
shuf-help-output = write result to FILE instead of standard output
13+
shuf-help-random-seed = seed with STRING for reproducible output
1314
shuf-help-random-source = get random bytes from FILE
1415
shuf-help-repeat = output lines can be repeated
1516
shuf-help-zero-terminated = line delimiter is NUL, not newline

src/uu/shuf/src/compat_random_source.rs

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
use std::io::BufRead;
1+
// This file is part of the uutils coreutils package.
2+
//
3+
// For the full copyright and license information, please view the LICENSE
4+
// file that was distributed with this source code.
5+
6+
use std::{io::BufRead, ops::RangeInclusive};
27

38
use uucore::error::{FromIo, UResult, USimpleError};
49
use uucore::translate;
@@ -42,7 +47,7 @@ impl<R> RandomSourceAdapter<R> {
4247
}
4348

4449
impl<R: BufRead> RandomSourceAdapter<R> {
45-
pub fn get_value(&mut self, at_most: u64) -> UResult<u64> {
50+
fn generate_at_most(&mut self, at_most: u64) -> UResult<u64> {
4651
while self.entropy < at_most {
4752
let buf = self
4853
.reader
@@ -88,18 +93,29 @@ impl<R: BufRead> RandomSourceAdapter<R> {
8893
self.state %= num_possibilities;
8994
self.entropy %= num_possibilities;
9095
// I sure hope the compiler optimizes this tail call.
91-
self.get_value(at_most)
96+
self.generate_at_most(at_most)
9297
}
9398
}
9499

100+
pub fn choose_from_range(&mut self, range: RangeInclusive<u64>) -> UResult<u64> {
101+
let offset = self.generate_at_most(*range.end() - *range.start())?;
102+
Ok(*range.start() + offset)
103+
}
104+
105+
pub fn choose_from_slice<T: Copy>(&mut self, vals: &[T]) -> UResult<T> {
106+
assert!(!vals.is_empty());
107+
let idx = self.generate_at_most(vals.len() as u64 - 1)? as usize;
108+
Ok(vals[idx])
109+
}
110+
95111
pub fn shuffle<'a, T>(&mut self, vals: &'a mut [T], amount: usize) -> UResult<&'a mut [T]> {
96112
// Fisher-Yates shuffle.
97113
// TODO: GNU does something different if amount <= vals.len() and the input is stdin.
98114
// The order changes completely and depends on --head-count.
99115
// No clue what they might do differently and why.
100116
let amount = amount.min(vals.len());
101117
for idx in 0..amount {
102-
let other_idx = self.get_value((vals.len() - idx - 1) as u64)? as usize + idx;
118+
let other_idx = self.generate_at_most((vals.len() - idx - 1) as u64)? as usize + idx;
103119
vals.swap(idx, other_idx);
104120
}
105121
Ok(&mut vals[..amount])

src/uu/shuf/src/random_seed.rs

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
// This file is part of the uutils coreutils package.
2+
//
3+
// For the full copyright and license information, please view the LICENSE
4+
// file that was distributed with this source code.
5+
6+
use std::ops::RangeInclusive;
7+
8+
use rand::{RngCore as _, SeedableRng as _};
9+
use rand_chacha::ChaCha12Rng;
10+
use sha3::{Digest as _, Sha3_256};
11+
12+
/// Reproducible seeded random number generation.
13+
///
14+
/// The behavior should stay the same between releases, so don't change it without
15+
/// a very good reason.
16+
///
17+
/// # How it works
18+
///
19+
/// - Take a Unicode string as the seed.
20+
///
21+
/// - Encode this seed as UTF-8.
22+
///
23+
/// - Take the SHA3-256 hash of the encoded seed.
24+
///
25+
/// - Use that hash as the input for a [`rand_chacha`] ChaCha12 RNG.
26+
/// (We don't touch the nonce, so that's probably zero.)
27+
///
28+
/// - Take 64-bit samples from the RNG.
29+
///
30+
/// - Use Lemire's method to generate uniformly distributed integers and:
31+
///
32+
/// - With --repeat, use these to pick elements from ranges.
33+
///
34+
/// - Without --repeat, use these to do left-to-right modern Fisher-Yates.
35+
///
36+
/// - Or for --input-range without --repeat, do whatever NonrepeatingIterator does.
37+
/// (We may want to change that. Watch this space.)
38+
///
39+
/// # Why it works like this
40+
///
41+
/// - Unicode string: Greatest common denominator between platforms. Windows doesn't
42+
/// let you pass raw bytes as a CLI argument and that would be bad practice anyway.
43+
/// A decimal or hex number would work but this is much more flexible without being
44+
/// unmanageable.
45+
///
46+
/// (Footgun: if the user passes a filename we won't read from the file but the
47+
/// command will run anyway.)
48+
///
49+
/// - UTF-8: That's what Rust likes and it's the least unreasonable Unicode encoding.
50+
///
51+
/// - SHA3-256: We want to make good use of the entire user input and SHA-3 is
52+
/// state of the art. ChaCha12 takes a 256-bit seed.
53+
///
54+
/// - ChaCha12: [`rand`]'s default rng as of writing. Seems state of the art.
55+
///
56+
/// - 64-bit samples: We could often get away with 32-bit samples but let's keep things
57+
/// simple and only use one width. (There doesn't seem to be much of a performance hit.)
58+
///
59+
/// - Lemire, Fisher-Yates: These are very easy to implement and maintain ourselves.
60+
/// `rand` provides fancier implementations but only promises reproducibility within
61+
/// patch releases: <https://rust-random.github.io/book/crate-reprod.html>
62+
///
63+
/// Strictly speaking even `ChaCha12` is subject to breakage. But since it's a very
64+
/// specific algorithm I assume it's safe in practice.
65+
pub struct SeededRng(Box<ChaCha12Rng>);
66+
67+
impl SeededRng {
68+
pub fn new(seed: &str) -> Self {
69+
let mut hasher = Sha3_256::new();
70+
hasher.update(seed.as_bytes());
71+
let seed = hasher.finalize();
72+
let seed = seed.as_slice().try_into().unwrap();
73+
Self(Box::new(rand_chacha::ChaCha12Rng::from_seed(seed)))
74+
}
75+
76+
#[allow(clippy::many_single_char_names)] // use original lemire names for easy comparison
77+
fn generate_at_most(&mut self, at_most: u64) -> u64 {
78+
if at_most == u64::MAX {
79+
return self.0.next_u64();
80+
}
81+
82+
// https://lemire.me/blog/2019/06/06/nearly-divisionless-random-integer-generation-on-various-systems/
83+
let s: u64 = at_most + 1;
84+
let mut x: u64 = self.0.next_u64();
85+
let mut m: u128 = u128::from(x) * u128::from(s);
86+
let mut l: u64 = m as u64;
87+
if l < s {
88+
let t: u64 = s.wrapping_neg() % s;
89+
while l < t {
90+
x = self.0.next_u64();
91+
m = u128::from(x) * u128::from(s);
92+
l = m as u64;
93+
}
94+
}
95+
(m >> 64) as u64
96+
}
97+
98+
pub fn choose_from_range(&mut self, range: RangeInclusive<u64>) -> u64 {
99+
let offset = self.generate_at_most(*range.end() - *range.start());
100+
*range.start() + offset
101+
}
102+
103+
pub fn choose_from_slice<T: Copy>(&mut self, vals: &[T]) -> T {
104+
assert!(!vals.is_empty());
105+
let idx = self.generate_at_most(vals.len() as u64 - 1) as usize;
106+
vals[idx]
107+
}
108+
109+
pub fn shuffle<'a, T>(&mut self, vals: &'a mut [T], amount: usize) -> &'a mut [T] {
110+
// Fisher-Yates shuffle.
111+
let amount = amount.min(vals.len());
112+
for idx in 0..amount {
113+
let other_idx = self.generate_at_most((vals.len() - idx - 1) as u64) as usize + idx;
114+
vals.swap(idx, other_idx);
115+
}
116+
&mut vals[..amount]
117+
}
118+
}

src/uu/shuf/src/shuf.rs

Lines changed: 53 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,32 @@
55

66
// spell-checker:ignore (ToDO) cmdline evec nonrepeating seps shufable rvec fdata
77

8-
use clap::builder::ValueParser;
9-
use clap::{Arg, ArgAction, Command};
10-
use rand::Rng;
11-
use rand::seq::{IndexedRandom, SliceRandom};
128
use std::ffi::{OsStr, OsString};
139
use std::fs::File;
1410
use std::io::{BufReader, BufWriter, Error, Read, Write, stdin, stdout};
1511
use std::ops::RangeInclusive;
1612
use std::path::{Path, PathBuf};
1713
use std::str::FromStr;
14+
15+
use clap::{Arg, ArgAction, Command, builder::ValueParser};
16+
use rand::rngs::ThreadRng;
17+
use rand::{
18+
Rng,
19+
seq::{IndexedRandom, SliceRandom},
20+
};
21+
1822
use uucore::display::{OsWrite, Quotable};
1923
use uucore::error::{FromIo, UResult, USimpleError, UUsageError};
2024
use uucore::format_usage;
2125
use uucore::translate;
2226

2327
mod compat_random_source;
2428
mod nonrepeating_iterator;
29+
mod random_seed;
2530

31+
use compat_random_source::RandomSourceAdapter;
2632
use nonrepeating_iterator::NonrepeatingIterator;
33+
use random_seed::SeededRng;
2734

2835
enum Mode {
2936
Default(PathBuf),
@@ -36,17 +43,24 @@ const BUF_SIZE: usize = 64 * 1024;
3643
struct Options {
3744
head_count: u64,
3845
output: Option<PathBuf>,
39-
random_source: Option<PathBuf>,
46+
random_source: RandomSource,
4047
repeat: bool,
4148
sep: u8,
4249
}
4350

51+
enum RandomSource {
52+
None,
53+
Seed(String),
54+
File(PathBuf),
55+
}
56+
4457
mod options {
4558
pub static ECHO: &str = "echo";
4659
pub static INPUT_RANGE: &str = "input-range";
4760
pub static HEAD_COUNT: &str = "head-count";
4861
pub static OUTPUT: &str = "output";
4962
pub static RANDOM_SOURCE: &str = "random-source";
63+
pub static RANDOM_SEED: &str = "random-seed";
5064
pub static REPEAT: &str = "repeat";
5165
pub static ZERO_TERMINATED: &str = "zero-terminated";
5266
pub static FILE_OR_ARGS: &str = "file-or-args";
@@ -80,6 +94,14 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
8094
Mode::Default(file.into())
8195
};
8296

97+
let random_source = if let Some(filename) = matches.get_one(options::RANDOM_SOURCE).cloned() {
98+
RandomSource::File(filename)
99+
} else if let Some(seed) = matches.get_one(options::RANDOM_SEED).cloned() {
100+
RandomSource::Seed(seed)
101+
} else {
102+
RandomSource::None
103+
};
104+
83105
let options = Options {
84106
// GNU shuf takes the lowest value passed, so we imitate that.
85107
// It's probably a bug or an implementation artifact though.
@@ -92,7 +114,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
92114
.min()
93115
.unwrap_or(u64::MAX),
94116
output: matches.get_one(options::OUTPUT).cloned(),
95-
random_source: matches.get_one(options::RANDOM_SOURCE).cloned(),
117+
random_source,
96118
repeat: matches.get_flag(options::REPEAT),
97119
sep: if matches.get_flag(options::ZERO_TERMINATED) {
98120
b'\0'
@@ -120,14 +142,15 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
120142
}
121143

122144
let mut rng = match options.random_source {
123-
Some(ref r) => {
145+
RandomSource::None => WrappedRng::Default(rand::rng()),
146+
RandomSource::Seed(ref seed) => WrappedRng::Seed(SeededRng::new(seed)),
147+
RandomSource::File(ref r) => {
124148
let file = File::open(r).map_err_context(
125149
|| translate!("shuf-error-failed-to-open-random-source", "file" => r.quote()),
126150
)?;
127151
let file = BufReader::new(file);
128-
WrappedRng::RngFile(compat_random_source::RandomSourceAdapter::new(file))
152+
WrappedRng::File(compat_random_source::RandomSourceAdapter::new(file))
129153
}
130-
None => WrappedRng::RngDefault(rand::rng()),
131154
};
132155

133156
match mode {
@@ -191,6 +214,15 @@ pub fn uu_app() -> Command {
191214
.value_parser(ValueParser::path_buf())
192215
.value_hint(clap::ValueHint::FilePath),
193216
)
217+
.arg(
218+
Arg::new(options::RANDOM_SEED)
219+
.long(options::RANDOM_SEED)
220+
.value_name("STRING")
221+
.help(translate!("shuf-help-random-seed"))
222+
.value_parser(ValueParser::string())
223+
.value_hint(clap::ValueHint::Other)
224+
.conflicts_with(options::RANDOM_SOURCE),
225+
)
194226
.arg(
195227
Arg::new(options::RANDOM_SOURCE)
196228
.long(options::RANDOM_SOURCE)
@@ -402,36 +434,33 @@ fn parse_range(input_range: &str) -> Result<RangeInclusive<u64>, String> {
402434
}
403435

404436
enum WrappedRng {
405-
RngDefault(rand::rngs::ThreadRng),
406-
RngFile(compat_random_source::RandomSourceAdapter<BufReader<File>>),
437+
Default(ThreadRng),
438+
Seed(SeededRng),
439+
File(RandomSourceAdapter<BufReader<File>>),
407440
}
408441

409442
impl WrappedRng {
410443
fn choose<T: Copy>(&mut self, vals: &[T]) -> UResult<T> {
411444
match self {
412-
Self::RngDefault(rng) => Ok(*vals.choose(rng).unwrap()),
413-
Self::RngFile(adapter) => {
414-
assert!(!vals.is_empty());
415-
let idx = adapter.get_value(vals.len() as u64 - 1)? as usize;
416-
Ok(vals[idx])
417-
}
445+
Self::Default(rng) => Ok(*vals.choose(rng).unwrap()),
446+
Self::Seed(rng) => Ok(rng.choose_from_slice(vals)),
447+
Self::File(rng) => rng.choose_from_slice(vals),
418448
}
419449
}
420450

421451
fn shuffle<'a, T>(&mut self, vals: &'a mut [T], amount: usize) -> UResult<&'a mut [T]> {
422452
match self {
423-
Self::RngDefault(rng) => Ok(vals.partial_shuffle(rng, amount).0),
424-
Self::RngFile(adapter) => adapter.shuffle(vals, amount),
453+
Self::Default(rng) => Ok(vals.partial_shuffle(rng, amount).0),
454+
Self::Seed(rng) => Ok(rng.shuffle(vals, amount)),
455+
Self::File(rng) => rng.shuffle(vals, amount),
425456
}
426457
}
427458

428459
fn choose_from_range(&mut self, range: RangeInclusive<u64>) -> UResult<u64> {
429460
match self {
430-
Self::RngDefault(rng) => Ok(rng.random_range(range)),
431-
Self::RngFile(adapter) => {
432-
let offset = adapter.get_value(*range.end() - *range.start())?;
433-
Ok(*range.start() + offset)
434-
}
461+
Self::Default(rng) => Ok(rng.random_range(range)),
462+
Self::Seed(rng) => Ok(rng.choose_from_range(range)),
463+
Self::File(rng) => rng.choose_from_range(range),
435464
}
436465
}
437466
}

0 commit comments

Comments
 (0)