Skip to content

Commit d2b9550

Browse files
can1357sylvestre
andauthored
split: preserve non-UTF-8 bytes in output filename generation (#11397)
Co-authored-by: Sylvestre Ledru <sylvestre@debian.org>
1 parent 9ccd9ad commit d2b9550

5 files changed

Lines changed: 119 additions & 78 deletions

File tree

src/uu/split/src/filenames.rs

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ impl<'a> FilenameIterator<'a> {
334334
}
335335

336336
impl Iterator for FilenameIterator<'_> {
337-
type Item = String;
337+
type Item = OsString;
338338

339339
fn next(&mut self) -> Option<Self::Item> {
340340
if self.first_iteration {
@@ -344,12 +344,10 @@ impl Iterator for FilenameIterator<'_> {
344344
}
345345
// The first and third parts are just taken directly from the
346346
// struct parameters unchanged.
347-
Some(format!(
348-
"{}{}{}",
349-
self.prefix.to_string_lossy(),
350-
self.number,
351-
self.additional_suffix.to_string_lossy()
352-
))
347+
let mut filename = self.prefix.to_os_string();
348+
filename.push(self.number.to_string());
349+
filename.push(self.additional_suffix);
350+
Some(filename)
353351
}
354352
}
355353

@@ -512,4 +510,29 @@ mod tests {
512510
let it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix);
513511
assert!(it.is_err());
514512
}
513+
#[test]
514+
#[cfg(unix)]
515+
fn test_filename_iterator_preserves_non_utf8_bytes() {
516+
use std::ffi::OsStr;
517+
use std::os::unix::ffi::{OsStrExt, OsStringExt};
518+
519+
let suffix = Suffix {
520+
stype: SuffixType::Alphabetic,
521+
length: 2,
522+
start: 0,
523+
auto_widening: false,
524+
additional: std::ffi::OsString::from_vec(vec![0xFE]),
525+
};
526+
527+
let mut it = FilenameIterator::new(OsStr::from_bytes(b"p\xFF"), &suffix)
528+
.expect("valid fixed-width filename iterator");
529+
assert_eq!(
530+
it.next().expect("first chunk filename exists").into_vec(),
531+
b"p\xFFaa\xFE".to_vec()
532+
);
533+
assert_eq!(
534+
it.next().expect("second chunk filename exists").into_vec(),
535+
b"p\xFFab\xFE".to_vec()
536+
);
537+
}
515538
}

src/uu/split/src/platform/unix.rs

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
// For the full copyright and license information, please view the LICENSE
44
// file that was distributed with this source code.
55
use std::env;
6-
use std::ffi::OsStr;
6+
use std::ffi::{OsStr, OsString};
77
use std::io::{BufWriter, Error, Result};
88
use std::io::{ErrorKind, Write};
99
use std::path::Path;
1010
use std::process::{Child, Command, Stdio};
11+
use uucore::display::Quotable;
1112
use uucore::error::USimpleError;
1213
use uucore::fs;
1314
use uucore::fs::FileInformation;
@@ -45,12 +46,12 @@ struct WithEnvVarSet {
4546
/// Env var key
4647
previous_var_key: String,
4748
/// Previous value set to this key
48-
previous_var_value: std::result::Result<String, env::VarError>,
49+
previous_var_value: Option<OsString>,
4950
}
5051
impl WithEnvVarSet {
5152
/// Save previous value assigned to key, set key=value
52-
fn new(key: &str, value: &str) -> Self {
53-
let previous_env_value = env::var(key);
53+
fn new(key: &str, value: &OsStr) -> Self {
54+
let previous_env_value = env::var_os(key);
5455
unsafe {
5556
env::set_var(key, value);
5657
}
@@ -64,7 +65,7 @@ impl WithEnvVarSet {
6465
impl Drop for WithEnvVarSet {
6566
/// Restore previous value now that this is being dropped by context
6667
fn drop(&mut self) {
67-
if let Ok(ref prev_value) = self.previous_var_value {
68+
if let Some(prev_value) = &self.previous_var_value {
6869
unsafe {
6970
env::set_var(&self.previous_var_key, prev_value);
7071
}
@@ -82,7 +83,7 @@ impl FilterWriter {
8283
///
8384
/// * `command` - The shell command to execute
8485
/// * `filepath` - Path of the output file (forwarded to command as $FILE)
85-
fn new(command: &str, filepath: &str) -> Result<Self> {
86+
fn new(command: &str, filepath: &OsStr) -> Result<Self> {
8687
// set $FILE, save previous value (if there was one)
8788
let _with_env_var_set = WithEnvVarSet::new("FILE", filepath);
8889

@@ -127,7 +128,7 @@ impl Drop for FilterWriter {
127128
/// Instantiate either a file writer or a "write to shell process's stdin" writer
128129
pub fn instantiate_current_writer(
129130
filter: Option<&str>,
130-
filename: &str,
131+
filename: &OsStr,
131132
is_new: bool,
132133
) -> Result<BufWriter<Box<dyn Write>>> {
133134
match filter {
@@ -138,24 +139,25 @@ pub fn instantiate_current_writer(
138139
.write(true)
139140
.create(true)
140141
.truncate(true)
141-
.open(Path::new(&filename))
142+
.open(Path::new(filename))
142143
.map_err(|e| match e.kind() {
143144
ErrorKind::IsADirectory => Error::other(
144-
translate!("split-error-is-a-directory", "dir" => filename),
145+
translate!("split-error-is-a-directory", "dir" => filename.quote()),
145146
),
146147
_ => Error::other(
147-
translate!("split-error-unable-to-open-file", "file" => filename),
148+
translate!("split-error-unable-to-open-file", "file" => filename.quote()),
148149
),
149150
})?
150151
} else {
151152
// re-open file that we previously created to append to it
152153
std::fs::OpenOptions::new()
153154
.append(true)
154-
.open(Path::new(&filename))
155+
.open(Path::new(filename))
155156
.map_err(|_| {
156-
Error::other(
157-
translate!("split-error-unable-to-reopen-file", "file" => filename),
158-
)
157+
Error::other(translate!(
158+
"split-error-unable-to-reopen-file",
159+
"file" => filename.quote()
160+
))
159161
})?
160162
};
161163
Ok(BufWriter::new(Box::new(file) as Box<dyn Write>))

src/uu/split/src/platform/windows.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use std::ffi::OsStr;
66
use std::io::{BufWriter, Error, Result};
77
use std::io::{ErrorKind, Write};
88
use std::path::Path;
9+
use uucore::display::Quotable;
910
use uucore::fs;
1011
use uucore::translate;
1112

@@ -15,7 +16,7 @@ use uucore::translate;
1516
/// a file writer
1617
pub fn instantiate_current_writer(
1718
_filter: Option<&str>,
18-
filename: &str,
19+
filename: &OsStr,
1920
is_new: bool,
2021
) -> Result<BufWriter<Box<dyn Write>>> {
2122
let file = if is_new {
@@ -24,22 +25,24 @@ pub fn instantiate_current_writer(
2425
.write(true)
2526
.create(true)
2627
.truncate(true)
27-
.open(Path::new(&filename))
28+
.open(Path::new(filename))
2829
.map_err(|e| match e.kind() {
29-
ErrorKind::IsADirectory => {
30-
Error::other(translate!("split-error-is-a-directory", "dir" => filename))
31-
}
32-
_ => {
33-
Error::other(translate!("split-error-unable-to-open-file", "file" => filename))
34-
}
30+
ErrorKind::IsADirectory => Error::other(
31+
translate!("split-error-is-a-directory", "dir" => filename.quote()),
32+
),
33+
_ => Error::other(
34+
translate!("split-error-unable-to-open-file", "file" => filename.quote()),
35+
),
3536
})?
3637
} else {
3738
// re-open file that we previously created to append to it
3839
std::fs::OpenOptions::new()
3940
.append(true)
40-
.open(Path::new(&filename))
41+
.open(Path::new(filename))
4142
.map_err(|_| {
42-
Error::other(translate!("split-error-unable-to-reopen-file", "file" => filename))
43+
Error::other(
44+
translate!("split-error-unable-to-reopen-file", "file" => filename.quote()),
45+
)
4346
})?
4447
};
4548
Ok(BufWriter::new(Box::new(file) as Box<dyn Write>))

src/uu/split/src/split.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use crate::filenames::{FilenameIterator, Suffix, SuffixError};
1414
use crate::strategy::{NumberType, Strategy, StrategyError};
1515
use clap::{Arg, ArgAction, ArgMatches, Command, ValueHint, parser::ValueSource};
1616
use std::env;
17-
use std::ffi::OsString;
17+
use std::ffi::{OsStr, OsString};
1818
use std::fs::{File, metadata};
1919
use std::io;
2020
use std::io::{BufRead, BufReader, BufWriter, ErrorKind, Read, Seek, SeekFrom, Write, stdin};
@@ -540,10 +540,10 @@ impl Settings {
540540

541541
fn instantiate_current_writer(
542542
&self,
543-
filename: &str,
543+
filename: &OsStr,
544544
is_new: bool,
545545
) -> io::Result<BufWriter<Box<dyn Write>>> {
546-
if platform::paths_refer_to_same_file(&self.input, filename.as_ref()) {
546+
if platform::paths_refer_to_same_file(&self.input, filename) {
547547
return Err(io::Error::other(
548548
translate!("split-error-would-overwrite-input", "file" => filename.quote()),
549549
));
@@ -920,7 +920,7 @@ impl Write for LineChunkWriter<'_> {
920920

921921
/// Output file parameters
922922
struct OutFile {
923-
filename: String,
923+
filename: OsString,
924924
maybe_writer: Option<BufWriter<Box<dyn Write>>>,
925925
is_new: bool,
926926
}
@@ -974,7 +974,7 @@ impl ManageOutFiles for OutFiles {
974974
let maybe_writer = if is_writer_optional {
975975
None
976976
} else {
977-
let instantiated = settings.instantiate_current_writer(filename.as_str(), true);
977+
let instantiated = settings.instantiate_current_writer(&filename, true);
978978
// If there was an error instantiating the writer for a file,
979979
// it could be due to hitting the system limit of open files,
980980
// so record it as None and let [`get_writer`] function handle closing/re-opening
@@ -1011,7 +1011,7 @@ impl ManageOutFiles for OutFiles {
10111011
// might "steel" the freed fd and open a file on its side. Then it would be beneficial
10121012
// if split would be able to close another fd before cancellation.
10131013
'loop1: loop {
1014-
let filename_to_open = self[idx].filename.as_str();
1014+
let filename_to_open = &self[idx].filename;
10151015
let file_to_open_is_new = self[idx].is_new;
10161016
let maybe_writer =
10171017
settings.instantiate_current_writer(filename_to_open, file_to_open_is_new);

tests/by-util/test_split.rs

Lines changed: 54 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,61 +2021,74 @@ fn test_split_non_utf8_paths() {
20212021

20222022
#[test]
20232023
#[cfg(target_os = "linux")]
2024-
fn test_split_non_utf8_prefix() {
2025-
use std::os::unix::ffi::OsStrExt;
2024+
fn test_split_non_utf8_prefix_is_byte_preserving() {
2025+
use std::ffi::OsStr;
2026+
use std::os::unix::ffi::{OsStrExt, OsStringExt};
2027+
20262028
let (at, mut ucmd) = at_and_ucmd!();
2029+
at.write("input.txt", "AB");
20272030

2028-
at.write("input.txt", "line1\nline2\nline3\nline4\n");
2031+
let invalid_prefix_bytes = b"p\xFF";
2032+
at.write("p�aa", "keep-aa");
2033+
at.write("p�ab", "keep-ab");
20292034

2030-
let prefix = std::ffi::OsStr::from_bytes(b"\xFF\xFE");
2031-
ucmd.arg("input.txt").arg(prefix).succeeds();
2035+
ucmd.args(&["-b", "1", "input.txt"])
2036+
.arg(OsStr::from_bytes(invalid_prefix_bytes))
2037+
.succeeds();
20322038

2033-
// Check that split files were created (functionality works)
2034-
// The actual filename may be converted due to lossy conversion, but the command should succeed
2035-
let entries: Vec<_> = fs::read_dir(at.as_string()).unwrap().collect();
2036-
let split_files = entries
2037-
.iter()
2038-
.filter_map(|e| e.as_ref().ok())
2039-
.filter(|entry| {
2040-
let name = entry.file_name();
2041-
let name_str = name.to_string_lossy();
2042-
name_str.starts_with("�") || name_str.len() > 2 // split files should exist
2039+
let mut produced_split_files: Vec<Vec<u8>> = fs::read_dir(at.as_string())
2040+
.expect("temporary split directory should be readable")
2041+
.filter_map(|entry| {
2042+
let entry = entry.ok()?;
2043+
let name = entry.file_name().into_vec();
2044+
if name.starts_with(invalid_prefix_bytes) {
2045+
Some(name)
2046+
} else {
2047+
None
2048+
}
20432049
})
2044-
.count();
2045-
assert!(
2046-
split_files > 0,
2047-
"Expected at least one split file to be created"
2050+
.collect();
2051+
produced_split_files.sort();
2052+
2053+
assert_eq!(
2054+
produced_split_files,
2055+
vec![b"p\xFFaa".to_vec(), b"p\xFFab".to_vec()]
20482056
);
2057+
assert_eq!(at.read("p�aa"), "keep-aa");
2058+
assert_eq!(at.read("p�ab"), "keep-ab");
20492059
}
20502060

20512061
#[test]
20522062
#[cfg(target_os = "linux")]
2053-
fn test_split_non_utf8_additional_suffix() {
2054-
use std::os::unix::ffi::OsStrExt;
2055-
let (at, mut ucmd) = at_and_ucmd!();
2063+
fn test_split_non_utf8_additional_suffix_is_byte_preserving() {
2064+
use std::ffi::OsStr;
2065+
use std::os::unix::ffi::{OsStrExt, OsStringExt};
20562066

2057-
at.write("input.txt", "line1\nline2\nline3\nline4\n");
2067+
let (at, mut ucmd) = at_and_ucmd!();
2068+
at.write("input.txt", "AB");
20582069

2059-
let suffix = std::ffi::OsStr::from_bytes(b"\xFF\xFE");
2060-
ucmd.args(&["input.txt", "--additional-suffix"])
2061-
.arg(suffix)
2070+
let suffix_bytes = b"\xFF\xFE";
2071+
ucmd.args(&["-b", "1", "input.txt", "--additional-suffix"])
2072+
.arg(OsStr::from_bytes(suffix_bytes))
20622073
.succeeds();
20632074

2064-
// Check that split files were created (functionality works)
2065-
// The actual filename may be converted due to lossy conversion, but the command should succeed
2066-
let entries: Vec<_> = fs::read_dir(at.as_string()).unwrap().collect();
2067-
let split_files = entries
2068-
.iter()
2069-
.filter_map(|e| e.as_ref().ok())
2070-
.filter(|entry| {
2071-
let name = entry.file_name();
2072-
let name_str = name.to_string_lossy();
2073-
name_str.ends_with("�") || name_str.starts_with('x') // split files should exist
2075+
let mut produced_with_suffix: Vec<Vec<u8>> = fs::read_dir(at.as_string())
2076+
.expect("temporary split directory should be readable")
2077+
.filter_map(|entry| {
2078+
let entry = entry.ok()?;
2079+
let name = entry.file_name().into_vec();
2080+
if name.starts_with(b"xa") && name.ends_with(suffix_bytes) {
2081+
Some(name)
2082+
} else {
2083+
None
2084+
}
20742085
})
2075-
.count();
2076-
assert!(
2077-
split_files > 0,
2078-
"Expected at least one split file to be created"
2086+
.collect();
2087+
produced_with_suffix.sort();
2088+
2089+
assert_eq!(
2090+
produced_with_suffix,
2091+
vec![b"xaa\xFF\xFE".to_vec(), b"xab\xFF\xFE".to_vec()]
20792092
);
20802093
}
20812094

@@ -2089,5 +2102,5 @@ fn test_split_directory_already_exists() {
20892102
ucmd.args(&["file"])
20902103
.fails_with_code(1)
20912104
.no_stdout()
2092-
.stderr_is("split: xaa: Is a directory\n");
2105+
.stderr_is("split: 'xaa': Is a directory\n");
20932106
}

0 commit comments

Comments
 (0)