1+ use memchr:: memchr2;
12use std:: io:: { self , BufRead } ;
23
4+ /// Converts a byte vector to a UTF-8 String, mapping encoding errors to `io::Error`.
5+ #[ inline]
6+ fn bytes_to_string ( bytes : Vec < u8 > ) -> io:: Result < String > {
7+ String :: from_utf8 ( bytes) . map_err ( |e| io:: Error :: new ( io:: ErrorKind :: InvalidData , e) )
8+ }
9+
310#[ derive( Clone , Debug ) ]
411pub ( crate ) struct Delimited < ' d , B > {
512 buf : B ,
@@ -45,16 +52,82 @@ impl<'d, B: BufRead> Iterator for DelimitedString<'d, B> {
4552
4653 #[ inline]
4754 fn next ( & mut self ) -> Option < Self :: Item > {
48- match self . inner . next ( ) ? {
49- Err ( e) => Some ( Err ( e) ) ,
50- Ok ( bytes) => match String :: from_utf8 ( bytes) {
51- Ok ( s) => Some ( Ok ( s) ) ,
52- Err ( err) => Some ( Err ( io:: Error :: new ( io:: ErrorKind :: InvalidData , err) ) ) ,
53- } ,
55+ Some ( self . inner . next ( ) ?. and_then ( bytes_to_string) )
56+ }
57+ }
58+
59+ #[ derive( Clone , Debug ) ]
60+ pub ( crate ) struct LineBreakDelimited < B > {
61+ buf : B ,
62+ pending : Vec < u8 > ,
63+ done : bool ,
64+ }
65+
66+ impl < B : BufRead > LineBreakDelimited < B > {
67+ #[ inline]
68+ fn new ( buf : B ) -> Self {
69+ Self {
70+ buf,
71+ pending : Vec :: new ( ) ,
72+ done : false ,
73+ }
74+ }
75+ }
76+
77+ impl < B : BufRead > Iterator for LineBreakDelimited < B > {
78+ type Item = io:: Result < Vec < u8 > > ;
79+
80+ #[ inline]
81+ fn next ( & mut self ) -> Option < Self :: Item > {
82+ if self . done {
83+ return None ;
84+ }
85+ loop {
86+ let buf = match self . buf . fill_buf ( ) {
87+ Ok ( buf) => buf,
88+ Err ( err) => return Some ( Err ( err) ) ,
89+ } ;
90+
91+ if buf. is_empty ( ) {
92+ self . done = true ;
93+ return if self . pending . is_empty ( ) {
94+ None
95+ } else {
96+ Some ( Ok ( std:: mem:: take ( & mut self . pending ) ) )
97+ } ;
98+ }
99+
100+ match memchr2 ( b'\n' , b'\r' , buf) {
101+ Some ( idx) => {
102+ self . pending . extend_from_slice ( & buf[ ..idx] ) ;
103+ self . buf . consume ( idx + 1 ) ;
104+ return Some ( Ok ( std:: mem:: take ( & mut self . pending ) ) ) ;
105+ }
106+ None => {
107+ let len = buf. len ( ) ;
108+ self . pending . extend_from_slice ( buf) ;
109+ self . buf . consume ( len) ;
110+ }
111+ }
54112 }
55113 }
56114}
57115
116+ /// Adapter that wraps the line-break iterator and yields UTF-8 Strings.
117+ #[ derive( Debug ) ]
118+ pub ( crate ) struct LineBreakDelimitedString < B > {
119+ inner : LineBreakDelimited < B > ,
120+ }
121+
122+ impl < B : BufRead > Iterator for LineBreakDelimitedString < B > {
123+ type Item = io:: Result < String > ;
124+
125+ #[ inline]
126+ fn next ( & mut self ) -> Option < Self :: Item > {
127+ Some ( self . inner . next ( ) ?. and_then ( bytes_to_string) )
128+ }
129+ }
130+
58131/// Extension trait to add same utility methods to [`BufRead`].
59132pub ( crate ) trait BufReadExt {
60133 /// Splits the reader by a delimiter, yielding Vec<u8>.
@@ -79,6 +152,21 @@ pub(crate) trait BufReadExt {
79152 inner : self . delimit_by ( delimiter. as_bytes ( ) ) ,
80153 }
81154 }
155+
156+ /// Splits lines on CR, CRLF, or LF, yielding Strings.
157+ ///
158+ /// Both `\r` and `\n` are treated as individual line terminators.
159+ /// This means `\r\n` (CRLF) will yield an empty string between the CR and LF.
160+ /// Use `.filter()` to remove empty lines if this is not desired.
161+ #[ inline]
162+ fn split_lines ( self ) -> LineBreakDelimitedString < Self >
163+ where
164+ Self : Sized + BufRead ,
165+ {
166+ LineBreakDelimitedString {
167+ inner : LineBreakDelimited :: new ( self ) ,
168+ }
169+ }
82170}
83171
84172impl < B : BufRead > BufReadExt for B { }
@@ -162,4 +250,53 @@ mod tests {
162250 assert_eq ! ( delimited. next( ) . unwrap( ) . unwrap( ) , "a||" ) ;
163251 assert_eq ! ( delimited. next( ) . unwrap( ) . unwrap( ) , "|b|" ) ;
164252 }
253+
254+ #[ test]
255+ fn split_lines_splits_mixed_endings ( ) {
256+ let input = b"f\r d1/f1\r \n d1/d2/f4\n d1/d2/f6" ;
257+ let got = BufReader :: new ( & input[ ..] )
258+ . split_lines ( )
259+ . collect :: < io:: Result < Vec < _ > > > ( )
260+ . unwrap ( ) ;
261+ assert_eq ! ( got, vec![ "f" , "d1/f1" , "" , "d1/d2/f4" , "d1/d2/f6" ] ) ;
262+ }
263+
264+ #[ test]
265+ fn split_lines_splits_crlf ( ) {
266+ let input = b"\r \n " ;
267+ let got = BufReader :: new ( & input[ ..] )
268+ . split_lines ( )
269+ . collect :: < io:: Result < Vec < _ > > > ( )
270+ . unwrap ( ) ;
271+ assert_eq ! ( got, vec![ "" , "" ] ) ;
272+ }
273+
274+ #[ test]
275+ fn split_lines_empty_input ( ) {
276+ let input = b"" ;
277+ let got = BufReader :: new ( & input[ ..] )
278+ . split_lines ( )
279+ . collect :: < io:: Result < Vec < _ > > > ( )
280+ . unwrap ( ) ;
281+ assert ! ( got. is_empty( ) ) ;
282+ }
283+
284+ #[ test]
285+ fn split_lines_invalid_utf8_returns_error ( ) {
286+ let input = b"valid\n \xff \xfe \n " ;
287+ let mut iter = BufReader :: new ( & input[ ..] ) . split_lines ( ) ;
288+ assert_eq ! ( iter. next( ) . unwrap( ) . unwrap( ) , "valid" ) ;
289+ let err = iter. next ( ) . unwrap ( ) . unwrap_err ( ) ;
290+ assert_eq ! ( err. kind( ) , io:: ErrorKind :: InvalidData ) ;
291+ }
292+
293+ #[ test]
294+ fn split_lines_consecutive_delimiters ( ) {
295+ let input = b"\n \n \n " ;
296+ let got = BufReader :: new ( & input[ ..] )
297+ . split_lines ( )
298+ . collect :: < io:: Result < Vec < _ > > > ( )
299+ . unwrap ( ) ;
300+ assert_eq ! ( got, vec![ "" , "" , "" ] ) ;
301+ }
165302}
0 commit comments