@@ -157,6 +157,19 @@ impl MarkedEventReceiver for YamlLoader {
157157 }
158158}
159159
160+ #[ derive( Debug ) ]
161+ pub enum LoadError {
162+ IO ( std:: io:: Error ) ,
163+ Scan ( ScanError ) ,
164+ Decode ( std:: borrow:: Cow < ' static , str > ) ,
165+ }
166+
167+ impl From < std:: io:: Error > for LoadError {
168+ fn from ( error : std:: io:: Error ) -> Self {
169+ LoadError :: IO ( error)
170+ }
171+ }
172+
160173impl YamlLoader {
161174 fn insert_new_node ( & mut self , node : ( Yaml , usize ) ) {
162175 // valid anchor id starts from 1
@@ -197,6 +210,42 @@ impl YamlLoader {
197210 parser. load ( & mut loader, true ) ?;
198211 Ok ( loader. docs )
199212 }
213+
214+ pub fn load_from_bytes ( mut source : impl std:: io:: Read ) -> Result < Vec < Yaml > , LoadError > {
215+ let mut buffer = Vec :: new ( ) ;
216+ source. read_to_end ( & mut buffer) ?;
217+
218+ // Decodes the input buffer using either UTF-8, UTF-16LE or UTF-16BE depending on the BOM codepoint.
219+ // If the buffer doesn't start with a BOM codepoint, it will use a fallback encoding obtained by
220+ // detect_utf16_endianness.
221+ let ( res, _) = encoding:: types:: decode (
222+ & buffer,
223+ encoding:: DecoderTrap :: Strict ,
224+ detect_utf16_endianness ( & buffer) ,
225+ ) ;
226+ let s = res. map_err ( LoadError :: Decode ) ?;
227+ YamlLoader :: load_from_str ( & s) . map_err ( LoadError :: Scan )
228+ }
229+ }
230+
231+ /// The encoding crate knows how to tell apart UTF-8 from UTF-16LE and utf-16BE, when the
232+ /// bytestream starts with BOM codepoint.
233+ /// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since
234+ /// in the general case the bytestream could start with a codepoint that uses both bytes.
235+ ///
236+ /// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character.
237+ /// This allows the encoding to be deduced by the pattern of null (#x00) characters.
238+ //
239+ /// See spec at https://yaml.org/spec/1.2/spec.html#id2771184
240+ fn detect_utf16_endianness ( b : & [ u8 ] ) -> encoding:: types:: EncodingRef {
241+ if b. len ( ) > 1 && ( b[ 0 ] != b[ 1 ] ) {
242+ if b[ 0 ] == 0 {
243+ return encoding:: all:: UTF_16BE ;
244+ } else if b[ 1 ] == 0 {
245+ return encoding:: all:: UTF_16LE ;
246+ }
247+ }
248+ encoding:: all:: UTF_8
200249}
201250
202251macro_rules! define_as (
@@ -736,4 +785,67 @@ subcommands3:
736785 let s = "[" . repeat ( 10_000 ) + & "]" . repeat ( 10_000 ) ;
737786 assert ! ( YamlLoader :: load_from_str( & s) . is_err( ) ) ;
738787 }
788+
// Loads a small document whose bytes start with EF BB BF (the UTF-8 byte-order mark)
// through `YamlLoader::load_from_bytes` and checks the parsed values.
// NOTE(review): the spacing inside the byte-string literal below looks altered by
// extraction; confirm the literal against the upstream file before relying on it.
789+ #[ test]
790+ fn test_read_bom ( ) {
791+ let s = b"\xef \xbb \xbf ---
792+ a: 1
793+ b: 2.2
794+ c: [1, 2]
795+ " ;
// The byte-string literal is viewed as a `&[u8]` slice and passed as the input source.
796+ let out = YamlLoader :: load_from_bytes ( s as & [ u8 ] ) . unwrap ( ) ;
// Only the first returned document is inspected.
797+ let doc = & out[ 0 ] ;
798+ assert_eq ! ( doc[ "a" ] . as_i64( ) . unwrap( ) , 1i64 ) ;
799+ assert_eq ! ( doc[ "b" ] . as_f64( ) . unwrap( ) , 2.2f64 ) ;
800+ assert_eq ! ( doc[ "c" ] [ 1 ] . as_i64( ) . unwrap( ) , 2i64 ) ;
// Key "d" is not part of the input, so indexing it is expected to yield a bad value.
801+ assert ! ( doc[ "d" ] [ 0 ] . is_badvalue( ) ) ;
802+ }
803+
// Loads a document whose bytes start with FF FE (the UTF-16LE byte-order mark) through
// `YamlLoader::load_from_bytes` and checks the parsed values.
// NOTE(review): the spacing inside the byte-string literal below looks altered by
// extraction; confirm the literal against the upstream file before relying on it.
804+ #[ test]
805+ fn test_read_utf16le ( ) {
806+ let s = b"\xff \xfe -\x00 -\x00 -\x00
807+ \x00 a\x00 :\x00 \x00 1\x00
808+ \x00 b\x00 :\x00 \x00 2\x00 .\x00 2\x00
809+ \x00 c\x00 :\x00 \x00 [\x00 1\x00 ,\x00 \x00 2\x00 ]\x00
810+ \x00 ";
// The byte-string literal is viewed as a `&[u8]` slice and passed as the input source.
811+ let out = YamlLoader :: load_from_bytes ( s as & [ u8 ] ) . unwrap ( ) ;
// Only the first returned document is inspected.
812+ let doc = & out[ 0 ] ;
// Debug print of the parsed document.
813+ println ! ( "GOT: {:?}" , doc) ;
814+ assert_eq ! ( doc[ "a" ] . as_i64( ) . unwrap( ) , 1i64 ) ;
815+ assert_eq ! ( doc[ "b" ] . as_f64( ) . unwrap( ) , 2.2f64 ) ;
816+ assert_eq ! ( doc[ "c" ] [ 1 ] . as_i64( ) . unwrap( ) , 2i64 ) ;
// Key "d" is not part of the input, so indexing it is expected to yield a bad value.
817+ assert ! ( doc[ "d" ] [ 0 ] . is_badvalue( ) ) ;
818+ }
819+
// Loads a document whose bytes start with FE FF (the UTF-16BE byte-order mark) through
// `YamlLoader::load_from_bytes` and checks the parsed values.
// NOTE(review): the spacing inside the byte-string literal below looks altered by
// extraction; confirm the literal against the upstream file before relying on it.
820+ #[ test]
821+ fn test_read_utf16be ( ) {
822+ let s = b"\xfe \xff \x00 -\x00 -\x00 -\x00
823+ \x00 a\x00 :\x00 \x00 1\x00
824+ \x00 b\x00 :\x00 \x00 2\x00 .\x00 2\x00
825+ \x00 c\x00 :\x00 \x00 [\x00 1\x00 ,\x00 \x00 2\x00 ]\x00
826+ " ;
// The byte-string literal is viewed as a `&[u8]` slice and passed as the input source.
827+ let out = YamlLoader :: load_from_bytes ( s as & [ u8 ] ) . unwrap ( ) ;
// Only the first returned document is inspected.
828+ let doc = & out[ 0 ] ;
// Debug print of the parsed document.
829+ println ! ( "GOT: {:?}" , doc) ;
830+ assert_eq ! ( doc[ "a" ] . as_i64( ) . unwrap( ) , 1i64 ) ;
831+ assert_eq ! ( doc[ "b" ] . as_f64( ) . unwrap( ) , 2.2f64 ) ;
832+ assert_eq ! ( doc[ "c" ] [ 1 ] . as_i64( ) . unwrap( ) , 2i64 ) ;
// Key "d" is not part of the input, so indexing it is expected to yield a bad value.
833+ assert ! ( doc[ "d" ] [ 0 ] . is_badvalue( ) ) ;
834+ }
835+
// Loads a document whose bytes begin directly with `-` followed by a null byte, i.e. with
// no byte-order mark, through `YamlLoader::load_from_bytes`. Without a BOM the decoder has
// to rely on the fallback encoding chosen by `detect_utf16_endianness`.
// NOTE(review): the spacing inside the byte-string literal below looks altered by
// extraction; confirm the literal against the upstream file before relying on it.
836+ #[ test]
837+ fn test_read_utf16le_nobom ( ) {
838+ let s = b"-\x00 -\x00 -\x00
839+ \x00 a\x00 :\x00 \x00 1\x00
840+ \x00 b\x00 :\x00 \x00 2\x00 .\x00 2\x00
841+ \x00 c\x00 :\x00 \x00 [\x00 1\x00 ,\x00 \x00 2\x00 ]\x00
842+ \x00 ";
// The byte-string literal is viewed as a `&[u8]` slice and passed as the input source.
843+ let out = YamlLoader :: load_from_bytes ( s as & [ u8 ] ) . unwrap ( ) ;
// Only the first returned document is inspected.
844+ let doc = & out[ 0 ] ;
// Debug print of the parsed document.
845+ println ! ( "GOT: {:?}" , doc) ;
846+ assert_eq ! ( doc[ "a" ] . as_i64( ) . unwrap( ) , 1i64 ) ;
847+ assert_eq ! ( doc[ "b" ] . as_f64( ) . unwrap( ) , 2.2f64 ) ;
848+ assert_eq ! ( doc[ "c" ] [ 1 ] . as_i64( ) . unwrap( ) , 2i64 ) ;
// Key "d" is not part of the input, so indexing it is expected to yield a bad value.
849+ assert ! ( doc[ "d" ] [ 0 ] . is_badvalue( ) ) ;
850+ }
739851}
0 commit comments