1818
1919import os
2020import io
21+ import re
2122import sys
2223import socket
2324import shutil
3940 except ImportError :
4041 from StringIO import StringIO as BytesIO
4142
try:
    # Python 3 ships both byte-oriented helpers in urllib.parse.
    from urllib.parse import quote_from_bytes, unquote_to_bytes
except ImportError:
    # Python 2 has no byte-specific variants; wrap urllib.quote/unquote so
    # the rest of the module can use the Python 3 names unconditionally.
    from urllib import quote as _quote, unquote as _unquote

    def quote_from_bytes(b, safe=''):
        """
        Percent-encode the byte string ``b`` (a Py2 ``str``).

        Characters listed in ``safe`` are left unescaped.
        NOTE: the default here is ``safe=''``, whereas Python 3's
        ``quote_from_bytes`` defaults to ``safe='/'``; pass ``safe``
        explicitly for identical results on both versions.
        """
        return _quote(b, safe=safe)

    def unquote_to_bytes(s):
        """
        Percent-decode ``s`` via ``urllib.unquote``.

        For a Py2 ``str`` argument the result is a ``str`` (i.e. bytes).
        """
        return _unquote(s)
58+
59+
60+ _TEXT_MIME_DEFAULT = 'text/plain; charset=utf-8'
61+ _BIN_MIME_DEFAULT = 'application/octet-stream'
62+
63+
64+ def _is_probably_text (data_bytes ):
65+ """
66+ Heuristic: treat as text if it decodes as UTF-8 and does not contain many
67+ control bytes (except common whitespace).
68+ """
69+ if not data_bytes :
70+ return True
71+ # Fast path: NUL strongly suggests binary
72+ if b'\x00 ' in data_bytes :
73+ return False
74+ try :
75+ decoded = data_bytes .decode ('utf-8' )
76+ except Exception :
77+ return False
78+
79+ # Count "control" chars excluding common whitespace
80+ control = 0
81+ for ch in decoded :
82+ o = ord (ch )
83+ if (o < 32 and ch not in u'\t \n \r ' ) or o == 127 :
84+ control += 1
85+ # Allow a small fraction of control chars
86+ return control <= max (1 , len (decoded ) // 200 )
87+
88+
def data_url_encode(fileobj,
                    mime=None,
                    is_text=None,
                    charset='utf-8',
                    base64_encode=None):
    """
    Read all bytes from a file-like object and return a data: URL string.

    Args:
        fileobj: file-like (must support read()) returning bytes/str.
        mime: optional MIME type (e.g. 'image/png', 'text/plain').
              If not provided, defaults to text/plain with the given charset
              for text, or application/octet-stream for binary.
        is_text: force text/binary decision (True/False). If None, auto-detect:
              text streams are always text, byte streams are sniffed with
              _is_probably_text().
        charset: charset used to encode text read from a text stream, and
              declared when defaulting to text/plain or when mime starts with
              text/ and doesn't already declare a charset.
        base64_encode: if True, always base64. If False, always percent-encode.
              If None, choose percent-encode for text and base64 for binary.

    Returns:
        A unicode/text string containing the full data URL.
    """
    raw = fileobj.read()

    # Normalize to bytes; only run the content heuristic when the caller did
    # not already decide between text and binary.
    if isinstance(raw, text_type):
        raw_bytes = raw.encode(charset)
        if is_text is None:
            is_text = True
    else:
        raw_bytes = raw
        if is_text is None:
            is_text = _is_probably_text(raw_bytes)

    if mime is None:
        if not is_text:
            mime = _BIN_MIME_DEFAULT
        elif charset.lower() == 'utf-8':
            mime = _TEXT_MIME_DEFAULT
        else:
            # Declare the charset that was actually requested instead of
            # unconditionally claiming UTF-8.
            mime = 'text/plain; charset=' + charset
    else:
        # If it's a text/* mime and no charset is declared, append one.
        mime_lower = mime.lower()
        if mime_lower.startswith('text/') and 'charset=' not in mime_lower:
            mime = mime + '; charset=' + charset

    if base64_encode is None:
        base64_encode = not is_text  # text => percent, binary => base64

    if base64_encode:
        b64 = base64.b64encode(raw_bytes)
        if not isinstance(b64, text_type):
            b64 = b64.decode('ascii')
        return u'data:{0};base64,{1}'.format(mime, b64)

    # Percent-encode the raw bytes, keeping URL-safe punctuation readable.
    encoded = quote_from_bytes(raw_bytes, safe="!$&'()*+,;=:@-._~")
    if not isinstance(encoded, text_type):
        # Py2 quote returns a byte str; ensure unicode.
        encoded = encoded.decode('ascii')
    return u'data:{0},{1}'.format(mime, encoded)
147+
148+
149+ _DATA_URL_RE = re .compile (r'^data:(?P<meta>[^,]*?),(?P<data>.*)$' , re .DOTALL )
150+
151+
def data_url_decode(data_url):
    """
    Parse a data: URL and return (fileobj, mime, is_base64).

    Args:
        data_url: the data: URL as text, or as UTF-8 encoded bytes.

    Returns:
        (MkTempFile(decoded_bytes), mime_string_or_None, is_base64_bool)

    Raises:
        ValueError: if the input does not look like a data: URL (this
            includes bytes input that is not valid UTF-8).
        Errors raised by base64.b64decode for a malformed base64 payload
        are propagated unchanged.

    Notes:
        - If the URL carries neither a MIME type nor parameters, mime is None
          (per RFC 2397 the default is text/plain;charset=US-ASCII).
        - If parameters are given without a type (e.g. "data:;charset=utf-8,"),
          the RFC 2397 shorthand applies and "text/plain" is assumed.
        - This function does not attempt charset transcoding; it returns raw bytes.
    """
    if not isinstance(data_url, text_type):
        # Accept bytes input too.  No ASCII fallback: anything that is not
        # valid UTF-8 cannot be valid ASCII either.
        data_url = data_url.decode('utf-8')

    m = _DATA_URL_RE.match(data_url)
    if not m:
        raise ValueError('Not a valid data: URL')

    meta = m.group('meta')
    data_part = m.group('data')

    # Split "type/subtype;param;...;base64" and drop empty segments.
    meta_parts = [p for p in meta.split(';') if p]
    mime = None
    if meta_parts and '/' in meta_parts[0]:
        mime = meta_parts[0]
        meta_parts = meta_parts[1:]

    is_base64 = False
    params = []
    for part in meta_parts:
        # Tolerate stray whitespace around the base64 marker.
        if part.strip().lower() == 'base64':
            is_base64 = True
        else:
            params.append(part)

    if params:
        # Keep parameters (e.g. charset) attached to the MIME type; when the
        # type itself was omitted, RFC 2397 says it is text/plain.
        mime = ';'.join([mime if mime is not None else 'text/plain'] + params)

    # Percent-decode on bytes so the result is bytes on both Py2 and Py3.
    # This also applies to base64 payloads, whose '=', '+' and '/' characters
    # may legitimately arrive as %3D, %2B and %2F.
    payload = unquote_to_bytes(data_part.encode('utf-8'))

    if is_base64:
        # Drop embedded whitespace/newlines and restore any missing padding.
        compact = b''.join(payload.split())
        compact += b'=' * (-len(compact) % 4)
        decoded_bytes = base64.b64decode(compact)
    else:
        decoded_bytes = payload

    return MkTempFile(decoded_bytes), mime, is_base64
216+
217+
42218try :
43219 # Python 3
44220 from urllib .parse import urlparse , urlunparse , parse_qs , unquote
@@ -1067,6 +1243,17 @@ def upload_file_to_internet_file(fileobj, url):
10671243 return upload_file_to_ftp_file (fileobj , url )
10681244 elif p .scheme in ("sftp" , "scp" ):
10691245 return upload_file_to_sftp_file (fileobj , url )
1246+ elif p .scheme in ("file" ):
1247+ outfile = io .open (unquote (p .path ), "wb" )
1248+ try :
1249+ fileobj .seek (0 , 0 )
1250+ except Exception :
1251+ pass
1252+ with io .open (unquote (p .path ), "wb" ) as fdst :
1253+ shutil .copyfileobj (fileobj , fdst )
1254+ return fileobj
1255+ elif p .scheme in ("data" ):
1256+ return data_url_encode (fileobj )
10701257 elif p .scheme in ("tcp" , "udp" ):
10711258 _ , o = _parse_net_url (url )
10721259 return send_from_fileobj (fileobj , p .hostname , p .port , p .scheme , ** o )
@@ -1081,6 +1268,10 @@ def download_file_from_internet_file(url, headers=None):
10811268 return download_file_from_ftp_file (url )
10821269 elif p .scheme in ("sftp" , "scp" ):
10831270 return download_file_from_sftp_file (url )
1271+ elif p .scheme in ("data" ):
1272+ return data_url_decode (url )[0 ]
1273+ elif p .scheme in ("file" ):
1274+ return io .open (unquote (p .path ), "rb" )
10841275 elif p .scheme in ("tcp" , "udp" ):
10851276 _ , o = _parse_net_url (url )
10861277 out = MkTempFile ()
0 commit comments