Skip to content

Commit d253f32

Browse files
authored
Add files via upload
1 parent ea554d0 commit d253f32

10 files changed

Lines changed: 1930 additions & 12 deletions

PyWWW/pywwwget_bard.py

Lines changed: 191 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818

1919
import os
2020
import io
21+
import re
2122
import sys
2223
import socket
2324
import shutil
@@ -39,6 +40,181 @@
3940
except ImportError:
4041
from StringIO import StringIO as BytesIO
4142

43+
try:
    # Py3: both helpers exist in the standard library.
    from urllib.parse import quote_from_bytes, unquote_to_bytes
except ImportError:
    # Py2: emulate the two Py3-only helpers on top of urllib.quote/unquote.
    from urllib import quote as _quote
    from urllib import unquote as _unquote

    def quote_from_bytes(b, safe=''):
        """
        Percent-encode the byte string ``b`` (Py2 stand-in for the Py3 helper).

        Args:
            b: Py2 "str" (bytes) to encode.
            safe: characters that must be left unescaped.

        Returns:
            The percent-encoded Py2 "str".

        NOTE: the Py3 stdlib function defaults to safe='/', this shim to
        safe=''. Pass ``safe`` explicitly for identical output on both.
        """
        return _quote(b, safe=safe)

    def unquote_to_bytes(s):
        """
        Percent-decode ``s`` and always return a Py2 "str" (bytes).

        Text input is encoded as UTF-8 first, which is what the Py3 stdlib
        function does. Without this, urllib.unquote would hand back
        ``unicode`` for ``unicode`` input and the result would not be bytes.
        """
        if not isinstance(s, bytes):
            s = s.encode('utf-8')
        return _unquote(s)
58+
59+
60+
# Fallback MIME type for payloads classified as text when no MIME is supplied.
_TEXT_MIME_DEFAULT = 'text/plain; charset=utf-8'
61+
# Fallback MIME type for payloads classified as binary when no MIME is supplied.
_BIN_MIME_DEFAULT = 'application/octet-stream'
62+
63+
64+
def _is_probably_text(data_bytes):
65+
"""
66+
Heuristic: treat as text if it decodes as UTF-8 and does not contain many
67+
control bytes (except common whitespace).
68+
"""
69+
if not data_bytes:
70+
return True
71+
# Fast path: NUL strongly suggests binary
72+
if b'\x00' in data_bytes:
73+
return False
74+
try:
75+
decoded = data_bytes.decode('utf-8')
76+
except Exception:
77+
return False
78+
79+
# Count "control" chars excluding common whitespace
80+
control = 0
81+
for ch in decoded:
82+
o = ord(ch)
83+
if (o < 32 and ch not in u'\t\n\r') or o == 127:
84+
control += 1
85+
# Allow a small fraction of control chars
86+
return control <= max(1, len(decoded) // 200)
87+
88+
89+
def data_url_encode(fileobj,
                    mime=None,
                    is_text=None,
                    charset='utf-8',
                    base64_encode=None):
    """
    Read all bytes from a file-like object and return a data: URL string.

    Args:
        fileobj: file-like (must support read()) returning bytes/str.
        mime: optional MIME type (e.g. 'image/png', 'text/plain').
              If not provided, defaults to text/plain with ``charset`` for
              text or application/octet-stream for binary.
        is_text: force text/binary decision (True/False). If None, auto-detect.
        charset: charset used to encode text read from ``fileobj``, and
                 declared in the MIME when defaulting to text/* or when mime
                 starts with text/ and doesn't already declare a charset.
        base64_encode: if True, always base64. If False, always percent-encode.
                       If None, choose percent-encode for text and base64 for binary.

    Returns:
        A unicode/text string containing the full data URL.
    """
    raw = fileobj.read()
    if raw is None:
        # read() may return None (e.g. a non-blocking raw stream with no
        # data available); treat that as an empty payload instead of
        # crashing in the encoders below.
        raw = b''

    # Normalize to bytes
    if isinstance(raw, text_type):
        # A text stream was passed: encode it with the requested charset
        raw_bytes = raw.encode(charset)
        detected_text = True
    else:
        raw_bytes = raw
        detected_text = _is_probably_text(raw_bytes)

    if is_text is None:
        is_text = detected_text

    if mime is None:
        if is_text:
            # Declare the charset the caller asked for rather than the
            # hard-coded one in _TEXT_MIME_DEFAULT, so text encoded with a
            # non-UTF-8 charset is not mislabelled. With the default
            # charset this yields exactly _TEXT_MIME_DEFAULT.
            base_type = _TEXT_MIME_DEFAULT.split(';', 1)[0]
            mime = base_type + '; charset=' + charset
        else:
            mime = _BIN_MIME_DEFAULT
    else:
        # If it's a text/* mime and no charset declared, append one
        mlow = mime.lower()
        if mlow.startswith('text/') and 'charset=' not in mlow:
            mime = mime + '; charset=' + charset

    if base64_encode is None:
        base64_encode = not is_text  # text => percent, binary => base64

    if base64_encode:
        b64 = base64.b64encode(raw_bytes)
        if not isinstance(b64, text_type):
            b64 = b64.decode('ascii')
        return u'data:{0};base64,{1}'.format(mime, b64)

    # Percent-encode bytes
    encoded = quote_from_bytes(raw_bytes, safe="!$&'()*+,;=:@-._~")
    if not isinstance(encoded, text_type):
        # Py2 quote returns bytes-str; ensure unicode
        encoded = encoded.decode('ascii')
    return u'data:{0},{1}'.format(mime, encoded)
147+
148+
149+
# URL schemes are case-insensitive, so "DATA:" must match as well as "data:".
_DATA_URL_RE = re.compile(r'^data:(?P<meta>[^,]*?),(?P<data>.*)$',
                          re.DOTALL | re.IGNORECASE)


def data_url_decode(data_url):
    """
    Parse a data: URL and return (bytes_io, mime, is_base64).

    Args:
        data_url: the data: URL as text, or as UTF-8 encoded bytes.

    Returns:
        (MkTempFile(data_bytes), mime_string_or_None, is_base64_bool)

    Raises:
        ValueError: if the input does not look like a data: URL.
        UnicodeDecodeError: if bytes input is not valid UTF-8.
        UnicodeEncodeError: if a base64 payload contains non-ASCII characters.

    Notes:
        - If no MIME is provided in the URL, mime will be None (per RFC 2397 default is text/plain;charset=US-ASCII).
        - If only parameters are given (e.g. "data:;charset=utf-8,..."), the
          RFC 2397 shorthand applies and mime is reported as text/plain with
          those parameters.
        - This function does not attempt charset transcoding; it returns raw bytes.
    """
    if not isinstance(data_url, text_type):
        # Accept bytes input too. No fallback codec is tried: ASCII is a
        # subset of UTF-8, so it could never succeed where UTF-8 failed.
        data_url = data_url.decode('utf-8')

    m = _DATA_URL_RE.match(data_url)
    if not m:
        raise ValueError('Not a valid data: URL')

    meta = m.group('meta')
    data_part = m.group('data')

    meta_parts = [p for p in meta.split(';') if p] if meta else []
    is_base64 = False
    mime = None

    if meta_parts:
        # First part is the media type only if it looks like type/subtype
        if '/' in meta_parts[0]:
            mime = meta_parts[0]
            rest = meta_parts[1:]
        else:
            rest = meta_parts

        for p in rest:
            # Tolerate stray whitespace around the token ("; base64")
            if p.strip().lower() == 'base64':
                is_base64 = True
                continue
            # keep parameters on mime if present (e.g. charset)
            if mime is None:
                # RFC 2397 shorthand: parameters without a media type
                # mean text/plain with those parameters.
                mime = 'text/plain'
            mime = mime + ';' + p

    if is_base64:
        # The payload is still URL text: undo percent-escapes (e.g. %3D for
        # '=') first, otherwise b64decode silently discards the '%'.
        payload = unquote_to_bytes(data_part.encode('ascii'))
        # Drop whitespace/newlines, then restore any omitted '=' padding.
        payload = b''.join(payload.split())
        payload += b'=' * (-len(payload) % 4)
        decoded_bytes = base64.b64decode(payload)
    else:
        # Passing bytes makes the helper return bytes on both Py2 and Py3;
        # UTF-8 matches how the Py3 stdlib encodes text input.
        decoded_bytes = unquote_to_bytes(data_part.encode('utf-8'))

    return MkTempFile(decoded_bytes), mime, is_base64
216+
217+
42218
try:
43219
# Python 3
44220
from urllib.parse import urlparse, urlunparse, parse_qs, unquote
@@ -1067,6 +1243,17 @@ def upload_file_to_internet_file(fileobj, url):
10671243
return upload_file_to_ftp_file(fileobj, url)
10681244
elif p.scheme in ("sftp", "scp"):
10691245
return upload_file_to_sftp_file(fileobj, url)
1246+
elif p.scheme in ("file"):
1247+
outfile = io.open(unquote(p.path), "wb")
1248+
try:
1249+
fileobj.seek(0, 0)
1250+
except Exception:
1251+
pass
1252+
with io.open(unquote(p.path), "wb") as fdst:
1253+
shutil.copyfileobj(fileobj, fdst)
1254+
return fileobj
1255+
elif p.scheme in ("data"):
1256+
return data_url_encode(fileobj)
10701257
elif p.scheme in ("tcp", "udp"):
10711258
_, o = _parse_net_url(url)
10721259
return send_from_fileobj(fileobj, p.hostname, p.port, p.scheme, **o)
@@ -1081,6 +1268,10 @@ def download_file_from_internet_file(url, headers=None):
10811268
return download_file_from_ftp_file(url)
10821269
elif p.scheme in ("sftp", "scp"):
10831270
return download_file_from_sftp_file(url)
1271+
elif p.scheme in ("data"):
1272+
return data_url_decode(url)[0]
1273+
elif p.scheme in ("file"):
1274+
return io.open(unquote(p.path), "rb")
10841275
elif p.scheme in ("tcp", "udp"):
10851276
_, o = _parse_net_url(url)
10861277
out = MkTempFile()

0 commit comments

Comments
 (0)