Skip to content

Commit 90b5d61

Browse files
authored
Add files via upload
1 parent 2b92929 commit 90b5d61

8 files changed

Lines changed: 2198 additions & 206 deletions

PyWWW/pywwwget_chatgpt.py

Lines changed: 275 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1317,6 +1317,254 @@ def sftp_status_to_reason(code):
13171317
}
13181318
return reasons.get(code, 'Unknown Status Code')
13191319

1320+
def read_all(fileobj, encoding='utf-8', errors='replace'):
    """Drain *fileobj* and return its full contents as text.

    A ``None`` read result becomes the empty string; a bytes payload is
    decoded with *encoding*/*errors*; text comes back unchanged.
    """
    payload = fileobj.read()
    if payload is None:
        return u'' if PY2 else ''
    if not isinstance(payload, bytes):
        # already text (unicode on py2 or str on py3)
        return payload
    return payload.decode(encoding, errors)
1327+
1328+
# ---------------- Parsing primitives ----------------

# Request/status line patterns.  HTTP/1.x versions look like "1.0"/"1.1";
# HTTP/2 may be printed as "2" or "2.0", so it gets its own pattern.
# Request line, e.g. "GET /index.html HTTP/1.1"
_req_line_http1 = re.compile(r'^(?P<method>[A-Z]+)\s+(?P<path>\S+)\s+HTTP/(?P<version>\d+\.\d)\s*$')
# Synthesized HTTP/2 request line, e.g. "GET / HTTP/2"
_req_line_h2 = re.compile(r'^(?P<method>[A-Z]+)\s+(?P<path>\S+)\s+HTTP/(?P<version>2(?:\.0)?)\s*$')
# Status line, e.g. "HTTP/1.1 200 OK" (reason phrase optional)
_status_line_v1 = re.compile(r'^HTTP/(?P<version>\d+\.\d)\s+(?P<code>\d{3})(?:\s+(?P<reason>.*))?$')
# HTTP/2 status line, e.g. "HTTP/2 200"
_status_line_h2 = re.compile(r'^HTTP/(?P<version>2(?:\.0)?)\s+(?P<code>\d{3})(?:\s+(?P<reason>.*))?$')
1334+
1335+
def _normalize(text):
1336+
return text.replace('\r\n', '\n').replace('\r', '\n')
1337+
1338+
def _split_header_block(block_text):
    """Split a raw header block into logical lines.

    Line endings are normalized, trailing empty lines are dropped, and
    obs-fold continuations (lines starting with SP or HTAB) are merged
    into the preceding header line.
    """
    rows = _normalize(block_text).split('\n')
    while rows and not rows[-1]:
        rows.pop()

    unfolded = []
    for row in rows:
        if unfolded and row.startswith((' ', '\t')):
            # obs-fold: continuation of the previous header line
            unfolded[-1] += ' ' + row.lstrip()
        else:
            unfolded.append(row)
    return unfolded
1352+
1353+
def _parse_headers(lines):
1354+
headers = {}
1355+
for line in lines:
1356+
if not line or ':' not in line:
1357+
continue
1358+
name, value = line.split(':', 1)
1359+
name = name.strip()
1360+
value = value.strip()
1361+
key = name.lower()
1362+
1363+
if key in headers:
1364+
if isinstance(headers[key], list):
1365+
headers[key].append(value)
1366+
else:
1367+
headers[key] = [headers[key], value]
1368+
else:
1369+
headers[key] = value
1370+
return headers
1371+
1372+
def parse_request_block(block_text):
    """Parse an HTTP request header block.

    Returns a dict with ``method``, ``path``, ``version`` and
    ``headers``, or ``None`` when the block is empty or its request
    line is unrecognized.
    """
    if not block_text:
        return None
    header_lines = _split_header_block(block_text)
    if not header_lines:
        return None

    first = header_lines[0]
    match = _req_line_http1.match(first) or _req_line_h2.match(first)
    if match is None:
        return None

    result = {'headers': _parse_headers(header_lines[1:])}
    for part in ('method', 'path', 'version'):
        result[part] = match.group(part)
    return result
1389+
1390+
def parse_response_block(block_text):
    """Parse an HTTP response header block.

    Returns a dict with ``version``, ``status_code`` (int), ``reason``
    and ``headers``, or ``None`` when the block is empty or has no
    recognizable status line.
    """
    if not block_text:
        return None
    header_lines = _split_header_block(block_text)
    if not header_lines:
        return None

    first = header_lines[0]
    match = _status_line_v1.match(first) or _status_line_h2.match(first)
    if match is None:
        return None

    return {
        'version': match.group('version'),
        'status_code': int(match.group('code')),
        'reason': (match.group('reason') or '').strip(),
        'headers': _parse_headers(header_lines[1:]),
    }
1409+
1410+
# ---------------- Extraction from verbose output ----------------

# HTTP/1.x request block: "GET / HTTP/1.1" ... up to the terminating blank line
_HTTP1_REQ_BLOCK = re.compile(
    r'(?ms)^(?:GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS|TRACE|CONNECT)\s+\S+\s+HTTP/\d\.\d\s*\n'
    r'(?:.*?\n)*?\n'
)

# HTTP/2 synthesized request block: "GET / HTTP/2" ... up to the blank line
_HTTP2_SYN_REQ_BLOCK = re.compile(
    r'(?ms)^(?:GET|POST|PUT|PATCH|DELETE|HEAD|OPTIONS|TRACE|CONNECT)\s+\S+\s+HTTP/2(?:\.0)?\s*\n'
    r'(?:.*?\n)*?\n'
)

# HTTP/2 bracket pseudo-header lines, one header per line, e.g.:
#   [HTTP/2] [1] [:method: GET]
#   [HTTP/2] [1] [:path: /]
#   [HTTP/2] [1] [user-agent: ...]
# Captures the stream id and the bracketed "key: value" payload.
_HTTP2_BRACKET_LINE = re.compile(
    r'^\[HTTP/2\]\s*\[(?P<stream>\d+)\]\s*\[(?P<kv>.+?)\]\s*$'
)
1431+
1432+
def _extract_http2_bracket_request(text):
    """
    Build a synthetic request block from the [HTTP/2] [stream] [key: value] lines.
    Returns (block_text, used_stream) or (None, None).

    Bracket lines are grouped by stream id; the first stream (in order of
    appearance) carrying both :method and :path pseudo-headers is turned
    into an HTTP/1-style header block: "METHOD path HTTP/2", an optional
    Host line taken from :authority, the remaining regular headers, and a
    blank-line terminator.
    """
    t = _normalize(text)
    lines = t.split('\n')

    # Collect the bracketed "key: value" payloads per stream
    per_stream = {}
    order = []  # stream appearance order
    for line in lines:
        m = _HTTP2_BRACKET_LINE.match(line)
        if not m:
            continue
        stream = m.group('stream')
        kv = m.group('kv')
        if stream not in per_stream:
            per_stream[stream] = []
            order.append(stream)
        per_stream[stream].append(kv)

    if not order:
        # no [HTTP/2] bracket lines at all
        return (None, None)

    # pick first stream that has :method and :path
    for stream in order:
        kvs = per_stream[stream]
        pseudo = {}  # ":name" -> value for pseudo-headers
        normal = []  # (name, value) pairs, input order preserved
        for kv in kvs:
            # kv is like ":method: GET" or "user-agent: blah"
            if ':' not in kv:
                continue  # malformed payload, skip silently
            name, value = kv.split(':', 1)
            name = name.strip()
            value = value.strip()

            # special case: pseudo headers start with empty name because kv starts ":method..."
            # our split gives name="" and value="method: GET" if we split at first ':'
            if name == '' and value:
                # now split "method: GET"
                if ':' in value:
                    n2, v2 = value.split(':', 1)
                    pseudo[':' + n2.strip()] = v2.strip()
                continue

            # regular "host: github.com"
            normal.append((name, value))

        # also handle case where pseudo lines came as "[:method: GET]" (already handled)
        if ':method' in pseudo and ':path' in pseudo:
            method = pseudo[':method']
            path = pseudo[':path']
            # prefer :authority for Host if present
            authority = pseudo.get(':authority')

            block_lines = []
            block_lines.append('%s %s HTTP/2' % (method, path))
            if authority:
                block_lines.append('Host: %s' % authority)

            # add other pseudo? scheme isn't a header line usually; skip it.
            # add bracketed normal headers
            for (name, value) in normal:
                block_lines.append('%s: %s' % (name, value))
            block_lines.append('')  # blank line terminator

            return ('\n'.join(block_lines), stream)

    # no stream had both :method and :path
    return (None, None)
1503+
1504+
# Response header blocks: status line through the terminating blank line
_HTTP1_RESP_BLOCK = re.compile(
    r'(?ms)^HTTP/\d\.\d\s+\d{3}.*\n(?:.*?\n)*?\n'
)
# HTTP/2 status lines may read "HTTP/2" or "HTTP/2.0"
_HTTP2_RESP_BLOCK = re.compile(
    r'(?ms)^HTTP/2(?:\.0)?\s+\d{3}.*\n(?:.*?\n)*?\n'
)
1511+
1512+
def extract_request_and_response(debug_text):
    """Locate the request and response header blocks in verbose output.

    Returns (request_block_text, response_block_text); either element is
    None when nothing matched.  Supports HTTP/1 request blocks, HTTP/2
    synthesized request blocks, HTTP/2 bracket pseudo-header sequences
    (converted to a synthetic block), and HTTP/1 / HTTP/2 response
    blocks.
    """
    t = _normalize(debug_text)

    # Request: HTTP/1 block first, then a synthesized HTTP/2 block,
    # and finally the bracketed HTTP/2 pseudo-header form.
    req_block = None
    for pattern in (_HTTP1_REQ_BLOCK, _HTTP2_SYN_REQ_BLOCK):
        found = pattern.search(t)
        if found:
            req_block = found.group(0)
            break
    else:
        req_block, _stream = _extract_http2_bracket_request(t)

    # Response: whichever of the HTTP/1 / HTTP/2 blocks appears first.
    candidates = [m for m in (_HTTP1_RESP_BLOCK.search(t), _HTTP2_RESP_BLOCK.search(t))
                  if m is not None]
    if candidates:
        resp_block = min(candidates, key=lambda m: m.start()).group(0)
    else:
        resp_block = None

    return req_block, resp_block
1551+
1552+
def parse_pycurl_verbose(fileobj_or_text):
    """Parse pycurl verbose/debug output into request and response dicts.

    Accepts a readable file object, bytes, or text.  Returns a dict with
    the raw extracted blocks under ``'raw'`` plus the parsed
    ``'request'`` and ``'response'`` (each ``None`` when absent).
    """
    source = fileobj_or_text
    if hasattr(source, 'read'):
        text = read_all(source)
    elif isinstance(source, bytes):
        text = source.decode('utf-8', 'replace')
    else:
        text = source

    req_raw, resp_raw = extract_request_and_response(text)
    return {
        'raw': {'request': req_raw, 'response': resp_raw},
        'request': parse_request_block(req_raw) if req_raw else None,
        'response': parse_response_block(resp_raw) if resp_raw else None,
    }
1567+
13201568
def download_file_from_http_file(url, headers=None, usehttp=__use_http_lib__, httpuseragent=None, httpreferer=None, httpcookie=geturls_cj, httpmethod="GET", returnstat=False):
13211569
if headers is None:
13221570
headers = {}
@@ -1447,7 +1695,7 @@ def download_file_from_http_file(url, headers=None, usehttp=__use_http_lib__, ht
14471695
httpcodereason = http_status_to_reason(r.status)
14481696
httpversionout = "1.1"
14491697
httpmethodout = httpmethod
1450-
httpurlout = str(httpurl)
1698+
httpurlout = str(rebuilt_url)
14511699
httpheaderout = r.headers
14521700
httpheadersentout = headers
14531701

@@ -1507,6 +1755,7 @@ def download_file_from_http_file(url, headers=None, usehttp=__use_http_lib__, ht
15071755
elif(usehttp == "pycurl"):
15081756
retrieved_body = MkTempFile()
15091757
retrieved_headers = MkTempFile()
1758+
sentout_headers = MkTempFile()
15101759
try:
15111760
if(httpmethod == "GET"):
15121761
geturls_text = pycurl.Curl()
@@ -1517,13 +1766,12 @@ def download_file_from_http_file(url, headers=None, usehttp=__use_http_lib__, ht
15171766
else:
15181767
usehttpver = geturls_text.CURL_HTTP_VERSION_1_1
15191768
geturls_text.setopt(geturls_text.URL, rebuilt_url)
1520-
geturls_text.setopt(geturls_text.HTTP_VERSION,
1521-
geturls_text.CURL_HTTP_VERSION_1_1)
1522-
geturls_text.setopt(
1523-
geturls_text.WRITEFUNCTION, retrieved_body.write)
1769+
geturls_text.setopt(geturls_text.HTTP_VERSION, usehttpver)
1770+
geturls_text.setopt(geturls_text.WRITEFUNCTION, retrieved_body.write)
15241771
geturls_text.setopt(geturls_text.HTTPHEADER, headers)
1525-
geturls_text.setopt(
1526-
geturls_text.HEADERFUNCTION, retrieved_headers.write)
1772+
geturls_text.setopt(geturls_text.HEADERFUNCTION, retrieved_headers.write)
1773+
geturls_text.setopt(pycurl.VERBOSE, 1)
1774+
geturls_text.setopt(pycurl.DEBUGFUNCTION, lambda t, m: sentout_headers.write(m))
15271775
geturls_text.setopt(geturls_text.FOLLOWLOCATION, True)
15281776
geturls_text.setopt(geturls_text.TIMEOUT, 60)
15291777
geturls_text.perform()
@@ -1536,13 +1784,12 @@ def download_file_from_http_file(url, headers=None, usehttp=__use_http_lib__, ht
15361784
else:
15371785
usehttpver = geturls_text.CURL_HTTP_VERSION_1_1
15381786
geturls_text.setopt(geturls_text.URL, rebuilt_url)
1539-
geturls_text.setopt(geturls_text.HTTP_VERSION,
1540-
geturls_text.CURL_HTTP_VERSION_1_1)
1541-
geturls_text.setopt(
1542-
geturls_text.WRITEFUNCTION, retrieved_body.write)
1787+
geturls_text.setopt(geturls_text.HTTP_VERSION, usehttpver)
1788+
geturls_text.setopt(geturls_text.WRITEFUNCTION, retrieved_body.write)
15431789
geturls_text.setopt(geturls_text.HTTPHEADER, headers)
1544-
geturls_text.setopt(
1545-
geturls_text.HEADERFUNCTION, retrieved_headers.write)
1790+
geturls_text.setopt(geturls_text.HEADERFUNCTION, retrieved_headers.write)
1791+
geturls_text.setopt(pycurl.VERBOSE, 1)
1792+
geturls_text.setopt(pycurl.DEBUGFUNCTION, lambda t, m: sentout_headers.write(m))
15461793
geturls_text.setopt(geturls_text.FOLLOWLOCATION, True)
15471794
geturls_text.setopt(geturls_text.TIMEOUT, 60)
15481795
geturls_text.setopt(geturls_text.POST, True)
@@ -1557,25 +1804,25 @@ def download_file_from_http_file(url, headers=None, usehttp=__use_http_lib__, ht
15571804
else:
15581805
usehttpver = geturls_text.CURL_HTTP_VERSION_1_1
15591806
geturls_text.setopt(geturls_text.URL, rebuilt_url)
1560-
geturls_text.setopt(geturls_text.HTTP_VERSION,
1561-
geturls_text.CURL_HTTP_VERSION_1_1)
1562-
geturls_text.setopt(
1563-
geturls_text.WRITEFUNCTION, retrieved_body.write)
1807+
geturls_text.setopt(geturls_text.HTTP_VERSION, usehttpver)
1808+
geturls_text.setopt(geturls_text.WRITEFUNCTION, retrieved_body.write)
1809+
geturls_text.setopt(pycurl.VERBOSE, 1)
1810+
geturls_text.setopt(pycurl.DEBUGFUNCTION, lambda t, m: sentout_headers.write(m))
15641811
geturls_text.setopt(geturls_text.HTTPHEADER, headers)
1565-
geturls_text.setopt(
1566-
geturls_text.HEADERFUNCTION, retrieved_headers.write)
1812+
geturls_text.setopt(geturls_text.HEADERFUNCTION, retrieved_headers.write)
15671813
geturls_text.setopt(geturls_text.FOLLOWLOCATION, True)
15681814
geturls_text.setopt(geturls_text.TIMEOUT, 60)
15691815
geturls_text.perform()
15701816
retrieved_headers.seek(0, 0)
1817+
sentout_headers.seek(0, 0)
1818+
httpheadersentpre = parse_pycurl_verbose(sentout_headers)
1819+
sentout_headers.close()
15711820
if(sys.version[0] == "2"):
15721821
pycurlhead = retrieved_headers.read()
15731822
if(sys.version[0] >= "3"):
15741823
pycurlhead = retrieved_headers.read().decode('UTF-8')
1575-
pyhttpverinfo = re.findall(
1576-
r'^HTTP/([0-9.]+) (\d+)(?: ([A-Za-z\s]+))?$', pycurlhead.splitlines()[0].strip().rstrip('\r\n'))[0]
1577-
pycurlheadersout = make_http_headers_from_pycurl_to_dict(
1578-
pycurlhead)
1824+
pyhttpverinfo = re.findall(r'^HTTP/([0-9.]+) (\d+)(?: ([A-Za-z\s]+))?$', pycurlhead.splitlines()[0].strip().rstrip('\r\n'))[0]
1825+
pycurlheadersout = make_http_headers_from_pycurl_to_dict(pycurlhead)
15791826
retrieved_body.seek(0, 0)
15801827
httpfile = retrieved_body
15811828
retrieved_headers.close()
@@ -1586,13 +1833,15 @@ def download_file_from_http_file(url, headers=None, usehttp=__use_http_lib__, ht
15861833
except ValueError:
15871834
return False
15881835
httpcodeout = geturls_text.getinfo(geturls_text.HTTP_CODE)
1589-
httpcodereason = http_status_to_reason(
1590-
geturls_text.getinfo(geturls_text.HTTP_CODE))
1836+
httpcodereason = http_status_to_reason(geturls_text.getinfo(geturls_text.HTTP_CODE))
15911837
httpversionout = pyhttpverinfo[0]
15921838
httpmethodout = httpmethod
15931839
httpurlout = geturls_text.getinfo(geturls_text.EFFECTIVE_URL)
15941840
httpheaderout = pycurlheadersout
1595-
httpheadersentout = headers
1841+
try:
1842+
httpheadersentout = httpheadersentpre['request']['headers']
1843+
except TypeError:
1844+
httpheadersentout = headers
15961845

15971846
# urllib fallback
15981847
else:

0 commit comments

Comments
 (0)