9191 from urlparse import urlparse , urlunparse
9292
# Windows-specific setup: force UTF-8 console output so non-ASCII text
# does not raise encoding errors when written to a console.
if os.name == "nt":
    import io

    def _wrap(stream):
        """Return *stream* re-wrapped as a UTF-8 text stream when possible.

        The stream is returned unchanged when it has no ``buffer``
        attribute, when it is not attached to a TTY, or when re-wrapping
        fails for any reason.
        """
        buf = getattr(stream, "buffer", None)
        if buf is None:
            return stream
        try:
            # isatty() raises ValueError on a closed stream (and the
            # attribute may be missing entirely), so query it inside the
            # guard rather than before it.
            if not stream.isatty():
                return stream
            return io.TextIOWrapper(
                buf, encoding="UTF-8", errors="replace", line_buffering=True)
        except Exception:
            # Best effort: console setup must never break module import.
            return stream

    sys.stdout = _wrap(sys.stdout)
    sys.stderr = _wrap(sys.stderr)
104107
105108hashlib_guaranteed = False
106109# Environment setup
@@ -286,7 +289,28 @@ def get_default_threads():
286289# Define a function to check if var contains only non-printable chars
287290all_np_chars = [chr (i ) for i in range (128 )]
def is_only_nonprintable(var):
    """Return True if every character of *var* is non-printable.

    ``None`` and empty input yield True. *var* is first converted with
    the ``to_text`` helper (defined outside this block). When the
    resulting string type provides ``isprintable`` it is used directly;
    otherwise (Python 2) the same rule is reproduced from Unicode
    general categories so both paths agree.
    """
    if var is None:
        return True
    s = to_text(var)
    try:
        # Py3 fast path
        return not any(ch.isprintable() for ch in s)
    except AttributeError:
        # Py2 path: str/unicode objects have no isprintable().
        import unicodedata

        def _is_printable(ch):
            # Mirror str.isprintable(): the "Other" (C*) and "Separator"
            # (Z*) categories are non-printable, except the ASCII space.
            if ch == u" ":
                return True
            return unicodedata.category(ch)[0] not in "CZ"

        return not any(_is_printable(ch) for ch in s)
290314__file_format_multi_dict__ = {}
291315__file_format_default__ = "CatFile"
292316__include_defaults__ = True
@@ -370,13 +394,15 @@ def decode_unicode_escape(value):
370394if (__version_info__ [3 ] is None ):
371395 __version__ = str (__version_info__ [0 ]) + "." + str (__version_info__ [1 ]) + "." + str (__version_info__ [2 ])
372396
# Robust bitness detection
# Works on Py2 & Py3, all platforms
try:
    import struct
    # "P" is the native pointer format; its size in bytes times 8 gives
    # the pointer width of the running interpreter.
    PyBitness = "64" if struct.calcsize("P") * 8 == 64 else "32"
except Exception:
    # Conservative fallback: guess from the machine name, defaulting to
    # "32" when it is empty or does not end in "64".
    _machine_name = platform.machine() or ""
    PyBitness = "64" if _machine_name.endswith("64") else "32"
380406
381407geturls_ua_pyfile_python = "Mozilla/5.0 (compatible; {proname}/{prover}; +{prourl})" .format (
382408 proname = __project__ , prover = __version__ , prourl = __project_url__ )
@@ -389,9 +415,9 @@ def decode_unicode_escape(value):
389415geturls_ua_googlebot_google = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
390416geturls_ua_googlebot_google_old = "Googlebot/2.1 (+http://www.google.com/bot.html)"
# Default request headers sent with the project's own User-Agent.
# NOTE(review): 'SEC-CH-UA-PLATFORM-VERSION' carries the project version
# here, not an OS version - confirm this is intended.
geturls_headers_pyfile_python = {
    'Referer': "http://google.com/",
    'User-Agent': geturls_ua_pyfile_python,
    'Accept-Encoding': "none",
    'Accept-Language': "en-US,en;q=0.8,en-CA,en-GB;q=0.6",
    'Accept-Charset': "ISO-8859-1,ISO-8859-15,UTF-8;q=0.7,*;q=0.7",
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    'Connection': "close",
    'SEC-CH-UA': "\"" + __project__ + "\";v=\"" + str(__version__) + "\", \"Not;A=Brand\";v=\"8\", \"" + py_implementation + "\";v=\"" + str(platform.release()) + "\"",
    'SEC-CH-UA-FULL-VERSION': str(__version__),
    'SEC-CH-UA-PLATFORM': "" + py_implementation + "",
    'SEC-CH-UA-ARCH': "" + platform.machine() + "",
    'SEC-CH-UA-PLATFORM-VERSION': str(__version__),
    'SEC-CH-UA-BITNESS': str(PyBitness),
}
# Same header set as the default one, but with the alternate User-Agent.
# NOTE(review): 'SEC-CH-UA-PLATFORM-VERSION' carries the project version
# here, not an OS version - confirm this is intended.
geturls_headers_pyfile_python_alt = {
    'Referer': "http://google.com/",
    'User-Agent': geturls_ua_pyfile_python_alt,
    'Accept-Encoding': "none",
    'Accept-Language': "en-US,en;q=0.8,en-CA,en-GB;q=0.6",
    'Accept-Charset': "ISO-8859-1,ISO-8859-15,UTF-8;q=0.7,*;q=0.7",
    'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    'Connection': "close",
    'SEC-CH-UA': "\"" + __project__ + "\";v=\"" + str(__version__) + "\", \"Not;A=Brand\";v=\"8\", \"" + py_implementation + "\";v=\"" + str(platform.release()) + "\"",
    'SEC-CH-UA-FULL-VERSION': str(__version__),
    'SEC-CH-UA-PLATFORM': "" + py_implementation + "",
    'SEC-CH-UA-ARCH': "" + platform.machine() + "",
    'SEC-CH-UA-PLATFORM-VERSION': str(__version__),
    'SEC-CH-UA-BITNESS': str(PyBitness),
}
395421geturls_headers_googlebot_google = {'Referer' : "http://google.com/" , 'User-Agent' : geturls_ua_googlebot_google , 'Accept-Encoding' : "none" , 'Accept-Language' : "en-US,en;q=0.8,en-CA,en-GB;q=0.6" ,
396422 'Accept-Charset' : "ISO-8859-1,ISO-8859-15,UTF-8;q=0.7,*;q=0.7" , 'Accept' : "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" , 'Connection' : "close" }
397423geturls_headers_googlebot_google_old = {'Referer' : "http://google.com/" , 'User-Agent' : geturls_ua_googlebot_google_old , 'Accept-Encoding' : "none" , 'Accept-Language' : "en-US,en;q=0.8,en-CA,en-GB;q=0.6" ,
@@ -548,16 +574,20 @@ def VerbosePrintOutReturn(dbgtxt, outtype="log", dbgenable=True, dgblevel=20):
548574
def RemoveWindowsPath(dpath):
    """
    Normalize a path by converting backslashes to forward slashes
    and stripping all trailing slashes.

    Falsy input (None, "", b"") returns "". Bytes and bytearray input is
    decoded as UTF-8, dropping undecodable bytes. Runs of slashes are
    collapsed to one unless the path contains "://", in which case they
    are left untouched. A path made only of slashes normalizes to "".
    """
    if not dpath:
        return ""
    # Accept bytes and decode safely
    if isinstance(dpath, (bytes, bytearray)):
        dpath = dpath.decode("utf-8", "ignore")
    dpath = dpath.replace("\\", "/")
    # Collapse multiple slashes except for protocol prefixes like "s3://"
    if "://" not in dpath:
        # One pass only halves each run of slashes, so repeat until none remain.
        while "//" in dpath:
            dpath = dpath.replace("//", "/")
    return dpath.rstrip("/")
561591
562592
563593def NormalizeRelativePath (inpath ):
0 commit comments