Skip to content

Commit 3e18a4a

Browse files
authored
Add files via upload
1 parent 7731069 commit 3e18a4a

1 file changed

Lines changed: 178 additions & 1 deletion

File tree

pyarchivefile.py

Lines changed: 178 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import binascii
3939
import platform
4040
from io import StringIO, BytesIO
41+
import posixpath as pp # POSIX-safe joins/normpaths
4142
try:
4243
from backports import tempfile
4344
except ImportError:
@@ -379,7 +380,7 @@ def decode_unicode_escape(value):
379380
__version_date__ = str(__version_date_info__[0]) + "." + str(
380381
__version_date_info__[1]).zfill(2) + "." + str(__version_date_info__[2]).zfill(2)
381382
__revision__ = __version_info__[3]
382-
__revision_id__ = "$Id: 51621a2b361db767bc985f415869721fb7be5e6c $"
383+
__revision_id__ = "$Id$"
383384
if(__version_info__[4] is not None):
384385
__version_date_plusrc__ = __version_date__ + \
385386
"-" + str(__version_date_info__[4])
@@ -609,6 +610,182 @@ def _normalize_initial_data(data, isbytes, encoding):
609610
return str(data)
610611

611612

613+
def _split_posix(path_text):
614+
"""Split POSIX paths regardless of OS; return list of components."""
615+
# Normalize leading './'
616+
if path_text.startswith(u'./'):
617+
path_text = path_text[2:]
618+
# Strip redundant slashes
619+
path_text = re.sub(u'/+', u'/', path_text)
620+
# Drop trailing '/' so 'dir/' -> ['dir']
621+
if path_text.endswith(u'/'):
622+
path_text = path_text[:-1]
623+
return path_text.split(u'/') if path_text else []
624+
625+
def _is_abs_like(s):
626+
"""Absolute targets (POSIX or Windows-drive style)."""
627+
return s.startswith(u'/') or s.startswith(u'\\') or re.match(u'^[A-Za-z]:[/\\\\]', s)
628+
629+
def _resolves_outside(base_rel, target_rel):
630+
"""
631+
Given a base directory (relative, POSIX) and a target (relative),
632+
return True if base/target resolves outside of base.
633+
We anchor under '/' so normpath is root-anchored and portable.
634+
"""
635+
base_clean = u'/'.join(_split_posix(base_rel))
636+
target_clean = u'/'.join(_split_posix(target_rel))
637+
base_abs = u'/' + base_clean if base_clean else u'/'
638+
combined = pp.normpath(pp.join(base_abs, target_clean))
639+
if combined == base_abs or combined.startswith(base_abs + u'/'):
640+
return False
641+
return True
642+
643+
def DetectTarbombArchivefileArray(listarchivefiles,
                                  top_file_ratio_threshold=0.6,
                                  min_members_for_ratio=4,
                                  symlink_policy="escape-only",  # 'escape-only' | 'deny' | 'single-folder-only'
                                  to_text=to_text):
    """
    Detect 'tarbomb-like' archives from ArchiveFileToArray/TarFileToArray dicts.

    Parameters:
      listarchivefiles: dict with key 'ffilelist' -> list of entry dicts.
          Each entry is read for 'fname', 'ftype' and (for symlinks)
          'flinkname'.  None/empty input is treated as an empty archive.
      top_file_ratio_threshold: float; when the fraction of members sitting
          directly at the archive root exceeds this, the archive is flagged.
      min_members_for_ratio: int, minimum number of members before the ratio
          heuristic applies.
      symlink_policy:
        - 'escape-only': only symlinks that are absolute or escape their
          parent directory are unsafe
        - 'deny': any symlink is unsafe
        - 'single-folder-only': symlinks are allowed only if the archive has
          a single top-level entry
        Escaping/absolute symlinks are flagged under every policy; any other
        policy value adds no extra check.
      to_text: callable used to convert names, link targets and ftype values
          to text.

    Returns dict with:
      - is_tarbomb (bool), reasons (list of str), total_members (int),
        top_level_entries (sorted list), top_level_files_count (int; counts
        every member whose normalised name has a single component),
        has_absolute_paths, has_parent_traversal,
        symlink_escapes_root (bool), symlink_issues (list[{entry,target,reason}])
    """
    files = listarchivefiles or {}
    members = files.get('ffilelist') or []

    names = []
    has_abs = False
    has_parent = False

    # Symlink tracking
    has_any_symlink = False
    symlink_issues = []
    any_symlink_escape = False

    for m in members:
        m = m or {}
        name = to_text(m.get('fname', u""))

        # Checked on the raw name, before normalisation removes the evidence.
        if _is_abs_like(name):
            has_abs = True

        parts = _split_posix(name)
        if u'..' in parts:
            has_parent = True

        # Names that normalise to nothing (e.g. './') are not counted.
        if not parts:
            continue

        norm_name = u'/'.join(parts)
        names.append(norm_name)

        # ---- Symlink detection ----
        # An entry is a symlink when ftype is the integer 2 or its text form
        # is 'symlink' (case-insensitive).
        ftype = m.get('ftype')
        is_symlink = (ftype == 2) or (to_text(ftype).lower() == u'symlink' if ftype is not None else False)
        if is_symlink:
            has_any_symlink = True
            target = to_text(m.get('flinkname', u""))
            # Absolute symlink target is unsafe
            if _is_abs_like(target):
                any_symlink_escape = True
                symlink_issues.append({'entry': norm_name, 'target': target, 'reason': 'absolute symlink target'})
            else:
                # Relative targets are resolved against the directory that
                # contains the link.
                parent = u'/'.join(parts[:-1])  # may be ''
                if _resolves_outside(parent, target):
                    any_symlink_escape = True
                    symlink_issues.append({'entry': norm_name, 'target': target, 'reason': 'symlink escapes parent directory'})

    total = len(names)
    reasons = []
    if total == 0:
        # No countable members.  A raw name such as '/' is still flagged as
        # absolute even though it yields no components, so the verdict must
        # follow the danger flags instead of being hard-coded to False.
        reasons.append("archive contains no members")
        if has_abs:
            reasons.append("contains absolute paths (dangerous)")
        if has_parent:
            reasons.append("contains parent-traversal ('..') entries (dangerous)")
        return {
            "is_tarbomb": bool(has_abs or has_parent),
            "reasons": reasons,
            "total_members": 0,
            "top_level_entries": [],
            "top_level_files_count": 0,
            "has_absolute_paths": has_abs,
            "has_parent_traversal": has_parent,
            "symlink_escapes_root": any_symlink_escape,
            "symlink_issues": symlink_issues,
        }

    # Layout counts: members per first path component, plus how many members
    # sit directly at the archive root.
    top_counts = {}
    top_level_files_count = 0
    for name in names:
        parts = name.split(u'/')
        first = parts[0]
        top_counts[first] = top_counts.get(first, 0) + 1
        if len(parts) == 1:  # directly at archive root
            top_level_files_count += 1

    top_keys = sorted(top_counts.keys())
    is_tarbomb = False

    # Path-based dangers
    if has_abs:
        is_tarbomb = True
        reasons.append("contains absolute paths (dangerous)")
    if has_parent:
        is_tarbomb = True
        reasons.append("contains parent-traversal ('..') entries (dangerous)")
    if any_symlink_escape:
        is_tarbomb = True
        reasons.append("contains symlinks that escape their parent directory")

    # Symlink policy enforcement
    if symlink_policy == "deny" and has_any_symlink:
        is_tarbomb = True
        reasons.append("symlinks present and policy is 'deny'")
    elif symlink_policy == "single-folder-only" and has_any_symlink and len(top_keys) != 1:
        is_tarbomb = True
        reasons.append("symlinks present but archive lacks a single top-level folder")

    # Tarbomb layout heuristics
    if len(top_keys) == 1:
        # Informational only: a single top-level entry is the tidy layout.
        reasons.append("single top-level entry '{0}'".format(top_keys[0]))
    else:
        ratio = float(top_level_files_count) / float(total)
        if total >= min_members_for_ratio and ratio > float(top_file_ratio_threshold):
            is_tarbomb = True
            reasons.append("high fraction of members ({0:.0%}) at archive root".format(ratio))
        else:
            # Several top-level entries: acceptable only when one of them
            # holds at least 90% of all members.
            max_bucket = max(top_counts.values()) if top_counts else 0
            if max_bucket < total * 0.9:
                is_tarbomb = True
                reasons.append("multiple top-level entries with no dominant folder: {0}".format(
                    u", ".join(top_keys[:10])))
            else:
                reasons.append("multiple top-level entries but one dominates")

    return {
        "is_tarbomb": bool(is_tarbomb),
        "reasons": reasons,
        "total_members": total,
        "top_level_entries": top_keys,
        "top_level_files_count": top_level_files_count,
        "has_absolute_paths": has_abs,
        "has_parent_traversal": has_parent,
        "symlink_escapes_root": any_symlink_escape,
        "symlink_issues": symlink_issues,
    }
786+
787+
788+
612789
def MkTempFile(data=None, inmem=__use_inmemfile__, isbytes=True, prefix=__project__,
613790
delete=True, encoding="utf-8"):
614791
"""

0 commit comments

Comments
 (0)