Skip to content

Commit a022945

Browse files
committed
Small update
1 parent a3b7175 commit a022945

1 file changed

Lines changed: 119 additions & 2 deletions

File tree

pycatfile/pycatfile.py

Lines changed: 119 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2455,6 +2455,121 @@ def GetBinaryFileType(infile, filestart=0, closefp=True):
24552455
# -------------- FALLBACK --------------
24562456
return False
24572457

2458+
def _get_seek_consts():
2459+
"""Return (SEEK_DATA, SEEK_HOLE) if supported, else (None, None)."""
2460+
seek_data = getattr(os, "SEEK_DATA", None)
2461+
seek_hole = getattr(os, "SEEK_HOLE", None)
2462+
if seek_data is None or seek_hole is None:
2463+
return None, None
2464+
return seek_data, seek_hole
2465+
2466+
def pack_sparse_to_stream(path, out_fp, bufsize=1024*1024):
    """
    Write ONLY the data extents of the (possibly sparse) file `path` to `out_fp`.

    Holes are located with lseek(SEEK_DATA / SEEK_HOLE) when the `os` module
    exposes both constants. If the constants are missing, or the kernel
    rejects SEEK_DATA with anything other than ENXIO, the (remaining part of
    the) file is scanned in 4096-byte blocks and every all-zero block is
    treated as a hole. That scan cannot tell real zero bytes from holes; both
    read back as zeros, so only the stored layout differs.

    Args:
        path: file to pack. Its size is taken from the opened descriptor, so
            the size always refers to the same file that is read.
        out_fp: writable, seekable binary file-like object. It is rewound to
            offset 0 before returning.
        bufsize: maximum number of bytes per read()/write() while copying a
            kernel-reported extent; must be positive.

    Returns:
        (logical_size, extents, stored_bytes)
        logical_size: size in bytes of the opened file
        extents: list of (offset, length) in the logical file, in ascending
            offset order; each length is the number of bytes actually
            written for that extent, so the lengths sum to stored_bytes
        stored_bytes: total bytes written to out_fp

    Raises:
        ValueError: if bufsize is not positive.
        OSError: if the file cannot be opened or read.
    """
    import errno  # local import: only needed to recognise ENXIO below

    if bufsize <= 0:
        raise ValueError("bufsize must be a positive number of bytes")

    seek_data = getattr(os, "SEEK_DATA", None)
    seek_hole = getattr(os, "SEEK_HOLE", None)
    extents = []
    stored = 0

    with open(path, "rb", buffering=0) as f:
        fd = f.fileno()
        logical_size = int(os.fstat(fd).st_size)

        # Offset from which the zero-block scan must run, or None when the
        # kernel extent walk covered the whole file.
        scan_from = 0

        if seek_data is not None and seek_hole is not None:
            # Kernel knows where holes are (best, fastest, exact).
            scan_from = None
            pos = 0
            while pos < logical_size:
                try:
                    data_off = os.lseek(fd, pos, seek_data)
                except OSError as exc:
                    if exc.errno != errno.ENXIO:
                        # Not "no more data": SEEK_DATA is unusable here, so
                        # scan the rest instead of silently dropping it.
                        scan_from = pos
                    break
                try:
                    hole_off = os.lseek(fd, data_off, seek_hole)
                except OSError:
                    # Cannot find the end of this extent: treat the rest as data.
                    hole_off = logical_size
                if hole_off > logical_size:
                    hole_off = logical_size

                length = hole_off - data_off
                if length <= 0:
                    # Always make progress, even on a degenerate answer.
                    pos = max(pos + 1, hole_off)
                    continue

                # Copy that extent's bytes into out_fp.
                os.lseek(fd, data_off, os.SEEK_SET)
                copied = 0
                while copied < length:
                    chunk = f.read(min(bufsize, length - copied))
                    if not chunk:
                        break  # file shrank while we were reading it
                    out_fp.write(chunk)
                    copied += len(chunk)

                # Record what was really written so the extent list and the
                # stored payload can never disagree.
                if copied:
                    extents.append((data_off, copied))
                    stored += copied

                pos = hole_off

        if scan_from is not None:
            # Portable fallback: stream block by block, merging consecutive
            # non-zero blocks into one extent without buffering the run.
            block = 4096
            f.seek(scan_from, os.SEEK_SET)
            pos = scan_from
            run_start = None
            while pos < logical_size:
                chunk = f.read(min(block, logical_size - pos))
                if not chunk:
                    break
                if chunk.count(b"\0") != len(chunk):
                    # Block holds at least one non-zero byte: part of a run.
                    if run_start is None:
                        run_start = pos
                    out_fp.write(chunk)
                    stored += len(chunk)
                elif run_start is not None:
                    # An all-zero block closes the current run.
                    extents.append((run_start, pos - run_start))
                    run_start = None
                pos += len(chunk)
            if run_start is not None:
                extents.append((run_start, pos - run_start))

    out_fp.seek(0, os.SEEK_SET)
    return logical_size, extents, stored
2544+
2545+
def write_sparse_to_fileobj(out_fp, logical_size, extents, in_fp, bufsize=1024*1024):
    """
    Rebuild a sparse file's layout inside an already-open, writable,
    seekable file-like object.

    `out_fp` is rewound and truncate() is called with `logical_size`. Then,
    for each (offset, length) pair in `extents`, in the order given, `length`
    bytes are read sequentially from `in_fp` (at most `bufsize` per read)
    and written at `offset`. Regions not covered by an extent are never
    written.

    Raises EOFError if `in_fp` is exhausted before an extent is complete.
    """
    out_fp.seek(0)
    out_fp.truncate(int(logical_size))

    for extent in extents:
        start, size = extent
        out_fp.seek(int(start), os.SEEK_SET)
        left = int(size)
        while left != 0:
            piece = in_fp.read(min(bufsize, left))
            if not piece:
                # The packed stream is shorter than the extent table promises.
                raise EOFError("Archive ended while reading sparse extent data")
            out_fp.write(piece)
            left -= len(piece)
2561+
2562+
def unpack_sparse_to_path(in_fp, out_path, logical_size, extents, bufsize=1024*1024):
    """
    Recreate a sparse file at `out_path` from packed extent data in `in_fp`.

    Missing parent directories of `out_path` are created first. The file is
    opened for binary writing (replacing any existing content) and filled by
    write_sparse_to_fileobj(). The data is then flushed, followed by a
    best-effort fsync.

    Args:
        in_fp: readable binary file-like object positioned at the first
            extent's data.
        out_path: destination path.
        logical_size: full logical size of the file to recreate.
        extents: iterable of (offset, length) pairs describing the data runs.
        bufsize: maximum bytes per read, passed to write_sparse_to_fileobj().

    Raises:
        EOFError: propagated from write_sparse_to_fileobj() when `in_fp`
            ends early.
        OSError: if the directory or file cannot be created or written.
    """
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)

    with open(out_path, "wb") as f:
        write_sparse_to_fileobj(f, logical_size, extents, in_fp, bufsize)

        # A failed flush means the data never reached the OS; let it surface
        # instead of hiding it.
        f.flush()
        try:
            os.fsync(f.fileno())
        except OSError:
            # Durability is best-effort: some filesystems reject fsync.
            pass
24582573

24592574
def _is_valid_zlib_header(cmf, flg):
24602575
"""
@@ -5995,7 +6110,8 @@ def AppendFilesWithContentToList(infiles, dirlistfromtxt=False, extradata=[], js
59956110
# Types that should be considered zero-length in the archive context:
59966111
zero_length_types = {1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13}
59976112
# Types that have actual data to read:
5998-
data_types = {0, 7, 12}
6113+
data_types = {0, 7}
6114+
sparse_types = {12}
59996115
if ftype in zero_length_types:
60006116
fsize = format(int("0"), 'x').lower()
60016117
elif ftype in data_types:
@@ -6312,7 +6428,8 @@ def AppendFilesWithContentFromTarFileToList(infile, extradata=[], jsondata={}, c
63126428
# Types that should be considered zero-length in the archive context:
63136429
zero_length_types = {1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13}
63146430
# Types that have actual data to read:
6315-
data_types = {0, 7, 12}
6431+
data_types = {0, 7}
6432+
sparse_types = {12}
63166433
if ftype in zero_length_types:
63176434
fsize = format(int("0"), 'x').lower()
63186435
elif ftype in data_types:

0 commit comments

Comments
 (0)