diff --git a/changes.txt b/changes.txt index c70720625..f1a01b89c 100644 --- a/changes.txt +++ b/changes.txt @@ -7,6 +7,9 @@ Change Log Other: * Retrospectively mark `4756 `_ as fixed in 1.26.6. + * Improved safety of `pymupdf embed-extract`. This now refuses to write to + an existing file or outside current directory, unless `-output` or new flag + `-unsafe` is specified. **Changes in version 1.26.6** (2025-11-05) diff --git a/docs/module.rst b/docs/module.rst index 47b33d306..cd8a0340d 100644 --- a/docs/module.rst +++ b/docs/module.rst @@ -299,7 +299,7 @@ Extraction Extract an embedded file like this:: pymupdf embed-extract -h - usage: pymupdf embed-extract [-h] -name NAME [-password PASSWORD] [-output OUTPUT] + usage: pymupdf embed-extract [-h] -name NAME [-password PASSWORD] [-unsafe] [-output OUTPUT] input ---------------------- extract embedded file to disk ---------------------- @@ -311,6 +311,7 @@ Extract an embedded file like this:: -h, --help show this help message and exit -name NAME name of entry -password PASSWORD password + -unsafe allow write to stored name even if an existing file or outside current directory -output OUTPUT output filename, default is stored name For details consult :meth:`Document.embfile_get`. Example (refer to previous section):: diff --git a/pipcl.py b/pipcl.py index b82a50456..d9b831f70 100644 --- a/pipcl.py +++ b/pipcl.py @@ -2435,6 +2435,12 @@ def log(text): log(f'{sys.version_info=}') log(f'{list(sys.version_info)=}') + log(f'{sysconfig.get_config_var("Py_GIL_DISABLED")=}') + try: + log(f'{sys._is_gil_enabled()=}') + except AttributeError: + log(f'sys._is_gil_enabled() => AttributeError') + log(f'CPU bits: {cpu_bits()}') log(f'sys.argv ({len(sys.argv)}):') diff --git a/setup.py b/setup.py index 46c27da9b..f5d6abad0 100755 --- a/setup.py +++ b/setup.py @@ -1023,7 +1023,12 @@ def build_mupdf_unix( if PYMUPDF_SETUP_SWIG: command += f' --swig {shlex.quote(PYMUPDF_SETUP_SWIG)}' command += f' -d build/{build_prefix}{build_type} -b' - #command += f' --m-target libs' + if sys.implementation.name == 'graalpy': + # Force rerun of swig. + pipcl.run(f'ls -l {mupdf_local}/platform/python/') + for p in glob.glob(f'{mupdf_local}/platform/python/mupdfcpp*.i.cpp'): + pipcl.log(f'Graal, deleting: {p!r}') + pipcl.fs_remove(p) if PYMUPDF_SETUP_MUPDF_REFCHECK_IF: command += f' --refcheck-if "{PYMUPDF_SETUP_MUPDF_REFCHECK_IF}"' if PYMUPDF_SETUP_MUPDF_TRACE_IF: diff --git a/src/__main__.py b/src/__main__.py index 35914d6c7..50c5d905f 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -350,6 +350,12 @@ def embedded_get(args): except (ValueError, pymupdf.mupdf.FzErrorBase) as e: sys.exit(f'no such embedded file {args.name!r}: {e}') filename = args.output if args.output else d["filename"] + if not args.unsafe and not args.output: + if os.path.exists(filename): + sys.exit(f'refusing to overwrite existing file with stored name: {filename}') + filename_abs = os.path.abspath(filename) + if not filename_abs.startswith(os.getcwd() + os.sep): + sys.exit(f'refusing to write stored name outside current directory: {filename}') with open(filename, "wb") as output: output.write(stream) pymupdf.message("saved entry '%s' as '%s'" % (args.name, filename)) @@ -1024,6 +1030,9 @@ def main(): ps_embed_extract.add_argument("input", type=str, help="PDF filename") ps_embed_extract.add_argument("-name", required=True, help="name of entry") ps_embed_extract.add_argument("-password", help="password") + ps_embed_extract.add_argument("-unsafe", default=False, action="store_true", + help="allow write to stored name even if an existing file or outside current directory" + ) ps_embed_extract.add_argument( "-output", help="output filename, default is stored name" ) diff --git a/tests/conftest.py b/tests/conftest.py index 773cc80b0..92079b063 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,6 +23,8 @@ def install_required_packages(): if platform.system() == 'Windows' and int.bit_length(sys.maxsize+1) == 32: # No pillow wheel available, and doesn't build easily. pass + elif platform.python_implementation() == 'GraalVM': + pass else: packages += ' pillow' if platform.system().startswith('MSYS_NT-'): diff --git a/tests/test_4767.py b/tests/test_4767.py new file mode 100644 index 000000000..d3fc318dc --- /dev/null +++ b/tests/test_4767.py @@ -0,0 +1,86 @@ +import shutil +import os +import pymupdf +import subprocess +import sys + + +def test_4767(): + ''' + Check handling of unsafe paths in `pymupdf embed-extract`. + ''' + with pymupdf.open() as document: + document.new_page() + document.embfile_add( + 'evil_entry', + b'poc:traversal test\n', + filename="../../test.txt", + ufilename="../../test.txt", + desc="poc", + ) + document.embfile_add( + 'evil_entry2', + b'poc:traversal test\n', + filename="test2.txt", + ufilename="test2.txt", + desc="poc", + ) + path = os.path.abspath(f'{__file__}/../../tests/test_4767.pdf') + document.save(path) + testdir = os.path.abspath(f'{__file__}/../../tests/test_4767_dir').replace('\\', '/') + shutil.rmtree(testdir, ignore_errors=1) + os.makedirs(f'{testdir}/one/two', exist_ok=1) + + def run(command, *, check=0, capture=1): + print(f'Running: {command}') + cp = subprocess.run( + command, shell=1, + text=1, + check=check, + stdout=subprocess.PIPE if capture else None, + stderr=subprocess.STDOUT if capture else None, + ) + print(cp.stdout) + return cp + + def get_paths(): + paths = list() + for dirpath, dirnames, filenames in os.walk(testdir): + for filename in filenames: + path = f'{dirpath}/{filename}'.replace('\\', '/') + paths.append(path) + return paths + + cp = run(f'cd {testdir}/one/two && {sys.executable} -m pymupdf embed-extract {path} -name evil_entry') + print(cp.stdout) + assert cp.returncode + assert cp.stdout == 'refusing to write stored name outside current directory: ../../test.txt\n' + assert not get_paths() + + cp = run(f'cd {testdir}/one/two && {sys.executable} -m pymupdf embed-extract {path} -name evil_entry -unsafe') + assert cp.returncode == 0 + assert cp.stdout == "saved entry 'evil_entry' as '../../test.txt'\n" + paths = get_paths() + print(f'{paths=}') + assert paths == [f'{testdir}/test.txt'] + + cp = run(f'cd {testdir}/one/two && {sys.executable} -m pymupdf embed-extract {path} -name evil_entry2') + assert not cp.returncode + assert cp.stdout == "saved entry 'evil_entry2' as 'test2.txt'\n" + paths = get_paths() + print(f'{paths=}') + assert paths == [f'{testdir}/test.txt', f'{testdir}/one/two/test2.txt'] + + cp = run(f'cd {testdir}/one/two && {sys.executable} -m pymupdf embed-extract {path} -name evil_entry2') + assert cp.returncode + assert cp.stdout == "refusing to overwrite existing file with stored name: test2.txt\n" + paths = get_paths() + print(f'{paths=}') + assert paths == [f'{testdir}/test.txt', f'{testdir}/one/two/test2.txt'] + + cp = run(f'cd {testdir}/one/two && {sys.executable} -m pymupdf embed-extract {path} -name evil_entry2 -unsafe') + assert not cp.returncode + assert cp.stdout == "saved entry 'evil_entry2' as 'test2.txt'\n" + paths = get_paths() + print(f'{paths=}') + assert paths == [f'{testdir}/test.txt', f'{testdir}/one/two/test2.txt'] diff --git a/tests/test_general.py b/tests/test_general.py index 0b284e7eb..7bf98397d 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -4,19 +4,20 @@ * Confirm proper release of file handles via Document.close() * Confirm properly raising exceptions in document creation """ -import io -import os - import fnmatch +import io import json -import pymupdf +import os import pathlib import pickle import platform +import pymupdf import re +import shlex import shutil import subprocess import sys +import sysconfig import textwrap import time import util @@ -26,6 +27,15 @@ scriptdir = os.path.abspath(os.path.dirname(__file__)) filename = os.path.join(scriptdir, "resources", "001003ED.pdf") +Py_GIL_DISABLED = sysconfig.get_config_var('Py_GIL_DISABLED') +try: + gil_enabled = sys._is_gil_enabled() +except AttributeError: + gil_enabled = True +regex_gil_stderr = None +if Py_GIL_DISABLED and gil_enabled: + regex_gil_stderr = '.*The global interpreter lock.*' + def test_haslinks(): doc = pymupdf.open(filename) @@ -1038,10 +1048,10 @@ def check_lines(expected_regexes, actual): ''' Checks lines in match regexes in . ''' - print(f'check_lines():', flush=1) - print(f'{expected_regexes=}', flush=1) - print(f'{actual=}', flush=1) + print(f'### check_lines():', flush=1) def str_to_list(s): + if s is None: + return list() if isinstance(s, str): return s.split('\n') if s else list() return s @@ -1051,11 +1061,35 @@ def str_to_list(s): expected_regexes.append('') # Always expect a trailing empty line. # Remove `None` regexes and make all regexes match entire lines. expected_regexes = [f'^{i}$' for i in expected_regexes if i is not None] - print(f'{expected_regexes=}', flush=1) - for expected_regex_line, actual_line in zip(expected_regexes, actual): - print(f' {expected_regex_line=}', flush=1) - print(f' {actual_line=}', flush=1) - assert re.match(expected_regex_line, actual_line) + + print(f'expected_regexes ({len(expected_regexes)}):') + for i in expected_regexes: + print(f' {i!r}') + + print(f'actual ({len(actual)}):') + for i in actual: + print(f' {i!r}') + + i_expected = 0 + i_actual = 0 + while 1: + if i_expected == len(expected_regexes) and i_actual == len(actual): + break + print(f'expected {i_expected+1}/{len(expected_regexes)}') + print(f'actual {i_actual+1}/{len(actual)}') + assert i_expected < len(expected_regexes) and i_actual < len(actual) + expected_regex_line = expected_regexes[i_expected] + actual_line = actual[i_actual] + if expected_regex_line is None: + i_expected += 1 + continue + print(f' expected_regex: {expected_regex_line!r}', flush=1) + print(f' actual: {actual!r}', flush=1) + match = re.match(expected_regex_line, actual_line) + print(f' {match=}') + assert match + i_expected += 1 + i_actual += 1 assert len(expected_regexes) == len(actual), \ f'expected/actual lines mismatch: {len(expected_regexes)=} {len(actual)=}.' @@ -1078,12 +1112,17 @@ def test_cli_out(): if os.environ.get('PYMUPDF_USE_EXTRA') == '0': log_prefix = f'.+Using non-default setting from PYMUPDF_USE_EXTRA: \'0\'' + sys.path.append(os.path.normpath(f'{__file__}/../..')) + try: + import pipcl + finally: + del sys.path[0] + pipcl.show_system() def check( expect_out, expect_err, message=None, log=None, - verbose=0, ): ''' Sets PYMUPDF_MESSAGE to `message` and PYMUPDF_LOG to `log`, runs @@ -1095,39 +1134,44 @@ def check( env['PYMUPDF_LOG'] = log if message: env['PYMUPDF_MESSAGE'] = message + command = 'pymupdf internal' + print(f'Running: {command}', flush=1) + if env: + print(f'with:') + for key in sorted(env.keys()): + print(f' {key}={shlex.quote(env[key])}') env = os.environ | env - print(f'Running with {env=}: pymupdf internal', flush=1) - cp = subprocess.run(f'pymupdf internal', shell=1, check=1, capture_output=1, env=env, text=True) + cp = subprocess.run(command, shell=1, check=1, capture_output=1, env=env, text=True) - if verbose: - #print(f'{cp.stdout=}.', flush=1) - #print(f'{cp.stderr=}.', flush=1) - sys.stdout.write(f'stdout:\n{textwrap.indent(cp.stdout, " ")}') - sys.stdout.write(f'stderr:\n{textwrap.indent(cp.stderr, " ")}') check_lines(expect_out, cp.stdout) check_lines(expect_err, cp.stderr) - # + print(f'test_cli_out(): {Py_GIL_DISABLED=}') + print(f'test_cli_out(): {gil_enabled=}') + print(f'Checking default, all output to stdout.') + regex_gil_stderr = None + if Py_GIL_DISABLED and gil_enabled: + regex_gil_stderr = '.*The global interpreter lock.*' check( [ log_prefix, 'This is from PyMuPDF message[(][)][.]', '.+This is from PyMuPDF log[(][)].', ], - '', + regex_gil_stderr, ) # if platform.system() != 'Windows': print(f'Checking redirection of everything to /dev/null.') - check('', '', 'path:/dev/null', 'path:/dev/null') + check('', regex_gil_stderr, 'path:/dev/null', 'path:/dev/null') # print(f'Checking redirection to files.') path_out = os.path.abspath(f'{__file__}/../../tests/test_cli_out.out') path_err = os.path.abspath(f'{__file__}/../../tests/test_cli_out.err') - check('', '', f'path:{path_out}', f'path:{path_err}') + check('', regex_gil_stderr, f'path:{path_out}', f'path:{path_err}') def read(path): with open(path) as f: return f.read() @@ -1143,6 +1187,7 @@ def read(path): 'This is from PyMuPDF message[(][)][.]', ], [ + regex_gil_stderr, log_prefix, '.+This is from PyMuPDF log[(][)].', ], @@ -1211,6 +1256,7 @@ def check( '.+this is pymupdf.log[(][)]', ], [ + regex_gil_stderr, 'this is pymupdf.message[(][)] 2', '.+this is pymupdf.log[(][)] 2', ], @@ -1233,6 +1279,7 @@ def check( log_prefix, ], [ + regex_gil_stderr, 'WARNING:pymupdf:this is pymupdf.message[(][)]', 'WARNING:pymupdf:.+this is pymupdf.log[(][)]', ], @@ -1247,6 +1294,7 @@ def check( ''', '', [ + regex_gil_stderr, log_prefix, 'this is pymupdf.message[(][)]', '.+this is pymupdf.log[(][)]', @@ -1276,6 +1324,8 @@ def check( log_prefix, ], [ + regex_gil_stderr, + log_prefix, 'WARNING:foo:this is pymupdf.message[(][)]', 'ERROR:foo:.+this is pymupdf.log[(][)]', ], @@ -1300,6 +1350,8 @@ def check( log_prefix, ], [ + regex_gil_stderr, + log_prefix, 'CRITICAL:pymupdf:this is pymupdf.message[(][)]', 'INFO:pymupdf:.+this is pymupdf.log[(][)]', ], @@ -1316,7 +1368,9 @@ def check( pymupdf.log('this is pymupdf.log()') ''', [], - [], + [ + regex_gil_stderr, + ], ) @@ -2026,10 +2080,15 @@ def test_4392(): # We get SEGV's etc with older swig. if platform.system() == 'Windows': assert (e2, e3) == (0xc0000005, 0xc0000005) - else: + elif platform.system() == 'Linux': # On plain linux we get (139, 139). On manylinux we get (-11, # -11). On MacOS we get (-11, -11). assert (e2, e3) == (139, 139) or (e2, e3) == (-11, -11) + elif platform.system() == 'Darwin': + # python3.14t gives (4, -11)? + assert (e2, e3) == (-11, -11) or (e2, e3) == (4, -11) + else: + assert e2 and e3 def test_4639():