#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Source: scripts/PDFPasswordTool.py at master · shmilee/scripts · GitHub

# Copyright (c) 2025 shmilee

'''
Encrypt PDF files, and decrypt (crack) password-protected PDF files.
Ref:
1. https://pypdf.readthedocs.io/en/stable/user/encryption-decryption.html
2. https://github.com/chenluda/pdf-password
'''

import os
import argparse
import time

from typing import (
    Any,
    Optional,
    Union,
    cast,
)
from tqdm import tqdm
from pypdf import PdfReader, PdfWriter
from pypdf.constants import Core as CO
from pypdf.generic import (
    ArrayObject,
    DictionaryObject,
    IndirectObject,
    NameObject,
    NullObject,
    NumberObject,
)


def get_pdf_reader(file):
    """
    Open ``file`` as a PDF and hand back a reader for it.

    Returns a ``PdfReader`` when ``file`` names an existing regular file
    that can be opened. Otherwise a one-line diagnostic is printed and
    ``None`` is returned; any exception raised while checking or opening
    the file is reported as an invalid PDF instead of being propagated.
    """
    opened = None
    try:
        if os.path.isfile(file):
            opened = PdfReader(file)
        else:
            print(f'=> PDF {file} not found!')
    except Exception:
        # Best-effort helper: callers only test the result for truthiness.
        print(f'=> Invalid PDF: {file}!')
    return opened


def encrypt_pdf(input_pdf, output_pdf, user_password, **kwargs):
    """
    Write a password-protected copy of ``input_pdf`` to ``output_pdf``.

    Nothing is written when the input cannot be opened or is already
    encrypted. Extra keyword arguments are forwarded unchanged to
    ``PdfWriter.encrypt`` (owner_password, permissions_flag, algorithm).
    Errors raised while cloning, encrypting or writing are printed, not
    raised.
    """
    source = get_pdf_reader(input_pdf)
    if not source:
        return
    if source.is_encrypted:
        print(f"=> PDF {input_pdf} has been encrypted.")
        return
    try:
        # Clone the whole document, protect it, then persist it.
        protected = PdfWriter(clone_from=source)
        protected.encrypt(user_password, **kwargs)
        protected.write(output_pdf)
        print(f"=> Save the encrypted PDF to {output_pdf}.")
    except Exception as failure:
        print(f"发生错误：{failure}")


class MyPdfWriter(PdfWriter):
    '''
    ``PdfWriter`` subclass adding :meth:`copy_from`, a custom variant of
    :meth:`append` (:meth:`merge`) that shows a progress bar while the
    pages are being copied.
    '''

    def copy_from(
        self,
        reader: PdfReader,
        pages: Optional[list[int]] = None,
        import_outline: bool = True,
        excluded_fields: Optional[Union[list[str], tuple[str, ...]]] = (),
    ) -> None:
        """
        Copy the pages from the given PdfReader onto the end of the file.

        Besides the pages themselves, named destinations, the outline,
        annotations, AcroForm fields and articles that refer to the copied
        pages are carried over as well.

        Args:
            reader: A PdfReader Object
            pages: a list (or range) of 0-based page indexes to copy from
                the source document into the output document. Despite the
                ``None`` default, a list or range must be passed: any other
                value raises ``TypeError``.
            import_outline: You may prevent the source document's
                outline (collection of outline items, previously referred to as
                'bookmarks') from being imported by specifying this as ``False``.
            excluded_fields: provide the list of fields/keys to be ignored
                if ``/Annots`` is part of the list, the annotation will be ignored
                if ``/B`` is part of the list, the articles will be ignored

        Raises:
            TypeError: The pages attribute is not configured properly

        """
        if excluded_fields is None:
            excluded_fields = ()
        if not isinstance(pages, (list, range)):
            raise TypeError('"pages" must be a list or a range')

        # t0 is only used by the commented-out timing prints below.
        t0 = time.time()
        # Maps the source page's object id (idnum) to the page object that
        # add_page() returned for it in this writer.
        srcpages = {}
        for page in tqdm(pages, desc="正在复制"):
            pg = reader.pages[page]
            assert pg.indirect_reference is not None
            # "/B" and "/Annots" are always excluded here; they are handled
            # separately further down. NOTE(review): the integer entries are
            # presumably pypdf depth markers for the key that follows them;
            # confirm against PdfWriter.add_page.
            srcpages[pg.indirect_reference.idnum] = self.add_page(
                pg, [*list(excluded_fields), 1, "/B", 1, "/Annots"]  # type: ignore
            )
            # Keep the source page around: its "/Annots" are read later.
            srcpages[pg.indirect_reference.idnum].original_page = pg
        # Author's timing note: this stage cost about 4s.
        #print(f'cost-time1={time.time()-t0}s')
        t0 = time.time()  # reset

        reader._named_destinations = (
            reader.named_destinations
        )  # need for the outline processing below

        arr: Any

        def _process_named_dests(dest: Any) -> None:
            # Re-create one named destination of the source in this writer,
            # retargeted at the copied page; destinations that already
            # exist, have no page, or point at a page that was not copied
            # are skipped.
            arr = dest.dest_array
            if "/Names" in self._root_object and dest["/Title"] in cast(
                list[Any],
                cast(
                    DictionaryObject,
                    cast(DictionaryObject, self._root_object["/Names"]).get("/Dests", DictionaryObject()),
                ).get("/Names", DictionaryObject()),
            ):
                # already exists: should not duplicate it
                pass
            elif dest["/Page"] is None or isinstance(dest["/Page"], NullObject):
                pass
            elif isinstance(dest["/Page"], int):
                # the page reference is a page number normally not a PDF Reference
                # page numbers as int are normally accepted only in external goto
                try:
                    p = reader.pages[dest["/Page"]]
                except IndexError:
                    return
                assert p.indirect_reference is not None
                try:
                    arr[NumberObject(0)] = NumberObject(
                        srcpages[p.indirect_reference.idnum].page_number
                    )
                    self.add_named_destination_array(dest["/Title"], arr)
                except KeyError:
                    # the target page is not among the copied pages
                    pass
            elif dest["/Page"].indirect_reference.idnum in srcpages:
                arr[NumberObject(0)] = srcpages[
                    dest["/Page"].indirect_reference.idnum
                ].indirect_reference
                self.add_named_destination_array(dest["/Title"], arr)

        for dest in reader._named_destinations.values():
            _process_named_dests(dest)
        # Author's timing note: this stage cost about 0.2s.
        #print(f'cost-time2={time.time()-t0}s')
        t0 = time.time() # reset

        outline_item_typ = self.get_outline_root()

        _ro = reader.root_object
        # Import the source outline (bookmarks), filtered against srcpages.
        if import_outline and CO.OUTLINES in _ro:
            outline = self._get_filtered_outline(
                _ro.get(CO.OUTLINES, None), srcpages, reader
            )
            self._insert_filtered_outline(
                outline, outline_item_typ, None
            )  # TODO: use before parameter
        # Author's timing note: this stage cost about 0.08s.
        #print(f'cost-time3={time.time()-t0}s')
        t0 = time.time() # reset

        # Annotations were excluded in add_page() above; re-insert the
        # filtered ones on each copied page unless the caller excluded them.
        if "/Annots" not in excluded_fields:
            for pag in srcpages.values():
                lst = self._insert_filtered_annotations(
                    pag.original_page.get("/Annots", []), pag, srcpages, reader
                )
                if len(lst) > 0:
                    pag[NameObject("/Annots")] = lst
                self.clean_page(pag)

        # Form handling: make sure this writer has an "/AcroForm" and that
        # its "/Fields" array references the fields copied with the pages.
        if "/AcroForm" in _ro and _ro["/AcroForm"] is not None:
            if "/AcroForm" not in self._root_object:
                # Clone the source form dictionary with ("/Fields",) passed
                # as the third clone() argument; the field list is rebuilt
                # below, starting from an empty array.
                self._root_object[NameObject("/AcroForm")] = self._add_object(
                    cast(
                        DictionaryObject,
                        reader.root_object["/AcroForm"],
                    ).clone(self, False, ("/Fields",))
                )
                arr = ArrayObject()
            else:
                arr = cast(
                    ArrayObject,
                    cast(DictionaryObject, self._root_object["/AcroForm"])["/Fields"],
                )
            # NOTE(review): presumably maps source object ids to the ids of
            # their copies in this writer; confirm against pypdf internals.
            trslat = self._id_translated[id(reader)]
            try:
                for f in reader.root_object["/AcroForm"]["/Fields"]:  # type: ignore
                    try:
                        ind = IndirectObject(trslat[f.idnum], 0, self)
                        if ind not in arr:
                            arr.append(ind)
                    except KeyError:
                        # for trslat[] which mean the field has not be copied
                        # through the page
                        pass
            except KeyError:  # for /Acroform or /Fields are not existing
                arr = self._add_object(ArrayObject())
            cast(DictionaryObject, self._root_object["/AcroForm"])[
                NameObject("/Fields")
            ] = arr

        # Articles were excluded in add_page() above; add the filtered ones
        # unless the caller excluded them.
        if "/B" not in excluded_fields:
            self.add_filtered_articles("", srcpages, reader)
        # Author's timing note: this stage cost about 0.02s.
        #print(f'cost-time4={time.time()-t0}s')


class PdfCracker(object):
    '''
    Crack a password-protected PDF file with password dictionaries and
    optionally save an unprotected copy.

    Attributes set by ``__init__``:
        reader: the ``PdfReader`` opened on the input file.
        dictionary_files: sorted paths of the dictionary files found.
        is_decrypted: True once the document content is accessible.
        password: the password that worked; ``''`` when the empty password
            decrypts the file, ``None`` when the file is not encrypted or
            no password has been found (yet).
    '''

    def __init__(self, input_pdf, dictionary_folder):
        """
        Open ``input_pdf`` and collect dictionary files.

        Args:
            input_pdf: path of the PDF to crack.
            dictionary_folder: directory searched recursively for
                ``.txt`` / ``.dic`` / ``.lst`` password lists.

        Raises:
            ValueError: ``input_pdf`` is missing or cannot be opened.
        """
        reader = get_pdf_reader(input_pdf)
        if not reader:
            raise ValueError(f'Invalid PDF: {input_pdf}!') from None
        self.reader = reader
        # Walk the dictionary folder; only text-like files are kept.
        dictionary_files = []
        valid_extensions = ('.txt', '.dic', '.lst')
        for root, _, files in os.walk(dictionary_folder):
            for file in files:
                if file.lower().endswith(valid_extensions):
                    dictionary_files.append(os.path.join(root, file))
                    count = len(dictionary_files)
                    print(f"{count}) 添加字典文件: {dictionary_files[-1]}")
        self.dictionary_files = sorted(dictionary_files)
        # Always define the attribute, so that reading ``password`` after a
        # failed crack yields None instead of raising AttributeError.
        self.password = None
        if self.reader.is_encrypted:
            self.is_decrypted = False
            # Try the empty password first: a file protected only by an
            # owner password can be opened this way directly.
            if self.reader.decrypt(''):
                print("=》空密码解密成功")
                self.is_decrypted = True
                self.password = ''
        else:
            self.is_decrypted = True

    def crack_password(self):
        """
        Try every password of every dictionary file until one decrypts.

        On success ``is_decrypted`` and ``password`` are updated and the
        password is printed. Unreadable dictionary files are reported and
        skipped; Ctrl-C stops the search with a message.
        """
        if self.is_decrypted:
            return
        open_kwargs = dict(encoding='utf-8', errors='ignore')
        try:
            for idx, dict_file in enumerate(self.dictionary_files, 1):
                desc = f'尝试字典[{idx}/{len(self.dictionary_files)}], 进度'
                try:
                    with open(dict_file, 'r', **open_kwargs) as pwd_file:
                        passwords = pwd_file.readlines()
                    for pwd in tqdm(passwords, desc=f'🔎 {desc}'):
                        # Strip line endings only: leading/trailing spaces
                        # may be part of the password.
                        pwd = pwd.rstrip('\n\r')
                        if self.reader.decrypt(pwd):  # 0, 1 or 2
                            self.is_decrypted = True
                            self.password = pwd
                            break
                except (UnicodeDecodeError, IOError) as err:
                    print(f"=》⚠️ 无法读取字典文件 {dict_file}: {err}")
                if self.is_decrypted:
                    break
            if self.is_decrypted:
                print(f"✅ 找到密码: {self.password}")
        except KeyboardInterrupt:
            print("\n⛔ 用户中断")

    @staticmethod
    def parse_pages_spec(pages_spec):
        """
        Parse a 1-based page specification into sorted 0-based indexes.

        Supported formats (the result is de-duplicated and sorted, so the
        order given by the user is not preserved):
        1. single page: "10" -> [9]
        2. inclusive range: "1-20" -> [0, 1, ..., 19]
        3. comma separated: "31,64,55" -> [30, 54, 63]
        4. mixed: "1-20,31,64,55" -> [0, ..., 19, 30, 54, 63]

        Page numbers below 1 are dropped. Malformed items are reported and
        skipped. An empty or ``None`` spec gives ``[]``. No upper bound is
        applied here, because the page count is not known.
        """
        if not pages_spec:
            return []
        pages = set()
        # Items are separated by commas.
        for spec in pages_spec.split(','):
            spec = spec.strip()
            if not spec:
                continue
            if '-' in spec:
                # Range such as "1-20".
                try:
                    start, end = map(int, spec.split('-'))
                except ValueError:
                    print(f"警告: 无效的范围格式 '{spec}'，跳过")
                    continue
                # To 0-based, ``end`` inclusive. Clamp the start so that a
                # range beginning below page 1 cannot produce a negative
                # index, matching the single-page branch below.
                pages.update(range(max(start - 1, 0), end))
            else:
                # Single page number.
                try:
                    page_num = int(spec) - 1  # to 0-based index
                except ValueError:
                    print(f"警告: 无效的页码 '{spec}'，跳过")
                    continue
                if page_num >= 0:
                    pages.add(page_num)
        return sorted(pages)

    def save(self, output_pdf, pages=None):
        '''
        Write the decrypted document, or a selection of its pages, to
        ``output_pdf``.

        Args:
            output_pdf: destination path.
            pages: str passed to :meth:`parse_pages_spec`. Page numbers
                given by the user start from 1 and are converted to 0-based
                indexes; indexes outside the document are dropped. When the
                spec is empty or yields nothing, every page is copied.

        Errors raised while copying or writing are printed, not raised.
        '''
        if not self.is_decrypted:
            print('=》未解密 PDF 无法编辑!')
            return
        try:
            writer = MyPdfWriter()  # blank PDF
            # Copy the metadata of the original PDF.
            if self.reader.metadata:
                writer.metadata = self.reader.metadata
            if self.reader.xmp_metadata:
                writer.xmp_metadata = self.reader.xmp_metadata
            N = len(self.reader.pages)
            pages = self.parse_pages_spec(pages)
            if pages:
                pages = [i for i in pages if 0 <= i < N]
            else:
                pages = range(N)
            print(f"=》新 PDF 共 {len(pages)} 页")
            # Copy pages and the outline (aka bookmarks) of the original.
            # ref: https://github.com/py-pdf/pypdf/discussions/1625
            writer.copy_from(self.reader, pages=pages, import_outline=True)
            print(f"\r=》保存到文件 {output_pdf} ...", end=' ')
            writer.write(output_pdf)
            print('完成。')
        except Exception as err:
            print(f"发生错误：{err}")


def main():
    """
    Command-line entry point: parse the arguments and run a command.

    ``encrypt`` needs --input, --output and --password; ``crack`` needs
    --input and --dict-dir, and writes a decrypted copy only when --output
    is given. Without a command, or with -h/--help, the help text and
    usage examples are printed. Missing required options are reported
    together with the help text.
    """
    # add_help=False: -h/--help is declared manually below so that the
    # usage examples can be printed after the generated help.
    parser = argparse.ArgumentParser(
        prog='PDFPasswordTool.py',
        description="Tool to encrypt or crack PDF passwords",
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False
    )
    comgroup = parser.add_argument_group('common options')
    comgroup.add_argument('command', type=str, nargs='?',
                          choices=['encrypt', 'crack'],
                          help='Choose a command to execute')
    comgroup.add_argument('-i', '--input', type=str,  # required=True,
                          help='Input PDF file')
    comgroup.add_argument('-o', '--output', type=str,
                          help='Output encrypted or decrypted PDF file')
    comgroup.add_argument('-h', '--help', action='store_true',
                          help='Show help message and exit')
    # encrypt group
    encgroup = parser.add_argument_group('encrypt options')
    encgroup.add_argument('-p', '--password', type=str,
                          help='Password for encryption')
    encgroup.add_argument('--owner-password', type=str,
                          help='Owner password (optional)')
    encgroup.add_argument('--algorithm', type=str,
                          choices=['AES-256', 'AES-128', 'RC4-128'],
                          default='AES-256',
                          help='Encryption algorithm')
    encgroup.add_argument('--permissions', type=int,
                          help='Permissions flag (integer),\n'
                          'see Table 3.20 of the PDF 1.7 specification')
    # decrypt/crack group
    crkgroup = parser.add_argument_group('crack options')
    crkgroup.add_argument('-d', '--dict-dir', type=str, metavar='DIR',
                          help="Password dictionary directory")
    crkgroup.add_argument('--pages', type=str,
                          help='Output page numbers to extract (optional)\n'
                          'Start from 1, supported formats:\n'
                          '  1) range (1-5);\n'
                          '  2) comma-separated (8,9,10);\n'
                          '  3) mixed (1-5,8,9,10)')

    def print_help_examples():
        # Generated help followed by two usage examples.
        parser.print_help()
        print("\nExamples:")
        print("  PDFPasswordTool.py encrypt -i input.pdf -o encrypted.pdf -p mypassword")
        print("  PDFPasswordTool.py crack -i encrypted.pdf -d ./dictionaries --pages 1-5,8,6,5 -o decrypted.pdf")

    args = parser.parse_args()
    if args.help:
        print_help_examples()
        return

    def check_required_arguments(*arguments):
        # Report every missing (falsy) option; return True if any is lost.
        lost_required_arguments = False
        for attr in arguments:
            if not getattr(args, attr):
                opt = attr.replace('_', '-')
                print(f"=> ⚠️  The '--{opt}' argument is required!")
                lost_required_arguments = True
        if lost_required_arguments:
            print_help_examples()
        return lost_required_arguments

    if args.command == 'encrypt':
        if check_required_arguments('input', 'output', 'password'):
            return
        encrypt_kwargs = {}
        if args.owner_password:
            encrypt_kwargs['owner_password'] = args.owner_password
        # Compare with None, not truthiness: an explicit `--permissions 0`
        # is a meaningful flag value and must be forwarded, not dropped.
        if args.permissions is not None:
            encrypt_kwargs['permissions_flag'] = args.permissions
        if args.algorithm:
            encrypt_kwargs['algorithm'] = args.algorithm
        encrypt_pdf(args.input, args.output, args.password, **encrypt_kwargs)
    elif args.command == 'crack':
        if check_required_arguments('input', 'dict_dir'):
            return
        cracker = PdfCracker(args.input, args.dict_dir)
        if not cracker.is_decrypted:
            cracker.crack_password()
        if args.output:
            cracker.save(args.output, pages=args.pages)
    else:
        print_help_examples()


# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()