Skip to content

Commit c18609d

Browse files
committed
Initial implementation of fuzzing harness
Added pipeline Fix path in build script Formatting Add Updated changelog to reflect PR number
1 parent c562774 commit c18609d

6 files changed

Lines changed: 205 additions & 0 deletions

File tree

.github/workflows/cifuzz.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# CIFuzz: build the OSS-Fuzz fuzzers for this project and run them briefly on
# pushes to the main branches and on every pull request.
name: CIFuzz
on:
  push:
    branches:
      - stable
      - develop
  pull_request:
# Drop all default token permissions; the job opts back in to what it needs.
permissions: {}
jobs:
  Fuzzing:
    runs-on: ubuntu-latest
    permissions:
      # Needed by the "Upload Sarif" step to publish code-scanning results.
      security-events: write
    steps:
      - name: Build Fuzzers
        id: build
        uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
        with:
          oss-fuzz-project-name: 'pdfplumber'
          language: python
      - name: Run Fuzzers
        uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
        with:
          oss-fuzz-project-name: 'pdfplumber'
          language: python
          fuzz-seconds: 800
          output-sarif: true
      # Only runs when the build succeeded but a later step (the fuzz run) failed.
      - name: Upload Crash
        # v3 of upload-artifact is deprecated; v4 is the supported major version.
        uses: actions/upload-artifact@v4
        if: failure() && steps.build.outcome == 'success'
        with:
          name: artifacts
          path: ./out/artifacts
      - name: Upload Sarif
        if: always() && steps.build.outcome == 'success'
        # v2 of the CodeQL action is deprecated; v3 is a supported major version.
        uses: github/codeql-action/upload-sarif@v3
        with:
          # Path to SARIF file relative to the root of the repository
          sarif_file: cifuzz-sarif/results.sarif
          checkout_path: cifuzz-sarif

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ All notable changes to this project will be documented in this file. The format
99
- Add `--format text` options to CLI (in addition to previously-available `csv` and `json`) (h/t @brandonrobertz). ([#1235](https://github.com/jsvine/pdfplumber/pull/1235))
1010
- Add `raise_unicode_errors: bool` parameter to `pdfplumber.open()` to allow bypassing `UnicodeDecodeError`s in annotation-parsing and generate warnings instead (h/t @stolarczyk). ([#1195](https://github.com/jsvine/pdfplumber/issues/1195))
1111
- Add `name` property to `image` objects (h/t @djr2015). ([#1201](https://github.com/jsvine/pdfplumber/discussions/1201))
12+
- Added necessary build scripts, pipelines, and harnesses to integrate with [OSS-Fuzz](https://github.com/google/oss-fuzz). ([#1245](https://github.com/jsvine/pdfplumber/pull/1245))
1213

1314
### Fixed
1415

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,7 @@ Many thanks to the following users who've contributed ideas, features, and fixes
570570
- [@wodny](https://github.com/wodny)
571571
- [Michal Stolarczyk](https://github.com/stolarczyk)
572572
- [Brandon Roberts](https://github.com/brandonrobertz)
573+
- [@ennamarie19](https://github.com/ennamarie19/)
573574

574575
## Contributing
575576

fuzz/build.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash -eu
# Build script for the fuzzing harness: installs pdfplumber, compiles every
# *_fuzzer.py under fuzz/, and packages the repository's PDFs as a seed corpus.
# Expects $SRC and $OUT to be set and `compile_python_fuzzer` to be on PATH.

cd "$SRC"/pdfplumber
pip3 install .

# Build fuzzers in $OUT
# Paths are read NUL-delimited so names containing whitespace are not split.
while IFS= read -r -d '' fuzzer; do
  compile_python_fuzzer "$fuzzer"
done < <(find fuzz -name '*_fuzzer.py' -print0)

mkdir -p fuzz/corpus
# Prune fuzz/corpus so a re-run never copies a seed file onto itself, and
# restrict to regular files so a directory named *.pdf is not passed to cp.
find . -path ./fuzz/corpus -prune -o -type f -name "*.pdf" -exec cp "{}" fuzz/corpus \;
# Quote $OUT so an output directory containing spaces is handled correctly.
zip -q "$OUT/pdf_load_fuzzer_seed_corpus.zip" fuzz/corpus/*

fuzz/fuzz_helpers.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/usr/bin/python3
2+
# Copyright 2023 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
#
16+
################################################################################
17+
import contextlib
18+
import io
19+
import tempfile
20+
from enum import IntEnum
21+
from typing import Protocol, Type, TypeVar
22+
23+
import atheris
24+
25+
26+
class HasMax(Protocol):
    """Structural type for any object that carries an integer ``MAX`` attribute."""

    MAX: int


# Type variable constrained to ``IntEnum`` subclasses; ``ConsumeEnum`` uses it
# so that the returned member has the same enum type that was passed in.
T = TypeVar("T", bound=IntEnum)
31+
32+
33+
class EnhancedFuzzedDataProvider(atheris.FuzzedDataProvider):
    """``atheris.FuzzedDataProvider`` extended with convenience consumers.

    Adds helpers that draw variable-length byte/text payloads, wrap a payload
    in an in-memory or on-disk file, and pick a member of an ``IntEnum``.
    """

    def ConsumeRandomBytes(self) -> bytes:
        """Consume a fuzzer-chosen number of bytes, at most what remains."""
        length = self.ConsumeIntInRange(0, self.remaining_bytes())
        return self.ConsumeBytes(length)

    def ConsumeRandomString(self) -> str:
        """Consume a fuzzer-chosen amount of input as surrogate-free text."""
        length = self.ConsumeIntInRange(0, self.remaining_bytes())
        return self.ConsumeUnicodeNoSurrogates(length)

    def ConsumeRemainingString(self) -> str:
        """Consume all remaining input as surrogate-free text."""
        return self.ConsumeUnicodeNoSurrogates(self.remaining_bytes())

    def ConsumeRemainingBytes(self) -> bytes:
        """Consume all remaining input as raw bytes."""
        return self.ConsumeBytes(self.remaining_bytes())

    def _consume_file_data(self, all_data: bool, as_bytes: bool):
        """Draw the payload for a fuzzed file.

        Returns ``bytes`` when ``as_bytes`` is true and ``str`` otherwise.
        ``all_data`` selects between consuming everything that remains and
        consuming a fuzzer-chosen amount.
        """
        if all_data:
            if as_bytes:
                return self.ConsumeRemainingBytes()
            return self.ConsumeRemainingString()
        if as_bytes:
            return self.ConsumeRandomBytes()
        return self.ConsumeRandomString()

    # NOTE: the return annotation names the object yielded to the ``with``
    # body (a ``StringIO`` instead when ``as_bytes`` is false).
    @contextlib.contextmanager
    def ConsumeMemoryFile(
        self, all_data: bool = False, as_bytes: bool = True
    ) -> io.BytesIO:
        """Yield an in-memory file holding fuzzed data.

        Args:
            all_data: Consume all remaining input instead of a random amount.
            as_bytes: Yield an ``io.BytesIO`` when true, an ``io.StringIO``
                otherwise.

        The file is closed when the ``with`` block exits, including when the
        block raises.
        """
        file_data = self._consume_file_data(all_data, as_bytes)
        file = io.BytesIO(file_data) if as_bytes else io.StringIO(file_data)
        # Using the buffer as a context manager guarantees close() even if the
        # caller's block raises (the usual outcome when fuzzing a parser).
        with file:
            yield file

    # NOTE: the return annotation names the value yielded to the ``with`` body.
    @contextlib.contextmanager
    def ConsumeTemporaryFile(
        self, suffix: str, all_data: bool = False, as_bytes: bool = True
    ) -> str:
        """Yield the path of a temporary file holding fuzzed data.

        Args:
            suffix: File-name suffix passed to ``tempfile.NamedTemporaryFile``.
            all_data: Consume all remaining input instead of a random amount.
            as_bytes: Write the data in binary mode when true, text mode
                otherwise.

        The temporary file is closed (and thereby removed) when the ``with``
        block exits, including when the block raises.
        """
        file_data = self._consume_file_data(all_data, as_bytes)
        mode = "w+b" if as_bytes else "w+"
        with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix) as tfile:
            tfile.write(file_data)
            # Only the path is handed out, so the data must reach the file
            # before a consumer opens it by name.
            tfile.flush()
            yield tfile.name

    def ConsumeEnum(self, enum_type: Type[T]) -> T:
        """Return the member of ``enum_type`` for a fuzzer-chosen integer.

        The integer is drawn with ``ConsumeIntInRange(0, enum_type.MAX)``, so
        ``enum_type`` must define ``MAX`` and have a member for every value
        that call can produce; otherwise the enum lookup raises ``ValueError``.
        """
        return enum_type(self.ConsumeIntInRange(0, enum_type.MAX))

fuzz/pdf_load_fuzzer.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import sys
2+
from enum import IntEnum
3+
4+
import atheris
5+
from fuzz_helpers import EnhancedFuzzedDataProvider
6+
7+
with atheris.instrument_imports(include=["pdfplumber"]):
8+
from pdfminer.pdftypes import PDFException
9+
from pdfminer.psparser import PSException
10+
11+
import pdfplumber
12+
13+
14+
class CastType(IntEnum):
    """Export operations the fuzz target can exercise on an opened PDF."""

    CSV = 0
    IMAGE = 1
    JSON = 2
    DICT = 3
    # Upper bound handed to ``ConsumeEnum``; ``TestOneInput`` has no branch
    # for it, so drawing this value performs no export.
    MAX = 4
20+
21+
22+
def TestOneInput(data: bytes):
    """Fuzz entry point: open fuzzed bytes as a PDF and run one export on it.

    Args:
        data: Raw fuzzer input. A fuzzer-chosen prefix becomes the PDF file
            contents; a later value selects which ``CastType`` export to run.

    Returns:
        ``-1`` when pdfplumber/pdfminer rejects the input with one of the
        known, expected errors; ``None`` otherwise. (NOTE(review): -1 is
        presumably the libFuzzer "do not keep this input" signal — confirm
        against the Atheris version in use.)

    Raises:
        ValueError, TypeError: Re-raised when the message does not match the
            known benign patterns, so they are reported as findings.
    """
    fdp = EnhancedFuzzedDataProvider(data)

    try:
        with fdp.ConsumeMemoryFile(all_data=False, as_bytes=True) as f:
            # Open the PDF as a context manager so it is closed on every
            # path, including when an export below raises.
            with pdfplumber.open(f) as pdf:
                # Test casting
                cast_ty = fdp.ConsumeEnum(CastType)

                if cast_ty is CastType.CSV:
                    pdf.to_csv()
                elif cast_ty is CastType.IMAGE and pdf.pages:
                    pdf.pages[0].to_image()
                elif cast_ty is CastType.JSON:
                    pdf.to_json()
                elif cast_ty is CastType.DICT:
                    pdf.to_dict()

    except (PDFException, PSException, AssertionError):
        return -1
    except ValueError as e:
        if "invalid literal for int" in str(e):
            return -1
        # Bare raise keeps the original traceback intact.
        raise
    except TypeError as e:
        if "argument must be a string" in str(e):
            return -1
        raise
51+
52+
53+
def main():
    """Register ``TestOneInput`` with Atheris and start the fuzzing loop."""
    atheris.Setup(sys.argv, TestOneInput)
    atheris.Fuzz()


# Run the fuzzer only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)