Skip to content

Commit c328a1a

Browse files
committed
Fix path flattening collisions and make runfiles resolution deterministic in make_corpus_dir
1 parent a63c5f0 commit c328a1a

3 files changed

Lines changed: 268 additions & 2 deletions

File tree

fuzzing/tools/BUILD

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,10 @@ py_test(
8383
srcs = ["dict_validation_test.py"],
8484
deps = [":dict_validation"],
8585
)
86+
87+
py_test(
88+
name = "make_corpus_dir_test",
89+
srcs = ["make_corpus_dir_test.py"],
90+
data = ["make_corpus_dir.py"],
91+
deps = [requirement("absl-py")],
92+
)

fuzzing/tools/make_corpus_dir.py

Lines changed: 75 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,36 @@
3636

3737
flags.mark_flag_as_required("output_dir")
3838

39+
def flatten_corpus_path(corpus):
    """Turn a corpus file path into a single flat file name.

    Path components are joined with "-". A few markers keep different
    spellings of a path apart:

      * a leading "./" (or ".\\" when os.sep is a backslash) is removed
        and replaced by the prefix "dot-";
      * a drive returned by os.path.splitdrive() is kept as the first
        component, without its trailing ":" and separators;
      * without a drive, a path starting with a separator gets a
        leading "-".

    Empty components (for example from doubled separators) are dropped,
    so distinct inputs can still map to the same name.

    Args:
        corpus: The path string to flatten.

    Returns:
        The flattened name as a string.
    """
    backslash_sep = os.sep == "\\"
    has_dot_prefix = corpus.startswith("./") or (
        backslash_sep and corpus.startswith(".\\"))
    marker = "dot-" if has_dot_prefix else ""
    remainder = corpus[2:] if has_dot_prefix else corpus

    # Treat forward slashes as separators too when the native one is "\".
    if backslash_sep:
        remainder = remainder.replace("/", "\\")

    drive, tail = os.path.splitdrive(remainder)
    joined = "-".join(filter(None, tail.split(os.sep)))

    if drive:
        drive_label = drive.rstrip(":\\/").replace("\\", "-").replace("/", "-")
        if joined:
            return marker + drive_label + "-" + joined
        return marker + drive_label

    # Rooted path without a drive: keep a trace of the leading separator.
    if tail.startswith(os.sep):
        return marker + "-" + joined
    return marker + joined
60+
3961
def expand_corpus_to_file_list(corpus, file_list):
    """Append the files denoted by `corpus` to `file_list`.

    If `corpus` is a directory, every regular file found below it
    (recursively) is appended in sorted order; sub-directories themselves
    are not added. Any other existing path is appended unchanged.

    Args:
        corpus: Path of a corpus file or corpus directory.
        file_list: List that receives the resulting paths (modified in
            place).

    Raises:
        FileNotFoundError: If `corpus` does not exist.
    """
    if not os.path.exists(corpus):
        raise FileNotFoundError("file " + corpus + " doesn't exist")
    if not os.path.isdir(corpus):
        file_list.append(corpus)
        return

    # Escape the directory so that glob metacharacters in its name
    # ("[", "*", "?") are matched literally instead of as a pattern.
    pattern = os.path.join(glob.escape(corpus), "**")
    # glob("dir/**") also yields "dir/" and every sub-directory; the isfile
    # filter drops them, so nothing depends on the position of any result.
    # Sorting makes the order independent of directory listing order.
    # NOTE(review): glob skips dot-files by default - confirm that hidden
    # corpus entries are meant to be ignored.
    file_list.extend(sorted(
        path for path in glob.glob(pattern, recursive=True)
        if os.path.isfile(path)))
4771

@@ -59,8 +83,57 @@ def main(argv):
5983
corpus_line.rstrip("\n"), expanded_file_list)
6084

6185
if expanded_file_list:
86+
max_flattened_length = 200
87+
flattened_names = {}
88+
flattened_name_counts = {}
89+
needs_suffix = set()
90+
91+
for corpus in expanded_file_list:
92+
flattened = flatten_corpus_path(corpus)
93+
flattened_names[corpus] = flattened
94+
flattened_key = flattened.lower() if os.name == "nt" else flattened
95+
flattened_name_counts[flattened_key] = (
96+
flattened_name_counts.get(flattened_key, 0) + 1)
97+
if len(flattened) > max_flattened_length:
98+
needs_suffix.add(corpus)
99+
100+
for corpus in expanded_file_list:
101+
flattened = flattened_names[corpus]
102+
flattened_key = flattened.lower() if os.name == "nt" else flattened
103+
if flattened_name_counts[flattened_key] > 1:
104+
needs_suffix.add(corpus)
105+
106+
suffix_map = {}
107+
if needs_suffix:
108+
suffix_width = len(str(len(needs_suffix)))
109+
for index, corpus in enumerate(sorted(needs_suffix), start=1):
110+
suffix_map[corpus] = f"{index:0{suffix_width}d}"
111+
112+
final_name_map = {}
113+
final_name_counts = {}
114+
for corpus in expanded_file_list:
115+
flattened = flattened_names[corpus]
116+
suffix = suffix_map.get(corpus)
117+
if suffix:
118+
prefix_budget = max_flattened_length - len(suffix) - 2
119+
flattened = flattened[:max(1, prefix_budget)] + "--" + suffix
120+
final_name_map[corpus] = flattened
121+
flattened_key = flattened.lower() if os.name == "nt" else flattened
122+
final_name_counts[flattened_key] = (
123+
final_name_counts.get(flattened_key, 0) + 1)
124+
125+
if any(count > 1 for count in final_name_counts.values()):
126+
unique_corpora = sorted(set(expanded_file_list))
127+
alias_width = len(str(len(unique_corpora)))
128+
alias_map = {
129+
corpus: f"entry-{index:0{alias_width}d}"
130+
for index, corpus in enumerate(unique_corpora, start=1)
131+
}
132+
for corpus in expanded_file_list:
133+
final_name_map[corpus] = alias_map[corpus]
134+
62135
for corpus in expanded_file_list:
63-
dest = os.path.join(FLAGS.output_dir, corpus.replace("/", "-"))
136+
dest = os.path.join(FLAGS.output_dir, final_name_map[corpus])
64137
# Whatever the separator we choose, there is a chance that
65138
# the dest name conflicts with another file
66139
if os.path.exists(dest):
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Lint as: python3
16+
"""Unit tests for make_corpus_dir.py."""
17+
18+
import os
19+
import subprocess
20+
import sys
21+
import tempfile
22+
import unittest
23+
from pathlib import Path
24+
25+
26+
def resolve_script_path():
    """Locate make_corpus_dir.py for the tests.

    Lookup order:
      1. A file named make_corpus_dir.py next to this test file.
      2. $TEST_SRCDIR/$TEST_WORKSPACE/fuzzing/tools/make_corpus_dir.py
      3. $RUNFILES_DIR/$TEST_WORKSPACE/fuzzing/tools/make_corpus_dir.py
         (2 and 3 are only tried when TEST_WORKSPACE is set).
      4. Entries of the manifest named by $RUNFILES_MANIFEST_FILE whose
         logical path ends with "fuzzing/tools/make_corpus_dir.py" and
         whose real path is an existing file. An entry under
         "$TEST_WORKSPACE/" wins immediately; otherwise the first entry
         under "_main/" is used.

    Returns:
        A pathlib.Path pointing at the script.

    Raises:
        FileNotFoundError: If no location yields an existing file. When the
            manifest could not be read, the underlying OSError is attached
            as the cause.
    """
    script_parts = ("fuzzing", "tools", "make_corpus_dir.py")
    candidates = [Path(__file__).with_name("make_corpus_dir.py")]
    test_workspace = os.environ.get("TEST_WORKSPACE")
    if test_workspace:
        for variable in ("TEST_SRCDIR", "RUNFILES_DIR"):
            root = os.environ.get(variable)
            if root:
                candidates.append(Path(root) / test_workspace / Path(*script_parts))

    for candidate in candidates:
        if candidate.is_file():
            return candidate

    manifest_error = None
    manifest_file = os.environ.get("RUNFILES_MANIFEST_FILE")
    if manifest_file:
        workspace_match = None
        main_match = None
        try:
            with open(manifest_file, "r", encoding="utf-8") as manifest:
                for line in manifest:
                    # Each usable entry is "<logical path> <real path>".
                    logical_path, separator, real_path = (
                        line.rstrip("\n").partition(" "))
                    if not separator:
                        continue
                    normalized_path = logical_path.replace("\\", "/")
                    if not normalized_path.endswith("/".join(script_parts)):
                        continue
                    candidate = Path(real_path)
                    if not candidate.is_file():
                        continue
                    if test_workspace and normalized_path.startswith(
                            f"{test_workspace}/"):
                        workspace_match = candidate
                        break
                    if normalized_path.startswith("_main/") and not main_match:
                        main_match = candidate
        except OSError as error:
            # An unreadable manifest is not fatal by itself; remember why so
            # the final error can report it.
            manifest_error = error
        else:
            if workspace_match:
                return workspace_match
            if main_match:
                return main_match

    raise FileNotFoundError(
        "could not resolve make_corpus_dir.py in test runfiles"
    ) from manifest_error
80+
81+
82+
# Resolved once at import time; run_tool() launches this path for every test.
SCRIPT_PATH = resolve_script_path()
83+
84+
85+
class MakeCorpusDirTest(unittest.TestCase):
    """End-to-end tests that run make_corpus_dir.py as a subprocess."""

    def run_tool(self, args, cwd):
        """Run the script with `args` from directory `cwd`.

        Returns the CompletedProcess with captured text output; a non-zero
        exit status does not raise.
        """
        command = [sys.executable, str(SCRIPT_PATH)]
        command.extend(args)
        return subprocess.run(
            command,
            cwd=str(cwd),
            text=True,
            capture_output=True,
            check=False,
        )

    def _assert_succeeded(self, result):
        """Fail with the tool's stderr if it did not exit with status 0."""
        self.assertEqual(result.returncode, 0, msg=result.stderr)

    def _copied_files(self, output_dir):
        """Return the regular files directly inside `output_dir`."""
        return [entry for entry in output_dir.iterdir() if entry.is_file()]

    def _sorted_contents(self, files):
        """Return the sorted text contents of `files`."""
        return sorted(entry.read_text(encoding="utf-8") for entry in files)

    def test_copies_nested_corpus_directory(self):
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            corpus_dir = root / "corpus"
            nested_dir = corpus_dir / "nested"
            nested_dir.mkdir(parents=True)
            (corpus_dir / "a.txt").write_text("A", encoding="utf-8")
            (nested_dir / "b.txt").write_text("B", encoding="utf-8")

            outcome = self.run_tool(
                ["--corpus_list=corpus", "--output_dir=out"], cwd=root)

            self._assert_succeeded(outcome)
            copied = self._copied_files(root / "out")
            self.assertEqual(len(copied), 2)
            self.assertEqual(self._sorted_contents(copied), ["A", "B"])

    def test_copies_absolute_corpus_file(self):
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            source = root / "corpus-input.txt"
            source.write_text("payload", encoding="utf-8")
            destination = root / "out"

            outcome = self.run_tool(
                [f"--corpus_list={source}", f"--output_dir={destination}"],
                cwd=root,
            )

            self._assert_succeeded(outcome)
            copied = self._copied_files(destination)
            self.assertEqual(len(copied), 1)
            self.assertEqual(copied[0].read_text(encoding="utf-8"), "payload")

    def test_distinguishes_dot_prefix_from_plain_relative_path(self):
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            (root / "a.txt").write_text("payload", encoding="utf-8")

            outcome = self.run_tool(
                ["--corpus_list=./a.txt,a.txt", "--output_dir=out"],
                cwd=root,
            )

            self._assert_succeeded(outcome)
            self.assertEqual(len(self._copied_files(root / "out")), 2)

    def test_distinguishes_parent_navigation_from_plain_relative_path(self):
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            (root / "dir").mkdir()
            (root / "a.txt").write_text("payload", encoding="utf-8")

            outcome = self.run_tool(
                ["--corpus_list=dir/../a.txt,a.txt", "--output_dir=out"],
                cwd=root,
            )

            self._assert_succeeded(outcome)
            self.assertEqual(len(self._copied_files(root / "out")), 2)

    def test_distinguishes_dot_prefix_from_literal_dot_filename(self):
        with tempfile.TemporaryDirectory() as td:
            root = Path(td)
            for name, text in (("a.txt", "from-a"), ("dot-a.txt", "from-dot-a")):
                (root / name).write_text(text, encoding="utf-8")

            outcome = self.run_tool(
                ["--corpus_list=./a.txt,dot-a.txt", "--output_dir=out"],
                cwd=root,
            )

            self._assert_succeeded(outcome)
            copied = self._copied_files(root / "out")
            self.assertEqual(len(copied), 2)
            self.assertEqual(
                self._sorted_contents(copied), ["from-a", "from-dot-a"])
183+
184+
185+
if __name__ == "__main__":
186+
unittest.main()

0 commit comments

Comments
 (0)