Skip to content

Commit fb7c3e1

Browse files
committed
Tweaking the yaml note stripper
CodeQL was complaining about a regular expression with nested quantifiers that it flagged as a possible polynomial ReDoS (py/polynomial-redos). It probably wasn't a real problem, but this line-based approach might be more robust anyway. I'll be honest: I haven't gone through carefully to understand all the regular expressions that Codex (GPT 5.5) has come up with here, but I am adding unit tests to reassure myself that it works.
1 parent 07243d6 commit fb7c3e1

2 files changed

Lines changed: 51 additions & 25 deletions

File tree

rmgpy/util.py

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,42 @@ def notify(self, modifier=None):
114114
observer.update(self)
115115

116116

117+
def _strip_wrapped_flow_yaml_notes(text):
118+
"""Strip wrapped flow-style YAML notes without a nested regex."""
119+
lines = text.splitlines(keepends=True)
120+
stripped_lines = []
121+
i = 0
122+
while i < len(lines):
123+
line = lines[i]
124+
if (
125+
stripped_lines
126+
and stripped_lines[-1].rstrip().endswith(",")
127+
and line.lstrip(" \t").startswith("note:")
128+
):
129+
end_index = i
130+
while end_index < len(lines):
131+
if "}" in lines[end_index]:
132+
comma_index = stripped_lines[-1].rfind(",")
133+
brace_index = lines[end_index].find("}")
134+
stripped_lines[-1] = stripped_lines[-1][:comma_index] + lines[end_index][brace_index:]
135+
i = end_index + 1
136+
break
137+
if lines[end_index].rstrip().endswith(","):
138+
i = end_index + 1
139+
break
140+
end_index += 1
141+
else:
142+
stripped_lines.append(line)
143+
i += 1
144+
continue
145+
continue
146+
147+
stripped_lines.append(line)
148+
i += 1
149+
150+
return "".join(stripped_lines)
151+
152+
117153
def make_output_subdirectory(output_directory, folder):
118154
"""
119155
Create a subdirectory `folder` in the output directory. If the folder
@@ -132,36 +168,26 @@ def strip_yaml_notes(src, dst):
132168
ordering, etc.) - important when the source is the carefully
133169
crafted ck2yaml output.
134170
135-
Three patterns are handled (notes are always the last key, by
136-
how RMG / ck2yaml emit them):
171+
Three patterns are handled:
137172
1. Block-style: `` note: ...`` on its own line,
138173
possibly followed by deeper-indented continuation lines
139174
(multi-line literal/folded scalars).
140-
2. Single-line flow: ``{..., note: foo}`` -> ``{...}``
175+
2. Single-line flow: ``{..., note: foo, ...}`` -> ``{..., ...}``
141176
3. Wrapped flow: a flow mapping that wraps with the
142177
trailing ``,`` at the end of one line and
143-
`` note: foo}`` on the next -> drop the comma and
144-
replace with ``}`` on the prior line.
178+
`` note: foo`` on the next -> drop the note field.
145179
"""
146180
if not os.path.exists(src):
147181
return
148182
with open(src) as f:
149183
text = f.read()
150-
# Wrapped flow style: a flow mapping that wraps after a
151-
# trailing ``,``, with ``note: value`` on the next line
152-
# (value may itself wrap across several more-indented lines)
153-
# ending in ``}``. Replace the whole tail with ``}``.
154-
# CodeQL flags this as polynomial ReDoS (py/polynomial-redos);
155-
# safe here because [^\n}]* and \n[ \t]+ consume disjoint
156-
# characters (no alternative-path overlap) and the inner *
157-
# consumes >=2 chars per iteration, so worst-case is O(N^2)
158-
# rather than exponential. Inputs are RMG-generated YAML,
159-
# not adversarial.
160-
text = re.sub(
161-
r',[ \t]*\n[ \t]+note:[^\n}]*(?:\n[ \t]+[^\n}]*)*\}',
162-
'}', text) # lgtm[py/polynomial-redos]
163-
# Single-line flow style: ``, note: value}`` -> ``}``.
164-
text = re.sub(r',[ \t]*note:[^,}]*\}', '}', text)
184+
# Wrapped flow style: a flow mapping that wraps after a trailing comma,
185+
# with ``note: value`` on the next line.
186+
text = _strip_wrapped_flow_yaml_notes(text)
187+
# Single-line flow style.
188+
text = re.sub(r',[ \t]*note:[^,}\n]*', '', text)
189+
text = re.sub(r'(\{)[ \t]*note:[^,}\n]*,[ \t]*', r'\1', text)
190+
text = re.sub(r'\{[ \t]*note:[^,}\n]*\}', '{}', text)
165191
# Block style: `` note: ...\n`` plus deeper-indented
166192
# continuation lines.
167193
text = re.sub(r'^( +)note:.*\n(?:\1 +[^\n]*\n)*', '', text, flags=re.MULTILINE)

test/rmgpy/rmgUtilTest.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def test_strip_yaml_notes_removes_block_style_multiline_note(self, tmp_path):
7979
def test_strip_yaml_notes_removes_single_line_flow_note(self, tmp_path):
8080
source = """species:
8181
- name: Ar
82-
transport: {model: gas, geometry: atom, diameter: 3.33, well-depth: 136.5, note: RMG transport}
82+
transport: {model: gas, note: RMG transport, geometry: atom, diameter: 3.33, well-depth: 136.5}
8383
"""
8484
expected = """species:
8585
- name: Ar
@@ -92,8 +92,8 @@ def test_strip_yaml_notes_removes_wrapped_flow_note(self, tmp_path):
9292
source = """species:
9393
- name: CH4
9494
transport: {model: gas, geometry: nonlinear, diameter: 3.746,
95-
well-depth: 141.4,
96-
note: RMG transport note}
95+
note: RMG transport note,
96+
well-depth: 141.4}
9797
"""
9898
expected = """species:
9999
- name: CH4
@@ -107,9 +107,9 @@ def test_strip_yaml_notes_removes_wrapped_flow_multiline_note(self, tmp_path):
107107
source = """species:
108108
- name: CH4
109109
transport: {model: gas, geometry: nonlinear, diameter: 3.746,
110-
well-depth: 141.4,
111110
note: RMG transport note
112-
with wrapped detail}
111+
with wrapped detail,
112+
well-depth: 141.4}
113113
"""
114114
expected = """species:
115115
- name: CH4

0 commit comments

Comments
 (0)