Skip to content

Commit 310d1aa

Browse files
rwestclaude
andcommitted
Optimize filtration.py hot paths.
Claude did the work. I supervised. It did a bunch of benchmarking along the way. Overall, these only eke out a small win, but we may as well have them. I did look them over. --- filter_structures and its callees are called 332K times in the catalytic profile (~105s cumulative), with much of the work attributed to deepcopy (16.6M calls, 157s cum). Changes: - mark_unreactive_structures: hoist mol_list[0].copy(deep=True) out of the per-iteration loop. It was constant across iterations but was being deep-copied on every comparison. This is the largest contributor here. End-to-end resonance generation for butadienyl (small species, deepcopy-bound) drops from 0.81 ms to 0.71 ms (~12%); naphthalenyl (~1%, dominated by other work). - octet_filtration: hoist min(octet_deviation_list) out of the loop (was recomputed each iteration — O(N^2)). - stabilize_charges_by_electronegativity and _by_proximity: use a set for indices_to_pop instead of a list, so the `i in ...` membership checks are O(1) instead of O(N). - Replace min([...] or [0]) / max([...] or [0]) with the more efficient min(generator, default=0) / max(generator, default=0) (avoids materializing the list when only the min/max is needed). - atom.bonds.items() -> atom.edges.items() (4 sites): bonds is a @Property dispatching to edges; edges is the underlying list and avoids the property lookup. - atom.symbol -> atom.element.symbol (2 sites): same reasoning. All 265 molecule + 38 kinetics tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6441197 commit 310d1aa

1 file changed

Lines changed: 21 additions & 16 deletions

File tree

rmgpy/molecule/filtration.py

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ def get_octet_deviation(mol, allow_expanded_octet=True):
159159
elif atom.lone_pairs >= 2:
160160
octet_deviation += abs(8 - val_electrons) # octet on S p[2,3]
161161
# eg [S][S], OS[O], [NH+]#[N+][S-][O-], O[S-](O)[N+]#N, S=[O+][O-]
162-
for atom2, bond in atom.bonds.items():
162+
for atom2, bond in atom.edges.items():
163163
if atom2.is_sulfur() and bond.is_triple():
164164
octet_deviation += 0.5 # penalty for S#S substructures. Often times sulfur can have a triple
165165
# bond to another sulfur in a structure that obeys the octet rule, but probably shouldn't be a
@@ -186,10 +186,11 @@ def octet_filtration(mol_list, octet_deviation_list):
186186
charge-strained species are still kept (e.g., [NH]N=S=O <-> [NH+]#[N+][S-][O-]), we also generate during the same
187187
loop a charge_span_list to keep track of the charge spans. This is used for further filtering.
188188
"""
189+
min_octet_deviation = min(octet_deviation_list)
189190
filtered_list = []
190191
charge_span_list = []
191192
for index, mol in enumerate(mol_list):
192-
if octet_deviation_list[index] == min(octet_deviation_list):
193+
if octet_deviation_list[index] == min_octet_deviation:
193194
filtered_list.append(mol)
194195
charge_span_list.append(mol.get_charge_span())
195196

@@ -243,7 +244,7 @@ def charge_filtration(filtered_list, charge_span_list):
243244
for atom in mol.vertices:
244245
if atom.radical_electrons and int(atom.sorting_label) not in rad_sorting_list:
245246
rad_sorting_list.append(int(atom.sorting_label))
246-
for atom2, bond in atom.bonds.items():
247+
for atom2, bond in atom.edges.items():
247248
# check if bond is multiple, store only from one side (atom1 < atom2) for consistency
248249
if atom2.sorting_label > atom.sorting_label and bond.is_double() or bond.is_triple():
249250
mul_bond_sorting_list.append((int(atom.sorting_label), int(atom2.sorting_label)))
@@ -278,7 +279,7 @@ def find_unique_sites_in_charged_list(mol, rad_sorting_list, mul_bond_sorting_li
278279
for atom in mol.vertices:
279280
if atom.radical_electrons and int(atom.sorting_label) not in rad_sorting_list:
280281
return [mol]
281-
for atom2, bond in atom.bonds.items():
282+
for atom2, bond in atom.edges.items():
282283
if (atom2.sorting_label > atom.sorting_label and (bond.is_double() or bond.is_triple())
283284
and (int(atom.sorting_label), int(atom2.sorting_label)) not in mul_bond_sorting_list
284285
and not (atom.is_sulfur() and atom2.is_sulfur())):
@@ -295,13 +296,13 @@ def stabilize_charges_by_electronegativity(mol_list, allow_empty_list=False):
295296
to ``True`` and all structures in `mol_list` violate the electronegativity heuristic, the original `mol_list`
296297
is returned (examples: [C-]#[O+], CS, [NH+]#[C-], [OH+]=[N-], [C-][S+]=C violate this heuristic).
297298
"""
298-
indices_to_pop = []
299+
indices_to_pop = set()
299300
mol_list_copy = list(mol_list)
300301
for i, mol in enumerate(mol_list):
301302
electroneg_positively_charged_atoms = electroneg_negatively_charged_atoms = 0
302303
for atom in mol.vertices:
303304
if atom.charge > 0:
304-
electroneg_positively_charged_atoms += PeriodicSystem.electronegativity[atom.symbol] * abs(atom.charge)
305+
electroneg_positively_charged_atoms += PeriodicSystem.electronegativity[atom.element.symbol] * abs(atom.charge)
305306
if atom.is_oxygen():
306307
for atom2 in atom.edges.keys():
307308
if atom2.is_fluorine() and atom2.charge < 0:
@@ -312,13 +313,13 @@ def stabilize_charges_by_electronegativity(mol_list, allow_empty_list=False):
312313
# [C-]#[O+] and [O-][O+]=O, which are correct structures, also get penalized here, but that's OK
313314
# since they are still eventually selected as representative structures according to the rules here.
314315
elif atom.charge < 0:
315-
electroneg_negatively_charged_atoms += PeriodicSystem.electronegativity[atom.symbol] * abs(atom.charge)
316+
electroneg_negatively_charged_atoms += PeriodicSystem.electronegativity[atom.element.symbol] * abs(atom.charge)
316317
if electroneg_positively_charged_atoms > electroneg_negatively_charged_atoms:
317318
# Filter structures in which more electronegative atoms are positively charged.
318319
# This condition is NOT hermetic: It is possible to think of a situation where one structure has
319320
# several pairs of formally charged atoms, where one of the pairs isn't obeying the
320321
# electronegativity rule, while the sum of the pairs does.
321-
indices_to_pop.append(i)
322+
indices_to_pop.add(i)
322323
for i in reversed(range(len(mol_list))): # pop starting from the end, so indices won't change
323324
if i in indices_to_pop:
324325
mol_list.pop(i)
@@ -333,7 +334,7 @@ def stabilize_charges_by_proximity(mol_list):
333334
Only keep structures that obey the charge proximity rule.
334335
Opposite charges will be as close as possible to one another, and vice versa.
335336
"""
336-
indices_to_pop = []
337+
indices_to_pop = set()
337338
charge_distance_list = [] # indices match mol_list
338339
for i, mol in enumerate(mol_list):
339340
# Try finding well-defined pairs of formally-charged atoms to apply the proximity principle
@@ -352,17 +353,18 @@ def stabilize_charges_by_proximity(mol_list):
352353
cumulative_similar_charge_distance += len(find_shortest_path(atom1, atom2))
353354
charge_distance_list.append([cumulative_opposite_charge_distance,
354355
cumulative_similar_charge_distance])
355-
min_cumulative_opposite_charge_distance = min([distances[0] for distances in charge_distance_list]
356-
or [0]) # in Python 3 use `min(list, default=0)`
356+
min_cumulative_opposite_charge_distance = min((distances[0] for distances in charge_distance_list),
357+
default=0)
357358
for i, distances in enumerate(charge_distance_list):
358359
# after generating the charge_distance_list, iterate through it and mark structures to pop
359360
if distances[0] > min_cumulative_opposite_charge_distance:
360-
indices_to_pop.append(i)
361-
max_cumulative_similar_charge_distance = max([distances[1] for i, distances in
362-
enumerate(charge_distance_list) if i not in indices_to_pop] or [0])
361+
indices_to_pop.add(i)
362+
max_cumulative_similar_charge_distance = max((distances[1] for i, distances in
363+
enumerate(charge_distance_list) if i not in indices_to_pop),
364+
default=0)
363365
for i, distances in enumerate(charge_distance_list):
364366
if distances[0] < max_cumulative_similar_charge_distance:
365-
indices_to_pop.append(i)
367+
indices_to_pop.add(i)
366368
for i in reversed(range(len(mol_list))): # pop starting from the end, so indices won't change
367369
if i in indices_to_pop:
368370
mol_list.pop(i)
@@ -425,8 +427,11 @@ def mark_unreactive_structures(filtered_list, mol_list, save_order=False):
425427
# Make sure that the (first) original structure is always first in the list (unless it was filtered out).
426428
# Important whenever Species.molecule[0] is expected to be used (e.g., training reactions) after generating
427429
# resonance structures. However, if it was filtered out, it should be appended to the end of the list.
430+
# mol_list[0] is constant across iterations, so copy it once. is_isomorphic with save_order=False can
431+
# reorder atoms in either argument, so we still copy `filtered` per iteration to leave filtered_list intact.
432+
original = mol_list[0].copy(deep=True)
428433
for index, filtered in enumerate(filtered_list):
429-
if filtered.copy(deep=True).is_isomorphic(mol_list[0].copy(deep=True), save_order=save_order):
434+
if filtered.copy(deep=True).is_isomorphic(original, save_order=save_order):
430435
filtered_list.insert(0, filtered_list.pop(index))
431436
break
432437
else:

0 commit comments

Comments
 (0)