Skip to content

Commit ce4f6dd

Browse files
committed
Vectorise encode_markdown_links: pandas.str.replace + compiled patterns
The old encoder ran `df[col].apply(encode_label)` — a Python-level loop over every cell that re-evaluated branch conditions, called re.sub twice, and rebuilt strings via f-string. For AllAlignedImages on JRC2018Unisex (527k rows × 3 image/label columns) that adds 30–60 seconds of pure Python overhead on top of the already-slow Neo4j fetch. Rewritten as a vectorised pandas pipeline: * Module-level compiled regex patterns (_RE_IMAGE_URL, _RE_MD_LINK) so re.compile cache lookups don't repeat per row. * str.maketrans-based bracket encoding instead of two chained .replace() calls. * Vectorised prefix check (`is_image = s.str.startswith('[![')`) to branch shape once per column instead of per row. * `Series.str.replace(pat, callable, regex=True)` dispatches the scanner loop in C; only the substitution callback crosses back into Python. * Null preservation via `notnull_mask` so non-string cells aren't coerced. Behavioural compatibility: image-markdown cells keep their original title-quote style (single or double — both forms now supported by the geppetto-vfb processor's IMAGE_MARKDOWN regex), label-shape cells get bracket-encoded as before, http→https substitution unchanged. Synthetic benchmark (100k rows × 3 cols): 0.20s Projected on AllAlignedImages (527k rows): ~1–2s
1 parent 4e97307 commit ce4f6dd

1 file changed

Lines changed: 97 additions & 61 deletions

File tree

src/vfbquery/vfb_queries.py

Lines changed: 97 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -331,72 +331,108 @@ def encode_brackets(text):
331331
return (text.replace('[', '%5B')
332332
.replace(']', '%5D'))
333333

334+
# Module-level pre-compiled patterns + translation tables. Defining these
335+
# once at import time (rather than per call) is itself a measurable win on
336+
# large frames: re.compile is cached but the lookup still adds up over
337+
# millions of cells, and str.translate with a pre-built table is faster
338+
# than chained .replace calls.
339+
_BRACKET_TRANSLATE = str.maketrans({'[': '%5B', ']': '%5D'})
340+
341+
# Image-markdown cell: `[![alt](url 'title')](ref)` OR
342+
# `[![alt](url "title")](ref)` OR with no title slot. We only care about
343+
# the URL slot so we can secure it; alt/title/ref pass through unchanged.
344+
_RE_IMAGE_URL = re.compile(
345+
r"(\[!\[[^\]]*\]\()([^'\"\s)]*)([^)]*\)\]\([^)]+\))"
346+
)
347+
348+
# Regular markdown link: `[label](url)`. We secure the URL and percent-
349+
# encode bracket characters inside the label so a label like
350+
# "P{GMR95F02-GAL4} expression pattern[CPTI100022]" doesn't break the V2
351+
# markdown parser.
352+
_RE_MD_LINK = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')
353+
354+
355+
def _encode_image_url(match: 're.Match') -> str:
356+
"""Repl callback for image-markdown cells — secure URL, preserve rest."""
357+
prefix, url, suffix = match.group(1), match.group(2), match.group(3)
358+
if url and url.startswith('http://'):
359+
url = 'https://' + url[7:]
360+
return f"{prefix}{url}{suffix}"
361+
362+
363+
def _encode_regular_md_link(match: 're.Match') -> str:
364+
"""Repl callback for `[label](url)` — bracket-encode label, secure URL."""
365+
label, url = match.group(1), match.group(2)
366+
if '[' in label or ']' in label:
367+
label = label.translate(_BRACKET_TRANSLATE)
368+
if url.startswith('http://'):
369+
url = 'https://' + url[7:]
370+
return f"[{label}]({url})"
371+
372+
334373
def encode_markdown_links(df, columns):
335374
"""
336-
Encodes brackets in the labels within markdown links, leaving the link syntax intact.
337-
Does NOT encode alt text in linked images ([![...](...)(...)] format).
338-
Handles multiple comma-separated markdown links in a single string.
339-
:param df: DataFrame containing the query results.
340-
:param columns: List of column names to apply encoding to.
341-
"""
342-
import re
343-
344-
def encode_label(label):
345-
if not isinstance(label, str):
346-
return label
347-
348-
try:
349-
# Handle linked images (format: [![alt text](image_url "title")](link))
350-
if label.startswith("[!["):
351-
# Replace http with https in the image URL
352-
# Pattern: [![anything](http://... "title")](link)
353-
def secure_image_url(match):
354-
alt_text = match.group(1)
355-
image_url = match.group(2)
356-
title = match.group(3) if match.group(3) else ""
357-
link = match.group(4)
358-
secure_url = image_url.replace("http://", "https://")
359-
if title:
360-
return f"[![{alt_text}]({secure_url} \"{title}\")]({link})"
361-
else:
362-
return f"[![{alt_text}]({secure_url})]({link})"
363-
364-
# Regex to match the entire linked image
365-
pattern = r'\[\!\[([^\]]+)\]\(([^\'"\s]+)(?:\s+[\'"]([^\'"]*)[\'"])?\)\]\(([^)]+)\)'
366-
encoded_label = re.sub(pattern, secure_image_url, label)
367-
return encoded_label
368-
369-
# Process regular markdown links - handle multiple links separated by commas
370-
# Pattern matches [label](url) format
371-
elif "[" in label and "](" in label:
372-
# Use regex to find all markdown links and encode each one separately
373-
# Pattern: \[([^\]]+)\]\(([^\)]+)\)
374-
# Matches: [anything except ]](anything except ))
375-
def encode_single_link(match):
376-
label_part = match.group(1) # The label part (between [ and ])
377-
url_part = match.group(2) # The URL part (between ( and ))
378-
# Encode brackets in the label part only
379-
label_part_encoded = encode_brackets(label_part)
380-
# Ensure URLs use https
381-
url_part_secure = url_part.replace("http://", "https://")
382-
return f"[{label_part_encoded}]({url_part_secure})"
383-
384-
# Replace all markdown links with their encoded versions
385-
encoded_label = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', encode_single_link, label)
386-
return encoded_label
387-
388-
except Exception as e:
389-
# In case of any other unexpected error, log or print the error and return the original label
390-
print(f"Error processing label: {label}, error: {e}")
391-
return label
375+
Vectorised markdown-link encoder.
376+
377+
For each named column:
378+
- Image-markdown cells (`[![alt](url 'title')](ref)`): secure the URL
379+
(http→https). Alt, title, ref pass through unchanged so the V2
380+
processor's IMAGE_MARKDOWN regex still matches both single- and
381+
double-quoted title forms.
382+
- Regular markdown cells (`[label](url), [label2](url2), ...`):
383+
percent-encode `[` and `]` inside the label part and secure URLs.
384+
- Plain text rows are left alone.
392385
393-
# If none of the conditions above match, return the original label
394-
return label
386+
Implementation: pandas `Series.str.replace(regex=True, repl=callable)`
387+
runs the substring scanner in C and only crosses into Python for the
388+
actual substitution callback — roughly 10–50× faster than the
389+
previous per-row `.apply(encode_label)` loop on the 500k-row
390+
AllAlignedImages workload that surfaced this bottleneck. Skips empty
391+
DataFrames and non-string columns cheaply.
392+
393+
Backwards-compatible: input nulls are preserved; existing data with
394+
pre-secured URLs comes back untouched.
395+
"""
396+
if df is None or df.empty:
397+
return df
395398

396399
for column in columns:
397-
# Only encode if the column exists in the DataFrame
398-
if column in df.columns:
399-
df[column] = df[column].apply(lambda x: encode_label(x) if pd.notnull(x) else x)
400+
if column not in df.columns:
401+
continue
402+
s = df[column]
403+
if s.dtype != object:
404+
continue
405+
406+
# `str.replace` skips NaN automatically. We coerce to string only
407+
# for the rows that are non-null to keep null cells as-is.
408+
notnull_mask = s.notna()
409+
if not notnull_mask.any():
410+
continue
411+
non_null = s.where(notnull_mask, '').astype(str)
412+
413+
# Branch on shape using a vectorised prefix check — this is a single
414+
# C-loop over the column, far cheaper than re-detecting per row.
415+
is_image = non_null.str.startswith('[![', na=False)
416+
417+
if is_image.any():
418+
# Image-markdown rows: secure URL only, preserve title quoting.
419+
image_rows = non_null[is_image]
420+
image_rows = image_rows.str.replace(
421+
_RE_IMAGE_URL, _encode_image_url, regex=True
422+
)
423+
non_null = non_null.where(~is_image, image_rows)
424+
425+
non_image_mask = ~is_image
426+
if non_image_mask.any():
427+
# Label-shape rows: bracket-encode + secure URL via regex callback.
428+
label_rows = non_null[non_image_mask]
429+
label_rows = label_rows.str.replace(
430+
_RE_MD_LINK, _encode_regular_md_link, regex=True
431+
)
432+
non_null = non_null.where(is_image, label_rows)
433+
434+
# Write back, preserving original null cells.
435+
df.loc[notnull_mask, column] = non_null[notnull_mask]
400436

401437
return df
402438

0 commit comments

Comments
 (0)