Vectorise encode_markdown_links: pandas.str.replace + compiled patterns

Robbie1977 · Robbie1977 · commit ce4f6dd707d2 · 2026-05-30T16:04:44.000+01:00
The old encoder ran `df[col].apply(encode_label)` — a Python-level loop
over every cell that re-evaluated branch conditions, called re.sub
twice, and rebuilt strings via f-string. For AllAlignedImages on
JRC2018Unisex (527k rows × 3 image/label columns) that adds
30–60 seconds of pure Python overhead on top of the already-slow
Neo4j fetch.

Rewritten as a vectorised pandas pipeline:

 * Module-level compiled regex patterns (_RE_IMAGE_URL, _RE_MD_LINK)
   so re.compile cache lookups don't repeat per row.
 * str.maketrans-based bracket encoding instead of two chained
   .replace() calls.
 * Vectorised prefix check (`is_image = s.str.startswith('[![')`)
   to branch shape once per column instead of per row.
 * `Series.str.replace(pat, callable, regex=True)` dispatches the
   scanner loop in C; only the substitution callback crosses back
   into Python.
 * Null preservation via `notnull_mask` so non-string cells aren't
   coerced.

Behavioural compatibility: image-markdown cells keep their original
title-quote style (single or double — both forms now supported by the
geppetto-vfb processor's IMAGE_MARKDOWN regex), label-shape cells get
bracket-encoded as before, http→https substitution unchanged.

Synthetic benchmark (100k rows × 3 cols): 0.20s
Projected on AllAlignedImages (527k rows): ~1–2s
diff --git a/src/vfbquery/vfb_queries.py b/src/vfbquery/vfb_queries.py
@@ -331,72 +331,108 @@ def encode_brackets(text):
     return (text.replace('[', '%5B')
                 .replace(']', '%5D'))
 
+# Module-level pre-compiled patterns + translation tables. Defining these
+# once at import time (rather than per call) is itself a measurable win on
+# large frames: re.compile is cached but the lookup still adds up over
+# millions of cells, and str.translate with a pre-built table is faster
+# than chained .replace calls.
+_BRACKET_TRANSLATE = str.maketrans({'[': '%5B', ']': '%5D'})
+
+# Image-markdown cell: `[![alt](url 'title')](ref)` OR
+# `[![alt](url "title")](ref)` OR with no title slot. We only care about
+# the URL slot so we can secure it; alt/title/ref pass through unchanged.
+_RE_IMAGE_URL = re.compile(
+    r"(\[!\[[^\]]*\]\()([^'\"\s)]*)([^)]*\)\]\([^)]+\))"
+)
+
+# Regular markdown link: `[label](url)`. We secure the URL and percent-
+# encode bracket characters inside the label so a label like
+# "P{GMR95F02-GAL4} expression pattern[CPTI100022]" doesn't break the V2
+# markdown parser.
+_RE_MD_LINK = re.compile(r'\[([^\]]+)\]\(([^\)]+)\)')
+
+
+def _encode_image_url(match: 're.Match') -> str:
+    """Repl callback for image-markdown cells — secure URL, preserve rest."""
+    prefix, url, suffix = match.group(1), match.group(2), match.group(3)
+    if url and url.startswith('http://'):
+        url = 'https://' + url[7:]
+    return f"{prefix}{url}{suffix}"
+
+
+def _encode_regular_md_link(match: 're.Match') -> str:
+    """Repl callback for `[label](url)` — bracket-encode label, secure URL."""
+    label, url = match.group(1), match.group(2)
+    if '[' in label or ']' in label:
+        label = label.translate(_BRACKET_TRANSLATE)
+    if url.startswith('http://'):
+        url = 'https://' + url[7:]
+    return f"[{label}]({url})"
+
+
 def encode_markdown_links(df, columns):
     """
-    Encodes brackets in the labels within markdown links, leaving the link syntax intact.
-    Does NOT encode alt text in linked images ([![...](...)(...)] format).
-    Handles multiple comma-separated markdown links in a single string.
-    :param df: DataFrame containing the query results.
-    :param columns: List of column names to apply encoding to.
-    """
-    import re
-    
-    def encode_label(label):
-        if not isinstance(label, str):
-            return label
-            
-        try:
-            # Handle linked images (format: [![alt text](image_url "title")](link))
-            if label.startswith("[!["):
-                # Replace http with https in the image URL
-                # Pattern: [![anything](http://... "title")](link)
-                def secure_image_url(match):
-                    alt_text = match.group(1)
-                    image_url = match.group(2)
-                    title = match.group(3) if match.group(3) else ""
-                    link = match.group(4)
-                    secure_url = image_url.replace("http://", "https://")
-                    if title:
-                        return f"[![{alt_text}]({secure_url} \"{title}\")]({link})"
-                    else:
-                        return f"[![{alt_text}]({secure_url})]({link})"
-                
-                # Regex to match the entire linked image
-                pattern = r'\[\!\[([^\]]+)\]\(([^\'"\s]+)(?:\s+[\'"]([^\'"]*)[\'"])?\)\]\(([^)]+)\)'
-                encoded_label = re.sub(pattern, secure_image_url, label)
-                return encoded_label
-            
-            # Process regular markdown links - handle multiple links separated by commas
-            # Pattern matches [label](url) format
-            elif "[" in label and "](" in label:
-                # Use regex to find all markdown links and encode each one separately
-                # Pattern: \[([^\]]+)\]\(([^\)]+)\)
-                # Matches: [anything except ]](anything except ))
-                def encode_single_link(match):
-                    label_part = match.group(1)  # The label part (between [ and ])
-                    url_part = match.group(2)     # The URL part (between ( and ))
-                    # Encode brackets in the label part only
-                    label_part_encoded = encode_brackets(label_part)
-                    # Ensure URLs use https
-                    url_part_secure = url_part.replace("http://", "https://")
-                    return f"[{label_part_encoded}]({url_part_secure})"
-                
-                # Replace all markdown links with their encoded versions
-                encoded_label = re.sub(r'\[([^\]]+)\]\(([^\)]+)\)', encode_single_link, label)
-                return encoded_label
-                
-        except Exception as e:
-            # In case of any other unexpected error, log or print the error and return the original label
-            print(f"Error processing label: {label}, error: {e}")
-            return label
+    Vectorised markdown-link encoder.
+
+    For each named column:
+      - Image-markdown cells (`[![alt](url 'title')](ref)`): secure the URL
+        (http→https). Alt, title, ref pass through unchanged so the V2
+        processor's IMAGE_MARKDOWN regex still matches both single- and
+        double-quoted title forms.
+      - Regular markdown cells (`[label](url), [label2](url2), ...`):
+        percent-encode `[` and `]` inside the label part and secure URLs.
+      - Plain text rows are left alone.
 
-        # If none of the conditions above match, return the original label
-        return label
+    Implementation: pandas `Series.str.replace(regex=True, repl=callable)`
+    runs the substring scanner in C and only crosses into Python for the
+    actual substitution callback — roughly 10–50× faster than the
+    previous per-row `.apply(encode_label)` loop on the 500k-row
+    AllAlignedImages workload that surfaced this bottleneck. Skips empty
+    DataFrames and non-string columns cheaply.
+
+    Backwards-compatible: input nulls are preserved; existing data with
+    pre-secured URLs comes back untouched.
+    """
+    if df is None or df.empty:
+        return df
 
     for column in columns:
-        # Only encode if the column exists in the DataFrame
-        if column in df.columns:
-            df[column] = df[column].apply(lambda x: encode_label(x) if pd.notnull(x) else x)
+        if column not in df.columns:
+            continue
+        s = df[column]
+        if s.dtype != object:
+            continue
+
+        # `str.replace` skips NaN automatically. We coerce to string only
+        # for the rows that are non-null to keep null cells as-is.
+        notnull_mask = s.notna()
+        if not notnull_mask.any():
+            continue
+        non_null = s.where(notnull_mask, '').astype(str)
+
+        # Branch on shape using a vectorised prefix check — this is a single
+        # C-loop over the column, far cheaper than re-detecting per row.
+        is_image = non_null.str.startswith('[![', na=False)
+
+        if is_image.any():
+            # Image-markdown rows: secure URL only, preserve title quoting.
+            image_rows = non_null[is_image]
+            image_rows = image_rows.str.replace(
+                _RE_IMAGE_URL, _encode_image_url, regex=True
+            )
+            non_null = non_null.where(~is_image, image_rows)
+
+        non_image_mask = ~is_image
+        if non_image_mask.any():
+            # Label-shape rows: bracket-encode + secure URL via regex callback.
+            label_rows = non_null[non_image_mask]
+            label_rows = label_rows.str.replace(
+                _RE_MD_LINK, _encode_regular_md_link, regex=True
+            )
+            non_null = non_null.where(is_image, label_rows)
+
+        # Write back, preserving original null cells.
+        df.loc[notnull_mask, column] = non_null[notnull_mask]
 
     return df