fix: split bullet-list emails Teams concatenates without separators

rofe · claude · rofe · commit 74a0fc437707 · 2026-05-13T09:39:53.000+02:00
Teams delivers bullet lists in activity.text as flat HTML-entity-encoded
text with the <li> boundaries removed, so "a@adobe.com" and "b@adobe.com"
arrive glued as "a@adobe.comb@adobe.com". The email regex then backtracks
into nonsense (TLD=.comb, etc).

Decode common HTML entities, then insert a space after a known common TLD
when it's immediately followed by a letter (only possible when two
emails were concatenated). Trade-off: addresses in less common TLDs that
share a prefix with a common one (e.g. .coffee) lose correctness, but
this covers the Adobe/customer corporate domains we actually see.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/parser.js b/src/parser.js
@@ -2,13 +2,30 @@ const EMAIL_RE = /[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}/g;
 
 const INTENT_WORDS = /\b(add|invite|include|onboard|grant|give\s+access|join)\b/i;
 
-// Strip <at>...</at> mentions entirely, then replace every remaining HTML
-// tag with a space so adjacent <li> / <a> / <p> boundaries become real
-// separators for the email regex.
+// Common TLDs used to separate emails that Teams flattens together when
+// users send bullet lists. We insert a space after these when they are
+// immediately followed by another letter (which can only happen when two
+// emails were concatenated).
+const COMMON_TLDS = 'com|org|net|edu|gov|mil|io|co|us|uk|de|fr|jp|cn|au|in|br|ca|me|tv|info|biz|app|dev|ai|cloud';
+const TLD_GLUE_RE = new RegExp(`\\.(${COMMON_TLDS})(?=[a-zA-Z])`, 'gi');
+
+const HTML_ENTITIES = { lt: '<', gt: '>', amp: '&', quot: '"', apos: "'", nbsp: ' ' };
+
+function decodeEntities(text) {
+  return text.replace(/&(lt|gt|amp|quot|apos|nbsp|#\d+);/gi, (m, name) => {
+    if (name.startsWith('#')) return String.fromCharCode(Number(name.slice(1)));
+    return HTML_ENTITIES[name.toLowerCase()] ?? m;
+  });
+}
+
+// Decode HTML entities, strip <at>...</at> mentions entirely, replace
+// every remaining HTML tag with a space, then split emails that Teams
+// concatenated together (no separator between bullet list items).
 function stripMarkup(text) {
-  return text
+  return decodeEntities(text)
     .replace(/<at[^>]*>.*?<\/at>/gi, ' ')
-    .replace(/<[^>]+>/g, ' ');
+    .replace(/<[^>]+>/g, ' ')
+    .replace(TLD_GLUE_RE, '.$1 ');
 }
 
 export function hasAddIntent(text) {
@@ -18,8 +35,8 @@ export function hasAddIntent(text) {
 
 /**
  * Extract de-duplicated email addresses from a Teams message.
- * Pulls emails from mailto: links first (so bullet lists that get
- * flattened don't lose their separators), then from the stripped text.
+ * Pulls emails from mailto: links first (when chips are present), then
+ * from the stripped text (with concatenated bullet-list emails split).
  */
 export function extractEmails(text) {
   if (!text) return [];