|
"""Teams format primitives — a lightweight, runtime-free subpath.

Port of ``packages/adapter-teams/src/format/index.ts`` (vercel/chat@4.31,
commit 8c71411), exposed upstream as ``@chat-adapter/teams/format``. Provides
runtime-free primitives for escaping/unescaping Teams text, building and
normalizing ``<at>`` mentions, and converting between Teams' restricted HTML
subset and Markdown-ish text — without the full Teams adapter, the
``microsoft_teams`` SDK, or the chat runtime.

This is intentionally distinct from :mod:`chat_sdk.adapters.teams.format_converter`,
which is a higher-level AST-based converter. The helpers here are low-level
string primitives that operate purely on text.

Importing this module never imports the ``microsoft_teams`` SDK, an HTTP
client, or the high-level :mod:`chat_sdk.adapters.teams.adapter` module. Emoji
placeholder conversion delegates to :mod:`chat_sdk.emoji` so the emoji map is
never duplicated here.

Python-specific hardening (divergence from upstream, see
``docs/UPSTREAM_SYNC.md``): :func:`markdown_to_teams_html` gates link hrefs
through an exact ``{http, https, mailto}`` protocol allowlist using
:func:`urllib.parse.urlparse` (port of the upstream ``URL().protocol`` check),
rejecting ``javascript:``, ``data:``, relative, and other unsafe hrefs so they
render as plain text rather than active links (SSRF / injection guard).
"""
| 26 | + |
| 27 | +from __future__ import annotations |
| 28 | + |
| 29 | +import re |
| 30 | +from urllib.parse import urlparse |
| 31 | + |
| 32 | +from chat_sdk.emoji import convert_emoji_placeholders |
| 33 | + |
# Public API of this subpath. Helpers and constants whose names start with an
# underscore are deliberately left out.
__all__ = [
    "convert_teams_emoji_placeholders",
    "escape_teams_text",
    "format_teams_mention",
    "markdown_to_teams_html",
    "safe_link_href",
    "teams_html_to_markdown",
    "teams_mention_to_plain_text",
    "unescape_teams_text",
]
| 44 | + |
# JS source patterns ported 1:1. The `gis` flags become DOTALL | IGNORECASE in
# Python; JS `g` (replace-all) is the default for `re.sub`/`str.replace`.
_HTML_ESCAPE_PATTERN = re.compile(r"[&<>\"]")
_MARKDOWN_LINK_PATTERN = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
_TEAMS_MENTION_PATTERN = re.compile(r"<at\b[^>]*>(.*?)</at>", re.DOTALL | re.IGNORECASE)
_TAG_PATTERN = re.compile(r"<[^>]+>")

# Entity emitted for each character matched by `_HTML_ESCAPE_PATTERN` above.
# Escaping is a single regex pass (one lookup per matched character), so the
# `&` that begins an emitted entity is never escaped a second time. Matches
# upstream `HTML_ESCAPES`.
_HTML_ESCAPES: dict[str, str] = {
    '"': "&quot;",
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
}

# Upstream `EMOJI_PLACEHOLDERS` maps Slack-style colon placeholders to unicode.
# Rather than re-declare the unicode (which would duplicate the emoji map), we
# map each upstream placeholder to its normalized name in :mod:`chat_sdk.emoji`
# and delegate the unicode lookup to that single source of truth.
_PLACEHOLDER_TO_NORMALIZED: dict[str, str] = {
    ":red_circle:": "red_circle",
    ":warning:": "warning",
    ":white_check_mark:": "check",
    ":x:": "x",
}

# Exact protocol allowlist for Markdown link hrefs (port of upstream
# `SAFE_LINK_PROTOCOLS`). `urlparse` lowercases the scheme and yields it
# without the trailing colon, so these are bare scheme names.
_SAFE_LINK_PROTOCOLS: frozenset[str] = frozenset({"http", "https", "mailto"})
| 76 | + |
| 77 | + |
def escape_teams_text(text: str) -> str:
    """Escape the Teams HTML control characters (``&``, ``<``, ``>``, ``"``).

    Call this on user-supplied text before any HTML tags are inserted, so a
    literal ``<`` in the input cannot forge markup.

    Every character matched by ``_HTML_ESCAPE_PATTERN`` is swapped for its
    ``_HTML_ESCAPES`` entry in one pass; a matched character with no entry is
    kept unchanged.
    """

    def _entity_for(match: re.Match[str]) -> str:
        char = match.group(0)
        return _HTML_ESCAPES.get(char, char)

    return _HTML_ESCAPE_PATTERN.sub(_entity_for, text)
| 85 | + |
| 86 | + |
def unescape_teams_text(text: str) -> str:
    """Reverse :func:`escape_teams_text`.

    Decodes exactly the four entities ``&quot;``, ``&lt;``, ``&gt;`` and
    ``&amp;``; any other entity is left untouched.

    ``&amp;`` is decoded last so that a doubly-escaped sequence such as
    ``&amp;lt;`` becomes the literal text ``&lt;`` instead of collapsing all
    the way down to ``<``.
    """
    return (
        text.replace("&quot;", '"')
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        # Must stay last: decoding `&amp;` earlier would create new entities
        # that the replacements above would then decode a second time.
        .replace("&amp;", "&")
    )
| 94 | + |
| 95 | + |
def format_teams_mention(name: str) -> str:
    """Build a Teams ``<at>`` mention tag around an escaped display name."""
    # Escape first so the display name cannot close the tag or add markup.
    safe_name = escape_teams_text(name)
    return f"<at>{safe_name}</at>"
| 99 | + |
| 100 | + |
def teams_mention_to_plain_text(text: str) -> str:
    """Rewrite every Teams ``<at>...</at>`` mention as plain ``@name`` text.

    The mention body has nested tags removed, surrounding whitespace trimmed,
    and Teams entities unescaped before the ``@`` prefix is added. Text
    outside mention tags is returned unchanged.
    """

    def _as_at_name(mention: re.Match[str]) -> str:
        inner = _strip_tags(mention.group(1))
        display_name = unescape_teams_text(inner.strip())
        return f"@{display_name}"

    return _TEAMS_MENTION_PATTERN.sub(_as_at_name, text)
| 109 | + |
| 110 | + |
def teams_html_to_markdown(html: str) -> str:
    """Convert Teams' restricted HTML subset to Markdown-ish text.

    Steps, in order:

    1. ``<at>`` mentions become ``@name`` (see
       :func:`teams_mention_to_plain_text`).
    2. Inline tags are rewritten: ``<strong>``/``<b>`` to ``**``,
       ``<em>``/``<i>`` to ``_``, ``<s>``/``<strike>`` to ``~~``,
       ``<code>`` to backticks, and ``<a href>`` to ``[label](href)``.
    3. ``<br>`` becomes a newline and a ``</p><p>`` boundary becomes a blank
       line.
    4. Every remaining tag is dropped.
    5. ``&nbsp;`` becomes a plain space, the Teams entities are unescaped,
       and the result is stripped of leading/trailing whitespace.
    """
    inline_flags = re.DOTALL | re.IGNORECASE
    # Applied top to bottom. The `\b` after each tag name keeps `<b` from
    # matching `<br>` and `<s` from matching `<strong>`/`<strike>`.
    inline_rules = (
        (r"<strong\b[^>]*>(.*?)</strong>", r"**\1**"),
        (r"<b\b[^>]*>(.*?)</b>", r"**\1**"),
        (r"<em\b[^>]*>(.*?)</em>", r"_\1_"),
        (r"<i\b[^>]*>(.*?)</i>", r"_\1_"),
        (r"<s\b[^>]*>(.*?)</s>", r"~~\1~~"),
        (r"<strike\b[^>]*>(.*?)</strike>", r"~~\1~~"),
        (r"<code\b[^>]*>(.*?)</code>", r"`\1`"),
        (r"<a\b[^>]*href=[\"']([^\"']+)[\"'][^>]*>(.*?)</a>", r"[\2](\1)"),
    )

    text = teams_mention_to_plain_text(html)
    for pattern, replacement in inline_rules:
        text = re.sub(pattern, replacement, text, flags=inline_flags)

    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"</p>\s*<p[^>]*>", "\n\n", text, flags=re.IGNORECASE)
    text = _TAG_PATTERN.sub("", text)
    # Runs before unescaping, so an escaped `&amp;nbsp;` survives as the
    # literal text `&nbsp;` rather than turning into a space.
    text = text.replace("&nbsp;", " ")
    return unescape_teams_text(text).strip()
| 132 | + |
| 133 | + |
def markdown_to_teams_html(markdown: str) -> str:
    """Convert Markdown-ish text to Teams' restricted HTML subset.

    The input is escaped *before* any tag insertion so user-supplied ``<``
    cannot forge HTML. Link hrefs are gated through :func:`safe_link_href`;
    unsafe hrefs render as plain label text.

    Supported syntax: ``**bold**``, ``_emphasis_``, ``~~strike~~``,
    ```code```, ``[label](href)`` links, and newlines (emitted as ``<br>``).

    Link hrefs are taken out of the text while the inline passes run, so
    Markdown markers that are part of a URL (for example the underscores in
    ``https://example.com/a_b_c``) are not rewritten into tags inside the
    ``href`` attribute. Link labels still receive inline formatting.

    NOTE(review): protecting hrefs this way presumably differs from the
    upstream ordering this module was ported from — confirm and record it
    alongside the other documented divergences.
    """
    escaped = convert_teams_emoji_placeholders(escape_teams_text(markdown))
    # NUL delimits the parking tokens below. Removing any NUL already in the
    # input means a token can only ever originate from `_park_href`.
    text = escaped.replace("\x00", "")

    parked_hrefs: list[str] = []

    def _park_href(match: re.Match[str]) -> str:
        label, href = match.group(1), match.group(2)
        if not safe_link_href(href):
            # Unsafe link: keep only the label, drop the href entirely.
            return label
        parked_hrefs.append(href)
        return f"[{label}](\x00{len(parked_hrefs) - 1}\x00)"

    text = _MARKDOWN_LINK_PATTERN.sub(_park_href, text)

    # Inline passes. Parking tokens hold only NUL and digits, so none of
    # these patterns can match inside one.
    text = re.sub(r"\*\*(.*?)\*\*", r"<strong>\1</strong>", text)
    text = re.sub(r"_(.*?)_", r"<em>\1</em>", text)
    text = re.sub(r"~~(.*?)~~", r"<s>\1</s>", text)
    text = re.sub(r"`([^`]+)`", r"<code>\1</code>", text)

    def _restore_link(match: re.Match[str]) -> str:
        label, index = match.group(1), int(match.group(2))
        return f'<a href="{parked_hrefs[index]}">{label}</a>'

    text = re.sub(r"\[([^\]]+)\]\(\x00(\d+)\x00\)", _restore_link, text)
    return text.replace("\n", "<br>")
| 153 | + |
| 154 | + |
def convert_teams_emoji_placeholders(text: str) -> str:
    """Convert Teams' Slack-style colon emoji placeholders to unicode.

    For every entry in ``_PLACEHOLDER_TO_NORMALIZED`` the colon placeholder
    (``:white_check_mark:`` etc.) is replaced with whatever
    :func:`chat_sdk.emoji.convert_emoji_placeholders` renders for the matching
    ``{{emoji:<name>}}`` SDK placeholder on the ``"teams"`` platform. The
    emoji map itself lives only in :mod:`chat_sdk.emoji`.
    """
    result = text
    for colon_token, emoji_name in _PLACEHOLDER_TO_NORMALIZED.items():
        sdk_placeholder = f"{{{{emoji:{emoji_name}}}}}"
        rendered = convert_emoji_placeholders(sdk_placeholder, "teams")
        result = result.replace(colon_token, rendered)
    return result
| 170 | + |
| 171 | + |
def _strip_tags(text: str) -> str:
    """Remove every substring matched by ``_TAG_PATTERN`` from *text*."""
    return re.sub(_TAG_PATTERN, "", text)
| 174 | + |
| 175 | + |
def safe_link_href(href: str) -> bool:
    """Return ``True`` only for ``http``/``https``/``mailto`` hrefs.

    Port of the upstream ``safeLinkHref`` protocol check using
    :func:`urllib.parse.urlparse`. The parsed scheme must be a member of
    ``_SAFE_LINK_PROTOCOLS``; ``javascript:``, ``data:``, relative hrefs
    (empty scheme) and any other scheme are rejected (SSRF / injection
    guard). An href that ``urlparse`` refuses with ``ValueError`` is treated
    as unsafe.
    """
    try:
        scheme = urlparse(href).scheme
    except ValueError:
        return False
    return scheme in _SAFE_LINK_PROTOCOLS
0 commit comments