CodeBoarding-MCP/utils/format.py at a2b2785951f5987517203fea2ca0fa8b71db0b83 · CodeBoarding/CodeBoarding-MCP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import re
import requests
from collections import defaultdict

# tiktoken is an optional dependency: if importing it fails, the module still
# loads and TOKEN_ENCODER is left as None. Only ImportError is handled here;
# any other error raised while building the encoder propagates.
try:
    import tiktoken
    # Encoder for the "cl100k_base" encoding, created once at import time.
    TOKEN_ENCODER = tiktoken.get_encoding("cl100k_base")
except ImportError:
    TOKEN_ENCODER = None  # Token counting unavailable without tiktoken


def replace_mermaid_blocks(markdown: str) -> str:
    """
    Swap every fenced ``mermaid`` block in *markdown* for a text summary.

    Each block body has its ``graph LR`` header lines and its
    ``click <id> href "url" "label"`` directives blanked out, and is then
    handed to ``format_mermaid_to_llm_markdown_no_links``; the returned
    summary replaces the whole fenced block. Text outside mermaid fences is
    returned untouched.
    """
    # Line-level noise removed (in this order) before the graph is parsed.
    noise_patterns = (
        r'^\s*graph\s+LR.*$',
        r'^\s*click\s+\w+\s+href\s+"[^"]+"\s+"[^"]+"',
    )

    def summarise_block(fence_match):
        body = fence_match.group(1)
        for noise in noise_patterns:
            body = re.sub(noise, '', body, flags=re.MULTILINE)
        return format_mermaid_to_llm_markdown_no_links(body)

    return re.sub(
        r"```mermaid\s*\n(.*?)```",
        summarise_block,
        markdown,
        flags=re.DOTALL | re.IGNORECASE,
    )


def format_mermaid_to_llm_markdown_no_links(mermaid_str: str) -> str:
    """
    Summarise a (cleaned) mermaid graph as a markdown bullet list.

    Nodes are taken from ``id["Display name"]`` declarations and edges from
    ``src -- "label" --> dst`` lines. The output starts with a
    ``**Core Components:**`` heading followed by one bullet per declared
    node (in declaration order); under each node, every outgoing edge is
    rendered as ``  <label>:`` followed by ``  - <target name>``. A target
    that has no declaration is shown by its raw id. Edges whose source id
    was never declared as a node are not rendered.

    Incoming edges are not listed: each edge appears once, under its source.

    Args:
      mermaid_str: mermaid graph source (header/click lines are ignored
                   because they match neither pattern).

    Returns:
      The bullet-list summary; just the heading plus a trailing newline when
      no nodes are found.
    """
    node_pat = re.compile(r'(\w+)\["(.+?)"\]')
    # dict() keeps the first position of a repeated id but its last label.
    nodes = dict(node_pat.findall(mermaid_str))

    edge_pat = re.compile(r'(\w+)\s+--\s+"(.+?)"\s+-->\s+(\w+)')
    outgoing = defaultdict(list)
    for src, label, dst in edge_pat.findall(mermaid_str):
        outgoing[src].append((label, dst))

    # NOTE: an earlier revision also built a reverse (incoming-edge) map and
    # tried to emit "<label> by:" entries, but its guard could never be true
    # (an edge src -> key always puts key among src's targets), so that code
    # never produced output. It is dropped here; the rendered text is the same.
    lines = ["**Core Components:**", ""]
    for key, name in nodes.items():
        lines.append(f"- {name}")
        for label, dst in outgoing.get(key, ()):
            lines.append(f"  {label}:")
            lines.append(f"  - {nodes.get(dst, dst)}")
        lines.append("")  # blank line separates components
    return "\n".join(lines)

def format_github_html_links_to_plaintext(
    markdown: str,
    inline_code: bool,
) -> str:
    """
    Converts GitHub HTML <a> links with line ranges into plaintext summaries,
    optionally embedding the code and token counts.

    Only links of the exact shape shown in the example are rewritten; any
    other text (including GitHub links without a ``#Lx-Ly`` range) is left
    untouched.

    Args:
      markdown: input markdown containing GitHub blob links.
      inline_code: if True, fetch & include the code snippet in a fenced block
                   (with optional token count). If False, only plaintext and
                   no network access is performed.

    Returns:
      The markdown with every matching link replaced.

    Fetching is best-effort: if the raw file cannot be downloaded, or the
    requested line range is empty, the link degrades to the plaintext
    summary. Each distinct file is requested at most once per call. The
    token count is appended only when TOKEN_ENCODER is available and the
    snippet can be encoded. The fence is always labelled ``python``,
    whatever the file's extension.

    Example:
      <a href="https://github.com/owner/repo/blob/main/path/to.py#L10-L20">`symbol` (10:20)</a>
    -->
      symbol (path/to.py: lines 10–20)
      ```python
      # code lines...
      ```
      [Token count: X]
    or, with inline_code=False:
      symbol (path/to.py: lines 10–20)
    """
    # Every URL component excludes '"' so a match can never run past the end
    # of one href attribute into following text or another link.
    pattern = re.compile(
        r'<a href="https://github\.com/(?P<owner>[^/"]+)/(?P<repo>[^/"]+)/blob/'
        r'(?P<branch>[^/"]+)/(?P<path>[^#"]+)#L(?P<start>\d+)-L(?P<end>\d+)"[^>]*>'
        r'`(?P<symbol>[^`]+)` \(\d+:\d+\)</a>'
    )

    # raw URL -> list of file lines, or None when the download failed.
    # Failures are cached too, so a dead URL costs one timeout, not one per link.
    file_cache = {}

    def fetch_lines(raw_url):
        if raw_url not in file_cache:
            try:
                resp = requests.get(raw_url, timeout=5)
                resp.raise_for_status()
                file_cache[raw_url] = resp.text.splitlines()
            except requests.RequestException:
                file_cache[raw_url] = None
        return file_cache[raw_url]

    def token_suffix(snippet):
        if not TOKEN_ENCODER:
            return ''
        try:
            # disallowed_special=() makes special-token text count as plain
            # text instead of raising.
            token_count = len(TOKEN_ENCODER.encode(snippet, disallowed_special=()))
        except Exception:
            # The count is optional decoration; never lose the snippet over it.
            return ''
        return f"\n[Token count: {token_count}]"

    def replacer(match):
        owner = match.group('owner')
        repo = match.group('repo')
        branch = match.group('branch')
        path = match.group('path')
        start = int(match.group('start'))
        end = int(match.group('end'))
        symbol = match.group('symbol')
        base_text = f"{symbol} ({path}: lines {start}–{end})"

        if not inline_code:
            # Return only plaintext summary
            return base_text

        raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path}"
        lines = fetch_lines(raw_url)
        if lines is None:
            return base_text

        # Link line numbers are 1-based and inclusive; clamp so a stray L0
        # cannot turn into a negative slice index.
        snippet = '\n'.join(lines[max(start - 1, 0):end])
        if not snippet:
            return base_text
        return f"{base_text}\n```python\n{snippet}\n```{token_suffix(snippet)}"

    return pattern.sub(replacer, markdown)