sphinx-codelinks/src/sphinx_codelinks/analyse/utils.py at main · useblocks/sphinx-codelinks · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
from collections.abc import ByteString, Callable
import configparser
import logging
from pathlib import Path
from typing import TypedDict
from urllib.request import pathname2url

from giturlparse import parse  # type: ignore[import-untyped]
from tree_sitter import Language, Parser, Point, Query, QueryCursor
from tree_sitter import Node as TreeSitterNode

from sphinx_codelinks.config import UNIX_NEWLINE, CommentCategory
from sphinx_codelinks.source_discover.config import CommentType

# Language-specific node types for scope detection
# Maps a CommentType to the tree-sitter node type names that
# find_enclosing_scope() and find_next_scope() accept as a scope. Both helpers
# fall back to the CommentType.cpp entry when a comment type has no entry here.
SCOPE_NODE_TYPES = {
    # @Python Scope Node Types, IMPL_PY_2, impl, [FE_PY]
    CommentType.python: {"function_definition", "class_definition"},
    # @C and C++ Scope Node Types, IMPL_C_2, impl, [FE_C_SUPPORT, FE_CPP]
    CommentType.cpp: {"function_definition", "class_definition"},
    CommentType.cs: {"method_declaration", "class_declaration", "property_declaration"},
    # The YAML entry is not consulted by find_associated_scope(), which routes
    # YAML comments to find_yaml_associated_structure() instead.
    CommentType.yaml: {"block_mapping_pair", "block_sequence_item", "document"},
    # @Rust Scope Node Types, IMPL_RUST_2, impl, [FE_RUST];
    CommentType.rust: {
        "function_item",
        "struct_item",
        "enum_item",
        "impl_item",
        "trait_item",
        "mod_item",
    },
}

# initialize logger
# Module-level logger named after this module, emitting INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# log to the console
# The StreamHandler is attached at import time, so importing this module is
# enough to make its warnings visible without further logging configuration.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logger.addHandler(console)

# URL templates for linking to a line of a file on a Git hosting service.
# Keys are compared against the ``platform`` attribute of the parsed remote url
# in form_https_url(); the placeholders {owner}, {repo}, {rev}, {path} and
# {lineno} are filled there with str.format().
GIT_HOST_URL_TEMPLATE = {
    "github": "https://github.com/{owner}/{repo}/blob/{rev}/{path}#L{lineno}",
    "gitlab": "https://gitlab.com/{owner}/{repo}/-/blob/{rev}/{path}#L{lineno}",
}

# Tree-sitter query sources, one per supported language. init_tree_sitter()
# compiles the one matching the requested CommentType, and extract_comments()
# reads the nodes captured under the name "comment".
# The Python query additionally captures string expression statements located
# directly in a module or in the body block of a function or class.
PYTHON_QUERY = """
                ; Match comments
                (comment) @comment

                ; Match docstrings inside modules, functions, or classes
                (module (expression_statement (string)) @comment)
                (function_definition (block (expression_statement (string)) @comment))
                (class_definition (block (expression_statement (string)) @comment))
            """
CPP_QUERY = """(comment) @comment"""
C_SHARP_QUERY = """(comment) @comment"""
YAML_QUERY = """(comment) @comment"""
# Rust has two separate comment node types, so both are captured.
RUST_QUERY = """
    (line_comment) @comment
    (block_comment) @comment
"""


def is_text_file(filepath: Path, sample_size: int = 2048) -> bool:
    """Return True if file is likely text, False if binary.

    Only the first ``sample_size`` bytes are inspected. The sample counts as
    text when it contains no null byte and is valid UTF-8. A multi-byte
    character that is cut in half by the end of the sample is tolerated, as
    long as the file continues beyond the sample.

    :param filepath: File to probe.
    :param sample_size: Maximum number of leading bytes to read.
    :return: ``True`` for a likely text file, ``False`` otherwise.
    :raises OSError: If the file cannot be opened or read.
    """
    import codecs  # noqa: PLC0415

    with filepath.open("rb") as f:
        chunk = f.read(sample_size)
        # Peek one byte further: an incomplete trailing UTF-8 sequence is only
        # an error when the sample already reaches the end of the file.
        sample_is_whole_file = not f.read(1)

    # Quick binary heuristic: null byte present
    if b"\x00" in chunk:
        return False

    # The incremental decoder buffers a truncated trailing sequence instead of
    # rejecting it, unless it is told that no more input follows.
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        decoder.decode(chunk, final=sample_is_whole_file)
    except UnicodeDecodeError:
        return False
    return True


# @Tree-sitter parser initialization for multiple languages, IMPL_LANG_1, impl, [FE_C_SUPPORT, FE_CPP, FE_PY, FE_YAML, FE_RUST]
def init_tree_sitter(comment_type: CommentType) -> tuple[Parser, Query]:
    """Create the tree-sitter parser and comment query for a comment type.

    The grammar package is imported inside the matching branch, so only the
    grammar of the requested language has to be importable.

    :param comment_type: Language family whose grammar and query are wanted.
    :return: A parser for the language and the compiled comment query.
    :raises ValueError: If ``comment_type`` is not one of the handled types.
    """
    if comment_type == CommentType.cpp:
        import tree_sitter_cpp  # noqa: PLC0415

        grammar, query_source = tree_sitter_cpp.language(), CPP_QUERY
    elif comment_type == CommentType.python:
        import tree_sitter_python  # noqa: PLC0415

        grammar, query_source = tree_sitter_python.language(), PYTHON_QUERY
    elif comment_type == CommentType.cs:
        import tree_sitter_c_sharp  # noqa: PLC0415

        grammar, query_source = tree_sitter_c_sharp.language(), C_SHARP_QUERY
    elif comment_type == CommentType.yaml:
        import tree_sitter_yaml  # noqa: PLC0415

        grammar, query_source = tree_sitter_yaml.language(), YAML_QUERY
    elif comment_type == CommentType.rust:
        import tree_sitter_rust  # noqa: PLC0415

        grammar, query_source = tree_sitter_rust.language(), RUST_QUERY
    else:
        raise ValueError(f"Unsupported comment style: {comment_type}")

    # The same Language object backs both the query and the parser.
    parsed_language = Language(grammar)
    query = Query(parsed_language, query_source)
    return Parser(parsed_language), query


def wrap_read_callable_point(
    src_string: ByteString,
) -> Callable[[int, Point], ByteString]:
    """Build a read callback that serves ``src_string`` one byte per call.

    :param src_string: Source bytes the returned callback slices from.
    :return: A callable taking a byte offset and a point; the point is
        ignored and the single-byte slice starting at the offset is returned
        (an empty slice once the offset is past the end).
    """

    def read_single_byte(offset: int, _point: Point) -> ByteString:
        stop = offset + 1
        return src_string[offset:stop]

    return read_single_byte


# @Comment extraction from source code using tree-sitter, IMPL_EXTR_1, impl, [FE_DEF]
def extract_comments(
    src_string: ByteString, parser: Parser, query: Query
) -> list[TreeSitterNode] | None:
    """Parse the source with tree-sitter and collect its comment nodes.

    :param src_string: Source bytes to parse.
    :param parser: Parser for the language of the source.
    :param query: Query whose ``comment`` capture selects the wanted nodes.
    :return: The nodes captured as ``comment``, or ``None`` when the query
        produced no such capture.
    """
    tree = parser.parse(wrap_read_callable_point(src_string))
    captures: dict[str, list[TreeSitterNode]] = QueryCursor(query).captures(
        tree.root_node
    )
    return captures.get("comment")


def find_enclosing_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the closest scope node among ``node`` and its ancestors.

    :param node: Node to start from; it is itself the first candidate.
    :param comment_type: Selects the scope node types; unknown types use the
        C/C++ set.
    :return: The first candidate whose type is a scope type, or ``None`` when
        the chain of parents ends without a match.
    """
    scope_types = SCOPE_NODE_TYPES.get(comment_type, SCOPE_NODE_TYPES[CommentType.cpp])
    candidate: TreeSitterNode | None = node
    # Climb towards the root until a scope node is reached or parents run out.
    while candidate and candidate.type not in scope_types:
        candidate = candidate.parent
    return candidate if candidate else None


def find_next_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Return the first scope node found at or after ``node`` among its siblings.

    Each following named sibling is visited in order. A sibling of type
    ``block`` is additionally searched one level deep, and its first named
    child that is a scope node is returned.

    :param node: Node to start from; it is itself the first candidate.
    :param comment_type: Selects the scope node types; unknown types use the
        C/C++ set.
    :return: The matching scope node, or ``None`` when the siblings run out.
    """
    scope_types = SCOPE_NODE_TYPES.get(comment_type, SCOPE_NODE_TYPES[CommentType.cpp])
    candidate: TreeSitterNode | None = node
    while candidate:
        if candidate.type in scope_types:
            return candidate
        candidate = candidate.next_named_sibling
        if not candidate or candidate.type != "block":
            continue
        # Look inside the block before moving on to the sibling after it.
        nested_scope = next(
            (
                child
                for child in candidate.named_children
                if child.type in scope_types
            ),
            None,
        )
        if nested_scope is not None:
            return nested_scope
    return None


def _find_yaml_structure_in_block_node(
    block_node: TreeSitterNode,
) -> TreeSitterNode | None:
    """Return the first mapping pair or sequence item inside a ``block_node``.

    :param block_node: Node whose named children are searched.
    :return: The first ``block_mapping_pair`` of a ``block_mapping`` child or
        the first ``block_sequence_item`` of a ``block_sequence`` child,
        whichever comes first in child order; ``None`` when there is neither.
    """
    # Container node type -> type of the element wanted from that container.
    element_type_by_container = {
        "block_mapping": "block_mapping_pair",
        "block_sequence": "block_sequence_item",
    }
    for container in block_node.named_children:
        wanted_type = element_type_by_container.get(container.type)
        if wanted_type is None:
            continue
        for element in container.named_children:
            if element.type == wanted_type:
                return element
    return None


def find_yaml_next_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the first YAML structure element among the following siblings.

    A sibling that is itself a mapping pair, sequence item, flow mapping or
    flow sequence is returned directly. A ``block_node`` sibling, or the
    ``block_node`` children of a ``document`` sibling, are searched with
    :func:`_find_yaml_structure_in_block_node`.

    :param node: Comment node; the search starts at its next named sibling.
    :return: The structure element found, or ``None`` when no sibling yields one.
    """
    direct_types = {
        "block_mapping_pair",
        "block_sequence_item",
        "flow_mapping",
        "flow_sequence",
    }
    sibling = node.next_named_sibling
    while sibling:
        kind = sibling.type
        if kind in direct_types:
            return sibling

        # Collect the block nodes this sibling offers for a nested search.
        if kind == "document":
            block_nodes = [
                child for child in sibling.named_children if child.type == "block_node"
            ]
        elif kind == "block_node":
            block_nodes = [sibling]
        else:
            block_nodes = []

        for block_node in block_nodes:
            found = _find_yaml_structure_in_block_node(block_node)
            if found:
                return found
        sibling = sibling.next_named_sibling
    return None


def find_yaml_prev_sibling_on_same_row(node: TreeSitterNode) -> TreeSitterNode | None:
    """Return the preceding named sibling that ends on the comment's start row.

    Such a sibling marks the comment as an inline (trailing) comment.

    :param node: Comment node whose start row is the reference row.
    :return: The sibling ending on that row, or ``None`` if there is none.
    """
    target_row = node.start_point.row
    sibling = node.prev_named_sibling
    while sibling:
        end_row = sibling.end_point.row
        if end_row == target_row:
            return sibling
        if end_row < target_row:
            # Walking further back cannot reach the comment's row any more.
            return None
        sibling = sibling.prev_named_sibling
    return None


def find_yaml_associated_structure(node: TreeSitterNode) -> TreeSitterNode | None:
    """Find the YAML structure (key-value pair, list item, etc.) a comment belongs to.

    Candidates are tried in this order:

    1. a previous sibling ending on the comment's row (inline comment),
    2. the next structure element after the comment,
    3. the closest ancestor that is a mapping pair or sequence item.

    :param node: Comment node.
    :return: The associated structure node, or ``None`` when all three fail.
    """
    nearby = find_yaml_prev_sibling_on_same_row(node) or find_yaml_next_structure(
        node
    )
    if nearby:
        return nearby

    # Nothing beside or after the comment: fall back to what contains it.
    container_types = {"block_mapping_pair", "block_sequence_item"}
    ancestor = node.parent
    while ancestor and ancestor.type not in container_types:
        ancestor = ancestor.parent
    return ancestor if ancestor else None


def find_associated_scope(
    node: TreeSitterNode, comment_type: CommentType = CommentType.cpp
) -> TreeSitterNode | None:
    """Find the scope a comment node is associated with.

    :param node: Comment (or docstring) node.
    :param comment_type: Language family of the source the node comes from.
    :return: The associated node, or ``None`` when none is found.
    """
    # YAML uses its own structure association logic
    if comment_type == CommentType.yaml:
        return find_yaml_associated_structure(node)

    # Docstring nodes (Python only) sit inside the scope they document
    if node.type == CommentCategory.docstring:
        return find_enclosing_scope(node, comment_type)

    # Any other comment: prefer the scope that follows it, and fall back to
    # the scope containing it
    return find_next_scope(node, comment_type) or find_enclosing_scope(
        node, comment_type
    )


def locate_git_root(src_dir: Path) -> Path | None:
    """Traverse upwards to find git root."""
    current = src_dir.resolve()
    parents = list(current.parents)
    parents.append(current)
    for parent in parents:
        if (parent / ".git").exists() and (parent / ".git").is_dir():
            return parent
    logger.warning(f"git root is not found in the parent of {src_dir}")
    return None


def get_remote_url(git_root: Path, remote_name: str = "origin") -> str | None:
    """Get remote url from .git/config."""
    config_path = git_root / ".git" / "config"
    if not config_path.exists():
        logging.warning(f"{config_path} does not exist")
        return None

    config = configparser.ConfigParser(allow_no_value=True, strict=False)
    config.read(config_path)
    section = f'remote "{remote_name}"'
    if section in config and "url" in config[section]:
        url: str = config[section]["url"]
        return url
    logger.warning(f"remote-url is not found in {config_path}")
    return None


def get_current_rev(git_root: Path) -> str | None:
    """Get current commit rev from .git/HEAD."""
    head_path = git_root / ".git" / "HEAD"
    if not head_path.exists():
        logging.warning(f"{head_path} does not exist")
        return None
    head_content = head_path.read_text().strip()
    if not head_content.startswith("ref: "):
        logging.warning(f"Expect starting with 'ref: ' in {head_path}")
        return None

    ref_path = git_root / ".git" / head_content.split(":", 1)[1].strip()
    if not ref_path.exists():
        logging.warning(f"{ref_path} does not exist")
        return None
    return ref_path.read_text().strip()


def form_https_url(
    git_url: str, rev: str, project_path: Path, filepath: Path, lineno: int
) -> str | None:
    """Build a web url pointing at a line of a file on the Git host.

    :param git_url: Remote url of the repository.
    :param rev: Revision to embed in the url.
    :param project_path: Directory the path in the url is made relative to.
    :param filepath: File to link to; it must be located under ``project_path``.
    :param lineno: Line number used as the url anchor.
    :return: The formatted url for a host listed in ``GIT_HOST_URL_TEMPLATE``;
        for any other host a warning is logged and ``git_url`` is returned
        unchanged.
    :raises ValueError: If ``filepath`` is not located under ``project_path``.
    """
    parsed_url = parse(git_url)
    template = GIT_HOST_URL_TEMPLATE.get(parsed_url.platform)
    if not template:
        logger.warning(f"Unsupported Git host: {parsed_url.platform}")
        return git_url

    # Both sides are made absolute so a relative project_path compares
    # correctly with the absolute file path.
    relative_path = filepath.absolute().relative_to(Path(project_path).absolute())
    return template.format(
        owner=parsed_url.owner,
        repo=parsed_url.repo,
        rev=rev,
        path=pathname2url(str(relative_path)),
        lineno=str(lineno),
    )


def remove_leading_sequences(text: str, leading_sequences: list[str]) -> str:
    """Cut each line of ``text`` after its first matching leading sequence.

    For every line the sequences are tried in the given order. The first one
    that occurs anywhere in the line decides: everything up to and including
    its first occurrence is dropped. Lines containing none of the sequences
    are kept unchanged. Line endings are preserved.

    :param text: Text to process, possibly spanning several lines.
    :param leading_sequences: Sequences to look for, in priority order.
    :return: The processed lines joined back together.
    """

    def cut_line(line: str) -> str:
        for sequence in leading_sequences:
            position = line.find(sequence)
            if position != -1:
                return line[position + len(sequence) :]
        return line

    return "".join(cut_line(line) for line in text.splitlines(keepends=True))


class ExtractedRstType(TypedDict):
    """Result of :func:`extract_rst` for one rst block found in a comment."""

    # Extracted rst text; for a multi-line block it starts after the first
    # newline that follows the start marker
    rst_text: str
    # Number of lines, as counted by str.splitlines(), in the text that
    # precedes the start marker
    row_offset: int
    # Index in the searched text at which rst_text begins
    start_idx: int
    # Index in the searched text at which the last end marker begins
    end_idx: int


# @Extract reStructuredText blocks embedded in comments, IMPL_RST_1, impl, [FE_RST_EXTRACTION]
def extract_rst(
    text: str, start_marker: str, end_marker: str
) -> ExtractedRstType | None:
    """Extract rst from a comment.

    The rst text is whatever lies between the first occurrence of
    ``start_marker`` and the last occurrence of ``end_marker``.

    Two use cases:

    1. Start marker and end marker on the same line.

    The rst text is wrapped by the start and the end markers on the same
    line, so there is no need to remove leading chars.
    E.g.
    @rst  .. admonition:: title here @endrst

    2. Start marker and end marker on different lines.

    The rst text is expected to start on the line after the start marker;
    anything between the start marker and the first newline is skipped.
    E.g.
    @rst
    .. admonition:: title here
      :collapsible: open

      This example is collapsible, and initially open.
    @endrst

    :param text: Comment text to search.
    :param start_marker: Marker that opens the rst block.
    :param end_marker: Marker that closes the rst block.
    :return: The extracted text with its position info, or ``None`` when a
        marker is missing or only whitespace lies between the markers.
    """
    start_idx = text.find(start_marker)
    end_idx = text.rfind(end_marker)
    if -1 in (start_idx, end_idx):
        return None

    content_start = start_idx + len(start_marker)
    rst_text = text[content_start:end_idx]
    if not rst_text.strip():
        # empty string is out of the interest
        return None

    first_newline_idx = rst_text.find(UNIX_NEWLINE)
    if first_newline_idx != -1:
        # multiline rst text: drop the remainder of the start marker's line
        skipped = first_newline_idx + len(UNIX_NEWLINE)
        rst_text = rst_text[skipped:]
        content_start += skipped

    extracted: ExtractedRstType = {
        "rst_text": rst_text,
        "row_offset": len(text[:start_idx].splitlines()),
        "start_idx": content_start,
        "end_idx": end_idx,
    }
    return extracted