cppa-brain-backend/pinecone_rag/preprocessor/phabricator_preprocessort.py at 636c9f4bb4f1b3ce1df9313e675ff3c3dae4ec53 · CppDigest/cppa-brain-backend · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
Phabricator PR-like preprocessor for Pinecone RAG.

Reads markdown files under the configured data directory (default: data/github/Clang/phabricator/**) and builds one Document per file.
Expected markdown header:
- # D<number> <title> [Open|Closed]
- > Username: <author>
- > Created at: <date text>
- > Url: https://reviews.llvm.org/D<number>
"""

import logging
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

from langchain_core.documents import Document

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches the title line, e.g. "# D123 Some title [Closed]", and captures the
# revision number, the title text and the bracketed state as named groups.
_HEADER_RE = re.compile(
    r"^#\s*D(?P<number>\d+)\s+(?P<title>.+?)\s+\[(?P<state>[^\]]+)\]\s*$"
)
# Quoted "> Key: value" lines. re.MULTILINE makes ^ and $ anchor at every line,
# so these patterns can be run against the full file text rather than line by line.
_USERNAME_RE = re.compile(r"^>\s*Username:\s*(.+?)\s*$", re.MULTILINE)
_CREATED_AT_RE = re.compile(r"^>\s*Created at:\s*(.+?)\s*$", re.MULTILINE)
_URL_RE = re.compile(r"^>\s*Url:\s*(https?://\S+)\s*$", re.MULTILINE)
# Matches a "## Comment <n>" heading.
# NOTE(review): not referenced by any code visible in this file - confirm it is still needed.
_COMMENT_RE = re.compile(r"^##\s*Comment\s+\d+", re.MULTILINE)

# Lower-cased header states for which _extract_metadata fills in "closed_at".
_CLOSED_STATES = {"closed", "abandoned", "merged"}


def _is_valid_content(text: str, min_length: int) -> bool:
    """Tell whether *text* is long enough to be worth indexing.

    Leading and trailing whitespace is not counted towards the length.

    Args:
        text: Candidate document body; ``None`` or ``""`` is always rejected.
        min_length: Minimum number of characters required after stripping.

    Returns:
        ``True`` when *text* is non-empty and its stripped length is at
        least *min_length*, otherwise ``False``.
    """
    # Falsy input (None / empty string) is rejected regardless of min_length.
    if not text:
        return False
    stripped_length = len(text.strip())
    return stripped_length >= min_length


def _parse_created_at_to_timestamp(value: str) -> float:
    """Convert a "Created at" date string into a POSIX timestamp.

    Two layouts are accepted: ``Jan 18 2023, 5:56 PM`` (12-hour clock) and
    ``Jan 18 2023, 17:56`` (24-hour clock). Whitespace around the value is
    ignored so that a stray space or newline does not discard the date.

    Args:
        value: Raw date text taken from a ``> Created at:`` line.

    Returns:
        Seconds since the epoch as a float, or ``0.0`` when *value* is empty,
        is not a string, or matches none of the accepted layouts.
    """
    # Best-effort parser: anything that is not text simply has no timestamp.
    if not isinstance(value, str):
        return 0.0

    text = value.strip()
    if not text:
        return 0.0

    layouts = (
        "%b %d %Y, %I:%M %p",  # Jan 18 2023, 5:56 PM
        "%b %d %Y, %H:%M",  # Jan 18 2023, 17:56
    )
    for layout in layouts:
        try:
            # NOTE: the parsed datetime is naive, so timestamp() interprets it
            # in the machine's local timezone (kept as-is so stored values do
            # not shift).
            return datetime.strptime(text, layout).timestamp()
        except (ValueError, OverflowError, OSError):
            # ValueError: text does not fit this layout.
            # OverflowError/OSError: the platform cannot represent the date.
            continue
    return 0.0


def _extract_metadata(md_text: str, file_path: Path) -> Dict[str, Any]:
    """Build the metadata dictionary for one Phabricator markdown file.

    The first line is matched against ``_HEADER_RE`` to obtain the revision
    number, title and state. When it does not match, the number is ``-1``,
    the title falls back to the file stem and the state is empty. Author and
    URL come from the first ``> Username:`` / ``> Url:`` lines anywhere in
    the text; a missing URL is synthesised from the revision number when that
    number is positive.

    Args:
        md_text: Full markdown text of the file.
        file_path: Location of the file; only its stem is used, as the
            fallback title.

    Returns:
        A dict with the keys ``type``, ``number``, ``title``, ``url``,
        ``author``, ``state`` (lower-cased), ``state_reason``, ``created_at``,
        ``updated_at`` and ``closed_at``. Timestamps are floats, ``0.0`` when
        unknown.
    """
    all_lines = md_text.splitlines()
    heading = all_lines[0].strip() if all_lines else ""

    parsed_heading = _HEADER_RE.match(heading)
    if parsed_heading is None:
        number, title, state = -1, file_path.stem, ""
    else:
        number = int(parsed_heading.group("number"))
        title = parsed_heading.group("title").strip()
        state = parsed_heading.group("state").strip()

    author_hit = _USERNAME_RE.search(md_text)
    author = "" if author_hit is None else author_hit.group(1).strip()

    url_hit = _URL_RE.search(md_text)
    url = "" if url_hit is None else url_hit.group(1).strip()
    if number > 0 and not url:
        url = f"https://reviews.llvm.org/D{number}"

    # Every "Created at:" line in the file is considered, in document order;
    # values that fail to parse (timestamp 0.0) are dropped.
    parsed_times = []
    for raw_value in _CREATED_AT_RE.findall(md_text):
        stamp = _parse_created_at_to_timestamp(raw_value.strip())
        if stamp > 0.0:
            parsed_times.append(stamp)

    if parsed_times:
        # The earliest-positioned parseable date is taken as the creation
        # time; the largest one is taken as the last activity.
        created_at = parsed_times[0]
        last_activity = max(parsed_times)
    else:
        created_at = 0.0
        last_activity = 0.0

    normalized_state = state.lower()
    # Only revisions in a closed-like state get a closing time.
    if normalized_state in _CLOSED_STATES:
        closed_at = last_activity
    else:
        closed_at = 0.0

    return {
        "type": "pr-phabricator",
        "number": number,
        "title": title,
        "url": url,
        "author": author,
        "state": normalized_state,
        "state_reason": "",
        "created_at": created_at,
        "updated_at": last_activity,
        "closed_at": closed_at,
    }


def _load_pr_document(md_path: Path, min_content_length: int) -> Optional[Document]:
    """Read one markdown file and wrap it in a ``Document``.

    The file is decoded as UTF-8 with undecodable bytes replaced, then
    stripped of surrounding whitespace.

    Args:
        md_path: Markdown file to read.
        min_content_length: Minimum stripped length for the file to be kept.

    Returns:
        A ``Document`` whose ``page_content`` is the stripped text and whose
        metadata comes from ``_extract_metadata``, or ``None`` when the file
        cannot be read or is too short (both cases are logged at debug level).
    """
    try:
        raw_text = md_path.read_text(encoding="utf-8", errors="replace")
    except OSError as exc:
        logger.debug("Skip %s: %s", md_path.name, exc)
        return None

    content = raw_text.strip()
    if _is_valid_content(content, min_content_length):
        return Document(
            page_content=content,
            metadata=_extract_metadata(content, md_path),
        )

    logger.debug("Skip %s: content too short", md_path.name)
    return None


class PhabricatorPrPreprocessor:
    """Turn a directory tree of Phabricator markdown files into Documents.

    Attributes are set directly from the constructor arguments:
    ``data_dir`` (as a ``Path``) and ``min_content_length``.
    """

    def __init__(
        self,
        data_dir: str = "data/github/Clang/phabricator",
        min_content_length: int = 10,
    ):
        """Remember where to look and how short a file may be.

        Args:
            data_dir: Root directory searched recursively for ``*.md`` files.
            min_content_length: Files whose stripped text is shorter than
                this are skipped.
        """
        self.min_content_length = min_content_length
        self.data_dir = Path(data_dir)

    def load_documents(self, limit: Optional[int] = None) -> List[Document]:
        """Load every ``*.md`` file below ``data_dir`` as a Document.

        Files are visited in sorted path order.

        Args:
            limit: When given, only the first ``limit`` paths are examined.
                The cut is applied before unreadable or too-short files are
                dropped, so fewer than ``limit`` documents may come back.

        Returns:
            The successfully loaded documents; an empty list when
            ``data_dir`` does not exist (a warning is logged in that case).
        """
        root = self.data_dir
        if not root.exists():
            logger.warning("Phabricator data dir does not exist: %s", root)
            return []

        candidates = sorted(root.rglob("*.md"))
        if limit is not None:
            candidates = candidates[:limit]

        # Files that could not be loaded come back as None and are filtered out.
        attempts = (
            _load_pr_document(path, self.min_content_length) for path in candidates
        )
        documents: List[Document] = [doc for doc in attempts if doc is not None]

        logger.info(
            "Loaded %d Phabricator PR documents from %s",
            len(documents),
            root,
        )
        return documents