AbitAssistant_Bot/app/services/parser.py at master · OlexiyOdarchuk/AbitAssistant_Bot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# Copyright (c) 2025 iShawyha. All rights reserved.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

import json
import re
from typing import Optional

import aiohttp
from bs4 import BeautifulSoup

from config import API_URL
from app.services.logger import log_parsing_action, log_parsing_step, log_error

# Step by which the `last` offset sent to the API is advanced after every
# successfully fetched chunk of requests (see the paging loop in `parser`).
# NOTE(review): presumably equals the number of records the API returns per
# chunk — confirm against the API.
PAGE_SIZE = 500


async def create_payload(url: str, last: int, tg_id: int) -> dict:
    """Build the request parameters used to ask the API for a JSON link.

    The URL is split on "/" (surrounding slashes are stripped first) and three
    of its segments are reused: the last one as ``sid``, the second to last as
    ``uid`` and the fourth from the end, minus its first character, as the
    year ``y``.

    Args:
        url (str): Link to the specialty page that should be parsed.
        last (int): Offset of the applicant to start from; sent as a string.
        tg_id (int): Telegram user ID, used only for logging.

    Returns:
        dict: Parameters ``action``, ``y``, ``sid``, ``uid`` and ``last``.

    Raises:
        ValueError: If the year segment is not purely numeric.
        IndexError: If the URL has too few path segments.
        Any error is logged through ``log_error`` and then re-raised.
    """
    try:
        segments = url.strip("/").split("/")
        year_segment, uid, sid = segments[-4], segments[-2], segments[-1]
        year = year_segment[1:]  # drop the leading letter: y2025 -> 2025

        if year.isdigit():
            # Key order matters: the payload is logged via str().
            payload = dict(
                action="requests",
                y=year,
                sid=sid,
                uid=uid,
                last=str(last),
            )
            log_parsing_step(tg_id, "Payload created", details=str(payload))
            return payload

        raise ValueError(f"Could not extract year from URL: {url}")
    except Exception as exc:
        log_error(exc, f"[User {tg_id}] Error creating payload")
        raise


async def get_url_json(
    session: aiohttp.ClientSession, payload: dict, tg_id: int
) -> Optional[str]:
    """Ask the API for the link to the JSON file with applicant data.

    The payload is POSTed to ``API_URL`` twice and only the reply to the last
    attempt is used: per the original author's note, the first request returns
    nothing for some reason.

    Args:
        session (aiohttp.ClientSession): Open aiohttp session.
        payload (dict): Request parameters (see ``create_payload``).
        tg_id (int): Telegram user ID, used only for logging.

    Returns:
        Optional[str]: The value of the ``url`` field of the JSON reply, or
        ``None`` when the reply is not a JSON object, has no non-empty ``url``
        field, or any error occurs (errors are logged, never raised).
    """
    try:
        json_url = None
        for _ in range(2):
            async with session.post(
                API_URL, data=payload, allow_redirects=True
            ) as resp:
                try:
                    json_url = await resp.json(content_type=None)
                    log_parsing_step(tg_id, "URL JSON received")
                except (aiohttp.ContentTypeError, ValueError):
                    # content_type=None disables the MIME check, so a non-JSON
                    # body surfaces as json.JSONDecodeError (a ValueError).
                    # Catch it here so a bad first reply cannot cancel the
                    # second attempt.
                    text = await resp.text()
                    log_parsing_step(tg_id, "ContentTypeError", details=text)
                    json_url = None

        # An empty body decodes to None and a JSON array/scalar has no .get().
        result_url = json_url.get("url") if isinstance(json_url, dict) else None
        if not result_url:
            log_error("No URL", f"[User {tg_id}] Error getting URL JSON")
            return None
        return result_url
    except Exception as e:
        log_error(e, f"[User {tg_id}] Error getting URL JSON")
        return None


async def fetch_html(url: str, session, tg_id: int) -> str:
    """Download ``url`` with a GET request and return the response body as text.

    Any exception (including one raised while logging) is reported through
    ``log_error`` and turned into an empty string, so this never raises.
    """
    page_text = ""
    try:
        log_parsing_step(tg_id, "Fetching HTML", details=f"URL: {url}")
        async with session.get(url) as response:
            page_text = await response.text()
        log_parsing_step(tg_id, "HTML fetched successfully")
    except Exception as exc:
        log_error(exc, f"[User {tg_id}] Error fetching HTML from {url}")
        return ""
    return page_text


def parse_js_variable(js_text: str, var_name: str, tg_id: int):
    """Extract a JS object/array literal assigned to ``var_name`` and decode it.

    The first ``var_name = {...};`` or ``var_name = [...];`` assignment in
    ``js_text`` is located (the literal ends at the first ``};`` / ``];``).
    The literal is decoded as JSON as-is; only if that fails are JS comments
    removed and single quotes turned into double quotes before a second
    decoding attempt.

    Args:
        js_text (str): Concatenated JavaScript source to search in.
        var_name (str): Exact name of the variable to extract.
        tg_id (int): Telegram user ID, used only for logging.

    Returns:
        The decoded ``dict``/``list``, or ``None`` when the variable is not
        found or cannot be decoded (errors are logged, never raised).
    """
    try:
        log_parsing_step(tg_id, f"Parsing JS variable: {var_name}")
        # The lookbehind stops "subjects" from matching inside a longer
        # identifier such as "requests_subjects".
        pattern = re.compile(
            rf"(?<![\w$]){re.escape(var_name)}\s*=\s*(\{{.*?\}}|\[.*?\]);",
            re.DOTALL,
        )
        match = pattern.search(js_text)
        if match is None:
            log_parsing_step(tg_id, f"JS variable {var_name} not found")
            return None

        js_object = match.group(1)
        try:
            # Valid JSON must not be "normalised": the clean-up below would
            # corrupt apostrophes and "//" sequences inside string values.
            parsed = json.loads(js_object)
        except ValueError:
            # Fallback for JS-style literals: strip comments, unify quotes.
            cleaned = re.sub(r"//.*?\n|/\*.*?\*/", "", js_object, flags=re.S)
            cleaned = cleaned.replace("'", '"')
            parsed = json.loads(cleaned)

        log_parsing_step(tg_id, f"JS variable {var_name} parsed successfully")
        return parsed
    except Exception as e:
        log_error(e, f"[User {tg_id}] Error parsing JS variable {var_name}")
        return None


async def parse_program_data(url: str, session, tg_id: int) -> dict:
    """Fetch a program page and collect its static information into a dict.

    Every section is optional: a missing or unexpectedly shaped fragment only
    leaves its own key out instead of discarding everything parsed so far.

    Args:
        url (str): Link to the program page.
        session: aiohttp session passed through to ``fetch_html``.
        tg_id (int): Telegram user ID, used only for logging.

    Returns:
        dict: May contain ``university_name``, ``program_name``, ``spec_code``,
        ``program_info`` and ``volume``; always contains ``statuses``,
        ``rec_types`` and ``subjects_js`` (each possibly ``None``). An empty
        dict is returned if an unexpected error occurs (it is logged).
    """
    log_parsing_action(tg_id, "Parsing program data", details=f"URL: {url}")
    try:
        html_text = await fetch_html(url, session, tg_id)
        soup = BeautifulSoup(html_text, "html.parser")
        result = {}

        h2_tag = soup.select_one(".page-vnz-detail-title h2")
        if h2_tag:
            result["university_name"] = h2_tag.get_text(strip=True)
            log_parsing_step(
                tg_id, "University name parsed", details=result["university_name"]
            )

        h1_tag = soup.select_one(".page-vnz-detail-title h1")
        if h1_tag:
            text = h1_tag.get_text(" ", strip=True)
            # re.search returns None on a miss; store only what was found so a
            # differently worded heading does not abort the whole parse.
            program_match = re.search(r"Освітня програма: (.+?)\.", text)
            spec_match = re.search(r"Спеціальність: (\S+)\.", text)
            if program_match:
                result["program_name"] = program_match.group(1)
            if spec_match:
                result["spec_code"] = spec_match.group(1)
            log_parsing_step(
                tg_id,
                "Program name and spec code parsed",
                details=f"{result.get('program_name')} | {result.get('spec_code')}",
            )

        # Main program data: each <b> label is followed by its value node.
        program_block = soup.select_one(".block-pro-vnz .table-of-specs-item")
        if program_block:
            main_data = {}
            for b_tag in program_block.find_all("b"):
                key = b_tag.text.replace(":", "").strip()
                next_elem = b_tag.next_sibling
                if next_elem is None:
                    value = ""
                elif isinstance(next_elem, str):
                    # Text nodes are str subclasses.
                    value = next_elem.strip()
                else:
                    # A tag (possibly empty) — take its text content.
                    value = next_elem.get_text().strip()
                main_data[key] = value
            result["program_info"] = main_data
            log_parsing_step(tg_id, "Main program info parsed", details=str(main_data))

        # Specialty statistics (currently disabled)

        # stats_table = soup.select_one(".stats-vnz-table")
        # stats = {}
        # if stats_table:
        #   for tr in stats_table.find_all("tr"):
        #       cols = [td.get_text(strip=True) for td in tr.find_all("td")]
        #       if len(cols) == 2:
        #           stats[cols[0]] = cols[1]
        # result["stats"] = stats

        # Licensed admission volume: first block mentioning the heading text.
        volume_block = None
        for block in soup.select(".block-pro-vnz"):
            if "Ліцензований обсяг прийому" in block.get_text():
                volume_block = block
                break
        if volume_block:
            volume_data = {}
            for b_tag in volume_block.find_all("b"):
                label = b_tag.previous_sibling
                if not isinstance(label, str):
                    # No text label directly before this value — skip it.
                    continue
                key = label.strip().replace(":", "")
                volume_data[key] = b_tag.text.strip()
            result["volume"] = volume_data
            log_parsing_step(tg_id, "Volume data parsed", details=str(volume_data))

        # JS variables embedded in the page's inline <script> tags.
        scripts = soup.find_all("script")
        js_text = "\n".join(script.string or "" for script in scripts)

        result["statuses"] = parse_js_variable(js_text, "statuses", tg_id)
        result["rec_types"] = parse_js_variable(js_text, "rec_types", tg_id)
        result["subjects_js"] = parse_js_variable(js_text, "subjects", tg_id)

        log_parsing_step(tg_id, "All JS variables parsed")

        return result
    except Exception as e:
        log_error(e, f"[User {tg_id}] Error parsing program data")
        return {}


async def parser(url: str, tg_id: int) -> dict:
    """Parse a program page plus all of its request chunks into one big dict.

    The static page data comes from ``parse_program_data``. Request chunks are
    then fetched in a loop, advancing the offset by ``PAGE_SIZE`` each time,
    until no JSON link is returned or a chunk has an empty ``requests`` list.
    Chunks are accumulated under ``requests`` (list) and
    ``requests_subjects`` (dict).

    Returns the combined dict, or ``None`` if any exception occurs (the error
    is logged through ``log_error``).
    """
    try:
        log_parsing_action(tg_id, f"Started parsing for URL: {url}")

        async with aiohttp.ClientSession() as session:
            # Static information about the institution (can later be used
            # for statistics).
            data = await parse_program_data(url, session, tg_id)

            # Accumulators live inside ``data``; keep direct references.
            collected_requests = data.setdefault("requests", [])
            collected_subjects = data.setdefault("requests_subjects", {})

            offset = 0
            while True:
                payload = await create_payload(url, offset, tg_id)
                data_url = await get_url_json(session, payload, tg_id)
                if not data_url:
                    log_parsing_step(tg_id, "No data URL returned")
                    break

                log_parsing_step(tg_id, f"Fetching data from JSON URL: {data_url}")
                async with session.get(data_url) as resp:
                    chunk = await resp.json()

                batch = chunk.get("requests", [])
                log_parsing_step(tg_id, f"Received chunk with {len(batch)} requests")

                if not batch:
                    log_parsing_step(tg_id, "Empty requests list in chunk, stopping")
                    break

                collected_requests.extend(batch)
                collected_subjects.update(chunk.get("requests_subjects", {}))
                offset += PAGE_SIZE

            log_parsing_action(tg_id, "Parsing completed successfully")
            return data

    except Exception as exc:
        log_error(exc, f"[User {tg_id}] Error during parsing")
        return None