Skip to content

Commit d451eb2

Browse files
authored
Merge pull request #98 from wafflestudio/fix/events
fix: 새로운 포맷 대응
2 parents d365c2a + 568d86a commit d451eb2

3 files changed

Lines changed: 78 additions & 33 deletions

File tree

hangsha/batch/src/main/kotlin/com/team1/hangsha/batch/crawler/ExtraSnuCrawler.kt

Lines changed: 70 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ class ExtraSnuCrawler(
8484
var sessions = parseDetailSessions(html1)
8585
var mainHtml = parseMainContentHtml(html1, ociUploadService)
8686

87-
if (sessions.isEmpty()) {
87+
if (sessions.isEmpty()) { // fallback once
8888
val html2 = fetchDetailPageByPlaywright(dataSeq)
8989
if (html2 != null) {
9090
sessions = parseDetailSessions(html2)
@@ -140,20 +140,21 @@ class ExtraSnuCrawler(
140140
if (debug) println("\n[PW] goto(view) => $viewUrl")
141141

142142
// True when the given URL contains the "/wait.jsp" path segment.
fun isWait(u: String): Boolean = "/wait.jsp" in u
143-
fun isDetail(u: String) = u.contains("/ptfol/imng/icmpNsbjtPgm/findIcmpNsbjtPgmInfo.do")
143+
fun isDetail(u: String) =
144+
u.contains("/ptfol/imng/icmpNsbjtPgm/findIcmpNsbjtPgmInfo.do") ||
145+
u.contains("/ptfol/cous/staGrp/rcri/view.do")
144146

145147
val page = pwContext.newPage()
146148

147149
return try {
148-
// ✅ domcontentloaded 기다리지 말고 'commit'까지만 (응답만 받으면 됨)
149150
page.navigate(
150151
viewUrl,
151152
Page.NavigateOptions()
152153
.setWaitUntil(WaitUntilState.COMMIT)
153154
.setTimeout(20_000.0)
154155
)
155156

156-
val hardDeadlineMs = System.currentTimeMillis() + 120_000L // 총 2분까지 기다림
157+
val hardDeadlineMs = System.currentTimeMillis() + 120_000L
157158
var lastUrl = page.url()
158159

159160
while (System.currentTimeMillis() < hardDeadlineMs) {
@@ -165,50 +166,87 @@ class ExtraSnuCrawler(
165166
return null
166167
}
167168

168-
// 1) wait.jsp면: 서버가 대기열 처리 중. 재navigate 하지 말고 잠깐 기다림.
169169
if (isWait(curUrl)) {
170170
if (debug) println("[PW] wait.jsp... (sleep)")
171-
page.waitForTimeout(800.0 + Math.random() * 1200.0) // 0.8~2.0s
171+
page.waitForTimeout(800.0 + Math.random() * 1200.0)
172172
continue
173173
}
174174

175-
// 2) 최종 상세(findIcmp...)로 왔으면: 실제 기간 값이 들어올 때까지 기다렸다가 content
176175
if (isDetail(curUrl)) {
177-
runCatching {
178-
page.waitForFunction(
179-
"""
180-
() => {
181-
const ths = Array.from(document.querySelectorAll("th"));
182-
const th = ths.find(x => (x.textContent || "").includes("교육(활동)기간"));
183-
if (!th) return false;
184-
const td = th.nextElementSibling;
185-
if (!td) return false;
186-
const txt = (td.textContent || "").replace(/\s+/g, " ").trim();
187-
return /\d{4}\.\d{2}\.\d{2}\./.test(txt) && /\d{2}:\d{2}/.test(txt);
176+
val detailDeadlineMs = System.currentTimeMillis() + 10_000L
177+
178+
while (System.currentTimeMillis() < detailDeadlineMs) {
179+
val titles = runCatching {
180+
@Suppress("UNCHECKED_CAST")
181+
page.evalOnSelectorAll(
182+
"div.cont_box p.cont_tit",
183+
"els => els.map(el => (el.textContent || '').replace(/\\s+/g, ' ').trim())"
184+
) as List<String>
185+
}.getOrDefault(emptyList())
186+
187+
if (debug) println("[PW] titles=$titles")
188+
189+
// 아직 본문 골격이 안 뜬 상태
190+
if (titles.isEmpty()) {
191+
page.waitForTimeout(500.0)
192+
continue
188193
}
189-
""".trimIndent(),
190-
Page.WaitForFunctionOptions().setTimeout(10_000.0)
191-
)
192-
}.onFailure {
193-
runCatching {
194-
page.waitForLoadState(LoadState.NETWORKIDLE)
194+
195+
// 강좌 정보 자체가 없는 페이지면 더 기다리지 말고 그냥 반환
196+
if (!titles.contains("강좌 정보")) {
197+
val html = try {
198+
page.content()
199+
} catch (e: Exception) {
200+
page.waitForTimeout(500.0)
201+
page.content()
202+
}
203+
if (debug) println("[PW] OK detail(no lecture info) url=$curUrl htmlLen=${html.length}")
204+
return html
205+
}
206+
207+
// 강좌 정보가 있으면 교육(활동)기간 값이 실제로 채워질 때까지 조금 더 기다림
208+
val hasPeriod = runCatching {
209+
page.evaluate(
210+
"""
211+
() => {
212+
const ths = Array.from(document.querySelectorAll("th"));
213+
const th = ths.find(x => (x.textContent || "").includes("교육(활동)기간"));
214+
if (!th) return false;
215+
const td = th.nextElementSibling;
216+
if (!td) return false;
217+
const txt = (td.textContent || "").replace(/\s+/g, " ").trim();
218+
return /\d{4}\.\d{2}\.\d{2}\./.test(txt) && /\d{2}:\d{2}/.test(txt);
219+
}
220+
""".trimIndent()
221+
) as Boolean
222+
}.getOrDefault(false)
223+
224+
if (hasPeriod) {
225+
val html = try {
226+
page.content()
227+
} catch (e: Exception) {
228+
page.waitForTimeout(500.0 + Math.random() * 600.0)
229+
page.content()
230+
}
231+
if (debug) println("[PW] OK detail(with lecture info) url=$curUrl htmlLen=${html.length}")
232+
return html
195233
}
196-
page.waitForTimeout(1500.0)
234+
235+
page.waitForTimeout(500.0)
197236
}
198237

199-
// navigating 중 content() 터질 수 있어 방어
238+
// detail 페이지까지는 왔는데 10초 동안 period가 안 떴음
239+
// 그래도 HTML은 넘기고, 실제 판정은 Kotlin parse 쪽에서 처리
200240
val html = try {
201241
page.content()
202242
} catch (e: Exception) {
203-
page.waitForTimeout(500.0 + Math.random() * 600.0)
243+
page.waitForTimeout(500.0)
204244
page.content()
205245
}
206-
207-
if (debug) println("[PW] OK detail url=$curUrl htmlLen=${html.length}")
246+
if (debug) println("[PW] OK detail(timeout fallback) url=$curUrl htmlLen=${html.length}")
208247
return html
209248
}
210249

211-
// 3) 그 외 상태: 아직 view.do이거나 중간 이동 중
212250
page.waitForTimeout(400.0 + Math.random() * 600.0)
213251
}
214252

@@ -258,8 +296,8 @@ class ExtraSnuCrawler(
258296
label to value
259297
}.orEmpty()
260298

261-
val applyCount = counts["신청"]?.toIntOrNullSafe()
262-
val capacity = counts["정원"]?.toIntOrNullSafe()
299+
val applyCount = counts["신청"]?.toIntOrNullSafe() ?: 0
300+
val capacity = counts["정원"]?.toIntOrNullSafe() ?: 0
263301

264302
val imageUrl = card.selectFirst(".img_wrap img")
265303
?.absUrl("src")

hangsha/common/src/main/kotlin/com/team1/hangsha/event/repository/EventRepository.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import org.springframework.data.repository.query.Param
88
import java.time.LocalDateTime
99

1010
interface EventRepository : CrudRepository<Event, Long> {
11-
fun findByApplyLink(applyLink: String): Event?
11+
fun existsByApplyLink(applyLink: String): Boolean
1212

1313
@Query(
1414
"""

hangsha/common/src/main/kotlin/com/team1/hangsha/event/service/EventSyncService.kt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ class EventSyncService(
5353
val applyEnd = e.applyEnd?.let { dateEnd(it) }
5454

5555
val sessions = patchSessionTimesFromMainContent(e.detailSessions, e.mainContentHtml)
56+
val hasExistingForApplyLink = eventRepository.existsByApplyLink(applyLink)
5657

5758
data class UnitSpec(
5859
val eventStart: LocalDateTime?,
@@ -93,6 +94,12 @@ class EventSyncService(
9394
null
9495
}
9596

97+
val isAllDayFallbackPeriod = sessions.isEmpty()
98+
if (existing == null && hasExistingForApplyLink && isAllDayFallbackPeriod) {
99+
skipped++
100+
continue
101+
}
102+
96103
val cleanedTags = e.tags
97104
.asSequence()
98105
.map { it.trim() }

0 commit comments

Comments
 (0)