Skip to content

Commit 7757115

Browse files
authored
Merge pull request #106 from wafflestudio/fix/events
feat: sync events with large batch file
2 parents 46d131a + 8a7ccff commit 7757115

5 files changed

Lines changed: 519 additions & 26 deletions

File tree

hangsha/batch/src/main/kotlin/com/team1/hangsha/batch/crawler/ExtraSnuCrawler.kt

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import okhttp3.HttpUrl.Companion.toHttpUrl
1010
import okhttp3.OkHttpClient
1111
import okhttp3.Request
1212
import org.jsoup.Jsoup
13+
import org.jsoup.nodes.Document
1314
import org.jsoup.nodes.Element
1415
import java.util.concurrent.TimeUnit
1516
import com.team1.hangsha.common.upload.OciUploadService
@@ -81,19 +82,25 @@ class ExtraSnuCrawler(
8182
if (!shouldFetch(e)) return@map e
8283

8384
val html1 = fetchDetailPageByPlaywright(dataSeq) ?: return@map e
84-
var sessions = parseDetailSessions(html1)
85-
var mainHtml = parseMainContentHtml(html1, ociUploadService)
85+
var parsed = parseDetailData(
86+
html = html1,
87+
ociUploadService = ociUploadService,
88+
cookieHeader = buildCookieHeader()
89+
)
8690

87-
if (sessions.isEmpty()) { // fallback once
91+
if (parsed.sessions.isEmpty()) { // fallback once
8892
val html2 = fetchDetailPageByPlaywright(dataSeq)
8993
if (html2 != null) {
90-
sessions = parseDetailSessions(html2)
91-
mainHtml = parseMainContentHtml(html2, ociUploadService)
94+
parsed = parseDetailData(
95+
html = html2,
96+
ociUploadService = ociUploadService,
97+
cookieHeader = buildCookieHeader()
98+
)
9299
}
93100
}
94101

95102
if (delayMsBetweenDetails > 0) Thread.sleep(delayMsBetweenDetails)
96-
e.copy(detailSessions = sessions, mainContentHtml = mainHtml)
103+
e.copy(detailSessions = parsed.sessions, mainContentHtml = parsed.mainContentHtml)
97104
}
98105
}
99106

@@ -329,9 +336,7 @@ class ExtraSnuCrawler(
329336
// ---------------------------
330337
// ✅ 상세 페이지 -> DetailSession (start/end/time 평탄화, n회차, 우측 날짜 생략 처리)
331338
// ---------------------------
332-
private fun parseDetailSessions(html: String): List<DetailSession> {
333-
val doc = Jsoup.parse(html, baseUrl)
334-
339+
private fun parseDetailSessions(doc: Document): List<DetailSession> {
335340
val table = doc.select("table.table.t_view.add_tr")
336341
.firstOrNull { it.text().normalize().contains("교육(활동)기간") }
337342
?: return emptyList()
@@ -381,18 +386,18 @@ class ExtraSnuCrawler(
381386
* 반환값은 td_box의 innerHTML.
382387
* 없으면 null.
383388
*/
384-
private fun parseMainContentHtml(html: String, ociUploadService: OciUploadService): String? {
385-
val doc = Jsoup.parse(html, baseUrl)
386-
389+
private fun parseMainContentHtml(
390+
doc: Document,
391+
ociUploadService: OciUploadService,
392+
cookieHeader: String,
393+
): String? {
387394
val box = doc.select("div.cont_box")
388395
.firstOrNull {
389396
it.selectFirst("p.cont_tit")?.text()?.normalize() == "프로그램 주요내용"
390397
} ?: return null
391398

392399
val tdBox = box.selectFirst("div.td_box") ?: return null
393400

394-
val cookieHeader = buildCookieHeader()
395-
396401
tdBox.select("img").forEach { img ->
397402
val rawSrc = img.absUrl("src").ifBlank {
398403
val decoded = Parser.unescapeEntities(img.attr("src"), false)
@@ -449,6 +454,17 @@ class ExtraSnuCrawler(
449454
return out.ifBlank { null }
450455
}
451456

457+
/**
 * Parses a detail page's HTML once and extracts both kinds of detail data from it.
 *
 * The HTML is turned into a single Jsoup document (resolved against `baseUrl`), and that
 * same document is passed to [parseDetailSessions] and then to [parseMainContentHtml].
 *
 * @param html raw HTML of the detail page
 * @param ociUploadService forwarded unchanged to [parseMainContentHtml]
 * @param cookieHeader forwarded unchanged to [parseMainContentHtml]
 * @return the parsed sessions together with the main-content HTML, which may be null
 */
private fun parseDetailData(
    html: String,
    ociUploadService: OciUploadService,
    cookieHeader: String,
): ParsedDetailData {
    val document = Jsoup.parse(html, baseUrl)
    // Named arguments are evaluated in written order: sessions first, then main content.
    return ParsedDetailData(
        sessions = parseDetailSessions(document),
        mainContentHtml = parseMainContentHtml(document, ociUploadService, cookieHeader),
    )
}
467+
452468
// ---------------------------
453469
// helpers
454470
// ---------------------------
@@ -659,4 +675,9 @@ class ExtraSnuCrawler(
659675
val contentType: String,
660676
val extension: String,
661677
)
678+
679+
/**
 * Pair of values extracted from one parsed detail page.
 *
 * @property sessions detail sessions found on the page
 * @property mainContentHtml main-content HTML of the page, or null when none was extracted
 */
private data class ParsedDetailData(val sessions: List<DetailSession>, val mainContentHtml: String?)
662683
}

hangsha/batch/src/main/kotlin/com/team1/hangsha/batch/job/ExtraSnuSyncRunner.kt

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.team1.hangsha.batch.job
22

3+
import com.fasterxml.jackson.databind.ObjectMapper
34
import com.team1.hangsha.batch.crawler.DetailSession
45
import com.team1.hangsha.batch.crawler.ExtraSnuCrawler
56
import com.team1.hangsha.batch.crawler.ProgramEvent
@@ -10,12 +11,15 @@ import com.team1.hangsha.event.service.EventSyncService
1011
import org.springframework.boot.ApplicationArguments
1112
import org.springframework.boot.ApplicationRunner
1213
import org.springframework.stereotype.Component
14+
import java.nio.file.Files
15+
import java.nio.file.Path
1316
import kotlin.system.exitProcess
1417

1518
@Component
1619
class ExtraSnuSyncRunner(
1720
private val eventSyncService: EventSyncService,
1821
private val ociUploadService: OciUploadService,
22+
private val objectMapper: ObjectMapper,
1923
) : ApplicationRunner {
2024

2125
override fun run(args: ApplicationArguments) {
@@ -26,6 +30,7 @@ class ExtraSnuSyncRunner(
2630
var totalUpserted = 0
2731
var totalCrawled = 0
2832
var totalSkipped = 0
33+
val dumpBuffer = mutableListOf<CrawledProgramEvent>()
2934

3035
ExtraSnuCrawler(
3136
delayMsBetweenPages = opt.delayMs,
@@ -48,25 +53,46 @@ class ExtraSnuSyncRunner(
4853
crawler.enrichDetails(baseEvents, ociUploadService) // { e -> e.status != "모집마감" } // @TODO: 위의 0001, 0002, ... 와 같이 매직 넘버라, ENUM화?
4954
}
5055

51-
val eventsWithUploadedImages = if (!opt.withDetails) {
52-
events
53-
} else {
54-
crawler.uploadEventImages(events, ociUploadService)
56+
// dumpOnly 여부와 상관없이 이미지 업로드는 항상 수행한다.
57+
val eventsWithUploadedImages = crawler.uploadEventImages(events, ociUploadService)
58+
59+
val crawledEvents = eventsWithUploadedImages.map { it.toCrawledProgramEvent() }
60+
if (opt.outFile != null) {
61+
dumpBuffer += crawledEvents
5562
}
5663

57-
val result = eventSyncService.sync(eventsWithUploadedImages.map { it.toCrawledProgramEvent() })
64+
totalCrawled += crawledEvents.size
65+
if (opt.dumpOnly) {
66+
println("Page $page crawled: total=${crawledEvents.size}")
67+
continue
68+
}
5869

70+
val result = eventSyncService.sync(crawledEvents)
5971
totalUpserted += result.upserted
60-
totalCrawled += result.total
6172
totalSkipped += result.skipped
6273

6374
println("Page $page synced: upserted=${result.upserted}, total=${result.total}, skipped=${result.skipped}")
6475
}
6576
}
6677

67-
println("Synced $totalUpserted rows from $totalCrawled crawled events (skipped=$totalSkipped)")
78+
if (opt.outFile != null) {
79+
writeDumpFile(opt.outFile, dumpBuffer)
80+
println("Saved crawled events to ${opt.outFile} (count=${dumpBuffer.size})")
81+
}
82+
83+
if (opt.dumpOnly) {
84+
println("Crawled $totalCrawled rows (dump-only mode)")
85+
} else {
86+
println("Synced $totalUpserted rows from $totalCrawled crawled events (skipped=$totalSkipped)")
87+
}
6888
exitProcess(0)
6989
}
90+
91+
/**
 * Writes [rows] to [outFile] as pretty-printed JSON using the injected `objectMapper`.
 *
 * The given path is converted to an absolute, normalized path, and its parent directory
 * (when it has one) is created before writing.
 *
 * @param outFile destination file path
 * @param rows crawled events to serialize
 */
private fun writeDumpFile(outFile: String, rows: List<CrawledProgramEvent>) {
    val target = Path.of(outFile).toAbsolutePath().normalize()

    // A path without a parent (e.g. a filesystem root) needs no directory creation.
    val parentDir = target.parent
    if (parentDir != null) {
        Files.createDirectories(parentDir)
    }

    val prettyWriter = objectMapper.writerWithDefaultPrettyPrinter()
    prettyWriter.writeValue(target.toFile(), rows)
}
7096
}
7197

7298
private data class BatchArgs(
@@ -75,6 +101,8 @@ private data class BatchArgs(
75101
val delayMs: Long = 200,
76102
val withDetails: Boolean = true,
77103
val detailDelayMs: Long = 100,
104+
val outFile: String? = null,
105+
val dumpOnly: Boolean = false,
78106
) {
79107
companion object {
80108
fun from(args: ApplicationArguments): BatchArgs {
@@ -92,6 +120,8 @@ private data class BatchArgs(
92120
delayMs = single("delayMs")?.toLong() ?: 200L,
93121
withDetails = withDetails,
94122
detailDelayMs = single("detailDelayMs")?.toLong() ?: 100L,
123+
outFile = single("outFile"),
124+
dumpOnly = args.containsOption("dumpOnly"),
95125
)
96126
}
97127
}
@@ -124,4 +154,4 @@ private fun DetailSession.toCrawledDetailSession(): CrawledDetailSession =
124154
endDate = endDate,
125155
startTime = startTime,
126156
endTime = endTime
127-
)
157+
)

0 commit comments

Comments
 (0)