Skip to content

Commit bcc5c93

Browse files
authored
Merge pull request #95 from wafflestudio/feat/image
feat: 행사 스냅샷 / 상세 html의 image -> 서버에 저장
2 parents 44a5948 + 63f0abe commit bcc5c93

5 files changed

Lines changed: 288 additions & 21 deletions

File tree

hangsha/batch/src/main/kotlin/com/team1/hangsha/batch/BatchApplication.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package com.team1.hangsha.batch
33
import com.team1.hangsha.config.DatabaseConfig
44
import com.team1.hangsha.config.TestValueLogger
55
import com.team1.hangsha.com.team1.hangsha.config.JacksonConfig
6+
import com.team1.hangsha.common.upload.OciUploadService
7+
import com.team1.hangsha.config.OciConfig
68
import com.team1.hangsha.event.service.EventSyncService
79
import org.springframework.boot.WebApplicationType
810
import org.springframework.boot.autoconfigure.SpringBootApplication
@@ -15,6 +17,8 @@ import org.springframework.context.annotation.Import
1517
JacksonConfig::class,
1618
EventSyncService::class,
1719
TestValueLogger::class,
20+
OciConfig::class,
21+
OciUploadService::class,
1822
) // for explicit bean import
1923
class BatchApplication
2024

hangsha/batch/src/main/kotlin/com/team1/hangsha/batch/crawler/ExtraSnuCrawler.kt

Lines changed: 157 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ import okhttp3.Request
1212
import org.jsoup.Jsoup
1313
import org.jsoup.nodes.Element
1414
import java.util.concurrent.TimeUnit
15+
import com.team1.hangsha.common.upload.OciUploadService
16+
import org.jsoup.parser.Parser
1517

1618
class ExtraSnuCrawler(
1719
private val baseUrl: String = "https://extra.snu.ac.kr",
@@ -70,6 +72,7 @@ class ExtraSnuCrawler(
7072
*/
7173
fun enrichDetails(
7274
events: List<ProgramEvent>,
75+
ociUploadService: OciUploadService,
7376
shouldFetch: (ProgramEvent) -> Boolean = { true }
7477
): List<ProgramEvent> {
7578
return events.map { e ->
@@ -79,13 +82,13 @@ class ExtraSnuCrawler(
7982

8083
val html1 = fetchDetailPageByPlaywright(dataSeq) ?: return@map e
8184
var sessions = parseDetailSessions(html1)
82-
var mainHtml = parseMainContentHtml(html1)
85+
var mainHtml = parseMainContentHtml(html1, ociUploadService)
8386

8487
if (sessions.isEmpty()) {
8588
val html2 = fetchDetailPageByPlaywright(dataSeq)
8689
if (html2 != null) {
8790
sessions = parseDetailSessions(html2)
88-
mainHtml = parseMainContentHtml(html2)
91+
mainHtml = parseMainContentHtml(html2, ociUploadService)
8992
}
9093
}
9194

@@ -336,30 +339,68 @@ class ExtraSnuCrawler(
336339

337340
/**
338341
* ✅ "프로그램 주요내용" 섹션의 td_box HTML을 "그대로" 저장하되,
339-
* - img, a 태그는 제거
340342
*
341343
* 반환값은 td_box의 innerHTML.
342344
* 없으면 null.
343345
*/
344-
private fun parseMainContentHtml(html: String): String? {
346+
private fun parseMainContentHtml(html: String, ociUploadService: OciUploadService): String? {
345347
val doc = Jsoup.parse(html, baseUrl)
346348

347-
// cont_box 중에서 cont_tit == "프로그램 주요내용" 인 박스 찾기
348349
val box = doc.select("div.cont_box")
349350
.firstOrNull {
350351
it.selectFirst("p.cont_tit")?.text()?.normalize() == "프로그램 주요내용"
351352
} ?: return null
352353

353354
val tdBox = box.selectFirst("div.td_box") ?: return null
354355

355-
// img, a 제거
356-
tdBox.select("img").remove()
357-
tdBox.select("a").remove()
356+
val cookieHeader = buildCookieHeader()
358357

359-
// 혹시 script/style 섞이면 제거 (안전)
358+
tdBox.select("img").forEach { img ->
359+
val rawSrc = img.absUrl("src").ifBlank {
360+
val decoded = Parser.unescapeEntities(img.attr("src"), false)
361+
when {
362+
decoded.startsWith("http://") || decoded.startsWith("https://") -> decoded
363+
decoded.startsWith("/") -> "$baseUrl$decoded"
364+
else -> "$baseUrl/$decoded"
365+
}
366+
}
367+
368+
if (rawSrc.isBlank()) {
369+
img.remove()
370+
return@forEach
371+
}
372+
373+
val downloaded = runCatching {
374+
downloadImage(rawSrc, cookieHeader)
375+
}.getOrNull()
376+
377+
if (downloaded == null) {
378+
img.remove()
379+
return@forEach
380+
}
381+
382+
val uploadedUrl = runCatching {
383+
ociUploadService.uploadBytesIfAbsent(
384+
prefix = "events/detail",
385+
originalFilename = null,
386+
bytes = downloaded.bytes,
387+
contentType = downloaded.contentType,
388+
)
389+
}.getOrNull()
390+
391+
if (uploadedUrl == null) {
392+
img.remove()
393+
return@forEach
394+
}
395+
396+
img.attr("src", uploadedUrl)
397+
img.removeAttr("onclick")
398+
img.removeAttr("usemap")
399+
}
400+
401+
tdBox.select("a").forEach { it.unwrap() }
360402
tdBox.select("script, style").remove()
361403

362-
// 빈 p 같은 거 정리(선택)
363404
tdBox.select("p").forEach { p ->
364405
if (p.text().normalize().isBlank() && p.select("br").isEmpty() && p.childrenSize() == 0) {
365406
p.remove()
@@ -370,6 +411,10 @@ class ExtraSnuCrawler(
370411
return out.ifBlank { null }
371412
}
372413

414+
// ---------------------------
415+
// helpers
416+
// ---------------------------
417+
373418
private fun tdAfterThContains(tr: Element, label: String): String? {
374419
val th = tr.select("th")
375420
.firstOrNull { it.text().normalize().contains(label) }
@@ -379,15 +424,6 @@ class ExtraSnuCrawler(
379424
return td.text().normalize().takeIf { it.isNotBlank() }
380425
}
381426

382-
// ---------------------------
383-
// helpers
384-
// ---------------------------
385-
386-
private fun signatureOf(events: List<ProgramEvent>): String =
387-
events.take(5).joinToString("|") { e ->
388-
"${e.dataSeq}:${e.title}:${e.status}:${e.applyStart}:${e.applyEnd}"
389-
}
390-
391427
private fun String.toIntOrNullSafe(): Int? {
392428
val cleaned = this.replace(",", "").trim()
393429
return cleaned.toIntOrNull()
@@ -483,4 +519,106 @@ class ExtraSnuCrawler(
483519
val html = fetchListPage(pageNo) ?: return emptyList()
484520
return parseListHtml(html)
485521
}
522+
523+
/**
 * Re-hosts the thumbnail image of every event.
 *
 * For each event that has a non-blank `imageUrl`, the image is downloaded with the
 * current Playwright cookies and handed to [OciUploadService.uploadBytesIfAbsent]
 * under the `events` prefix. The event is then copied with `imageUrl` set to the
 * value returned by the upload.
 *
 * Fallbacks:
 * - no cookies available: the input list is returned unchanged;
 * - event without an image URL: the event is kept as is;
 * - download or upload fails (returns null or throws): the event is copied with
 *   `imageUrl = null`.
 *
 * @param events events whose `imageUrl` should be re-hosted
 * @param ociUploadService uploader used to store the downloaded bytes
 * @return a list of the same size and order as [events]
 */
fun uploadEventImages(
    events: List<ProgramEvent>,
    ociUploadService: OciUploadService,
): List<ProgramEvent> {
    val cookies = buildCookieHeader()
    if (cookies.isBlank()) {
        if (debug) println("[IMG] no playwright cookies; skip image upload")
        return events
    }

    val processed = ArrayList<ProgramEvent>(events.size)
    for (event in events) {
        val sourceUrl = event.imageUrl?.trim()
        if (sourceUrl.isNullOrBlank()) {
            processed.add(event)
            continue
        }

        // Step 1: fetch the bytes; any failure is logged (debug only) and treated as "no image".
        val image = try {
            downloadImage(sourceUrl, cookies)
        } catch (t: Throwable) {
            if (debug) println("[IMG] download fail dataSeq=${event.dataSeq} msg=${t.message}")
            null
        }
        if (image == null) {
            processed.add(event.copy(imageUrl = null))
            continue
        }

        // Step 2: push the bytes to object storage; failures are handled the same way.
        val storedUrl = try {
            ociUploadService.uploadBytesIfAbsent(
                prefix = "events",
                originalFilename = "event-${event.dataSeq ?: "unknown"}",
                bytes = image.bytes,
                contentType = image.contentType,
            )
        } catch (t: Throwable) {
            if (debug) println("[IMG] upload fail dataSeq=${event.dataSeq} msg=${t.message}")
            null
        }

        if (storedUrl != null) {
            if (debug) println("[IMG] uploaded dataSeq=${event.dataSeq} -> $storedUrl")
            processed.add(event.copy(imageUrl = storedUrl))
        } else {
            processed.add(event.copy(imageUrl = null))
        }
    }
    return processed
}
567+
568+
/**
 * Serializes the cookies currently held by the Playwright context into a single
 * `Cookie` request-header value of the form `name1=value1; name2=value2`.
 *
 * @return the joined cookie string, or an empty string when the context has no cookies
 */
private fun buildCookieHeader(): String =
    pwContext.cookies().joinToString(separator = "; ") { cookie ->
        "${cookie.name}=${cookie.value}"
    }
572+
573+
/**
 * Downloads a single image over HTTP using the crawler's OkHttp client.
 *
 * @param rawUrl absolute (`http(s)://…`), protocol-relative (`//host/…`),
 *   root-relative (`/path`) or plain relative (`path`) URL; relative forms are
 *   resolved against `baseUrl`
 * @param cookieHeader value for the `Cookie` request header; the header is
 *   omitted entirely when this is blank
 * @return the downloaded bytes with their content type and a file extension
 *   derived from it, or `null` when the response is not successful, has no body,
 *   or has an empty body
 */
private fun downloadImage(rawUrl: String, cookieHeader: String): DownloadedImage? {
    val absoluteUrl = when {
        rawUrl.startsWith("http://") || rawUrl.startsWith("https://") -> rawUrl
        // Protocol-relative URL: reuse baseUrl's scheme instead of appending to baseUrl,
        // which would produce "https://host//other-host/...".
        rawUrl.startsWith("//") -> "${baseUrl.substringBefore("://", "https")}:$rawUrl"
        rawUrl.startsWith("/") -> "$baseUrl$rawUrl"
        else -> "$baseUrl/$rawUrl"
    }

    val req = Request.Builder()
        .url(absoluteUrl)
        .get()
        .header("Referer", "$baseUrl$listPath")
        .header("User-Agent", userAgent)
        .header("Accept", "image/avif,image/webp,image/apng,image/*,*/*;q=0.8")
        // Do not send an empty "Cookie:" header when no cookies are available.
        .apply { if (cookieHeader.isNotBlank()) header("Cookie", cookieHeader) }
        .build()

    if (debug) println("[IMG] GET $absoluteUrl")

    // `use` closes the response (and its body) on every exit path.
    return client.newCall(req).execute().use { resp ->
        if (!resp.isSuccessful) {
            if (debug) println("[IMG] FAIL code=${resp.code} url=$absoluteUrl")
            return null
        }

        val body = resp.body ?: return null
        val bytes = body.bytes()
        if (bytes.isEmpty()) return null

        // NOTE(review): a 200 response with a non-image content type (e.g. an HTML
        // login page) is still returned as-is; confirm whether it should be rejected.
        val contentType = body.contentType()?.toString() ?: "application/octet-stream"

        // Strip parameters such as "; charset=UTF-8" before matching, otherwise a
        // value like "image/jpeg; charset=UTF-8" would fall through to "bin".
        val mimeType = contentType.substringBefore(';').trim().lowercase()
        val extension = when (mimeType) {
            "image/jpeg", "image/jpg" -> "jpg"
            "image/png" -> "png"
            "image/gif" -> "gif"
            "image/webp" -> "webp"
            "image/bmp" -> "bmp"
            else -> "bin"
        }

        DownloadedImage(
            bytes = bytes,
            contentType = contentType,
            extension = extension,
        )
    }
}
618+
619+
/**
 * Result of a successful image download.
 *
 * `equals` and `hashCode` are overridden because the compiler-generated versions
 * for a data class compare a [ByteArray] property by reference, so two instances
 * holding identical bytes would otherwise never be equal.
 *
 * @property bytes raw response body
 * @property contentType content type string associated with the download
 * @property extension file extension associated with the download (e.g. `jpg`, `png`, `bin`)
 */
private data class DownloadedImage(
    val bytes: ByteArray,
    val contentType: String,
    val extension: String,
) {
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is DownloadedImage) return false
        return bytes.contentEquals(other.bytes) &&
            contentType == other.contentType &&
            extension == other.extension
    }

    override fun hashCode(): Int {
        var result = bytes.contentHashCode()
        result = 31 * result + contentType.hashCode()
        result = 31 * result + extension.hashCode()
        return result
    }
}
486624
}

hangsha/batch/src/main/kotlin/com/team1/hangsha/batch/job/ExtraSnuSyncRunner.kt

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package com.team1.hangsha.batch.job
33
import com.team1.hangsha.batch.crawler.DetailSession
44
import com.team1.hangsha.batch.crawler.ExtraSnuCrawler
55
import com.team1.hangsha.batch.crawler.ProgramEvent
6+
import com.team1.hangsha.common.upload.OciUploadService
67
import com.team1.hangsha.event.dto.core.CrawledDetailSession
78
import com.team1.hangsha.event.dto.core.CrawledProgramEvent
89
import com.team1.hangsha.event.service.EventSyncService
@@ -14,6 +15,7 @@ import kotlin.system.exitProcess
1415
@Component
1516
class ExtraSnuSyncRunner(
1617
private val eventSyncService: EventSyncService,
18+
private val ociUploadService: OciUploadService,
1719
) : ApplicationRunner {
1820

1921
override fun run(args: ApplicationArguments) {
@@ -43,10 +45,16 @@ class ExtraSnuSyncRunner(
4345
val events = if (!opt.withDetails) {
4446
baseEvents
4547
} else {
46-
crawler.enrichDetails(baseEvents) // { e -> e.status != "모집마감" } // @TODO: 위의 0001, 0002, ... 와 같이 매직 넘버라, ENUM화?
48+
crawler.enrichDetails(baseEvents, ociUploadService) // { e -> e.status != "모집마감" } // @TODO: 위의 0001, 0002, ... 와 같이 매직 넘버라, ENUM화?
4749
}
4850

49-
val result = eventSyncService.sync(events.map { it.toCrawledProgramEvent() })
51+
val eventsWithUploadedImages = if (!opt.withDetails) {
52+
events
53+
} else {
54+
crawler.uploadEventImages(events, ociUploadService)
55+
}
56+
57+
val result = eventSyncService.sync(eventsWithUploadedImages.map { it.toCrawledProgramEvent() })
5058

5159
totalUpserted += result.upserted
5260
totalCrawled += result.total

0 commit comments

Comments
 (0)