@@ -12,6 +12,8 @@ import okhttp3.Request
1212import org.jsoup.Jsoup
1313import org.jsoup.nodes.Element
1414import java.util.concurrent.TimeUnit
15+ import com.team1.hangsha.common.upload.OciUploadService
16+ import org.jsoup.parser.Parser
1517
1618class ExtraSnuCrawler (
1719 private val baseUrl : String = " https://extra.snu.ac.kr" ,
@@ -70,6 +72,7 @@ class ExtraSnuCrawler(
7072 */
7173 fun enrichDetails (
7274 events : List <ProgramEvent >,
75+ ociUploadService : OciUploadService ,
7376 shouldFetch : (ProgramEvent ) -> Boolean = { true }
7477 ): List <ProgramEvent > {
7578 return events.map { e ->
@@ -79,13 +82,13 @@ class ExtraSnuCrawler(
7982
8083 val html1 = fetchDetailPageByPlaywright(dataSeq) ? : return @map e
8184 var sessions = parseDetailSessions(html1)
82- var mainHtml = parseMainContentHtml(html1)
85+ var mainHtml = parseMainContentHtml(html1, ociUploadService )
8386
8487 if (sessions.isEmpty()) {
8588 val html2 = fetchDetailPageByPlaywright(dataSeq)
8689 if (html2 != null ) {
8790 sessions = parseDetailSessions(html2)
88- mainHtml = parseMainContentHtml(html2)
91+ mainHtml = parseMainContentHtml(html2, ociUploadService )
8992 }
9093 }
9194
@@ -336,30 +339,68 @@ class ExtraSnuCrawler(
336339
337340 /* *
338341 * ✅ "프로그램 주요내용" 섹션의 td_box HTML을 "그대로" 저장하되,
339- * - img, a 태그는 제거
340342 *
341343 * 반환값은 td_box의 innerHTML.
342344 * 없으면 null.
343345 */
344- private fun parseMainContentHtml (html : String ): String? {
346+ private fun parseMainContentHtml (html : String , ociUploadService : OciUploadService ): String? {
345347 val doc = Jsoup .parse(html, baseUrl)
346348
347- // cont_box 중에서 cont_tit == "프로그램 주요내용" 인 박스 찾기
348349 val box = doc.select(" div.cont_box" )
349350 .firstOrNull {
350351 it.selectFirst(" p.cont_tit" )?.text()?.normalize() == " 프로그램 주요내용"
351352 } ? : return null
352353
353354 val tdBox = box.selectFirst(" div.td_box" ) ? : return null
354355
355- // img, a 제거
356- tdBox.select(" img" ).remove()
357- tdBox.select(" a" ).remove()
356+ val cookieHeader = buildCookieHeader()
358357
359- // 혹시 script/style 섞이면 제거 (안전)
358+ tdBox.select(" img" ).forEach { img ->
359+ val rawSrc = img.absUrl(" src" ).ifBlank {
360+ val decoded = Parser .unescapeEntities(img.attr(" src" ), false )
361+ when {
362+ decoded.startsWith(" http://" ) || decoded.startsWith(" https://" ) -> decoded
363+ decoded.startsWith(" /" ) -> " $baseUrl$decoded "
364+ else -> " $baseUrl /$decoded "
365+ }
366+ }
367+
368+ if (rawSrc.isBlank()) {
369+ img.remove()
370+ return @forEach
371+ }
372+
373+ val downloaded = runCatching {
374+ downloadImage(rawSrc, cookieHeader)
375+ }.getOrNull()
376+
377+ if (downloaded == null ) {
378+ img.remove()
379+ return @forEach
380+ }
381+
382+ val uploadedUrl = runCatching {
383+ ociUploadService.uploadBytesIfAbsent(
384+ prefix = " events/detail" ,
385+ originalFilename = null ,
386+ bytes = downloaded.bytes,
387+ contentType = downloaded.contentType,
388+ )
389+ }.getOrNull()
390+
391+ if (uploadedUrl == null ) {
392+ img.remove()
393+ return @forEach
394+ }
395+
396+ img.attr(" src" , uploadedUrl)
397+ img.removeAttr(" onclick" )
398+ img.removeAttr(" usemap" )
399+ }
400+
401+ tdBox.select(" a" ).forEach { it.unwrap() }
360402 tdBox.select(" script, style" ).remove()
361403
362- // 빈 p 같은 거 정리(선택)
363404 tdBox.select(" p" ).forEach { p ->
364405 if (p.text().normalize().isBlank() && p.select(" br" ).isEmpty() && p.childrenSize() == 0 ) {
365406 p.remove()
@@ -370,6 +411,10 @@ class ExtraSnuCrawler(
370411 return out .ifBlank { null }
371412 }
372413
414+ // ---------------------------
415+ // helpers
416+ // ---------------------------
417+
373418 private fun tdAfterThContains (tr : Element , label : String ): String? {
374419 val th = tr.select(" th" )
375420 .firstOrNull { it.text().normalize().contains(label) }
@@ -379,15 +424,6 @@ class ExtraSnuCrawler(
379424 return td.text().normalize().takeIf { it.isNotBlank() }
380425 }
381426
382- // ---------------------------
383- // helpers
384- // ---------------------------
385-
386- private fun signatureOf (events : List <ProgramEvent >): String =
387- events.take(5 ).joinToString(" |" ) { e ->
388- " ${e.dataSeq} :${e.title} :${e.status} :${e.applyStart} :${e.applyEnd} "
389- }
390-
391427 private fun String.toIntOrNullSafe (): Int? {
392428 val cleaned = this .replace(" ," , " " ).trim()
393429 return cleaned.toIntOrNull()
@@ -483,4 +519,106 @@ class ExtraSnuCrawler(
483519 val html = fetchListPage(pageNo) ? : return emptyList()
484520 return parseListHtml(html)
485521 }
522+
/**
 * Re-hosts the image of every event through [ociUploadService].
 *
 * For each event that has a non-blank `imageUrl`, the image is fetched with [downloadImage]
 * (sending the cookies returned by [buildCookieHeader]) and then handed to
 * `uploadBytesIfAbsent` under the `events` prefix. The event is returned with `imageUrl`
 * replaced by the URL that the upload call returned.
 *
 * - Events whose `imageUrl` is null or blank are passed through untouched.
 * - If the download or the upload fails (exception or null result), the event is returned
 *   with `imageUrl = null`.
 * - If no cookies are available at all, nothing is downloaded and [events] is returned as-is.
 *
 * @param events events whose images should be re-hosted.
 * @param ociUploadService upload service that stores the downloaded bytes.
 * @return a list with one entry per input event, in the same order.
 */
fun uploadEventImages(
    events: List<ProgramEvent>,
    ociUploadService: OciUploadService,
): List<ProgramEvent> {
    val cookies = buildCookieHeader()
    if (cookies.isBlank()) {
        if (debug) println("[IMG] no playwright cookies; skip image upload")
        return events
    }

    val processed = ArrayList<ProgramEvent>(events.size)
    for (event in events) {
        val sourceUrl = event.imageUrl?.trim()
        if (sourceUrl.isNullOrBlank()) {
            // Nothing to re-host for this event.
            processed.add(event)
            continue
        }

        // Step 1: fetch the original image; any failure is logged (debug only) and treated as "no image".
        val image = try {
            downloadImage(sourceUrl, cookies)
        } catch (failure: Throwable) {
            if (debug) println("[IMG] download fail dataSeq=${event.dataSeq} msg=${failure.message}")
            null
        }
        if (image == null) {
            processed.add(event.copy(imageUrl = null))
            continue
        }

        // Step 2: push the bytes to object storage; failures are handled the same way as in step 1.
        val hostedUrl = try {
            ociUploadService.uploadBytesIfAbsent(
                prefix = "events",
                originalFilename = "event-${event.dataSeq ?: "unknown"}",
                bytes = image.bytes,
                contentType = image.contentType,
            )
        } catch (failure: Throwable) {
            if (debug) println("[IMG] upload fail dataSeq=${event.dataSeq} msg=${failure.message}")
            null
        }

        if (hostedUrl == null) {
            processed.add(event.copy(imageUrl = null))
        } else {
            if (debug) println("[IMG] uploaded dataSeq=${event.dataSeq} -> $hostedUrl")
            processed.add(event.copy(imageUrl = hostedUrl))
        }
    }
    return processed
}
567+
/**
 * Builds the value of a `Cookie` request header from the cookies currently held by [pwContext].
 *
 * @return `name=value` pairs joined with `"; "`; an empty string when there are no cookies.
 */
private fun buildCookieHeader(): String =
    pwContext.cookies()
        .map { cookie -> "${cookie.name}=${cookie.value}" }
        .joinToString(separator = "; ")
572+
/**
 * Downloads a single image with the crawler's HTTP [client].
 *
 * [rawUrl] may be absolute (`http://` / `https://`), protocol-relative (`//host/path`),
 * root-relative (`/path`) or a bare relative path. Everything that is not absolute is
 * resolved against [baseUrl].
 *
 * Exceptions raised while building the request or executing the call are not caught here;
 * callers are expected to handle them.
 *
 * @param rawUrl image location as found in the crawled page.
 * @param cookieHeader value for the `Cookie` request header; the header is omitted when blank.
 * @return the response bytes, the `Content-Type` reported by the server
 *   (`application/octet-stream` when absent) and a file extension derived from it,
 *   or `null` when the response is not successful or the body is missing or empty.
 */
private fun downloadImage(rawUrl: String, cookieHeader: String): DownloadedImage? {
    val origin = baseUrl.trimEnd('/')
    val absoluteUrl = when {
        rawUrl.startsWith("http://", ignoreCase = true) ||
            rawUrl.startsWith("https://", ignoreCase = true) -> rawUrl
        // Protocol-relative URL: it already names its own host, so only the scheme is borrowed
        // from baseUrl. Appending it to baseUrl would produce "https://base//other-host/...".
        rawUrl.startsWith("//") ->
            "${baseUrl.substringBefore("://", missingDelimiterValue = "https")}:$rawUrl"
        rawUrl.startsWith("/") -> "$origin$rawUrl"
        else -> "$origin/$rawUrl"
    }

    val req = Request.Builder()
        .url(absoluteUrl)
        .get()
        .header("Referer", "$baseUrl$listPath")
        .header("User-Agent", userAgent)
        .header("Accept", "image/avif,image/webp,image/apng,image/*,*/*;q=0.8")
        // Do not send an empty "Cookie:" header when there is no session to forward.
        .apply { if (cookieHeader.isNotBlank()) header("Cookie", cookieHeader) }
        .build()

    if (debug) println("[IMG] GET $absoluteUrl")

    client.newCall(req).execute().use { resp ->
        if (!resp.isSuccessful) {
            if (debug) println("[IMG] FAIL code=${resp.code} url=$absoluteUrl")
            return null
        }

        val body = resp.body ?: return null
        val bytes = body.bytes()
        if (bytes.isEmpty()) return null

        // The full header value (including parameters) is kept for the upload call.
        val contentType = body.contentType()?.toString() ?: "application/octet-stream"

        // The extension lookup must ignore parameters such as "; charset=UTF-8",
        // otherwise "image/jpeg;charset=UTF-8" would be classified as "bin".
        val mediaType = contentType.substringBefore(';').trim().lowercase()
        val extension = when (mediaType) {
            "image/jpeg", "image/jpg" -> "jpg"
            "image/png" -> "png"
            "image/gif" -> "gif"
            "image/webp" -> "webp"
            "image/bmp" -> "bmp"
            else -> "bin"
        }

        return DownloadedImage(
            bytes = bytes,
            contentType = contentType,
            extension = extension,
        )
    }
}
618+
/**
 * Result of a successful image download.
 *
 * `equals` and `hashCode` are overridden because the generated data-class versions compare
 * a [ByteArray] by reference; here two instances are equal when their bytes have the same content.
 *
 * @property bytes raw response body.
 * @property contentType `Content-Type` value reported for the download.
 * @property extension file extension derived from the content type (e.g. `jpg`, `png`, `bin`).
 */
private data class DownloadedImage(
    val bytes: ByteArray,
    val contentType: String,
    val extension: String,
) {
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (other !is DownloadedImage) return false
        return bytes.contentEquals(other.bytes) &&
            contentType == other.contentType &&
            extension == other.extension
    }

    override fun hashCode(): Int {
        var result = bytes.contentHashCode()
        result = 31 * result + contentType.hashCode()
        result = 31 * result + extension.hashCode()
        return result
    }
}
486624}
0 commit comments