1010import pandas as pd
1111import numpy as np
1212import logging
13+ import requests
14+ from bs4 import BeautifulSoup
15+ import time
1316
1417from tensorflow .keras .preprocessing .sequence import pad_sequences
1518from finvizfinance .news import News
2326
2427logger = logging .getLogger (__name__ )
2528
# Keywords this short are matched as whole words (optionally pluralised with
# "s"). As raw substrings they fire inside unrelated words: 'ai' in "said",
# 'un' in "university", 'ev' in "revenue", 'art' in "start".
_SHORT_KEYWORD_MAX_LEN = 3

# (tag, keywords) pairs in priority order: the first tag with a hit wins,
# so economy outranks everything else.
_NEWS_TAG_KEYWORDS = (
    # Economy
    ("경제", (
        'economy', 'economic', 'market', 'inflation', 'deflation', 'fed', 'federal reserve',
        'rate cut', 'rate hike', 'interest rate', 'gdp', 'unemployment', 'employment',
        'consumer price', 'cpi', 'ppi', 'retail sales', 'gross domestic product',
        'recession', 'growth', 'monetary policy', 'fiscal policy', 'treasury', 'bond',
        'stock market', 'dow', 'nasdaq', 's&p', 'sp500', 'market wrap', 'markets',
    )),
    # Industry / companies
    ("산업", (
        'company', 'companies', 'corporate', 'earnings', 'revenue', 'profit', 'loss',
        'quarterly', 'q1', 'q2', 'q3', 'q4', 'ipo', 'merger', 'acquisition', 'deal',
        'ceo', 'cfo', 'executive', 'shareholder', 'dividend', 'stock', 'shares',
        'industry', 'sector', 'automotive', 'energy', 'oil', 'gas', 'retail',
        'manufacturing', 'banking', 'finance', 'insurance',
    )),
    # Technology
    ("기술", (
        'technology', 'tech', 'ai', 'artificial intelligence', 'machine learning', 'ml',
        'innovation', 'science', 'research', 'development', 'r&d', 'semiconductor',
        'chip', 'software', 'hardware', 'cloud', 'cyber', 'digital', 'data',
        'quantum', 'blockchain', 'crypto', 'bitcoin', 'ethereum', 'nft',
        'space', 'nasa', 'rocket', 'satellite', 'electric vehicle', 'ev', 'tesla',
    )),
    # Politics
    ("정치", (
        'politics', 'political', 'election', 'president', 'government', 'senate', 'congress',
        'democrat', 'republican', 'biden', 'trump', 'administration', 'policy', 'diplomacy',
        'international', 'trade war', 'sanctions', 'embargo', 'treaty', 'alliance', 'nato',
        'united nations', 'un', 'eu', 'brexit', 'geopolitical', 'foreign policy',
    )),
    # Health (this bucket also holds the sports keywords)
    ("건강", (
        'sports', 'sport', 'olympics', 'nfl', 'nba', 'mlb', 'soccer', 'football',
        'basketball', 'baseball', 'tennis', 'golf', 'health', 'medical', 'medicine',
        'hospital', 'doctor', 'patient', 'disease', 'covid', 'pandemic', 'vaccine',
        'pharmaceutical', 'drug', 'treatment', 'therapy', 'fitness', 'wellness',
    )),
    # Society
    ("사회", (
        'social', 'culture', 'society', 'education', 'university', 'school',
        'climate', 'environment', 'green', 'sustainability', 'renewable',
        'immigration', 'refugee', 'human rights', 'equality', 'diversity',
        'media', 'entertainment', 'movie', 'music', 'art',
    )),
)

# Finviz headlines are mostly financial, so unmatched titles fall back to economy.
_DEFAULT_NEWS_TAG = "경제"


def _keyword_in_text(keyword: str, text_lower: str) -> bool:
    """Return True if *keyword* occurs in the already-lowercased *text_lower*.

    Longer keywords keep plain substring semantics (so 'crypto' still matches
    "cryptocurrency"). Keywords of at most ``_SHORT_KEYWORD_MAX_LEN``
    characters must not be flanked by ASCII letters, apart from one optional
    trailing plural "s".
    """
    import re  # local import: keeps this block self-contained

    if len(keyword) > _SHORT_KEYWORD_MAX_LEN:
        return keyword in text_lower
    # Letter-only boundaries, so digits and punctuation may still touch the
    # keyword ("s&p500", "fed's", "q1:").
    pattern = rf"(?<![a-z]){re.escape(keyword)}s?(?![a-z])"
    return re.search(pattern, text_lower) is not None


def classify_news_tag(text: str) -> str:
    """
    Classify a news headline into a topic tag using keyword matching.

    Tags are tried in priority order (economy, industry, technology, politics,
    health, society) and the first one with a matching keyword is returned.

    Args:
        text: News headline.

    Returns:
        One of the tag strings "경제", "산업", "기술", "정치", "건강", "사회".
        "경제" is also returned when no keyword matches.
    """
    text_lower = text.lower()
    for tag, keywords in _NEWS_TAG_KEYWORDS:
        if any(_keyword_in_text(keyword, text_lower) for keyword in keywords):
            return tag
    return _DEFAULT_NEWS_TAG
108+
def _parse_finviz_news_rows(soup) -> list:
    """Collect ``{'Title', 'Date'}`` dicts from every ``<tr class="nn">`` row of *soup*."""
    items = []
    for row in soup.find_all('tr', class_='nn'):
        link = row.find('a')
        if link is None:
            continue
        title = link.get_text(strip=True)
        if not title:
            # An anchor without text carries no headline to classify.
            continue
        date_cell = row.find('td', class_='nn-date')
        if date_cell is not None:
            date_str = date_cell.get_text(strip=True)
        else:
            # No date cell in the row: fall back to the current local time
            # rendered in the "%I:%M%p" format.
            date_str = datetime.datetime.now().strftime("%I:%M%p")
        items.append({'Title': title, 'Date': date_str})
    return items


def fetch_finviz_news_by_type(news_type: int, source_name: str) -> pd.DataFrame:
    """
    Fetch headlines from one of Finviz's news pages.

    Args:
        news_type: Value of the ``v`` query parameter (3=Stock, 4=ETF, 5=Crypto).
        source_name: Human-readable source name, used only in log messages.

    Returns:
        DataFrame with 'Title' and 'Date' columns. An empty DataFrame with the
        same columns is returned when the request or parsing fails, or when the
        page yields no news rows. Errors are logged, never raised.
    """
    url = f'https://finviz.com/news.ashx?v={news_type}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # The news is laid out as a table; each headline is a <tr class="nn"> row.
        soup = BeautifulSoup(response.text, 'html.parser')
        news_items = _parse_finviz_news_rows(soup)
    except Exception as e:
        # Best-effort scraper: any network or parsing failure degrades to "no news".
        logger.error(f'Failed to fetch {source_name.lower()} news: {e}')
        return pd.DataFrame(columns=['Title', 'Date'])

    if not news_items:
        logger.warning(f'No {source_name.lower()} news items found')
        return pd.DataFrame(columns=['Title', 'Date'])

    df = pd.DataFrame(news_items)
    logger.info(f'{source_name} News fetched: {len(df)} articles')
    return df
162+
def fetch_stock_news() -> pd.DataFrame:
    """
    Fetch headlines from Finviz's Stock News page.
    https://finviz.com/news.ashx?v=3

    Returns:
        DataFrame with 'Title' and 'Date' columns, or empty DataFrame if failed
    """
    stock_news = fetch_finviz_news_by_type(news_type=3, source_name='Stock')
    return stock_news
172+
def fetch_etf_news() -> pd.DataFrame:
    """
    Fetch headlines from Finviz's ETF News page.
    https://finviz.com/news.ashx?v=4

    Returns:
        DataFrame with 'Title' and 'Date' columns, or empty DataFrame if failed
    """
    etf_news = fetch_finviz_news_by_type(news_type=4, source_name='ETF')
    return etf_news
182+
def fetch_crypto_news() -> pd.DataFrame:
    """
    Fetch headlines from Finviz's Crypto News page.
    https://finviz.com/news.ashx?v=5

    Returns:
        DataFrame with 'Title' and 'Date' columns, or empty DataFrame if failed
    """
    crypto_news = fetch_finviz_news_by_type(news_type=5, source_name='Crypto')
    return crypto_news
192+
26193def run_predict ():
27194
28195 model = load_model_safe (paths .model , 'Sentiment Model (.keras) version' )
@@ -40,26 +207,52 @@ def run_predict():
40207 # predict_texts = news_df['Title'].tolist() # predict texts
41208 # # print(news_df['Title'][1])
42209
43- # parse except task
210+ # Market News 가져오기
44211 try :
45212 fnews = News ()
46213 all_news = fnews .get_news ()
47- news_df = all_news ['news' ]
214+ news_df = all_news ['news' ].copy ()
215+ news_df ['source' ] = 'finviz_market' # source 구분
216+ logger .info (f'Market News fetched: { len (news_df )} articles' )
48217 except Exception as e :
49- logger .error (f'finviz news parse failed : { e } ' )
218+ logger .error (f'finviz market news parse failed : { e } ' )
50219 return
51-
52- # print(news_df['Date'].head(10))
53- today = datetime .date .today () # today
54-
55- news_df ["parsed_date" ] = pd .to_datetime (
56- today .strftime ("%Y-%m-%d " ) + news_df ["Date" ],
57- format = "%Y-%m-%d %I:%M%p" ,
58- errors = "coerce"
59- )
60220
61- today_news = news_df [news_df ['parsed_date' ].dt .date == today ] # today == parse data date
62- predict_texts = today_news ['Title' ].tolist () # insert pare data
221+ # 날짜 파싱 - 최근 2일치 뉴스 가져오기
222+ today = datetime .date .today ()
223+ two_days_ago = today - datetime .timedelta (days = 2 )
224+ yesterday = today - datetime .timedelta (days = 1 )
225+
226+ def parse_date_with_multiple_attempts (date_str ):
227+ """여러 날짜로 시도하여 파싱"""
228+ # 오늘, 어제, 그제 순서로 시도
229+ for target_date in [today , yesterday , two_days_ago ]:
230+ try :
231+ parsed = pd .to_datetime (
232+ target_date .strftime ("%Y-%m-%d " ) + str (date_str ),
233+ format = "%Y-%m-%d %I:%M%p" ,
234+ errors = "coerce"
235+ )
236+ if not pd .isna (parsed ):
237+ return parsed
238+ except :
239+ continue
240+ return pd .NaT # 파싱 실패
241+
242+ # 날짜 파싱 적용
243+ news_df ["parsed_date" ] = news_df ["Date" ].apply (parse_date_with_multiple_attempts )
244+
245+ # 최근 2일치 뉴스 필터링 (날짜 파싱 실패한 뉴스도 포함 - Finviz는 최근 뉴스만 보여주므로)
246+ two_days_ago_datetime = pd .Timestamp (two_days_ago ) # datetime으로 변환
247+ recent_news = news_df [
248+ (news_df ['parsed_date' ].isna ()) | # 날짜 파싱 실패한 경우 포함
249+ (news_df ['parsed_date' ] >= two_days_ago_datetime ) # 최근 2일 이내
250+ ]
251+
252+ predict_texts = recent_news ['Title' ].tolist ()
253+ news_sources = recent_news ['source' ].tolist () if 'source' in recent_news .columns else ['finviz' ] * len (predict_texts )
254+
255+ logger .info (f'Filtered news (last 2 days + unparsed): { len (predict_texts )} articles' )
63256
64257 # past predict data
65258 # predict_texts = [
@@ -71,7 +264,7 @@ def run_predict():
71264 # ]
72265
73266 if not predict_texts :
74- logger .warning ("No news found for today's date " )
267+ logger .warning ("No news found for the last 2 days " )
75268 return
76269
77270 seqs = tokenizer .texts_to_sequences (predict_texts )
@@ -94,7 +287,7 @@ def run_predict():
94287
95288 pos , neg , neu = 0 , 0 , 0
96289
97- for text , probs in zip (predict_texts , prediction ):
290+ for text , probs , source in zip (predict_texts , prediction , news_sources ):
98291 pred_idx = int (np .argmax (probs ))
99292 label = label_map [pred_idx ]
100293 percent = float (np .max (probs )) # 가장 높은 softmax 확률
@@ -107,13 +300,17 @@ def run_predict():
107300 else :
108301 neu += 1
109302
110- logger .info (f"[{ label .upper ()} ] { text } : { percent :.2f} " )
303+ # 태그 분류
304+ tag = classify_news_tag (text )
305+
306+ logger .info (f"[{ label .upper ()} ] [{ tag } ] [{ source } ] { text } : { percent :.2f} " )
111307
112308 sb_result .append ({
113309 "text" : text ,
114310 "label" : label ,
115311 "percent" : percent ,
116- "source" : "finviz" ,
312+ "tag" : tag ,
313+ "source" : source ,
117314 "run_at" : datetime .datetime .now (datetime .timezone .utc ).isoformat (),
118315 "hash" : hashlib .sha256 (text .encode ("utf-8" )).hexdigest (),
119316 })
0 commit comments