Skip to content

Commit 4b2fad4

Browse files
committed
Feat : Added news tag
1 parent 61bfa75 commit 4b2fad4

1 file changed

Lines changed: 215 additions & 18 deletions

File tree

upstock/nodes/predict.py

Lines changed: 215 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
import pandas as pd
1111
import numpy as np
1212
import logging
13+
import requests
14+
from bs4 import BeautifulSoup
15+
import time
1316

1417
from tensorflow.keras.preprocessing.sequence import pad_sequences
1518
from finvizfinance.news import News
@@ -23,6 +26,170 @@
2326

2427
logger = logging.getLogger(__name__)
2528

29+
# Ordered (tag, keywords) table. Order encodes priority: the first category
# with a matching keyword wins, so economy is listed first.
_TAG_KEYWORDS = (
    # Economy
    ("경제", (
        'economy', 'economic', 'market', 'inflation', 'deflation', 'fed', 'federal reserve',
        'rate cut', 'rate hike', 'interest rate', 'gdp', 'unemployment', 'employment',
        'consumer price', 'cpi', 'ppi', 'retail sales', 'gross domestic product',
        'recession', 'growth', 'monetary policy', 'fiscal policy', 'treasury', 'bond',
        'stock market', 'dow', 'nasdaq', 's&p', 'sp500', 'market wrap', 'markets',
    )),
    # Industry / corporate
    ("산업", (
        'company', 'companies', 'corporate', 'earnings', 'revenue', 'profit', 'loss',
        'quarterly', 'q1', 'q2', 'q3', 'q4', 'ipo', 'merger', 'acquisition', 'deal',
        'ceo', 'cfo', 'executive', 'shareholder', 'dividend', 'stock', 'shares',
        'industry', 'sector', 'automotive', 'energy', 'oil', 'gas', 'retail',
        'manufacturing', 'banking', 'finance', 'insurance',
    )),
    # Technology
    ("기술", (
        'technology', 'tech', 'ai', 'artificial intelligence', 'machine learning', 'ml',
        'innovation', 'science', 'research', 'development', 'r&d', 'semiconductor',
        'chip', 'software', 'hardware', 'cloud', 'cyber', 'digital', 'data',
        'quantum', 'blockchain', 'crypto', 'bitcoin', 'ethereum', 'nft',
        'space', 'nasa', 'rocket', 'satellite', 'electric vehicle', 'ev', 'tesla',
    )),
    # Politics
    ("정치", (
        'politics', 'political', 'election', 'president', 'government', 'senate', 'congress',
        'democrat', 'republican', 'biden', 'trump', 'administration', 'policy', 'diplomacy',
        'international', 'trade war', 'sanctions', 'embargo', 'treaty', 'alliance', 'nato',
        'united nations', 'un', 'eu', 'brexit', 'geopolitical', 'foreign policy',
    )),
    # Health (this list also carries the sports keywords)
    ("건강", (
        'sports', 'sport', 'olympics', 'nfl', 'nba', 'mlb', 'soccer', 'football',
        'basketball', 'baseball', 'tennis', 'golf', 'health', 'medical', 'medicine',
        'hospital', 'doctor', 'patient', 'disease', 'covid', 'pandemic', 'vaccine',
        'pharmaceutical', 'drug', 'treatment', 'therapy', 'fitness', 'wellness',
    )),
    # Society
    ("사회", (
        'social', 'culture', 'society', 'education', 'university', 'school',
        'climate', 'environment', 'green', 'sustainability', 'renewable',
        'immigration', 'refugee', 'human rights', 'equality', 'diversity',
        'media', 'entertainment', 'movie', 'music', 'art',
    )),
)

# Tag returned when no keyword matches (the feed is mostly financial news).
_DEFAULT_TAG = "경제"

# Keywords this short (e.g. 'ai', 'un', 'ev', 'dow', 'ppi') must match as a
# whole word; as raw substrings they fire inside "said", "fund", "revenue",
# "down", "shopping", and so on.
_SHORT_KEYWORD_MAX_LEN = 3

# Lazily filled cache of (tag, compiled pattern) pairs; see _get_tag_patterns.
_TAG_PATTERNS = []


def _get_tag_patterns():
    """Return the ordered (tag, compiled regex) pairs, compiling them on first use."""
    if not _TAG_PATTERNS:
        # Function-scope import so this block does not depend on the file's
        # top-level import list.
        import re

        compiled = []
        for tag, keywords in _TAG_KEYWORDS:
            alternatives = []
            for keyword in keywords:
                escaped = re.escape(keyword)
                if len(keyword) <= _SHORT_KEYWORD_MAX_LEN:
                    # Whole word only, tolerating a plural "s" ("EVs", "IPOs").
                    alternatives.append(escaped + r"s?\b")
                else:
                    # Must start a word; any suffix is allowed so inflections
                    # such as "stocks" or "cryptocurrency" still match.
                    alternatives.append(escaped)
            compiled.append((tag, re.compile(r"\b(?:" + "|".join(alternatives) + ")")))
        _TAG_PATTERNS.extend(compiled)
    return _TAG_PATTERNS


def classify_news_tag(text: str) -> str:
    """
    Classify a news headline into a topic tag by keyword matching.

    Matching is case-insensitive and word-aware: a keyword only counts when it
    begins at a word boundary, and keywords of three characters or fewer must
    match a whole word. Every match under these rules was also a match under
    plain substring search, so this only removes false positives.

    Args:
        text: News headline.

    Returns:
        One of the tag strings "경제" (economy), "산업" (industry),
        "기술" (technology), "정치" (politics), "건강" (health) or
        "사회" (society). Categories are tried in that priority order, and
        "경제" is returned when nothing matches.
    """
    text_lower = text.lower()

    for tag, pattern in _get_tag_patterns():
        if pattern.search(text_lower):
            return tag

    return _DEFAULT_TAG
108+
109+
def fetch_finviz_news_by_type(news_type: int, source_name: str) -> pd.DataFrame:
    """
    Scrape headlines from one Finviz news page.

    Requests ``https://finviz.com/news.ashx?v=<news_type>`` and reads every
    ``<tr class="nn">`` row: the text of the row's first link is the title and
    the text of its ``<td class="nn-date">`` cell is the date.

    Args:
        news_type: Value for the ``v`` query parameter (3=Stock, 4=ETF, 5=Crypto).
        source_name: Label used only in log messages.

    Returns:
        DataFrame with 'Title' and 'Date' columns. An empty DataFrame with the
        same columns is returned when no rows are found or when any error
        occurs; errors are logged rather than raised.
    """
    try:
        page = requests.get(
            f'https://finviz.com/news.ashx?v={news_type}',
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            },
            timeout=10,
        )
        page.raise_for_status()

        records = []
        # News is laid out as a table; each item is a <tr> with class "nn".
        for row in BeautifulSoup(page.text, 'html.parser').find_all('tr', class_='nn'):
            link = row.find('a')
            if not link:
                continue  # row without a headline link

            headline = link.get_text(strip=True)
            date_cell = row.find('td', class_='nn-date')
            # Fall back to the current local time when the row has no date cell.
            stamp = (
                date_cell.get_text(strip=True)
                if date_cell
                else datetime.datetime.now().strftime("%I:%M%p")
            )
            records.append({'Title': headline, 'Date': stamp})

        if not records:
            logger.warning(f'No {source_name.lower()} news items found')
            return pd.DataFrame(columns=['Title', 'Date'])

        frame = pd.DataFrame(records)
        logger.info(f'{source_name} News fetched: {len(frame)} articles')
        return frame

    except Exception as e:
        logger.error(f'Failed to fetch {source_name.lower()} news: {e}')
        return pd.DataFrame(columns=['Title', 'Date'])
162+
163+
def fetch_stock_news() -> pd.DataFrame:
    """
    Fetch headlines from the Finviz Stock News page.

    https://finviz.com/news.ashx?v=3

    Returns:
        DataFrame with 'Title' and 'Date' columns, or empty DataFrame if failed
    """
    return fetch_finviz_news_by_type(news_type=3, source_name='Stock')
172+
173+
def fetch_etf_news() -> pd.DataFrame:
    """
    Fetch headlines from the Finviz ETF News page.

    https://finviz.com/news.ashx?v=4

    Returns:
        DataFrame with 'Title' and 'Date' columns, or empty DataFrame if failed
    """
    return fetch_finviz_news_by_type(news_type=4, source_name='ETF')
182+
183+
def fetch_crypto_news() -> pd.DataFrame:
    """
    Fetch headlines from the Finviz Crypto News page.

    https://finviz.com/news.ashx?v=5

    Returns:
        DataFrame with 'Title' and 'Date' columns, or empty DataFrame if failed
    """
    return fetch_finviz_news_by_type(news_type=5, source_name='Crypto')
192+
26193
def run_predict():
27194

28195
model = load_model_safe(paths.model, 'Sentiment Model (.keras) version')
@@ -40,26 +207,52 @@ def run_predict():
40207
# predict_texts = news_df['Title'].tolist() # predict texts
41208
# # print(news_df['Title'][1])
42209

43-
# parse except task
210+
# Market News 가져오기
44211
try:
45212
fnews = News()
46213
all_news = fnews.get_news()
47-
news_df = all_news['news']
214+
news_df = all_news['news'].copy()
215+
news_df['source'] = 'finviz_market' # source 구분
216+
logger.info(f'Market News fetched: {len(news_df)} articles')
48217
except Exception as e:
49-
logger.error(f'finviz news parse failed : {e}')
218+
logger.error(f'finviz market news parse failed : {e}')
50219
return
51-
52-
# print(news_df['Date'].head(10))
53-
today = datetime.date.today() # today
54-
55-
news_df["parsed_date"] = pd.to_datetime(
56-
today.strftime("%Y-%m-%d ") + news_df["Date"],
57-
format="%Y-%m-%d %I:%M%p",
58-
errors="coerce"
59-
)
60220

61-
today_news = news_df[news_df['parsed_date'].dt.date == today] # today == parse data date
62-
predict_texts = today_news['Title'].tolist() # insert pare data
221+
# 날짜 파싱 - 최근 2일치 뉴스 가져오기
222+
today = datetime.date.today()
223+
two_days_ago = today - datetime.timedelta(days=2)
224+
yesterday = today - datetime.timedelta(days=1)
225+
226+
def parse_date_with_multiple_attempts(date_str):
    """Parse a time-of-day string (``%I:%M%p``) as a timestamp on today's date.

    The value is prefixed with ``today`` (taken from the enclosing scope) and
    parsed with the format ``%Y-%m-%d %I:%M%p``. Retrying with other calendar
    days cannot change the outcome: the date prefix is always valid, so
    whether parsing succeeds depends only on ``date_str``. A single attempt is
    therefore made. The function name is kept so existing call sites work.

    Args:
        date_str: Raw value from the 'Date' column; converted with ``str()``.

    Returns:
        A pandas Timestamp on today's date, or ``pd.NaT`` when the value does
        not match the expected time format.
    """
    try:
        return pd.to_datetime(
            today.strftime("%Y-%m-%d ") + str(date_str),
            format="%Y-%m-%d %I:%M%p",
            errors="coerce",  # unparseable values become NaT instead of raising
        )
    except (ValueError, TypeError):
        # Stay best-effort: a value pandas refuses outright is treated as
        # unparsed, without swallowing unrelated errors such as
        # KeyboardInterrupt the way a bare except would.
        return pd.NaT
241+
242+
# 날짜 파싱 적용
243+
news_df["parsed_date"] = news_df["Date"].apply(parse_date_with_multiple_attempts)
244+
245+
# 최근 2일치 뉴스 필터링 (날짜 파싱 실패한 뉴스도 포함 - Finviz는 최근 뉴스만 보여주므로)
246+
two_days_ago_datetime = pd.Timestamp(two_days_ago) # datetime으로 변환
247+
recent_news = news_df[
248+
(news_df['parsed_date'].isna()) | # 날짜 파싱 실패한 경우 포함
249+
(news_df['parsed_date'] >= two_days_ago_datetime) # 최근 2일 이내
250+
]
251+
252+
predict_texts = recent_news['Title'].tolist()
253+
news_sources = recent_news['source'].tolist() if 'source' in recent_news.columns else ['finviz'] * len(predict_texts)
254+
255+
logger.info(f'Filtered news (last 2 days + unparsed): {len(predict_texts)} articles')
63256

64257
# past predict data
65258
# predict_texts = [
@@ -71,7 +264,7 @@ def run_predict():
71264
# ]
72265

73266
if not predict_texts:
74-
logger.warning("No news found for today's date")
267+
logger.warning("No news found for the last 2 days")
75268
return
76269

77270
seqs = tokenizer.texts_to_sequences(predict_texts)
@@ -94,7 +287,7 @@ def run_predict():
94287

95288
pos, neg, neu = 0, 0, 0
96289

97-
for text, probs in zip(predict_texts, prediction):
290+
for text, probs, source in zip(predict_texts, prediction, news_sources):
98291
pred_idx = int(np.argmax(probs))
99292
label = label_map[pred_idx]
100293
percent = float(np.max(probs)) # 가장 높은 softmax 확률
@@ -107,13 +300,17 @@ def run_predict():
107300
else:
108301
neu += 1
109302

110-
logger.info(f"[{label.upper()}] {text} : {percent:.2f}")
303+
# 태그 분류
304+
tag = classify_news_tag(text)
305+
306+
logger.info(f"[{label.upper()}] [{tag}] [{source}] {text} : {percent:.2f}")
111307

112308
sb_result.append({
113309
"text": text,
114310
"label": label,
115311
"percent": percent,
116-
"source": "finviz",
312+
"tag": tag,
313+
"source": source,
117314
"run_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
118315
"hash": hashlib.sha256(text.encode("utf-8")).hexdigest(),
119316
})

0 commit comments

Comments
 (0)