From 4eef604856ea4f14a3f9a3be3be39401c9a0a819 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Fri, 8 May 2020 23:27:51 +0800
Subject: [PATCH 01/12] new feature: weibo poller
---
config_sample.py | 1 +
hoshino/modules/weibo/weibo.py | 442 +++++++++++++++++++++++++++++++++
2 files changed, 443 insertions(+)
create mode 100644 hoshino/modules/weibo/weibo.py
diff --git a/config_sample.py b/config_sample.py
index dee3e59e3..193863362 100644
--- a/config_sample.py
+++ b/config_sample.py
@@ -43,4 +43,5 @@
# 'setu',
'translate',
# 'twitter',
+ # 'weibo'
}
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
new file mode 100644
index 000000000..d064a9b92
--- /dev/null
+++ b/hoshino/modules/weibo/weibo.py
@@ -0,0 +1,442 @@
+# -*- coding: UTF-8 -*-
+
+import json
+import os
+import random
+import sys
+import traceback
+from collections import OrderedDict
+from datetime import date, datetime, timedelta
+from time import sleep
+
+import requests
+from lxml import etree
+from hoshino.service import Service, Privilege as Priv
+from hoshino import util, logger
+
+sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
+
+class Weibo(object):
+ def __init__(self, config):
+ """Weibo类初始化"""
+ self.validate_config(config)
+ self.filter = config['filter']
+ self.user = self.get_user_info(config["user_id"])
+ self.got_count = 0 # 存储爬取到的微博数
+ self.weibo = [] # 存储爬取到的所有微博信息
+ self.weibo_id_list = [] # 存储爬取到的所有微博id
+
+ def get_json(self, params):
+ """获取网页中json数据"""
+ url = 'https://m.weibo.cn/api/container/getIndex?'
+ r = requests.get(url, params=params)
+ return r.json()
+
+ def get_user_info(self, user_id):
+ """获取用户信息"""
+ params = {'containerid': '100505' + str(user_id)}
+ js = self.get_json(params)
+ if js['ok']:
+ info = js['data']['userInfo']
+ user_info = OrderedDict()
+ user_info['id'] = user_id
+ user_info['screen_name'] = info.get('screen_name', '')
+ user_info['gender'] = info.get('gender', '')
+ params = {
+ 'containerid':
+ '230283' + str(user_id) + '_-_INFO'
+ }
+ zh_list = [
+ u'生日', u'所在地', u'小学', u'初中', u'高中', u'大学', u'公司', u'注册时间',
+ u'阳光信用'
+ ]
+ en_list = [
+ 'birthday', 'location', 'education', 'education', 'education',
+ 'education', 'company', 'registration_time', 'sunshine'
+ ]
+ for i in en_list:
+ user_info[i] = ''
+ js = self.get_json(params)
+ if js['ok']:
+ cards = js['data']['cards']
+ if isinstance(cards, list) and len(cards) > 1:
+ card_list = cards[0]['card_group'] + cards[1]['card_group']
+ for card in card_list:
+ if card.get('item_name') in zh_list:
+ user_info[en_list[zh_list.index(
+ card.get('item_name'))]] = card.get(
+ 'item_content', '')
+ user_info['statuses_count'] = info.get('statuses_count', 0)
+ user_info['followers_count'] = info.get('followers_count', 0)
+ user_info['follow_count'] = info.get('follow_count', 0)
+ user_info['description'] = info.get('description', '')
+ user_info['profile_url'] = info.get('profile_url', '')
+ user_info['profile_image_url'] = info.get('profile_image_url', '')
+ user_info['avatar_hd'] = info.get('avatar_hd', '')
+ user_info['urank'] = info.get('urank', 0)
+ user_info['mbrank'] = info.get('mbrank', 0)
+ user_info['verified'] = info.get('verified', False)
+ user_info['verified_type'] = info.get('verified_type', 0)
+ user_info['verified_reason'] = info.get('verified_reason', '')
+ user = self.standardize_info(user_info)
+ return user
+
+ def validate_config(self, config):
+ """验证配置是否正确"""
+
+ # 验证filter、original_pic_download、retweet_pic_download、original_video_download、retweet_video_download
+ argument_list = [
+ 'filter'
+ ]
+ for argument in argument_list:
+ if config[argument] != 0 and config[argument] != 1:
+ logger.error(u'%s值应为0或1,请重新输入' % config[argument])
+
+ # 验证user_id_list
+ if "user_id" not in config:
+ logger.error(u'请填写用户 id')
+ if "service_name" not in config:
+ logger.error(u'请填写所属服务名')
+
+ def get_pics(self, weibo_info):
+ """获取微博原始图片url"""
+ if weibo_info.get('pics'):
+ pic_info = weibo_info['pics']
+ pic_list = [pic['large']['url'] for pic in pic_info]
+ pics = ','.join(pic_list)
+ else:
+ pics = ''
+ return pics
+
+ def get_live_photo(self, weibo_info):
+ """获取live photo中的视频url"""
+ live_photo_list = []
+ live_photo = weibo_info.get('pic_video')
+ if live_photo:
+ prefix = 'https://video.weibo.com/media/play?livephoto=//us.sinaimg.cn/'
+ for i in live_photo.split(','):
+ if len(i.split(':')) == 2:
+ url = prefix + i.split(':')[1] + '.mov'
+ live_photo_list.append(url)
+ return live_photo_list
+
+ def get_video_url(self, weibo_info):
+ """获取微博视频url"""
+ video_url = ''
+ video_url_list = []
+ if weibo_info.get('page_info'):
+ if weibo_info['page_info'].get('media_info') and weibo_info[
+ 'page_info'].get('type') == 'video':
+ media_info = weibo_info['page_info']['media_info']
+ video_url = media_info.get('mp4_720p_mp4')
+ if not video_url:
+ video_url = media_info.get('mp4_hd_url')
+ if not video_url:
+ video_url = media_info.get('mp4_sd_url')
+ if not video_url:
+ video_url = media_info.get('stream_url_hd')
+ if not video_url:
+ video_url = media_info.get('stream_url')
+ if video_url:
+ video_url_list.append(video_url)
+ live_photo_list = self.get_live_photo(weibo_info)
+ if live_photo_list:
+ video_url_list += live_photo_list
+ return ';'.join(video_url_list)
+
+ def get_location(self, selector):
+ """获取微博发布位置"""
+ location_icon = 'timeline_card_small_location_default.png'
+ span_list = selector.xpath('//span')
+ location = ''
+ for i, span in enumerate(span_list):
+ if span.xpath('img/@src'):
+ if location_icon in span.xpath('img/@src')[0]:
+ location = span_list[i + 1].xpath('string(.)')
+ break
+ return location
+
+ def get_article_url(self, selector):
+ """Return the URL of the headline article linked from the post, or '' if there is none."""
+ article_url = ''
+ text = selector.xpath('string(.)')  # full text content of the parsed post body
+ if text.startswith(u'发布了头条文章'):  # literal means "published a headline article"
+ url = selector.xpath('//a/@data-url')  # list of data-url attribute values
+ if url and url[0].startswith('http://t.cn'):  # only the first link, and only t.cn links
+ article_url = url[0]
+ return article_url
+
+ def get_topics(self, selector):
+ """Return the topics the post takes part in as one comma-joined string ('' if none)."""
+ span_list = selector.xpath("//span[@class='surl-text']")
+ topics = ''
+ topic_list = []
+ for span in span_list:
+ text = span.xpath('string(.)')
+ if len(text) > 2 and text[0] == '#' and text[-1] == '#':  # '#name#' with a non-empty name
+ topic_list.append(text[1:-1])  # store the name without the surrounding '#'
+ if topic_list:
+ topics = ','.join(topic_list)
+ return topics
+
+    def get_at_users(self, selector):
+        """Return the @-mentioned user names in the post, comma-joined ('' if none)."""
+        at_list = []
+        for a in selector.xpath('//a'):
+            text = a.xpath('string(.)')
+            # xpath('@href')[0] raises IndexError for an <a> that has no href, so
+            # read the attribute safely and treat such anchors as "not a mention".
+            href = a.get('href') or ''
+            if href and '@' + href[3:] == text:  # href[3:] presumably drops a '/n/' prefix
+                at_list.append(text[1:])  # strip the leading '@'
+        return ','.join(at_list)
+
+    def string_to_int(self, string):
+        """Convert a count (int, '12', '3万', '1.2万+') to int; 万 means x10000."""
+        if isinstance(string, int):
+            return string
+        for suffix in (u'万+', u'万'):
+            if string.endswith(suffix):
+                # scale numerically: '1.2' + '0000' is not a valid int literal
+                return int(round(float(string[:-len(suffix)]) * 10000))
+        return int(string)
+
+ def standardize_date(self, created_at):
+ """标准化微博发布时间"""
+ if u"刚刚" in created_at:
+ created_at = datetime.now().strftime("%Y-%m-%d")
+ elif u"分钟" in created_at:
+ minute = created_at[:created_at.find(u"分钟")]
+ minute = timedelta(minutes=int(minute))
+ created_at = (datetime.now() - minute).strftime("%Y-%m-%d")
+ elif u"小时" in created_at:
+ hour = created_at[:created_at.find(u"小时")]
+ hour = timedelta(hours=int(hour))
+ created_at = (datetime.now() - hour).strftime("%Y-%m-%d")
+ elif u"昨天" in created_at:
+ day = timedelta(days=1)
+ created_at = (datetime.now() - day).strftime("%Y-%m-%d")
+ elif created_at.count('-') == 1:
+ year = datetime.now().strftime("%Y")
+ created_at = year + "-" + created_at
+ return created_at
+
+ def standardize_info(self, weibo):
+ """标准化信息,去除乱码"""
+ for k, v in weibo.items():
+ if 'bool' not in str(type(v)) and 'int' not in str(
+ type(v)) and 'list' not in str(
+ type(v)) and 'long' not in str(type(v)):
+ weibo[k] = v.replace(u"\u200b", "").encode(
+ sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
+ return weibo
+
+    def parse_weibo(self, weibo_info):
+        """Flatten one raw status dict into an OrderedDict of the fields used here."""
+        weibo = OrderedDict()
+        # .get(): a missing or empty 'user' entry yields '' fields instead of raising.
+        user = weibo_info.get('user') or {}
+        weibo['user_id'] = user.get('id', '')
+        weibo['screen_name'] = user.get('screen_name', '')
+        weibo['id'] = int(weibo_info['id'])
+        weibo['bid'] = weibo_info['bid']
+        # Parse the HTML body once and reuse the tree for every extractor below.
+        selector = etree.HTML(weibo_info['text'])
+        weibo['text'] = selector.xpath('string(.)')
+        weibo['article_url'] = self.get_article_url(selector)
+        weibo['pics'] = self.get_pics(weibo_info)
+        weibo['video_url'] = self.get_video_url(weibo_info)
+        weibo['location'] = self.get_location(selector)
+        weibo['created_at'] = weibo_info['created_at']
+        weibo['source'] = weibo_info['source']
+        # Counters are normalised to int; absent counters default to 0.
+        weibo['attitudes_count'] = self.string_to_int(
+            weibo_info.get('attitudes_count', 0))
+        weibo['comments_count'] = self.string_to_int(
+            weibo_info.get('comments_count', 0))
+        weibo['reposts_count'] = self.string_to_int(
+            weibo_info.get('reposts_count', 0))
+        weibo['topics'] = self.get_topics(selector)
+        weibo['at_users'] = self.get_at_users(selector)
+        return self.standardize_info(weibo)
+
+ def print_one_weibo(self, weibo):
+ """Log the fields of one parsed post, one logger.info call per field."""
+ try:
+ logger.info(u'微博id：%d' % weibo['id'])
+ logger.info(u'微博正文：%s' % weibo['text'])
+ logger.info(u'原始图片url：%s' % weibo['pics'])
+ logger.info(u'微博位置：%s' % weibo['location'])
+ logger.info(u'发布时间：%s' % weibo['created_at'])
+ logger.info(u'发布工具：%s' % weibo['source'])
+ logger.info(u'点赞数：%d' % weibo['attitudes_count'])
+ logger.info(u'评论数：%d' % weibo['comments_count'])
+ logger.info(u'转发数：%d' % weibo['reposts_count'])
+ logger.info(u'话题：%s' % weibo['topics'])
+ logger.info(u'@用户：%s' % weibo['at_users'])
+ logger.info(u'url：https://m.weibo.cn/detail/%d' % weibo['id'])
+ except OSError:  # any OSError raised by the calls above is swallowed
+ pass
+
+ def print_weibo(self, weibo):
+ """Log a post; when it carries a 'retweet' entry, log that entry as well."""
+ if weibo.get('retweet'):
+ logger.info('*' * 100)
+ logger.info(u'转发部分：')
+ self.print_one_weibo(weibo['retweet'])  # the nested post stored under 'retweet'
+ logger.info('*' * 100)
+ logger.info(u'原创部分：')
+ self.print_one_weibo(weibo)  # the post itself
+ logger.info('-' * 120)  # separator line after the post
+
+ def get_username(self):
+ return self.user["screen_name"]
+
+ def get_user_id(self):
+ return self.user["id"]
+
+ def get_weibo_json(self, page):
+ """获取网页中微博json数据"""
+ params = {
+ 'containerid': '107603' + self.get_user_id(),
+ 'page': page
+ }
+ js = self.get_json(params)
+ return js
+
+ def get_long_weibo(self, id):
+ """获取长微博"""
+ for i in range(5):
+ url = 'https://m.weibo.cn/detail/%s' % id
+ html = requests.get(url).text
+ html = html[html.find('"status":'):]
+ html = html[:html.rfind('"hotScheme"')]
+ html = html[:html.rfind(',')]
+ html = '{' + html + '}'
+ js = json.loads(html, strict=False)
+ weibo_info = js.get('status')
+ if weibo_info:
+ weibo = self.parse_weibo(weibo_info)
+ return weibo
+ sleep(random.randint(6, 10))
+
+ def print_user_info(self):
+ """打印用户信息"""
+ logger.info('+' * 100)
+ logger.info(u'用户信息')
+ logger.info(u'用户id:%s' % self.user['id'])
+ logger.info(u'用户昵称:%s' % self.user['screen_name'])
+ gender = u'女' if self.user['gender'] == 'f' else u'男'
+ logger.info(u'性别:%s' % gender)
+ logger.info(u'生日:%s' % self.user['birthday'])
+ logger.info(u'所在地:%s' % self.user['location'])
+ logger.info(u'教育经历:%s' % self.user['education'])
+ logger.info(u'公司:%s' % self.user['company'])
+ logger.info(u'阳光信用:%s' % self.user['sunshine'])
+ logger.info(u'注册时间:%s' % self.user['registration_time'])
+ logger.info(u'微博数:%d' % self.user['statuses_count'])
+ logger.info(u'粉丝数:%d' % self.user['followers_count'])
+ logger.info(u'关注数:%d' % self.user['follow_count'])
+ logger.info(u'url:https://m.weibo.cn/profile/%s' % self.user['id'])
+ if self.user.get('verified_reason'):
+ logger.info(self.user['verified_reason'])
+ logger.info(self.user['description'])
+ logger.info('+' * 100)
+
+ def get_one_weibo(self, info):
+ """获取一条微博的全部信息"""
+ try:
+ weibo_info = info['mblog']
+ weibo_id = weibo_info['id']
+ retweeted_status = weibo_info.get('retweeted_status')
+ is_long = weibo_info.get('isLongText')
+ if retweeted_status and retweeted_status.get('id'): # 转发
+ retweet_id = retweeted_status.get('id')
+ is_long_retweet = retweeted_status.get('isLongText')
+ if is_long:
+ weibo = self.get_long_weibo(weibo_id)
+ if not weibo:
+ weibo = self.parse_weibo(weibo_info)
+ else:
+ weibo = self.parse_weibo(weibo_info)
+ if is_long_retweet:
+ retweet = self.get_long_weibo(retweet_id)
+ if not retweet:
+ retweet = self.parse_weibo(retweeted_status)
+ else:
+ retweet = self.parse_weibo(retweeted_status)
+ retweet['created_at'] = self.standardize_date(
+ retweeted_status['created_at'])
+ weibo['retweet'] = retweet
+ else: # 原创
+ if is_long:
+ weibo = self.get_long_weibo(weibo_id)
+ if not weibo:
+ weibo = self.parse_weibo(weibo_info)
+ else:
+ weibo = self.parse_weibo(weibo_info)
+ weibo['created_at'] = self.standardize_date(
+ weibo_info['created_at'])
+ return weibo
+ except Exception as e:
+ logger.exception(e)
+
+ def get_latest_weibos(self):
+ try:
+ latest_weibos = []
+ js = self.get_weibo_json(1)
+ if js['ok']:
+ weibos = js['data']['cards']
+ for w in weibos:
+ if w['card_type'] == 9:
+ wb = self.get_one_weibo(w)
+ if wb:
+ if wb['created_at'] != str(date.today()):
+ continue
+ if wb['id'] in self.weibo_id_list:
+ continue
+ if (not self.filter) or (
+ 'retweet' not in wb.keys()):
+ self.weibo.append(wb)
+ latest_weibos.append(wb)
+ self.weibo_id_list.append(wb['id'])
+ self.got_count += 1
+ self.print_weibo(wb)
+
+ return latest_weibos
+ except Exception as e:
+ logger.exception(e)
+ return []
+
+
+user_configs = util.load_config(__file__)
+subr_dic = {}
+
+for config in user_configs:
+ print(config)
+ wb = Weibo(config)
+ service_name = config["service_name"]
+ subService = Service(service_name, enable_on_default=True)
+
+ if service_name not in subr_dic:
+ subr_dic[service_name] = {"service": subService, "spiders": [wb]}
+ else:
+ subr_dic[service_name]["spiders"].append(wb)
+
+@sv.scheduled_job('interval', seconds=60 * 20)
+async def weibo_poller():
+ for sv_name, serviceObj in subr_dic.items():
+ weibos = []
+ ssv = serviceObj["service"]
+ spiders = serviceObj["spiders"]
+ for spider in spiders:
+ latest_weibos = spider.get_latest_weibos()
+ formatted_weibos = [wb["text"] for wb in latest_weibos]
+
+ if l := len(formatted_weibos):
+ sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条")
+ else:
+ sv.logger.info(f"未检测到@{spider.get_username()}的新微博")
+
+ weibos.extend(formatted_weibos)
+ await ssv.broadcast(weibos, ssv.name, 0.5)
\ No newline at end of file
From 7c6cc1b94458ca54f324e65858096ddb3228b3f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Sat, 9 May 2020 01:52:37 +0800
Subject: [PATCH 02/12] Send weibo images; fix subservice bug
---
hoshino/modules/weibo/weibo.py | 46 ++++++++++++++++++++++------------
hoshino/res.py | 22 ++++++++++++++++
2 files changed, 52 insertions(+), 16 deletions(-)
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index d064a9b92..eb4d19c16 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -13,6 +13,7 @@
from lxml import etree
from hoshino.service import Service, Privilege as Priv
from hoshino import util, logger
+from hoshino.res import R
sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
@@ -22,9 +23,8 @@ def __init__(self, config):
self.validate_config(config)
self.filter = config['filter']
self.user = self.get_user_info(config["user_id"])
- self.got_count = 0 # 存储爬取到的微博数
- self.weibo = [] # 存储爬取到的所有微博信息
- self.weibo_id_list = [] # 存储爬取到的所有微博id
+
+ self.__recent = False
def get_json(self, params):
"""获取网页中json数据"""
@@ -103,10 +103,9 @@ def get_pics(self, weibo_info):
if weibo_info.get('pics'):
pic_info = weibo_info['pics']
pic_list = [pic['large']['url'] for pic in pic_info]
- pics = ','.join(pic_list)
else:
- pics = ''
- return pics
+ pic_list = []
+ return pic_list
def get_live_photo(self, weibo_info):
"""获取live photo中的视频url"""
@@ -142,7 +141,7 @@ def get_video_url(self, weibo_info):
live_photo_list = self.get_live_photo(weibo_info)
if live_photo_list:
video_url_list += live_photo_list
- return ';'.join(video_url_list)
+ return video_url_list
def get_location(self, selector):
"""获取微博发布位置"""
@@ -205,20 +204,25 @@ def standardize_date(self, created_at):
"""标准化微博发布时间"""
if u"刚刚" in created_at:
created_at = datetime.now().strftime("%Y-%m-%d")
+ self.__recent = True
elif u"分钟" in created_at:
minute = created_at[:created_at.find(u"分钟")]
minute = timedelta(minutes=int(minute))
created_at = (datetime.now() - minute).strftime("%Y-%m-%d")
+ self.__recent = True
elif u"小时" in created_at:
hour = created_at[:created_at.find(u"小时")]
hour = timedelta(hours=int(hour))
created_at = (datetime.now() - hour).strftime("%Y-%m-%d")
+ self.__recent = False
elif u"昨天" in created_at:
day = timedelta(days=1)
created_at = (datetime.now() - day).strftime("%Y-%m-%d")
+ self.__recent = False
elif created_at.count('-') == 1:
year = datetime.now().strftime("%Y")
created_at = year + "-" + created_at
+ self.__recent = False
return created_at
def standardize_info(self, weibo):
@@ -380,6 +384,7 @@ def get_one_weibo(self, info):
return weibo
except Exception as e:
logger.exception(e)
+ self.__recent = False
def get_latest_weibos(self):
try:
@@ -391,16 +396,11 @@ def get_latest_weibos(self):
if w['card_type'] == 9:
wb = self.get_one_weibo(w)
if wb:
- if wb['created_at'] != str(date.today()):
- continue
- if wb['id'] in self.weibo_id_list:
+ if not self.__recent:
continue
if (not self.filter) or (
'retweet' not in wb.keys()):
- self.weibo.append(wb)
latest_weibos.append(wb)
- self.weibo_id_list.append(wb['id'])
- self.got_count += 1
self.print_weibo(wb)
return latest_weibos
@@ -416,14 +416,28 @@ def get_latest_weibos(self):
print(config)
wb = Weibo(config)
service_name = config["service_name"]
- subService = Service(service_name, enable_on_default=True)
if service_name not in subr_dic:
+ subService = Service(service_name, enable_on_default=True)
subr_dic[service_name] = {"service": subService, "spiders": [wb]}
else:
subr_dic[service_name]["spiders"].append(wb)
-@sv.scheduled_job('interval', seconds=60 * 20)
+def wb_to_message(wb):
+ msg = f'@{wb["screen_name"]}:\n{wb["text"]}'
+ if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0:
+ images_url = wb["pics"]
+ msg = f'{msg}\n'
+ res_imgs = [R.remote_img(url).cqcode for url in images_url]
+ for img in res_imgs:
+ msg = f'{msg}{img}'
+ if len(wb["video_url"]) > 0:
+ videos = wb["video_url"]
+ res_videos = ';'.join(videos)
+ msg = f'{msg}\n视频链接:{res_videos}'
+ return msg
+
+@sv.scheduled_job('interval', seconds=20*60)
async def weibo_poller():
for sv_name, serviceObj in subr_dic.items():
weibos = []
@@ -431,7 +445,7 @@ async def weibo_poller():
spiders = serviceObj["spiders"]
for spider in spiders:
latest_weibos = spider.get_latest_weibos()
- formatted_weibos = [wb["text"] for wb in latest_weibos]
+ formatted_weibos = [wb_to_message(wb) for wb in latest_weibos]
if l := len(formatted_weibos):
sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条")
diff --git a/hoshino/res.py b/hoshino/res.py
index 436e16959..a071bc219 100644
--- a/hoshino/res.py
+++ b/hoshino/res.py
@@ -1,5 +1,7 @@
import os
from PIL import Image
+import requests
+from io import BytesIO
from urllib.request import pathname2url
from urllib.parse import urljoin
@@ -19,7 +21,27 @@ def get(path, *paths):
def img(path, *paths):
return ResImg(os.path.join('img', path, *paths))
+ @staticmethod
+ def remote_img(url):
+ return RemoteResImg(url)
+
+class RemoteResObj:
+ def __init__(self, url):
+ self.__path = url
+
+ @property
+ def url(self):
+ return self.__path
+class RemoteResImg(RemoteResObj):
+ @property
+ def cqcode(self) -> MessageSegment:
+ return MessageSegment.image(self.url)
+
+ def open(self) -> Image:
+ response = requests.get(self.url)
+ return Image.open(BytesIO(response))
+
class ResObj:
From 2c0abae9c677237dd15b7149a678a3dabc5a1567 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Sat, 9 May 2020 02:36:43 +0800
Subject: [PATCH 03/12] Replace requests with httpx; small fix for code review
---
hoshino/modules/weibo/weibo.py | 39 ++++++++++++++++++++--------------
hoshino/res.py | 3 ++-
requirements.txt | 3 ++-
3 files changed, 27 insertions(+), 18 deletions(-)
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index eb4d19c16..79d48f0ac 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -9,13 +9,23 @@
from datetime import date, datetime, timedelta
from time import sleep
-import requests
+import httpx as requests
from lxml import etree
from hoshino.service import Service, Privilege as Priv
from hoshino import util, logger
from hoshino.res import R
sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
+user_configs = util.load_config(__file__)
+'''
+sample config.json
+
+[{
+ "user_id": "6603867494",
+ "service_name": "pcr-weibo",
+ "filter": true
+}]
+'''
class Weibo(object):
def __init__(self, config):
@@ -83,20 +93,18 @@ def get_user_info(self, user_id):
def validate_config(self, config):
"""验证配置是否正确"""
+ exist_argument_list = ['user_id', 'service_name']
+ true_false_argument_list = ['filter']
- # 验证filter、original_pic_download、retweet_pic_download、original_video_download、retweet_video_download
- argument_list = [
- 'filter'
- ]
- for argument in argument_list:
- if config[argument] != 0 and config[argument] != 1:
- logger.error(u'%s值应为0或1,请重新输入' % config[argument])
+ for argument in true_false_argument_list:
+ if argument not in config:
+ logger.error(f'请填写 {argument}')
+ if config[argument] != True and config[argument] != False:
+ logger.error(f'{argument} 值应为 True 或 False,请重新输入')
- # 验证user_id_list
- if "user_id" not in config:
- logger.error(u'请填写用户 id')
- if "service_name" not in config:
- logger.error(u'请填写所属服务名')
+ for argument in exist_argument_list:
+ if argument not in config:
+ logger.error(f'请填写 {argument}')
def get_pics(self, weibo_info):
"""获取微博原始图片url"""
@@ -409,11 +417,10 @@ def get_latest_weibos(self):
return []
-user_configs = util.load_config(__file__)
subr_dic = {}
for config in user_configs:
- print(config)
+ sv.logger.debug(config)
wb = Weibo(config)
service_name = config["service_name"]
@@ -437,7 +444,7 @@ def wb_to_message(wb):
msg = f'{msg}\n视频链接:{res_videos}'
return msg
-@sv.scheduled_job('interval', seconds=20*60)
+@sv.scheduled_job('interval', seconds=10)
async def weibo_poller():
for sv_name, serviceObj in subr_dic.items():
weibos = []
diff --git a/hoshino/res.py b/hoshino/res.py
index a071bc219..6803c6025 100644
--- a/hoshino/res.py
+++ b/hoshino/res.py
@@ -1,6 +1,7 @@
import os
+import asyncio
from PIL import Image
-import requests
+import httpx as requests
from io import BytesIO
from urllib.request import pathname2url
from urllib.parse import urljoin
diff --git a/requirements.txt b/requirements.txt
index bccc5b948..e0907f59f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ zhconv>=1.4.0
Pillow>=6.2.1
TwitterAPI>=2.5.10
matplotlib>=3.2.0
-numpy>=1.18.0
\ No newline at end of file
+numpy>=1.18.0
+httpx>=0.12.1
\ No newline at end of file
From a7a1c350132740b8e766f93d640372788c74d371 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Sat, 9 May 2020 03:46:02 +0800
Subject: [PATCH 04/12] Replace all sync requests with await/async
---
hoshino/modules/weibo/__init__.py | 61 ++++++++++++
hoshino/modules/weibo/exception.py | 21 +++++
hoshino/modules/weibo/weibo.py | 143 +++++++++--------------------
hoshino/res.py | 10 +-
4 files changed, 133 insertions(+), 102 deletions(-)
create mode 100644 hoshino/modules/weibo/__init__.py
create mode 100644 hoshino/modules/weibo/exception.py
diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py
new file mode 100644
index 000000000..84a130984
--- /dev/null
+++ b/hoshino/modules/weibo/__init__.py
@@ -0,0 +1,61 @@
+from .weibo import WeiboSpider
+from hoshino.service import Service, Privilege as Priv
+from hoshino.res import R
+from hoshino import util
+
+sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
+user_configs = util.load_config(__file__)
+'''
+sample config.json
+
+[{
+ "user_id": "6603867494",
+ "service_name": "bcr-weibo",
+ "filter": true
+}]
+'''
+
+subr_dic = {}
+
+for config in user_configs:
+ sv.logger.debug(config)
+ wb_spider = WeiboSpider(config)
+ service_name = config["service_name"]
+
+ if service_name not in subr_dic:
+ subService = Service(service_name, enable_on_default=True)
+ subr_dic[service_name] = {"service": subService, "spiders": [wb_spider]}
+ else:
+ subr_dic[service_name]["spiders"].append(wb_spider)
+
+def wb_to_message(wb):
+ msg = f'@{wb["screen_name"]}:\n{wb["text"]}'
+ if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0:
+ images_url = wb["pics"]
+ msg = f'{msg}\n'
+ res_imgs = [R.remote_img(url).cqcode for url in images_url]
+ for img in res_imgs:
+ msg = f'{msg}{img}'
+ if len(wb["video_url"]) > 0:
+ videos = wb["video_url"]
+ res_videos = ';'.join(videos)
+ msg = f'{msg}\n视频链接:{res_videos}'
+ return msg
+
+@sv.scheduled_job('interval', seconds=20*60)
+async def weibo_poller():
+ for sv_name, serviceObj in subr_dic.items():
+ weibos = []
+ ssv = serviceObj["service"]
+ spiders = serviceObj["spiders"]
+ for spider in spiders:
+ latest_weibos = await spider.get_latest_weibos()
+ formatted_weibos = [wb_to_message(wb) for wb in latest_weibos]
+
+ if l := len(formatted_weibos):
+ sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条")
+ else:
+ sv.logger.info(f"未检测到@{spider.get_username()}的新微博")
+
+ weibos.extend(formatted_weibos)
+ await ssv.broadcast(weibos, ssv.name, 0.5)
\ No newline at end of file
diff --git a/hoshino/modules/weibo/exception.py b/hoshino/modules/weibo/exception.py
new file mode 100644
index 000000000..5d1fab491
--- /dev/null
+++ b/hoshino/modules/weibo/exception.py
@@ -0,0 +1,21 @@
+class WeiboError(Exception):
+ def __init__(self, msg, *msgs):
+ self._msgs = [msg, *msgs]
+
+ def __str__(self):
+ return '\n'.join(self._msgs)
+
+ @property
+ def message(self):
+ return str(self)
+
+ def append(self, msg:str):
+ self._msgs.append(msg)
+
+
+class ParseError(WeiboError):
+ pass
+
+
+class NotFoundError(WeiboError):
+ pass
\ No newline at end of file
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index 79d48f0ac..384ddc67e 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -1,51 +1,42 @@
# -*- coding: UTF-8 -*-
import json
-import os
import random
import sys
-import traceback
from collections import OrderedDict
from datetime import date, datetime, timedelta
from time import sleep
-import httpx as requests
+import httpx
from lxml import etree
-from hoshino.service import Service, Privilege as Priv
-from hoshino import util, logger
-from hoshino.res import R
+from hoshino import logger
+from .exception import *
-sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
-user_configs = util.load_config(__file__)
-'''
-sample config.json
-
-[{
- "user_id": "6603867494",
- "service_name": "pcr-weibo",
- "filter": true
-}]
-'''
-
-class Weibo(object):
+class WeiboSpider(object):
def __init__(self, config):
"""Weibo类初始化"""
self.validate_config(config)
- self.filter = config['filter']
- self.user = self.get_user_info(config["user_id"])
-
+ self.filter = config['filter']
+ self.user_id = config['user_id']
+ self.user = self.get_user_info(self.user_id)
self.__recent = False
- def get_json(self, params):
+ async def get_json(self, params):
"""获取网页中json数据"""
url = 'https://m.weibo.cn/api/container/getIndex?'
- r = requests.get(url, params=params)
+ async with httpx.AsyncClient() as client:
+ r = await client.get(url, params=params)
+ return r.json()
+
+ def sync_get_json(self, params):
+ url = 'https://m.weibo.cn/api/container/getIndex?'
+ r = httpx.get(url, params=params)
return r.json()
def get_user_info(self, user_id):
"""获取用户信息"""
params = {'containerid': '100505' + str(user_id)}
- js = self.get_json(params)
+ js = self.sync_get_json(params)
if js['ok']:
info = js['data']['userInfo']
user_info = OrderedDict()
@@ -66,7 +57,7 @@ def get_user_info(self, user_id):
]
for i in en_list:
user_info[i] = ''
- js = self.get_json(params)
+ js = self.sync_get_json(params)
if js['ok']:
cards = js['data']['cards']
if isinstance(cards, list) and len(cards) > 1:
@@ -98,13 +89,13 @@ def validate_config(self, config):
for argument in true_false_argument_list:
if argument not in config:
- logger.error(f'请填写 {argument}')
+ raise NotFoundError(f'未找到参数{argument}')
if config[argument] != True and config[argument] != False:
- logger.error(f'{argument} 值应为 True 或 False,请重新输入')
+ raise ParseError(f'{argument} 值应为 True 或 False')
for argument in exist_argument_list:
if argument not in config:
- logger.error(f'请填写 {argument}')
+ raise NotFoundError(f'未找到参数{argument}')
def get_pics(self, weibo_info):
"""获取微博原始图片url"""
@@ -305,32 +296,34 @@ def get_username(self):
return self.user["screen_name"]
def get_user_id(self):
- return self.user["id"]
+ return self.user_id
- def get_weibo_json(self, page):
+ async def get_weibo_json(self, page):
"""获取网页中微博json数据"""
params = {
'containerid': '107603' + self.get_user_id(),
'page': page
}
- js = self.get_json(params)
+ js = await self.get_json(params)
return js
- def get_long_weibo(self, id):
+ async def get_long_weibo(self, id):
"""获取长微博"""
for i in range(5):
url = 'https://m.weibo.cn/detail/%s' % id
- html = requests.get(url).text
- html = html[html.find('"status":'):]
- html = html[:html.rfind('"hotScheme"')]
- html = html[:html.rfind(',')]
- html = '{' + html + '}'
- js = json.loads(html, strict=False)
- weibo_info = js.get('status')
- if weibo_info:
- weibo = self.parse_weibo(weibo_info)
- return weibo
- sleep(random.randint(6, 10))
+ async with httpx.AsyncClient() as client:
+ html = await client.get(url)
+ html = html.text
+ html = html[html.find('"status":'):]
+ html = html[:html.rfind('"hotScheme"')]
+ html = html[:html.rfind(',')]
+ html = '{' + html + '}'
+ js = json.loads(html, strict=False)
+ weibo_info = js.get('status')
+ if weibo_info:
+ weibo = self.parse_weibo(weibo_info)
+ return weibo
+ sleep(random.randint(6, 10))
def print_user_info(self):
"""打印用户信息"""
@@ -355,7 +348,7 @@ def print_user_info(self):
logger.info(self.user['description'])
logger.info('+' * 100)
- def get_one_weibo(self, info):
+ async def get_one_weibo(self, info):
"""获取一条微博的全部信息"""
try:
weibo_info = info['mblog']
@@ -366,13 +359,13 @@ def get_one_weibo(self, info):
retweet_id = retweeted_status.get('id')
is_long_retweet = retweeted_status.get('isLongText')
if is_long:
- weibo = self.get_long_weibo(weibo_id)
+ weibo = await self.get_long_weibo(weibo_id)
if not weibo:
weibo = self.parse_weibo(weibo_info)
else:
weibo = self.parse_weibo(weibo_info)
if is_long_retweet:
- retweet = self.get_long_weibo(retweet_id)
+ retweet = await self.get_long_weibo(retweet_id)
if not retweet:
retweet = self.parse_weibo(retweeted_status)
else:
@@ -382,7 +375,7 @@ def get_one_weibo(self, info):
weibo['retweet'] = retweet
else: # 原创
if is_long:
- weibo = self.get_long_weibo(weibo_id)
+ weibo = await self.get_long_weibo(weibo_id)
if not weibo:
weibo = self.parse_weibo(weibo_info)
else:
@@ -394,15 +387,15 @@ def get_one_weibo(self, info):
logger.exception(e)
self.__recent = False
- def get_latest_weibos(self):
+ async def get_latest_weibos(self):
try:
latest_weibos = []
- js = self.get_weibo_json(1)
+ js = await self.get_weibo_json(1)
if js['ok']:
weibos = js['data']['cards']
for w in weibos:
if w['card_type'] == 9:
- wb = self.get_one_weibo(w)
+ wb = await self.get_one_weibo(w)
if wb:
if not self.__recent:
continue
@@ -414,50 +407,4 @@ def get_latest_weibos(self):
return latest_weibos
except Exception as e:
logger.exception(e)
- return []
-
-
-subr_dic = {}
-
-for config in user_configs:
- sv.logger.debug(config)
- wb = Weibo(config)
- service_name = config["service_name"]
-
- if service_name not in subr_dic:
- subService = Service(service_name, enable_on_default=True)
- subr_dic[service_name] = {"service": subService, "spiders": [wb]}
- else:
- subr_dic[service_name]["spiders"].append(wb)
-
-def wb_to_message(wb):
- msg = f'@{wb["screen_name"]}:\n{wb["text"]}'
- if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0:
- images_url = wb["pics"]
- msg = f'{msg}\n'
- res_imgs = [R.remote_img(url).cqcode for url in images_url]
- for img in res_imgs:
- msg = f'{msg}{img}'
- if len(wb["video_url"]) > 0:
- videos = wb["video_url"]
- res_videos = ';'.join(videos)
- msg = f'{msg}\n视频链接:{res_videos}'
- return msg
-
-@sv.scheduled_job('interval', seconds=10)
-async def weibo_poller():
- for sv_name, serviceObj in subr_dic.items():
- weibos = []
- ssv = serviceObj["service"]
- spiders = serviceObj["spiders"]
- for spider in spiders:
- latest_weibos = spider.get_latest_weibos()
- formatted_weibos = [wb_to_message(wb) for wb in latest_weibos]
-
- if l := len(formatted_weibos):
- sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条")
- else:
- sv.logger.info(f"未检测到@{spider.get_username()}的新微博")
-
- weibos.extend(formatted_weibos)
- await ssv.broadcast(weibos, ssv.name, 0.5)
\ No newline at end of file
+ return []
\ No newline at end of file
diff --git a/hoshino/res.py b/hoshino/res.py
index 6803c6025..d50fcd185 100644
--- a/hoshino/res.py
+++ b/hoshino/res.py
@@ -1,7 +1,7 @@
import os
import asyncio
from PIL import Image
-import httpx as requests
+import httpx
from io import BytesIO
from urllib.request import pathname2url
from urllib.parse import urljoin
@@ -39,9 +39,11 @@ class RemoteResImg(RemoteResObj):
def cqcode(self) -> MessageSegment:
return MessageSegment.image(self.url)
- def open(self) -> Image:
- response = requests.get(self.url)
- return Image.open(BytesIO(response))
+ async def open(self) -> Image:
+ async with httpx.AsyncClient() as client:
+ r = await client.get(self.url)
+ response = requests.get(self.url)
+ return Image.open(BytesIO(response))
class ResObj:
From 89e740c87e02be78c6b1ee10226115b6d4cbd70d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Sat, 9 May 2020 13:25:52 +0800
Subject: [PATCH 05/12] Fix logic of latest weibo; modify the format of
config.json
---
hoshino/modules/weibo/__init__.py | 47 +++++++++++++++++++++----------
hoshino/modules/weibo/weibo.py | 9 +++++-
2 files changed, 40 insertions(+), 16 deletions(-)
diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py
index 84a130984..a76a925fc 100644
--- a/hoshino/modules/weibo/__init__.py
+++ b/hoshino/modules/weibo/__init__.py
@@ -3,30 +3,39 @@
from hoshino.res import R
from hoshino import util
-sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
-user_configs = util.load_config(__file__)
'''
sample config.json
[{
- "user_id": "6603867494",
"service_name": "bcr-weibo",
- "filter": true
+ "enable_on_default": true,
+ "users":[{
+ "user_id": "6603867494",
+ "filter": true
+ }]
+
}]
'''
+def _load_config(services_config):
+ for sv_config in services_config:
+ sv.logger.debug(sv_config)
+ service_name = sv_config["service_name"]
+ enable_on_default = sv_config.get("enable_on_default", False)
+ users_config = sv_config["users"]
-subr_dic = {}
+ sv_spider_list = []
+ for user_config in users_config:
+ wb_spider = WeiboSpider(user_config)
+ sv_spider_list.append(wb_spider)
+
+ subService = Service(service_name, enable_on_default=enable_on_default)
+ subr_dic[service_name] = {"service": subService, "spiders": sv_spider_list}
-for config in user_configs:
- sv.logger.debug(config)
- wb_spider = WeiboSpider(config)
- service_name = config["service_name"]
- if service_name not in subr_dic:
- subService = Service(service_name, enable_on_default=True)
- subr_dic[service_name] = {"service": subService, "spiders": [wb_spider]}
- else:
- subr_dic[service_name]["spiders"].append(wb_spider)
+sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
+services_config = util.load_config(__file__)
+subr_dic = {}
+_load_config(services_config)
def wb_to_message(wb):
msg = f'@{wb["screen_name"]}:\n{wb["text"]}'
@@ -58,4 +67,12 @@ async def weibo_poller():
sv.logger.info(f"未检测到@{spider.get_username()}的新微博")
weibos.extend(formatted_weibos)
- await ssv.broadcast(weibos, ssv.name, 0.5)
\ No newline at end of file
+ await ssv.broadcast(weibos, ssv.name, 0.5)
+
+@sv.scheduled_job('interval', seconds=60*60*24)
+async def clear_spider_buffer():
+ sv.logger.info("Clearing weibo spider buffer...")
+ for sv_name, serviceObj in subr_dic.items():
+ spiders = serviceObj["spiders"]
+ for spider in spiders:
+ spider.clear_buffer()
\ No newline at end of file
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index 384ddc67e..6b22b0a5a 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -19,7 +19,11 @@ def __init__(self, config):
self.filter = config['filter']
self.user_id = config['user_id']
self.user = self.get_user_info(self.user_id)
+ self.received_weibo_ids = []
self.__recent = False
+
+ def clear_buffer(self):
+ self.received_weibo_ids.clear()
async def get_json(self, params):
"""获取网页中json数据"""
@@ -84,7 +88,7 @@ def get_user_info(self, user_id):
def validate_config(self, config):
"""验证配置是否正确"""
- exist_argument_list = ['user_id', 'service_name']
+ exist_argument_list = ['user_id']
true_false_argument_list = ['filter']
for argument in true_false_argument_list:
@@ -399,9 +403,12 @@ async def get_latest_weibos(self):
if wb:
if not self.__recent:
continue
+ if wb["id"] in self.received_weibo_ids:
+ continue
if (not self.filter) or (
'retweet' not in wb.keys()):
latest_weibos.append(wb)
+ self.received_weibo_ids.append(wb["id"])
self.print_weibo(wb)
return latest_weibos
From f27e3ccd7a29258f05690f55dfb8afd2648de7c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Sat, 9 May 2020 13:29:01 +0800
Subject: [PATCH 06/12] Small fix for async
---
hoshino/modules/weibo/weibo.py | 3 ++-
hoshino/res.py | 3 +--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index 6b22b0a5a..9105f3b2a 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -3,6 +3,7 @@
import json
import random
import sys
+import asyncio
from collections import OrderedDict
from datetime import date, datetime, timedelta
from time import sleep
@@ -327,7 +328,7 @@ async def get_long_weibo(self, id):
if weibo_info:
weibo = self.parse_weibo(weibo_info)
return weibo
- sleep(random.randint(6, 10))
+ asyncio.sleep(random.randint(6, 10))
def print_user_info(self):
"""打印用户信息"""
diff --git a/hoshino/res.py b/hoshino/res.py
index d50fcd185..6aace99c5 100644
--- a/hoshino/res.py
+++ b/hoshino/res.py
@@ -42,8 +42,7 @@ def cqcode(self) -> MessageSegment:
async def open(self) -> Image:
async with httpx.AsyncClient() as client:
r = await client.get(self.url)
- response = requests.get(self.url)
- return Image.open(BytesIO(response))
+ return Image.open(BytesIO(r))
class ResObj:
From b2d8595aec3d4c8000ae69110ff53450d28338e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Sat, 9 May 2020 19:35:16 +0800
Subject: [PATCH 07/12] Call async func in __init__ with asyncio
---
hoshino/modules/weibo/weibo.py | 24 +++++++++++-------------
1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index 9105f3b2a..2dbf9cb70 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -19,29 +19,24 @@ def __init__(self, config):
self.validate_config(config)
self.filter = config['filter']
self.user_id = config['user_id']
- self.user = self.get_user_info(self.user_id)
self.received_weibo_ids = []
self.__recent = False
+ asyncio.get_event_loop().run_until_complete(self._async_init())
+
+ async def _async_init(self):
+ self.user = await self.get_user_info(self.user_id)
- def clear_buffer(self):
- self.received_weibo_ids.clear()
-
async def get_json(self, params):
"""获取网页中json数据"""
url = 'https://m.weibo.cn/api/container/getIndex?'
async with httpx.AsyncClient() as client:
- r = await client.get(url, params=params)
+ r = await client.get(url, params=params, timeout=10.0) # sometimes timeout
return r.json()
- def sync_get_json(self, params):
- url = 'https://m.weibo.cn/api/container/getIndex?'
- r = httpx.get(url, params=params)
- return r.json()
-
- def get_user_info(self, user_id):
+ async def get_user_info(self, user_id):
"""获取用户信息"""
params = {'containerid': '100505' + str(user_id)}
- js = self.sync_get_json(params)
+ js = await self.get_json(params)
if js['ok']:
info = js['data']['userInfo']
user_info = OrderedDict()
@@ -62,7 +57,7 @@ def get_user_info(self, user_id):
]
for i in en_list:
user_info[i] = ''
- js = self.sync_get_json(params)
+ js = await self.get_json(params)
if js['ok']:
cards = js['data']['cards']
if isinstance(cards, list) and len(cards) > 1:
@@ -87,6 +82,9 @@ def get_user_info(self, user_id):
user = self.standardize_info(user_info)
return user
+ def clear_buffer(self):
+ self.received_weibo_ids.clear()
+
def validate_config(self, config):
"""验证配置是否正确"""
exist_argument_list = ['user_id']
From 125039971cae1a4cb94527743682188c4f763173 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Mon, 11 May 2020 03:40:57 +0800
Subject: [PATCH 08/12] Handle retweet weibo
---
hoshino/modules/weibo/__init__.py | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py
index a76a925fc..ecc5dac18 100644
--- a/hoshino/modules/weibo/__init__.py
+++ b/hoshino/modules/weibo/__init__.py
@@ -38,7 +38,15 @@ def _load_config(services_config):
_load_config(services_config)
def wb_to_message(wb):
- msg = f'@{wb["screen_name"]}:\n{wb["text"]}'
+ msg = f'@{wb["screen_name"]}'
+ if "retweet" in wb:
+ msg = f'{msg} 转发:\n{wb["text"]}\n======================'
+ wb = wb["retweet"]
+ else:
+ msg = f'{msg}:'
+
+ msg = f'{msg}\n{wb["text"]}'
+
if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0:
images_url = wb["pics"]
msg = f'{msg}\n'
From ebf0d71d99d2e2438bfb22fd03db68d3939e32b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Mon, 11 May 2020 16:08:03 +0800
Subject: [PATCH 09/12] Users can fetch the latest 5 weibos on demand by alias
---
hoshino/modules/weibo/__init__.py | 36 +++++++++++++++++++++++++++++-
hoshino/modules/weibo/exception.py | 3 +++
hoshino/modules/weibo/weibo.py | 8 +++++++
3 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py
index ecc5dac18..b75083417 100644
--- a/hoshino/modules/weibo/__init__.py
+++ b/hoshino/modules/weibo/__init__.py
@@ -2,6 +2,7 @@
from hoshino.service import Service, Privilege as Priv
from hoshino.res import R
from hoshino import util
+from .exception import *
'''
sample config.json
@@ -11,6 +12,7 @@
"enable_on_default": true,
"users":[{
"user_id": "6603867494",
+ "alias": ["公主连接", "公主连结", "公主链接"],
"filter": true
}]
@@ -21,20 +23,31 @@ def _load_config(services_config):
sv.logger.debug(sv_config)
service_name = sv_config["service_name"]
enable_on_default = sv_config.get("enable_on_default", False)
+
users_config = sv_config["users"]
sv_spider_list = []
for user_config in users_config:
wb_spider = WeiboSpider(user_config)
sv_spider_list.append(wb_spider)
+ alias_list = user_config.get("alias", [])
+ for alias in alias_list:
+ if alias in alias_dic:
+ raise DuplicateError(f"Alias {alias} is duplicate")
+ alias_dic[alias] = {
+ "service_name":service_name,
+ "user_id":wb_spider.get_user_id()
+ }
subService = Service(service_name, enable_on_default=enable_on_default)
subr_dic[service_name] = {"service": subService, "spiders": sv_spider_list}
-
+
+
sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
services_config = util.load_config(__file__)
subr_dic = {}
+alias_dic = {}
_load_config(services_config)
def wb_to_message(wb):
@@ -59,6 +72,27 @@ def wb_to_message(wb):
msg = f'{msg}\n视频链接:{res_videos}'
return msg
+# @bot 看微博 alias
+@sv.on_command('看微博', only_to_me=True)
+async def get_last_5_weibo(session):
+ alias = session.current_arg_text
+ if alias not in alias_dic:
+ await session.finish(f"未找到微博: {alias}")
+ return
+ service_name = alias_dic[alias]["service_name"]
+ user_id = alias_dic[alias]["user_id"]
+
+ spiders = subr_dic[service_name]["spiders"]
+ for spider in spiders:
+ if spider.get_user_id() == user_id:
+ last_5_weibos = spider.get_last_5_weibos()
+ formatted_weibos = [wb_to_message(wb) for wb in last_5_weibos]
+ for wb in formatted_weibos:
+ await session.send(wb)
+ await session.finish(f"以上为 {alias} 的最新 {len(formatted_weibos)} 条微博")
+ return
+ await session.finish(f"未找到微博: {alias}")
+
@sv.scheduled_job('interval', seconds=20*60)
async def weibo_poller():
for sv_name, serviceObj in subr_dic.items():
diff --git a/hoshino/modules/weibo/exception.py b/hoshino/modules/weibo/exception.py
index 5d1fab491..bda1619b8 100644
--- a/hoshino/modules/weibo/exception.py
+++ b/hoshino/modules/weibo/exception.py
@@ -18,4 +18,7 @@ class ParseError(WeiboError):
class NotFoundError(WeiboError):
+ pass
+
+class DuplicateError(WeiboError):
pass
\ No newline at end of file
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index 2dbf9cb70..a91a54280 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -20,6 +20,7 @@ def __init__(self, config):
self.filter = config['filter']
self.user_id = config['user_id']
self.received_weibo_ids = []
+ self.last_5_weibos = []
self.__recent = False
asyncio.get_event_loop().run_until_complete(self._async_init())
@@ -301,6 +302,9 @@ def get_username(self):
def get_user_id(self):
return self.user_id
+ def get_last_5_weibos(self):
+ return self.last_5_weibos
+
async def get_weibo_json(self, page):
"""获取网页中微博json数据"""
params = {
@@ -406,6 +410,10 @@ async def get_latest_weibos(self):
continue
if (not self.filter) or (
'retweet' not in wb.keys()):
+ if len(self.last_5_weibos) == 5:
+ self.last_5_weibos.pop(0)
+ self.last_5_weibos.append(wb)
+
latest_weibos.append(wb)
self.received_weibo_ids.append(wb["id"])
self.print_weibo(wb)
From f0f8941993b9d952377f8612bb16879d89249027 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Thu, 21 May 2020 21:35:08 +0800
Subject: [PATCH 10/12] Fix parsing logic for url in weibo text
---
hoshino/modules/weibo/__init__.py | 9 ++++++
hoshino/modules/weibo/weibo.py | 50 ++++++++++++++++++++++---------
2 files changed, 45 insertions(+), 14 deletions(-)
diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py
index b75083417..1b781b26e 100644
--- a/hoshino/modules/weibo/__init__.py
+++ b/hoshino/modules/weibo/__init__.py
@@ -18,6 +18,9 @@
}]
'''
+
+lmt = util.FreqLimiter(5)
+
def _load_config(services_config):
for sv_config in services_config:
sv.logger.debug(sv_config)
@@ -70,11 +73,17 @@ def wb_to_message(wb):
videos = wb["video_url"]
res_videos = ';'.join(videos)
msg = f'{msg}\n视频链接:{res_videos}'
+
return msg
# @bot 看微博 alias
@sv.on_command('看微博', only_to_me=True)
async def get_last_5_weibo(session):
+ uid = session.ctx['user_id']
+ if not lmt.check(uid):
+ session.finish('您查询得过于频繁,请稍等片刻', at_sender=True)
+ lmt.start_cd(uid)
+
alias = session.current_arg_text
if alias not in alias_dic:
await session.finish(f"未找到微博: {alias}")
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index a91a54280..65c8319f5 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -25,7 +25,10 @@ def __init__(self, config):
asyncio.get_event_loop().run_until_complete(self._async_init())
async def _async_init(self):
+ self.__init = True
self.user = await self.get_user_info(self.user_id)
+ await self.get_latest_weibos()
+ self.__init = False
async def get_json(self, params):
"""获取网页中json数据"""
@@ -158,16 +161,6 @@ def get_location(self, selector):
break
return location
- def get_article_url(self, selector):
- """获取微博中头条文章的url"""
- article_url = ''
- text = selector.xpath('string(.)')
- if text.startswith(u'发布了头条文章'):
- url = selector.xpath('//a/@data-url')
- if url and url[0].startswith('http://t.cn'):
- article_url = url[0]
- return article_url
-
def get_topics(self, selector):
"""获取参与的微博话题"""
span_list = selector.xpath("//span[@class='surl-text']")
@@ -193,6 +186,27 @@ def get_at_users(self, selector):
at_users = ','.join(at_list)
return at_users
+ def get_text(self, text_body):
+ selector = etree.HTML(text_body)
+ url_lists = selector.xpath('//a[@data-url]/@data-url')
+ url_elems = selector.xpath('//a[@data-url]/span[@class="surl-text"]')
+
+ '''
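+ Append the data-url of each <a> to the text of its <span class="surl-text">.
+ For example, given:
+
+ <a data-url="http://t.cn/A622uDbW">
+ <span class="surl-text">
+ 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻
+ </span>
+ </a>
+ replace 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻
+ with 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻(http://t.cn/A622uDbW)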
+ '''
+ for i in range(0, len(url_lists)):
+ url_elems[i].text = f'{url_elems[i].text}({url_lists[i]})'
+ return selector.xpath('string(.)')
+
def string_to_int(self, string):
"""字符串转换为整数"""
if isinstance(string, int):
@@ -217,11 +231,17 @@ def standardize_date(self, created_at):
hour = created_at[:created_at.find(u"小时")]
hour = timedelta(hours=int(hour))
created_at = (datetime.now() - hour).strftime("%Y-%m-%d")
- self.__recent = False
+ if self.__init:
+ self.__recent = True
+ else:
+ self.__recent = False
elif u"昨天" in created_at:
day = timedelta(days=1)
created_at = (datetime.now() - day).strftime("%Y-%m-%d")
- self.__recent = False
+ if self.__init:
+ self.__recent = True
+ else:
+ self.__recent = False
elif created_at.count('-') == 1:
year = datetime.now().strftime("%Y")
created_at = year + "-" + created_at
@@ -250,8 +270,10 @@ def parse_weibo(self, weibo_info):
weibo['bid'] = weibo_info['bid']
text_body = weibo_info['text']
selector = etree.HTML(text_body)
- weibo['text'] = etree.HTML(text_body).xpath('string(.)')
- weibo['article_url'] = self.get_article_url(selector)
+
+
+ weibo['text'] = self.get_text(text_body)
+
weibo['pics'] = self.get_pics(weibo_info)
weibo['video_url'] = self.get_video_url(weibo_info)
weibo['location'] = self.get_location(selector)
From c1dcef7330be571dc8cfabc06d69c7d2236bfdd3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Thu, 21 May 2020 23:32:31 +0800
Subject: [PATCH 11/12] Add img of article to image list
---
hoshino/modules/weibo/weibo.py | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index 65c8319f5..254d2924f 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -111,6 +111,12 @@ def get_pics(self, weibo_info):
pic_list = [pic['large']['url'] for pic in pic_info]
else:
pic_list = []
+
+ """获取文章封面图片url"""
+ if 'page_info' in weibo_info and weibo_info['page_info']['type'] == 'article':
+ if 'page_pic' in weibo_info['page_info']:
+ pic_list.append(weibo_info['page_info']['page_pic']['url'])
+
return pic_list
def get_live_photo(self, weibo_info):
From 466eedf394743fd1baf52a2b9355563b572b0f76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?=
<5361064+zzbslayer@user.noreply.gitee.com>
Date: Fri, 29 May 2020 03:07:40 +0800
Subject: [PATCH 12/12] Check weibo config from bot; other small fix
---
hoshino/modules/weibo/__init__.py | 30 +++++++++++++++++++++++++-----
hoshino/modules/weibo/weibo.py | 12 ++++++++++--
2 files changed, 35 insertions(+), 7 deletions(-)
diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py
index 1b781b26e..9b4ac21d2 100644
--- a/hoshino/modules/weibo/__init__.py
+++ b/hoshino/modules/weibo/__init__.py
@@ -8,7 +8,7 @@
sample config.json
[{
- "service_name": "bcr-weibo",
+ "service_name": "weibo-bcr",
"enable_on_default": true,
"users":[{
"user_id": "6603867494",
@@ -47,7 +47,7 @@ def _load_config(services_config):
-sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False)
+sv = Service('weibo-poller', manage_priv=Priv.SUPERUSER, visible=False)
services_config = util.load_config(__file__)
subr_dic = {}
alias_dic = {}
@@ -76,18 +76,38 @@ def wb_to_message(wb):
return msg
+weibo_url_prefix = "https://weibo.com/u"
+@sv.on_command('weibo-config',aliases=('查看微博服务', '微博服务', '微博配置', '查看微博配置'))
+async def weibo_config(session):
+ msg = '微博推送配置:服务名,别名,微博链接'
+ index = 1
+ for service_config in services_config:
+ service_name = service_config['service_name']
+ users_config = service_config['users']
+ for user_config in users_config:
+ weibo_id = user_config['user_id']
+ alias = user_config['alias']
+ weibo_url = f'{weibo_url_prefix}/{weibo_id}'
+ msg = f'{msg}\n{index}. {service_name}, {alias}, {weibo_url}'
+ index+=1
+ session.finish(msg)
+
+
# @bot 看微博 alias
@sv.on_command('看微博', only_to_me=True)
async def get_last_5_weibo(session):
uid = session.ctx['user_id']
if not lmt.check(uid):
session.finish('您查询得过于频繁,请稍等片刻', at_sender=True)
+ return
+
lmt.start_cd(uid)
alias = session.current_arg_text
if alias not in alias_dic:
- await session.finish(f"未找到微博: {alias}")
+ session.finish(f"未找到微博: {alias}")
return
+
service_name = alias_dic[alias]["service_name"]
user_id = alias_dic[alias]["user_id"]
@@ -98,9 +118,9 @@ async def get_last_5_weibo(session):
formatted_weibos = [wb_to_message(wb) for wb in last_5_weibos]
for wb in formatted_weibos:
await session.send(wb)
- await session.finish(f"以上为 {alias} 的最新 {len(formatted_weibos)} 条微博")
+ session.finish(f"以上为 {alias} 的最新 {len(formatted_weibos)} 条微博")
return
- await session.finish(f"未找到微博: {alias}")
+ session.finish(f"未找到微博: {alias}")
@sv.scheduled_job('interval', seconds=20*60)
async def weibo_poller():
diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py
index 254d2924f..7d2e18183 100644
--- a/hoshino/modules/weibo/weibo.py
+++ b/hoshino/modules/weibo/weibo.py
@@ -87,7 +87,12 @@ async def get_user_info(self, user_id):
return user
def clear_buffer(self):
- self.received_weibo_ids.clear()
+ """
+ 如果清理缓存前一分钟,该微博账号瞬间发送了 20 条微博
+ 然后清理缓存仅仅保留后 10 条的微博id,因此可能会重复推送前 10 条微博
+ 当然这种情况通常不会发生
+ """
+ self.received_weibo_ids = self.received_weibo_ids[-10:]
def validate_config(self, config):
"""验证配置是否正确"""
@@ -251,7 +256,10 @@ def standardize_date(self, created_at):
elif created_at.count('-') == 1:
year = datetime.now().strftime("%Y")
created_at = year + "-" + created_at
- self.__recent = False
+ if self.__init:
+ self.__recent = True
+ else:
+ self.__recent = False
return created_at
def standardize_info(self, weibo):