From 4eef604856ea4f14a3f9a3be3be39401c9a0a819 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Fri, 8 May 2020 23:27:51 +0800 Subject: [PATCH 01/12] new feature: weibo poller --- config_sample.py | 1 + hoshino/modules/weibo/weibo.py | 442 +++++++++++++++++++++++++++++++++ 2 files changed, 443 insertions(+) create mode 100644 hoshino/modules/weibo/weibo.py diff --git a/config_sample.py b/config_sample.py index dee3e59e3..193863362 100644 --- a/config_sample.py +++ b/config_sample.py @@ -43,4 +43,5 @@ # 'setu', 'translate', # 'twitter', + # 'weibo' } diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py new file mode 100644 index 000000000..d064a9b92 --- /dev/null +++ b/hoshino/modules/weibo/weibo.py @@ -0,0 +1,442 @@ +# -*- coding: UTF-8 -*- + +import json +import os +import random +import sys +import traceback +from collections import OrderedDict +from datetime import date, datetime, timedelta +from time import sleep + +import requests +from lxml import etree +from hoshino.service import Service, Privilege as Priv +from hoshino import util, logger + +sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) + +class Weibo(object): + def __init__(self, config): + """Weibo类初始化""" + self.validate_config(config) + self.filter = config['filter'] + self.user = self.get_user_info(config["user_id"]) + self.got_count = 0 # 存储爬取到的微博数 + self.weibo = [] # 存储爬取到的所有微博信息 + self.weibo_id_list = [] # 存储爬取到的所有微博id + + def get_json(self, params): + """获取网页中json数据""" + url = 'https://m.weibo.cn/api/container/getIndex?' + r = requests.get(url, params=params) + return r.json() + + def get_user_info(self, user_id): + """获取用户信息""" + params = {'containerid': '100505' + str(user_id)} + js = self.get_json(params) + if js['ok']: + info = js['data']['userInfo'] + user_info = OrderedDict() + user_info['id'] = user_id + user_info['screen_name'] = info.get('screen_name', '') + user_info['gender'] = info.get('gender', '') + params = { + 'containerid': + '230283' + str(user_id) + '_-_INFO' + } + zh_list = [ + u'生日', u'所在地', u'小学', u'初中', u'高中', u'大学', u'公司', u'注册时间', + u'阳光信用' + ] + en_list = [ + 'birthday', 'location', 'education', 'education', 'education', + 'education', 'company', 'registration_time', 'sunshine' + ] + for i in en_list: + user_info[i] = '' + js = self.get_json(params) + if js['ok']: + cards = js['data']['cards'] + if isinstance(cards, list) and len(cards) > 1: + card_list = cards[0]['card_group'] + cards[1]['card_group'] + for card in card_list: + if card.get('item_name') in zh_list: + user_info[en_list[zh_list.index( + card.get('item_name'))]] = card.get( + 'item_content', '') + user_info['statuses_count'] = info.get('statuses_count', 0) + user_info['followers_count'] = info.get('followers_count', 0) + user_info['follow_count'] = info.get('follow_count', 0) + user_info['description'] = info.get('description', '') + user_info['profile_url'] = info.get('profile_url', '') + user_info['profile_image_url'] = info.get('profile_image_url', '') + user_info['avatar_hd'] = info.get('avatar_hd', '') + user_info['urank'] = info.get('urank', 0) + user_info['mbrank'] = info.get('mbrank', 0) + user_info['verified'] = info.get('verified', False) + user_info['verified_type'] = info.get('verified_type', 0) + user_info['verified_reason'] = info.get('verified_reason', '') + user = self.standardize_info(user_info) + return user + + def validate_config(self, config): + """验证配置是否正确""" + + # 验证filter、original_pic_download、retweet_pic_download、original_video_download、retweet_video_download + argument_list = [ + 'filter' + ] + for argument in argument_list: + if config[argument] != 0 and config[argument] != 1: + logger.error(u'%s值应为0或1,请重新输入' % config[argument]) + + # 验证user_id_list + if "user_id" not in config: + logger.error(u'请填写用户 id') + if "service_name" not in config: + logger.error(u'请填写所属服务名') + + def get_pics(self, weibo_info): + """获取微博原始图片url""" + if weibo_info.get('pics'): + pic_info = weibo_info['pics'] + pic_list = [pic['large']['url'] for pic in pic_info] + pics = ','.join(pic_list) + else: + pics = '' + return pics + + def get_live_photo(self, weibo_info): + """获取live photo中的视频url""" + live_photo_list = [] + live_photo = weibo_info.get('pic_video') + if live_photo: + prefix = 'https://video.weibo.com/media/play?livephoto=//us.sinaimg.cn/' + for i in live_photo.split(','): + if len(i.split(':')) == 2: + url = prefix + i.split(':')[1] + '.mov' + live_photo_list.append(url) + return live_photo_list + + def get_video_url(self, weibo_info): + """获取微博视频url""" + video_url = '' + video_url_list = [] + if weibo_info.get('page_info'): + if weibo_info['page_info'].get('media_info') and weibo_info[ + 'page_info'].get('type') == 'video': + media_info = weibo_info['page_info']['media_info'] + video_url = media_info.get('mp4_720p_mp4') + if not video_url: + video_url = media_info.get('mp4_hd_url') + if not video_url: + video_url = media_info.get('mp4_sd_url') + if not video_url: + video_url = media_info.get('stream_url_hd') + if not video_url: + video_url = media_info.get('stream_url') + if video_url: + video_url_list.append(video_url) + live_photo_list = self.get_live_photo(weibo_info) + if live_photo_list: + video_url_list += live_photo_list + return ';'.join(video_url_list) + + def get_location(self, selector): + """获取微博发布位置""" + location_icon = 'timeline_card_small_location_default.png' + span_list = selector.xpath('//span') + location = '' + for i, span in enumerate(span_list): + if span.xpath('img/@src'): + if location_icon in span.xpath('img/@src')[0]: + location = span_list[i + 1].xpath('string(.)') + break + return location + + def get_article_url(self, selector): + """获取微博中头条文章的url""" + article_url = '' + text = selector.xpath('string(.)') + if text.startswith(u'发布了头条文章'): + url = selector.xpath('//a/@data-url') + if url and url[0].startswith('http://t.cn'): + article_url = url[0] + return article_url + + def get_topics(self, selector): + """获取参与的微博话题""" + span_list = selector.xpath("//span[@class='surl-text']") + topics = '' + topic_list = [] + for span in span_list: + text = span.xpath('string(.)') + if len(text) > 2 and text[0] == '#' and text[-1] == '#': + topic_list.append(text[1:-1]) + if topic_list: + topics = ','.join(topic_list) + return topics + + def get_at_users(self, selector): + """获取@用户""" + a_list = selector.xpath('//a') + at_users = '' + at_list = [] + for a in a_list: + if '@' + a.xpath('@href')[0][3:] == a.xpath('string(.)'): + at_list.append(a.xpath('string(.)')[1:]) + if at_list: + at_users = ','.join(at_list) + return at_users + + def string_to_int(self, string): + """字符串转换为整数""" + if isinstance(string, int): + return string + elif string.endswith(u'万+'): + string = int(string[:-2] + '0000') + elif string.endswith(u'万'): + string = int(string[:-1] + '0000') + return int(string) + + def standardize_date(self, created_at): + """标准化微博发布时间""" + if u"刚刚" in created_at: + created_at = datetime.now().strftime("%Y-%m-%d") + elif u"分钟" in created_at: + minute = created_at[:created_at.find(u"分钟")] + minute = timedelta(minutes=int(minute)) + created_at = (datetime.now() - minute).strftime("%Y-%m-%d") + elif u"小时" in created_at: + hour = created_at[:created_at.find(u"小时")] + hour = timedelta(hours=int(hour)) + created_at = (datetime.now() - hour).strftime("%Y-%m-%d") + elif u"昨天" in created_at: + day = timedelta(days=1) + created_at = (datetime.now() - day).strftime("%Y-%m-%d") + elif created_at.count('-') == 1: + year = datetime.now().strftime("%Y") + created_at = year + "-" + created_at + return created_at + + def standardize_info(self, weibo): + """标准化信息,去除乱码""" + for k, v in weibo.items(): + if 'bool' not in str(type(v)) and 'int' not in str( + type(v)) and 'list' not in str( + type(v)) and 'long' not in str(type(v)): + weibo[k] = v.replace(u"\u200b", "").encode( + sys.stdout.encoding, "ignore").decode(sys.stdout.encoding) + return weibo + + def parse_weibo(self, weibo_info): + weibo = OrderedDict() + if weibo_info['user']: + weibo['user_id'] = weibo_info['user']['id'] + weibo['screen_name'] = weibo_info['user']['screen_name'] + else: + weibo['user_id'] = '' + weibo['screen_name'] = '' + weibo['id'] = int(weibo_info['id']) + weibo['bid'] = weibo_info['bid'] + text_body = weibo_info['text'] + selector = etree.HTML(text_body) + weibo['text'] = etree.HTML(text_body).xpath('string(.)') + weibo['article_url'] = self.get_article_url(selector) + weibo['pics'] = self.get_pics(weibo_info) + weibo['video_url'] = self.get_video_url(weibo_info) + weibo['location'] = self.get_location(selector) + weibo['created_at'] = weibo_info['created_at'] + weibo['source'] = weibo_info['source'] + weibo['attitudes_count'] = self.string_to_int( + weibo_info.get('attitudes_count', 0)) + weibo['comments_count'] = self.string_to_int( + weibo_info.get('comments_count', 0)) + weibo['reposts_count'] = self.string_to_int( + weibo_info.get('reposts_count', 0)) + weibo['topics'] = self.get_topics(selector) + weibo['at_users'] = self.get_at_users(selector) + return self.standardize_info(weibo) + + def print_one_weibo(self, weibo): + """打印一条微博""" + try: + logger.info(u'微博id:%d' % weibo['id']) + logger.info(u'微博正文:%s' % weibo['text']) + logger.info(u'原始图片url:%s' % weibo['pics']) + logger.info(u'微博位置:%s' % weibo['location']) + logger.info(u'发布时间:%s' % weibo['created_at']) + logger.info(u'发布工具:%s' % weibo['source']) + logger.info(u'点赞数:%d' % weibo['attitudes_count']) + logger.info(u'评论数:%d' % weibo['comments_count']) + logger.info(u'转发数:%d' % weibo['reposts_count']) + logger.info(u'话题:%s' % weibo['topics']) + logger.info(u'@用户:%s' % weibo['at_users']) + logger.info(u'url:https://m.weibo.cn/detail/%d' % weibo['id']) + except OSError: + pass + + def print_weibo(self, weibo): + """打印微博,若为转发微博,会同时打印原创和转发部分""" + if weibo.get('retweet'): + logger.info('*' * 100) + logger.info(u'转发部分:') + self.print_one_weibo(weibo['retweet']) + logger.info('*' * 100) + logger.info(u'原创部分:') + self.print_one_weibo(weibo) + logger.info('-' * 120) + + def get_username(self): + return self.user["screen_name"] + + def get_user_id(self): + return self.user["id"] + + def get_weibo_json(self, page): + """获取网页中微博json数据""" + params = { + 'containerid': '107603' + self.get_user_id(), + 'page': page + } + js = self.get_json(params) + return js + + def get_long_weibo(self, id): + """获取长微博""" + for i in range(5): + url = 'https://m.weibo.cn/detail/%s' % id + html = requests.get(url).text + html = html[html.find('"status":'):] + html = html[:html.rfind('"hotScheme"')] + html = html[:html.rfind(',')] + html = '{' + html + '}' + js = json.loads(html, strict=False) + weibo_info = js.get('status') + if weibo_info: + weibo = self.parse_weibo(weibo_info) + return weibo + sleep(random.randint(6, 10)) + + def print_user_info(self): + """打印用户信息""" + logger.info('+' * 100) + logger.info(u'用户信息') + logger.info(u'用户id:%s' % self.user['id']) + logger.info(u'用户昵称:%s' % self.user['screen_name']) + gender = u'女' if self.user['gender'] == 'f' else u'男' + logger.info(u'性别:%s' % gender) + logger.info(u'生日:%s' % self.user['birthday']) + logger.info(u'所在地:%s' % self.user['location']) + logger.info(u'教育经历:%s' % self.user['education']) + logger.info(u'公司:%s' % self.user['company']) + logger.info(u'阳光信用:%s' % self.user['sunshine']) + logger.info(u'注册时间:%s' % self.user['registration_time']) + logger.info(u'微博数:%d' % self.user['statuses_count']) + logger.info(u'粉丝数:%d' % self.user['followers_count']) + logger.info(u'关注数:%d' % self.user['follow_count']) + logger.info(u'url:https://m.weibo.cn/profile/%s' % self.user['id']) + if self.user.get('verified_reason'): + logger.info(self.user['verified_reason']) + logger.info(self.user['description']) + logger.info('+' * 100) + + def get_one_weibo(self, info): + """获取一条微博的全部信息""" + try: + weibo_info = info['mblog'] + weibo_id = weibo_info['id'] + retweeted_status = weibo_info.get('retweeted_status') + is_long = weibo_info.get('isLongText') + if retweeted_status and retweeted_status.get('id'): # 转发 + retweet_id = retweeted_status.get('id') + is_long_retweet = retweeted_status.get('isLongText') + if is_long: + weibo = self.get_long_weibo(weibo_id) + if not weibo: + weibo = self.parse_weibo(weibo_info) + else: + weibo = self.parse_weibo(weibo_info) + if is_long_retweet: + retweet = self.get_long_weibo(retweet_id) + if not retweet: + retweet = self.parse_weibo(retweeted_status) + else: + retweet = self.parse_weibo(retweeted_status) + retweet['created_at'] = self.standardize_date( + retweeted_status['created_at']) + weibo['retweet'] = retweet + else: # 原创 + if is_long: + weibo = self.get_long_weibo(weibo_id) + if not weibo: + weibo = self.parse_weibo(weibo_info) + else: + weibo = self.parse_weibo(weibo_info) + weibo['created_at'] = self.standardize_date( + weibo_info['created_at']) + return weibo + except Exception as e: + logger.exception(e) + + def get_latest_weibos(self): + try: + latest_weibos = [] + js = self.get_weibo_json(1) + if js['ok']: + weibos = js['data']['cards'] + for w in weibos: + if w['card_type'] == 9: + wb = self.get_one_weibo(w) + if wb: + if wb['created_at'] != str(date.today()): + continue + if wb['id'] in self.weibo_id_list: + continue + if (not self.filter) or ( + 'retweet' not in wb.keys()): + self.weibo.append(wb) + latest_weibos.append(wb) + self.weibo_id_list.append(wb['id']) + self.got_count += 1 + self.print_weibo(wb) + + return latest_weibos + except Exception as e: + logger.exception(e) + return [] + + +user_configs = util.load_config(__file__) +subr_dic = {} + +for config in user_configs: + print(config) + wb = Weibo(config) + service_name = config["service_name"] + subService = Service(service_name, enable_on_default=True) + + if service_name not in subr_dic: + subr_dic[service_name] = {"service": subService, "spiders": [wb]} + else: + subr_dic[service_name]["spiders"].append(wb) + +@sv.scheduled_job('interval', seconds=60 * 20) +async def weibo_poller(): + for sv_name, serviceObj in subr_dic.items(): + weibos = [] + ssv = serviceObj["service"] + spiders = serviceObj["spiders"] + for spider in spiders: + latest_weibos = spider.get_latest_weibos() + formatted_weibos = [wb["text"] for wb in latest_weibos] + + if l := len(formatted_weibos): + sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条") + else: + sv.logger.info(f"未检测到@{spider.get_username()}的新微博") + + weibos.extend(formatted_weibos) + await ssv.broadcast(weibos, ssv.name, 0.5) \ No newline at end of file From 7c6cc1b94458ca54f324e65858096ddb3228b3f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Sat, 9 May 2020 01:52:37 +0800 Subject: [PATCH 02/12] Send weibo images; fix subservice bug --- hoshino/modules/weibo/weibo.py | 46 ++++++++++++++++++++++------------ hoshino/res.py | 22 ++++++++++++++++ 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index d064a9b92..eb4d19c16 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -13,6 +13,7 @@ from lxml import etree from hoshino.service import Service, Privilege as Priv from hoshino import util, logger +from hoshino.res import R sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) @@ -22,9 +23,8 @@ def __init__(self, config): self.validate_config(config) self.filter = config['filter'] self.user = self.get_user_info(config["user_id"]) - self.got_count = 0 # 存储爬取到的微博数 - self.weibo = [] # 存储爬取到的所有微博信息 - self.weibo_id_list = [] # 存储爬取到的所有微博id + + self.__recent = False def get_json(self, params): """获取网页中json数据""" @@ -103,10 +103,9 @@ def get_pics(self, weibo_info): if weibo_info.get('pics'): pic_info = weibo_info['pics'] pic_list = [pic['large']['url'] for pic in pic_info] - pics = ','.join(pic_list) else: - pics = '' - return pics + pic_list = [] + return pic_list def get_live_photo(self, weibo_info): """获取live photo中的视频url""" @@ -142,7 +141,7 @@ def get_video_url(self, weibo_info): live_photo_list = self.get_live_photo(weibo_info) if live_photo_list: video_url_list += live_photo_list - return ';'.join(video_url_list) + return video_url_list def get_location(self, selector): """获取微博发布位置""" @@ -205,20 +204,25 @@ def standardize_date(self, created_at): """标准化微博发布时间""" if u"刚刚" in created_at: created_at = datetime.now().strftime("%Y-%m-%d") + self.__recent = True elif u"分钟" in created_at: minute = created_at[:created_at.find(u"分钟")] minute = timedelta(minutes=int(minute)) created_at = (datetime.now() - minute).strftime("%Y-%m-%d") + self.__recent = True elif u"小时" in created_at: hour = created_at[:created_at.find(u"小时")] hour = timedelta(hours=int(hour)) created_at = (datetime.now() - hour).strftime("%Y-%m-%d") + self.__recent = False elif u"昨天" in created_at: day = timedelta(days=1) created_at = (datetime.now() - day).strftime("%Y-%m-%d") + self.__recent = False elif created_at.count('-') == 1: year = datetime.now().strftime("%Y") created_at = year + "-" + created_at + self.__recent = False return created_at def standardize_info(self, weibo): @@ -380,6 +384,7 @@ def get_one_weibo(self, info): return weibo except Exception as e: logger.exception(e) + self.__recent = False def get_latest_weibos(self): try: @@ -391,16 +396,11 @@ def get_latest_weibos(self): if w['card_type'] == 9: wb = self.get_one_weibo(w) if wb: - if wb['created_at'] != str(date.today()): - continue - if wb['id'] in self.weibo_id_list: + if not self.__recent: continue if (not self.filter) or ( 'retweet' not in wb.keys()): - self.weibo.append(wb) latest_weibos.append(wb) - self.weibo_id_list.append(wb['id']) - self.got_count += 1 self.print_weibo(wb) return latest_weibos @@ -416,14 +416,28 @@ def get_latest_weibos(self): print(config) wb = Weibo(config) service_name = config["service_name"] - subService = Service(service_name, enable_on_default=True) if service_name not in subr_dic: + subService = Service(service_name, enable_on_default=True) subr_dic[service_name] = {"service": subService, "spiders": [wb]} else: subr_dic[service_name]["spiders"].append(wb) -@sv.scheduled_job('interval', seconds=60 * 20) +def wb_to_message(wb): + msg = f'@{wb["screen_name"]}:\n{wb["text"]}' + if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0: + images_url = wb["pics"] + msg = f'{msg}\n' + res_imgs = [R.remote_img(url).cqcode for url in images_url] + for img in res_imgs: + msg = f'{msg}{img}' + if len(wb["video_url"]) > 0: + videos = wb["video_url"] + res_videos = ';'.join(videos) + msg = f'{msg}\n视频链接:{res_videos}' + return msg + +@sv.scheduled_job('interval', seconds=20*60) async def weibo_poller(): for sv_name, serviceObj in subr_dic.items(): weibos = [] @@ -431,7 +445,7 @@ async def weibo_poller(): spiders = serviceObj["spiders"] for spider in spiders: latest_weibos = spider.get_latest_weibos() - formatted_weibos = [wb["text"] for wb in latest_weibos] + formatted_weibos = [wb_to_message(wb) for wb in latest_weibos] if l := len(formatted_weibos): sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条") diff --git a/hoshino/res.py b/hoshino/res.py index 436e16959..a071bc219 100644 --- a/hoshino/res.py +++ b/hoshino/res.py @@ -1,5 +1,7 @@ import os from PIL import Image +import requests +from io import BytesIO from urllib.request import pathname2url from urllib.parse import urljoin @@ -19,7 +21,27 @@ def get(path, *paths): def img(path, *paths): return ResImg(os.path.join('img', path, *paths)) + @staticmethod + def remote_img(url): + return RemoteResImg(url) + +class RemoteResObj: + def __init__(self, url): + self.__path = url + + @property + def url(self): + return self.__path +class RemoteResImg(RemoteResObj): + @property + def cqcode(self) -> MessageSegment: + return MessageSegment.image(self.url) + + def open(self) -> Image: + response = requests.get(self.url) + return Image.open(BytesIO(response)) + class ResObj: From 2c0abae9c677237dd15b7149a678a3dabc5a1567 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Sat, 9 May 2020 02:36:43 +0800 Subject: [PATCH 03/12] Replace requests with httpx; small fix for code review --- hoshino/modules/weibo/weibo.py | 39 ++++++++++++++++++++-------------- hoshino/res.py | 3 ++- requirements.txt | 3 ++- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index eb4d19c16..79d48f0ac 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -9,13 +9,23 @@ from datetime import date, datetime, timedelta from time import sleep -import requests +import httpx as requests from lxml import etree from hoshino.service import Service, Privilege as Priv from hoshino import util, logger from hoshino.res import R sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) +user_configs = util.load_config(__file__) +''' +sample config.json + +[{ + "user_id": "6603867494", + "service_name": "pcr-weibo", + "filter": true +}] +''' class Weibo(object): def __init__(self, config): @@ -83,20 +93,18 @@ def get_user_info(self, user_id): def validate_config(self, config): """验证配置是否正确""" + exist_argument_list = ['user_id', 'service_name'] + true_false_argument_list = ['filter'] - # 验证filter、original_pic_download、retweet_pic_download、original_video_download、retweet_video_download - argument_list = [ - 'filter' - ] - for argument in argument_list: - if config[argument] != 0 and config[argument] != 1: - logger.error(u'%s值应为0或1,请重新输入' % config[argument]) + for argument in true_false_argument_list: + if argument not in config: + logger.error(f'请填写 {argument}') + if config[argument] != True and config[argument] != False: + logger.error(f'{argument} 值应为 True 或 False,请重新输入') - # 验证user_id_list - if "user_id" not in config: - logger.error(u'请填写用户 id') - if "service_name" not in config: - logger.error(u'请填写所属服务名') + for argument in exist_argument_list: + if argument not in config: + logger.error(f'请填写 {argument}') def get_pics(self, weibo_info): """获取微博原始图片url""" @@ -409,11 +417,10 @@ def get_latest_weibos(self): return [] -user_configs = util.load_config(__file__) subr_dic = {} for config in user_configs: - print(config) + sv.logger.debug(config) wb = Weibo(config) service_name = config["service_name"] @@ -437,7 +444,7 @@ def wb_to_message(wb): msg = f'{msg}\n视频链接:{res_videos}' return msg -@sv.scheduled_job('interval', seconds=20*60) +@sv.scheduled_job('interval', seconds=10) async def weibo_poller(): for sv_name, serviceObj in subr_dic.items(): weibos = [] diff --git a/hoshino/res.py b/hoshino/res.py index a071bc219..6803c6025 100644 --- a/hoshino/res.py +++ b/hoshino/res.py @@ -1,6 +1,7 @@ import os +import asyncio from PIL import Image -import requests +import httpx as requests from io import BytesIO from urllib.request import pathname2url from urllib.parse import urljoin diff --git a/requirements.txt b/requirements.txt index bccc5b948..e0907f59f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ zhconv>=1.4.0 Pillow>=6.2.1 TwitterAPI>=2.5.10 matplotlib>=3.2.0 -numpy>=1.18.0 \ No newline at end of file +numpy>=1.18.0 +httpx>=0.12.1 \ No newline at end of file From a7a1c350132740b8e766f93d640372788c74d371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Sat, 9 May 2020 03:46:02 +0800 Subject: [PATCH 04/12] Replace all sync requests with await/async --- hoshino/modules/weibo/__init__.py | 61 ++++++++++++ hoshino/modules/weibo/exception.py | 21 +++++ hoshino/modules/weibo/weibo.py | 143 +++++++++-------------------- hoshino/res.py | 10 +- 4 files changed, 133 insertions(+), 102 deletions(-) create mode 100644 hoshino/modules/weibo/__init__.py create mode 100644 hoshino/modules/weibo/exception.py diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py new file mode 100644 index 000000000..84a130984 --- /dev/null +++ b/hoshino/modules/weibo/__init__.py @@ -0,0 +1,61 @@ +from .weibo import WeiboSpider +from hoshino.service import Service, Privilege as Priv +from hoshino.res import R +from hoshino import util + +sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) +user_configs = util.load_config(__file__) +''' +sample config.json + +[{ + "user_id": "6603867494", + "service_name": "bcr-weibo", + "filter": true +}] +''' + +subr_dic = {} + +for config in user_configs: + sv.logger.debug(config) + wb_spider = WeiboSpider(config) + service_name = config["service_name"] + + if service_name not in subr_dic: + subService = Service(service_name, enable_on_default=True) + subr_dic[service_name] = {"service": subService, "spiders": [wb_spider]} + else: + subr_dic[service_name]["spiders"].append(wb_spider) + +def wb_to_message(wb): + msg = f'@{wb["screen_name"]}:\n{wb["text"]}' + if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0: + images_url = wb["pics"] + msg = f'{msg}\n' + res_imgs = [R.remote_img(url).cqcode for url in images_url] + for img in res_imgs: + msg = f'{msg}{img}' + if len(wb["video_url"]) > 0: + videos = wb["video_url"] + res_videos = ';'.join(videos) + msg = f'{msg}\n视频链接:{res_videos}' + return msg + +@sv.scheduled_job('interval', seconds=20*60) +async def weibo_poller(): + for sv_name, serviceObj in subr_dic.items(): + weibos = [] + ssv = serviceObj["service"] + spiders = serviceObj["spiders"] + for spider in spiders: + latest_weibos = await spider.get_latest_weibos() + formatted_weibos = [wb_to_message(wb) for wb in latest_weibos] + + if l := len(formatted_weibos): + sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条") + else: + sv.logger.info(f"未检测到@{spider.get_username()}的新微博") + + weibos.extend(formatted_weibos) + await ssv.broadcast(weibos, ssv.name, 0.5) \ No newline at end of file diff --git a/hoshino/modules/weibo/exception.py b/hoshino/modules/weibo/exception.py new file mode 100644 index 000000000..5d1fab491 --- /dev/null +++ b/hoshino/modules/weibo/exception.py @@ -0,0 +1,21 @@ +class WeiboError(Exception): + def __init__(self, msg, *msgs): + self._msgs = [msg, *msgs] + + def __str__(self): + return '\n'.join(self._msgs) + + @property + def message(self): + return str(self) + + def append(self, msg:str): + self._msgs.append(msg) + + +class ParseError(WeiboError): + pass + + +class NotFoundError(WeiboError): + pass \ No newline at end of file diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index 79d48f0ac..384ddc67e 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -1,51 +1,42 @@ # -*- coding: UTF-8 -*- import json -import os import random import sys -import traceback from collections import OrderedDict from datetime import date, datetime, timedelta from time import sleep -import httpx as requests +import httpx from lxml import etree -from hoshino.service import Service, Privilege as Priv -from hoshino import util, logger -from hoshino.res import R +from hoshino import logger +from .exception import * -sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) -user_configs = util.load_config(__file__) -''' -sample config.json - -[{ - "user_id": "6603867494", - "service_name": "pcr-weibo", - "filter": true -}] -''' - -class Weibo(object): +class WeiboSpider(object): def __init__(self, config): """Weibo类初始化""" self.validate_config(config) - self.filter = config['filter'] - self.user = self.get_user_info(config["user_id"]) - + self.filter = config['filter'] + self.user_id = config['user_id'] + self.user = self.get_user_info(self.user_id) self.__recent = False - def get_json(self, params): + async def get_json(self, params): """获取网页中json数据""" url = 'https://m.weibo.cn/api/container/getIndex?' - r = requests.get(url, params=params) + async with httpx.AsyncClient() as client: + r = await client.get(url, params=params) + return r.json() + + def sync_get_json(self, params): + url = 'https://m.weibo.cn/api/container/getIndex?' + r = httpx.get(url, params=params) return r.json() def get_user_info(self, user_id): """获取用户信息""" params = {'containerid': '100505' + str(user_id)} - js = self.get_json(params) + js = self.sync_get_json(params) if js['ok']: info = js['data']['userInfo'] user_info = OrderedDict() @@ -66,7 +57,7 @@ def get_user_info(self, user_id): ] for i in en_list: user_info[i] = '' - js = self.get_json(params) + js = self.sync_get_json(params) if js['ok']: cards = js['data']['cards'] if isinstance(cards, list) and len(cards) > 1: @@ -98,13 +89,13 @@ def validate_config(self, config): for argument in true_false_argument_list: if argument not in config: - logger.error(f'请填写 {argument}') + raise NotFoundError(f'未找到参数{argument}') if config[argument] != True and config[argument] != False: - logger.error(f'{argument} 值应为 True 或 False,请重新输入') + raise ParseError(f'{argument} 值应为 True 或 False') for argument in exist_argument_list: if argument not in config: - logger.error(f'请填写 {argument}') + raise NotFoundError(f'未找到参数{argument}') def get_pics(self, weibo_info): """获取微博原始图片url""" @@ -305,32 +296,34 @@ def get_username(self): return self.user["screen_name"] def get_user_id(self): - return self.user["id"] + return self.user_id - def get_weibo_json(self, page): + async def get_weibo_json(self, page): """获取网页中微博json数据""" params = { 'containerid': '107603' + self.get_user_id(), 'page': page } - js = self.get_json(params) + js = await self.get_json(params) return js - def get_long_weibo(self, id): + async def get_long_weibo(self, id): """获取长微博""" for i in range(5): url = 'https://m.weibo.cn/detail/%s' % id - html = requests.get(url).text - html = html[html.find('"status":'):] - html = html[:html.rfind('"hotScheme"')] - html = html[:html.rfind(',')] - html = '{' + html + '}' - js = json.loads(html, strict=False) - weibo_info = js.get('status') - if weibo_info: - weibo = self.parse_weibo(weibo_info) - return weibo - sleep(random.randint(6, 10)) + async with httpx.AsyncClient() as client: + html = await client.get(url) + html = html.text + html = html[html.find('"status":'):] + html = html[:html.rfind('"hotScheme"')] + html = html[:html.rfind(',')] + html = '{' + html + '}' + js = json.loads(html, strict=False) + weibo_info = js.get('status') + if weibo_info: + weibo = self.parse_weibo(weibo_info) + return weibo + sleep(random.randint(6, 10)) def print_user_info(self): """打印用户信息""" @@ -355,7 +348,7 @@ def print_user_info(self): logger.info(self.user['description']) logger.info('+' * 100) - def get_one_weibo(self, info): + async def get_one_weibo(self, info): """获取一条微博的全部信息""" try: weibo_info = info['mblog'] @@ -366,13 +359,13 @@ def get_one_weibo(self, info): retweet_id = retweeted_status.get('id') is_long_retweet = retweeted_status.get('isLongText') if is_long: - weibo = self.get_long_weibo(weibo_id) + weibo = await self.get_long_weibo(weibo_id) if not weibo: weibo = self.parse_weibo(weibo_info) else: weibo = self.parse_weibo(weibo_info) if is_long_retweet: - retweet = self.get_long_weibo(retweet_id) + retweet = await self.get_long_weibo(retweet_id) if not retweet: retweet = self.parse_weibo(retweeted_status) else: @@ -382,7 +375,7 @@ def get_one_weibo(self, info): weibo['retweet'] = retweet else: # 原创 if is_long: - weibo = self.get_long_weibo(weibo_id) + weibo = await self.get_long_weibo(weibo_id) if not weibo: weibo = self.parse_weibo(weibo_info) else: @@ -394,15 +387,15 @@ def get_one_weibo(self, info): logger.exception(e) self.__recent = False - def get_latest_weibos(self): + async def get_latest_weibos(self): try: latest_weibos = [] - js = self.get_weibo_json(1) + js = await self.get_weibo_json(1) if js['ok']: weibos = js['data']['cards'] for w in weibos: if w['card_type'] == 9: - wb = self.get_one_weibo(w) + wb = await self.get_one_weibo(w) if wb: if not self.__recent: continue @@ -414,50 +407,4 @@ def get_latest_weibos(self): return latest_weibos except Exception as e: logger.exception(e) - return [] - - -subr_dic = {} - -for config in user_configs: - sv.logger.debug(config) - wb = Weibo(config) - service_name = config["service_name"] - - if service_name not in subr_dic: - subService = Service(service_name, enable_on_default=True) - subr_dic[service_name] = {"service": subService, "spiders": [wb]} - else: - subr_dic[service_name]["spiders"].append(wb) - -def wb_to_message(wb): - msg = f'@{wb["screen_name"]}:\n{wb["text"]}' - if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0: - images_url = wb["pics"] - msg = f'{msg}\n' - res_imgs = [R.remote_img(url).cqcode for url in images_url] - for img in res_imgs: - msg = f'{msg}{img}' - if len(wb["video_url"]) > 0: - videos = wb["video_url"] - res_videos = ';'.join(videos) - msg = f'{msg}\n视频链接:{res_videos}' - return msg - -@sv.scheduled_job('interval', seconds=10) -async def weibo_poller(): - for sv_name, serviceObj in subr_dic.items(): - weibos = [] - ssv = serviceObj["service"] - spiders = serviceObj["spiders"] - for spider in spiders: - latest_weibos = spider.get_latest_weibos() - formatted_weibos = [wb_to_message(wb) for wb in latest_weibos] - - if l := len(formatted_weibos): - sv.logger.info(f"成功获取@{spider.get_username()}的新微博{l}条") - else: - sv.logger.info(f"未检测到@{spider.get_username()}的新微博") - - weibos.extend(formatted_weibos) - await ssv.broadcast(weibos, ssv.name, 0.5) \ No newline at end of file + return [] \ No newline at end of file diff --git a/hoshino/res.py b/hoshino/res.py index 6803c6025..d50fcd185 100644 --- a/hoshino/res.py +++ b/hoshino/res.py @@ -1,7 +1,7 @@ import os import asyncio from PIL import Image -import httpx as requests +import httpx from io import BytesIO from urllib.request import pathname2url from urllib.parse import urljoin @@ -39,9 +39,11 @@ class RemoteResImg(RemoteResObj): def cqcode(self) -> MessageSegment: return MessageSegment.image(self.url) - def open(self) -> Image: - response = requests.get(self.url) - return Image.open(BytesIO(response)) + async def open(self) -> Image: + async with httpx.AsyncClient() as client: + r = await client.get(self.url) + response = requests.get(self.url) + return Image.open(BytesIO(response)) class ResObj: From 89e740c87e02be78c6b1ee10226115b6d4cbd70d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Sat, 9 May 2020 13:25:52 +0800 Subject: [PATCH 05/12] Fix logic of latest weibo; modify the format of config.json --- hoshino/modules/weibo/__init__.py | 47 +++++++++++++++++++++---------- hoshino/modules/weibo/weibo.py | 9 +++++- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py index 84a130984..a76a925fc 100644 --- a/hoshino/modules/weibo/__init__.py +++ b/hoshino/modules/weibo/__init__.py @@ -3,30 +3,39 @@ from hoshino.res import R from hoshino import util -sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) -user_configs = util.load_config(__file__) ''' sample config.json [{ - "user_id": "6603867494", "service_name": "bcr-weibo", - "filter": true + "enable_on_default": true, + "users":[{ + "user_id": "6603867494", + "filter": true + }] + }] ''' +def _load_config(services_config): + for sv_config in services_config: + sv.logger.debug(sv_config) + service_name = sv_config["service_name"] + enable_on_default = sv_config.get("enable_on_default", False) + users_config = sv_config["users"] -subr_dic = {} + sv_spider_list = [] + for user_config in users_config: + wb_spider = WeiboSpider(user_config) + sv_spider_list.append(wb_spider) + + subService = Service(service_name, enable_on_default=enable_on_default) + subr_dic[service_name] = {"service": subService, "spiders": sv_spider_list} -for config in user_configs: - sv.logger.debug(config) - wb_spider = WeiboSpider(config) - service_name = config["service_name"] - if service_name not in subr_dic: - subService = Service(service_name, enable_on_default=True) - subr_dic[service_name] = {"service": subService, "spiders": [wb_spider]} - else: - subr_dic[service_name]["spiders"].append(wb_spider) +sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) +services_config = util.load_config(__file__) +subr_dic = {} +_load_config(services_config) def wb_to_message(wb): msg = f'@{wb["screen_name"]}:\n{wb["text"]}' @@ -58,4 +67,12 @@ async def weibo_poller(): sv.logger.info(f"未检测到@{spider.get_username()}的新微博") weibos.extend(formatted_weibos) - await ssv.broadcast(weibos, ssv.name, 0.5) \ No newline at end of file + await ssv.broadcast(weibos, ssv.name, 0.5) + +@sv.scheduled_job('interval', seconds=60*60*24) +async def clear_spider_buffer(): + sv.logger.info("Clearing weibo spider buffer...") + for sv_name, serviceObj in subr_dic.items(): + spiders = serviceObj["spiders"] + for spider in spiders: + spider.clear_buffer() \ No newline at end of file diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index 384ddc67e..6b22b0a5a 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -19,7 +19,11 @@ def __init__(self, config): self.filter = config['filter'] self.user_id = config['user_id'] self.user = self.get_user_info(self.user_id) + self.received_weibo_ids = [] self.__recent = False + + def clear_buffer(self): + self.received_weibo_ids.clear() async def get_json(self, params): """获取网页中json数据""" @@ -84,7 +88,7 @@ def get_user_info(self, user_id): def validate_config(self, config): """验证配置是否正确""" - exist_argument_list = ['user_id', 'service_name'] + exist_argument_list = ['user_id'] true_false_argument_list = ['filter'] for argument in true_false_argument_list: @@ -399,9 +403,12 @@ async def get_latest_weibos(self): if wb: if not self.__recent: continue + if wb["id"] in self.received_weibo_ids: + continue if (not self.filter) or ( 'retweet' not in wb.keys()): latest_weibos.append(wb) + self.received_weibo_ids.append(wb["id"]) self.print_weibo(wb) return latest_weibos From f27e3ccd7a29258f05690f55dfb8afd2648de7c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Sat, 9 May 2020 13:29:01 +0800 Subject: [PATCH 06/12] Small fix for async --- hoshino/modules/weibo/weibo.py | 3 ++- hoshino/res.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index 6b22b0a5a..9105f3b2a 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -3,6 +3,7 @@ import json import random import sys +import asyncio from collections import OrderedDict from datetime import date, datetime, timedelta from time import sleep @@ -327,7 +328,7 @@ async def get_long_weibo(self, id): if weibo_info: weibo = self.parse_weibo(weibo_info) return weibo - sleep(random.randint(6, 10)) + asyncio.sleep(random.randint(6, 10)) def print_user_info(self): """打印用户信息""" diff --git a/hoshino/res.py b/hoshino/res.py index d50fcd185..6aace99c5 100644 --- a/hoshino/res.py +++ b/hoshino/res.py @@ -42,8 +42,7 @@ def cqcode(self) -> MessageSegment: async def open(self) -> Image: async with httpx.AsyncClient() as client: r = await client.get(self.url) - response = requests.get(self.url) - return Image.open(BytesIO(response)) + return Image.open(BytesIO(r)) class ResObj: From b2d8595aec3d4c8000ae69110ff53450d28338e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Sat, 9 May 2020 19:35:16 +0800 Subject: [PATCH 07/12] Call async func in __init__ with asyncio --- hoshino/modules/weibo/weibo.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index 9105f3b2a..2dbf9cb70 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -19,29 +19,24 @@ def __init__(self, config): self.validate_config(config) self.filter = config['filter'] self.user_id = config['user_id'] - self.user = self.get_user_info(self.user_id) self.received_weibo_ids = [] self.__recent = False + asyncio.get_event_loop().run_until_complete(self._async_init()) + + async def _async_init(self): + self.user = await self.get_user_info(self.user_id) - def clear_buffer(self): - self.received_weibo_ids.clear() - async def get_json(self, params): """获取网页中json数据""" url = 'https://m.weibo.cn/api/container/getIndex?' async with httpx.AsyncClient() as client: - r = await client.get(url, params=params) + r = await client.get(url, params=params, timeout=10.0) # sometimes timeout return r.json() - def sync_get_json(self, params): - url = 'https://m.weibo.cn/api/container/getIndex?' - r = httpx.get(url, params=params) - return r.json() - - def get_user_info(self, user_id): + async def get_user_info(self, user_id): """获取用户信息""" params = {'containerid': '100505' + str(user_id)} - js = self.sync_get_json(params) + js = await self.get_json(params) if js['ok']: info = js['data']['userInfo'] user_info = OrderedDict() @@ -62,7 +57,7 @@ def get_user_info(self, user_id): ] for i in en_list: user_info[i] = '' - js = self.sync_get_json(params) + js = await self.get_json(params) if js['ok']: cards = js['data']['cards'] if isinstance(cards, list) and len(cards) > 1: @@ -87,6 +82,9 @@ def get_user_info(self, user_id): user = self.standardize_info(user_info) return user + def clear_buffer(self): + self.received_weibo_ids.clear() + def validate_config(self, config): """验证配置是否正确""" exist_argument_list = ['user_id'] From 125039971cae1a4cb94527743682188c4f763173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Mon, 11 May 2020 03:40:57 +0800 Subject: [PATCH 08/12] Handle retweet weibo --- hoshino/modules/weibo/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py index a76a925fc..ecc5dac18 100644 --- a/hoshino/modules/weibo/__init__.py +++ b/hoshino/modules/weibo/__init__.py @@ -38,7 +38,15 @@ def _load_config(services_config): _load_config(services_config) def wb_to_message(wb): - msg = f'@{wb["screen_name"]}:\n{wb["text"]}' + msg = f'@{wb["screen_name"]}' + if "retweet" in wb: + msg = f'{msg} 转发:\n{wb["text"]}\n======================' + wb = wb["retweet"] + else: + msg = f'{msg}:' + + msg = f'{msg}\n{wb["text"]}' + if sv.bot.config.IS_CQPRO and len(wb["pics"]) > 0: images_url = wb["pics"] msg = f'{msg}\n' From ebf0d71d99d2e2438bfb22fd03db68d3939e32b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Mon, 11 May 2020 16:08:03 +0800 Subject: [PATCH 09/12] Users can fetch the latest 5 weibos forwardly by alias --- hoshino/modules/weibo/__init__.py | 36 +++++++++++++++++++++++++++++- hoshino/modules/weibo/exception.py | 3 +++ hoshino/modules/weibo/weibo.py | 8 +++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py index ecc5dac18..b75083417 100644 --- a/hoshino/modules/weibo/__init__.py +++ b/hoshino/modules/weibo/__init__.py @@ -2,6 +2,7 @@ from hoshino.service import Service, Privilege as Priv from hoshino.res import R from hoshino import util +from .exception import * ''' sample config.json @@ -11,6 +12,7 @@ "enable_on_default": true, "users":[{ "user_id": "6603867494", + "alias": ["公主连接", "公主连结", "公主链接"], "filter": true }] @@ -21,20 +23,31 @@ def _load_config(services_config): sv.logger.debug(sv_config) service_name = sv_config["service_name"] enable_on_default = sv_config.get("enable_on_default", False) + users_config = sv_config["users"] sv_spider_list = [] for user_config in users_config: wb_spider = WeiboSpider(user_config) sv_spider_list.append(wb_spider) + alias_list = user_config.get("alias", []) + for alias in alias_list: + if alias in alias_dic: + raise DuplicateError(f"Alias {alias} is duplicate") + alias_dic[alias] = { + "service_name":service_name, + "user_id":wb_spider.get_user_id() + } subService = Service(service_name, enable_on_default=enable_on_default) subr_dic[service_name] = {"service": subService, "spiders": sv_spider_list} - + + sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) services_config = util.load_config(__file__) subr_dic = {} +alias_dic = {} _load_config(services_config) def wb_to_message(wb): @@ -59,6 +72,27 @@ def wb_to_message(wb): msg = f'{msg}\n视频链接:{res_videos}' return msg +# @bot 看微博 alias +@sv.on_command('看微博', only_to_me=True) +async def get_last_5_weibo(session): + alias = session.current_arg_text + if alias not in alias_dic: + await session.finish(f"未找到微博: {alias}") + return + service_name = alias_dic[alias]["service_name"] + user_id = alias_dic[alias]["user_id"] + + spiders = subr_dic[service_name]["spiders"] + for spider in spiders: + if spider.get_user_id() == user_id: + last_5_weibos = spider.get_last_5_weibos() + formatted_weibos = [wb_to_message(wb) for wb in last_5_weibos] + for wb in formatted_weibos: + await session.send(wb) + await session.finish(f"以上为 {alias} 的最新 {len(formatted_weibos)} 条微博") + return + await session.finish(f"未找到微博: {alias}") + @sv.scheduled_job('interval', seconds=20*60) async def weibo_poller(): for sv_name, serviceObj in subr_dic.items(): diff --git a/hoshino/modules/weibo/exception.py b/hoshino/modules/weibo/exception.py index 5d1fab491..bda1619b8 100644 --- a/hoshino/modules/weibo/exception.py +++ b/hoshino/modules/weibo/exception.py @@ -18,4 +18,7 @@ class ParseError(WeiboError): class NotFoundError(WeiboError): + pass + +class DuplicateError(WeiboError): pass \ No newline at end of file diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index 2dbf9cb70..a91a54280 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -20,6 +20,7 @@ def __init__(self, config): self.filter = config['filter'] self.user_id = config['user_id'] self.received_weibo_ids = [] + self.last_5_weibos = [] self.__recent = False asyncio.get_event_loop().run_until_complete(self._async_init()) @@ -301,6 +302,9 @@ def get_username(self): def get_user_id(self): return self.user_id + def get_last_5_weibos(self): + return self.last_5_weibos + async def get_weibo_json(self, page): """获取网页中微博json数据""" params = { @@ -406,6 +410,10 @@ async def get_latest_weibos(self): continue if (not self.filter) or ( 'retweet' not in wb.keys()): + if len(self.last_5_weibos) == 5: + self.last_5_weibos.pop(0) + self.last_5_weibos.append(wb) + latest_weibos.append(wb) self.received_weibo_ids.append(wb["id"]) self.print_weibo(wb) From f0f8941993b9d952377f8612bb16879d89249027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Thu, 21 May 2020 21:35:08 +0800 Subject: [PATCH 10/12] Fix parsing logic for url in weibo text --- hoshino/modules/weibo/__init__.py | 9 ++++++ hoshino/modules/weibo/weibo.py | 50 ++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py index b75083417..1b781b26e 100644 --- a/hoshino/modules/weibo/__init__.py +++ b/hoshino/modules/weibo/__init__.py @@ -18,6 +18,9 @@ }] ''' + +lmt = util.FreqLimiter(5) + def _load_config(services_config): for sv_config in services_config: sv.logger.debug(sv_config) @@ -70,11 +73,17 @@ def wb_to_message(wb): videos = wb["video_url"] res_videos = ';'.join(videos) msg = f'{msg}\n视频链接:{res_videos}' + return msg # @bot 看微博 alias @sv.on_command('看微博', only_to_me=True) async def get_last_5_weibo(session): + uid = session.ctx['user_id'] + if not lmt.check(uid): + session.finish('您查询得过于频繁,请稍等片刻', at_sender=True) + lmt.start_cd(uid) + alias = session.current_arg_text if alias not in alias_dic: await session.finish(f"未找到微博: {alias}") diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index a91a54280..65c8319f5 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -25,7 +25,10 @@ def __init__(self, config): asyncio.get_event_loop().run_until_complete(self._async_init()) async def _async_init(self): + self.__init = True self.user = await self.get_user_info(self.user_id) + await self.get_latest_weibos() + self.__init = False async def get_json(self, params): """获取网页中json数据""" @@ -158,16 +161,6 @@ def get_location(self, selector): break return location - def get_article_url(self, selector): - """获取微博中头条文章的url""" - article_url = '' - text = selector.xpath('string(.)') - if text.startswith(u'发布了头条文章'): - url = selector.xpath('//a/@data-url') - if url and url[0].startswith('http://t.cn'): - article_url = url[0] - return article_url - def get_topics(self, selector): """获取参与的微博话题""" span_list = selector.xpath("//span[@class='surl-text']") @@ -193,6 +186,27 @@ def get_at_users(self, selector): at_users = ','.join(at_list) return at_users + def get_text(self, text_body): + selector = etree.HTML(text_body) + url_lists = selector.xpath('//a[@data-url]/@data-url') + url_elems = selector.xpath('//a[@data-url]/span[@class="surl-text"]') + + ''' + Add the url of to the text of + For example: + + + + 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻 + + + replace 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻 + with 本地化笔记第三期——剧情活动排期调整及版本更新内容前瞻(http://t.cn/A622uDbW) + ''' + for i in range(0, len(url_lists)): + url_elems[i].text = f'{url_elems[i].text}({url_lists[i]})' + return selector.xpath('string(.)') + def string_to_int(self, string): """字符串转换为整数""" if isinstance(string, int): @@ -217,11 +231,17 @@ def standardize_date(self, created_at): hour = created_at[:created_at.find(u"小时")] hour = timedelta(hours=int(hour)) created_at = (datetime.now() - hour).strftime("%Y-%m-%d") - self.__recent = False + if self.__init: + self.__recent = True + else: + self.__recent = False elif u"昨天" in created_at: day = timedelta(days=1) created_at = (datetime.now() - day).strftime("%Y-%m-%d") - self.__recent = False + if self.__init: + self.__recent = True + else: + self.__recent = False elif created_at.count('-') == 1: year = datetime.now().strftime("%Y") created_at = year + "-" + created_at @@ -250,8 +270,10 @@ def parse_weibo(self, weibo_info): weibo['bid'] = weibo_info['bid'] text_body = weibo_info['text'] selector = etree.HTML(text_body) - weibo['text'] = etree.HTML(text_body).xpath('string(.)') - weibo['article_url'] = self.get_article_url(selector) + + + weibo['text'] = self.get_text(text_body) + weibo['pics'] = self.get_pics(weibo_info) weibo['video_url'] = self.get_video_url(weibo_info) weibo['location'] = self.get_location(selector) From c1dcef7330be571dc8cfabc06d69c7d2236bfdd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Thu, 21 May 2020 23:32:31 +0800 Subject: [PATCH 11/12] Add img of article to image list --- hoshino/modules/weibo/weibo.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index 65c8319f5..254d2924f 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -111,6 +111,12 @@ def get_pics(self, weibo_info): pic_list = [pic['large']['url'] for pic in pic_info] else: pic_list = [] + + """获取文章封面图片url""" + if 'page_info' in weibo_info and weibo_info['page_info']['type'] == 'article': + if 'page_pic' in weibo_info['page_info']: + pic_list.append(weibo_info['page_info']['page_pic']['url']) + return pic_list def get_live_photo(self, weibo_info): From 466eedf394743fd1baf52a2b9355563b572b0f76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=A7=81=E6=80=9D?= <5361064+zzbslayer@user.noreply.gitee.com> Date: Fri, 29 May 2020 03:07:40 +0800 Subject: [PATCH 12/12] Check weibo config from bot; other small fix --- hoshino/modules/weibo/__init__.py | 30 +++++++++++++++++++++++++----- hoshino/modules/weibo/weibo.py | 12 ++++++++++-- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/hoshino/modules/weibo/__init__.py b/hoshino/modules/weibo/__init__.py index 1b781b26e..9b4ac21d2 100644 --- a/hoshino/modules/weibo/__init__.py +++ b/hoshino/modules/weibo/__init__.py @@ -8,7 +8,7 @@ sample config.json [{ - "service_name": "bcr-weibo", + "service_name": "weibo-bcr", "enable_on_default": true, "users":[{ "user_id": "6603867494", @@ -47,7 +47,7 @@ def _load_config(services_config): -sv = Service('weibo-poller', use_priv=Priv.ADMIN, manage_priv=Priv.SUPERUSER, visible=False) +sv = Service('weibo-poller', manage_priv=Priv.SUPERUSER, visible=False) services_config = util.load_config(__file__) subr_dic = {} alias_dic = {} @@ -76,18 +76,38 @@ def wb_to_message(wb): return msg +weibo_url_prefix = "https://weibo.com/u" +@sv.on_command('weibo-config',aliases=('查看微博服务', '微博服务', '微博配置', '查看微博配置')) +async def weibo_config(session): + msg = '微博推送配置:服务名,别名,微博链接' + index = 1 + for service_config in services_config: + service_name = service_config['service_name'] + users_config = service_config['users'] + for user_config in users_config: + weibo_id = user_config['user_id'] + alias = user_config['alias'] + weibo_url = f'{weibo_url_prefix}/{weibo_id}' + msg = f'{msg}\n{index}. {service_name}, {alias}, {weibo_url}' + index+=1 + session.finish(msg) + + # @bot 看微博 alias @sv.on_command('看微博', only_to_me=True) async def get_last_5_weibo(session): uid = session.ctx['user_id'] if not lmt.check(uid): session.finish('您查询得过于频繁,请稍等片刻', at_sender=True) + return + lmt.start_cd(uid) alias = session.current_arg_text if alias not in alias_dic: - await session.finish(f"未找到微博: {alias}") + session.finish(f"未找到微博: {alias}") return + service_name = alias_dic[alias]["service_name"] user_id = alias_dic[alias]["user_id"] @@ -98,9 +118,9 @@ async def get_last_5_weibo(session): formatted_weibos = [wb_to_message(wb) for wb in last_5_weibos] for wb in formatted_weibos: await session.send(wb) - await session.finish(f"以上为 {alias} 的最新 {len(formatted_weibos)} 条微博") + session.finish(f"以上为 {alias} 的最新 {len(formatted_weibos)} 条微博") return - await session.finish(f"未找到微博: {alias}") + session.finish(f"未找到微博: {alias}") @sv.scheduled_job('interval', seconds=20*60) async def weibo_poller(): diff --git a/hoshino/modules/weibo/weibo.py b/hoshino/modules/weibo/weibo.py index 254d2924f..7d2e18183 100644 --- a/hoshino/modules/weibo/weibo.py +++ b/hoshino/modules/weibo/weibo.py @@ -87,7 +87,12 @@ async def get_user_info(self, user_id): return user def clear_buffer(self): - self.received_weibo_ids.clear() + """ + 如果清理缓存前一分钟,该微博账号瞬间发送了 20 条微博 + 然后清理缓存仅仅保留后 10 条的微博id,因此可能会重复推送前 10 条微博 + 当然这种情况通常不会发生 + """ + self.received_weibo_ids = self.received_weibo_ids[-10:] def validate_config(self, config): """验证配置是否正确""" @@ -251,7 +256,10 @@ def standardize_date(self, created_at): elif created_at.count('-') == 1: year = datetime.now().strftime("%Y") created_at = year + "-" + created_at - self.__recent = False + if self.__init: + self.__recent = True + else: + self.__recent = False return created_at def standardize_info(self, weibo):