Skip to content

Commit 3f20f31

Browse files
authored
Merge pull request #1 from devsapp/switch-to-scrapy
Switch to scrapy
2 parents 0e0fcf7 + 7b3e63d commit 3f20f31

4 files changed

Lines changed: 40 additions & 18 deletions

File tree

publish.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ Type: Application
22
Name: cdn-backup-origin
33
Provider:
44
- 阿里云
5-
Version: 0.1.2
5+
Version: 0.1.3
66
Description: 快速部署CDN备份源站项目到函数计算
77
HomePage: https://github.com/devsapp/cdn-backup-origin
88
Tags:

src/code/index.py

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import urllib.request
99
from functools import reduce
1010
import oss2
11+
from bs4 import BeautifulSoup
1112
from aliyunsdkcore.client import AcsClient
1213
from aliyunsdkcore.request import CommonRequest
1314
from aliyunsdkcore.auth.credentials import AccessKeyCredential
@@ -22,7 +23,7 @@
2223
# ?表示懒惰匹配,尽可能匹配少的字符
2324
# ((?:/[a-zA-Z0-9.]*?)*)
2425
# ((?:/[a-zA-Z0-9.]*)*?)
25-
REG_URL = r'^(https?://|//)?((?:[a-zA-Z0-9-_]+\.)+(?:[a-zA-Z0-9-:]+))((?:/[-.a-zA-Z0-9]*?)*)((?<=/)[-_a-zA-Z0-9]+(?:\.([a-zA-Z0-9]+))+)?((?:\?[a-zA-Z0-9%&=]*)*)$'
26+
REG_URL = r'^(https?://|//)?((?:[a-zA-Z0-9-_]+\.)+(?:[a-zA-Z0-9-_:]+))((?:/[-.a-zA-Z0-9_]*?)*)((?<=/)[-_a-zA-Z0-9]+(?:\.([a-zA-Z0-9]+))+)?((?:\?[a-zA-Z0-9%&=]*)*)$'
2627
REG_RESOURCE_TYPE = r'(?:href|src|data\-original|data\-src)=["\'](.+?\.(?:html|htm|shtml|js|css|jpg|jpeg|png|gif|svg|ico|ttf|woff2|asp|jsp|php|perl|cgi))[a-zA-Z0-9\?\=\.]*["\']'
2728

2829
reg_url = re.compile(REG_URL)
@@ -31,11 +32,14 @@
3132
# 去重,避免重复下载
3233
downloaded_list = []
3334

35+
def valild_resource(resource):
    """Return True when *resource* is a usable link value.

    A value is rejected when it is ``None``, the empty string, or the bare
    root path ``'/'``; every other value is accepted.

    NOTE: the function name is misspelled ("valild"), but it is kept as-is
    because other code calls it by this name.
    """
    return resource is not None and resource != "" and resource != '/'
39+
3440
'''
3541
解析URL地址
3642
'''
37-
38-
3943
def parse_url(url):
4044
if not url:
4145
return
@@ -64,34 +68,48 @@ def parse_url(url):
6468
'''
6569
解析URL 页面
6670
'''
67-
68-
6971
def parse_page(page_path):
    """Collect the resource links referenced by a saved HTML page.

    Args:
        page_path: Path of a local file. It is only parsed when the path
            ends with ``.html``, ``.htm`` or ``.shtml`` and the file exists.

    Returns:
        ``None`` when the file is skipped (wrong suffix or missing file).
        Otherwise a list of the distinct ``href``, ``src``, ``data-src`` and
        ``data-original`` attribute values found on any tag, in the order
        they first appear in the document.
    """
    if not page_path.endswith(('.html', '.htm', '.shtml')) \
            or not os.path.exists(page_path):
        LOGGER.error('> %s 网站内容不读取' % (page_path))
        return

    # Read the whole page first so the file handle is released before parsing.
    with open(page_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    LOGGER.info('> %s 网站内容读取完毕,内容长度:%d' % (page_path, len(content)))

    # Attributes that may carry a resource URL, checked in this order per tag.
    link_attributes = ('href', 'src', 'data-src', 'data-original')

    resource_list = []
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all():
        for attribute in link_attributes:
            value = tag.get(attribute)
            if valild_resource(value):
                resource_list.append(value)
                LOGGER.info('> %s 网站链接读取完成' % (value))

    # De-duplicate while keeping first-seen order, so downloads happen in a
    # stable, document-driven order (a plain set() would scramble it).
    return list(dict.fromkeys(resource_list))
88108

89109

90110
'''
91111
下载文件
92112
'''
93-
94-
95113
def download_file(src_url, dist_path):
96114
try:
97115
response = urllib.request.urlopen(src_url)
@@ -115,8 +133,6 @@ def download_file(src_url, dist_path):
115133
'''
116134
解析和备份URL 相关静态资源
117135
'''
118-
119-
120136
def parse_and_download_page(url, level):
121137
global downloaded_list
122138
global max_level
@@ -155,7 +171,6 @@ def parse_and_download_page(url, level):
155171
resource_list = parse_page(page_path)
156172
if not resource_list:
157173
return
158-
159174
# 下载资源,要区分目录,不存在的话就创建
160175
for resource_url in resource_list:
161176
# ../js/js
@@ -194,7 +209,10 @@ def parse_and_download_page(url, level):
194209
continue
195210

196211
resource_dir = domain_dir + resource_url_dict['path']
197-
resource_path = resource_dir + resource_url_dict['file_name']
212+
if resource_url_dict['file_name'] is None:
213+
resource_path = resource_dir + 'index.html'
214+
else:
215+
resource_path = resource_dir + resource_url_dict['file_name']
198216

199217
# 已经下载过的内容忽略
200218
if resource_path in downloaded_list:

src/code/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
bs4

src/s.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ vars:
1818
services:
1919
cron_cdn_backup:
2020
component: devsapp/fc
21+
actions: # 自定义执行逻辑,关于actions 的使用,可以参考:https://www.serverless-devs.com/serverless-devs/yaml#行为描述
22+
pre-deploy: # 在deploy之前运行
23+
- component: fc build --use-docker
2124
props:
2225
region: ${vars.region}
2326
service: ${vars.service}

0 commit comments

Comments
 (0)