88import urllib .request
99from functools import reduce
1010import oss2
11+ from bs4 import BeautifulSoup
1112from aliyunsdkcore .client import AcsClient
1213from aliyunsdkcore .request import CommonRequest
1314from aliyunsdkcore .auth .credentials import AccessKeyCredential
2223# A trailing `?` makes the quantifier lazy: it matches as few characters as possible
2324# ((?:/[a-zA-Z0-9.]*?)*)
2425# ((?:/[a-zA-Z0-9.]*)*?)
25- REG_URL = r'^(https?://|//)?((?:[a-zA-Z0-9-_]+\.)+(?:[a-zA-Z0-9-:]+))((?:/[-.a-zA-Z0-9 ]*?)*)((?<=/)[-_a-zA-Z0-9]+(?:\.([a-zA-Z0-9]+))+)?((?:\?[a-zA-Z0-9%&=]*)*)$'
26+ REG_URL = r'^(https?://|//)?((?:[a-zA-Z0-9-_]+\.)+(?:[a-zA-Z0-9-_ :]+))((?:/[-.a-zA-Z0-9_ ]*?)*)((?<=/)[-_a-zA-Z0-9]+(?:\.([a-zA-Z0-9]+))+)?((?:\?[a-zA-Z0-9%&=]*)*)$'
2627REG_RESOURCE_TYPE = r'(?:href|src|data\-original|data\-src)=["\'](.+?\.(?:html|htm|shtml|js|css|jpg|jpeg|png|gif|svg|ico|ttf|woff2|asp|jsp|php|perl|cgi))[a-zA-Z0-9\?\=\.]*["\']'
2728
2829reg_url = re .compile (REG_URL )
3132# Record what has already been downloaded so nothing is fetched twice
3233downloaded_list = []
3334
def valild_resource(resource):
    """Return True when *resource* is a usable link value.

    A value is rejected when it is ``None`` (the attribute is absent), an
    empty string, or the bare root path ``'/'``; anything else is accepted.

    Args:
        resource: Attribute value taken from a tag, or ``None``.

    Returns:
        ``True`` if the value should be kept as a resource link, else
        ``False``.
    """
    return resource is not None and resource not in ("", "/")
39+
3440'''
3541解析URL地址
3642'''
37-
38-
3943def parse_url (url ):
4044 if not url :
4145 return
@@ -64,34 +68,48 @@ def parse_url(url):
6468'''
6569解析URL 页面
6670'''
67-
68-
def parse_page(page_path):
    """Collect the unique resource links referenced by a saved HTML page.

    Every tag in the document is inspected for the ``href``, ``src``,
    ``data-src`` and ``data-original`` attributes, and each value accepted
    by ``valild_resource`` is collected.

    Args:
        page_path: Path of a local ``.html`` / ``.htm`` / ``.shtml`` file.

    Returns:
        A list of unique link strings in first-seen document order, or
        ``None`` when the path has a different extension or does not exist.
    """
    if not page_path.endswith(('.html', '.htm', '.shtml')) \
            or not os.path.exists(page_path):
        LOGGER.error('> %s 网站内容不读取' % (page_path))
        return

    # Undecodable bytes are ignored so a badly encoded page is still parsed.
    with open(page_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()
    LOGGER.info('> %s 网站内容读取完毕,内容长度:%d' % (page_path, len(content)))

    # Attributes that may carry a resource URL, checked in this order per tag.
    link_attrs = ('href', 'src', 'data-src', 'data-original')

    resource_list = []
    soup = BeautifulSoup(content, 'html.parser')
    for tag in soup.find_all():
        for attr in link_attrs:
            value = tag.get(attr)
            if valild_resource(value):
                resource_list.append(value)
                LOGGER.info('> %s 网站链接读取完成' % (value))

    # De-duplicate while keeping document order, so the caller processes
    # resources in the same order on every run (a plain set would not).
    return list(dict.fromkeys(resource_list))
88108
89109
90110'''
91111下载文件
92112'''
93-
94-
95113def download_file (src_url , dist_path ):
96114 try :
97115 response = urllib .request .urlopen (src_url )
@@ -115,8 +133,6 @@ def download_file(src_url, dist_path):
115133'''
116134解析和备份URL 相关静态资源
117135'''
118-
119-
120136def parse_and_download_page (url , level ):
121137 global downloaded_list
122138 global max_level
@@ -155,7 +171,6 @@ def parse_and_download_page(url, level):
155171 resource_list = parse_page (page_path )
156172 if not resource_list :
157173 return
158-
159174 # 下载资源,要区分目录,不存在的话就创建
160175 for resource_url in resource_list :
161176 # ../js/js
@@ -194,7 +209,10 @@ def parse_and_download_page(url, level):
194209 continue
195210
196211 resource_dir = domain_dir + resource_url_dict ['path' ]
197- resource_path = resource_dir + resource_url_dict ['file_name' ]
212+ if resource_url_dict ['file_name' ] is None :
213+ resource_path = resource_dir + 'index.html'
214+ else :
215+ resource_path = resource_dir + resource_url_dict ['file_name' ]
198216
199217 # 已经下载过的内容忽略
200218 if resource_path in downloaded_list :
0 commit comments