1212from bs4 import BeautifulSoup
1313from tqdm import tqdm
1414
15+ from common .utils import ts_date
1516from databases .base import Mongo
1617from databases .douban import Douban
1718
@@ -53,7 +54,7 @@ def __init__(self):
5354 self .session = requests .Session ()
5455 self .session .headers .update (
5556 {
56- "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64 ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80 .0.3987.163 Safari/537.36"
57+ "User-Agent" : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7 ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/147 .0.0.0 Safari/537.36"
5758 }
5859 )
5960
@@ -218,17 +219,25 @@ def insert_data(self, data):
218219
219220
220221def sync_douban ():
222+ MAX_FAILS = 3
221223 douban = Douban ()
222- session = requests .Session ()
223- ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4280.88 Safari/537.36"
224- session .headers .update ({"User-Agent" : ua })
225224
225+ failed_ids = douban .db ["douban_failed" ].distinct ("resourceId" , {"fail_count" : {"$gte" : MAX_FAILS }})
226226 yyets_data = douban .db ["yyets" ].aggregate (
227227 [
228+ {
229+ "$match" : {
230+ "data.info.id" : {
231+ "$ne" : 233 ,
232+ "$nin" : failed_ids ,
233+ }
234+ }
235+ },
228236 {"$group" : {"_id" : None , "ids" : {"$push" : "$data.info.id" }}},
229237 {"$project" : {"_id" : 0 , "ids" : 1 }},
230238 ]
231239 )
240+
232241 douban_data = douban .db ["douban" ].aggregate (
233242 [
234243 {"$group" : {"_id" : None , "ids" : {"$push" : "$resourceId" }}},
@@ -239,19 +248,28 @@ def sync_douban():
239248 id1 = next (yyets_data )["ids" ]
240249 id2 = next (douban_data )["ids" ]
241250 rids = list (set (id1 ).difference (id2 ))
242- rids .remove (233 )
243251 logging .info ("resource id complete %d" , len (rids ))
252+ # rids = [33439, 26421]
244253 for rid in tqdm (rids ):
245- with contextlib . suppress ( Exception ) :
254+ try :
246255 d = douban .find_douban (rid )
247256 logging .info ("Processed %s, length %d" , rid , len (d ))
257+ # Clear any failure record after a success, in case this id failed earlier but succeeds now
258+ douban .db ["douban_failed" ].delete_one ({"resourceId" : rid })
259+
260+ except Exception :
261+ logging .exception ("Failed to process %s" , rid )
262+ douban .db ["douban_failed" ].update_one (
263+ {"resourceId" : rid },
264+ {
265+ "$inc" : {"fail_count" : 1 },
266+ "$set" : {"last_failed_at" : ts_date ()},
267+ },
268+ upsert = True ,
269+ )
248270
249271 logging .info ("ALL FINISH!" )
250272
251273
252274if __name__ == "__main__" :
253- a = Zhuixinfan ()
254- # a.build_data(open("1.html").read(), "https://www.zhuixinfan.com/resource/1.html")
255- a .run ()
256- # b = YYSub()
257- # b.run()
275+ sync_douban ()
0 commit comments