Skip to content

Commit 99e7975

Browse files
committed
Blerb.
1 parent 9702a2e commit 99e7975

10 files changed

Lines changed: 182 additions & 12 deletions

WebMirror/UrlUpserter.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -222,24 +222,24 @@ def do_link_batch_update_sess(logger, interface, link_batch, max_pri=None, show_
222222

223223
for item in link_batch:
224224
try:
225-
assert 'url' in item
226-
assert 'starturl' in item
227-
assert 'netloc' in item
228-
assert 'distance' in item
229-
assert 'is_text' in item
230-
assert 'priority' in item
231-
assert 'type' in item
232-
assert 'addtime' in item
233-
assert 'state' in item
234-
assert 'epoch' in item
225+
assert 'url' in item, "Missing key 'url'"
226+
assert 'starturl' in item, "Missing key 'starturl'"
227+
assert 'netloc' in item, "Missing key 'netloc'"
228+
assert 'distance' in item, "Missing key 'distance'"
229+
assert 'is_text' in item, "Missing key 'is_text'"
230+
assert 'priority' in item, "Missing key 'priority'"
231+
assert 'type' in item, "Missing key 'type'"
232+
assert 'addtime' in item, "Missing key 'addtime'"
233+
assert 'state' in item, "Missing key 'state'"
234+
assert 'epoch' in item, "Missing key 'epoch'"
235235

236236
if not 'maximum_priority' in item:
237237
item['maximum_priority'] = item['priority']
238238

239239
if item['distance'] < item['maximum_priority']:
240240
item['distance'] = item['maximum_priority']
241241

242-
assert 'maximum_priority' in item
242+
assert 'maximum_priority' in item, "Missing key 'maximum_priority'"
243243

244244
# psycopg2cffi._impl.exceptions.OperationalError: index row size 3192 exceeds maximum 2712 for index "ix_web_pages_url"
245245
assert len(item['url']) < 2712, "URL Too long for postgres. Length %s for url '%s'" % (
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
def extractChococatsId(item):
    '''
    Parser for 'chococats.id'

    Returns None when the title yields neither a chapter nor a volume, or
    when it contains "preview" (case-insensitive). Otherwise returns the
    release message built for the first tag-map entry whose tag is present
    in item['tags'], or False when no entry matches.
    '''
    title = item['title']
    vol, chp, frag, postfix = extractVolChapterFragmentPostfix(title)

    # Nothing to release without a chapter or volume number.
    if not chp and not vol:
        return None
    # Preview posts are skipped.
    if "preview" in title.lower():
        return None

    # (tag on the post, series name, translation type)
    tagmap = (
        ('PRC',       'PRC',       'translated'),
        ('Loiterous', 'Loiterous', 'oel'),
    )

    post_tags = item['tags']
    for tagname, name, tl_type in tagmap:
        if tagname not in post_tags:
            continue
        return buildReleaseMessageWithType(item, name, vol, chp, frag=frag, postfix=postfix, tl_type=tl_type)

    return False
22+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
def extractChococatshomeWpcomstagingCom(item):
    '''
    Parser for 'chococatshome.wpcomstaging.com'

    Returns None when the title yields neither a chapter nor a volume, or
    when it contains "preview" (case-insensitive). Otherwise returns the
    release message built for the first tag-map entry whose tag is present
    in item['tags'], or False when no entry matches.
    '''
    title = item['title']
    vol, chp, frag, postfix = extractVolChapterFragmentPostfix(title)

    # Nothing to release without a chapter or volume number.
    if not chp and not vol:
        return None
    # Preview posts are skipped.
    if "preview" in title.lower():
        return None

    # (tag on the post, series name, translation type)
    tagmap = (
        ('PRC',       'PRC',       'translated'),
        ('Loiterous', 'Loiterous', 'oel'),
    )

    post_tags = item['tags']
    for tagname, name, tl_type in tagmap:
        if tagname not in post_tags:
            continue
        return buildReleaseMessageWithType(item, name, vol, chp, frag=frag, postfix=postfix, tl_type=tl_type)

    return False
22+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
def extractFkshsbfWordpressCom(item):
    '''
    Parser for 'fkshsbf.wordpress.com'

    Returns None when the title yields neither a chapter nor a volume, or
    when it contains "preview" (case-insensitive). Otherwise returns the
    release message built for the first tag-map entry whose tag is present
    in item['tags'], or False when no entry matches.
    '''
    title = item['title']
    vol, chp, frag, postfix = extractVolChapterFragmentPostfix(title)

    # Nothing to release without a chapter or volume number.
    if not chp and not vol:
        return None
    # Preview posts are skipped.
    if "preview" in title.lower():
        return None

    # (tag on the post, series name, translation type)
    tagmap = (
        ('PRC',       'PRC',       'translated'),
        ('Loiterous', 'Loiterous', 'oel'),
    )

    post_tags = item['tags']
    for tagname, name, tl_type in tagmap:
        if tagname not in post_tags:
            continue
        return buildReleaseMessageWithType(item, name, vol, chp, frag=frag, postfix=postfix, tl_type=tl_type)

    return False
22+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
def extractGingersfairyWixsiteCom(item):
    '''
    Parser for 'gingersfairy.wixsite.com'

    Returns None when the title yields neither a chapter nor a volume, or
    when it contains "preview" (case-insensitive). Otherwise returns the
    release message built for the first tag-map entry whose tag is present
    in item['tags'], or False when no entry matches.
    '''
    title = item['title']
    vol, chp, frag, postfix = extractVolChapterFragmentPostfix(title)

    # Nothing to release without a chapter or volume number.
    if not chp and not vol:
        return None
    # Preview posts are skipped.
    if "preview" in title.lower():
        return None

    # (tag on the post, series name, translation type)
    tagmap = (
        ('PRC',       'PRC',       'translated'),
        ('Loiterous', 'Loiterous', 'oel'),
    )

    post_tags = item['tags']
    for tagname, name, tl_type in tagmap:
        if tagname not in post_tags:
            continue
        return buildReleaseMessageWithType(item, name, vol, chp, frag=frag, postfix=postfix, tl_type=tl_type)

    return False
22+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
def extractNightwaterBar(item):
    '''
    Parser for 'nightwater.bar'

    Returns None when the title yields neither a chapter nor a volume, or
    when it contains "preview" (case-insensitive). Otherwise returns the
    release message built for the first tag-map entry whose tag is present
    in item['tags'], or False when no entry matches.
    '''
    title = item['title']
    vol, chp, frag, postfix = extractVolChapterFragmentPostfix(title)

    # Nothing to release without a chapter or volume number.
    if not chp and not vol:
        return None
    # Preview posts are skipped.
    if "preview" in title.lower():
        return None

    # (tag on the post, series name, translation type)
    tagmap = (
        ('PRC',       'PRC',       'translated'),
        ('Loiterous', 'Loiterous', 'oel'),
    )

    post_tags = item['tags']
    for tagname, name, tl_type in tagmap:
        if tagname not in post_tags:
            continue
        return buildReleaseMessageWithType(item, name, vol, chp, frag=frag, postfix=postfix, tl_type=tl_type)

    return False
22+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
def extractRainytranslationsWpcomstagingCom(item):
    '''
    Parser for 'rainytranslations.wpcomstaging.com'

    Returns None when the title yields neither a chapter nor a volume, or
    when it contains "preview" (case-insensitive). Otherwise returns the
    release message built for the first tag-map entry whose tag is present
    in item['tags'], or False when no entry matches.
    '''
    title = item['title']
    vol, chp, frag, postfix = extractVolChapterFragmentPostfix(title)

    # Nothing to release without a chapter or volume number.
    if not chp and not vol:
        return None
    # Preview posts are skipped.
    if "preview" in title.lower():
        return None

    # (tag on the post, series name, translation type)
    tagmap = (
        ('PRC',       'PRC',       'translated'),
        ('Loiterous', 'Loiterous', 'oel'),
    )

    post_tags = item['tags']
    for tagname, name, tl_type in tagmap:
        if tagname not in post_tags:
            continue
        return buildReleaseMessageWithType(item, name, vol, chp, frag=frag, postfix=postfix, tl_type=tl_type)

    return False
22+
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
2+
def extractSassystrawberryCom(item):
    '''
    Parser for 'sassystrawberry.com'

    Returns None when the title yields neither a chapter nor a volume, or
    when it contains "preview" (case-insensitive). Otherwise returns the
    release message built for the first tag-map entry whose tag is present
    in item['tags'], or False when no entry matches.
    '''
    title = item['title']
    vol, chp, frag, postfix = extractVolChapterFragmentPostfix(title)

    # Nothing to release without a chapter or volume number.
    if not chp and not vol:
        return None
    # Preview posts are skipped.
    if "preview" in title.lower():
        return None

    # (tag on the post, series name, translation type)
    tagmap = (
        ('PRC',       'PRC',       'translated'),
        ('Loiterous', 'Loiterous', 'oel'),
    )

    post_tags = item['tags']
    for tagname, name, tl_type in tagmap:
        if tagname not in post_tags:
            continue
        return buildReleaseMessageWithType(item, name, vol, chp, frag=frag, postfix=postfix, tl_type=tl_type)

    return False
22+

common/util/urlFuncs.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,13 @@ def cleanUrl(urlin):
327327
if 'z' in qs:
328328
urlin = qs['z'][0]
329329

330+
if urlin.startswith("https://href.li/?"):
331+
# Seems to just blob the url after the '?'
332+
urlin = urlin.split("?", 1)[-1]
333+
334+
335+
assert urlin != None
336+
330337
# RSS garbage
331338
if '#utm_source=' in urlin:
332339
urlin = urlin.split("#utm_source=")[0]
@@ -336,6 +343,9 @@ def cleanUrl(urlin):
336343

337344
ret = unwrap_redirect(urlin, resolve_redirects)
338345

346+
if ret is None:
347+
return ret
348+
339349
# I hate feedburner
340350
if '?utm_source=' in ret:
341351
ret = ret.split("?utm_source=")[0]
@@ -551,6 +561,10 @@ def urlClean(url):
551561
# Google docs can be accessed with or without the '/preview' postfix
552562
# We want to remove this if it's present, so we don't duplicate content.
553563
url = trimGDocUrl(url)
564+
565+
566+
assert url != None
567+
554568
url = cleanUrl(url)
555569

556570
if url is None:
@@ -579,7 +593,8 @@ def getNetLoc(url):
579593
# print(isGFileUrl('https://drive.google.com/folderview?id=0B_mXfd95yvDfQWQ1ajNWZTJFRkk&usp=drive_web'))
580594
# print(urlClean('http://inmydaydreams.com/?p=6128&share=tumblr'))
581595
# print(urlClean('http://inmydaydreams.com/?p=6091&share=tumblr'))
582-
print(urlClean('https://www.tumblr.com/privacy/consent?redirect=https%3a%2f%2ffoxghost.tumblr.com%2fpost%2f190325087867'))
596+
print(urlClean('https://href.li/?https://woopread.com/series/your-meaning/'))
597+
# print(urlClean('https://www.tumblr.com/privacy/consent?redirect=https%3a%2f%2ffoxghost.tumblr.com%2fpost%2f190325087867'))
583598

584599
# print(hasDuplicateSegments('http://www.spcnet.tv/forums/showthread.php/23450-i-ve-decided-to-learn-chinese/index/images/misc/image.php?s=1129386e978631b0771a226dba5a82e5&u=65&dateline=1358455669'))
585600
# print(hasDuplicateSegments('http://inmydaydreams.com/?p=6091&share=tumblr'))

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ git+https://github.com/kvesteri/sqlalchemy-continuum.git
6767

6868
# Probably requires
6969
# sudo apt-get install libtiff5-dev libjpeg8-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev libharfbuzz-dev libfribidi-dev
70+
7071
pillow==8.4.0
7172

7273
# probably requires `sudo apt install libxml2-dev libxslt1-dev`

0 commit comments

Comments
 (0)