Skip to content

Commit 68c6d66

Browse files
committed
💥 Wikipedia: using plaintext excerpts
1 parent 1f8cc95 commit 68c6d66

6 files changed

Lines changed: 55 additions & 30 deletions

File tree

CHANGES.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
# Release history
22

3+
## Unreleased
4+
5+
- `wikipedia`: Using non-deprecated API entrypoint
6+
- 💥 `wikipedia`: Using plain text description
7+
- 🐛 `wikipedia`: Fix: the lemma was URL-encoded again although it was already URL-encoded
8+
- ✅ Using a different source for dummy images
9+
310
## markdown-customblocks 1.5.3 (2022-12-20)
411

512
- map: fix: Geocoding failed because the service now requires a user agent

customblocks/generators.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,6 @@ def map(ctx, location=None, marker=True, *args, **kwds):
349349
'User-Agent': 'markdown-customblocks',
350350
})
351351
if not query.ok:
352-
print(dir(query))
353352
warnings.warn(f"Error {query}")
354353
return E('.error',
355354
f'Error geolocating {location}',

customblocks/generators_test.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -897,15 +897,10 @@ def test_wikipedia(self):
897897
::: wikipedia "Sant Joan Despí"
898898
""","""\
899899
<div class="linkcard wikipedia">
900-
<div class="linkcard-featured-image side">
901-
<a href="https://en.wikipedia.org/wiki/Sant Joan Despí" target="_blank">
902-
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/Torre_de_la_Creu.JPG/1200px-Torre_de_la_Creu.JPG" />
903-
</a>
904-
</div>
905900
<p class="linkcard-heading"><a href="https://en.wikipedia.org/wiki/Sant Joan Despí" target="_blank">Sant Joan Despí - Wikipedia</a></p>
906901
<div class="linkcard-excerpt">
907902
<p>
908-
<p><span title="Old Catalan-language text"><span lang="ca"><b>Sant Joan Despí</b></span></span> (Old Catalan for 'Saint John of the Pine'; <span>Catalan pronunciation:</span> <span lang="ca-Latn-fonipa">[ˈsaɲ<span> </span>ʒuˈan<span> </span>dəsˈpi]</span>) is a city and municipality located in the Baix Llobregat area (Barcelona province in Catalonia, Spain). It is situated on the left bank of the Llobregat river.</p>
903+
Sant Joan Despí (Old Catalan for 'Saint John of the Pine'; Catalan pronunciation: [ˈsaɲ ʒuˈan dəsˈpi]) is a city and municipality located in the Baix Llobregat area (Barcelona province in Catalonia, Spain). It is situated on the left bank of the Llobregat river.
909904
</p>
910905
<span class="linkcard-more"><a href="https://en.wikipedia.org/wiki/Sant Joan Despí" target="_blank">Read more</a></span>
911906
</div>
@@ -925,7 +920,7 @@ def test_wikipedia(self):
925920
def test_wikipedia_lang(self):
926921
self.assertMarkdown("""
927922
::: wikipedia "Sant Joan Despí" lang=ca
928-
""","""\
923+
""", """\
929924
<div class="linkcard wikipedia">
930925
<div class="linkcard-featured-image side">
931926
<a href="https://ca.wikipedia.org/wiki/Sant Joan Despí" target="_blank">
@@ -935,7 +930,7 @@ def test_wikipedia_lang(self):
935930
<p class="linkcard-heading"><a href="https://ca.wikipedia.org/wiki/Sant Joan Despí" target="_blank">Sant Joan Despí - Viquipèdia, l'enciclopèdia lliure</a></p>
936931
<div class="linkcard-excerpt">
937932
<p>
938-
<p><b>Sant Joan Despí</b> és un municipi dins de la comarca del Baix Llobregat, situat al pla del Llobregat, a l'esquerra del riu. El municipi confronta amb els de Sant Feliu de Llobregat, Sant Just Desvern, Esplugues de Llobregat, Cornellà de Llobregat, Sant Boi i Santa Coloma de Cervelló.</p>
933+
Sant Joan Despí és un municipi dins de la comarca del Baix Llobregat, situat al pla del Llobregat, a l'esquerra del riu. El municipi confronta amb els de Sant Feliu de Llobregat, Sant Just Desvern, Esplugues de Llobregat, Cornellà de Llobregat, Sant Boi de Llobregat i Santa Coloma de Cervelló.
939934
</p>
940935
<span class="linkcard-more"><a href="https://ca.wikipedia.org/wiki/Sant Joan Despí" target="_blank">Read more</a></span>
941936
</div>

customblocks/utils/fetcher.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,16 @@ def __init__(self, cache):
1010
if not self.cachedir.exists():
1111
self.cachedir.mkdir(parents=True)
1212

13-
def _url2path(self, url):
14-
return self.cachedir / (
15-
url
16-
.replace('://','_')
17-
.replace('//','_')
18-
.replace('/','_')
19-
)
13+
def _url2path(self, url, params={}):
14+
parameters='_'.join(
15+
f'_{k}_{v}'
16+
for k,v in sorted(params.items())
17+
)
18+
return self.cachedir / ((url+parameters)
19+
.replace('://','_')
20+
.replace('//','_')
21+
.replace('/','_')
22+
)
2023

2124
def clear(self):
2225
for item in self.cachedir.glob('*'):
@@ -59,14 +62,16 @@ def _namespace2response(namespace):
5962
result._content = namespace.content
6063
return result
6164

62-
def get(self, url):
63-
cachefile = self._url2path(url)
65+
def get(self, url, **kwds):
66+
cachefile = self._url2path(url, **kwds)
6467
if cachefile.exists():
6568
info = ns.load(str(cachefile))
6669
return self._namespace2response(info)
67-
response = requests.get(url)
70+
response = requests.get(url,
71+
headers={'User-Agent': 'Mozilla 2.0'},
72+
**kwds)
6873
if response.ok:
69-
self._response2namespace(response).dump(self._url2path(url))
74+
self._response2namespace(response).dump(cachefile)
7075
return response
7176

7277
def remove(self, url):

customblocks/utils/pageinfo.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from yamlns import namespace as ns
22
from bs4 import BeautifulSoup
3-
from urllib.parse import urlparse, urljoin, urlunsplit
3+
from urllib.parse import urlparse, urljoin, urlunsplit, quote, unquote
44
from decorator import decorator
55

66
@decorator
@@ -57,7 +57,11 @@ def sitename(self):
5757

5858
@property
5959
def siteurl(self):
60-
return urlunsplit((self._url.scheme, self._url.netloc,'','','')) or None
60+
return urlunsplit((
61+
self._url.scheme,
62+
self._url.netloc,
63+
'', '', ''
64+
)) or None
6165

6266
@property
6367
@cached
@@ -85,13 +89,26 @@ def _mediawikiDescription(self):
8589
#description = self._soup.find(class_='shortdescription')
8690
#if description: return description
8791

88-
baseurl, lemma = self._rel('canonical').split('/wiki/')
89-
excerpt_url = baseurl+ '/w/api.php?' + (
90-
f'format=json&action=query&prop=extracts&exsentences=2&exintro&titles={lemma}'
91-
)
92+
canonical = self._rel('canonical')
93+
if not canonical:
94+
return
95+
baseurl, lemma = canonical.split('/wiki/')
96+
lemma = unquote(lemma)
97+
excerpt_url = baseurl + '/w/api.php'
9298
from . import Fetcher
9399
fetcher = Fetcher('fetchercache/wikipedia') # TODO: Configurable
94-
content = ns.deep(fetcher.get(excerpt_url).json())
100+
content = ns.deep(fetcher.get(
101+
excerpt_url,
102+
params=dict(
103+
format='json',
104+
action='query',
105+
titles=lemma,
106+
prop='extracts',
107+
exsentences='2',
108+
exintro=True,
109+
explaintext=True,
110+
)
111+
).json())
95112
for page in content.query.pages.values():
96113
if 'extract' in page:
97114
return page.extract

customblocks/utils/pageinfo_test.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,11 @@ def test_description_fromMediawiki(self):
181181
))
182182
self.assertEqual(info.description,
183183
# TODO: Fragile on wiki content changes, mockup api
184-
"<p><b>Sant Joan Despí</b> és un municipi dins de la comarca del Baix Llobregat, "
185-
"situat al pla del Llobregat, a l'esquerra del riu. El municipi confronta amb els de "
186-
"Sant Feliu de Llobregat, Sant Just Desvern, Esplugues de Llobregat, Cornellà de Llobregat, Sant Boi i Santa Coloma de Cervelló.</p>"
184+
"Sant Joan Despí és un municipi dins de la comarca del Baix Llobregat, "
185+
"situat al pla del Llobregat, a l'esquerra del riu. "
186+
"El municipi confronta amb els de Sant Feliu de Llobregat, Sant Just Desvern, "
187+
"Esplugues de Llobregat, Cornellà de Llobregat, Sant Boi de Llobregat i "
188+
"Santa Coloma de Cervelló."
187189
)
188190

189191
def test_description_fromMediawiki_badArticle(self):

0 commit comments

Comments
 (0)