-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Expand file tree
/
Copy patharticlesMoreRules.py
More file actions
26 lines (23 loc) · 1.15 KB
/
articlesMoreRules.py
File metadata and controls
26 lines (23 loc) · 1.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class ArticleSpider(CrawlSpider):
    """Crawl Wikipedia from a seed page and print details of each page visited.

    Two rules classify the links found on every crawled page:

    * links whose path is ``/wiki/<name>`` with no colon in ``<name>`` are
      treated as articles, followed further, and reported with
      ``is_article=True``;
    * every other link is fetched once (not followed) and reported with
      ``is_article=False``.

    When a link matches more than one rule, Scrapy uses the first matching
    rule, so the catch-all second rule only receives non-article links.
    """

    name = 'articles'
    allowed_domains = ['wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Benevolent_dictator_for_life']
    rules = [
        # LinkExtractor searches ``allow`` patterns against *absolute* URLs
        # (e.g. https://en.wikipedia.org/wiki/Foo), so the pattern must not be
        # anchored with '^' in front of '/wiki/'. The negative lookahead
        # rejects any colon after '/wiki/', which excludes namespaced pages
        # such as 'Talk:' or 'File:'.
        Rule(
            LinkExtractor(allow=r'/wiki/((?!:).)*$'),
            callback='parse_items',
            follow=True,
            cb_kwargs={'is_article': True},
        ),
        Rule(
            LinkExtractor(allow=r'.*'),
            callback='parse_items',
            cb_kwargs={'is_article': False},
        ),
    ]

    def parse_items(self, response, is_article):
        """Print a summary of a fetched page.

        Args:
            response: The Scrapy response for the fetched page.
            is_article: True when the link was matched by the article rule,
                False when it was matched by the catch-all rule.
        """
        print(response.url)
        title = response.css('h1::text').extract_first()
        if not is_article:
            print('This is not an article: {}'.format(title))
            return

        text = response.xpath('//div[@id="mw-content-text"]//text()').extract()
        last_updated = response.css('li#footer-info-lastmod::text').extract_first()
        # extract_first() returns None when the footer element is absent;
        # only strip the boilerplate prefix when there is text to strip.
        if last_updated is not None:
            last_updated = last_updated.replace('This page was last edited on ', '')

        print('title is: {} '.format(title))
        print('text is: {}'.format(text))
        print('Last updated: {}'.format(last_updated))