-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathscrape_djeu.py
More file actions
30 lines (24 loc) · 918 Bytes
/
scrape_djeu.py
File metadata and controls
30 lines (24 loc) · 918 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from lxml import html
from collections import Counter
import re
def _first_text(nodes):
    """Return the text content of the first element in *nodes*, or '' if empty."""
    return nodes[0].text_content() if nodes else ''


# Fetch and parse the talk listing page (performs a network request).
tree = html.parse('http://2015.djangocon.eu/talks/')
root = tree.getroot()
talks = root.cssselect('li.programme')
if not talks:
    # Without this guard the averages below fail with ZeroDivisionError.
    raise SystemExit('No talks found: page has no li.programme elements')

title_texts = []  # one title string per talk
talk_texts = []   # "title summary" string per talk
cardiff = 0       # number of talks mentioning 'Cardiff' in title or summary
for talk in talks:
    # Either span may be absent from an entry; treat a missing one as ''.
    talk_title = _first_text(talk.cssselect('span.title'))
    talk_text = _first_text(talk.cssselect('span.summary'))
    # Stay in text throughout: joining UTF-8 encoded bytes with unicode text
    # breaks on non-ASCII titles (UnicodeDecodeError on Python 2).
    combined = ' '.join([talk_title, talk_text])
    title_texts.append(talk_title)
    talk_texts.append(combined)
    if 'Cardiff' in combined:
        cardiff += 1
    # Debug: print('{}: {}'.format(talk_title, talk_text))

# Join with a space so the last word of one talk does not fuse with the first
# word of the next when the text is tokenised.
all_talk_text = ' '.join(talk_texts)
titles = ' '.join(title_texts)
# re.UNICODE keeps non-ASCII letters inside words on Python 2.
all_words = re.findall(r'\w+', all_talk_text, re.UNICODE)
all_titles = re.findall(r'\w+', titles, re.UNICODE)

# Average title length in characters. float() is applied before dividing so
# Python 2 does not truncate the quotient.
# NOTE(review): all_titles (the title words) is otherwise unused; if the
# intended metric was words per title, use len(all_titles) here - confirm.
print(float(sum(len(title) for title in title_texts)) / len(talks))
# Fraction of talks that mention 'Cardiff'.
print(float(cardiff) / len(talks))
# Debug: print(Counter(all_words))