Skip to content

Commit 2f595ed

Browse files
authored
Merge pull request #2077 from CentreForDigitalHumanities/feature/hansard-2026
Feature/hansard 2026
2 parents b769b7c + 7d55e9b commit 2f595ed

3 files changed

Lines changed: 385 additions & 3 deletions

File tree

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
The debates of the two chambers of the British bicameral Parliament, the House of Lords and the House of Commons, from 2022 up to and including 2025, based on Hansard as compiled by TheyWorkForYou:
2+
3+
>mySociety. (n.d.). UK Parliament Hansard Debates. TheyWorkForYou. [https://www.theyworkforyou.com/debates/](https://www.theyworkforyou.com/debates/) accessed: 01-03-2026
4+
5+
6+
### Image attribution
7+
8+
The image used for this corpus ([image source](https://commons.wikimedia.org/wiki/File:House_of_Commons_2010.jpg)) is licensed under the [United Kingdom Open Government Licence](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).
Lines changed: 376 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,376 @@
1+
import os
2+
from datetime import datetime
3+
from glob import glob
4+
import re
5+
from bs4 import BeautifulSoup
6+
from pathlib import Path, PurePath
7+
import json
8+
9+
from django.conf import settings
10+
11+
from ianalyzer_readers.xml_tag import Tag
12+
from ianalyzer_readers.extract import Constant, XML, Metadata, Cache, Combined
13+
14+
15+
from addcorpus.python_corpora.corpus import XMLCorpusDefinition, FieldDefinition
16+
from addcorpus.python_corpora.filters import MultipleChoiceFilter
17+
from addcorpus.es_mappings import keyword_mapping, text_mapping, date_mapping, main_content_mapping
18+
from corpora.parliament.parliament import Parliament
19+
import corpora.parliament.utils.field_defaults as field_defaults
20+
from corpora.utils.constants import document_context
21+
22+
23+
24+
def extract_date(path: str):
    """Return the first ``YYYY-MM-DD`` substring of *path*, or ``None`` if there is none."""
    found = re.search(r"\d{4}-\d{2}-\d{2}", path)
    return found.group(0) if found is not None else None
30+
31+
def extract_chamber(path: str):
    """Derive the chamber name from a marker substring in *path*.

    ``'daylord'`` is checked before ``'debates'``; ``None`` is returned when
    neither marker occurs.
    """
    chamber_by_marker = (
        ('daylord', 'House of Lords'),
        ('debates', 'House of Commons'),
    )
    for marker, chamber in chamber_by_marker:
        if marker in path:
            return chamber
    return None
38+
39+
def generate_title(chamber: str, date: str):
    """Compose the title of a debate from the chamber name and the debate date."""
    title_template = "{} Debate on {}"
    return title_template.format(chamber, date)
41+
42+
def extract_debate_id(path):
    """Return the debate id embedded in *path*, or ``None`` if none is found.

    The id is the first match of: seven non-digit characters, a
    ``YYYY-MM-DD`` date, and one trailing non-digit character.
    """
    found = re.search(r"\D{7}\d{4}-\d{2}-\d{2}\D", path)
    return found.group(0) if found is not None else None
48+
49+
def abbreviate_speech_id(full_id):
    '''
    Keep only the last two dot-separated components of a speech id.

    full speech id: uk.org.publicwhip/debate/2022-01-05c.10.6
    abbreviated id: 10.6
    '''
    components = full_id.rsplit('.', 2)
    return '.'.join(components[-2:])
55+
56+
def extract_topics_and_subtopics(path):
    """Collect the heading texts of the XML file at *path*.

    Returns a ``(topics, subtopics)`` pair of dicts built from the
    ``major-heading`` and ``minor-heading`` elements respectively. Each dict
    maps the abbreviated id of a heading to its text with newlines removed.
    """
    with open(path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, "lxml")

    def headings_by_id(tag_name):
        # one entry per heading element, in document order
        return {
            abbreviate_speech_id(heading['id']): heading.text.replace('\n', '')
            for heading in soup.find_all(tag_name)
        }

    topics = headings_by_id('major-heading')
    subtopics = headings_by_id('minor-heading')
    return topics, subtopics
69+
70+
def extract_speaker_ids(path):
    """List the distinct speaker ids of the XML file at *path*.

    For every ``speech`` element that has a ``person_id`` attribute, the part
    of that attribute after the last ``/`` is taken as the id. Ids are
    returned in order of first appearance, without duplicates.
    """
    with open(path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, "lxml")

    # dict keys keep insertion order, which gives ordered de-duplication
    seen = {}
    for speech in soup.find_all('speech'):
        if not speech.has_attr('person_id'):
            continue
        seen.setdefault(speech.attrs['person_id'].split('/')[-1], None)
    return list(seen)
81+
82+
def _id_position(speech_id):
    """Convert the last two dot-separated components of an id to a tuple of ints.

    Comparing tuples of ints orders ``10.10`` after ``10.2``; comparing the
    same ids as floats would order them the other way round.
    """
    return tuple(int(part) for part in speech_id.split('.')[-2:])


def select_topic(input):
    """Select the heading under which a speech falls.

    Parameters:
        input: a ``(full_speech_id, topic_dict)`` tuple. ``topic_dict`` maps
            abbreviated heading ids (e.g. ``'10.6'``) to heading text and is
            iterated in insertion order.

    Returns:
        The text of the last heading that does not come after the speech,
        ``''`` if the speech precedes every heading, or ``None`` if the
        speech has no id.

    Raises:
        ValueError: if an id component is not an integer.
    """
    full_speech_id, topic_dict = input
    if not full_speech_id:
        return None

    speech_position = _id_position(full_speech_id)
    current_topic = ''
    for heading_id, heading in topic_dict.items():
        if _id_position(heading_id) > speech_position:
            break
        current_topic = heading
    # reached either by the break or after the last heading: in both cases
    # the most recent heading before the speech applies
    return current_topic
91+
92+
def lookup_person_attribute(lookup_tuple):
    """Look up one attribute of a speaker in the person metadata.

    *lookup_tuple* is ``(metadata_dict, id, name, label)``; the key into
    ``metadata_dict`` is the part of the id after the last ``/``. Returns
    ``metadata_dict[key][label]``, or ``None`` when the id is empty, the
    person is unknown, or the person has no such attribute.
    """
    # name is only included for debugging purposes
    metadata_dict, person_uri, _name, label = lookup_tuple

    # twfy ID is at the end of uri
    person_key = person_uri.split('/')[-1] if person_uri else None
    if person_key not in metadata_dict:
        return None

    person = metadata_dict[person_key]
    if label not in person:
        return None
    return person[label]
100+
101+
def lookup_person_atttribute_date(lookup_tuple):
    """Look up a speaker attribute and keep only its first ten characters.

    Takes the same tuple as ``lookup_person_attribute``. Returns ``None``
    when the looked-up value is missing or empty.
    """
    timestamp = lookup_person_attribute(lookup_tuple)
    if not timestamp:
        return None
    return timestamp[:10]
107+
108+
def _parse_day(timestamp):
    """Parse the leading ``YYYY-MM-DD`` of *timestamp*; return ``None`` if that fails."""
    try:
        return datetime.strptime(timestamp[:10], "%Y-%m-%d")
    except (TypeError, ValueError):
        return None


def find_current_positions(metadata_dict, date):
    """Select, per person, the positions that were held on *date*.

    Parameters:
        metadata_dict: maps a person key to that person's metadata; the
            optional ``'positions'`` entry is a list of dicts that may carry
            ``'startTime'`` and ``'endTime'`` timestamps.
        date: the debate date as a ``YYYY-MM-DD`` string, or ``None``.

    Returns:
        A dict with the same keys as *metadata_dict*, each mapped to the list
        of positions that apply on *date*:

        * a position with start and end applies when ``start <= date < end``;
          it is disregarded when either timestamp cannot be parsed;
        * a position with only a start applies when it started on or before
          *date*, or when its start cannot be parsed;
        * a position without a start never applies.

        Every list is empty when *date* is ``None`` or empty.

    Raises:
        ValueError: if *date* is a non-empty string that is not ``YYYY-MM-DD``.
    """
    # parsed once here instead of once per position
    debate_day = datetime.strptime(date, "%Y-%m-%d") if date else None

    current_positions = {}
    for person, person_metadata in metadata_dict.items():
        current_position_list = []
        positions = person_metadata.get('positions', []) if debate_day else []
        for position in positions:
            if 'startTime' not in position:
                continue
            start = _parse_day(position['startTime'])
            if 'endTime' in position:
                end = _parse_day(position['endTime'])
                if start is None or end is None:
                    continue  # disregard missing dates
                if start <= debate_day < end:
                    current_position_list.append(position)
            elif start is None or start <= debate_day:
                # open-ended position: it only counts once it has started
                current_position_list.append(position)
        current_positions[person] = current_position_list
    return current_positions
125+
126+
127+
def lookup_current_ministerial_position(lookup_tuple):
    """Return the label of the speaker's first ministerial position, if any.

    *lookup_tuple* is ``(positions_dict, id, name)``; the key into
    ``positions_dict`` is the part of the id after the last ``/``. The result
    is the ``'positionLabel'`` of the first position whose ``'minister'``
    value is truthy, or ``None`` when the speaker is unknown or has no such
    position.
    """
    positions_dict, person_uri, _name = lookup_tuple
    person_key = person_uri.split('/')[-1] if person_uri else None
    if person_key not in positions_dict:
        return None

    ministerial_labels = (
        position['positionLabel']
        for position in positions_dict[person_key]
        if position['minister']
    )
    return next(ministerial_labels, None)
135+
136+
def get_current_positions(positions_dict, id):
    """Return the positions stored for a speaker, or an empty list.

    The key into *positions_dict* is the part of *id* after the last ``/``.
    An empty list is returned when *id* is empty, the key is empty, or the
    key is not in *positions_dict*.
    """
    person_key = id.split('/')[-1] if id else None
    if not person_key or person_key not in positions_dict:
        return []
    return positions_dict[person_key]
142+
143+
144+
def lookup_current_parliamentary_position(lookup_tuple):
    """Return the label of the speaker's first parliamentary position, if any.

    *lookup_tuple* is ``(positions_dict, id, name)``. The result is the
    ``'positionLabel'`` of the first of the speaker's positions whose
    ``'member_parliament'`` value is truthy, or ``None`` if there is none.
    """
    positions_dict, person_uri, _name = lookup_tuple
    parliamentary_labels = (
        position['positionLabel']
        for position in get_current_positions(positions_dict, person_uri)
        if position['member_parliament']
    )
    return next(parliamentary_labels, None)
151+
152+
def lookup_current_party(lookup_tuple):
    """Return the party label from the speaker's positions, if any.

    *lookup_tuple* is ``(positions_dict, id, name)``. Positions are checked in
    order; within a position ``'partyLabel'`` takes precedence over
    ``'partyBackupLabel'``. Returns ``None`` when no position carries either
    key.
    """
    positions_dict, person_uri, _name = lookup_tuple
    for position in get_current_positions(positions_dict, person_uri):
        for party_key in ('partyLabel', 'partyBackupLabel'):
            if party_key in position:
                return position[party_key]
    return None
165+
166+
class ParliamentUKNew(Parliament, XMLCorpusDefinition):
    """Corpus definition for House of Lords and House of Commons speeches, 2022-2025.

    Documents are ``speech`` elements inside ``publicwhip`` XML files. File
    level metadata (date, chamber, headings, speaker metadata) is gathered in
    ``sources`` and consumed by the field extractors below.
    """

    title = 'Talking Empire (UK 2022-2025)'
    description = "Speeches from the House of Lords and House of Commons (2022-2025)"
    data_directory = settings.TE_UK_NEW_DATA
    min_date = datetime(year=2022, month=1, day=1)
    max_date = datetime(year=2025, month=12, day=31)
    es_index = getattr(settings, 'TE_UK_NEW_ES_INDEX', 'parliament-uk-new')
    image = 'uk.jpeg'
    # word_model_path = getattr(settings, 'TE_UK_NEW_WM', None) ## TODO: add word model?
    languages = ['en']
    description_page = 'uk-new.md'
    field_entry = 'speech_id'
    document_context = document_context()

    tag_toplevel = Tag("publicwhip")
    tag_entry = Tag("speech")

    def sources(self, start: datetime, end: datetime):
        """Yield ``(path, metadata)`` for every XML file one level below the data directory.

        The person metadata is loaded once from
        ``merged_metadata_twfy_keys.json`` in the data directory. For each
        file a new metadata dict is built with the keys ``date``, ``chamber``,
        ``debate_title``, ``debate_id``, ``topics``, ``subtopics``,
        ``speaker_ids``, ``speaker_metadata`` and ``current_positions``.

        NOTE(review): ``start`` and ``end`` are accepted but not used to
        filter the files.
        """
        data_directory = Path(self.data_directory)
        with open(data_directory / 'merged_metadata_twfy_keys.json', 'r', encoding='utf-8') as file:
            all_person_metadata = json.load(file)

        for directory in data_directory.iterdir():
            if not directory.is_dir():
                continue
            for xml_file in glob('*.xml', root_dir=directory):
                # `directory` already includes the data directory, so it must
                # not be prefixed again (that breaks relative data paths)
                full_path = directory / xml_file

                date = extract_date(xml_file)
                chamber = extract_chamber(xml_file)
                topics, subtopics = extract_topics_and_subtopics(full_path)
                speaker_ids = extract_speaker_ids(full_path)
                speaker_metadata = {
                    speaker_id: all_person_metadata[speaker_id]
                    for speaker_id in speaker_ids
                    if speaker_id in all_person_metadata
                }

                # a new dict per file, so that metadata yielded earlier is
                # not overwritten by later files
                metadata = {
                    'date': date,
                    'chamber': chamber,
                    'debate_title': generate_title(chamber, date),
                    'debate_id': extract_debate_id(xml_file),
                    'topics': topics,
                    'subtopics': subtopics,
                    'speaker_ids': speaker_ids,
                    'speaker_metadata': speaker_metadata,
                    'current_positions': find_current_positions(speaker_metadata, date),
                }

                yield str(full_path), metadata

    # shared by the topic and subtopic extractors
    _speech_id_extractor = Cache(XML(attribute='id'))

    chamber = field_defaults.chamber()
    chamber.extractor = Metadata('chamber')

    country = field_defaults.country()
    country.extractor = Constant(
        value='United Kingdom'
    )

    date = field_defaults.date()
    date.extractor = Metadata('date')

    debate_title = field_defaults.debate_title()
    debate_title.extractor = Metadata('debate_title')
    debate_title.language = 'en'

    debate_id = field_defaults.debate_id()
    debate_id.extractor = Metadata('debate_id')

    speech = field_defaults.speech(language='en')
    speech.extractor = XML(
        Tag("p"),
        flatten=True
    )

    speech_id = field_defaults.speech_id()
    speech_id.extractor = XML(
        attribute='id'
    )

    speaker = field_defaults.speaker()
    speaker.extractor = XML(
        attribute='speakername'
    )
    speaker.search_filter = MultipleChoiceFilter(
        description='Search only in speeches made by the selected speaker(s)',
        option_count=9001,
    )

    speaker_id = field_defaults.speaker_id()
    speaker_id.extractor = XML(
        attribute='person_id'
    )

    speaker_gender = field_defaults.speaker_gender()
    speaker_gender.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('genderLabel'),
        transform=lookup_person_attribute
    )
    speaker_gender.search_filter = MultipleChoiceFilter(
        description="Search only in speeches made by speakers of a certain gender",
        option_count=10
    )

    speaker_birthdate = FieldDefinition(
        name='speaker_birthdate',
        display_name='Speaker birth date',
        description='Date at which the speaker was born',
        es_mapping=date_mapping(),
    )
    speaker_birthdate.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('birthdate'),
        transform=lookup_person_atttribute_date
    )

    speaker_deathdate = FieldDefinition(
        name='speaker_deathdate',
        display_name='Speaker death date',
        description='Date at which the speaker died',
        es_mapping=date_mapping(),
    )
    speaker_deathdate.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('deathdate'),
        transform=lookup_person_atttribute_date
    )

    speaker_birthplace = FieldDefinition(
        name='speaker_birthplace',
        display_name='Speaker birthplace',
        description='Place where the speaker was born',
        es_mapping=keyword_mapping(),
    )
    speaker_birthplace.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('birthPlaceLabel'),
        transform=lookup_person_attribute
    )

    speaker_wikidata = FieldDefinition(
        name='speaker_wikidata',
        display_name='Speaker Wikidata URI',
        description='URI for the Wikidata page for this speaker',
        es_mapping=keyword_mapping(),
    )
    speaker_wikidata.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('wikidata_uri'),
        transform=lookup_person_attribute
    )

    ministerial_role = field_defaults.ministerial_role()
    ministerial_role.extractor = Combined(
        Metadata('current_positions'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        transform=lookup_current_ministerial_position
    )
    ministerial_role.search_filter.option_count = 45

    parliamentary_role = field_defaults.parliamentary_role()
    parliamentary_role.extractor = Combined(
        Metadata('current_positions'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        transform=lookup_current_parliamentary_position
    )

    party = field_defaults.party()
    party.extractor = Combined(
        Metadata('current_positions'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        transform=lookup_current_party
    )

    topic = field_defaults.topic()
    topic.extractor = Combined(
        _speech_id_extractor,
        Metadata('topics'),
        transform=select_topic
    )

    subtopic = field_defaults.subtopic()
    subtopic.extractor = Combined(
        _speech_id_extractor,
        Metadata('subtopics'),
        transform=select_topic
    )

    speech_type = field_defaults.speech_type()
    speech_type.extractor = XML(
        attribute='type'
    )

    def __init__(self):
        # the order of this list is the order of the corpus fields
        self.fields = [
            self.country, self.date,
            self.debate_title, self.debate_id,
            self.topic, self.subtopic,
            self.chamber,
            self.speech, self.speech_id,
            self.speech_type,
            self.speaker, self.speaker_id,
            self.speaker_gender, self.speaker_birthdate,
            self.speaker_deathdate, self.speaker_birthplace,
            self.speaker_wikidata, self.ministerial_role,
            self.parliamentary_role, self.party
        ]

frontend/src/app/filter/multiple-choice-filter/multiple-choice-filter.component.html

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@
1010
[maxSelectedLabels]=1
1111
placeholder="Choose"
1212
[ngModel]="data"
13-
(onChange)="update($event.value)" ariaLabelledBy="legend-filter-{{filter.displayName | slugify}}" fluid
14-
(onPanelShow)="getAllOptionsFromES($event)">
15-
13+
(onChange)="update($event.value)" ariaLabelledBy="legend-filter-{{filter.displayName | slugify}}" fluid>
1614
<ng-template let-item pTemplate="item">
1715
<div class="select-label">{{item.label}}</div>
1816
<div class="select-count">{{item.doc_count}}</div>

0 commit comments

Comments
 (0)