|
| 1 | +import os |
| 2 | +from datetime import datetime |
| 3 | +from glob import glob |
| 4 | +import re |
| 5 | +from bs4 import BeautifulSoup |
| 6 | +from pathlib import Path, PurePath |
| 7 | +import json |
| 8 | + |
| 9 | +from django.conf import settings |
| 10 | + |
| 11 | +from ianalyzer_readers.xml_tag import Tag |
| 12 | +from ianalyzer_readers.extract import Constant, XML, Metadata, Cache, Combined |
| 13 | + |
| 14 | + |
| 15 | +from addcorpus.python_corpora.corpus import XMLCorpusDefinition, FieldDefinition |
| 16 | +from addcorpus.python_corpora.filters import MultipleChoiceFilter |
| 17 | +from addcorpus.es_mappings import keyword_mapping, text_mapping, date_mapping, main_content_mapping |
| 18 | +from corpora.parliament.parliament import Parliament |
| 19 | +import corpora.parliament.utils.field_defaults as field_defaults |
| 20 | +from corpora.utils.constants import document_context |
| 21 | + |
| 22 | + |
| 23 | + |
def extract_date(path: str):
    '''
    Return the first `YYYY-MM-DD` substring that occurs in `path`,
    or None when `path` contains no such substring.
    '''
    found = re.search(r"\d{4}-\d{2}-\d{2}", path)
    return found.group(0) if found else None
| 30 | + |
def extract_chamber(path: str):
    '''
    Derive the chamber name from a marker substring in `path`.

    Returns 'House of Lords' when `path` contains 'daylord', otherwise
    'House of Commons' when it contains 'debates', otherwise None.
    '''
    # Order matters: 'daylord' takes precedence when both markers occur.
    chamber_markers = (
        ('daylord', 'House of Lords'),
        ('debates', 'House of Commons'),
    )
    for marker, chamber_name in chamber_markers:
        if marker in path:
            return chamber_name
    return None
| 38 | + |
def generate_title(chamber: str, date: str):
    '''
    Build a debate title of the form "<chamber> Debate on <date>".
    '''
    return f"{chamber} Debate on {date}"
| 41 | + |
def extract_debate_id(path):
    '''
    Return the debate id embedded in `path`, or None if there is none.

    The id is the first substring made of seven non-digit characters, a
    `YYYY-MM-DD` date and one trailing non-digit character,
    e.g. 'debates2022-01-05c'.
    '''
    found = re.search(r"\D{7}\d{4}-\d{2}-\d{2}\D", path)
    return found.group(0) if found else None
| 48 | + |
def abbreviate_speech_id(full_id):
    '''
    Keep only the last two dot-separated components of an id.

    full speech id: uk.org.publicwhip/debate/2022-01-05c.10.6
    abbreviated id: 10.6

    Ids with fewer than two dots are returned unchanged.
    '''
    # Splitting from the right at most twice is enough to isolate the tail.
    tail = full_id.rsplit('.', 2)[-2:]
    return '.'.join(tail)
| 55 | + |
def extract_topics_and_subtopics(path):
    '''
    Collect the heading texts of the XML file at `path`.

    Returns a tuple `(topics, subtopics)`. Both are dicts that map the
    abbreviated id of a heading element to its text with newlines removed:
    `topics` for `major-heading` elements, `subtopics` for `minor-heading`
    elements. Entries appear in the order in which the parser returns them.
    '''
    with open(path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, "lxml")

    def headings_by_id(tag_name):
        # One entry per element with this tag name, keyed by abbreviated id.
        return {
            abbreviate_speech_id(heading['id']): heading.text.replace('\n', '')
            for heading in soup.find_all(tag_name)
        }

    return headings_by_id('major-heading'), headings_by_id('minor-heading')
| 69 | + |
def extract_speaker_ids(path):
    '''
    List the distinct speaker ids that occur in the XML file at `path`.

    For every `speech` element with a `person_id` attribute, the part of
    that attribute after the last '/' is taken as the id. Each id is listed
    once, in order of first appearance.
    '''
    with open(path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, "lxml")

    id_sequence = (
        speech.attrs['person_id'].split('/')[-1]
        for speech in soup.find_all('speech')
        if speech.has_attr('person_id')
    )
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(id_sequence))
| 81 | + |
def select_topic(input):
    '''
    Find the heading under which a speech falls.

    Parameters:
        input: a tuple `(full_speech_id, topic_dict)`. `topic_dict` maps
            abbreviated heading ids (e.g. '10.6') to heading texts and is
            expected to be ordered by position in the document.

    Returns:
        The text of the last heading whose id does not come after the id of
        the speech, or '' if no heading precedes the speech.

    Raises:
        ValueError: if one of the last two components of an id is not an
            integer.
    '''
    def position(identifier):
        # Ids end in '<number>.<number>'. Compare the components as integers:
        # as floats, '10.10' would wrongly sort before '10.6'.
        return tuple(int(part) for part in identifier.split('.')[-2:])

    full_speech_id, topic_dict = input
    speech_position = position(full_speech_id)
    previous_topic = ''
    for heading_id, heading_text in topic_dict.items():
        if position(heading_id) > speech_position:
            break
        previous_topic = heading_text
    # Also reached when the speech comes after the final heading, in which
    # case that final heading applies.
    return previous_topic
| 91 | + |
def lookup_person_attribute(lookup_tuple):
    '''
    Look up one attribute of a person in a metadata dict.

    `lookup_tuple` is `(metadata_dict, person_uri, name, label)`. The key
    used in `metadata_dict` is the part of `person_uri` after the last '/'.
    `name` is not used for the lookup; it is only included for debugging.

    Returns `metadata_dict[key][label]`, or None if the uri is empty, the
    person is unknown or the person has no value for `label`.
    '''
    metadata_dict, person_uri, _name, label = lookup_tuple

    # twfy ID is at the end of the uri
    person_key = person_uri.split('/')[-1] if person_uri else None
    if person_key not in metadata_dict:
        return None
    person_record = metadata_dict[person_key]
    return person_record[label] if label in person_record else None
| 100 | + |
def lookup_person_atttribute_date(lookup_tuple):
    '''
    Look up a person attribute and keep only its first ten characters.

    `lookup_tuple` is passed on unchanged to `lookup_person_attribute`.
    Returns None when that lookup yields nothing (None or an empty value).
    '''
    value = lookup_person_attribute(lookup_tuple)
    # the first ten characters are the `YYYY-MM-DD` part of a timestamp
    return value[:10] if value else None
| 107 | + |
def find_current_positions(metadata_dict, date):
    '''
    Select, per person, the positions that were held on a given date.

    Parameters:
        metadata_dict: dict mapping a person id to that person's metadata.
            Positions are read from the optional 'positions' list, whose
            items are dicts with a 'startTime' and optionally an 'endTime'.
            Only the first ten characters (`YYYY-MM-DD`) of these values
            are used.
        date: the reference date as a `YYYY-MM-DD` string. If it is empty
            or None, no position can be confirmed and every person is
            mapped to an empty list.

    Returns:
        A dict mapping every person id in `metadata_dict` to the list of
        positions for which `startTime <= date <= endTime`. A position
        without 'endTime' is treated as still ongoing, so it only has to
        have started on or before `date`. Positions without 'startTime', or
        with a start/end time that cannot be parsed, are disregarded.

    Raises:
        ValueError: if `date` is a non-empty string that is not in
            `YYYY-MM-DD` format.
    '''
    def parse_day(timestamp):
        return datetime.strptime(timestamp[:10], "%Y-%m-%d")

    # Parse the reference date once instead of once per position.
    reference_day = datetime.strptime(date, "%Y-%m-%d") if date else None

    current_positions = {}
    for person, person_metadata in metadata_dict.items():
        current_position_list = []
        if reference_day is not None:
            for position in person_metadata.get('positions', []):
                if 'startTime' not in position:
                    continue
                try:
                    start_day = parse_day(position['startTime'])
                    end_day = parse_day(position['endTime']) if 'endTime' in position else None
                except (TypeError, ValueError):
                    continue  # disregard missing or malformed dates
                if start_day > reference_day:
                    continue  # position had not started yet
                if end_day is None or reference_day <= end_day:
                    current_position_list.append(position)
        current_positions[person] = current_position_list
    return current_positions
| 125 | + |
| 126 | + |
def lookup_current_ministerial_position(lookup_tuple):
    '''
    Return the label of the first current position flagged as ministerial.

    `lookup_tuple` is `(positions_dict, person_uri, name)`. `positions_dict`
    maps a person id (the part of `person_uri` after the last '/') to a list
    of position dicts. `name` is not used.

    Returns the 'positionLabel' of the first position whose 'minister' value
    is truthy, or None if the person is unknown or has no such position.
    '''
    positions_dict, person_uri, _name = lookup_tuple
    person_key = person_uri.split('/')[-1] if person_uri else None
    held_positions = positions_dict[person_key] if person_key in positions_dict else ()
    return next(
        (position['positionLabel'] for position in held_positions if position['minister']),
        None,
    )
| 135 | + |
def get_current_positions(positions_dict, id):
    '''
    Return the list of positions stored for a person.

    `id` may be a full uri; only the part after the last '/' is used as the
    key in `positions_dict`. Returns an empty list when `id` is empty, ends
    in '/', or is not a key of `positions_dict`.
    '''
    person_key = id.split('/')[-1] if id else None
    if not person_key or person_key not in positions_dict:
        return []
    return positions_dict[person_key]
| 142 | + |
| 143 | + |
def lookup_current_parliamentary_position(lookup_tuple):
    '''
    Return the label of the first current position flagged as parliamentary.

    `lookup_tuple` is `(positions_dict, person_uri, name)`; `name` is not
    used. The positions of the person are obtained with
    `get_current_positions`.

    Returns the 'positionLabel' of the first position whose
    'member_parliament' value is truthy, or None if there is none.
    '''
    positions_dict, person_uri, _name = lookup_tuple
    held_positions = get_current_positions(positions_dict, person_uri)
    return next(
        (
            position['positionLabel']
            for position in held_positions
            if position['member_parliament']
        ),
        None,
    )
| 151 | + |
def lookup_current_party(lookup_tuple):
    '''
    Return the party label found in the current positions of a person.

    `lookup_tuple` is `(positions_dict, person_uri, name)`; `name` is not
    used. The positions of the person are obtained with
    `get_current_positions` and scanned in order. The first position that
    has a 'partyLabel' or, failing that, a 'partyBackupLabel' determines
    the result.

    Returns None if the person has no positions or none of them carries a
    party label.
    '''
    positions_dict, person_uri, _name = lookup_tuple
    for position in get_current_positions(positions_dict, person_uri):
        # 'partyLabel' takes precedence over 'partyBackupLabel'
        for label_key in ('partyLabel', 'partyBackupLabel'):
            if label_key in position:
                return position[label_key]
    return None
| 165 | + |
class ParliamentUKNew(Parliament, XMLCorpusDefinition):
    '''
    Corpus definition for speeches from the House of Lords and the House of
    Commons (2022-2025).

    The data directory holds subdirectories with XML files and a JSON file
    with person metadata. Documents are extracted from `speech` elements
    below a `publicwhip` top-level element. Per-file metadata is gathered
    in `sources`; the field extractors below combine it with attributes of
    each `speech` element.
    '''
    title = 'Talking Empire (UK 2022-2025)'
    description = "Speeches from the House of Lords and House of Commons (2022-2025)"
    data_directory = settings.TE_UK_NEW_DATA
    min_date = datetime(year=2022, month=1, day=1)
    max_date = datetime(year=2025, month=12, day=31)
    es_index = getattr(settings, 'TE_UK_NEW_ES_INDEX', 'parliament-uk-new')
    image = 'uk.jpeg'
    # word_model_path = getattr(settings, 'TE_UK_NEW_WM', None) ## TODO: add word model?
    languages = ['en']
    description_page = 'uk-new.md'
    field_entry = 'speech_id'
    document_context = document_context()

    tag_toplevel = Tag("publicwhip")
    tag_entry = Tag("speech")

    def sources(self, start: datetime, end: datetime):
        '''
        Yield a `(path, metadata)` tuple for every XML file found in the
        direct subdirectories of the data directory.

        `start` and `end` are currently not used: every XML file is yielded.

        The metadata dict contains:
            - 'date', 'chamber', 'debate_id': derived from the file name
            - 'debate_title': generated from chamber and date
            - 'topics', 'subtopics': heading texts by abbreviated heading id
            - 'speaker_ids': ids of the speakers occurring in the file
            - 'speaker_metadata': person metadata of those speakers, as far
              as they occur in `merged_metadata_twfy_keys.json`
            - 'current_positions': per speaker, the positions held on the
              date of the debate
        '''
        with open(os.path.join(self.data_directory, 'merged_metadata_twfy_keys.json'), 'r', encoding='utf-8') as file:
            all_person_metadata = json.load(file)

        subdirectories = (entry for entry in Path(self.data_directory).iterdir() if entry.is_dir())
        for directory in subdirectories:
            for xml_file in glob('*.xml', root_dir=directory):
                # `directory` already starts with the data directory, so it
                # must not be prefixed again (that would duplicate the
                # prefix when the data directory is a relative path).
                full_path = directory / xml_file

                # Only the file name is inspected, so that directory names
                # cannot influence the extracted date, chamber or id.
                date = extract_date(xml_file)
                chamber = extract_chamber(xml_file)
                topics, subtopics = extract_topics_and_subtopics(full_path)
                speaker_ids = extract_speaker_ids(full_path)
                speaker_metadata = {
                    person_id: all_person_metadata[person_id]
                    for person_id in speaker_ids
                    if person_id in all_person_metadata
                }

                # A fresh dict per file: consumers that keep a reference to
                # the metadata of one file are not affected by later files.
                metadata = {
                    'date': date,
                    'chamber': chamber,
                    'debate_title': generate_title(chamber, date),
                    'debate_id': extract_debate_id(xml_file),
                    'topics': topics,
                    'subtopics': subtopics,
                    'speaker_ids': speaker_ids,
                    'speaker_metadata': speaker_metadata,
                    'current_positions': find_current_positions(speaker_metadata, date),
                }

                yield str(full_path), metadata

    # Shared by the topic and subtopic fields, which both need the speech id.
    _speech_id_extractor = Cache(XML(attribute='id'))

    chamber = field_defaults.chamber()
    chamber.extractor = Metadata('chamber')

    country = field_defaults.country()
    country.extractor = Constant(
        value='United Kingdom'
    )

    date = field_defaults.date()
    date.extractor = Metadata('date')

    debate_title = field_defaults.debate_title()
    debate_title.extractor = Metadata('debate_title')
    debate_title.language = 'en'

    debate_id = field_defaults.debate_id()
    debate_id.extractor = Metadata('debate_id')

    speech = field_defaults.speech(language='en')
    speech.extractor = XML(
        Tag("p"),
        flatten=True
    )

    speech_id = field_defaults.speech_id()
    speech_id.extractor = XML(
        attribute='id'
    )

    speaker = field_defaults.speaker()
    speaker.extractor = XML(
        attribute='speakername'
    )
    speaker.search_filter = MultipleChoiceFilter(
        description='Search only in speeches made by the selected speaker(s)',
        option_count=9001,
    )

    speaker_id = field_defaults.speaker_id()
    speaker_id.extractor = XML(
        attribute='person_id'
    )

    # The speaker fields below look up one attribute of the speaker in the
    # per-file 'speaker_metadata'. The speaker name is passed along for
    # debugging purposes only.
    speaker_gender = field_defaults.speaker_gender()
    speaker_gender.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('genderLabel'),
        transform=lookup_person_attribute
    )
    speaker_gender.search_filter = MultipleChoiceFilter(
        description="Search only in speeches made by speakers of a certain gender",
        option_count=10
    )

    speaker_birthdate = FieldDefinition(
        name = 'speaker_birthdate',
        display_name = 'Speaker birth date',
        description= 'Date at which the speaker was born',
        es_mapping=date_mapping(),
    )
    speaker_birthdate.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('birthdate'),
        transform=lookup_person_atttribute_date
    )

    speaker_deathdate = FieldDefinition(
        name = 'speaker_deathdate',
        display_name = 'Speaker death date',
        description= 'Date at which the speaker died',
        es_mapping=date_mapping(),
    )
    speaker_deathdate.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('deathdate'),
        transform=lookup_person_atttribute_date
    )

    speaker_birthplace = FieldDefinition(
        name = 'speaker_birthplace',
        display_name = 'Speaker birthplace',
        description= 'Place where the speaker was born',
        es_mapping=keyword_mapping(),
    )
    speaker_birthplace.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('birthPlaceLabel'),
        transform=lookup_person_attribute
    )

    speaker_wikidata = FieldDefinition(
        name = 'speaker_wikidata',
        display_name = 'Speaker Wikidata URI',
        description= 'URI for the Wikidata page for this speaker',
        es_mapping=keyword_mapping(),
    )
    speaker_wikidata.extractor = Combined(
        Metadata('speaker_metadata'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        Constant('wikidata_uri'),
        transform=lookup_person_attribute
    )

    # The role and party fields are based on the per-file
    # 'current_positions', i.e. the positions held on the debate date.
    ministerial_role = field_defaults.ministerial_role()
    ministerial_role.extractor = Combined(
        Metadata('current_positions'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        transform=lookup_current_ministerial_position
    )
    ministerial_role.search_filter.option_count = 45

    parliamentary_role = field_defaults.parliamentary_role()
    parliamentary_role.extractor = Combined(
        Metadata('current_positions'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        transform=lookup_current_parliamentary_position
    )

    party = field_defaults.party()
    party.extractor = Combined(
        Metadata('current_positions'),
        XML(attribute='person_id'),
        XML(attribute='speakername'),
        transform=lookup_current_party
    )

    # Topic and subtopic are selected by comparing the speech id with the
    # ids of the headings collected for the file.
    topic = field_defaults.topic()
    topic.extractor = Combined(
        _speech_id_extractor,
        Metadata('topics'),
        transform=select_topic
    )

    subtopic = field_defaults.subtopic()
    subtopic.extractor = Combined(
        _speech_id_extractor,
        Metadata('subtopics'),
        transform=select_topic
    )

    speech_type = field_defaults.speech_type()
    speech_type.extractor = XML(
        attribute='type'
    )

    def __init__(self):
        # The fields of the corpus, in the order in which they are listed.
        self.fields = [
            self.country, self.date,
            self.debate_title, self.debate_id,
            self.topic, self.subtopic,
            self.chamber,
            self.speech, self.speech_id,
            self.speech_type,
            self.speaker, self.speaker_id,
            self.speaker_gender, self.speaker_birthdate,
            self.speaker_deathdate, self.speaker_birthplace,
            self.speaker_wikidata, self.ministerial_role,
            self.parliamentary_role, self.party
        ]
0 commit comments