From 7308e5e64b755c248fa10148453326d8c94a9ad7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Mar 2026 09:47:25 +0000 Subject: [PATCH 01/11] Initial plan From 591addfdaef36f301953ed0d7ac3c1b4d86bd7fd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Mar 2026 10:06:26 +0000 Subject: [PATCH 02/11] Enhance generate-content-based-titles.py with all article types, keywords, tags, section updates, Unicode fixes Co-authored-by: pethers <1726836+pethers@users.noreply.github.com> --- scripts/generate-content-based-titles.py | 610 ++++++++++++++++++++--- 1 file changed, 551 insertions(+), 59 deletions(-) diff --git a/scripts/generate-content-based-titles.py b/scripts/generate-content-based-titles.py index ae0ee4b881..329364dcaf 100755 --- a/scripts/generate-content-based-titles.py +++ b/scripts/generate-content-based-titles.py @@ -40,44 +40,129 @@ class TitleGenerator: ARTICLE_TYPES = { 'committee-reports': { 'en': 'Committee Reports', + 'section': 'Committee Reports', 'format': '{themes} Dominate Committee Agenda' }, 'government-propositions': { 'en': 'Government Propositions', + 'section': 'Government Propositions', 'format': '{themes} Lead Government Legislative Push' }, 'opposition-motions': { 'en': 'Opposition Motions', + 'section': 'Opposition Motions', 'format': 'Opposition {action} on {themes}' + }, + 'week-ahead': { + 'en': 'Week Ahead', + 'section': 'The Week Ahead', + 'format': '{themes} Headline Parliamentary Week Ahead' + }, + 'interpellation-debates': { + 'en': 'Interpellation Debates', + 'section': 'Interpellation Debates', + 'format': '{themes} Under Fire in Interpellation Debates' } } - # Common policy keywords to extract + # Common policy keywords to extract (ordered: longer phrases first to avoid partial matching) POLICY_KEYWORDS = [ + # Energy & Nuclear + 'nuclear energy', 'nuclear power', 'offshore wind', 'renewable energy', + 'radiation protection', 'kärnkraft', 'energy policy', + # Security & Law Enforcement - 'weapons', 'border', 'security', 'defense', 'detention', 'cash controls', - 'schengen', 'customs', 'enforcement', 'civil liberties', + 'honor violence', 'honour violence', 'domestic violence', 'criminal justice', + 'criminal recidivism', 'weapons', 'border', 'security', 'defense', 'detention', + 'cash controls', 'schengen', 'customs', 'enforcement', 'civil liberties', # Financial & Economic 'tax', 'vat', 'fraud', 'financial', 'audit', 'crisis management', - 'transparency', 'ownership', 'beneficial ownership', + 'transparency', 'ownership', 'beneficial ownership', 'budget', # Social Welfare - 'housing', 'welfare', 'parental', 'parental benefit', 'benefit', - 'pension', 'elderly care', 'employment', 'labor', + 'elderly care', 'elder care', 'social services', 'housing', 'welfare', + 'parental benefit', 'parental', 'pension', 'employment', 'labor', # Government & Administration 'data protection', 'privacy', 'registry', 'cooperative', 'appropriations', 'supplementary', 'government personnel', + # Justice & Rights + 'sami', 'indigenous rights', 'hunting regulation', 'immigration', + 'migration', 'asylum', 'integration', + + # Health + 'healthcare', 'health', 'rare diseases', 'patient safety', + + # Infrastructure & Transport + 'infrastructure', 'transport', 'road traffic', 'vehicle', 'aviation', + 'railway', 'air link', + + # Education & Culture + 'education', 'schools', 'university', 'cultural affairs', 'culture', + # Sector-Specific - 'education', 'health', 'trade', 'animal', 'animal protection', - 'road traffic', 'vehicle', 'renewable energy', 'macroprudential', + 'trade', 'industry', 'animal protection', 'animal', 'agriculture', + 'macroprudential', 'rural policy', 'rural', + + # EU & International + 'eu council', 'european union', 'eu directive', 'international', - # Language variations + # Language & Integration 'language requirement', 'language' ] + # Committee names for extraction + COMMITTEE_NAMES = { + 'social affairs': 'Social Affairs', + 'taxation': 'Taxation', + 'finance': 'Finance', + 'cultural affairs': 'Cultural Affairs', + 'social insurance': 'Social Insurance', + 'justice': 'Justice', + 'constitution': 'Constitution', + 'defence': 'Defence', + 'industry and trades': 'Industry and Trades', + 'industry and trade': 'Industry and Trade', + 'civil affairs': 'Civil Affairs', + 'education': 'Education', + 'environment': 'Environment', + 'foreign affairs': 'Foreign Affairs', + 'health and welfare': 'Health and Welfare', + 'labour market': 'Labour Market', + 'transport': 'Transport', + } + + # Swedish department names for extraction + DEPARTMENT_NAMES = { + 'justitiedepartementet': 'Justice', + 'socialdepartementet': 'Social Affairs', + 'finansdepartementet': 'Finance', + 'utbildningsdepartementet': 'Education', + 'försvarsdepartementet': 'Defence', + 'utrikesdepartementet': 'Foreign Affairs', + 'näringsdepartementet': 'Industry', + 'kulturdepartementet': 'Culture', + 'miljödepartementet': 'Environment', + 'arbetsmarknadsdepartementet': 'Labour Market', + 'landsbygds- och infrastrukturdepartementet': 'Rural & Infrastructure', + 'klimat- och näringslivsdepartementet': 'Climate & Business', + 'energi- och näringslivsdepartementet': 'Energy & Business', + } + + # Swedish party abbreviations + PARTY_NAMES = { + '(S)': 'Social Democrats', + '(M)': 'Moderates', + '(SD)': 'Sweden Democrats', + '(C)': 'Centre Party', + '(V)': 'Left Party', + '(KD)': 'Christian Democrats', + '(L)': 'Liberals', + '(MP)': 'Green Party', + } + def __init__(self, news_dir: str = None): """Initialize TitleGenerator with news directory path. @@ -93,21 +178,43 @@ def __init__(self, news_dir: str = None): self.english_only: bool = True # Safe default: only update English articles def extract_document_titles(self, html_content: str) -> List[str]: - """Extract all h3 document titles from article""" + """Extract all h3 document titles from article. + + Filters out generic structural h3s (analysis sections, footer headings) + and retains content-specific h3s (committee names, policy areas, departments). + """ # Find all h3 tags (document titles) h3_pattern = r'