Skip to content

Commit 7e8fe77

Browse files
marcarlclaude
andcommitted
Add tracking of which ändringsförfattning repealed each section
Implements selex:upphavd_av attribute to link repealed sections to their repealing ändringsförfattning. This resolves the TODO at format_sfs_text.py:718. Changes: - Add temporal/parse_anteckningar.py: Parser for Swedish amendment notes (anteckningar) that extracts repealed, amended, and new paragraphs - Modify sfs_processor.py: Build repeal_map from andringsforfattningar and pass to formatter - Modify formatters/format_sfs_text.py: Add selex:upphavd_av attribute to repealed sections with helper function for ID normalization - Add comprehensive test coverage: 27 parser tests + 8 integration tests Example output: <section selex:status="upphavd" selex:upphavd_av="2015:73"> All 68 tests passing. Validated with real data from 2010:800.json. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent deb6eb5 commit 7e8fe77

5 files changed

Lines changed: 621 additions & 6 deletions

File tree

formatters/format_sfs_text.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
"""
3131

3232
import re
33-
from typing import Optional
33+
from typing import Optional, Dict
3434
from .apply_links import apply_sfs_links, apply_internal_links, apply_eu_links, apply_law_name_links
3535
from util.text_utils import WHITESPACE_PATTERN
3636

@@ -149,7 +149,7 @@ def _adjust_heading_level_for_avdelning(base_level: str, inside_avdelning: bool)
149149
return base_level
150150

151151

152-
def format_sfs_text_as_markdown(text: str, apply_links: bool = False) -> str:
152+
def format_sfs_text_as_markdown(text: str, apply_links: bool = False, repeal_map: Optional[Dict[str, str]] = None) -> str:
153153
"""
154154
Formattera texten från en författningstext importerad från
155155
Regeringskansliets rättsdatabas till Markdown-format.
@@ -159,6 +159,7 @@ def format_sfs_text_as_markdown(text: str, apply_links: bool = False) -> str:
159159
Args:
160160
text (str): Texten som ska formateras
161161
apply_links (bool): Om True, konvertera både interna paragrafnummer och SFS-beteckningar till markdown-länkar
162+
repeal_map (Optional[Dict[str, str]]): Map of section IDs to the ändringsförfattning beteckning that repealed them
162163
163164
Returns:
164165
str: Den formaterade texten
@@ -500,7 +501,7 @@ def _is_section_ikraft(header_line: str, content: str) -> bool:
500501
re.search(INTOFORCE_ANY_PATTERN, content_lower) is not None)
501502

502503

503-
def parse_logical_sections(text: str) -> str:
504+
def parse_logical_sections(text: str, repeal_map: Optional[Dict[str, str]] = None) -> str:
504505
"""
505506
Dela upp texten i logiska sektioner baserat på Markdown-rubriker och omslut
506507
varje rubrik och dess innehåll med <section>-taggar.
@@ -715,7 +716,13 @@ def process_current_section():
715716
if upphor_datum:
716717
attributes.append(f'selex:upphor_datum="{upphor_datum}"')
717718
if has_upphavd:
718-
attributes.append('selex:upphavd="true"') # TODO: Peka ut i vilken ändringsförfattning den upphävdes
719+
attributes.append('selex:upphavd="true"')
720+
721+
# Track which ändringsförfattning repealed this section
722+
if repeal_map and section_id:
723+
upphavd_av = _find_repeal_source(section_id, repeal_map)
724+
if upphavd_av:
725+
attributes.append(f'selex:upphavd_av="{upphavd_av}"')
719726
if ikraft_villkor:
720727
attributes.append(f'selex:ikraft_villkor="{ikraft_villkor}"')
721728

@@ -799,6 +806,53 @@ def process_current_section():
799806
return '\n'.join(result)
800807

801808

809+
def _find_repeal_source(section_id: str, repeal_map: Dict[str, str]) -> Optional[str]:
810+
"""
811+
Find which ändringsförfattning repealed this section.
812+
813+
Tries multiple normalized forms to match section_id against repeal_map.
814+
Example: section_id 'kap29.15' matches '29kap15§' in repeal_map
815+
816+
Args:
817+
section_id: Section ID like 'kap29.15' or 'kap1.15a' (generated by generate_section_id)
818+
repeal_map: Map of normalized references to beteckning (e.g., '29kap15§' -> '2024:796')
819+
820+
Returns:
821+
Beteckning of repealing ändringsförfattning, or None
822+
"""
823+
# Direct match
824+
if section_id in repeal_map:
825+
return repeal_map[section_id]
826+
827+
# Try normalized forms with § symbol added
828+
# section_id format from generate_section_id: 'kap29.15' or 'kap2.15a' or just '15'
829+
830+
# Try matching chapter.paragraph format (e.g., 'kap29.15')
831+
match = re.match(r'kap(\d+[a-z]?)\.(\d+[a-z]?)$', section_id)
832+
if match:
833+
chapter = match.group(1)
834+
paragraph = match.group(2)
835+
836+
# Try normalized forms
837+
variants = [
838+
f"{chapter}kap{paragraph}§", # '29kap15§'
839+
f"{chapter}kap.{paragraph}§", # '29kap.15§'
840+
f"{paragraph}§", # '15§' (chapter-less)
841+
]
842+
843+
for variant in variants:
844+
if variant in repeal_map:
845+
return repeal_map[variant]
846+
847+
# Try matching simple paragraph number (e.g., '15' or '15a')
848+
match = re.match(r'^(\d+[a-z]?)$', section_id)
849+
if match:
850+
paragraph = match.group(1)
851+
variant = f"{paragraph}§"
852+
if variant in repeal_map:
853+
return repeal_map[variant]
854+
855+
return None
802856

803857

804858
def check_unprocessed_temporal_sections(text: str) -> None:

sfs_processor.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from temporal.title_temporal import title_temporal
3737
from temporal.amendments import extract_amendments
3838
from temporal.apply_temporal import apply_temporal, is_document_content_empty, add_empty_document_message
39+
from temporal.parse_anteckningar import parse_anteckningar
3940
from exporters.git import create_init_git_commit
4041
from util.yaml_utils import format_yaml_value
4142
from util.datetime_utils import format_datetime
@@ -434,11 +435,24 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
434435
# Use the ignored content body (already includes heading)
435436
markdown_body = ignored_body
436437
else:
438+
# Build repeal map from amendments to track which ändringsförfattning repealed each paragraph
439+
repeal_map = {}
440+
if data.get('andringsforfattningar'):
441+
for amendment in data['andringsforfattningar']:
442+
beteckning = amendment.get('beteckning')
443+
anteckningar = amendment.get('anteckningar', '')
444+
445+
if beteckning and anteckningar:
446+
parsed = parse_anteckningar(anteckningar)
447+
# Map each repealed paragraph to this amendment
448+
for repealed_ref in parsed.get('repealed', []):
449+
repeal_map[repealed_ref] = beteckning
450+
437451
# Format the content text to markdown
438-
formatted_text = format_sfs_text_as_markdown(innehall_text, apply_links=apply_links)
452+
formatted_text = format_sfs_text_as_markdown(innehall_text, apply_links=apply_links, repeal_map=repeal_map)
439453

440454
# Apply section tags
441-
formatted_text = parse_logical_sections(formatted_text)
455+
formatted_text = parse_logical_sections(formatted_text, repeal_map=repeal_map)
442456

443457
# Debug: Check if formatting resulted in empty text
444458
if not formatted_text.strip():

temporal/parse_anteckningar.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
"""
2+
Parser for Swedish legal amendment notes (anteckningar).
3+
4+
This module parses the anteckningar field from ändringsförfattningar to extract
5+
structured information about which paragraphs were repealed, amended, or added.
6+
7+
Example anteckningar:
8+
"upph. 29 kap. 15, 16 §§, rubr. närmast före 29 kap. 15 §; ändr. 10 kap. 37 §"
9+
10+
Parsed result:
11+
{
12+
'repealed': ['29kap15§', '29kap16§'],
13+
'amended': ['10kap37§'],
14+
'new': []
15+
}
16+
"""
17+
18+
import re
19+
from typing import Dict, List
20+
21+
22+
def parse_anteckningar(anteckningar: str) -> Dict[str, List[str]]:
    """
    Turn a Swedish amendment note (anteckningar) into structured data.

    The note is split on semicolons. Each clause is classified by its
    leading keyword and the rest of the clause is handed to
    ``_extract_paragraphs``. Clauses with any other leading text are ignored.

    Args:
        anteckningar: The anteckningar string from an amending act.

    Returns:
        Dictionary with three keys, each holding a list of normalized
        paragraph references:
        - 'repealed': clauses starting with 'upph.'
        - 'amended': clauses starting with 'ändr.'
        - 'new': clauses starting with 'ny ' or 'nya '

    Example:
        >>> parse_anteckningar("upph. 29 kap. 15, 16 §§; ändr. 10 kap. 37 §")
        {'repealed': ['29kap15§', '29kap16§'], 'amended': ['10kap37§'], 'new': []}
    """
    parsed: Dict[str, List[str]] = {'repealed': [], 'amended': [], 'new': []}

    # Nothing to parse for a missing or whitespace-only note.
    if not anteckningar or not anteckningar.strip():
        return parsed

    # (leading keyword, result bucket, number of characters to drop).
    # 'nya ' is listed before 'ny ' only for readability; the two prefixes
    # cannot both match the same clause.
    actions = (
        ('upph.', 'repealed', 5),
        ('ändr.', 'amended', 5),
        ('nya ', 'new', 3),
        ('ny ', 'new', 2),
    )

    for raw_clause in anteckningar.split(';'):
        clause = raw_clause.strip()
        if not clause:
            continue

        for keyword, bucket, skip in actions:
            if clause.startswith(keyword):
                remainder = clause[skip:].strip()
                parsed[bucket].extend(_extract_paragraphs(remainder))
                break

    return parsed
70+
71+
72+
def _extract_paragraphs(text: str) -> List[str]:
73+
"""
74+
Extract normalized paragraph references from a text fragment.
75+
76+
Handles patterns like:
77+
- "29 kap. 15 §" → ['29kap15§']
78+
- "29 kap. 15, 16 §§" → ['29kap15§', '29kap16§']
79+
- "15 §" → ['15§']
80+
- "23 kap." → ['23kap'] (chapter-level, Phase 2)
81+
82+
Args:
83+
text: Text fragment after the action keyword (upph./ändr./ny)
84+
85+
Returns:
86+
List of normalized paragraph references
87+
"""
88+
paragraphs = []
89+
90+
# Skip patterns we don't handle yet (Phase 2)
91+
if 'rubr.' in text or 'betecknas' in text or 'nuvarande' in text:
92+
# Log for future enhancement but don't extract
93+
# These are complex patterns for Phase 2
94+
pass
95+
96+
# Pattern 1: Chapter + paragraphs
97+
# Examples: "29 kap. 15, 16 §§", "29 kap. 15 §", "2 kap. 32, 33 §§"
98+
chapter_pattern = r'(\d+(?:\s*[a-z])?)\s*kap\.\s*((?:\d+(?:\s*[a-z])?(?:\s*,\s*)?)+)\s*§'
99+
100+
for match in re.finditer(chapter_pattern, text, re.IGNORECASE):
101+
chapter = match.group(1).replace(' ', '').lower()
102+
para_list = match.group(2)
103+
104+
# Split on commas to get individual paragraph numbers
105+
para_numbers = [p.strip().replace(' ', '').lower() for p in para_list.split(',')]
106+
107+
for para_num in para_numbers:
108+
if para_num: # Skip empty strings
109+
normalized = f"{chapter}kap{para_num}§"
110+
paragraphs.append(normalized)
111+
112+
# Pattern 2: Chapter only (for chapter-level changes)
113+
# Example: "23 kap." (without paragraph reference)
114+
# Note: This is for Phase 2, but we detect it for completeness
115+
chapter_only_pattern = r'(\d+(?:\s*[a-z])?)\s*kap\.(?!\s*\d)'
116+
117+
for match in re.finditer(chapter_only_pattern, text, re.IGNORECASE):
118+
chapter = match.group(1).replace(' ', '').lower()
119+
# Chapter-level change - skip for Phase 1
120+
# In Phase 2, we'd add: paragraphs.append(f"{chapter}kap")
121+
pass
122+
123+
# Pattern 3: Paragraph without chapter
124+
# Examples: "15 §", "15, 16 §§"
125+
# These references are ambiguous without chapter context
126+
para_only_pattern = r'(?<!\d\s)(?<!kap\.\s)(\d+(?:\s*[a-z])?(?:\s*,\s*\d+(?:\s*[a-z])?)*)\s*§'
127+
128+
# Only match if there's no chapter context before it
129+
if 'kap.' not in text:
130+
for match in re.finditer(para_only_pattern, text, re.IGNORECASE):
131+
para_list = match.group(1)
132+
para_numbers = [p.strip().replace(' ', '').lower() for p in para_list.split(',')]
133+
134+
for para_num in para_numbers:
135+
if para_num:
136+
normalized = f"{para_num}§"
137+
paragraphs.append(normalized)
138+
139+
return paragraphs
140+
141+
142+
def _normalize_reference(chapter: str, paragraph: str) -> str:
143+
"""
144+
Create a normalized section reference.
145+
146+
Args:
147+
chapter: Chapter number (e.g., '29', '2a')
148+
paragraph: Paragraph number (e.g., '15', '15a')
149+
150+
Returns:
151+
Normalized reference (e.g., '29kap15§', '2akap15a§')
152+
"""
153+
chapter_clean = chapter.replace(' ', '').lower()
154+
para_clean = paragraph.replace(' ', '').lower()
155+
return f"{chapter_clean}kap{para_clean}§"

0 commit comments

Comments
 (0)