Skip to content

Commit a0d6ee5

Browse files
committed
Add SFS designation to positional ids for tracking across amendments
Include SFS designation (e.g., "2024:123") in positional ids to enable:
- Unique identification across different laws
- Tracking value changes when the same slug maps to multiple SFS versions

New id format: sfs-2024-123-kap5.2-belopp-1

Reference table now supports tracking changes over time:
{
  "sfs-2020-100-kap5.2-belopp-1": "tillstandsavgift",
  "sfs-2024-123-kap5.2-belopp-1": "tillstandsavgift"
}

Both resolve to id="tillstandsavgift" but with different values, allowing comparison of the same data point across amendments.

Also extracts SFS id from <article selex:id="lag-2024-123"> tags.
1 parent 136017a commit a0d6ee5

3 files changed

Lines changed: 114 additions & 53 deletions

File tree

data/amount-references.json

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
11
{
22
"_comment": "Reference table mapping positional ids to descriptive slugs. Keys starting with _ are ignored.",
3-
"_format": "{ 'section-id-type-position': 'descriptive-slug' }",
3+
"_format": "{ 'sfs-YYYY-NNN-section-type-position': 'descriptive-slug' }",
44

5-
"kap5.2-belopp-1": "tillstandsavgift",
6-
"kap5.2-procent-1": "riksbankens-referensranta"
5+
"_example_tracking_changes": "Multiple SFS entries can map to same slug to track value changes over time",
6+
7+
"sfs-2020-100-kap5.2-belopp-1": "tillstandsavgift",
8+
"sfs-2022-456-kap5.2-belopp-1": "tillstandsavgift",
9+
"sfs-2024-123-kap5.2-belopp-1": "tillstandsavgift",
10+
11+
"sfs-2020-100-kap6.1-procent-1": "riksbankens-referensranta",
12+
"sfs-2024-123-kap6.1-procent-1": "riksbankens-referensranta"
713
}

formatters/tag_swedish_amounts.py

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -115,22 +115,32 @@ def load_reference_table() -> Dict[str, str]:
115115
return _reference_table
116116

117117

118-
def generate_positional_id(section_id: Optional[str], data_type: str, position: int) -> str:
118+
def generate_positional_id(sfs_id: Optional[str], section_id: Optional[str], data_type: str, position: int) -> str:
119119
"""
120120
Generate a positional id for a data element.
121121
122122
Args:
123+
sfs_id: The SFS designation (e.g., "2024:123") or None
123124
section_id: The section id (e.g., "kap5.2") or None
124125
data_type: "belopp" for amounts, "procent" for percentages
125126
position: 1-based position within the section for this type
126127
127128
Returns:
128-
A positional id like "kap5.2-belopp-1" or "procent-1" if no section
129+
A positional id like "sfs-2024-123-kap5.2-belopp-1"
129130
"""
131+
parts = []
132+
133+
if sfs_id:
134+
# Normalize SFS id: "2024:123" -> "sfs-2024-123"
135+
normalized_sfs = "sfs-" + sfs_id.replace(":", "-")
136+
parts.append(normalized_sfs)
137+
130138
if section_id:
131-
return f"{section_id}-{data_type}-{position}"
132-
else:
133-
return f"{data_type}-{position}"
139+
parts.append(section_id)
140+
141+
parts.append(f"{data_type}-{position}")
142+
143+
return "-".join(parts)
134144

135145

136146
def resolve_id(positional_id: str) -> str:
@@ -178,34 +188,40 @@ def _slugify(text: str) -> str:
178188
return text
179189

180190

181-
def tag_swedish_amounts(text: str, section_id: Optional[str] = None) -> str:
191+
def tag_swedish_amounts(text: str, sfs_id: Optional[str] = None, section_id: Optional[str] = None) -> str:
182192
"""
183193
Tag Swedish monetary amounts and percentages in text with <data> elements.
184194
185195
Processes text line by line, skipping markdown headers.
186196
Each amount/percentage is wrapped with a <data> tag containing:
187-
- id: positional id (e.g., "kap5.2-belopp-1") or resolved slug from reference table
197+
- id: positional id or resolved slug from reference table
188198
- type: "amount" or "percentage"
189199
- value: normalized numeric value
190200
191201
Args:
192202
text: The text to process
203+
sfs_id: Optional SFS designation (e.g., "2024:123") for generating positional ids
193204
section_id: Optional section id for generating positional ids (e.g., "kap5.2")
194205
195206
Returns:
196207
Text with amounts and percentages wrapped in <data> tags
197208
198209
Example:
199-
Input: "Avgiften är 1 000 kronor per år." with section_id="kap5.2"
200-
Output: '<data id="kap5.2-belopp-1" type="amount" value="1000">1 000 kronor</data>'
210+
Input: "Avgiften är 1 000 kronor." with sfs_id="2024:123", section_id="kap5.2"
211+
Output: '<data id="sfs-2024-123-kap5.2-belopp-1" type="amount" value="1000">...</data>'
201212
202-
With reference table {"kap5.2-belopp-1": "tillstandsavgift"}:
203-
Output: '<data id="tillstandsavgift" type="amount" value="1000">1 000 kronor</data>'
213+
With reference table {"sfs-2024-123-kap5.2-belopp-1": "tillstandsavgift"}:
214+
Output: '<data id="tillstandsavgift" type="amount" value="1000">...</data>'
215+
216+
Multiple SFS entries can map to the same slug to track changes over time:
217+
{"sfs-2020-100-kap5.2-belopp-1": "tillstandsavgift",
218+
"sfs-2024-123-kap5.2-belopp-1": "tillstandsavgift"}
204219
"""
205220
lines = text.split('\n')
206221
processed_lines = []
207222

208-
# Track current section and counters
223+
# Track current SFS, section and counters
224+
current_sfs = sfs_id
209225
current_section = section_id
210226
amount_counter = 0
211227
percentage_counter = 0
@@ -216,6 +232,17 @@ def tag_swedish_amounts(text: str, section_id: Optional[str] = None) -> str:
216232
processed_lines.append(line)
217233
continue
218234

235+
# Check for article tags to extract SFS id
236+
article_match = re.match(r'^\s*<article[^>]*\bselex:id=["\']([^"\']+)["\']', line)
237+
if article_match:
238+
# Extract SFS id from selex:id like "lag-2024-123" -> "2024:123"
239+
selex_id = article_match.group(1)
240+
sfs_match = re.search(r'(\d{4})-(\d+)', selex_id)
241+
if sfs_match:
242+
current_sfs = f"{sfs_match.group(1)}:{sfs_match.group(2)}"
243+
processed_lines.append(line)
244+
continue
245+
219246
# Check for section tags to extract section id
220247
section_match = re.match(r'^\s*<section[^>]*\bid=["\']([^"\']+)["\']', line)
221248
if section_match:
@@ -232,12 +259,12 @@ def tag_swedish_amounts(text: str, section_id: Optional[str] = None) -> str:
232259

233260
# Process amounts and percentages with counters
234261
processed_line, new_amount_count = _tag_amounts_in_line(
235-
line, current_section, amount_counter
262+
line, current_sfs, current_section, amount_counter
236263
)
237264
amount_counter = new_amount_count
238265

239266
processed_line, new_percentage_count = _tag_percentages_in_line(
240-
processed_line, current_section, percentage_counter
267+
processed_line, current_sfs, current_section, percentage_counter
241268
)
242269
percentage_counter = new_percentage_count
243270

@@ -248,6 +275,7 @@ def tag_swedish_amounts(text: str, section_id: Optional[str] = None) -> str:
248275

249276
def _tag_amounts_in_line(
250277
line: str,
278+
sfs_id: Optional[str],
251279
section_id: Optional[str],
252280
counter: int
253281
) -> tuple[str, int]:
@@ -256,6 +284,7 @@ def _tag_amounts_in_line(
256284
257285
Args:
258286
line: A single line of text
287+
sfs_id: Current SFS designation for positional ids
259288
section_id: Current section id for positional ids
260289
counter: Current count of amounts in this section
261290
@@ -271,7 +300,7 @@ def replace_amount_with_multiplier(match):
271300
number = match.group(1)
272301

273302
current_counter += 1
274-
positional_id = generate_positional_id(section_id, "belopp", current_counter)
303+
positional_id = generate_positional_id(sfs_id, section_id, "belopp", current_counter)
275304
resolved_id = resolve_id(positional_id)
276305

277306
normalized_value = normalize_number(number)
@@ -291,7 +320,7 @@ def replace_simple_amount(match):
291320
number = match.group(1)
292321

293322
current_counter += 1
294-
positional_id = generate_positional_id(section_id, "belopp", current_counter)
323+
positional_id = generate_positional_id(sfs_id, section_id, "belopp", current_counter)
295324
resolved_id = resolve_id(positional_id)
296325

297326
normalized_value = normalize_number(number)
@@ -307,6 +336,7 @@ def replace_simple_amount(match):
307336

308337
def _tag_percentages_in_line(
309338
line: str,
339+
sfs_id: Optional[str],
310340
section_id: Optional[str],
311341
counter: int
312342
) -> tuple[str, int]:
@@ -315,6 +345,7 @@ def _tag_percentages_in_line(
315345
316346
Args:
317347
line: A single line of text
348+
sfs_id: Current SFS designation for positional ids
318349
section_id: Current section id for positional ids
319350
counter: Current count of percentages in this section
320351
@@ -335,7 +366,7 @@ def replace_percentage(match):
335366
number = match.group(1)
336367

337368
current_counter += 1
338-
positional_id = generate_positional_id(section_id, "procent", current_counter)
369+
positional_id = generate_positional_id(sfs_id, section_id, "procent", current_counter)
339370
resolved_id = resolve_id(positional_id)
340371

341372
normalized_value = normalize_number(number)

test/test_tag_swedish_amounts.py

Lines changed: 57 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -87,25 +87,35 @@ def test_multiple_spaces(self):
8787
class TestGeneratePositionalId:
8888
"""Test the generate_positional_id function."""
8989

90-
def test_with_section_id(self):
91-
"""Test generating positional id with section."""
92-
result = generate_positional_id("kap5.2", "belopp", 1)
90+
def test_with_sfs_and_section(self):
91+
"""Test generating positional id with SFS and section."""
92+
result = generate_positional_id("2024:123", "kap5.2", "belopp", 1)
93+
assert result == "sfs-2024-123-kap5.2-belopp-1"
94+
95+
def test_with_sfs_only(self):
96+
"""Test generating positional id with only SFS."""
97+
result = generate_positional_id("2024:123", None, "belopp", 1)
98+
assert result == "sfs-2024-123-belopp-1"
99+
100+
def test_with_section_only(self):
101+
"""Test generating positional id with only section."""
102+
result = generate_positional_id(None, "kap5.2", "belopp", 1)
93103
assert result == "kap5.2-belopp-1"
94104

95-
def test_with_section_id_multiple(self):
96-
"""Test generating positional id with higher position."""
97-
result = generate_positional_id("kap5.2", "belopp", 3)
98-
assert result == "kap5.2-belopp-3"
99-
100-
def test_without_section_id(self):
101-
"""Test generating positional id without section."""
102-
result = generate_positional_id(None, "belopp", 1)
105+
def test_without_sfs_or_section(self):
106+
"""Test generating positional id without SFS or section."""
107+
result = generate_positional_id(None, None, "belopp", 1)
103108
assert result == "belopp-1"
104109

105110
def test_percentage_type(self):
106111
"""Test generating positional id for percentage."""
107-
result = generate_positional_id("kap1.5", "procent", 2)
108-
assert result == "kap1.5-procent-2"
112+
result = generate_positional_id("2020:100", "kap1.5", "procent", 2)
113+
assert result == "sfs-2020-100-kap1.5-procent-2"
114+
115+
def test_multiple_positions(self):
116+
"""Test generating positional id with higher position."""
117+
result = generate_positional_id("2024:123", "kap5.2", "belopp", 3)
118+
assert result == "sfs-2024-123-kap5.2-belopp-3"
109119

110120

111121
# ===========================================================================
@@ -321,20 +331,25 @@ class TestTagSwedishAmountsPositionalIds:
321331
"""Test that positional ids are generated correctly."""
322332

323333
def test_simple_positional_id(self):
324-
"""Test positional id without section."""
334+
"""Test positional id without SFS or section."""
325335
result = tag_swedish_amounts("Avgiften är 500 kronor.")
326336
assert 'id="belopp-1"' in result
327337

328-
def test_with_section_id(self):
329-
"""Test positional id with section_id parameter."""
330-
result = tag_swedish_amounts("Avgiften är 500 kronor.", section_id="kap5.2")
331-
assert 'id="kap5.2-belopp-1"' in result
338+
def test_with_sfs_id(self):
339+
"""Test positional id with sfs_id parameter."""
340+
result = tag_swedish_amounts("Avgiften är 500 kronor.", sfs_id="2024:123")
341+
assert 'id="sfs-2024-123-belopp-1"' in result
342+
343+
def test_with_sfs_and_section(self):
344+
"""Test positional id with both sfs_id and section_id."""
345+
result = tag_swedish_amounts("Avgiften är 500 kronor.", sfs_id="2024:123", section_id="kap5.2")
346+
assert 'id="sfs-2024-123-kap5.2-belopp-1"' in result
332347

333348
def test_multiple_amounts_incrementing(self):
334349
"""Test that multiple amounts get incrementing positions."""
335-
result = tag_swedish_amounts("Första 500 kr och andra 1000 kr.", section_id="kap1.1")
336-
assert 'id="kap1.1-belopp-1"' in result
337-
assert 'id="kap1.1-belopp-2"' in result
350+
result = tag_swedish_amounts("Första 500 kr och andra 1000 kr.", sfs_id="2024:123", section_id="kap1.1")
351+
assert 'id="sfs-2024-123-kap1.1-belopp-1"' in result
352+
assert 'id="sfs-2024-123-kap1.1-belopp-2"' in result
338353

339354
def test_section_tag_resets_counter(self):
340355
"""Test that section tags reset the counter."""
@@ -344,22 +359,31 @@ def test_section_tag_resets_counter(self):
344359
<section id="kap1.2">
345360
Belopp 200 kronor.
346361
</section>'''
362+
result = tag_swedish_amounts(text, sfs_id="2024:123")
363+
assert 'id="sfs-2024-123-kap1.1-belopp-1"' in result
364+
assert 'id="sfs-2024-123-kap1.2-belopp-1"' in result
365+
366+
def test_article_tag_extracts_sfs(self):
367+
"""Test that article tags extract SFS id from selex:id."""
368+
text = '''<article selex:id="lag-2024-123">
369+
Avgiften är 500 kronor.
370+
</article>'''
347371
result = tag_swedish_amounts(text)
348-
assert 'id="kap1.1-belopp-1"' in result
349-
assert 'id="kap1.2-belopp-1"' in result
372+
assert 'id="sfs-2024-123-belopp-1"' in result
350373

351374
def test_percentage_positional_id(self):
352375
"""Test positional id for percentages."""
353-
result = tag_swedish_amounts("Räntan är 5 procent.", section_id="kap2.3")
354-
assert 'id="kap2.3-procent-1"' in result
355-
356-
def test_same_id_across_amendments(self):
357-
"""Test that same position gives same id with different values."""
358-
result1 = tag_swedish_amounts("Avgiften är 500 kronor.", section_id="kap5.2")
359-
result2 = tag_swedish_amounts("Avgiften är 1000 kronor.", section_id="kap5.2")
360-
# Both should have same positional id but different values
361-
assert 'id="kap5.2-belopp-1"' in result1
362-
assert 'id="kap5.2-belopp-1"' in result2
376+
result = tag_swedish_amounts("Räntan är 5 procent.", sfs_id="2024:123", section_id="kap2.3")
377+
assert 'id="sfs-2024-123-kap2.3-procent-1"' in result
378+
379+
def test_same_slug_different_sfs(self):
380+
"""Test that same position in different SFS gives different positional ids."""
381+
result1 = tag_swedish_amounts("Avgiften är 500 kronor.", sfs_id="2020:100", section_id="kap5.2")
382+
result2 = tag_swedish_amounts("Avgiften är 1000 kronor.", sfs_id="2024:123", section_id="kap5.2")
383+
# Different SFS gives different positional ids
384+
assert 'id="sfs-2020-100-kap5.2-belopp-1"' in result1
385+
assert 'id="sfs-2024-123-kap5.2-belopp-1"' in result2
386+
# But values are different
363387
assert 'value="500"' in result1
364388
assert 'value="1000"' in result2
365389

0 commit comments

Comments
 (0)