Add SFS designation to positional ids for tracking across amendments

claude · claude · commit a0d6ee54ec59 · 2026-01-08T20:15:46.000Z
Include SFS designation (e.g., "2024:123") in positional ids to enable:
- Unique identification across different laws
- Tracking value changes when the same slug maps to multiple SFS versions

New id format: sfs-YYYY-NNN-section-type-position (e.g., sfs-2024-123-kap5.2-belopp-1)

Reference table now supports tracking changes over time:
{
  "sfs-2020-100-kap5.2-belopp-1": "tillstandsavgift",
  "sfs-2024-123-kap5.2-belopp-1": "tillstandsavgift"
}

Both resolve to id="tillstandsavgift" but with different values,
allowing comparison of the same data point across amendments.

Also extracts the SFS id from <article selex:id="lag-2024-123"> tags.
diff --git a/data/amount-references.json b/data/amount-references.json
@@ -1,7 +1,13 @@
 {
   "_comment": "Reference table mapping positional ids to descriptive slugs. Keys starting with _ are ignored.",
-  "_format": "{ 'section-id-type-position': 'descriptive-slug' }",
+  "_format": "{ 'sfs-YYYY-NNN-section-type-position': 'descriptive-slug' }",
 
-  "kap5.2-belopp-1": "tillstandsavgift",
-  "kap5.2-procent-1": "riksbankens-referensranta"
+  "_example_tracking_changes": "Multiple SFS entries can map to same slug to track value changes over time",
+
+  "sfs-2020-100-kap5.2-belopp-1": "tillstandsavgift",
+  "sfs-2022-456-kap5.2-belopp-1": "tillstandsavgift",
+  "sfs-2024-123-kap5.2-belopp-1": "tillstandsavgift",
+
+  "sfs-2020-100-kap6.1-procent-1": "riksbankens-referensranta",
+  "sfs-2024-123-kap6.1-procent-1": "riksbankens-referensranta"
 }
diff --git a/formatters/tag_swedish_amounts.py b/formatters/tag_swedish_amounts.py
@@ -115,22 +115,32 @@ def load_reference_table() -> Dict[str, str]:
     return _reference_table
 
 
-def generate_positional_id(section_id: Optional[str], data_type: str, position: int) -> str:
+def generate_positional_id(sfs_id: Optional[str], section_id: Optional[str], data_type: str, position: int) -> str:
     """
     Generate a positional id for a data element.
 
     Args:
+        sfs_id: The SFS designation (e.g., "2024:123") or None
         section_id: The section id (e.g., "kap5.2") or None
         data_type: "belopp" for amounts, "procent" for percentages
         position: 1-based position within the section for this type
 
     Returns:
-        A positional id like "kap5.2-belopp-1" or "procent-1" if no section
+        A positional id like "sfs-2024-123-kap5.2-belopp-1"
     """
+    parts = []
+
+    if sfs_id:
+        # Normalize SFS id: "2024:123" -> "sfs-2024-123"
+        normalized_sfs = "sfs-" + sfs_id.replace(":", "-")
+        parts.append(normalized_sfs)
+
     if section_id:
-        return f"{section_id}-{data_type}-{position}"
-    else:
-        return f"{data_type}-{position}"
+        parts.append(section_id)
+
+    parts.append(f"{data_type}-{position}")
+
+    return "-".join(parts)
 
 
 def resolve_id(positional_id: str) -> str:
@@ -178,34 +188,40 @@ def _slugify(text: str) -> str:
     return text
 
 
-def tag_swedish_amounts(text: str, section_id: Optional[str] = None) -> str:
+def tag_swedish_amounts(text: str, sfs_id: Optional[str] = None, section_id: Optional[str] = None) -> str:
     """
     Tag Swedish monetary amounts and percentages in text with <data> elements.
 
     Processes text line by line, skipping markdown headers.
     Each amount/percentage is wrapped with a <data> tag containing:
-    - id: positional id (e.g., "kap5.2-belopp-1") or resolved slug from reference table
+    - id: positional id or resolved slug from reference table
     - type: "amount" or "percentage"
     - value: normalized numeric value
 
     Args:
         text: The text to process
+        sfs_id: Optional SFS designation (e.g., "2024:123") for generating positional ids
         section_id: Optional section id for generating positional ids (e.g., "kap5.2")
 
     Returns:
         Text with amounts and percentages wrapped in <data> tags
 
     Example:
-        Input: "Avgiften är 1 000 kronor per år." with section_id="kap5.2"
-        Output: '<data id="kap5.2-belopp-1" type="amount" value="1000">1 000 kronor</data>'
+        Input: "Avgiften är 1 000 kronor." with sfs_id="2024:123", section_id="kap5.2"
+        Output: '<data id="sfs-2024-123-kap5.2-belopp-1" type="amount" value="1000">...</data>'
 
-        With reference table {"kap5.2-belopp-1": "tillstandsavgift"}:
-        Output: '<data id="tillstandsavgift" type="amount" value="1000">1 000 kronor</data>'
+        With reference table {"sfs-2024-123-kap5.2-belopp-1": "tillstandsavgift"}:
+        Output: '<data id="tillstandsavgift" type="amount" value="1000">...</data>'
+
+    Multiple SFS entries can map to the same slug to track changes over time:
+        {"sfs-2020-100-kap5.2-belopp-1": "tillstandsavgift",
+         "sfs-2024-123-kap5.2-belopp-1": "tillstandsavgift"}
     """
     lines = text.split('\n')
     processed_lines = []
 
-    # Track current section and counters
+    # Track current SFS, section and counters
+    current_sfs = sfs_id
     current_section = section_id
     amount_counter = 0
     percentage_counter = 0
@@ -216,6 +232,17 @@ def tag_swedish_amounts(text: str, section_id: Optional[str] = None) -> str:
             processed_lines.append(line)
             continue
 
+        # Check for article tags to extract SFS id
+        article_match = re.match(r'^\s*<article[^>]*\bselex:id=["\']([^"\']+)["\']', line)
+        if article_match:
+            # Extract SFS id from selex:id like "lag-2024-123" -> "2024:123"
+            selex_id = article_match.group(1)
+            sfs_match = re.search(r'(\d{4})-(\d+)', selex_id)
+            if sfs_match:
+                current_sfs = f"{sfs_match.group(1)}:{sfs_match.group(2)}"
+            processed_lines.append(line)
+            continue
+
         # Check for section tags to extract section id
         section_match = re.match(r'^\s*<section[^>]*\bid=["\']([^"\']+)["\']', line)
         if section_match:
@@ -232,12 +259,12 @@ def tag_swedish_amounts(text: str, section_id: Optional[str] = None) -> str:
 
         # Process amounts and percentages with counters
         processed_line, new_amount_count = _tag_amounts_in_line(
-            line, current_section, amount_counter
+            line, current_sfs, current_section, amount_counter
         )
         amount_counter = new_amount_count
 
         processed_line, new_percentage_count = _tag_percentages_in_line(
-            processed_line, current_section, percentage_counter
+            processed_line, current_sfs, current_section, percentage_counter
         )
         percentage_counter = new_percentage_count
 
@@ -248,6 +275,7 @@ def tag_swedish_amounts(text: str, section_id: Optional[str] = None) -> str:
 
 def _tag_amounts_in_line(
     line: str,
+    sfs_id: Optional[str],
     section_id: Optional[str],
     counter: int
 ) -> tuple[str, int]:
@@ -256,6 +284,7 @@ def _tag_amounts_in_line(
 
     Args:
         line: A single line of text
+        sfs_id: Current SFS designation for positional ids
         section_id: Current section id for positional ids
         counter: Current count of amounts in this section
 
@@ -271,7 +300,7 @@ def replace_amount_with_multiplier(match):
         number = match.group(1)
 
         current_counter += 1
-        positional_id = generate_positional_id(section_id, "belopp", current_counter)
+        positional_id = generate_positional_id(sfs_id, section_id, "belopp", current_counter)
         resolved_id = resolve_id(positional_id)
 
         normalized_value = normalize_number(number)
@@ -291,7 +320,7 @@ def replace_simple_amount(match):
         number = match.group(1)
 
         current_counter += 1
-        positional_id = generate_positional_id(section_id, "belopp", current_counter)
+        positional_id = generate_positional_id(sfs_id, section_id, "belopp", current_counter)
         resolved_id = resolve_id(positional_id)
 
         normalized_value = normalize_number(number)
@@ -307,6 +336,7 @@ def replace_simple_amount(match):
 
 def _tag_percentages_in_line(
     line: str,
+    sfs_id: Optional[str],
     section_id: Optional[str],
     counter: int
 ) -> tuple[str, int]:
@@ -315,6 +345,7 @@ def _tag_percentages_in_line(
 
     Args:
         line: A single line of text
+        sfs_id: Current SFS designation for positional ids
         section_id: Current section id for positional ids
         counter: Current count of percentages in this section
 
@@ -335,7 +366,7 @@ def replace_percentage(match):
         number = match.group(1)
 
         current_counter += 1
-        positional_id = generate_positional_id(section_id, "procent", current_counter)
+        positional_id = generate_positional_id(sfs_id, section_id, "procent", current_counter)
         resolved_id = resolve_id(positional_id)
 
         normalized_value = normalize_number(number)
diff --git a/test/test_tag_swedish_amounts.py b/test/test_tag_swedish_amounts.py
@@ -87,25 +87,35 @@ def test_multiple_spaces(self):
 class TestGeneratePositionalId:
     """Test the generate_positional_id function."""
 
-    def test_with_section_id(self):
-        """Test generating positional id with section."""
-        result = generate_positional_id("kap5.2", "belopp", 1)
+    def test_with_sfs_and_section(self):
+        """Test generating positional id with SFS and section."""
+        result = generate_positional_id("2024:123", "kap5.2", "belopp", 1)
+        assert result == "sfs-2024-123-kap5.2-belopp-1"
+
+    def test_with_sfs_only(self):
+        """Test generating positional id with only SFS."""
+        result = generate_positional_id("2024:123", None, "belopp", 1)
+        assert result == "sfs-2024-123-belopp-1"
+
+    def test_with_section_only(self):
+        """Test generating positional id with only section."""
+        result = generate_positional_id(None, "kap5.2", "belopp", 1)
         assert result == "kap5.2-belopp-1"
 
-    def test_with_section_id_multiple(self):
-        """Test generating positional id with higher position."""
-        result = generate_positional_id("kap5.2", "belopp", 3)
-        assert result == "kap5.2-belopp-3"
-
-    def test_without_section_id(self):
-        """Test generating positional id without section."""
-        result = generate_positional_id(None, "belopp", 1)
+    def test_without_sfs_or_section(self):
+        """Test generating positional id without SFS or section."""
+        result = generate_positional_id(None, None, "belopp", 1)
         assert result == "belopp-1"
 
     def test_percentage_type(self):
         """Test generating positional id for percentage."""
-        result = generate_positional_id("kap1.5", "procent", 2)
-        assert result == "kap1.5-procent-2"
+        result = generate_positional_id("2020:100", "kap1.5", "procent", 2)
+        assert result == "sfs-2020-100-kap1.5-procent-2"
+
+    def test_multiple_positions(self):
+        """Test generating positional id with higher position."""
+        result = generate_positional_id("2024:123", "kap5.2", "belopp", 3)
+        assert result == "sfs-2024-123-kap5.2-belopp-3"
 
 
 # ===========================================================================
@@ -321,20 +331,25 @@ class TestTagSwedishAmountsPositionalIds:
     """Test that positional ids are generated correctly."""
 
     def test_simple_positional_id(self):
-        """Test positional id without section."""
+        """Test positional id without SFS or section."""
         result = tag_swedish_amounts("Avgiften är 500 kronor.")
         assert 'id="belopp-1"' in result
 
-    def test_with_section_id(self):
-        """Test positional id with section_id parameter."""
-        result = tag_swedish_amounts("Avgiften är 500 kronor.", section_id="kap5.2")
-        assert 'id="kap5.2-belopp-1"' in result
+    def test_with_sfs_id(self):
+        """Test positional id with sfs_id parameter."""
+        result = tag_swedish_amounts("Avgiften är 500 kronor.", sfs_id="2024:123")
+        assert 'id="sfs-2024-123-belopp-1"' in result
+
+    def test_with_sfs_and_section(self):
+        """Test positional id with both sfs_id and section_id."""
+        result = tag_swedish_amounts("Avgiften är 500 kronor.", sfs_id="2024:123", section_id="kap5.2")
+        assert 'id="sfs-2024-123-kap5.2-belopp-1"' in result
 
     def test_multiple_amounts_incrementing(self):
         """Test that multiple amounts get incrementing positions."""
-        result = tag_swedish_amounts("Första 500 kr och andra 1000 kr.", section_id="kap1.1")
-        assert 'id="kap1.1-belopp-1"' in result
-        assert 'id="kap1.1-belopp-2"' in result
+        result = tag_swedish_amounts("Första 500 kr och andra 1000 kr.", sfs_id="2024:123", section_id="kap1.1")
+        assert 'id="sfs-2024-123-kap1.1-belopp-1"' in result
+        assert 'id="sfs-2024-123-kap1.1-belopp-2"' in result
 
     def test_section_tag_resets_counter(self):
         """Test that section tags reset the counter."""
@@ -344,22 +359,31 @@ def test_section_tag_resets_counter(self):
 <section id="kap1.2">
 Belopp 200 kronor.
 </section>'''
+        result = tag_swedish_amounts(text, sfs_id="2024:123")
+        assert 'id="sfs-2024-123-kap1.1-belopp-1"' in result
+        assert 'id="sfs-2024-123-kap1.2-belopp-1"' in result
+
+    def test_article_tag_extracts_sfs(self):
+        """Test that article tags extract SFS id from selex:id."""
+        text = '''<article selex:id="lag-2024-123">
+Avgiften är 500 kronor.
+</article>'''
         result = tag_swedish_amounts(text)
-        assert 'id="kap1.1-belopp-1"' in result
-        assert 'id="kap1.2-belopp-1"' in result
+        assert 'id="sfs-2024-123-belopp-1"' in result
 
     def test_percentage_positional_id(self):
         """Test positional id for percentages."""
-        result = tag_swedish_amounts("Räntan är 5 procent.", section_id="kap2.3")
-        assert 'id="kap2.3-procent-1"' in result
-
-    def test_same_id_across_amendments(self):
-        """Test that same position gives same id with different values."""
-        result1 = tag_swedish_amounts("Avgiften är 500 kronor.", section_id="kap5.2")
-        result2 = tag_swedish_amounts("Avgiften är 1000 kronor.", section_id="kap5.2")
-        # Both should have same positional id but different values
-        assert 'id="kap5.2-belopp-1"' in result1
-        assert 'id="kap5.2-belopp-1"' in result2
+        result = tag_swedish_amounts("Räntan är 5 procent.", sfs_id="2024:123", section_id="kap2.3")
+        assert 'id="sfs-2024-123-kap2.3-procent-1"' in result
+
+    def test_same_slug_different_sfs(self):
+        """Test that same position in different SFS gives different positional ids."""
+        result1 = tag_swedish_amounts("Avgiften är 500 kronor.", sfs_id="2020:100", section_id="kap5.2")
+        result2 = tag_swedish_amounts("Avgiften är 1000 kronor.", sfs_id="2024:123", section_id="kap5.2")
+        # Different SFS gives different positional ids
+        assert 'id="sfs-2020-100-kap5.2-belopp-1"' in result1
+        assert 'id="sfs-2024-123-kap5.2-belopp-1"' in result2
+        # But values are different
         assert 'value="500"' in result1
         assert 'value="1000"' in result2