Parse multiple CELEX numbers as YAML list

marcarl · claude · marcarl · commit 0645a26a7dd6 · 2026-01-07T19:59:07.000+01:00
Add support for handling multiple CELEX numbers (comma- or space-separated) in both main documents and amendments. When multiple CELEX numbers are present, they are now stored as a YAML list instead of a single string.

Changes:
- Add parse_celex_numbers() function to split CELEX strings
- Update YAML front matter to output a single value or a list
- Apply the same parsing to amendment CELEX numbers

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
diff --git a/sfs_processor.py b/sfs_processor.py
@@ -44,6 +44,27 @@
 from formatters.predocs_parser import parse_predocs_string
 
 
+def parse_celex_numbers(celex_string: str) -> List[str]:
+    """
+    Split a raw CELEX field into its individual CELEX numbers.
+
+    Commas and whitespace, in any mix or quantity, act as separators.
+
+    Args:
+        celex_string (str): Raw field text (e.g., "32001L0083, 32004L0027")
+
+    Returns:
+        List[str]: The numbers in input order; empty for blank input
+    """
+    if not (celex_string and celex_string.strip()):
+        return []
+
+    # Leading or trailing separators leave empty pieces behind; drop them
+    pieces = (piece.strip() for piece in re.split(r'[,\s]+', celex_string))
+
+    return [piece for piece in pieces if piece]
+
+
 def create_safe_filename(beteckning: str, preserve_selex_tags: bool = False) -> str:
     """
     Create a safe filename from beteckning.
@@ -399,7 +420,17 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
             # Fallback to original string
             yaml_front_matter += f"forarbeten: {format_yaml_value(predocs)}\n"
     if celex_nummer:
-        yaml_front_matter += f"celex: {format_yaml_value(celex_nummer)}\n"
+        # Parse CELEX numbers (can be comma-separated or space-separated)
+        celex_list = parse_celex_numbers(celex_nummer)
+
+        if len(celex_list) == 1:
+            # Single CELEX number - write as string
+            yaml_front_matter += f"celex: {format_yaml_value(celex_list[0])}\n"
+        elif len(celex_list) > 1:
+            # Multiple CELEX numbers - write as list
+            yaml_front_matter += "celex:\n"
+            for celex in celex_list:
+                yaml_front_matter += f"  - {format_yaml_value(celex)}\n"
 
     # Add eu_direktiv only if it's true
     if eu_direktiv:
@@ -415,6 +446,16 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
                 yaml_front_matter += f"    rubrik: {format_yaml_value(amendment['rubrik'])}\n"
             if amendment['ikraft_datum']:
                 yaml_front_matter += f"    ikraft_datum: {format_yaml_value(amendment['ikraft_datum'])}\n"
+            if amendment.get('celex'):
+                celex = amendment['celex']
+                if isinstance(celex, list):
+                    # Multiple CELEX numbers
+                    yaml_front_matter += "    celex:\n"
+                    for celex_num in celex:
+                        yaml_front_matter += f"      - {format_yaml_value(celex_num)}\n"
+                else:
+                    # Single CELEX number
+                    yaml_front_matter += f"    celex: {format_yaml_value(celex)}\n"
             if amendment['anteckningar']:
                 yaml_front_matter += f"    anteckningar: {format_yaml_value(amendment['anteckningar'])}\n"
 
diff --git a/temporal/amendments.py b/temporal/amendments.py
@@ -10,7 +10,8 @@
 def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """Extract and format amendment information, sorted chronologically by ikraft_datum."""
     from util.datetime_utils import format_datetime  # Import to avoid circular imports
-    
+    import re
+
     amendments = []
 
     for amendment in andringsforfattningar:
@@ -21,10 +22,21 @@ def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict
             'anteckningar': clean_text(amendment.get('anteckningar'))
         }
 
+        # Handle CELEX numbers (can be comma-separated or space-separated)
+        celex_nummer = amendment.get('celexnummer')
+        if celex_nummer:
+            # Parse CELEX numbers - split by comma and/or whitespace
+            celex_list = [celex.strip() for celex in re.split(r'[,\s]+', celex_nummer) if celex.strip()]
+
+            if len(celex_list) == 1:
+                amendment_data['celex'] = celex_list[0]
+            elif len(celex_list) > 1:
+                amendment_data['celex'] = celex_list
+
         # Only include non-empty amendments
         if amendment_data['beteckning']:
             amendments.append(amendment_data)
-    
+
     # Sort amendments chronologically by ikraft_datum
     # Amendments without ikraft_datum will be sorted to the end
     amendments.sort(key=lambda x: x['ikraft_datum'] or '9999-12-31')