Skip to content

Commit 0645a26

Browse files
marcarl and claude committed
Parse multiple CELEX numbers as YAML list
Add support for handling multiple CELEX numbers (comma or space-separated) in both main documents and amendments. When multiple CELEX numbers are present, they are now stored as a YAML list instead of a single string.

Changes:
- Add parse_celex_numbers() function to split CELEX strings
- Update YAML front matter to output single value or list
- Apply same parsing to amendment CELEX numbers

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent bd64c16 commit 0645a26

2 files changed

Lines changed: 56 additions & 3 deletions

File tree

sfs_processor.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,27 @@
4444
from formatters.predocs_parser import parse_predocs_string
4545

4646

47+
def parse_celex_numbers(celex_string: str) -> List[str]:
    """
    Split a raw CELEX field into individual CELEX numbers.

    Commas and any whitespace both act as separators, and runs of
    separators are collapsed, so "32001L0083, 32004L0027" and
    "32001L0083 32004L0027" produce the same result.

    Args:
        celex_string (str): One or more CELEX numbers (e.g., "32001L0083, 32004L0027")

    Returns:
        List[str]: The CELEX numbers in their original order. The list is
        empty when the input is empty, None, or made up only of separators.
    """
    if not celex_string:
        return []

    # re.split yields empty strings at the ends when the input starts or
    # ends with a separator; dropping them also covers separator-only input.
    # Tokens cannot contain whitespace (it is a separator), so no strip is needed.
    return [part for part in re.split(r'[,\s]+', celex_string) if part]
66+
67+
4768
def create_safe_filename(beteckning: str, preserve_selex_tags: bool = False) -> str:
4869
"""
4970
Create a safe filename from beteckning.
@@ -399,7 +420,17 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
399420
# Fallback to original string
400421
yaml_front_matter += f"forarbeten: {format_yaml_value(predocs)}\n"
401422
if celex_nummer:
402-
yaml_front_matter += f"celex: {format_yaml_value(celex_nummer)}\n"
423+
# Parse CELEX numbers (can be comma-separated or space-separated)
424+
celex_list = parse_celex_numbers(celex_nummer)
425+
426+
if len(celex_list) == 1:
427+
# Single CELEX number - write as string
428+
yaml_front_matter += f"celex: {format_yaml_value(celex_list[0])}\n"
429+
elif len(celex_list) > 1:
430+
# Multiple CELEX numbers - write as list
431+
yaml_front_matter += "celex:\n"
432+
for celex in celex_list:
433+
yaml_front_matter += f" - {format_yaml_value(celex)}\n"
403434

404435
# Add eu_direktiv only if it's true
405436
if eu_direktiv:
@@ -415,6 +446,16 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
415446
yaml_front_matter += f" rubrik: {format_yaml_value(amendment['rubrik'])}\n"
416447
if amendment['ikraft_datum']:
417448
yaml_front_matter += f" ikraft_datum: {format_yaml_value(amendment['ikraft_datum'])}\n"
449+
if amendment.get('celex'):
450+
celex = amendment['celex']
451+
if isinstance(celex, list):
452+
# Multiple CELEX numbers
453+
yaml_front_matter += " celex:\n"
454+
for celex_num in celex:
455+
yaml_front_matter += f" - {format_yaml_value(celex_num)}\n"
456+
else:
457+
# Single CELEX number
458+
yaml_front_matter += f" celex: {format_yaml_value(celex)}\n"
418459
if amendment['anteckningar']:
419460
yaml_front_matter += f" anteckningar: {format_yaml_value(amendment['anteckningar'])}\n"
420461

temporal/amendments.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
1111
"""Extract and format amendment information, sorted chronologically by ikraft_datum."""
1212
from util.datetime_utils import format_datetime # Import to avoid circular imports
13-
13+
import re
14+
1415
amendments = []
1516

1617
for amendment in andringsforfattningar:
@@ -21,10 +22,21 @@ def extract_amendments(andringsforfattningar: List[Dict[str, Any]]) -> List[Dict
2122
'anteckningar': clean_text(amendment.get('anteckningar'))
2223
}
2324

25+
# Handle CELEX numbers (can be comma-separated or space-separated)
26+
celex_nummer = amendment.get('celexnummer')
27+
if celex_nummer:
28+
# Parse CELEX numbers - split by comma and/or whitespace
29+
celex_list = [celex.strip() for celex in re.split(r'[,\s]+', celex_nummer) if celex.strip()]
30+
31+
if len(celex_list) == 1:
32+
amendment_data['celex'] = celex_list[0]
33+
elif len(celex_list) > 1:
34+
amendment_data['celex'] = celex_list
35+
2436
# Only include non-empty amendments
2537
if amendment_data['beteckning']:
2638
amendments.append(amendment_data)
27-
39+
2840
# Sort amendments chronologically by ikraft_datum
2941
# Amendments without ikraft_datum will be sorted to the end
3042
amendments.sort(key=lambda x: x['ikraft_datum'] or '9999-12-31')

0 commit comments

Comments
 (0)