Skip to content

Commit 7cb2765

Browse files
committed
Add extract_unmapped_ids function for curation workflow
New function to find amounts/percentages that need slugs in the reference table. Returns a list of dicts with: positional_id (the id that needs mapping); type ("amount" or "percentage"); value (normalized numeric value); matched_text (original text matched); context (surrounding text for understanding). Useful for batch curation of slugs with LLM assistance.
1 parent 904045e commit 7cb2765

1 file changed

Lines changed: 91 additions & 0 deletions

File tree

formatters/tag_swedish_amounts.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,97 @@ def resolve_id(positional_id: str) -> str:
160160
return ref_table.get(positional_id, positional_id)
161161

162162

163+
def extract_unmapped_ids(text: str, sfs_id: Optional[str] = None) -> list[dict]:
    """
    Extract all amounts/percentages from text and return unmapped positional ids.

    Useful for finding which data points need slugs in the reference table.

    The text is scanned line by line:

    * An ``<article ...>`` line whose ``selex:id`` contains ``YYYY-N`` switches
      the current SFS designation to ``YYYY:N``.
    * A ``<section ...>`` line with an ``id`` attribute switches the current
      section and restarts both the amount and the percentage counter.
    * Lines starting with ``#`` and lines consisting solely of a
      section/article tag are skipped.
    * On every other line, multiplier amounts are numbered first, then simple
      amounts (sharing the same "belopp" counter), then percentages.

    Args:
        text: The text to scan
        sfs_id: Optional SFS designation, used until an article tag supplies one

    Returns:
        List of dicts, in discovery order, one for each generated positional id
        that is not a key of the reference table. Each dict has the keys
        ``positional_id``, ``type`` ("amount" or "percentage"), ``value``
        (``normalize_number`` applied to the first capture group),
        ``matched_text`` (the full regex match) and ``context`` (the stripped
        line, truncated to 100 characters).
    """
    unmapped: list[dict] = []
    ref_table = load_reference_table()

    current_sfs = sfs_id
    current_section = None
    amount_counter = 0
    percentage_counter = 0

    def note_if_unmapped(kind_slug: str, kind_label: str, ordinal: int, hit, source_line: str) -> None:
        # Reads current_sfs/current_section from the enclosing scope at call
        # time, so it always sees the most recent article/section.
        pos_id = generate_positional_id(current_sfs, current_section, kind_slug, ordinal)
        if pos_id in ref_table:
            return
        unmapped.append({
            'positional_id': pos_id,
            'type': kind_label,
            'value': normalize_number(hit.group(1)),
            'matched_text': hit.group(0),
            'context': source_line.strip()[:100]
        })

    for line in text.split('\n'):
        # Extract SFS from article tag
        article_match = re.match(r'^\s*<article[^>]*\bselex:id=["\']([^"\']+)["\']', line)
        if article_match:
            sfs_match = re.search(r'(\d{4})-(\d+)', article_match.group(1))
            if sfs_match:
                current_sfs = f"{sfs_match.group(1)}:{sfs_match.group(2)}"
            continue

        # Extract section id; numbering restarts inside every section
        section_match = re.match(r'^\s*<section[^>]*\bid=["\']([^"\']+)["\']', line)
        if section_match:
            current_section = section_match.group(1)
            amount_counter = 0
            percentage_counter = 0
            continue

        # Skip headers and tags
        if line.strip().startswith('#'):
            continue
        if re.match(r'^\s*</?(?:section|article)[^>]*>\s*$', line):
            continue

        # Find amounts with multipliers. The matches are collected once so the
        # simple-amount pass below can reuse them instead of re-scanning the
        # line for every simple match.
        multiplier_hits = list(AMOUNT_WITH_MULTIPLIER_PATTERN.finditer(line))
        for hit in multiplier_hits:
            amount_counter += 1
            note_if_unmapped("belopp", 'amount', amount_counter, hit, line)

        # Find simple amounts
        multiplier_texts = [hit.group(0) for hit in multiplier_hits]
        for hit in AMOUNT_SIMPLE_PATTERN.finditer(line):
            simple_text = hit.group(0)
            # Skip if already matched by multiplier pattern.
            # NOTE(review): this is a text-containment test, not a span-overlap
            # test, so a simple amount whose text also occurs inside a
            # multiplier match elsewhere on the same line is skipped as well.
            # Left unchanged because the counters presumably have to agree
            # with the numbering used when the text is tagged - confirm before
            # altering.
            if any(simple_text in covered for covered in multiplier_texts):
                continue
            amount_counter += 1
            note_if_unmapped("belopp", 'amount', amount_counter, hit, line)

        # Find percentages
        for hit in PERCENTAGE_PATTERN.finditer(line):
            percentage_counter += 1
            note_if_unmapped("procent", 'percentage', percentage_counter, hit, line)

    return unmapped
252+
253+
163254
def _slugify(text: str) -> str:
164255
"""
165256
Convert text to a URL-safe slug.

0 commit comments

Comments
 (0)