@@ -160,6 +160,97 @@ def resolve_id(positional_id: str) -> str:
160160 return ref_table .get (positional_id , positional_id )
161161
162162
def extract_unmapped_ids(text: str, sfs_id: Optional[str] = None) -> list[dict]:
    """
    Extract all amounts/percentages from text and return unmapped positional ids.

    Useful for finding which data points need slugs in the reference table.

    The text is scanned line by line:

    * A line opening an ``<article ... selex:id="...">`` tag updates the
      current SFS designation when the id contains ``YYYY-N`` (stored as
      ``YYYY:N``). The line itself is not scanned for values.
    * A line opening a ``<section ... id="...">`` tag sets the current
      section and restarts both counters. The line itself is not scanned.
    * Lines starting with ``#`` and lines consisting solely of a section or
      article tag are skipped.
    * On every other line, amounts with multipliers are numbered first, then
      simple amounts, then percentages. Amounts share one counter
      (``"belopp"``); percentages have their own (``"procent"``).

    Args:
        text: The text to scan
        sfs_id: Optional SFS designation, used until an article tag in the
            text supplies one

    Returns:
        List of dicts with ``positional_id``, ``type`` (``'amount'`` or
        ``'percentage'``), ``value``, ``matched_text`` and ``context`` (the
        first 100 characters of the stripped line) for every match whose
        positional id is not present in the reference table.
    """
    unmapped: list[dict] = []
    ref_table = load_reference_table()

    current_sfs = sfs_id
    current_section = None
    amount_counter = 0
    percentage_counter = 0

    def report_if_unmapped(id_kind: str, index: int, entry_type: str, match, line: str) -> None:
        # Reads current_sfs / current_section from the enclosing scope at call
        # time, so it always sees the most recently parsed article/section.
        pos_id = generate_positional_id(current_sfs, current_section, id_kind, index)
        if pos_id in ref_table:
            return
        unmapped.append({
            'positional_id': pos_id,
            'type': entry_type,
            'value': normalize_number(match.group(1)),
            'matched_text': match.group(0),
            'context': line.strip()[:100],
        })

    for line in text.split('\n'):
        # Extract SFS from article tag
        article_match = re.match(r'^\s*<article[^>]*\bselex:id=["\']([^"\']+)["\']', line)
        if article_match:
            sfs_match = re.search(r'(\d{4})-(\d+)', article_match.group(1))
            if sfs_match:
                current_sfs = f"{sfs_match.group(1)}:{sfs_match.group(2)}"
            continue

        # Extract section id; numbering restarts within each section
        section_match = re.match(r'^\s*<section[^>]*\bid=["\']([^"\']+)["\']', line)
        if section_match:
            current_section = section_match.group(1)
            amount_counter = 0
            percentage_counter = 0
            continue

        # Skip headers and tags
        if line.strip().startswith('#'):
            continue
        if re.match(r'^\s*</?(?:section|article)[^>]*>\s*$', line):
            continue

        # Find amounts with multipliers. Scan once and keep the matched texts
        # so the simple-amount pass below does not re-run the pattern for
        # every candidate.
        multiplier_matches = list(AMOUNT_WITH_MULTIPLIER_PATTERN.finditer(line))
        for match in multiplier_matches:
            amount_counter += 1
            report_if_unmapped("belopp", amount_counter, 'amount', match, line)
        multiplier_texts = [m.group(0) for m in multiplier_matches]

        # Find simple amounts
        for match in AMOUNT_SIMPLE_PATTERN.finditer(line):
            # Skip if already matched by multiplier pattern.
            # NOTE(review): this compares matched *text*, not match position,
            # so a separate simple amount whose text also occurs inside a
            # multiplier match on the same line is skipped too. Left as is
            # because the counters presumably must stay aligned with how ids
            # are assigned elsewhere - confirm before changing.
            simple_text = match.group(0)
            if any(simple_text in covered for covered in multiplier_texts):
                continue
            amount_counter += 1
            report_if_unmapped("belopp", amount_counter, 'amount', match, line)

        # Find percentages
        for match in PERCENTAGE_PATTERN.finditer(line):
            percentage_counter += 1
            report_if_unmapped("procent", percentage_counter, 'percentage', match, line)

    return unmapped
252+
253+
163254def _slugify (text : str ) -> str :
164255 """
165256 Convert text to a URL-safe slug.
0 commit comments